Update aosp/master LLVM for rebase to r235153

Change-Id: I9bf53792f9fc30570e81a8d80d296c681d005ea7 (cherry picked from commit 0c7f116bb6950ef819323d855415b2f2b0aad987)
author: Pirama Arumuga Nainar <pirama@google.com> 2015-05-06 11:46:36 -0700
committer: Pirama Arumuga Nainar <pirama@google.com> 2015-05-18 10:52:30 -0700
commit: 2c3e0051c31c3f5b2328b447eadf1cf9c4427442 (patch)
tree: c0104029af14e9f47c2ef58ca60e6137691f3c9b /test/CodeGen
parent: e1bc145815f4334641be19f1c45ecf85d25b6e5a (diff)
download: external_llvm-2c3e0051c31c3f5b2328b447eadf1cf9c4427442.zip
external_llvm-2c3e0051c31c3f5b2328b447eadf1cf9c4427442.tar.gz
external_llvm-2c3e0051c31c3f5b2328b447eadf1cf9c4427442.tar.bz2
554 files changed, 11010 insertions, 2088 deletions
diff --git a/test/CodeGen/AArch64/aarch64-dynamic-stack-layout.ll b/test/CodeGen/AArch64/aarch64-dynamic-stack-layout.ll
new file mode 100644
index 0000000..a31c66b
--- /dev/null
+++ b/test/CodeGen/AArch64/aarch64-dynamic-stack-layout.ll
@@ -0,0 +1,491 @@
+; RUN: llc -verify-machineinstrs -mtriple=aarch64-none-linux-gnu < %s | FileCheck %s
+
+; This test aims to check basic correctness of frame layout &
+; frame access code. There are 8 functions in this test file,
+; each function implements one element in the cartesian product
+; of:
+; . a function having a VLA/noVLA
+; . a function with dynamic stack realignment/no dynamic stack realignment.
+; . a function needing a frame pionter/no frame pointer,
+; since the presence/absence of these has influence on the frame
+; layout and which pointer to use to access various part of the
+; frame (bp,sp,fp).
+;
+; Furthermore: in every test function:
+; . there is always one integer and 1 floating point argument to be able
+;   to check those are accessed correctly.
+; . there is always one local variable to check that is accessed
+;   correctly
+;
+; The LLVM-IR below was produced by clang on the following C++ code:
+;extern "C" int g();
+;extern "C" int novla_nodynamicrealign_call(int i1, int i2, int i3, int i4, int i5, int i6, int i7, int i8, int i9, int i10,
+;                                             double d1, double d2, double d3, double d4, double d5, double d6, double d7, double d8, double d9, double d10)
+;{
+;  // use an argument passed on the stack.
+;  volatile int l1;
+;  return i10 + (int)d10 + l1 + g();
+;}
+;extern "C" int novla_nodynamicrealign_nocall(int i1, int i2, int i3, int i4, int i5, int i6, int i7, int i8, int i9, int i10,
+;                                             double d1, double d2, double d3, double d4, double d5, double d6, double d7, double d8, double d9, double d10)
+;{
+;  // use an argument passed on the stack.
+;  volatile int l1;
+;  return i10 + (int)d10 + l1;
+;}
+;extern "C" int novla_dynamicrealign_call(int i1, int i2, int i3, int i4, int i5, int i6, int i7, int i8, int i9, int i10,
+;                                         double d1, double d2, double d3, double d4, double d5, double d6, double d7, double d8, double d9, double d10)
+;{
+;  // use an argument passed on the stack.
+;  alignas(128) volatile int l1;
+;  return i10 + (int)d10 + l1 + g();
+;}
+;extern "C" int novla_dynamicrealign_nocall(int i1, int i2, int i3, int i4, int i5, int i6, int i7, int i8, int i9, int i10,
+;                                           double d1, double d2, double d3, double d4, double d5, double d6, double d7, double d8, double d9, double d10)
+;{
+;  // use an argument passed on the stack.
+;  alignas(128) volatile int l1;
+;  return i10 + (int)d10 + l1;
+;}
+;
+;extern "C" int vla_nodynamicrealign_call(int i1, int i2, int i3, int i4, int i5, int i6, int i7, int i8, int i9, int i10,
+;                                         double d1, double d2, double d3, double d4, double d5, double d6, double d7, double d8, double d9, double d10)
+;{
+;  // use an argument passed on the stack.
+;  volatile int l1;
+;  volatile int vla[i1];
+;  return i10 + (int)d10 + l1 + g() + vla[0];
+;}
+;extern "C" int vla_nodynamicrealign_nocall(int i1, int i2, int i3, int i4, int i5, int i6, int i7, int i8, int i9, int i10,
+;                                           double d1, double d2, double d3, double d4, double d5, double d6, double d7, double d8, double d9, double d10)
+;{
+;  // use an argument passed on the stack.
+;  volatile int l1;
+;  volatile int vla[i1];
+;  return i10 + (int)d10 + l1 + vla[0];
+;}
+;extern "C" int vla_dynamicrealign_call(int i1, int i2, int i3, int i4, int i5, int i6, int i7, int i8, int i9, int i10,
+;                                       double d1, double d2, double d3, double d4, double d5, double d6, double d7, double d8, double d9, double d10)
+;{
+;  // use an argument passed on the stack.
+;  alignas(128) volatile int l1;
+;  volatile int vla[i1];
+;  return i10 + (int)d10 + l1 + g() + vla[0];
+;}
+;extern "C" int vla_dynamicrealign_nocall(int i1, int i2, int i3, int i4, int i5, int i6, int i7, int i8, int i9, int i10,
+;                                         double d1, double d2, double d3, double d4, double d5, double d6, double d7, double d8, double d9, double d10)
+;{
+;  // use an argument passed on the stack.
+;  alignas(128) volatile int l1;
+;  volatile int vla[i1];
+;  return i10 + (int)d10 + l1 + vla[0];
+;}
+
+
+
+define i32 @novla_nodynamicrealign_call(i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, i32 %i7, i32 %i8, i32 %i9, i32 %i10, double %d1, double %d2, double %d3, double %d4, double %d5, double %d6, double %d7, double %d8, double %d9, double %d10) #0 {
+entry:
+  %l1 = alloca i32, align 4
+  %conv = fptosi double %d10 to i32
+  %add = add nsw i32 %conv, %i10
+  %l1.0.l1.0. = load volatile i32, i32* %l1, align 4
+  %add1 = add nsw i32 %add, %l1.0.l1.0.
+  %call = tail call i32 @g()
+  %add2 = add nsw i32 %add1, %call
+  ret i32 %add2
+}
+; CHECK-LABEL: novla_nodynamicrealign_call
+; CHECK: .cfi_startproc
+;   Check that used callee-saved registers are saved
+; CHECK: stp	x20, x19, [sp, #-32]!
+;   Check that the frame pointer is created:
+; CHECK: stp	x29, x30, [sp, #16]
+; CHECK: add	x29, sp, #16
+;   Check correctness of cfi pseudo-instructions
+; CHECK: .cfi_def_cfa w29, 16
+; CHECK: .cfi_offset w30, -8
+; CHECK: .cfi_offset w29, -16
+; CHECK: .cfi_offset w19, -24
+; CHECK: .cfi_offset w20, -32
+;   Check correct access to arguments passed on the stack, through frame pointer
+; CHECK: ldr	d[[DARG:[0-9]+]], [x29, #40]
+; CHECK: ldr	w[[IARG:[0-9]+]], [x29, #24]
+;   Check correct access to local variable on the stack, through stack pointer
+; CHECK: ldr	w[[ILOC:[0-9]+]], [sp, #12]
+;   Check epilogue:
+; CHECK: ldp	x29, x30, [sp, #16]
+; CHECK: ldp	x20, x19, [sp], #32
+; CHECK: ret
+; CHECK: .cfi_endproc
+
+
+declare i32 @g() #0
+
+; Function Attrs: nounwind
+define i32 @novla_nodynamicrealign_nocall(i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, i32 %i7, i32 %i8, i32 %i9, i32 %i10, double %d1, double %d2, double %d3, double %d4, double %d5, double %d6, double %d7, double %d8, double %d9, double %d10) #1 {
+entry:
+  %l1 = alloca i32, align 4
+  %conv = fptosi double %d10 to i32
+  %add = add nsw i32 %conv, %i10
+  %l1.0.l1.0. = load volatile i32, i32* %l1, align 4
+  %add1 = add nsw i32 %add, %l1.0.l1.0.
+  ret i32 %add1
+}
+; CHECK-LABEL: novla_nodynamicrealign_nocall
+;   Check that space is reserved for one local variable on the stack.
+; CHECK:	sub	sp, sp, #16             // =16
+;   Check correct access to arguments passed on the stack, through stack pointer
+; CHECK: ldr	d[[DARG:[0-9]+]], [sp, #40]
+; CHECK: ldr	w[[IARG:[0-9]+]], [sp, #24]
+;   Check correct access to local variable on the stack, through stack pointer
+; CHECK: ldr	w[[ILOC:[0-9]+]], [sp, #12]
+;   Check epilogue:
+; CHECK: add	sp, sp, #16             // =16
+; CHECK: ret
+
+
+define i32 @novla_dynamicrealign_call(i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, i32 %i7, i32 %i8, i32 %i9, i32 %i10, double %d1, double %d2, double %d3, double %d4, double %d5, double %d6, double %d7, double %d8, double %d9, double %d10) #0 {
+entry:
+  %l1 = alloca i32, align 128
+  %conv = fptosi double %d10 to i32
+  %add = add nsw i32 %conv, %i10
+  %l1.0.l1.0. = load volatile i32, i32* %l1, align 128
+  %add1 = add nsw i32 %add, %l1.0.l1.0.
+  %call = tail call i32 @g()
+  %add2 = add nsw i32 %add1, %call
+  ret i32 %add2
+}
+
+; CHECK-LABEL: novla_dynamicrealign_call
+; CHECK: .cfi_startproc
+;   Check that used callee-saved registers are saved
+; CHECK: stp	x20, x19, [sp, #-32]!
+;   Check that the frame pointer is created:
+; CHECK: stp	x29, x30, [sp, #16]
+; CHECK: add	x29, sp, #16
+;   Check the dynamic realignment of the stack pointer to a 128-byte boundary
+; CHECK: sub	x9, sp, #96
+; CHECK: and	sp, x9, #0xffffffffffffff80
+;   Check correctness of cfi pseudo-instructions
+; CHECK: .cfi_def_cfa w29, 16
+; CHECK: .cfi_offset w30, -8
+; CHECK: .cfi_offset w29, -16
+; CHECK: .cfi_offset w19, -24
+; CHECK: .cfi_offset w20, -32
+;   Check correct access to arguments passed on the stack, through frame pointer
+; CHECK: ldr	d[[DARG:[0-9]+]], [x29, #40]
+; CHECK: ldr	w[[IARG:[0-9]+]], [x29, #24]
+;   Check correct access to local variable on the stack, through re-aligned stack pointer
+; CHECK: ldr	w[[ILOC:[0-9]+]], [sp]
+;   Check epilogue:
+;     Check that stack pointer get restored from frame pointer.
+; CHECK: sub	sp, x29, #16            // =16
+; CHECK: ldp	x29, x30, [sp, #16]
+; CHECK: ldp	x20, x19, [sp], #32
+; CHECK: ret
+; CHECK: .cfi_endproc
+
+
+; Function Attrs: nounwind
+define i32 @novla_dynamicrealign_nocall(i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, i32 %i7, i32 %i8, i32 %i9, i32 %i10, double %d1, double %d2, double %d3, double %d4, double %d5, double %d6, double %d7, double %d8, double %d9, double %d10) #1 {
+entry:
+  %l1 = alloca i32, align 128
+  %conv = fptosi double %d10 to i32
+  %add = add nsw i32 %conv, %i10
+  %l1.0.l1.0. = load volatile i32, i32* %l1, align 128
+  %add1 = add nsw i32 %add, %l1.0.l1.0.
+  ret i32 %add1
+}
+
+; CHECK-LABEL: novla_dynamicrealign_nocall
+;   Check that the frame pointer is created:
+; CHECK: stp	x29, x30, [sp, #-16]!
+; CHECK: mov	x29, sp
+;   Check the dynamic realignment of the stack pointer to a 128-byte boundary
+; CHECK: sub	x9, sp, #112
+; CHECK: and	sp, x9, #0xffffffffffffff80
+;   Check correct access to arguments passed on the stack, through frame pointer
+; CHECK: ldr	d[[DARG:[0-9]+]], [x29, #40]
+; CHECK: ldr	w[[IARG:[0-9]+]], [x29, #24]
+;   Check correct access to local variable on the stack, through re-aligned stack pointer
+; CHECK: ldr	w[[ILOC:[0-9]+]], [sp]
+;   Check epilogue:
+;     Check that stack pointer get restored from frame pointer.
+; CHECK: mov	sp, x29
+; CHECK: ldp	x29, x30, [sp], #16
+; CHECK: ret
+
+
+define i32 @vla_nodynamicrealign_call(i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, i32 %i7, i32 %i8, i32 %i9, i32 %i10, double %d1, double %d2, double %d3, double %d4, double %d5, double %d6, double %d7, double %d8, double %d9, double %d10) #0 {
+entry:
+  %l1 = alloca i32, align 4
+  %0 = zext i32 %i1 to i64
+  %vla = alloca i32, i64 %0, align 4
+  %conv = fptosi double %d10 to i32
+  %add = add nsw i32 %conv, %i10
+  %l1.0.l1.0. = load volatile i32, i32* %l1, align 4
+  %add1 = add nsw i32 %add, %l1.0.l1.0.
+  %call = tail call i32 @g()
+  %add2 = add nsw i32 %add1, %call
+  %1 = load volatile i32, i32* %vla, align 4, !tbaa !1
+  %add3 = add nsw i32 %add2, %1
+  ret i32 %add3
+}
+
+; CHECK-LABEL: vla_nodynamicrealign_call
+; CHECK: .cfi_startproc
+;   Check that used callee-saved registers are saved
+; CHECK: stp	x20, x19, [sp, #-32]!
+;   Check that the frame pointer is created:
+; CHECK: stp	x29, x30, [sp, #16]
+; CHECK: add	x29, sp, #16
+;   Check that space is reserved on the stack for the local variable,
+;   rounded up to a multiple of 16 to keep the stack pointer 16-byte aligned.
+; CHECK: sub	sp, sp, #16
+;   Check correctness of cfi pseudo-instructions
+; CHECK: .cfi_def_cfa w29, 16
+; CHECK: .cfi_offset w30, -8
+; CHECK: .cfi_offset w29, -16
+; CHECK: .cfi_offset w19, -24
+; CHECK: .cfi_offset w20, -32
+;   Check correct access to arguments passed on the stack, through frame pointer
+; CHECK: ldr	w[[IARG:[0-9]+]], [x29, #24]
+; CHECK: ldr	d[[DARG:[0-9]+]], [x29, #40]
+;   Check correct reservation of 16-byte aligned VLA (size in w0) on stack
+; CHECK: ubfx	x9, x0, #0, #32
+; CHECK: lsl	x9, x9, #2
+; CHECK: add	x9, x9, #15
+; CHECK: and	x9, x9, #0xfffffffffffffff0
+; CHECK: mov	 x10, sp
+; CHECK: sub	 x[[VLASPTMP:[0-9]+]], x10, x9
+; CHECK: mov	 sp, x[[VLASPTMP]]
+;   Check correct access to local variable, through frame pointer
+; CHECK: ldur	w[[ILOC:[0-9]+]], [x29, #-20]
+;   Check correct accessing of the VLA variable through the base pointer
+; CHECK: ldr	w[[VLA:[0-9]+]], [x[[VLASPTMP]]]
+;   Check epilogue:
+;     Check that stack pointer get restored from frame pointer.
+; CHECK: sub	sp, x29, #16            // =16
+; CHECK: ldp	x29, x30, [sp, #16]
+; CHECK: ldp	x20, x19, [sp], #32
+; CHECK: ret
+; CHECK: .cfi_endproc
+
+
+; Function Attrs: nounwind
+define i32 @vla_nodynamicrealign_nocall(i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, i32 %i7, i32 %i8, i32 %i9, i32 %i10, double %d1, double %d2, double %d3, double %d4, double %d5, double %d6, double %d7, double %d8, double %d9, double %d10) #1 {
+entry:
+  %l1 = alloca i32, align 4
+  %0 = zext i32 %i1 to i64
+  %vla = alloca i32, i64 %0, align 4
+  %conv = fptosi double %d10 to i32
+  %add = add nsw i32 %conv, %i10
+  %l1.0.l1.0. = load volatile i32, i32* %l1, align 4
+  %add1 = add nsw i32 %add, %l1.0.l1.0.
+  %1 = load volatile i32, i32* %vla, align 4, !tbaa !1
+  %add2 = add nsw i32 %add1, %1
+  ret i32 %add2
+}
+
+; CHECK-LABEL: vla_nodynamicrealign_nocall
+;   Check that the frame pointer is created:
+; CHECK: stp	x29, x30, [sp, #-16]!
+; CHECK: mov	x29, sp
+;   Check that space is reserved on the stack for the local variable,
+;   rounded up to a multiple of 16 to keep the stack pointer 16-byte aligned.
+; CHECK: sub	sp, sp, #16
+;   Check correctness of cfi pseudo-instructions
+;   Check correct access to arguments passed on the stack, through frame pointer
+; CHECK: ldr	w[[IARG:[0-9]+]], [x29, #24]
+; CHECK: ldr	d[[DARG:[0-9]+]], [x29, #40]
+;   Check correct reservation of 16-byte aligned VLA (size in w0) on stack
+; CHECK: ubfx	x9, x0, #0, #32
+; CHECK: lsl	x9, x9, #2
+; CHECK: add	x9, x9, #15
+; CHECK: and	x9, x9, #0xfffffffffffffff0
+; CHECK: mov	 x10, sp
+; CHECK: sub	 x[[VLASPTMP:[0-9]+]], x10, x9
+; CHECK: mov	 sp, x[[VLASPTMP]]
+;   Check correct access to local variable, through frame pointer
+; CHECK: ldur	w[[ILOC:[0-9]+]], [x29, #-4]
+;   Check correct accessing of the VLA variable through the base pointer
+; CHECK: ldr	w[[VLA:[0-9]+]], [x[[VLASPTMP]]]
+;   Check epilogue:
+;     Check that stack pointer get restored from frame pointer.
+; CHECK: mov    sp, x29
+; CHECK: ldp	x29, x30, [sp], #16
+; CHECK: ret
+
+
+define i32 @vla_dynamicrealign_call(i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, i32 %i7, i32 %i8, i32 %i9, i32 %i10, double %d1, double %d2, double %d3, double %d4, double %d5, double %d6, double %d7, double %d8, double %d9, double %d10) #0 {
+entry:
+  %l1 = alloca i32, align 128
+  %0 = zext i32 %i1 to i64
+  %vla = alloca i32, i64 %0, align 4
+  %conv = fptosi double %d10 to i32
+  %add = add nsw i32 %conv, %i10
+  %l1.0.l1.0. = load volatile i32, i32* %l1, align 128
+  %add1 = add nsw i32 %add, %l1.0.l1.0.
+  %call = tail call i32 @g()
+  %add2 = add nsw i32 %add1, %call
+  %1 = load volatile i32, i32* %vla, align 4, !tbaa !1
+  %add3 = add nsw i32 %add2, %1
+  ret i32 %add3
+}
+
+; CHECK-LABEL: vla_dynamicrealign_call
+; CHECK: .cfi_startproc
+;   Check that used callee-saved registers are saved
+; CHECK: stp	x22, x21, [sp, #-48]!
+; CHECK: stp	x20, x19, [sp, #16]
+;   Check that the frame pointer is created:
+; CHECK: stp	x29, x30, [sp, #32]
+; CHECK: add	x29, sp, #32
+;   Check that the stack pointer gets re-aligned to 128
+;   bytes & the base pointer (x19) gets initialized to
+;   this 128-byte aligned area for local variables &
+;   spill slots
+; CHECK: sub	x9, sp, #80            // =80
+; CHECK: and	sp, x9, #0xffffffffffffff80
+; CHECK: mov    x19, sp
+;   Check correctness of cfi pseudo-instructions
+; CHECK: .cfi_def_cfa w29, 16
+; CHECK: .cfi_offset w30, -8
+; CHECK: .cfi_offset w29, -16
+; CHECK: .cfi_offset w19, -24
+; CHECK: .cfi_offset w20, -32
+; CHECK: .cfi_offset w21, -40
+; CHECK: .cfi_offset w22, -48
+;   Check correct access to arguments passed on the stack, through frame pointer
+; CHECK: ldr	w[[IARG:[0-9]+]], [x29, #24]
+; CHECK: ldr	d[[DARG:[0-9]+]], [x29, #40]
+;   Check correct reservation of 16-byte aligned VLA (size in w0) on stack
+;   and set-up of base pointer (x19).
+; CHECK: ubfx	x9, x0, #0, #32
+; CHECK: lsl	x9, x9, #2
+; CHECK: add	x9, x9, #15
+; CHECK: and	x9, x9, #0xfffffffffffffff0
+; CHECK: mov	 x10, sp
+; CHECK: sub	 x[[VLASPTMP:[0-9]+]], x10, x9
+; CHECK: mov	 sp, x[[VLASPTMP]]
+;   Check correct access to local variable, through base pointer
+; CHECK: ldr	w[[ILOC:[0-9]+]], [x19]
+; CHECK: ldr	 w[[VLA:[0-9]+]], [x[[VLASPTMP]]]
+;   Check epilogue:
+;     Check that stack pointer get restored from frame pointer.
+; CHECK: sub	sp, x29, #32
+; CHECK: ldp	x29, x30, [sp, #32]
+; CHECK: ldp	x20, x19, [sp, #16]
+; CHECK: ldp	x22, x21, [sp], #48
+; CHECK: ret
+; CHECK: .cfi_endproc
+
+
+; Function Attrs: nounwind
+define i32 @vla_dynamicrealign_nocall(i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, i32 %i7, i32 %i8, i32 %i9, i32 %i10, double %d1, double %d2, double %d3, double %d4, double %d5, double %d6, double %d7, double %d8, double %d9, double %d10) #1 {
+entry:
+  %l1 = alloca i32, align 128
+  %0 = zext i32 %i1 to i64
+  %vla = alloca i32, i64 %0, align 4
+  %conv = fptosi double %d10 to i32
+  %add = add nsw i32 %conv, %i10
+  %l1.0.l1.0. = load volatile i32, i32* %l1, align 128
+  %add1 = add nsw i32 %add, %l1.0.l1.0.
+  %1 = load volatile i32, i32* %vla, align 4, !tbaa !1
+  %add2 = add nsw i32 %add1, %1
+  ret i32 %add2
+}
+
+; CHECK-LABEL: vla_dynamicrealign_nocall
+;   Check that used callee-saved registers are saved
+; CHECK: stp	x20, x19, [sp, #-32]!
+;   Check that the frame pointer is created:
+; CHECK: stp	x29, x30, [sp, #16]
+; CHECK: add	x29, sp, #16
+;   Check that the stack pointer gets re-aligned to 128
+;   bytes & the base pointer (x19) gets initialized to
+;   this 128-byte aligned area for local variables &
+;   spill slots
+; CHECK: sub	x9, sp, #96
+; CHECK: and	sp, x9, #0xffffffffffffff80
+; CHECK: mov    x19, sp
+;   Check correct access to arguments passed on the stack, through frame pointer
+; CHECK: ldr	w[[IARG:[0-9]+]], [x29, #24]
+; CHECK: ldr	d[[DARG:[0-9]+]], [x29, #40]
+;   Check correct reservation of 16-byte aligned VLA (size in w0) on stack
+;   and set-up of base pointer (x19).
+; CHECK: ubfx	x9, x0, #0, #32
+; CHECK: lsl	x9, x9, #2
+; CHECK: add	x9, x9, #15
+; CHECK: and	x9, x9, #0xfffffffffffffff0
+; CHECK: mov	 x10, sp
+; CHECK: sub	 x[[VLASPTMP:[0-9]+]], x10, x9
+; CHECK: mov	 sp, x[[VLASPTMP]]
+;   Check correct access to local variable, through base pointer
+; CHECK: ldr	w[[ILOC:[0-9]+]], [x19]
+; CHECK: ldr	 w[[VLA:[0-9]+]], [x[[VLASPTMP]]]
+;   Check epilogue:
+;     Check that stack pointer get restored from frame pointer.
+; CHECK: sub	sp, x29, #16
+; CHECK: ldp	x29, x30, [sp, #16]
+; CHECK: ldp	x20, x19, [sp], #32
+; CHECK: ret
+
+
+; Function Attrs: nounwind
+define i32 @vla_dynamicrealign_nocall_large_align(i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, i32 %i7, i32 %i8, i32 %i9, i32 %i10, double %d1, double %d2, double %d3, double %d4, double %d5, double %d6, double %d7, double %d8, double %d9, double %d10) #1 {
+entry:
+  %l1 = alloca i32, align 32768
+  %0 = zext i32 %i1 to i64
+  %vla = alloca i32, i64 %0, align 4
+  %conv = fptosi double %d10 to i32
+  %add = add nsw i32 %conv, %i10
+  %l1.0.l1.0. = load volatile i32, i32* %l1, align 32768
+  %add1 = add nsw i32 %add, %l1.0.l1.0.
+  %1 = load volatile i32, i32* %vla, align 4, !tbaa !1
+  %add2 = add nsw i32 %add1, %1
+  ret i32 %add2
+}
+
+; CHECK-LABEL: vla_dynamicrealign_nocall_large_align
+;   Check that used callee-saved registers are saved
+; CHECK: stp	x20, x19, [sp, #-32]!
+;   Check that the frame pointer is created:
+; CHECK: stp	x29, x30, [sp, #16]
+; CHECK: add	x29, sp, #16
+;   Check that the stack pointer gets re-aligned to 128
+;   bytes & the base pointer (x19) gets initialized to
+;   this 128-byte aligned area for local variables &
+;   spill slots
+; CHECK: sub	x9, sp, #7, lsl #12
+; CHECK: and	sp, x9, #0xffffffffffff8000
+; CHECK: mov    x19, sp
+;   Check correct access to arguments passed on the stack, through frame pointer
+; CHECK: ldr	w[[IARG:[0-9]+]], [x29, #24]
+; CHECK: ldr	d[[DARG:[0-9]+]], [x29, #40]
+;   Check correct reservation of 16-byte aligned VLA (size in w0) on stack
+;   and set-up of base pointer (x19).
+; CHECK: ubfx	x9, x0, #0, #32
+; CHECK: lsl	x9, x9, #2
+; CHECK: add	x9, x9, #15
+; CHECK: and	x9, x9, #0xfffffffffffffff0
+; CHECK: mov	 x10, sp
+; CHECK: sub	 x[[VLASPTMP:[0-9]+]], x10, x9
+; CHECK: mov	 sp, x[[VLASPTMP]]
+;   Check correct access to local variable, through base pointer
+; CHECK: ldr	w[[ILOC:[0-9]+]], [x19]
+; CHECK: ldr	 w[[VLA:[0-9]+]], [x[[VLASPTMP]]]
+;   Check epilogue:
+;     Check that stack pointer get restored from frame pointer.
+; CHECK: sub	sp, x29, #16
+; CHECK: ldp	x29, x30, [sp, #16]
+; CHECK: ldp	x20, x19, [sp], #32
+; CHECK: ret
+
+attributes #0 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+!1 = !{!2, !2, i64 0}
+!2 = !{!"int", !3, i64 0}
+!3 = !{!"omnipotent char", !4, i64 0}
+!4 = !{!"Simple C/C++ TBAA"}
diff --git a/test/CodeGen/AArch64/addsub.ll b/test/CodeGen/AArch64/addsub.ll
index 09b9f62..d6350a6 100644
--- a/test/CodeGen/AArch64/addsub.ll
+++ b/test/CodeGen/AArch64/addsub.ll
@@ -24,6 +24,34 @@ define void @add_small() {
   ret void
 }
 
+; Make sure we grab the imm variant when the register operand
+; can be implicitly zero-extend.
+; We used to generate something horrible like this:
+; wA = ldrb
+; xB = ldimm 12
+; xC = add xB, wA, uxtb
+; whereas this can be achieved with:
+; wA = ldrb
+; xC = add xA, #12 ; <- xA implicitly zero extend wA.
+define void @add_small_imm(i8* %p, i64* %q, i32 %b, i32* %addr) {
+; CHECK-LABEL: add_small_imm:
+entry:
+
+; CHECK: ldrb w[[LOAD32:[0-9]+]], [x0]
+  %t = load i8, i8* %p
+  %promoted = zext i8 %t to i64
+  %zextt = zext i8 %t to i32
+  %add = add nuw i32 %zextt, %b
+
+; CHECK: add [[ADD2:x[0-9]+]], x[[LOAD32]], #12
+  %add2 = add nuw i64 %promoted, 12
+  store i32 %add, i32* %addr
+
+; CHECK: str [[ADD2]], [x1]
+  store i64 %add2, i64* %q
+  ret void
+}
+
 ; Add 12-bit immediates, shifted left by 12 bits
 define void @add_med() {
 ; CHECK-LABEL: add_med:
diff --git a/test/CodeGen/AArch64/argument-blocks.ll b/test/CodeGen/AArch64/argument-blocks.ll
index f1dcfa6..3169abc 100644
--- a/test/CodeGen/AArch64/argument-blocks.ll
+++ b/test/CodeGen/AArch64/argument-blocks.ll
@@ -64,7 +64,7 @@ define void @test_varargs_stackalign() {
 ; CHECK-LABEL: test_varargs_stackalign:
 ; CHECK-DARWINPCS: stp {{w[0-9]+}}, {{w[0-9]+}}, [sp, #16]
 
-  call void(...)* @callee([3 x float] undef, [2 x float] [float 1.0, float 2.0])
+  call void(...) @callee([3 x float] undef, [2 x float] [float 1.0, float 2.0])
   ret void
 }
 
diff --git a/test/CodeGen/AArch64/arm64-2012-06-06-FPToUI.ll b/test/CodeGen/AArch64/arm64-2012-06-06-FPToUI.ll
index 41e22e9..b760261 100644
--- a/test/CodeGen/AArch64/arm64-2012-06-06-FPToUI.ll
+++ b/test/CodeGen/AArch64/arm64-2012-06-06-FPToUI.ll
@@ -16,11 +16,11 @@ entry:
   %0 = load double, double* %d.addr, align 8
   %1 = load double, double* %d.addr, align 8
   %conv = fptoui double %1 to i64
-  %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([9 x i8], [9 x i8]* @.str, i32 0, i32 0), double %0, i64 %conv)
+  %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([9 x i8], [9 x i8]* @.str, i32 0, i32 0), double %0, i64 %conv)
   %2 = load double, double* %d.addr, align 8
   %3 = load double, double* %d.addr, align 8
   %conv1 = fptoui double %3 to i32
-  %call2 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([8 x i8], [8 x i8]* @.str1, i32 0, i32 0), double %2, i32 %conv1)
+  %call2 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([8 x i8], [8 x i8]* @.str1, i32 0, i32 0), double %2, i32 %conv1)
   ret void
 }
 
@@ -37,12 +37,12 @@ entry:
   %conv = fpext float %0 to double
   %1 = load float, float* %f.addr, align 4
   %conv1 = fptoui float %1 to i64
-  %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([8 x i8], [8 x i8]* @.str2, i32 0, i32 0), double %conv, i64 %conv1)
+  %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([8 x i8], [8 x i8]* @.str2, i32 0, i32 0), double %conv, i64 %conv1)
   %2 = load float, float* %f.addr, align 4
   %conv2 = fpext float %2 to double
   %3 = load float, float* %f.addr, align 4
   %conv3 = fptoui float %3 to i32
-  %call4 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([7 x i8], [7 x i8]* @.str3, i32 0, i32 0), double %conv2, i32 %conv3)
+  %call4 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([7 x i8], [7 x i8]* @.str3, i32 0, i32 0), double %conv2, i32 %conv3)
   ret void
 }
 
diff --git a/test/CodeGen/AArch64/arm64-AdvSIMD-Scalar.ll b/test/CodeGen/AArch64/arm64-AdvSIMD-Scalar.ll
index 6266d1c..8784abd 100644
--- a/test/CodeGen/AArch64/arm64-AdvSIMD-Scalar.ll
+++ b/test/CodeGen/AArch64/arm64-AdvSIMD-Scalar.ll
@@ -7,13 +7,13 @@ define <2 x i64> @bar(<2 x i64> %a, <2 x i64> %b) nounwind readnone {
 ; CHECK-LABEL: bar:
 ; CHECK: add.2d	v[[REG:[0-9]+]], v0, v1
 ; CHECK: add	d[[REG3:[0-9]+]], d[[REG]], d1
+; CHECK: sub	d[[REG2:[0-9]+]], d[[REG]], d1
 ; Without advanced copy optimization, we end up with cross register
 ; banks copies that cannot be coalesced.
 ; CHECK-NOOPT: fmov [[COPY_REG3:x[0-9]+]], d[[REG3]]
 ; With advanced copy optimization, we end up with just one copy
 ; to insert the computed high part into the V register. 
 ; CHECK-OPT-NOT: fmov
-; CHECK: sub	d[[REG2:[0-9]+]], d[[REG]], d1
 ; CHECK: fmov [[COPY_REG2:x[0-9]+]], d[[REG2]]
 ; CHECK-NOOPT: fmov d0, [[COPY_REG3]]
 ; CHECK-OPT-NOT: fmov
@@ -23,9 +23,9 @@ define <2 x i64> @bar(<2 x i64> %a, <2 x i64> %b) nounwind readnone {
 ; GENERIC-LABEL: bar:
 ; GENERIC: add	v[[REG:[0-9]+]].2d, v0.2d, v1.2d
 ; GENERIC: add	d[[REG3:[0-9]+]], d[[REG]], d1
+; GENERIC: sub	d[[REG2:[0-9]+]], d[[REG]], d1
 ; GENERIC-NOOPT: fmov [[COPY_REG3:x[0-9]+]], d[[REG3]]
 ; GENERIC-OPT-NOT: fmov
-; GENERIC: sub	d[[REG2:[0-9]+]], d[[REG]], d1
 ; GENERIC: fmov [[COPY_REG2:x[0-9]+]], d[[REG2]]
 ; GENERIC-NOOPT: fmov d0, [[COPY_REG3]]
 ; GENERIC-OPT-NOT: fmov
diff --git a/test/CodeGen/AArch64/arm64-aapcs.ll b/test/CodeGen/AArch64/arm64-aapcs.ll
index 41c3ad5..390a3c7 100644
--- a/test/CodeGen/AArch64/arm64-aapcs.ll
+++ b/test/CodeGen/AArch64/arm64-aapcs.ll
@@ -78,7 +78,7 @@ declare void @variadic(i32 %a, ...)
   ; Under AAPCS variadic functions have the same calling convention as
   ; others. The extra arguments should go in registers rather than on the stack.
 define void @test_variadic() {
-  call void(i32, ...)* @variadic(i32 0, i64 1, double 2.0)
+  call void(i32, ...) @variadic(i32 0, i64 1, double 2.0)
 ; CHECK: fmov d0, #2.0
 ; CHECK: orr w1, wzr, #0x1
 ; CHECK: bl variadic
diff --git a/test/CodeGen/AArch64/arm64-abi-varargs.ll b/test/CodeGen/AArch64/arm64-abi-varargs.ll
index f95fec6..03414b5 100644
--- a/test/CodeGen/AArch64/arm64-abi-varargs.ll
+++ b/test/CodeGen/AArch64/arm64-abi-varargs.ll
@@ -94,7 +94,7 @@ define i32 @main() nounwind ssp {
   %10 = load i32, i32* %a10, align 4
   %11 = load i32, i32* %a11, align 4
   %12 = load i32, i32* %a12, align 4
-  call void (i32, i32, i32, i32, i32, i32, i32, i32, i32, ...)* @fn9(i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i32 %8, i32 %9, i32 %10, i32 %11, i32 %12)
+  call void (i32, i32, i32, i32, i32, i32, i32, i32, i32, ...) @fn9(i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i32 %8, i32 %9, i32 %10, i32 %11, i32 %12)
   ret i32 0
 }
 
@@ -133,7 +133,7 @@ entry:
   store <4 x i32> %y, <4 x i32>* %y.addr, align 16
   %0 = load i32, i32* %x.addr, align 4
   %1 = load <4 x i32>, <4 x i32>* %y.addr, align 16
-  call void (i8*, ...)* @foo(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i32 %0, <4 x i32> %1)
+  call void (i8*, ...) @foo(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i32 %0, <4 x i32> %1)
   ret void
 }
 
@@ -186,6 +186,6 @@ entry:
   %1 = load i32, i32* %x.addr, align 4
   %2 = bitcast %struct.s41* %s41 to i128*
   %3 = load i128, i128* %2, align 1
-  call void (i8*, ...)* @foo2(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i32 %1, i128 %3)
+  call void (i8*, ...) @foo2(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i32 %1, i128 %3)
   ret void
 }
diff --git a/test/CodeGen/AArch64/arm64-anyregcc-crash.ll b/test/CodeGen/AArch64/arm64-anyregcc-crash.ll
index 241cf97..56c62d5 100644
--- a/test/CodeGen/AArch64/arm64-anyregcc-crash.ll
+++ b/test/CodeGen/AArch64/arm64-anyregcc-crash.ll
@@ -8,7 +8,7 @@ define i64 @anyreglimit(i64 %v1, i64 %v2, i64 %v3, i64 %v4, i64 %v5, i64 %v6, i6
                         i64 %v17, i64 %v18, i64 %v19, i64 %v20, i64 %v21, i64 %v22, i64 %v23, i64 %v24,
                         i64 %v25, i64 %v26, i64 %v27, i64 %v28, i64 %v29, i64 %v30, i64 %v31, i64 %v32) {
 entry:
-  %result = tail call anyregcc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 12, i32 15, i8* inttoptr (i64 0 to i8*), i32 32,
+  %result = tail call anyregcc i64 (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.i64(i64 12, i32 15, i8* inttoptr (i64 0 to i8*), i32 32,
                 i64 %v1, i64 %v2, i64 %v3, i64 %v4, i64 %v5, i64 %v6, i64 %v7, i64 %v8,
                 i64 %v9, i64 %v10, i64 %v11, i64 %v12, i64 %v13, i64 %v14, i64 %v15, i64 %v16,
                 i64 %v17, i64 %v18, i64 %v19, i64 %v20, i64 %v21, i64 %v22, i64 %v23, i64 %v24,
diff --git a/test/CodeGen/AArch64/arm64-anyregcc.ll b/test/CodeGen/AArch64/arm64-anyregcc.ll
index e26875d..2a2f451 100644
--- a/test/CodeGen/AArch64/arm64-anyregcc.ll
+++ b/test/CodeGen/AArch64/arm64-anyregcc.ll
@@ -55,7 +55,7 @@
 ; CHECK-NEXT:   .long 3
 define i64 @test() nounwind ssp uwtable {
 entry:
-  call anyregcc void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 0, i32 16, i8* null, i32 2, i32 1, i32 2, i64 3)
+  call anyregcc void (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.void(i64 0, i32 16, i8* null, i32 2, i32 1, i32 2, i64 3)
   ret i64 0
 }
 
@@ -77,7 +77,7 @@ entry:
 define i64 @property_access1(i8* %obj) nounwind ssp uwtable {
 entry:
   %f = inttoptr i64 281474417671919 to i8*
-  %ret = call anyregcc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 1, i32 20, i8* %f, i32 1, i8* %obj)
+  %ret = call anyregcc i64 (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.i64(i64 1, i32 20, i8* %f, i32 1, i8* %obj)
   ret i64 %ret
 }
 
@@ -100,7 +100,7 @@ define i64 @property_access2() nounwind ssp uwtable {
 entry:
   %obj = alloca i64, align 8
   %f = inttoptr i64 281474417671919 to i8*
-  %ret = call anyregcc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 2, i32 20, i8* %f, i32 1, i64* %obj)
+  %ret = call anyregcc i64 (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.i64(i64 2, i32 20, i8* %f, i32 1, i64* %obj)
   ret i64 %ret
 }
 
@@ -123,7 +123,7 @@ define i64 @property_access3() nounwind ssp uwtable {
 entry:
   %obj = alloca i64, align 8
   %f = inttoptr i64 281474417671919 to i8*
-  %ret = call anyregcc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 3, i32 20, i8* %f, i32 0, i64* %obj)
+  %ret = call anyregcc i64 (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.i64(i64 3, i32 20, i8* %f, i32 0, i64* %obj)
   ret i64 %ret
 }
 
@@ -205,7 +205,7 @@ entry:
 define i64 @anyreg_test1(i8* %a1, i8* %a2, i8* %a3, i8* %a4, i8* %a5, i8* %a6, i8* %a7, i8* %a8, i8* %a9, i8* %a10, i8* %a11, i8* %a12, i8* %a13) nounwind ssp uwtable {
 entry:
   %f = inttoptr i64 281474417671919 to i8*
-  %ret = call anyregcc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 4, i32 20, i8* %f, i32 13, i8* %a1, i8* %a2, i8* %a3, i8* %a4, i8* %a5, i8* %a6, i8* %a7, i8* %a8, i8* %a9, i8* %a10, i8* %a11, i8* %a12, i8* %a13)
+  %ret = call anyregcc i64 (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.i64(i64 4, i32 20, i8* %f, i32 13, i8* %a1, i8* %a2, i8* %a3, i8* %a4, i8* %a5, i8* %a6, i8* %a7, i8* %a8, i8* %a9, i8* %a10, i8* %a11, i8* %a12, i8* %a13)
   ret i64 %ret
 }
 
@@ -287,7 +287,7 @@ entry:
 define i64 @anyreg_test2(i8* %a1, i8* %a2, i8* %a3, i8* %a4, i8* %a5, i8* %a6, i8* %a7, i8* %a8, i8* %a9, i8* %a10, i8* %a11, i8* %a12, i8* %a13) nounwind ssp uwtable {
 entry:
   %f = inttoptr i64 281474417671919 to i8*
-  %ret = call anyregcc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 5, i32 20, i8* %f, i32 8, i8* %a1, i8* %a2, i8* %a3, i8* %a4, i8* %a5, i8* %a6, i8* %a7, i8* %a8, i8* %a9, i8* %a10, i8* %a11, i8* %a12, i8* %a13)
+  %ret = call anyregcc i64 (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.i64(i64 5, i32 20, i8* %f, i32 8, i8* %a1, i8* %a2, i8* %a3, i8* %a4, i8* %a5, i8* %a6, i8* %a7, i8* %a8, i8* %a9, i8* %a10, i8* %a11, i8* %a12, i8* %a13)
   ret i64 %ret
 }
 
@@ -315,7 +315,7 @@ entry:
 ; CHECK-NEXT: .long  0
 define i64 @patchpoint_spilldef(i64 %p1, i64 %p2, i64 %p3, i64 %p4) {
 entry:
-  %result = tail call anyregcc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 12, i32 16, i8* inttoptr (i64 0 to i8*), i32 2, i64 %p1, i64 %p2)
+  %result = tail call anyregcc i64 (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.i64(i64 12, i32 16, i8* inttoptr (i64 0 to i8*), i32 2, i64 %p1, i64 %p2)
   tail call void asm sideeffect "nop", "~{x0},~{x1},~{x2},~{x3},~{x4},~{x5},~{x6},~{x7},~{x8},~{x9},~{x10},~{x11},~{x12},~{x13},~{x14},~{x15},~{x16},~{x17},~{x18},~{x19},~{x20},~{x21},~{x22},~{x23},~{x24},~{x25},~{x26},~{x27},~{x28},~{x29},~{x30},~{x31}"() nounwind
   ret i64 %result
 }
@@ -355,7 +355,7 @@ entry:
 define i64 @patchpoint_spillargs(i64 %p1, i64 %p2, i64 %p3, i64 %p4) {
 entry:
   tail call void asm sideeffect "nop", "~{x0},~{x1},~{x2},~{x3},~{x4},~{x5},~{x6},~{x7},~{x8},~{x9},~{x10},~{x11},~{x12},~{x13},~{x14},~{x15},~{x16},~{x17},~{x18},~{x19},~{x20},~{x21},~{x22},~{x23},~{x24},~{x25},~{x26},~{x27},~{x28},~{x29},~{x30},~{x31}"() nounwind
-  %result = tail call anyregcc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 13, i32 16, i8* inttoptr (i64 0 to i8*), i32 2, i64 %p1, i64 %p2, i64 %p3, i64 %p4)
+  %result = tail call anyregcc i64 (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.i64(i64 13, i32 16, i8* inttoptr (i64 0 to i8*), i32 2, i64 %p1, i64 %p2, i64 %p3, i64 %p4)
   ret i64 %result
 }
 
diff --git a/test/CodeGen/AArch64/arm64-big-endian-vector-caller.ll b/test/CodeGen/AArch64/arm64-big-endian-vector-caller.ll
index c280bef..d089767 100644
--- a/test/CodeGen/AArch64/arm64-big-endian-vector-caller.ll
+++ b/test/CodeGen/AArch64/arm64-big-endian-vector-caller.ll
@@ -1,6 +1,10 @@
 ; RUN: llc -mtriple aarch64_be < %s -aarch64-load-store-opt=false -o - | FileCheck %s
 ; RUN: llc -mtriple aarch64_be < %s -aarch64-load-store-opt=false -fast-isel=true -O0 -o - | FileCheck %s
 
+; Note, we split the functions in to multiple BBs below to isolate the call
+; instruction we want to test, from fast-isel failing to select instructions
+; after it.
+
 ; CHECK-LABEL: test_i64_f64:
 declare i64 @test_i64_f64_helper(double %p)
 define void @test_i64_f64(double* %p, i64* %q) {
@@ -8,6 +12,8 @@ define void @test_i64_f64(double* %p, i64* %q) {
     %1 = load double, double* %p
     %2 = fadd double %1, %1
     %3 = call i64 @test_i64_f64_helper(double %2)
+    br label %return_bb
+return_bb:
     %4 = add i64 %3, %3
     store i64 %4, i64* %q
     ret void
@@ -20,6 +26,8 @@ define void @test_i64_v1i64(<1 x i64>* %p, i64* %q) {
     %1 = load <1 x i64>, <1 x i64>* %p
     %2 = add <1 x i64> %1, %1
     %3 = call i64 @test_i64_v1i64_helper(<1 x i64> %2)
+    br label %return_bb
+return_bb:
     %4 = add i64 %3, %3
     store i64 %4, i64* %q
     ret void
@@ -32,6 +40,8 @@ define void @test_i64_v2f32(<2 x float>* %p, i64* %q) {
     %1 = load <2 x float>, <2 x float>* %p
     %2 = fadd <2 x float> %1, %1
     %3 = call i64 @test_i64_v2f32_helper(<2 x float> %2)
+    br label %return_bb
+return_bb:
     %4 = add i64 %3, %3
     store i64 %4, i64* %q
     ret void
@@ -44,6 +54,8 @@ define void @test_i64_v2i32(<2 x i32>* %p, i64* %q) {
     %1 = load <2 x i32>, <2 x i32>* %p
     %2 = add <2 x i32> %1, %1
     %3 = call i64 @test_i64_v2i32_helper(<2 x i32> %2)
+    br label %return_bb
+return_bb:
     %4 = add i64 %3, %3
     store i64 %4, i64* %q
     ret void
@@ -56,6 +68,8 @@ define void @test_i64_v4i16(<4 x i16>* %p, i64* %q) {
     %1 = load <4 x i16>, <4 x i16>* %p
     %2 = add <4 x i16> %1, %1
     %3 = call i64 @test_i64_v4i16_helper(<4 x i16> %2)
+    br label %return_bb
+return_bb:
     %4 = add i64 %3, %3
     store i64 %4, i64* %q
     ret void
@@ -68,6 +82,8 @@ define void @test_i64_v8i8(<8 x i8>* %p, i64* %q) {
     %1 = load <8 x i8>, <8 x i8>* %p
     %2 = add <8 x i8> %1, %1
     %3 = call i64 @test_i64_v8i8_helper(<8 x i8> %2)
+    br label %return_bb
+return_bb:
     %4 = add i64 %3, %3
     store i64 %4, i64* %q
     ret void
@@ -80,6 +96,8 @@ define void @test_f64_i64(i64* %p, double* %q) {
     %1 = load i64, i64* %p
     %2 = add i64 %1, %1
     %3 = call double @test_f64_i64_helper(i64 %2)
+    br label %return_bb
+return_bb:
     %4 = fadd double %3, %3
     store double %4, double* %q
     ret void
@@ -92,6 +110,8 @@ define void @test_f64_v1i64(<1 x i64>* %p, double* %q) {
     %1 = load <1 x i64>, <1 x i64>* %p
     %2 = add <1 x i64> %1, %1
     %3 = call double @test_f64_v1i64_helper(<1 x i64> %2)
+    br label %return_bb
+return_bb:
     %4 = fadd double %3, %3
     store double %4, double* %q
     ret void
@@ -104,6 +124,8 @@ define void @test_f64_v2f32(<2 x float>* %p, double* %q) {
     %1 = load <2 x float>, <2 x float>* %p
     %2 = fadd <2 x float> %1, %1
     %3 = call double @test_f64_v2f32_helper(<2 x float> %2)
+    br label %return_bb
+return_bb:
     %4 = fadd double %3, %3
     store double %4, double* %q
     ret void
@@ -116,6 +138,8 @@ define void @test_f64_v2i32(<2 x i32>* %p, double* %q) {
     %1 = load <2 x i32>, <2 x i32>* %p
     %2 = add <2 x i32> %1, %1
     %3 = call double @test_f64_v2i32_helper(<2 x i32> %2)
+    br label %return_bb
+return_bb:
     %4 = fadd double %3, %3
     store double %4, double* %q
     ret void
@@ -128,6 +152,8 @@ define void @test_f64_v4i16(<4 x i16>* %p, double* %q) {
     %1 = load <4 x i16>, <4 x i16>* %p
     %2 = add <4 x i16> %1, %1
     %3 = call double @test_f64_v4i16_helper(<4 x i16> %2)
+    br label %return_bb
+return_bb:
     %4 = fadd double %3, %3
     store double %4, double* %q
     ret void
@@ -140,6 +166,8 @@ define void @test_f64_v8i8(<8 x i8>* %p, double* %q) {
     %1 = load <8 x i8>, <8 x i8>* %p
     %2 = add <8 x i8> %1, %1
     %3 = call double @test_f64_v8i8_helper(<8 x i8> %2)
+    br label %return_bb
+return_bb:
     %4 = fadd double %3, %3
     store double %4, double* %q
     ret void
@@ -152,6 +180,8 @@ define void @test_v1i64_i64(i64* %p, <1 x i64>* %q) {
     %1 = load i64, i64* %p
     %2 = add i64 %1, %1
     %3 = call <1 x i64> @test_v1i64_i64_helper(i64 %2)
+    br label %return_bb
+return_bb:
     %4 = add <1 x i64> %3, %3
     store <1 x i64> %4, <1 x i64>* %q
     ret void
@@ -164,6 +194,8 @@ define void @test_v1i64_f64(double* %p, <1 x i64>* %q) {
     %1 = load double, double* %p
     %2 = fadd double %1, %1
     %3 = call <1 x i64> @test_v1i64_f64_helper(double %2)
+    br label %return_bb
+return_bb:
     %4 = add <1 x i64> %3, %3
     store <1 x i64> %4, <1 x i64>* %q
     ret void
@@ -176,6 +208,8 @@ define void @test_v1i64_v2f32(<2 x float>* %p, <1 x i64>* %q) {
     %1 = load <2 x float>, <2 x float>* %p
     %2 = fadd <2 x float> %1, %1
     %3 = call <1 x i64> @test_v1i64_v2f32_helper(<2 x float> %2)
+    br label %return_bb
+return_bb:
     %4 = add <1 x i64> %3, %3
     store <1 x i64> %4, <1 x i64>* %q
     ret void
@@ -188,6 +222,8 @@ define void @test_v1i64_v2i32(<2 x i32>* %p, <1 x i64>* %q) {
     %1 = load <2 x i32>, <2 x i32>* %p
     %2 = add <2 x i32> %1, %1
     %3 = call <1 x i64> @test_v1i64_v2i32_helper(<2 x i32> %2)
+    br label %return_bb
+return_bb:
     %4 = add <1 x i64> %3, %3
     store <1 x i64> %4, <1 x i64>* %q
     ret void
@@ -200,6 +236,8 @@ define void @test_v1i64_v4i16(<4 x i16>* %p, <1 x i64>* %q) {
     %1 = load <4 x i16>, <4 x i16>* %p
     %2 = add <4 x i16> %1, %1
     %3 = call <1 x i64> @test_v1i64_v4i16_helper(<4 x i16> %2)
+    br label %return_bb
+return_bb:
     %4 = add <1 x i64> %3, %3
     store <1 x i64> %4, <1 x i64>* %q
     ret void
@@ -212,6 +250,8 @@ define void @test_v1i64_v8i8(<8 x i8>* %p, <1 x i64>* %q) {
     %1 = load <8 x i8>, <8 x i8>* %p
     %2 = add <8 x i8> %1, %1
     %3 = call <1 x i64> @test_v1i64_v8i8_helper(<8 x i8> %2)
+    br label %return_bb
+return_bb:
     %4 = add <1 x i64> %3, %3
     store <1 x i64> %4, <1 x i64>* %q
     ret void
@@ -224,6 +264,8 @@ define void @test_v2f32_i64(i64* %p, <2 x float>* %q) {
     %1 = load i64, i64* %p
     %2 = add i64 %1, %1
     %3 = call <2 x float> @test_v2f32_i64_helper(i64 %2)
+    br label %return_bb
+return_bb:
     %4 = fadd <2 x float> %3, %3
     store <2 x float> %4, <2 x float>* %q
     ret void
@@ -236,6 +278,8 @@ define void @test_v2f32_f64(double* %p, <2 x float>* %q) {
     %1 = load double, double* %p
     %2 = fadd double %1, %1
     %3 = call <2 x float> @test_v2f32_f64_helper(double %2)
+    br label %return_bb
+return_bb:
     %4 = fadd <2 x float> %3, %3
     store <2 x float> %4, <2 x float>* %q
     ret void
@@ -248,6 +292,8 @@ define void @test_v2f32_v1i64(<1 x i64>* %p, <2 x float>* %q) {
     %1 = load <1 x i64>, <1 x i64>* %p
     %2 = add <1 x i64> %1, %1
     %3 = call <2 x float> @test_v2f32_v1i64_helper(<1 x i64> %2)
+    br label %return_bb
+return_bb:
     %4 = fadd <2 x float> %3, %3
     store <2 x float> %4, <2 x float>* %q
     ret void
@@ -261,6 +307,8 @@ define void @test_v2f32_v2i32(<2 x i32>* %p, <2 x float>* %q) {
     %1 = load <2 x i32>, <2 x i32>* %p
     %2 = add <2 x i32> %1, %1
     %3 = call <2 x float> @test_v2f32_v2i32_helper(<2 x i32> %2)
+    br label %return_bb
+return_bb:
     %4 = fadd <2 x float> %3, %3
     store <2 x float> %4, <2 x float>* %q
     ret void
@@ -274,6 +322,8 @@ define void @test_v2f32_v4i16(<4 x i16>* %p, <2 x float>* %q) {
     %1 = load <4 x i16>, <4 x i16>* %p
     %2 = add <4 x i16> %1, %1
     %3 = call <2 x float> @test_v2f32_v4i16_helper(<4 x i16> %2)
+    br label %return_bb
+return_bb:
     %4 = fadd <2 x float> %3, %3
     store <2 x float> %4, <2 x float>* %q
     ret void
@@ -287,6 +337,8 @@ define void @test_v2f32_v8i8(<8 x i8>* %p, <2 x float>* %q) {
     %1 = load <8 x i8>, <8 x i8>* %p
     %2 = add <8 x i8> %1, %1
     %3 = call <2 x float> @test_v2f32_v8i8_helper(<8 x i8> %2)
+    br label %return_bb
+return_bb:
     %4 = fadd <2 x float> %3, %3
     store <2 x float> %4, <2 x float>* %q
     ret void
@@ -299,6 +351,8 @@ define void @test_v2i32_i64(i64* %p, <2 x i32>* %q) {
     %1 = load i64, i64* %p
     %2 = add i64 %1, %1
     %3 = call <2 x i32> @test_v2i32_i64_helper(i64 %2)
+    br label %return_bb
+return_bb:
     %4 = add <2 x i32> %3, %3
     store <2 x i32> %4, <2 x i32>* %q
     ret void
@@ -311,6 +365,8 @@ define void @test_v2i32_f64(double* %p, <2 x i32>* %q) {
     %1 = load double, double* %p
     %2 = fadd double %1, %1
     %3 = call <2 x i32> @test_v2i32_f64_helper(double %2)
+    br label %return_bb
+return_bb:
     %4 = add <2 x i32> %3, %3
     store <2 x i32> %4, <2 x i32>* %q
     ret void
@@ -323,6 +379,8 @@ define void @test_v2i32_v1i64(<1 x i64>* %p, <2 x i32>* %q) {
     %1 = load <1 x i64>, <1 x i64>* %p
     %2 = add <1 x i64> %1, %1
     %3 = call <2 x i32> @test_v2i32_v1i64_helper(<1 x i64> %2)
+    br label %return_bb
+return_bb:
     %4 = add <2 x i32> %3, %3
     store <2 x i32> %4, <2 x i32>* %q
     ret void
@@ -336,6 +394,8 @@ define void @test_v2i32_v2f32(<2 x float>* %p, <2 x i32>* %q) {
     %1 = load <2 x float>, <2 x float>* %p
     %2 = fadd <2 x float> %1, %1
     %3 = call <2 x i32> @test_v2i32_v2f32_helper(<2 x float> %2)
+    br label %return_bb
+return_bb:
     %4 = add <2 x i32> %3, %3
     store <2 x i32> %4, <2 x i32>* %q
     ret void
@@ -349,6 +409,8 @@ define void @test_v2i32_v4i16(<4 x i16>* %p, <2 x i32>* %q) {
     %1 = load <4 x i16>, <4 x i16>* %p
     %2 = add <4 x i16> %1, %1
     %3 = call <2 x i32> @test_v2i32_v4i16_helper(<4 x i16> %2)
+    br label %return_bb
+return_bb:
     %4 = add <2 x i32> %3, %3
     store <2 x i32> %4, <2 x i32>* %q
     ret void
@@ -362,6 +424,8 @@ define void @test_v2i32_v8i8(<8 x i8>* %p, <2 x i32>* %q) {
     %1 = load <8 x i8>, <8 x i8>* %p
     %2 = add <8 x i8> %1, %1
     %3 = call <2 x i32> @test_v2i32_v8i8_helper(<8 x i8> %2)
+    br label %return_bb
+return_bb:
     %4 = add <2 x i32> %3, %3
     store <2 x i32> %4, <2 x i32>* %q
     ret void
@@ -374,6 +438,8 @@ define void @test_v4i16_i64(i64* %p, <4 x i16>* %q) {
     %1 = load i64, i64* %p
     %2 = add i64 %1, %1
     %3 = call <4 x i16> @test_v4i16_i64_helper(i64 %2)
+    br label %return_bb
+return_bb:
     %4 = add <4 x i16> %3, %3
     store <4 x i16> %4, <4 x i16>* %q
     ret void
@@ -386,6 +452,8 @@ define void @test_v4i16_f64(double* %p, <4 x i16>* %q) {
     %1 = load double, double* %p
     %2 = fadd double %1, %1
     %3 = call <4 x i16> @test_v4i16_f64_helper(double %2)
+    br label %return_bb
+return_bb:
     %4 = add <4 x i16> %3, %3
     store <4 x i16> %4, <4 x i16>* %q
     ret void
@@ -398,6 +466,8 @@ define void @test_v4i16_v1i64(<1 x i64>* %p, <4 x i16>* %q) {
     %1 = load <1 x i64>, <1 x i64>* %p
     %2 = add <1 x i64> %1, %1
     %3 = call <4 x i16> @test_v4i16_v1i64_helper(<1 x i64> %2)
+    br label %return_bb
+return_bb:
     %4 = add <4 x i16> %3, %3
     store <4 x i16> %4, <4 x i16>* %q
     ret void
@@ -411,6 +481,8 @@ define void @test_v4i16_v2f32(<2 x float>* %p, <4 x i16>* %q) {
     %1 = load <2 x float>, <2 x float>* %p
     %2 = fadd <2 x float> %1, %1
     %3 = call <4 x i16> @test_v4i16_v2f32_helper(<2 x float> %2)
+    br label %return_bb
+return_bb:
     %4 = add <4 x i16> %3, %3
     store <4 x i16> %4, <4 x i16>* %q
     ret void
@@ -424,6 +496,8 @@ define void @test_v4i16_v2i32(<2 x i32>* %p, <4 x i16>* %q) {
     %1 = load <2 x i32>, <2 x i32>* %p
     %2 = add <2 x i32> %1, %1
     %3 = call <4 x i16> @test_v4i16_v2i32_helper(<2 x i32> %2)
+    br label %return_bb
+return_bb:
     %4 = add <4 x i16> %3, %3
     store <4 x i16> %4, <4 x i16>* %q
     ret void
@@ -437,6 +511,8 @@ define void @test_v4i16_v8i8(<8 x i8>* %p, <4 x i16>* %q) {
     %1 = load <8 x i8>, <8 x i8>* %p
     %2 = add <8 x i8> %1, %1
     %3 = call <4 x i16> @test_v4i16_v8i8_helper(<8 x i8> %2)
+    br label %return_bb
+return_bb:
     %4 = add <4 x i16> %3, %3
     store <4 x i16> %4, <4 x i16>* %q
     ret void
@@ -449,6 +525,8 @@ define void @test_v8i8_i64(i64* %p, <8 x i8>* %q) {
     %1 = load i64, i64* %p
     %2 = add i64 %1, %1
     %3 = call <8 x i8> @test_v8i8_i64_helper(i64 %2)
+    br label %return_bb
+return_bb:
     %4 = add <8 x i8> %3, %3
     store <8 x i8> %4, <8 x i8>* %q
     ret void
@@ -461,6 +539,8 @@ define void @test_v8i8_f64(double* %p, <8 x i8>* %q) {
     %1 = load double, double* %p
     %2 = fadd double %1, %1
     %3 = call <8 x i8> @test_v8i8_f64_helper(double %2)
+    br label %return_bb
+return_bb:
     %4 = add <8 x i8> %3, %3
     store <8 x i8> %4, <8 x i8>* %q
     ret void
@@ -473,6 +553,8 @@ define void @test_v8i8_v1i64(<1 x i64>* %p, <8 x i8>* %q) {
     %1 = load <1 x i64>, <1 x i64>* %p
     %2 = add <1 x i64> %1, %1
     %3 = call <8 x i8> @test_v8i8_v1i64_helper(<1 x i64> %2)
+    br label %return_bb
+return_bb:
     %4 = add <8 x i8> %3, %3
     store <8 x i8> %4, <8 x i8>* %q
     ret void
@@ -486,6 +568,8 @@ define void @test_v8i8_v2f32(<2 x float>* %p, <8 x i8>* %q) {
     %1 = load <2 x float>, <2 x float>* %p
     %2 = fadd <2 x float> %1, %1
     %3 = call <8 x i8> @test_v8i8_v2f32_helper(<2 x float> %2)
+    br label %return_bb
+return_bb:
     %4 = add <8 x i8> %3, %3
     store <8 x i8> %4, <8 x i8>* %q
     ret void
@@ -499,6 +583,8 @@ define void @test_v8i8_v2i32(<2 x i32>* %p, <8 x i8>* %q) {
     %1 = load <2 x i32>, <2 x i32>* %p
     %2 = add <2 x i32> %1, %1
     %3 = call <8 x i8> @test_v8i8_v2i32_helper(<2 x i32> %2)
+    br label %return_bb
+return_bb:
     %4 = add <8 x i8> %3, %3
     store <8 x i8> %4, <8 x i8>* %q
     ret void
@@ -512,6 +598,8 @@ define void @test_v8i8_v4i16(<4 x i16>* %p, <8 x i8>* %q) {
     %1 = load <4 x i16>, <4 x i16>* %p
     %2 = add <4 x i16> %1, %1
     %3 = call <8 x i8> @test_v8i8_v4i16_helper(<4 x i16> %2)
+    br label %return_bb
+return_bb:
     %4 = add <8 x i8> %3, %3
     store <8 x i8> %4, <8 x i8>* %q
     ret void
@@ -524,6 +612,8 @@ define void @test_f128_v2f64(<2 x double>* %p, fp128* %q) {
     %1 = load <2 x double>, <2 x double>* %p
     %2 = fadd <2 x double> %1, %1
     %3 = call fp128 @test_f128_v2f64_helper(<2 x double> %2)
+    br label %return_bb
+return_bb:
     %4 = fadd fp128 %3, %3
     store fp128 %4, fp128* %q
     ret void
@@ -536,6 +626,8 @@ define void @test_f128_v2i64(<2 x i64>* %p, fp128* %q) {
     %1 = load <2 x i64>, <2 x i64>* %p
     %2 = add <2 x i64> %1, %1
     %3 = call fp128 @test_f128_v2i64_helper(<2 x i64> %2)
+    br label %return_bb
+return_bb:
     %4 = fadd fp128 %3, %3
     store fp128 %4, fp128* %q
     ret void
@@ -549,6 +641,8 @@ define void @test_f128_v4f32(<4 x float>* %p, fp128* %q) {
     %1 = load <4 x float>, <4 x float>* %p
     %2 = fadd <4 x float> %1, %1
     %3 = call fp128 @test_f128_v4f32_helper(<4 x float> %2)
+    br label %return_bb
+return_bb:
     %4 = fadd fp128 %3, %3
     store fp128 %4, fp128* %q
     ret void
@@ -562,6 +656,8 @@ define void @test_f128_v4i32(<4 x i32>* %p, fp128* %q) {
     %1 = load <4 x i32>, <4 x i32>* %p
     %2 = add <4 x i32> %1, %1
     %3 = call fp128 @test_f128_v4i32_helper(<4 x i32> %2)
+    br label %return_bb
+return_bb:
     %4 = fadd fp128 %3, %3
     store fp128 %4, fp128* %q
     ret void
@@ -575,6 +671,8 @@ define void @test_f128_v8i16(<8 x i16>* %p, fp128* %q) {
     %1 = load <8 x i16>, <8 x i16>* %p
     %2 = add <8 x i16> %1, %1
     %3 = call fp128 @test_f128_v8i16_helper(<8 x i16> %2)
+    br label %return_bb
+return_bb:
     %4 = fadd fp128 %3, %3
     store fp128 %4, fp128* %q
     ret void
@@ -588,6 +686,8 @@ define void @test_f128_v16i8(<16 x i8>* %p, fp128* %q) {
     %1 = load <16 x i8>, <16 x i8>* %p
     %2 = add <16 x i8> %1, %1
     %3 = call fp128 @test_f128_v16i8_helper(<16 x i8> %2)
+    br label %return_bb
+return_bb:
     %4 = fadd fp128 %3, %3
     store fp128 %4, fp128* %q
     ret void
@@ -600,6 +700,8 @@ define void @test_v2f64_f128(fp128* %p, <2 x double>* %q) {
     %1 = load fp128, fp128* %p
     %2 = fadd fp128 %1, %1
     %3 = call <2 x double> @test_v2f64_f128_helper(fp128 %2)
+    br label %return_bb
+return_bb:
     %4 = fadd <2 x double> %3, %3
     store <2 x double> %4, <2 x double>* %q
     ret void
@@ -613,6 +715,8 @@ define void @test_v2f64_v2i64(<2 x i64>* %p, <2 x double>* %q) {
     %1 = load <2 x i64>, <2 x i64>* %p
     %2 = add <2 x i64> %1, %1
     %3 = call <2 x double> @test_v2f64_v2i64_helper(<2 x i64> %2)
+    br label %return_bb
+return_bb:
     %4 = fadd <2 x double> %3, %3
     store <2 x double> %4, <2 x double>* %q
     ret void
@@ -627,6 +731,8 @@ define void @test_v2f64_v4f32(<4 x float>* %p, <2 x double>* %q) {
     %1 = load <4 x float>, <4 x float>* %p
     %2 = fadd <4 x float> %1, %1
     %3 = call <2 x double> @test_v2f64_v4f32_helper(<4 x float> %2)
+    br label %return_bb
+return_bb:
     %4 = fadd <2 x double> %3, %3
     store <2 x double> %4, <2 x double>* %q
     ret void
@@ -641,6 +747,8 @@ define void @test_v2f64_v4i32(<4 x i32>* %p, <2 x double>* %q) {
     %1 = load <4 x i32>, <4 x i32>* %p
     %2 = add <4 x i32> %1, %1
     %3 = call <2 x double> @test_v2f64_v4i32_helper(<4 x i32> %2)
+    br label %return_bb
+return_bb:
     %4 = fadd <2 x double> %3, %3
     store <2 x double> %4, <2 x double>* %q
     ret void
@@ -655,6 +763,8 @@ define void @test_v2f64_v8i16(<8 x i16>* %p, <2 x double>* %q) {
     %1 = load <8 x i16>, <8 x i16>* %p
     %2 = add <8 x i16> %1, %1
     %3 = call <2 x double> @test_v2f64_v8i16_helper(<8 x i16> %2)
+    br label %return_bb
+return_bb:
     %4 = fadd <2 x double> %3, %3
     store <2 x double> %4, <2 x double>* %q
     ret void
@@ -669,6 +779,8 @@ define void @test_v2f64_v16i8(<16 x i8>* %p, <2 x double>* %q) {
     %1 = load <16 x i8>, <16 x i8>* %p
     %2 = add <16 x i8> %1, %1
     %3 = call <2 x double> @test_v2f64_v16i8_helper(<16 x i8> %2)
+    br label %return_bb
+return_bb:
     %4 = fadd <2 x double> %3, %3
     store <2 x double> %4, <2 x double>* %q
     ret void
@@ -681,6 +793,8 @@ define void @test_v2i64_f128(fp128* %p, <2 x i64>* %q) {
     %1 = load fp128, fp128* %p
     %2 = fadd fp128 %1, %1
     %3 = call <2 x i64> @test_v2i64_f128_helper(fp128 %2)
+    br label %return_bb
+return_bb:
     %4 = add <2 x i64> %3, %3
     store <2 x i64> %4, <2 x i64>* %q
     ret void
@@ -694,6 +808,8 @@ define void @test_v2i64_v2f64(<2 x double>* %p, <2 x i64>* %q) {
     %1 = load <2 x double>, <2 x double>* %p
     %2 = fadd <2 x double> %1, %1
     %3 = call <2 x i64> @test_v2i64_v2f64_helper(<2 x double> %2)
+    br label %return_bb
+return_bb:
     %4 = add <2 x i64> %3, %3
     store <2 x i64> %4, <2 x i64>* %q
     ret void
@@ -708,6 +824,8 @@ define void @test_v2i64_v4f32(<4 x float>* %p, <2 x i64>* %q) {
     %1 = load <4 x float>, <4 x float>* %p
     %2 = fadd <4 x float> %1, %1
     %3 = call <2 x i64> @test_v2i64_v4f32_helper(<4 x float> %2)
+    br label %return_bb
+return_bb:
     %4 = add <2 x i64> %3, %3
     store <2 x i64> %4, <2 x i64>* %q
     ret void
@@ -722,6 +840,8 @@ define void @test_v2i64_v4i32(<4 x i32>* %p, <2 x i64>* %q) {
     %1 = load <4 x i32>, <4 x i32>* %p
     %2 = add <4 x i32> %1, %1
     %3 = call <2 x i64> @test_v2i64_v4i32_helper(<4 x i32> %2)
+    br label %return_bb
+return_bb:
     %4 = add <2 x i64> %3, %3
     store <2 x i64> %4, <2 x i64>* %q
     ret void
@@ -736,6 +856,8 @@ define void @test_v2i64_v8i16(<8 x i16>* %p, <2 x i64>* %q) {
     %1 = load <8 x i16>, <8 x i16>* %p
     %2 = add <8 x i16> %1, %1
     %3 = call <2 x i64> @test_v2i64_v8i16_helper(<8 x i16> %2)
+    br label %return_bb
+return_bb:
     %4 = add <2 x i64> %3, %3
     store <2 x i64> %4, <2 x i64>* %q
     ret void
@@ -750,6 +872,8 @@ define void @test_v2i64_v16i8(<16 x i8>* %p, <2 x i64>* %q) {
     %1 = load <16 x i8>, <16 x i8>* %p
     %2 = add <16 x i8> %1, %1
     %3 = call <2 x i64> @test_v2i64_v16i8_helper(<16 x i8> %2)
+    br label %return_bb
+return_bb:
     %4 = add <2 x i64> %3, %3
     store <2 x i64> %4, <2 x i64>* %q
     ret void
@@ -763,6 +887,8 @@ define void @test_v4f32_f128(fp128* %p, <4 x float>* %q) {
     %1 = load fp128, fp128* %p
     %2 = fadd fp128 %1, %1
     %3 = call <4 x float> @test_v4f32_f128_helper(fp128 %2)
+    br label %return_bb
+return_bb:
     %4 = fadd <4 x float> %3, %3
     store <4 x float> %4, <4 x float>* %q
     ret void
@@ -777,6 +903,8 @@ define void @test_v4f32_v2f64(<2 x double>* %p, <4 x float>* %q) {
     %1 = load <2 x double>, <2 x double>* %p
     %2 = fadd <2 x double> %1, %1
     %3 = call <4 x float> @test_v4f32_v2f64_helper(<2 x double> %2)
+    br label %return_bb
+return_bb:
     %4 = fadd <4 x float> %3, %3
     store <4 x float> %4, <4 x float>* %q
     ret void
@@ -791,6 +919,8 @@ define void @test_v4f32_v2i64(<2 x i64>* %p, <4 x float>* %q) {
     %1 = load <2 x i64>, <2 x i64>* %p
     %2 = add <2 x i64> %1, %1
     %3 = call <4 x float> @test_v4f32_v2i64_helper(<2 x i64> %2)
+    br label %return_bb
+return_bb:
     %4 = fadd <4 x float> %3, %3
     store <4 x float> %4, <4 x float>* %q
     ret void
@@ -806,6 +936,8 @@ define void @test_v4f32_v4i32(<4 x i32>* %p, <4 x float>* %q) {
     %1 = load <4 x i32>, <4 x i32>* %p
     %2 = add <4 x i32> %1, %1
     %3 = call <4 x float> @test_v4f32_v4i32_helper(<4 x i32> %2)
+    br label %return_bb
+return_bb:
     %4 = fadd <4 x float> %3, %3
     store <4 x float> %4, <4 x float>* %q
     ret void
@@ -821,6 +953,8 @@ define void @test_v4f32_v8i16(<8 x i16>* %p, <4 x float>* %q) {
     %1 = load <8 x i16>, <8 x i16>* %p
     %2 = add <8 x i16> %1, %1
     %3 = call <4 x float> @test_v4f32_v8i16_helper(<8 x i16> %2)
+    br label %return_bb
+return_bb:
     %4 = fadd <4 x float> %3, %3
     store <4 x float> %4, <4 x float>* %q
     ret void
@@ -836,6 +970,8 @@ define void @test_v4f32_v16i8(<16 x i8>* %p, <4 x float>* %q) {
     %1 = load <16 x i8>, <16 x i8>* %p
     %2 = add <16 x i8> %1, %1
     %3 = call <4 x float> @test_v4f32_v16i8_helper(<16 x i8> %2)
+    br label %return_bb
+return_bb:
     %4 = fadd <4 x float> %3, %3
     store <4 x float> %4, <4 x float>* %q
     ret void
@@ -849,6 +985,8 @@ define void @test_v4i32_f128(fp128* %p, <4 x i32>* %q) {
     %1 = load fp128, fp128* %p
     %2 = fadd fp128 %1, %1
     %3 = call <4 x i32> @test_v4i32_f128_helper(fp128 %2)
+    br label %return_bb
+return_bb:
     %4 = add <4 x i32> %3, %3
     store <4 x i32> %4, <4 x i32>* %q
     ret void
@@ -863,6 +1001,8 @@ define void @test_v4i32_v2f64(<2 x double>* %p, <4 x i32>* %q) {
     %1 = load <2 x double>, <2 x double>* %p
     %2 = fadd <2 x double> %1, %1
     %3 = call <4 x i32> @test_v4i32_v2f64_helper(<2 x double> %2)
+    br label %return_bb
+return_bb:
     %4 = add <4 x i32> %3, %3
     store <4 x i32> %4, <4 x i32>* %q
     ret void
@@ -877,6 +1017,8 @@ define void @test_v4i32_v2i64(<2 x i64>* %p, <4 x i32>* %q) {
     %1 = load <2 x i64>, <2 x i64>* %p
     %2 = add <2 x i64> %1, %1
     %3 = call <4 x i32> @test_v4i32_v2i64_helper(<2 x i64> %2)
+    br label %return_bb
+return_bb:
     %4 = add <4 x i32> %3, %3
     store <4 x i32> %4, <4 x i32>* %q
     ret void
@@ -892,6 +1034,8 @@ define void @test_v4i32_v4f32(<4 x float>* %p, <4 x i32>* %q) {
     %1 = load <4 x float>, <4 x float>* %p
     %2 = fadd <4 x float> %1, %1
     %3 = call <4 x i32> @test_v4i32_v4f32_helper(<4 x float> %2)
+    br label %return_bb
+return_bb:
     %4 = add <4 x i32> %3, %3
     store <4 x i32> %4, <4 x i32>* %q
     ret void
@@ -907,6 +1051,8 @@ define void @test_v4i32_v8i16(<8 x i16>* %p, <4 x i32>* %q) {
     %1 = load <8 x i16>, <8 x i16>* %p
     %2 = add <8 x i16> %1, %1
     %3 = call <4 x i32> @test_v4i32_v8i16_helper(<8 x i16> %2)
+    br label %return_bb
+return_bb:
     %4 = add <4 x i32> %3, %3
     store <4 x i32> %4, <4 x i32>* %q
     ret void
@@ -922,6 +1068,8 @@ define void @test_v4i32_v16i8(<16 x i8>* %p, <4 x i32>* %q) {
     %1 = load <16 x i8>, <16 x i8>* %p
     %2 = add <16 x i8> %1, %1
     %3 = call <4 x i32> @test_v4i32_v16i8_helper(<16 x i8> %2)
+    br label %return_bb
+return_bb:
     %4 = add <4 x i32> %3, %3
     store <4 x i32> %4, <4 x i32>* %q
     ret void
@@ -935,6 +1083,8 @@ define void @test_v8i16_f128(fp128* %p, <8 x i16>* %q) {
     %1 = load fp128, fp128* %p
     %2 = fadd fp128 %1, %1
     %3 = call <8 x i16> @test_v8i16_f128_helper(fp128 %2)
+    br label %return_bb
+return_bb:
     %4 = add <8 x i16> %3, %3
     store <8 x i16> %4, <8 x i16>* %q
     ret void
@@ -949,6 +1099,8 @@ define void @test_v8i16_v2f64(<2 x double>* %p, <8 x i16>* %q) {
     %1 = load <2 x double>, <2 x double>* %p
     %2 = fadd <2 x double> %1, %1
     %3 = call <8 x i16> @test_v8i16_v2f64_helper(<2 x double> %2)
+    br label %return_bb
+return_bb:
     %4 = add <8 x i16> %3, %3
     store <8 x i16> %4, <8 x i16>* %q
     ret void
@@ -963,6 +1115,8 @@ define void @test_v8i16_v2i64(<2 x i64>* %p, <8 x i16>* %q) {
     %1 = load <2 x i64>, <2 x i64>* %p
     %2 = add <2 x i64> %1, %1
     %3 = call <8 x i16> @test_v8i16_v2i64_helper(<2 x i64> %2)
+    br label %return_bb
+return_bb:
     %4 = add <8 x i16> %3, %3
     store <8 x i16> %4, <8 x i16>* %q
     ret void
@@ -978,6 +1132,8 @@ define void @test_v8i16_v4f32(<4 x float>* %p, <8 x i16>* %q) {
     %1 = load <4 x float>, <4 x float>* %p
     %2 = fadd <4 x float> %1, %1
     %3 = call <8 x i16> @test_v8i16_v4f32_helper(<4 x float> %2)
+    br label %return_bb
+return_bb:
     %4 = add <8 x i16> %3, %3
     store <8 x i16> %4, <8 x i16>* %q
     ret void
@@ -993,6 +1149,8 @@ define void @test_v8i16_v4i32(<4 x i32>* %p, <8 x i16>* %q) {
     %1 = load <4 x i32>, <4 x i32>* %p
     %2 = add <4 x i32> %1, %1
     %3 = call <8 x i16> @test_v8i16_v4i32_helper(<4 x i32> %2)
+    br label %return_bb
+return_bb:
     %4 = add <8 x i16> %3, %3
     store <8 x i16> %4, <8 x i16>* %q
     ret void
@@ -1008,6 +1166,8 @@ define void @test_v8i16_v16i8(<16 x i8>* %p, <8 x i16>* %q) {
     %1 = load <16 x i8>, <16 x i8>* %p
     %2 = add <16 x i8> %1, %1
     %3 = call <8 x i16> @test_v8i16_v16i8_helper(<16 x i8> %2)
+    br label %return_bb
+return_bb:
     %4 = add <8 x i16> %3, %3
     store <8 x i16> %4, <8 x i16>* %q
     ret void
@@ -1021,6 +1181,8 @@ define void @test_v16i8_f128(fp128* %p, <16 x i8>* %q) {
     %1 = load fp128, fp128* %p
     %2 = fadd fp128 %1, %1
     %3 = call <16 x i8> @test_v16i8_f128_helper(fp128 %2)
+    br label %return_bb
+return_bb:
     %4 = add <16 x i8> %3, %3
     store <16 x i8> %4, <16 x i8>* %q
     ret void
@@ -1035,6 +1197,8 @@ define void @test_v16i8_v2f64(<2 x double>* %p, <16 x i8>* %q) {
     %1 = load <2 x double>, <2 x double>* %p
     %2 = fadd <2 x double> %1, %1
     %3 = call <16 x i8> @test_v16i8_v2f64_helper(<2 x double> %2)
+    br label %return_bb
+return_bb:
     %4 = add <16 x i8> %3, %3
     store <16 x i8> %4, <16 x i8>* %q
     ret void
@@ -1049,6 +1213,8 @@ define void @test_v16i8_v2i64(<2 x i64>* %p, <16 x i8>* %q) {
     %1 = load <2 x i64>, <2 x i64>* %p
     %2 = add <2 x i64> %1, %1
     %3 = call <16 x i8> @test_v16i8_v2i64_helper(<2 x i64> %2)
+    br label %return_bb
+return_bb:
     %4 = add <16 x i8> %3, %3
     store <16 x i8> %4, <16 x i8>* %q
     ret void
@@ -1064,6 +1230,8 @@ define void @test_v16i8_v4f32(<4 x float>* %p, <16 x i8>* %q) {
     %1 = load <4 x float>, <4 x float>* %p
     %2 = fadd <4 x float> %1, %1
     %3 = call <16 x i8> @test_v16i8_v4f32_helper(<4 x float> %2)
+    br label %return_bb
+return_bb:
     %4 = add <16 x i8> %3, %3
     store <16 x i8> %4, <16 x i8>* %q
     ret void
@@ -1079,6 +1247,8 @@ define void @test_v16i8_v4i32(<4 x i32>* %p, <16 x i8>* %q) {
     %1 = load <4 x i32>, <4 x i32>* %p
     %2 = add <4 x i32> %1, %1
     %3 = call <16 x i8> @test_v16i8_v4i32_helper(<4 x i32> %2)
+    br label %return_bb
+return_bb:
     %4 = add <16 x i8> %3, %3
     store <16 x i8> %4, <16 x i8>* %q
     ret void
@@ -1094,6 +1264,8 @@ define void @test_v16i8_v8i16(<8 x i16>* %p, <16 x i8>* %q) {
     %1 = load <8 x i16>, <8 x i16>* %p
     %2 = add <8 x i16> %1, %1
     %3 = call <16 x i8> @test_v16i8_v8i16_helper(<8 x i16> %2)
+    br label %return_bb
+return_bb:
     %4 = add <16 x i8> %3, %3
     store <16 x i8> %4, <16 x i8>* %q
     ret void
diff --git a/test/CodeGen/AArch64/arm64-call-tailcalls.ll b/test/CodeGen/AArch64/arm64-call-tailcalls.ll
index 71d9327..6621db2 100644
--- a/test/CodeGen/AArch64/arm64-call-tailcalls.ll
+++ b/test/CodeGen/AArch64/arm64-call-tailcalls.ll
@@ -53,9 +53,9 @@ bb:                                               ; preds = %entry
 
 define i32 @t8(i32 %x) nounwind ssp {
 ; CHECK-LABEL: t8:
+; CHECK: b	_c
 ; CHECK: b	_a
 ; CHECK: b	_b
-; CHECK: b	_c
   %and = and i32 %x, 1
   %tobool = icmp eq i32 %and, 0
   br i1 %tobool, label %if.end, label %if.then
diff --git a/test/CodeGen/AArch64/arm64-codegen-prepare-extload.ll b/test/CodeGen/AArch64/arm64-codegen-prepare-extload.ll
new file mode 100644
index 0000000..f0b8299
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-codegen-prepare-extload.ll
@@ -0,0 +1,638 @@
+; RUN: opt -codegenprepare < %s -mtriple=aarch64-apple-ios -S | FileCheck %s --check-prefix=OPTALL --check-prefix=OPT --check-prefix=NONSTRESS
+; RUN: opt -codegenprepare < %s -mtriple=aarch64-apple-ios -S -stress-cgp-ext-ld-promotion | FileCheck %s --check-prefix=OPTALL --check-prefix=OPT --check-prefix=STRESS
+; RUN: opt -codegenprepare < %s -mtriple=aarch64-apple-ios -S -disable-cgp-ext-ld-promotion | FileCheck %s --check-prefix=OPTALL --check-prefix=DISABLE
+
+; CodeGenPrepare should move the zext into the block with the load
+; so that SelectionDAG can select it with the load.
+;
+; OPTALL-LABEL: @foo
+; OPTALL: [[LD:%[a-zA-Z_0-9-]+]] = load i8, i8* %p
+; OPTALL-NEXT: [[ZEXT:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i32
+; OPTALL: store i32 [[ZEXT]], i32* %q
+; OPTALL: ret
+define void @foo(i8* %p, i32* %q) {
+entry:
+  %t = load i8, i8* %p
+  %a = icmp slt i8 %t, 20
+  br i1 %a, label %true, label %false
+true:
+  %s = zext i8 %t to i32
+  store i32 %s, i32* %q
+  ret void
+false:
+  ret void
+}
+
+; Check that we manage to form a zextload is an operation with only one
+; argument to explicitly extend is in the the way.
+; OPTALL-LABEL: @promoteOneArg
+; OPTALL: [[LD:%[a-zA-Z_0-9-]+]] = load i8, i8* %p
+; OPT-NEXT: [[ZEXT:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i32
+; OPT-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = add nuw i32 [[ZEXT]], 2
+; Make sure the operation is not promoted when the promotion pass is disabled.
+; DISABLE: [[ADD:%[a-zA-Z_0-9-]+]] = add nuw i8 [[LD]], 2
+; DISABLE: [[RES:%[a-zA-Z_0-9-]+]] = zext i8 [[ADD]] to i32
+; OPTALL: store i32 [[RES]], i32* %q
+; OPTALL: ret
+define void @promoteOneArg(i8* %p, i32* %q) {
+entry:
+  %t = load i8, i8* %p
+  %add = add nuw i8 %t, 2
+  %a = icmp slt i8 %t, 20
+  br i1 %a, label %true, label %false
+true:
+  %s = zext i8 %add to i32
+  store i32 %s, i32* %q
+  ret void
+false:
+  ret void
+}
+
+; Check that we manage to form a sextload is an operation with only one
+; argument to explicitly extend is in the the way.
+; Version with sext.
+; OPTALL-LABEL: @promoteOneArgSExt
+; OPTALL: [[LD:%[a-zA-Z_0-9-]+]] = load i8, i8* %p
+; OPT-NEXT: [[SEXT:%[a-zA-Z_0-9-]+]] = sext i8 [[LD]] to i32
+; OPT-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = add nsw i32 [[SEXT]], 2
+; DISABLE: [[ADD:%[a-zA-Z_0-9-]+]] = add nsw i8 [[LD]], 2
+; DISABLE: [[RES:%[a-zA-Z_0-9-]+]] = sext i8 [[ADD]] to i32
+; OPTALL: store i32 [[RES]], i32* %q
+; OPTALL: ret
+define void @promoteOneArgSExt(i8* %p, i32* %q) {
+entry:
+  %t = load i8, i8* %p
+  %add = add nsw i8 %t, 2
+  %a = icmp slt i8 %t, 20
+  br i1 %a, label %true, label %false
+true:
+  %s = sext i8 %add to i32
+  store i32 %s, i32* %q
+  ret void
+false:
+  ret void
+}
+
+; Check that we manage to form a zextload is an operation with two
+; arguments to explicitly extend is in the the way.
+; Extending %add will create two extensions:
+; 1. One for %b.
+; 2. One for %t.
+; #1 will not be removed as we do not know anything about %b.
+; #2 may not be merged with the load because %t is used in a comparison.
+; Since two extensions may be emitted in the end instead of one before the
+; transformation, the regular heuristic does not apply the optimization. 
+; 
+; OPTALL-LABEL: @promoteTwoArgZext
+; OPTALL: [[LD:%[a-zA-Z_0-9-]+]] = load i8, i8* %p
+;
+; STRESS-NEXT: [[ZEXTLD:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i32
+; STRESS-NEXT: [[ZEXTB:%[a-zA-Z_0-9-]+]] = zext i8 %b to i32
+; STRESS-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = add nuw i32 [[ZEXTLD]], [[ZEXTB]]
+;
+; NONSTRESS: [[ADD:%[a-zA-Z_0-9-]+]] = add nuw i8 [[LD]], %b
+; NONSTRESS: [[RES:%[a-zA-Z_0-9-]+]] = zext i8 [[ADD]] to i32
+;
+; DISABLE: [[ADD:%[a-zA-Z_0-9-]+]] = add nuw i8 [[LD]], %b
+; DISABLE: [[RES:%[a-zA-Z_0-9-]+]] = zext i8 [[ADD]] to i32
+;
+; OPTALL: store i32 [[RES]], i32* %q
+; OPTALL: ret
+define void @promoteTwoArgZext(i8* %p, i32* %q, i8 %b) {
+entry:
+  %t = load i8, i8* %p
+  %add = add nuw i8 %t, %b
+  %a = icmp slt i8 %t, 20
+  br i1 %a, label %true, label %false
+true:
+  %s = zext i8 %add to i32
+  store i32 %s, i32* %q
+  ret void
+false:
+  ret void
+}
+
+; Check that we manage to form a sextload is an operation with two
+; arguments to explicitly extend is in the the way.
+; Version with sext.
+; OPTALL-LABEL: @promoteTwoArgSExt
+; OPTALL: [[LD:%[a-zA-Z_0-9-]+]] = load i8, i8* %p
+;
+; STRESS-NEXT: [[SEXTLD:%[a-zA-Z_0-9-]+]] = sext i8 [[LD]] to i32
+; STRESS-NEXT: [[SEXTB:%[a-zA-Z_0-9-]+]] = sext i8 %b to i32
+; STRESS-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = add nsw i32 [[SEXTLD]], [[SEXTB]]
+;
+; NONSTRESS: [[ADD:%[a-zA-Z_0-9-]+]] = add nsw i8 [[LD]], %b
+; NONSTRESS: [[RES:%[a-zA-Z_0-9-]+]] = sext i8 [[ADD]] to i32
+;
+; DISABLE: [[ADD:%[a-zA-Z_0-9-]+]] = add nsw i8 [[LD]], %b
+; DISABLE: [[RES:%[a-zA-Z_0-9-]+]] = sext i8 [[ADD]] to i32
+; OPTALL: store i32 [[RES]], i32* %q
+; OPTALL: ret
+define void @promoteTwoArgSExt(i8* %p, i32* %q, i8 %b) {
+entry:
+  %t = load i8, i8* %p
+  %add = add nsw i8 %t, %b
+  %a = icmp slt i8 %t, 20
+  br i1 %a, label %true, label %false
+true:
+  %s = sext i8 %add to i32
+  store i32 %s, i32* %q
+  ret void
+false:
+  ret void
+}
+
+; Check that we do not a zextload if we need to introduce more than
+; one additional extension.
+; OPTALL-LABEL: @promoteThreeArgZext
+; OPTALL: [[LD:%[a-zA-Z_0-9-]+]] = load i8, i8* %p
+;
+; STRESS-NEXT: [[ZEXTLD:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i32
+; STRESS-NEXT: [[ZEXTB:%[a-zA-Z_0-9-]+]] = zext i8 %b to i32
+; STRESS-NEXT: [[TMP:%[a-zA-Z_0-9-]+]] = add nuw i32 [[ZEXTLD]], [[ZEXTB]]
+; STRESS-NEXT: [[ZEXTC:%[a-zA-Z_0-9-]+]] = zext i8 %c to i32
+; STRESS-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = add nuw i32 [[TMP]], [[ZEXTC]]
+;
+; NONSTRESS-NEXT: [[TMP:%[a-zA-Z_0-9-]+]] = add nuw i8 [[LD]], %b
+; NONSTRESS-NEXT: [[ADD:%[a-zA-Z_0-9-]+]] = add nuw i8 [[TMP]], %c
+; NONSTRESS: [[RES:%[a-zA-Z_0-9-]+]] = zext i8 [[ADD]] to i32
+;
+; DISABLE: add nuw i8
+; DISABLE: [[ADD:%[a-zA-Z_0-9-]+]] = add nuw i8
+; DISABLE: [[RES:%[a-zA-Z_0-9-]+]] = zext i8 [[ADD]] to i32
+;
+; OPTALL: store i32 [[RES]], i32* %q
+; OPTALL: ret
+define void @promoteThreeArgZext(i8* %p, i32* %q, i8 %b, i8 %c) {
+entry:
+  %t = load i8, i8* %p
+  %tmp = add nuw i8 %t, %b
+  %add = add nuw i8 %tmp, %c
+  %a = icmp slt i8 %t, 20
+  br i1 %a, label %true, label %false
+true:
+  %s = zext i8 %add to i32
+  store i32 %s, i32* %q
+  ret void
+false:
+  ret void
+}
+
+; Check that we manage to form a zextload after promoting and merging
+; two extensions.
+; OPTALL-LABEL: @promoteMergeExtArgZExt
+; OPTALL: [[LD:%[a-zA-Z_0-9-]+]] = load i8, i8* %p
+;
+; STRESS-NEXT: [[ZEXTLD:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i32
+; STRESS-NEXT: [[ZEXTB:%[a-zA-Z_0-9-]+]] = zext i16 %b to i32
+; STRESS-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = add nuw i32 [[ZEXTLD]], [[ZEXTB]]
+;
+; NONSTRESS: [[ZEXTLD:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i16
+; NONSTRESS: [[ADD:%[a-zA-Z_0-9-]+]] = add nuw i16 [[ZEXTLD]], %b
+; NONSTRESS: [[RES:%[a-zA-Z_0-9-]+]] = zext i16 [[ADD]] to i32
+;
+; DISABLE: [[ZEXTLD:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i16
+; DISABLE: [[ADD:%[a-zA-Z_0-9-]+]] = add nuw i16 [[ZEXTLD]], %b
+; DISABLE: [[RES:%[a-zA-Z_0-9-]+]] = zext i16 [[ADD]] to i32
+;
+; OPTALL: store i32 [[RES]], i32* %q
+; OPTALL: ret
+define void @promoteMergeExtArgZExt(i8* %p, i32* %q, i16 %b) {
+entry:
+  %t = load i8, i8* %p
+  %ext = zext i8 %t to i16
+  %add = add nuw i16 %ext, %b
+  %a = icmp slt i8 %t, 20
+  br i1 %a, label %true, label %false
+true:
+  %s = zext i16 %add to i32
+  store i32 %s, i32* %q
+  ret void
+false:
+  ret void
+}
+
+; Check that we manage to form a sextload after promoting and merging
+; two extensions.
+; Version with sext.
+; OPTALL-LABEL: @promoteMergeExtArgSExt
+; OPTALL: [[LD:%[a-zA-Z_0-9-]+]] = load i8, i8* %p
+;
+; STRESS-NEXT: [[ZEXTLD:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i32
+; STRESS-NEXT: [[ZEXTB:%[a-zA-Z_0-9-]+]] = sext i16 %b to i32
+; STRESS-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = add nsw i32 [[ZEXTLD]], [[ZEXTB]]
+;
+; NONSTRESS: [[ZEXTLD:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i16
+; NONSTRESS: [[ADD:%[a-zA-Z_0-9-]+]] = add nsw i16 [[ZEXTLD]], %b
+; NONSTRESS: [[RES:%[a-zA-Z_0-9-]+]] = sext i16 [[ADD]] to i32
+;
+; DISABLE: [[ZEXTLD:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i16
+; DISABLE: [[ADD:%[a-zA-Z_0-9-]+]] = add nsw i16 [[ZEXTLD]], %b
+; DISABLE: [[RES:%[a-zA-Z_0-9-]+]] = sext i16 [[ADD]] to i32
+; OPTALL: store i32 [[RES]], i32* %q
+; OPTALL: ret
+define void @promoteMergeExtArgSExt(i8* %p, i32* %q, i16 %b) {
+entry:
+  %t = load i8, i8* %p
+  %ext = zext i8 %t to i16
+  %add = add nsw i16 %ext, %b
+  %a = icmp slt i8 %t, 20
+  br i1 %a, label %true, label %false
+true:
+  %s = sext i16 %add to i32
+  store i32 %s, i32* %q
+  ret void
+false:
+  ret void
+}
+
+; Check that we manage to catch all the extload opportunities that are exposed
+; by the different iterations of codegen prepare.
+; Moreover, check that we do not promote more than we need to.
+; Here is what is happening in this test (not necessarly in this order):
+; 1. We try to promote the operand of %sextadd.
+;    a. This creates one sext of %ld2 and one of %zextld
+;    b. The sext of %ld2 can be combine with %ld2, so we remove one sext but
+;       introduced one. This is fine with the current heuristic: neutral.
+;    => We have one zext of %zextld left and we created one sext of %ld2.
+; 2. We try to promote the operand of %sextaddza.
+;    a. This creates one sext of %zexta and one of %zextld
+;    b. The sext of %zexta does not lead to any load, it stays here, even if it
+;       could have been combine with the zext of %a.
+;    c. The sext of %zextld leads to %ld and can be combined with it. This is
+;       done by promoting %zextld. This is fine with the current heuristic:
+;       neutral.
+;    => We have created a new zext of %ld and we created one sext of %zexta.
+; 3. We try to promote the operand of %sextaddb.
+;    a. This creates one sext of %b and one of %zextld
+;    b. The sext of %b is a dead-end, nothing to be done.
+;    c. Same thing as 2.c. happens.
+;    => We have created a new zext of %ld and we created one sext of %b.
+; 4. We try to promote the operand of the zext of %zextld introduced in #1.
+;    a. Same thing as 2.c. happens.
+;    b. %zextld does not have any other uses. It is dead coded.
+;    => We have created a new zext of %ld and we removed a zext of %zextld and
+;       a zext of %ld.
+; Currently we do not try to reuse existing extensions, so in the end we have
+; 3 identical zext of %ld. The extensions will be CSE'ed by SDag.
+;
+; OPTALL-LABEL: @severalPromotions
+; OPTALL: [[LD:%[a-zA-Z_0-9-]+]] = load i8, i8* %addr1
+; OPT-NEXT: [[ZEXTLD1_1:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i64
+; OPT-NEXT: [[ZEXTLD1_2:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i64
+; OPT-NEXT: [[ZEXTLD1_3:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i64
+; OPT-NEXT: [[LD2:%[a-zA-Z_0-9-]+]] = load i32, i32* %addr2
+; OPT-NEXT: [[SEXTLD2:%[a-zA-Z_0-9-]+]] = sext i32 [[LD2]] to i64
+; OPT-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = add nsw i64 [[SEXTLD2]], [[ZEXTLD1_1]]
+; We do not combine this one: see 2.b.
+; OPT-NEXT: [[ZEXTA:%[a-zA-Z_0-9-]+]] = zext i8 %a to i32
+; OPT-NEXT: [[SEXTZEXTA:%[a-zA-Z_0-9-]+]] = sext i32 [[ZEXTA]] to i64
+; OPT-NEXT: [[RESZA:%[a-zA-Z_0-9-]+]] = add nsw i64 [[SEXTZEXTA]], [[ZEXTLD1_3]]
+; OPT-NEXT: [[SEXTB:%[a-zA-Z_0-9-]+]] = sext i32 %b to i64
+; OPT-NEXT: [[RESB:%[a-zA-Z_0-9-]+]] = add nsw i64 [[SEXTB]], [[ZEXTLD1_2]]
+;
+; DISABLE: [[ADD:%[a-zA-Z_0-9-]+]] = add nsw i32
+; DISABLE: [[RES:%[a-zA-Z_0-9-]+]]  = sext i32 [[ADD]] to i64
+; DISABLE: [[ADDZA:%[a-zA-Z_0-9-]+]] = add nsw i32
+; DISABLE: [[RESZA:%[a-zA-Z_0-9-]+]]  = sext i32 [[ADDZA]] to i64
+; DISABLE: [[ADDB:%[a-zA-Z_0-9-]+]] = add nsw i32
+; DISABLE: [[RESB:%[a-zA-Z_0-9-]+]]  = sext i32 [[ADDB]] to i64
+;
+; OPTALL: call void @dummy(i64 [[RES]], i64 [[RESZA]], i64 [[RESB]])
+; OPTALL: ret
+define void @severalPromotions(i8* %addr1, i32* %addr2, i8 %a, i32 %b) {
+  %ld = load i8, i8* %addr1
+  %zextld = zext i8 %ld to i32
+  %ld2 = load i32, i32* %addr2
+  %add = add nsw i32 %ld2, %zextld
+  %sextadd = sext i32 %add to i64
+  %zexta = zext i8 %a to i32
+  %addza = add nsw i32 %zexta, %zextld
+  %sextaddza = sext i32 %addza to i64
+  %addb = add nsw i32 %b, %zextld
+  %sextaddb = sext i32 %addb to i64
+  call void @dummy(i64 %sextadd, i64 %sextaddza, i64 %sextaddb)
+  ret void
+}
+
+declare void @dummy(i64, i64, i64)
+
+; Make sure we do not try to promote vector types since the type promotion
+; helper does not support them for now.
+; OPTALL-LABEL: @vectorPromotion
+; OPTALL: [[SHL:%[a-zA-Z_0-9-]+]] = shl nuw nsw <2 x i32> zeroinitializer, <i32 8, i32 8>
+; OPTALL: [[ZEXT:%[a-zA-Z_0-9-]+]] = zext <2 x i32> [[SHL]] to <2 x i64>
+; OPTALL: ret
+define void @vectorPromotion() {
+entry:
+  %a = shl nuw nsw <2 x i32> zeroinitializer, <i32 8, i32 8>
+  %b = zext <2 x i32> %a to <2 x i64>
+  ret void
+}
+
+@a = common global i32 0, align 4
+@c = common global [2 x i32] zeroinitializer, align 4
+
+; Make sure we support promotion of operands that produces a Value as opposed
+; to an instruction.
+; This used to cause a crash.
+; OPTALL-LABEL: @promotionOfArgEndsUpInValue
+; OPTALL: [[LD:%[a-zA-Z_0-9-]+]] = load i16, i16* %addr
+;
+; OPT-NEXT: [[SEXT:%[a-zA-Z_0-9-]+]] = sext i16 [[LD]] to i32
+; OPT-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = add nuw nsw i32 [[SEXT]], zext (i1 icmp ne (i32* getelementptr inbounds ([2 x i32], [2 x i32]* @c, i64 0, i64 1), i32* @a) to i32)
+;
+; DISABLE-NEXT: [[ADD:%[a-zA-Z_0-9-]+]] = add nuw nsw i16 [[LD]], zext (i1 icmp ne (i32* getelementptr inbounds ([2 x i32], [2 x i32]* @c, i64 0, i64 1), i32* @a) to i16)
+; DISABLE-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = sext i16 [[ADD]] to i32
+;
+; OPTALL-NEXT: ret i32 [[RES]]
+define i32 @promotionOfArgEndsUpInValue(i16* %addr) {
+entry:
+  %val = load i16, i16* %addr
+  %add = add nuw nsw i16 %val, zext (i1 icmp ne (i32* getelementptr inbounds ([2 x i32], [2 x i32]* @c, i64 0, i64 1), i32* @a) to i16)
+  %conv3 = sext i16 %add to i32
+  ret i32 %conv3
+}
+
+; Check that we see that one zext can be derived from the other for free.
+; OPTALL-LABEL: @promoteTwoArgZextWithSourceExtendedTwice
+; OPTALL: [[LD:%[a-zA-Z_0-9-]+]] = load i8, i8* %p
+;
+; OPT-NEXT: [[ZEXT64:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i64
+; OPT-NEXT: [[ZEXT32:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i32
+; OPT-NEXT: [[RES32:%[a-zA-Z_0-9-]+]] = add nuw i32 [[ZEXT32]], %b
+; OPT-NEXT: [[RES64:%[a-zA-Z_0-9-]+]] = add nuw i64 [[ZEXT64]], 12
+; OPT-NEXT: store i32 [[RES32]], i32* %addr
+; OPT-NEXT: store i64 [[RES64]], i64* %q
+;
+; DISABLE-NEXT: [[ZEXT32:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i32
+; DISABLE-NEXT: [[RES32:%[a-zA-Z_0-9-]+]] = add nuw i32 [[ZEXT32]], %b
+; DISABLE-NEXT: [[RES2_32:%[a-zA-Z_0-9-]+]] = add nuw i32 [[ZEXT32]], 12
+; DISABLE-NEXT: store i32 [[RES32]], i32* %addr
+; DISABLE-NEXT: [[ZEXT64:%[a-zA-Z_0-9-]+]] = zext i32 [[RES2_32]] to i64
+; DISABLE-NEXT: store i64 [[ZEXT64]], i64* %q
+;
+; OPTALL-NEXT: ret void
+define void @promoteTwoArgZextWithSourceExtendedTwice(i8* %p, i64* %q, i32 %b, i32* %addr) {
+entry:
+  %t = load i8, i8* %p
+  %zextt = zext i8 %t to i32
+  %add = add nuw i32 %zextt, %b
+  %add2 = add nuw i32 %zextt, 12
+  store i32 %add, i32 *%addr
+  %s = zext i32 %add2 to i64
+  store i64 %s, i64* %q
+  ret void
+}
+
+; Check that we do not increase the cost of the code.
+; The input has one free zext and one free sext. If we would have promoted
+; all the way through the load we would end up with a free zext and a
+; non-free sext (of %b).
+; OPTALL-LABEL: @doNotPromoteFreeSExtFromAddrMode
+; OPTALL: [[LD:%[a-zA-Z_0-9-]+]] = load i8, i8* %p
+;
+; STRESS-NEXT: [[ZEXT64:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i64
+; STRESS-NEXT: [[SEXTB:%[a-zA-Z_0-9-]+]] = sext i32 %b to i64
+; STRESS-NEXT: [[IDX64:%[a-zA-Z_0-9-]+]] = add nsw i64 [[ZEXT64]], [[SEXTB]]
+; STRESS-NEXT: [[RES32:%[a-zA-Z_0-9-]+]] = trunc i64 [[IDX64]] to i32
+;
+; NONSTRESS-NEXT: [[ZEXT32:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i32
+; NONSTRESS-NEXT: [[RES32:%[a-zA-Z_0-9-]+]] = add nsw i32 [[ZEXT32]], %b
+; NONSTRESS-NEXT: [[IDX64:%[a-zA-Z_0-9-]+]] = sext i32 [[RES32]] to i64
+;
+; DISABLE-NEXT: [[ZEXT32:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i32
+; DISABLE-NEXT: [[RES32:%[a-zA-Z_0-9-]+]] = add nsw i32 [[ZEXT32]], %b
+; DISABLE-NEXT: [[IDX64:%[a-zA-Z_0-9-]+]] = sext i32 [[RES32]] to i64
+;
+; OPTALL-NEXT: [[GEP:%[a-zA-Z_0-9-]+]] = getelementptr inbounds i32, i32* %addr, i64 [[IDX64]]
+; OPTALL-NEXT: store i32 [[RES32]], i32* [[GEP]]
+; OPTALL-NEXT: ret void
+define void @doNotPromoteFreeSExtFromAddrMode(i8* %p, i32 %b, i32* %addr) {
+entry:
+  %t = load i8, i8* %p
+  %zextt = zext i8 %t to i32
+  %add = add nsw i32 %zextt, %b
+  %idx64 = sext i32 %add to i64
+  %staddr = getelementptr inbounds i32, i32* %addr, i64 %idx64
+  store i32 %add, i32 *%staddr
+  ret void
+}
+
+; Check that we do not increase the cost of the code.
+; The input has one free zext and one free sext. If we would have promoted
+; all the way through the load we would end up with a free zext and a
+; non-free sext (of %b).
+; OPTALL-LABEL: @doNotPromoteFreeSExtFromAddrMode64
+; OPTALL: [[LD:%[a-zA-Z_0-9-]+]] = load i8, i8* %p
+;
+; STRESS-NEXT: [[ZEXT64:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i64
+; STRESS-NEXT: [[SEXTB:%[a-zA-Z_0-9-]+]] = sext i32 %b to i64
+; STRESS-NEXT: [[IDX64:%[a-zA-Z_0-9-]+]] = add nsw i64 [[ZEXT64]], [[SEXTB]]
+;
+; NONSTRESS-NEXT: [[ZEXT32:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i32
+; NONSTRESS-NEXT: [[RES32:%[a-zA-Z_0-9-]+]] = add nsw i32 [[ZEXT32]], %b
+; NONSTRESS-NEXT: [[IDX64:%[a-zA-Z_0-9-]+]] = sext i32 [[RES32]] to i64
+;
+; DISABLE-NEXT: [[ZEXT32:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i32
+; DISABLE-NEXT: [[RES32:%[a-zA-Z_0-9-]+]] = add nsw i32 [[ZEXT32]], %b
+; DISABLE-NEXT: [[IDX64:%[a-zA-Z_0-9-]+]] = sext i32 [[RES32]] to i64
+;
+; OPTALL-NEXT: [[GEP:%[a-zA-Z_0-9-]+]] = getelementptr inbounds i64, i64* %addr, i64 [[IDX64]]
+; OPTALL-NEXT: store i64 %stuff, i64* [[GEP]]
+; OPTALL-NEXT: ret void
+define void @doNotPromoteFreeSExtFromAddrMode64(i8* %p, i32 %b, i64* %addr, i64 %stuff) {
+entry:
+  %t = load i8, i8* %p
+  %zextt = zext i8 %t to i32
+  %add = add nsw i32 %zextt, %b
+  %idx64 = sext i32 %add to i64
+  %staddr = getelementptr inbounds i64, i64* %addr, i64 %idx64
+  store i64 %stuff, i64 *%staddr
+  ret void
+}
+
+; Check that we do not increase the cost of the code.
+; The input has one free zext and one free sext. If we would have promoted
+; all the way through the load we would end up with a free zext and a
+; non-free sext (of %b).
+; OPTALL-LABEL: @doNotPromoteFreeSExtFromAddrMode128
+; OPTALL: [[LD:%[a-zA-Z_0-9-]+]] = load i8, i8* %p
+;
+; STRESS-NEXT: [[ZEXT64:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i64
+; STRESS-NEXT: [[SEXTB:%[a-zA-Z_0-9-]+]] = sext i32 %b to i64
+; STRESS-NEXT: [[IDX64:%[a-zA-Z_0-9-]+]] = add nsw i64 [[ZEXT64]], [[SEXTB]]
+;
+; NONSTRESS-NEXT: [[ZEXT32:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i32
+; NONSTRESS-NEXT: [[RES32:%[a-zA-Z_0-9-]+]] = add nsw i32 [[ZEXT32]], %b
+; NONSTRESS-NEXT: [[IDX64:%[a-zA-Z_0-9-]+]] = sext i32 [[RES32]] to i64
+;
+; DISABLE-NEXT: [[ZEXT32:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i32
+; DISABLE-NEXT: [[RES32:%[a-zA-Z_0-9-]+]] = add nsw i32 [[ZEXT32]], %b
+; DISABLE-NEXT: [[IDX64:%[a-zA-Z_0-9-]+]] = sext i32 [[RES32]] to i64
+;
+; OPTALL-NEXT: [[GEP:%[a-zA-Z_0-9-]+]] = getelementptr inbounds i128, i128* %addr, i64 [[IDX64]]
+; OPTALL-NEXT: store i128 %stuff, i128* [[GEP]]
+; OPTALL-NEXT: ret void
+define void @doNotPromoteFreeSExtFromAddrMode128(i8* %p, i32 %b, i128* %addr, i128 %stuff) {
+entry:
+  %t = load i8, i8* %p
+  %zextt = zext i8 %t to i32
+  %add = add nsw i32 %zextt, %b
+  %idx64 = sext i32 %add to i64
+  %staddr = getelementptr inbounds i128, i128* %addr, i64 %idx64
+  store i128 %stuff, i128 *%staddr
+  ret void
+}
+
+
+; Check that we do not increase the cost of the code.
+; The input has one free zext and one free sext. If we would have promoted
+; all the way through the load we would end up with a free zext and a
+; non-free sext (of %b).
+; OPTALL-LABEL: @promoteSExtFromAddrMode256
+; OPTALL: [[LD:%[a-zA-Z_0-9-]+]] = load i8, i8* %p
+;
+; OPT-NEXT: [[ZEXT64:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i64
+; OPT-NEXT: [[SEXTB:%[a-zA-Z_0-9-]+]] = sext i32 %b to i64
+; OPT-NEXT: [[IDX64:%[a-zA-Z_0-9-]+]] = add nsw i64 [[ZEXT64]], [[SEXTB]]
+;
+; DISABLE-NEXT: [[ZEXT32:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i32
+; DISABLE-NEXT: [[RES32:%[a-zA-Z_0-9-]+]] = add nsw i32 [[ZEXT32]], %b
+; DISABLE-NEXT: [[IDX64:%[a-zA-Z_0-9-]+]] = sext i32 [[RES32]] to i64
+;
+; OPTALL-NEXT: [[GEP:%[a-zA-Z_0-9-]+]] = getelementptr inbounds i256, i256* %addr, i64 [[IDX64]]
+; OPTALL-NEXT: store i256 %stuff, i256* [[GEP]]
+; OPTALL-NEXT: ret void
+define void @promoteSExtFromAddrMode256(i8* %p, i32 %b, i256* %addr, i256 %stuff) {
+entry:
+  %t = load i8, i8* %p
+  %zextt = zext i8 %t to i32
+  %add = add nsw i32 %zextt, %b
+  %idx64 = sext i32 %add to i64
+  %staddr = getelementptr inbounds i256, i256* %addr, i64 %idx64
+  store i256 %stuff, i256 *%staddr
+  ret void
+}
+
+; Check that we do not increase the cost of the code.
+; The input has one free zext and one free zext.
+; When we promote all the way through the load, we end up with
+; a free zext and a non-free zext (of %b).
+; However, the current target lowering says zext i32 to i64 is free
+; so the promotion happens because the cost did not change and may
+; expose more opportunities.
+; This would need to be fixed at some point.
+; OPTALL-LABEL: @doNotPromoteFreeZExtFromAddrMode
+; OPTALL: [[LD:%[a-zA-Z_0-9-]+]] = load i8, i8* %p
+;
+; This transformation should really happen only for stress mode.
+; OPT-NEXT: [[ZEXT64:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i64
+; OPT-NEXT: [[ZEXTB:%[a-zA-Z_0-9-]+]] = zext i32 %b to i64
+; OPT-NEXT: [[IDX64:%[a-zA-Z_0-9-]+]] = add nuw i64 [[ZEXT64]], [[ZEXTB]]
+; OPT-NEXT: [[RES32:%[a-zA-Z_0-9-]+]] = trunc i64 [[IDX64]] to i32
+;
+; DISABLE-NEXT: [[ZEXT32:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i32
+; DISABLE-NEXT: [[RES32:%[a-zA-Z_0-9-]+]] = add nuw i32 [[ZEXT32]], %b
+; DISABLE-NEXT: [[IDX64:%[a-zA-Z_0-9-]+]] = zext i32 [[RES32]] to i64
+;
+; OPTALL-NEXT: [[GEP:%[a-zA-Z_0-9-]+]] = getelementptr inbounds i32, i32* %addr, i64 [[IDX64]]
+; OPTALL-NEXT: store i32 [[RES32]], i32* [[GEP]]
+; OPTALL-NEXT: ret void
+define void @doNotPromoteFreeZExtFromAddrMode(i8* %p, i32 %b, i32* %addr) {
+entry:
+  %t = load i8, i8* %p
+  %zextt = zext i8 %t to i32
+  %add = add nuw i32 %zextt, %b
+  %idx64 = zext i32 %add to i64
+  %staddr = getelementptr inbounds i32, i32* %addr, i64 %idx64
+  store i32 %add, i32 *%staddr
+  ret void
+}
+
+; OPTALL-LABEL: @doNotPromoteFreeSExtFromShift
+; OPTALL: [[LD:%[a-zA-Z_0-9-]+]] = load i8, i8* %p
+;
+; STRESS-NEXT: [[ZEXT64:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i64
+; STRESS-NEXT: [[SEXTB:%[a-zA-Z_0-9-]+]] = sext i32 %b to i64
+; STRESS-NEXT: [[IDX64:%[a-zA-Z_0-9-]+]] = add nsw i64 [[ZEXT64]], [[SEXTB]]
+;
+; NONSTRESS-NEXT: [[ZEXT32:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i32
+; NONSTRESS-NEXT: [[RES32:%[a-zA-Z_0-9-]+]] = add nsw i32 [[ZEXT32]], %b
+; NONSTRESS-NEXT: [[IDX64:%[a-zA-Z_0-9-]+]] = sext i32 [[RES32]] to i64
+;
+; DISABLE-NEXT: [[ZEXT32:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i32
+; DISABLE-NEXT: [[RES32:%[a-zA-Z_0-9-]+]] = add nsw i32 [[ZEXT32]], %b
+; DISABLE-NEXT: [[IDX64:%[a-zA-Z_0-9-]+]] = sext i32 [[RES32]] to i64
+;
+; OPTALL-NEXT: [[RES64:%[a-zA-Z_0-9-]+]] = shl i64 [[IDX64]], 12
+; OPTALL-NEXT: ret i64 %staddr
+define i64 @doNotPromoteFreeSExtFromShift(i8* %p, i32 %b) {
+entry:
+  %t = load i8, i8* %p
+  %zextt = zext i8 %t to i32
+  %add = add nsw i32 %zextt, %b
+  %idx64 = sext i32 %add to i64
+  %staddr = shl i64 %idx64, 12
+  ret i64 %staddr
+}
+
+; Same comment as doNotPromoteFreeZExtFromAddrMode.
+; OPTALL-LABEL: @doNotPromoteFreeZExtFromShift
+; OPTALL: [[LD:%[a-zA-Z_0-9-]+]] = load i8, i8* %p
+;
+; This transformation should really happen only for stress mode.
+; OPT-NEXT: [[ZEXT64:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i64
+; OPT-NEXT: [[ZEXTB:%[a-zA-Z_0-9-]+]] = zext i32 %b to i64
+; OPT-NEXT: [[IDX64:%[a-zA-Z_0-9-]+]] = add nuw i64 [[ZEXT64]], [[ZEXTB]]
+;
+; DISABLE-NEXT: [[ZEXT32:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i32
+; DISABLE-NEXT: [[RES32:%[a-zA-Z_0-9-]+]] = add nuw i32 [[ZEXT32]], %b
+; DISABLE-NEXT: [[IDX64:%[a-zA-Z_0-9-]+]] = zext i32 [[RES32]] to i64
+;
+; OPTALL-NEXT: [[RES64:%[a-zA-Z_0-9-]+]] = shl i64 [[IDX64]], 12
+; OPTALL-NEXT: ret i64 %staddr
+define i64 @doNotPromoteFreeZExtFromShift(i8* %p, i32 %b) {
+entry:
+  %t = load i8, i8* %p
+  %zextt = zext i8 %t to i32
+  %add = add nuw i32 %zextt, %b
+  %idx64 = zext i32 %add to i64
+  %staddr = shl i64 %idx64, 12
+  ret i64 %staddr
+}
+
+; The input has one free zext and one non-free sext.
+; When we promote all the way through to the load, we end up with
+; a free zext, a free sext (%ld1), and a non-free sext (of %cst).
+; However, we when generate load pair and the free sext(%ld1) becomes
+; non-free. So technically, we trade a non-free sext to two non-free
+; sext.
+; This would need to be fixed at some point.
+; OPTALL-LABEL: @doNotPromoteBecauseOfPairedLoad
+; OPTALL: [[LD0:%[a-zA-Z_0-9-]+]] = load i32, i32* %p
+; OPTALL: [[GEP:%[a-zA-Z_0-9-]+]] = getelementptr inbounds i32, i32* %p, i64 1
+; OPTALL: [[LD1:%[a-zA-Z_0-9-]+]] = load i32, i32* [[GEP]]
+;
+; This transformation should really happen only for stress mode.
+; OPT-NEXT: [[SEXTLD1:%[a-zA-Z_0-9-]+]] = sext i32 [[LD1]] to i64
+; OPT-NEXT: [[SEXTCST:%[a-zA-Z_0-9-]+]] = sext i32 %cst to i64
+; OPT-NEXT: [[SEXTRES:%[a-zA-Z_0-9-]+]] = add nsw i64 [[SEXTLD1]], [[SEXTCST]]
+;
+; DISABLE-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = add nsw i32 [[LD1]], %cst
+; DISABLE-NEXT: [[SEXTRES:%[a-zA-Z_0-9-]+]] = sext i32 [[RES]] to i64
+;
+; OPTALL-NEXT: [[ZEXTLD0:%[a-zA-Z_0-9-]+]] = zext i32 [[LD0]] to i64
+; OPTALL-NEXT: [[FINAL:%[a-zA-Z_0-9-]+]] = add i64 [[SEXTRES]], [[ZEXTLD0]]
+; OPTALL-NEXT: ret i64 [[FINAL]]
+define i64 @doNotPromoteBecauseOfPairedLoad(i32* %p, i32 %cst) {
+  %ld0 = load i32, i32* %p
+  %idxLd1 = getelementptr inbounds i32, i32* %p, i64 1
+  %ld1 = load i32, i32* %idxLd1
+  %res = add nsw i32 %ld1, %cst
+  %sextres = sext i32 %res to i64
+  %zextLd0 = zext i32 %ld0 to i64
+  %final = add i64 %sextres, %zextLd0
+  ret i64 %final
+}
diff --git a/test/CodeGen/AArch64/arm64-convert-v4f64.ll b/test/CodeGen/AArch64/arm64-convert-v4f64.ll
index c6b7d83..c4e3e4e 100644
--- a/test/CodeGen/AArch64/arm64-convert-v4f64.ll
+++ b/test/CodeGen/AArch64/arm64-convert-v4f64.ll
@@ -3,11 +3,11 @@
 
 define <4 x i16> @fptosi_v4f64_to_v4i16(<4 x double>* %ptr) {
 ; CHECK: fptosi_v4f64_to_v4i16
-; CHECK-DAG: fcvtzs  v[[LHS:[0-9]+]].2d, v1.2d
-; CHECK-DAG: fcvtzs  v[[RHS:[0-9]+]].2d, v0.2d
-; CHECK-DAG: xtn  v[[LHS_NA:[0-9]+]].2s, v[[LHS]].2d
-; CHECK-DAG: xtn  v[[RHS_NA:[0-9]+]].2s, v[[RHS]].2d
-; CHECK:     uzp1  v0.4h, v[[RHS_NA]].4h, v[[LHS_NA]].4h
+; CHECK-DAG: fcvtzs  v[[LHS:[0-9]+]].2d, v0.2d
+; CHECK-DAG: fcvtzs  v[[RHS:[0-9]+]].2d, v1.2d
+; CHECK-DAG: xtn  v[[MID:[0-9]+]].2s, v[[LHS]].2d
+; CHECK-DAG: xtn2  v[[MID]].4s, v[[RHS]].2d
+; CHECK:     xtn  v0.4h, v[[MID]].4s
   %tmp1 = load <4 x double>, <4 x double>* %ptr
   %tmp2 = fptosi <4 x double> %tmp1 to <4 x i16>
   ret <4 x i16> %tmp2
@@ -15,17 +15,17 @@ define <4 x i16> @fptosi_v4f64_to_v4i16(<4 x double>* %ptr) {
 
 define <8 x i8> @fptosi_v4f64_to_v4i8(<8 x double>* %ptr) {
 ; CHECK: fptosi_v4f64_to_v4i8
-; CHECK-DAG:  fcvtzs  v[[CONV3:[0-9]+]].2d, v3.2d
-; CHECK-DAG:  fcvtzs  v[[CONV2:[0-9]+]].2d, v2.2d
-; CHECK-DAG:  fcvtzs  v[[CONV1:[0-9]+]].2d, v1.2d
 ; CHECK-DAG:  fcvtzs  v[[CONV0:[0-9]+]].2d, v0.2d
-; CHECK-DAG:  xtn  v[[NA3:[0-9]+]].2s, v[[CONV3]].2d
+; CHECK-DAG:  fcvtzs  v[[CONV1:[0-9]+]].2d, v1.2d
+; CHECK-DAG:  fcvtzs  v[[CONV2:[0-9]+]].2d, v2.2d
+; CHECK-DAG:  fcvtzs  v[[CONV3:[0-9]+]].2d, v3.2d
 ; CHECK-DAG:  xtn  v[[NA2:[0-9]+]].2s, v[[CONV2]].2d
-; CHECK-DAG:  xtn  v[[NA1:[0-9]+]].2s, v[[CONV1]].2d
+; CHECK-DAG:  xtn2  v[[NA2]].4s, v[[CONV3]].2d
 ; CHECK-DAG:  xtn  v[[NA0:[0-9]+]].2s, v[[CONV0]].2d
-; CHECK-DAG:  uzp1  v[[TMP1:[0-9]+]].4h, v[[CONV2]].4h, v[[CONV3]].4h
-; CHECK-DAG:  uzp1  v[[TMP2:[0-9]+]].4h, v[[CONV0]].4h, v[[CONV1]].4h
-; CHECK:      uzp1  v0.8b, v[[TMP2]].8b, v[[TMP1]].8b
+; CHECK-DAG:  xtn2  v[[NA0]].4s, v[[CONV1]].2d
+; CHECK-DAG:  xtn  v[[TMP1:[0-9]+]].4h, v[[NA0]].4s
+; CHECK-DAG:  xtn2  v[[TMP1]].8h, v[[NA2]].4s
+; CHECK:      xtn  v0.8b, v[[TMP1]].8h
   %tmp1 = load <8 x double>, <8 x double>* %ptr
   %tmp2 = fptosi <8 x double> %tmp1 to <8 x i8>
   ret <8 x i8> %tmp2
diff --git a/test/CodeGen/AArch64/arm64-dup.ll b/test/CodeGen/AArch64/arm64-dup.ll
index 849e227..c6b7de3 100644
--- a/test/CodeGen/AArch64/arm64-dup.ll
+++ b/test/CodeGen/AArch64/arm64-dup.ll
@@ -321,3 +321,40 @@ entry:
   %sub = sub <4 x i16> %a, %mul
   ret <4 x i16> %sub
 }
+
+; Also test the DUP path in the PerfectShuffle generator.
+
+; CHECK-LABEL: test_perfectshuffle_dupext_v4i16:
+; CHECK-NEXT: dup.4h v0, v0[0]
+; CHECK-NEXT: ext.8b v0, v0, v1, #4
+define <4 x i16> @test_perfectshuffle_dupext_v4i16(<4 x i16> %a, <4 x i16> %b) nounwind {
+  %r = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 0, i32 4, i32 5>
+  ret <4 x i16> %r
+}
+
+; CHECK-LABEL: test_perfectshuffle_dupext_v4f16:
+; CHECK-NEXT: dup.4h v0, v0[0]
+; CHECK-NEXT: ext.8b v0, v0, v1, #4
+; CHECK-NEXT: ret
+define <4 x half> @test_perfectshuffle_dupext_v4f16(<4 x half> %a, <4 x half> %b) nounwind {
+  %r = shufflevector <4 x half> %a, <4 x half> %b, <4 x i32> <i32 0, i32 0, i32 4, i32 5>
+  ret <4 x half> %r
+}
+
+; CHECK-LABEL: test_perfectshuffle_dupext_v4i32:
+; CHECK-NEXT: dup.4s v0, v0[0]
+; CHECK-NEXT: ext.16b v0, v0, v1, #8
+; CHECK-NEXT: ret
+define <4 x i32> @test_perfectshuffle_dupext_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
+  %r = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 0, i32 4, i32 5>
+  ret <4 x i32> %r
+}
+
+; CHECK-LABEL: test_perfectshuffle_dupext_v4f32:
+; CHECK-NEXT: dup.4s v0, v0[0]
+; CHECK-NEXT: ext.16b v0, v0, v1, #8
+; CHECK-NEXT: ret
+define <4 x float> @test_perfectshuffle_dupext_v4f32(<4 x float> %a, <4 x float> %b) nounwind {
+  %r = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 0, i32 4, i32 5>
+  ret <4 x float> %r
+}
diff --git a/test/CodeGen/AArch64/arm64-fcopysign.ll b/test/CodeGen/AArch64/arm64-fcopysign.ll
index 66241df..feffd41 100644
--- a/test/CodeGen/AArch64/arm64-fcopysign.ll
+++ b/test/CodeGen/AArch64/arm64-fcopysign.ll
@@ -39,7 +39,7 @@ entry:
 ; CHECK: fcvt s0, d0
 ; CHECK: movi.4s v[[CONST:[0-9]+]], #0x80, lsl #24
 ; CHECK: bit.16b v{{[0-9]+}}, v0, v[[CONST]]
-  %0 = tail call double (...)* @bar() nounwind
+  %0 = tail call double (...) @bar() nounwind
   %1 = fptrunc double %0 to float
   %2 = tail call float @copysignf(float 5.000000e-01, float %1) nounwind readnone
   %3 = fadd float %1, %2
diff --git a/test/CodeGen/AArch64/arm64-join-reserved.ll b/test/CodeGen/AArch64/arm64-join-reserved.ll
index e99168b..dee0344 100644
--- a/test/CodeGen/AArch64/arm64-join-reserved.ll
+++ b/test/CodeGen/AArch64/arm64-join-reserved.ll
@@ -10,7 +10,7 @@ target triple = "arm64-apple-macosx10"
 ; CHECK: ret
 define void @g() nounwind ssp {
 entry:
-  tail call void (i32, ...)* @f(i32 0, i32 0) nounwind
+  tail call void (i32, ...) @f(i32 0, i32 0) nounwind
   ret void
 }
 
diff --git a/test/CodeGen/AArch64/arm64-misaligned-memcpy-inline.ll b/test/CodeGen/AArch64/arm64-misaligned-memcpy-inline.ll
new file mode 100644
index 0000000..5bc4d71
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-misaligned-memcpy-inline.ll
@@ -0,0 +1,14 @@
+; RUN: llc -mtriple=arm64-apple-ios -aarch64-strict-align < %s | FileCheck %s
+
+; Small (16-bytes here) unaligned memcpys should stay memcpy calls if
+; strict-alignment is turned on.
+define void @t0(i8* %out, i8* %in) {
+; CHECK-LABEL: t0:
+; CHECK:         orr w2, wzr, #0x10
+; CHECK-NEXT:    bl _memcpy
+entry:
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %out, i8* %in, i64 16, i32 1, i1 false)
+  ret void
+}
+
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i32, i1)
diff --git a/test/CodeGen/AArch64/arm64-neon-copy.ll b/test/CodeGen/AArch64/arm64-neon-copy.ll
index 4a92c3d..b74a406 100644
--- a/test/CodeGen/AArch64/arm64-neon-copy.ll
+++ b/test/CodeGen/AArch64/arm64-neon-copy.ll
@@ -1086,7 +1086,7 @@ define <2 x i32> @test_concat_diff_v1i32_v1i32(i32 %a, i32 %b) {
 ; CHECK-LABEL: test_concat_diff_v1i32_v1i32:
 ; CHECK: sqabs s{{[0-9]+}}, s{{[0-9]+}}
 ; CHECK: sqabs s{{[0-9]+}}, s{{[0-9]+}}
-; CHECK-NEXT: zip1 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+; CHECK: ins {{v[0-9]+}}.s[1], w{{[0-9]+}}
 entry:
   %c = tail call i32 @llvm.aarch64.neon.sqabs.i32(i32 %a)
   %d = insertelement <2 x i32> undef, i32 %c, i32 0
diff --git a/test/CodeGen/AArch64/arm64-neon-v8.1a.ll b/test/CodeGen/AArch64/arm64-neon-v8.1a.ll
new file mode 100644
index 0000000..51ed8a1
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-neon-v8.1a.ll
@@ -0,0 +1,456 @@
+; RUN: llc < %s -verify-machineinstrs -march=arm64 -aarch64-neon-syntax=generic | FileCheck %s --check-prefix=CHECK-V8a
+; RUN: llc < %s -verify-machineinstrs -march=arm64 -mattr=+v8.1a -aarch64-neon-syntax=generic | FileCheck %s --check-prefix=CHECK-V81a
+; RUN: llc < %s -verify-machineinstrs -march=arm64 -mattr=+v8.1a -aarch64-neon-syntax=apple | FileCheck %s --check-prefix=CHECK-V81a-apple
+
+declare <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16>, <4 x i16>)
+declare <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16>, <8 x i16>)
+declare <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32>, <2 x i32>)
+declare <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32>, <4 x i32>)
+declare i32 @llvm.aarch64.neon.sqrdmulh.i32(i32, i32)
+declare i16 @llvm.aarch64.neon.sqrdmulh.i16(i16, i16)
+
+declare <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16>, <4 x i16>)
+declare <8 x i16> @llvm.aarch64.neon.sqadd.v8i16(<8 x i16>, <8 x i16>)
+declare <2 x i32> @llvm.aarch64.neon.sqadd.v2i32(<2 x i32>, <2 x i32>)
+declare <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32>, <4 x i32>)
+declare i32 @llvm.aarch64.neon.sqadd.i32(i32, i32)
+declare i16 @llvm.aarch64.neon.sqadd.i16(i16, i16)
+
+declare <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16>, <4 x i16>)
+declare <8 x i16> @llvm.aarch64.neon.sqsub.v8i16(<8 x i16>, <8 x i16>)
+declare <2 x i32> @llvm.aarch64.neon.sqsub.v2i32(<2 x i32>, <2 x i32>)
+declare <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32>, <4 x i32>)
+declare i32 @llvm.aarch64.neon.sqsub.i32(i32, i32)
+declare i16 @llvm.aarch64.neon.sqsub.i16(i16, i16)
+
+;-----------------------------------------------------------------------------
+; RDMA Vector
+; test for SIMDThreeSameVectorSQRDMLxHTiedHS
+
+define <4 x i16> @test_sqrdmlah_v4i16(<4 x i16> %acc, <4 x i16> %mhs, <4 x i16> %rhs) {
+; CHECK-LABEL: test_sqrdmlah_v4i16:
+   %prod = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %mhs,  <4 x i16> %rhs)
+   %retval =  call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> %acc,  <4 x i16> %prod)
+; CHECK-V8a:        sqrdmulh    v1.4h, v1.4h, v2.4h
+; CHECK-V81a:       sqrdmlah    v0.4h, v1.4h, v2.4h
+; CHECK-V81a-apple: sqrdmlah.4h v0,    v1,    v2
+   ret <4 x i16> %retval
+}
+
+define <8 x i16> @test_sqrdmlah_v8i16(<8 x i16> %acc, <8 x i16> %mhs, <8 x i16> %rhs) {
+; CHECK-LABEL: test_sqrdmlah_v8i16:
+   %prod = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %mhs, <8 x i16> %rhs)
+   %retval =  call <8 x i16> @llvm.aarch64.neon.sqadd.v8i16(<8 x i16> %acc, <8 x i16> %prod)
+; CHECK-V8a:        sqrdmulh    v1.8h, v1.8h, v2.8h
+; CHECK-V81a:       sqrdmlah    v0.8h, v1.8h, v2.8h
+; CHECK-V81a-apple: sqrdmlah.8h v0, v1, v2
+   ret <8 x i16> %retval
+}
+
+define <2 x i32> @test_sqrdmlah_v2i32(<2 x i32> %acc, <2 x i32> %mhs, <2 x i32> %rhs) {
+; CHECK-LABEL: test_sqrdmlah_v2i32:
+   %prod = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %mhs, <2 x i32> %rhs)
+   %retval =  call <2 x i32> @llvm.aarch64.neon.sqadd.v2i32(<2 x i32> %acc, <2 x i32> %prod)
+; CHECK-V8a:        sqrdmulh    v1.2s, v1.2s, v2.2s
+; CHECK-V81a:       sqrdmlah    v0.2s, v1.2s, v2.2s
+; CHECK-V81a-apple: sqrdmlah.2s v0,    v1,    v2
+   ret <2 x i32> %retval
+}
+
+define <4 x i32> @test_sqrdmlah_v4i32(<4 x i32> %acc, <4 x i32> %mhs, <4 x i32> %rhs) {
+; CHECK-LABEL: test_sqrdmlah_v4i32:
+   %prod = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %mhs, <4 x i32> %rhs)
+   %retval =  call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %acc, <4 x i32> %prod)
+; CHECK-V81:        sqrdmulh    v1.4s, v1.4s, v2.4s
+; CHECK-V81a:       sqrdmlah    v0.4s, v1.4s, v2.4s
+; CHECK-V81a-apple: sqrdmlah.4s v0,    v1,    v2
+   ret <4 x i32> %retval
+}
+
+define <4 x i16> @test_sqrdmlsh_v4i16(<4 x i16> %acc, <4 x i16> %mhs, <4 x i16> %rhs) {
+; CHECK-LABEL: test_sqrdmlsh_v4i16:
+   %prod = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %mhs,  <4 x i16> %rhs)
+   %retval =  call <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16> %acc, <4 x i16> %prod)
+; CHECK-V8a:        sqrdmulh    v1.4h, v1.4h, v2.4h
+; CHECK-V81a:       sqrdmlsh    v0.4h, v1.4h, v2.4h
+; CHECK-V81a-apple: sqrdmlsh.4h v0,    v1,    v2
+   ret <4 x i16> %retval
+}
+
+define <8 x i16> @test_sqrdmlsh_v8i16(<8 x i16> %acc, <8 x i16> %mhs, <8 x i16> %rhs) {
+; CHECK-LABEL: test_sqrdmlsh_v8i16:
+   %prod = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %mhs, <8 x i16> %rhs)
+   %retval =  call <8 x i16> @llvm.aarch64.neon.sqsub.v8i16(<8 x i16> %acc, <8 x i16> %prod)
+; CHECK-V8a:        sqrdmulh    v1.8h, v1.8h, v2.8h
+; CHECK-V81a:       sqrdmlsh    v0.8h, v1.8h, v2.8h
+; CHECK-V81a-apple: sqrdmlsh.8h v0,    v1,    v2
+   ret <8 x i16> %retval
+}
+
+define <2 x i32> @test_sqrdmlsh_v2i32(<2 x i32> %acc, <2 x i32> %mhs, <2 x i32> %rhs) {
+; CHECK-LABEL: test_sqrdmlsh_v2i32:
+   %prod = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %mhs, <2 x i32> %rhs)
+   %retval =  call <2 x i32> @llvm.aarch64.neon.sqsub.v2i32(<2 x i32> %acc, <2 x i32> %prod)
+; CHECK-V8a:        sqrdmulh    v1.2s, v1.2s, v2.2s
+; CHECK-V81a:       sqrdmlsh    v0.2s, v1.2s, v2.2s
+; CHECK-V81a-apple: sqrdmlsh.2s v0,    v1,    v2
+   ret <2 x i32> %retval
+}
+
+define <4 x i32> @test_sqrdmlsh_v4i32(<4 x i32> %acc, <4 x i32> %mhs, <4 x i32> %rhs) {
+; CHECK-LABEL: test_sqrdmlsh_v4i32:
+   %prod = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %mhs, <4 x i32> %rhs)
+   %retval =  call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %acc, <4 x i32> %prod)
+; CHECK-V8a:        sqrdmulh    v1.4s, v1.4s, v2.4s
+; CHECK-V81a:       sqrdmlsh    v0.4s, v1.4s, v2.4s
+; CHECK-V81a-apple: sqrdmlsh.4s v0,    v1,    v2
+   ret <4 x i32> %retval
+}
+
+;-----------------------------------------------------------------------------
+; RDMA Vector, by element
+; tests for vXiYY_indexed in SIMDIndexedSQRDMLxHSDTied
+
+define <4 x i16> @test_sqrdmlah_lane_s16(<4 x i16> %acc, <4 x i16> %x, <4 x i16> %v) {
+; CHECK-LABEL: test_sqrdmlah_lane_s16:
+entry:
+  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+  %prod = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %x, <4 x i16> %shuffle)
+  %retval =  call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> %acc, <4 x i16> %prod)
+; CHECK-V8a :       sqrdmulh    v1.4h, v1.4h, v2.h[3]
+; CHECK-V81a:       sqrdmlah    v0.4h, v1.4h, v2.h[3]
+; CHECK-V81a-apple: sqrdmlah.4h v0,    v1,    v2[3]
+  ret <4 x i16> %retval
+}
+
+define <8 x i16> @test_sqrdmlahq_lane_s16(<8 x i16> %acc, <8 x i16> %x, <8 x i16> %v) {
+; CHECK-LABEL: test_sqrdmlahq_lane_s16:
+entry:
+  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
+  %prod = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %x, <8 x i16> %shuffle)
+  %retval =  call <8 x i16> @llvm.aarch64.neon.sqadd.v8i16(<8 x i16> %acc, <8 x i16> %prod)
+; CHECK-V8a:        sqrdmulh    v1.8h, v1.8h, v2.h[2]
+; CHECK-V81a:       sqrdmlah    v0.8h, v1.8h, v2.h[2]
+; CHECK-V81a-apple: sqrdmlah.8h v0,    v1,    v2[2]
+  ret <8 x i16> %retval
+}
+
+define <2 x i32> @test_sqrdmlah_lane_s32(<2 x i32> %acc, <2 x i32> %x, <2 x i32> %v) {
+; CHECK-LABEL: test_sqrdmlah_lane_s32:
+entry:
+  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
+  %prod = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %x, <2 x i32> %shuffle)
+  %retval =  call <2 x i32> @llvm.aarch64.neon.sqadd.v2i32(<2 x i32> %acc, <2 x i32> %prod)
+; CHECK-V8a:        sqrdmulh    v1.2s, v1.2s, v2.s[1]
+; CHECK-V81a:       sqrdmlah    v0.2s, v1.2s, v2.s[1]
+; CHECK-V81a-apple: sqrdmlah.2s v0,    v1,    v2[1]
+  ret <2 x i32> %retval
+}
+
+define <4 x i32> @test_sqrdmlahq_lane_s32(<4 x i32> %acc,<4 x i32> %x, <4 x i32> %v) {
+; CHECK-LABEL: test_sqrdmlahq_lane_s32:
+entry:
+  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer
+  %prod = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %x, <4 x i32> %shuffle)
+  %retval =  call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %acc, <4 x i32> %prod)
+; CHECK-V8a:        sqrdmulh    v1.4s, v1.4s, v2.s[0]
+; CHECK-V81a:       sqrdmlah    v0.4s, v1.4s, v2.s[0]
+; CHECK-V81a-apple: sqrdmlah.4s v0,    v1,    v2[0]
+  ret <4 x i32> %retval
+}
+
+define <4 x i16> @test_sqrdmlsh_lane_s16(<4 x i16> %acc, <4 x i16> %x, <4 x i16> %v) {
+; CHECK-LABEL: test_sqrdmlsh_lane_s16:
+entry:
+  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+  %prod = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %x, <4 x i16> %shuffle)
+  %retval =  call <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16> %acc, <4 x i16> %prod)
+; CHECK-V8a:        sqrdmulh    v1.4h, v1.4h, v2.h[3]
+; CHECK-V81a:       sqrdmlsh    v0.4h, v1.4h, v2.h[3]
+; CHECK-V81a-apple: sqrdmlsh.4h v0,    v1,    v2[3]
+  ret <4 x i16> %retval
+}
+
+define <8 x i16> @test_sqrdmlshq_lane_s16(<8 x i16> %acc, <8 x i16> %x, <8 x i16> %v) {
+; CHECK-LABEL: test_sqrdmlshq_lane_s16:
+entry:
+  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
+  %prod = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %x, <8 x i16> %shuffle)
+  %retval =  call <8 x i16> @llvm.aarch64.neon.sqsub.v8i16(<8 x i16> %acc, <8 x i16> %prod)
+; CHECK-V8a:        sqrdmulh    v1.8h, v1.8h, v2.h[2]
+; CHECK-V81a:       sqrdmlsh    v0.8h, v1.8h, v2.h[2]
+; CHECK-V81a-apple: sqrdmlsh.8h v0,    v1,    v2[2]
+  ret <8 x i16> %retval
+}
+
+define <2 x i32> @test_sqrdmlsh_lane_s32(<2 x i32> %acc, <2 x i32> %x, <2 x i32> %v) {
+; CHECK-LABEL: test_sqrdmlsh_lane_s32:
+entry:
+  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
+  %prod = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %x, <2 x i32> %shuffle)
+  %retval =  call <2 x i32> @llvm.aarch64.neon.sqsub.v2i32(<2 x i32> %acc, <2 x i32> %prod)
+; CHECK-V8a:        sqrdmulh    v1.2s, v1.2s, v2.s[1]
+; CHECK-V81a:       sqrdmlsh    v0.2s, v1.2s, v2.s[1]
+; CHECK-V81a-apple: sqrdmlsh.2s v0,    v1,    v2[1]
+  ret <2 x i32> %retval
+}
+
+define <4 x i32> @test_sqrdmlshq_lane_s32(<4 x i32> %acc,<4 x i32> %x, <4 x i32> %v) {
+; CHECK-LABEL: test_sqrdmlshq_lane_s32:
+entry:
+  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer
+  %prod = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %x, <4 x i32> %shuffle)
+  %retval =  call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %acc, <4 x i32> %prod)
+; CHECK-V8a:        sqrdmulh    v1.4s, v1.4s, v2.s[0]
+; CHECK-V81a:       sqrdmlsh    v0.4s, v1.4s, v2.s[0]
+; CHECK-V81a-apple: sqrdmlsh.4s v0,    v1,    v2[0]
+  ret <4 x i32> %retval
+}
+
+;-----------------------------------------------------------------------------
+; RDMA Vector, by element, extracted
+; i16 tests are for vXi16_indexed in SIMDIndexedSQRDMLxHSDTied, with IR in ACLE style
+; i32 tests are for   "def : Pat" in SIMDIndexedSQRDMLxHSDTied
+
+define i16 @test_sqrdmlah_extracted_lane_s16(i16 %acc,<4 x i16> %x, <4 x i16> %v) {
+; CHECK-LABEL: test_sqrdmlah_extracted_lane_s16:
+entry:
+  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 1,i32 1,i32 1,i32 1>
+  %prod = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %x, <4 x i16> %shuffle)
+  %acc_vec = insertelement <4 x i16> undef, i16 %acc, i64 0
+  %retval_vec =  call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> %acc_vec, <4 x i16> %prod)
+  %retval = extractelement <4 x i16> %retval_vec, i64 0
+; CHECK-V8a:        sqrdmulh    {{v[0-9]+}}.4h, v0.4h, v1.h[1]
+; CHECK-V81a:       sqrdmlah    {{v[2-9]+}}.4h, v0.4h, v1.h[1]
+; CHECK-V81a-apple: sqrdmlah.4h {{v[2-9]+}},    v0,    v1[1]
+  ret i16 %retval
+}
+
+define i16 @test_sqrdmlahq_extracted_lane_s16(i16 %acc,<8 x i16> %x, <8 x i16> %v) {
+; CHECK-LABEL: test_sqrdmlahq_extracted_lane_s16:
+entry:
+  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 1,i32 1,i32 1,i32 1, i32 1,i32 1,i32 1,i32 1>
+  %prod = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %x, <8 x i16> %shuffle)
+  %acc_vec = insertelement <8 x i16> undef, i16 %acc, i64 0
+  %retval_vec =  call <8 x i16> @llvm.aarch64.neon.sqadd.v8i16(<8 x i16> %acc_vec, <8 x i16> %prod)
+  %retval = extractelement <8 x i16> %retval_vec, i64 0
+; CHECK-V8a:        sqrdmulh    {{v[0-9]+}}.8h, v0.8h, v1.h[1]
+; CHECK-V81a:       sqrdmlah    {{v[2-9]+}}.8h, v0.8h, v1.h[1]
+; CHECK-V81a-apple: sqrdmlah.8h {{v[2-9]+}},    v0,    v1[1]
+  ret i16 %retval
+}
+
+define i32 @test_sqrdmlah_extracted_lane_s32(i32 %acc,<2 x i32> %x, <2 x i32> %v) {
+; CHECK-LABEL: test_sqrdmlah_extracted_lane_s32:
+entry:
+  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
+  %prod = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %x, <2 x i32> %shuffle)
+  %extract = extractelement <2 x i32> %prod, i64 0
+  %retval =  call i32 @llvm.aarch64.neon.sqadd.i32(i32 %acc, i32 %extract)
+; CHECK-V8a:        sqrdmulh    v0.2s, v0.2s, v1.s[0]
+; CHECK-V81a:       sqrdmlah    v2.2s, v0.2s, v1.s[0]
+; CHECK-V81a-apple: sqrdmlah.2s v2,    v0,    v1[0]
+  ret i32 %retval
+}
+
+define i32 @test_sqrdmlahq_extracted_lane_s32(i32 %acc,<4 x i32> %x, <4 x i32> %v) {
+; CHECK-LABEL: test_sqrdmlahq_extracted_lane_s32:
+entry:
+  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer
+  %prod = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %x, <4 x i32> %shuffle)
+  %extract = extractelement <4 x i32> %prod, i64 0
+  %retval =  call i32 @llvm.aarch64.neon.sqadd.i32(i32 %acc, i32 %extract)
+; CHECK-V8a:        sqrdmulh    v0.4s, v0.4s, v1.s[0]
+; CHECK-V81a:       sqrdmlah    v2.4s, v0.4s, v1.s[0]
+; CHECK-V81a-apple: sqrdmlah.4s v2,    v0,    v1[0]
+  ret i32 %retval
+}
+
+define i16 @test_sqrdmlsh_extracted_lane_s16(i16 %acc,<4 x i16> %x, <4 x i16> %v) {
+; CHECK-LABEL: test_sqrdmlsh_extracted_lane_s16:
+entry:
+  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 1,i32 1,i32 1,i32 1>
+  %prod = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %x, <4 x i16> %shuffle)
+  %acc_vec = insertelement <4 x i16> undef, i16 %acc, i64 0
+  %retval_vec =  call <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16> %acc_vec, <4 x i16> %prod)
+  %retval = extractelement <4 x i16> %retval_vec, i64 0
+; CHECK-V8a:        sqrdmulh    {{v[0-9]+}}.4h, v0.4h, v1.h[1]
+; CHECK-V81a:       sqrdmlsh    {{v[2-9]+}}.4h, v0.4h, v1.h[1]
+; CHECK-V81a-apple: sqrdmlsh.4h {{v[2-9]+}},    v0,    v1[1]
+  ret i16 %retval
+}
+
+define i16 @test_sqrdmlshq_extracted_lane_s16(i16 %acc,<8 x i16> %x, <8 x i16> %v) {
+; CHECK-LABEL: test_sqrdmlshq_extracted_lane_s16:
+entry:
+  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 1,i32 1,i32 1,i32 1, i32 1,i32 1,i32 1,i32 1>
+  %prod = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %x, <8 x i16> %shuffle)
+  %acc_vec = insertelement <8 x i16> undef, i16 %acc, i64 0
+  %retval_vec =  call <8 x i16> @llvm.aarch64.neon.sqsub.v8i16(<8 x i16> %acc_vec, <8 x i16> %prod)
+  %retval = extractelement <8 x i16> %retval_vec, i64 0
+; CHECK-V8a:        sqrdmulh    {{v[0-9]+}}.8h, v0.8h, v1.h[1]
+; CHECK-V81a:       sqrdmlsh    {{v[2-9]+}}.8h, v0.8h, v1.h[1]
+; CHECK-V81a-apple: sqrdmlsh.8h {{v[2-9]+}},    v0,    v1[1]
+  ret i16 %retval
+}
+
+define i32 @test_sqrdmlsh_extracted_lane_s32(i32 %acc,<2 x i32> %x, <2 x i32> %v) {
+; CHECK-LABEL: test_sqrdmlsh_extracted_lane_s32:
+entry:
+  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
+  %prod = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %x, <2 x i32> %shuffle)
+  %extract = extractelement <2 x i32> %prod, i64 0
+  %retval =  call i32 @llvm.aarch64.neon.sqsub.i32(i32 %acc, i32 %extract)
+; CHECK-V8a:        sqrdmulh    v0.2s, v0.2s, v1.s[0]
+; CHECK-V81a:       sqrdmlsh    v2.2s, v0.2s, v1.s[0]
+; CHECK-V81a-apple: sqrdmlsh.2s v2,    v0,    v1[0]
+  ret i32 %retval
+}
+
+define i32 @test_sqrdmlshq_extracted_lane_s32(i32 %acc,<4 x i32> %x, <4 x i32> %v) {
+; CHECK-LABEL: test_sqrdmlshq_extracted_lane_s32:
+entry:
+  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer
+  %prod = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %x, <4 x i32> %shuffle)
+  %extract = extractelement <4 x i32> %prod, i64 0
+  %retval =  call i32 @llvm.aarch64.neon.sqsub.i32(i32 %acc, i32 %extract)
+; CHECK-V8a:        sqrdmulh    v0.4s, v0.4s, v1.s[0]
+; CHECK-V81a:       sqrdmlsh    v2.4s, v0.4s, v1.s[0]
+; CHECK-V81a-apple: sqrdmlsh.4s v2,    v0,    v1[0]
+  ret i32 %retval
+}
+
+;-----------------------------------------------------------------------------
+; RDMA Scalar
+; test for "def : Pat" near SIMDThreeScalarHSTied in AArch64InstInfo.td
+
+define i16 @test_sqrdmlah_v1i16(i16 %acc, i16 %x, i16 %y) {
+; CHECK-LABEL: test_sqrdmlah_v1i16:
+  %x_vec = insertelement <4 x i16> undef, i16 %x, i64 0
+  %y_vec = insertelement <4 x i16> undef, i16 %y, i64 0
+  %prod_vec = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %x_vec,  <4 x i16> %y_vec)
+  %acc_vec = insertelement <4 x i16> undef, i16 %acc, i64 0
+  %retval_vec =  call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> %acc_vec,  <4 x i16> %prod_vec)
+  %retval = extractelement <4 x i16> %retval_vec, i64 0
+; CHECK-V8a:        sqrdmulh    {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+; CHECK-V81a:       sqrdmlah    {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+; CHECK-V81a-apple: sqrdmlah.4h {{v[0-9]+}},    {{v[0-9]+}},    {{v[0-9]+}}
+  ret i16 %retval
+}
+
+define i32 @test_sqrdmlah_v1i32(i32 %acc, i32 %x, i32 %y) {
+; CHECK-LABEL: test_sqrdmlah_v1i32:
+  %x_vec = insertelement <4 x i32> undef, i32 %x, i64 0
+  %y_vec = insertelement <4 x i32> undef, i32 %y, i64 0
+  %prod_vec = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %x_vec,  <4 x i32> %y_vec)
+  %acc_vec = insertelement <4 x i32> undef, i32 %acc, i64 0
+  %retval_vec =  call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %acc_vec,  <4 x i32> %prod_vec)
+  %retval = extractelement <4 x i32> %retval_vec, i64 0
+; CHECK-V8a:        sqrdmulh    {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+; CHECK-V81a:       sqrdmlah    {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+; CHECK-V81a-apple: sqrdmlah.4s {{v[0-9]+}},    {{v[0-9]+}},    {{v[0-9]+}}
+  ret i32 %retval
+}
+
+
+define i16 @test_sqrdmlsh_v1i16(i16 %acc, i16 %x, i16 %y) {
+; CHECK-LABEL: test_sqrdmlsh_v1i16:
+  %x_vec = insertelement <4 x i16> undef, i16 %x, i64 0
+  %y_vec = insertelement <4 x i16> undef, i16 %y, i64 0
+  %prod_vec = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %x_vec,  <4 x i16> %y_vec)
+  %acc_vec = insertelement <4 x i16> undef, i16 %acc, i64 0
+  %retval_vec =  call <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16> %acc_vec,  <4 x i16> %prod_vec)
+  %retval = extractelement <4 x i16> %retval_vec, i64 0
+; CHECK-V8a:        sqrdmulh    {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+; CHECK-V81a:       sqrdmlsh    {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+; CHECK-V81a-apple: sqrdmlsh.4h {{v[0-9]+}},    {{v[0-9]+}},    {{v[0-9]+}}
+  ret i16 %retval
+}
+
+define i32 @test_sqrdmlsh_v1i32(i32 %acc, i32 %x, i32 %y) {
+; CHECK-LABEL: test_sqrdmlsh_v1i32:
+  %x_vec = insertelement <4 x i32> undef, i32 %x, i64 0
+  %y_vec = insertelement <4 x i32> undef, i32 %y, i64 0
+  %prod_vec = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %x_vec,  <4 x i32> %y_vec)
+  %acc_vec = insertelement <4 x i32> undef, i32 %acc, i64 0
+  %retval_vec =  call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %acc_vec,  <4 x i32> %prod_vec)
+  %retval = extractelement <4 x i32> %retval_vec, i64 0
+; CHECK-V8a:        sqrdmulh    {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+; CHECK-V81a:       sqrdmlsh    {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+; CHECK-V81a-apple: sqrdmlsh.4s {{v[0-9]+}},    {{v[0-9]+}},    {{v[0-9]+}}
+  ret i32 %retval
+}
+define i32 @test_sqrdmlah_i32(i32 %acc, i32 %mhs, i32 %rhs) {
+; CHECK-LABEL: test_sqrdmlah_i32:
+  %prod = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %mhs,  i32 %rhs)
+  %retval =  call i32 @llvm.aarch64.neon.sqadd.i32(i32 %acc,  i32 %prod)
+; CHECK-V8a:        sqrdmulh {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
+; CHECK-V81a:       sqrdmlah {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
+; CHECK-V81a-apple: sqrdmlah {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
+  ret i32 %retval
+}
+
+define i32 @test_sqrdmlsh_i32(i32 %acc, i32 %mhs, i32 %rhs) {
+; CHECK-LABEL: test_sqrdmlsh_i32:
+  %prod = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %mhs,  i32 %rhs)
+  %retval =  call i32 @llvm.aarch64.neon.sqsub.i32(i32 %acc,  i32 %prod)
+; CHECK-V8a:        sqrdmulh {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
+; CHECK-V81a:       sqrdmlsh {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
+; CHECK-V81a-apple: sqrdmlsh {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
+  ret i32 %retval
+}
+
+;-----------------------------------------------------------------------------
+; RDMA Scalar, by element
+; i16 tests are performed via tests in above chapter, with IR in ACLE style
+; i32 tests are for i32_indexed in SIMDIndexedSQRDMLxHSDTied
+
+define i16 @test_sqrdmlah_extract_i16(i16 %acc, i16 %x, <4 x i16> %y_vec) {
+; CHECK-LABEL: test_sqrdmlah_extract_i16:
+  %shuffle = shufflevector <4 x i16> %y_vec, <4 x i16> undef, <4 x i32> <i32 1,i32 1,i32 1,i32 1>
+  %x_vec = insertelement <4 x i16> undef, i16 %x, i64 0
+  %prod = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %x_vec, <4 x i16> %shuffle)
+  %acc_vec = insertelement <4 x i16> undef, i16 %acc, i64 0
+  %retval_vec =  call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> %acc_vec, <4 x i16> %prod)
+  %retval = extractelement <4 x i16> %retval_vec, i32 0
+; CHECK-V8a:        sqrdmulh    {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, v0.h[1]
+; CHECK-V81a:       sqrdmlah    {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, v0.h[1]
+; CHECK-V81a-apple: sqrdmlah.4h {{v[0-9]+}},    {{v[0-9]+}}, v0[1]
+  ret i16 %retval
+}
+
+define i32 @test_sqrdmlah_extract_i32(i32 %acc, i32 %mhs, <4 x i32> %rhs) {
+; CHECK-LABEL: test_sqrdmlah_extract_i32:
+  %extract = extractelement <4 x i32> %rhs, i32 3
+  %prod = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %mhs,  i32 %extract)
+  %retval =  call i32 @llvm.aarch64.neon.sqadd.i32(i32 %acc,  i32 %prod)
+; CHECK-V8a:        sqrdmulh   {{s[0-9]+}}, {{s[0-9]+}}, v0.s[3]
+; CHECK-V81a:       sqrdmlah   {{s[0-9]+}}, {{s[0-9]+}}, v0.s[3]
+; CHECK-V81a-apple: sqrdmlah.s {{s[0-9]+}}, {{s[0-9]+}}, v0[3]
+  ret i32 %retval
+}
+
+define i16 @test_sqrdmlshq_extract_i16(i16 %acc, i16 %x, <8 x i16> %y_vec) {
+; CHECK-LABEL: test_sqrdmlshq_extract_i16:
+  %shuffle = shufflevector <8 x i16> %y_vec, <8 x i16> undef, <8 x i32> <i32 1,i32 1,i32 1,i32 1,i32 1,i32 1,i32 1,i32 1>
+  %x_vec = insertelement <8 x i16> undef, i16 %x, i64 0
+  %prod = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %x_vec, <8 x i16> %shuffle)
+  %acc_vec = insertelement <8 x i16> undef, i16 %acc, i64 0
+  %retval_vec =  call <8 x i16> @llvm.aarch64.neon.sqsub.v8i16(<8 x i16> %acc_vec, <8 x i16> %prod)
+  %retval = extractelement <8 x i16> %retval_vec, i32 0
+; CHECK-V8a:        sqrdmulh    {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, v0.h[1]
+; CHECK-V81a:       sqrdmlsh    {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, v0.h[1]
+; CHECK-V81a-apple: sqrdmlsh.8h {{v[0-9]+}},    {{v[0-9]+}}, v0[1]
+  ret i16 %retval
+}
+
+define i32 @test_sqrdmlsh_extract_i32(i32 %acc, i32 %mhs, <4 x i32> %rhs) {
+; CHECK-LABEL: test_sqrdmlsh_extract_i32:
+  %extract = extractelement <4 x i32> %rhs, i32 3
+  %prod = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %mhs,  i32 %extract)
+  %retval =  call i32 @llvm.aarch64.neon.sqsub.i32(i32 %acc,  i32 %prod)
+; CHECK-V8a:        sqrdmulh   {{s[0-9]+}}, {{s[0-9]+}}, v0.s[3]
+; CHECK-V81a:       sqrdmlsh   {{s[0-9]+}}, {{s[0-9]+}}, v0.s[3]
+; CHECK-V81a-apple: sqrdmlsh.s {{s[0-9]+}}, {{s[0-9]+}}, v0[3]
+  ret i32 %retval
+}
diff --git a/test/CodeGen/AArch64/arm64-patchpoint-scratch-regs.ll b/test/CodeGen/AArch64/arm64-patchpoint-scratch-regs.ll
index 5a740d8..2651f11 100644
--- a/test/CodeGen/AArch64/arm64-patchpoint-scratch-regs.ll
+++ b/test/CodeGen/AArch64/arm64-patchpoint-scratch-regs.ll
@@ -9,7 +9,7 @@
 define void @clobberScratch(i32* %p) {
   %v = load i32, i32* %p
   tail call void asm sideeffect "nop", "~{x0},~{x1},~{x2},~{x3},~{x4},~{x5},~{x6},~{x7},~{x8},~{x9},~{x10},~{x11},~{x12},~{x13},~{x14},~{x15},~{x18},~{x19},~{x20},~{x21},~{x22},~{x23},~{x24},~{x25},~{x26},~{x27},~{x28},~{x29},~{x30},~{x31}"() nounwind
-  tail call void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 5, i32 20, i8* null, i32 0, i32* %p, i32 %v)
+  tail call void (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.void(i64 5, i32 20, i8* null, i32 0, i32* %p, i32 %v)
   store i32 %v, i32* %p
   ret void
 }
diff --git a/test/CodeGen/AArch64/arm64-patchpoint-webkit_jscc.ll b/test/CodeGen/AArch64/arm64-patchpoint-webkit_jscc.ll
index 8f79f80..b8236c5 100644
--- a/test/CodeGen/AArch64/arm64-patchpoint-webkit_jscc.ll
+++ b/test/CodeGen/AArch64/arm64-patchpoint-webkit_jscc.ll
@@ -23,9 +23,9 @@ entry:
 ; FAST-NEXT:   movk  x16, #0xbeef
 ; FAST-NEXT:   blr x16
   %resolveCall2 = inttoptr i64 281474417671919 to i8*
-  %result = tail call webkit_jscc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 5, i32 20, i8* %resolveCall2, i32 2, i64 %p4, i64 %p2)
+  %result = tail call webkit_jscc i64 (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.i64(i64 5, i32 20, i8* %resolveCall2, i32 2, i64 %p4, i64 %p2)
   %resolveCall3 = inttoptr i64 244837814038255 to i8*
-  tail call webkit_jscc void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 6, i32 20, i8* %resolveCall3, i32 2, i64 %p4, i64 %result)
+  tail call webkit_jscc void (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.void(i64 6, i32 20, i8* %resolveCall3, i32 2, i64 %p4, i64 %result)
   ret void
 }
 
@@ -59,7 +59,7 @@ entry:
 ; FAST-NEXT:   movk  x16, #0xbeef
 ; FAST-NEXT:   blr x16
   %call = inttoptr i64 281474417671919 to i8*
-  %result = call webkit_jscc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 7, i32 20, i8* %call, i32 6, i64 %callee, i64 2, i64 undef, i32 4, i32 undef, i64 6)
+  %result = call webkit_jscc i64 (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.i64(i64 7, i32 20, i8* %call, i32 6, i64 %callee, i64 2, i64 undef, i32 4, i32 undef, i64 6)
   ret i64 %result
 }
 
@@ -101,7 +101,7 @@ entry:
 ; FAST-NEXT:   movk  x16, #0xbeef
 ; FAST-NEXT:   blr x16
   %call = inttoptr i64 281474417671919 to i8*
-  %result = call webkit_jscc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 7, i32 20, i8* %call, i32 10, i64 %callee, i64 2, i64 undef, i32 4, i32 undef, i64 6, i32 undef, i32 8, i32 undef, i64 10)
+  %result = call webkit_jscc i64 (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.i64(i64 7, i32 20, i8* %call, i32 10, i64 %callee, i64 2, i64 undef, i32 4, i32 undef, i64 6, i32 undef, i32 8, i32 undef, i64 10)
   ret i64 %result
 }
 
diff --git a/test/CodeGen/AArch64/arm64-patchpoint.ll b/test/CodeGen/AArch64/arm64-patchpoint.ll
index cf06653..d9ec7e5 100644
--- a/test/CodeGen/AArch64/arm64-patchpoint.ll
+++ b/test/CodeGen/AArch64/arm64-patchpoint.ll
@@ -16,9 +16,9 @@ entry:
 ; CHECK-NEXT:  blr  x16
 ; CHECK:       ret
   %resolveCall2 = inttoptr i64 244837814094590 to i8*
-  %result = tail call i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 2, i32 20, i8* %resolveCall2, i32 4, i64 %p1, i64 %p2, i64 %p3, i64 %p4)
+  %result = tail call i64 (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.i64(i64 2, i32 20, i8* %resolveCall2, i32 4, i64 %p1, i64 %p2, i64 %p3, i64 %p4)
   %resolveCall3 = inttoptr i64 244837814094591 to i8*
-  tail call void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 3, i32 20, i8* %resolveCall3, i32 2, i64 %p1, i64 %result)
+  tail call void (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.void(i64 3, i32 20, i8* %resolveCall3, i32 2, i64 %p1, i64 %result)
   ret i64 %result
 }
 
@@ -38,7 +38,7 @@ entry:
   store i64 11, i64* %metadata
   store i64 12, i64* %metadata
   store i64 13, i64* %metadata
-  call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 4, i32 0, i64* %metadata)
+  call void (i64, i32, ...) @llvm.experimental.stackmap(i64 4, i32 0, i64* %metadata)
   ret void
 }
 
@@ -51,14 +51,14 @@ entry:
   %tmp80 = add i64 %tmp79, -16
   %tmp81 = inttoptr i64 %tmp80 to i64*
   %tmp82 = load i64, i64* %tmp81, align 8
-  tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 14, i32 8, i64 %arg, i64 %tmp2, i64 %tmp10, i64 %tmp82)
-  tail call void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 15, i32 32, i8* null, i32 3, i64 %arg, i64 %tmp10, i64 %tmp82)
+  tail call void (i64, i32, ...) @llvm.experimental.stackmap(i64 14, i32 8, i64 %arg, i64 %tmp2, i64 %tmp10, i64 %tmp82)
+  tail call void (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.void(i64 15, i32 32, i8* null, i32 3, i64 %arg, i64 %tmp10, i64 %tmp82)
   %tmp83 = load i64, i64* %tmp33, align 8
   %tmp84 = add i64 %tmp83, -24
   %tmp85 = inttoptr i64 %tmp84 to i64*
   %tmp86 = load i64, i64* %tmp85, align 8
-  tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 17, i32 8, i64 %arg, i64 %tmp10, i64 %tmp86)
-  tail call void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 18, i32 32, i8* null, i32 3, i64 %arg, i64 %tmp10, i64 %tmp86)
+  tail call void (i64, i32, ...) @llvm.experimental.stackmap(i64 17, i32 8, i64 %arg, i64 %tmp10, i64 %tmp86)
+  tail call void (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.void(i64 18, i32 32, i8* null, i32 3, i64 %arg, i64 %tmp10, i64 %tmp86)
   ret i64 10
 }
 
@@ -74,7 +74,7 @@ entry:
 ; CHECK-NEXT: nop
 ; CHECK-NEXT: ldp
 ; CHECK-NEXT: ret
-  %result = tail call i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 5, i32 20, i8* null, i32 2, i64 %p1, i64 %p2)
+  %result = tail call i64 (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.i64(i64 5, i32 20, i8* null, i32 2, i64 %p1, i64 %p2)
   ret void
 }
 
diff --git a/test/CodeGen/AArch64/arm64-stackmap-nops.ll b/test/CodeGen/AArch64/arm64-stackmap-nops.ll
index 5915b64..2647ac4 100644
--- a/test/CodeGen/AArch64/arm64-stackmap-nops.ll
+++ b/test/CodeGen/AArch64/arm64-stackmap-nops.ll
@@ -8,7 +8,7 @@ entry:
 ; CHECK:      nop
 ; CHECK-NEXT: nop
 ; CHECK-NOT:  nop
-  tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64  0, i32  16)
+  tail call void (i64, i32, ...) @llvm.experimental.stackmap(i64  0, i32  16)
   ret void
 }
 
diff --git a/test/CodeGen/AArch64/arm64-stackmap.ll b/test/CodeGen/AArch64/arm64-stackmap.ll
index 29e4484..1a4df7a 100644
--- a/test/CodeGen/AArch64/arm64-stackmap.ll
+++ b/test/CodeGen/AArch64/arm64-stackmap.ll
@@ -78,7 +78,7 @@ target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
 define void @constantargs() {
 entry:
   %0 = inttoptr i64 244837814094590 to i8*
-  tail call void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 1, i32 20, i8* %0, i32 0, i64 65535, i64 65536, i64 4294967295, i64 4294967296)
+  tail call void (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.void(i64 1, i32 20, i8* %0, i32 0, i64 65535, i64 65536, i64 4294967295, i64 4294967296)
   ret void
 }
 
@@ -100,7 +100,7 @@ entry:
   ; Runtime void->void call.
   call void inttoptr (i64 244837814094590 to void ()*)()
   ; Followed by inline OSR patchpoint with 12-byte shadow and 2 live vars.
-  call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 3, i32 12, i64 %a, i64 %b)
+  call void (i64, i32, ...) @llvm.experimental.stackmap(i64 3, i32 12, i64 %a, i64 %b)
   ret void
 }
 
@@ -126,7 +126,7 @@ entry:
 cold:
   ; OSR patchpoint with 12-byte nop-slide and 2 live vars.
   %thunk = inttoptr i64 244837814094590 to i8*
-  call void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 4, i32 20, i8* %thunk, i32 0, i64 %a, i64 %b)
+  call void (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.void(i64 4, i32 20, i8* %thunk, i32 0, i64 %a, i64 %b)
   unreachable
 ret:
   ret void
@@ -142,7 +142,7 @@ ret:
 define i64 @propertyRead(i64* %obj) {
 entry:
   %resolveRead = inttoptr i64 244837814094590 to i8*
-  %result = call i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 5, i32 20, i8* %resolveRead, i32 1, i64* %obj)
+  %result = call i64 (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.i64(i64 5, i32 20, i8* %resolveRead, i32 1, i64* %obj)
   %add = add i64 %result, 3
   ret i64 %add
 }
@@ -162,7 +162,7 @@ entry:
 define void @propertyWrite(i64 %dummy1, i64* %obj, i64 %dummy2, i64 %a) {
 entry:
   %resolveWrite = inttoptr i64 244837814094590 to i8*
-  call anyregcc void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 6, i32 20, i8* %resolveWrite, i32 2, i64* %obj, i64 %a)
+  call anyregcc void (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.void(i64 6, i32 20, i8* %resolveWrite, i32 2, i64* %obj, i64 %a)
   ret void
 }
 
@@ -184,7 +184,7 @@ entry:
 define void @jsVoidCall(i64 %dummy1, i64* %obj, i64 %arg, i64 %l1, i64 %l2) {
 entry:
   %resolveCall = inttoptr i64 244837814094590 to i8*
-  call void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 7, i32 20, i8* %resolveCall, i32 2, i64* %obj, i64 %arg, i64 %l1, i64 %l2)
+  call void (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.void(i64 7, i32 20, i8* %resolveCall, i32 2, i64* %obj, i64 %arg, i64 %l1, i64 %l2)
   ret void
 }
 
@@ -206,7 +206,7 @@ entry:
 define i64 @jsIntCall(i64 %dummy1, i64* %obj, i64 %arg, i64 %l1, i64 %l2) {
 entry:
   %resolveCall = inttoptr i64 244837814094590 to i8*
-  %result = call i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 8, i32 20, i8* %resolveCall, i32 2, i64* %obj, i64 %arg, i64 %l1, i64 %l2)
+  %result = call i64 (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.i64(i64 8, i32 20, i8* %resolveCall, i32 2, i64* %obj, i64 %arg, i64 %l1, i64 %l2)
   %add = add i64 %result, 3
   ret i64 %add
 }
@@ -226,7 +226,7 @@ entry:
 ; CHECK-NEXT:   .short 29
 define void @spilledValue(i64 %arg0, i64 %arg1, i64 %arg2, i64 %arg3, i64 %arg4, i64 %l0, i64 %l1, i64 %l2, i64 %l3, i64 %l4, i64 %l5, i64 %l6, i64 %l7, i64 %l8, i64 %l9, i64 %l10, i64 %l11, i64 %l12, i64 %l13, i64 %l14, i64 %l15, i64 %l16, i64 %l17, i64 %l18, i64 %l19, i64 %l20, i64 %l21, i64 %l22, i64 %l23, i64 %l24, i64 %l25, i64 %l26, i64 %l27) {
 entry:
-  call void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 11, i32 20, i8* null, i32 5, i64 %arg0, i64 %arg1, i64 %arg2, i64 %arg3, i64 %arg4, i64 %l0, i64 %l1, i64 %l2, i64 %l3, i64 %l4, i64 %l5, i64 %l6, i64 %l7, i64 %l8, i64 %l9, i64 %l10, i64 %l11, i64 %l12, i64 %l13, i64 %l14, i64 %l15, i64 %l16, i64 %l17, i64 %l18, i64 %l19, i64 %l20, i64 %l21, i64 %l22, i64 %l23, i64 %l24, i64 %l25, i64 %l26, i64 %l27)
+  call void (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.void(i64 11, i32 20, i8* null, i32 5, i64 %arg0, i64 %arg1, i64 %arg2, i64 %arg3, i64 %arg4, i64 %l0, i64 %l1, i64 %l2, i64 %l3, i64 %l4, i64 %l5, i64 %l6, i64 %l7, i64 %l8, i64 %l9, i64 %l10, i64 %l11, i64 %l12, i64 %l13, i64 %l14, i64 %l15, i64 %l16, i64 %l17, i64 %l18, i64 %l19, i64 %l20, i64 %l21, i64 %l22, i64 %l23, i64 %l24, i64 %l25, i64 %l26, i64 %l27)
   ret void
 }
 
@@ -245,7 +245,7 @@ entry:
 ; CHECK-NEXT:   .short 29
 define webkit_jscc void @spilledStackMapValue(i64 %l0, i64 %l1, i64 %l2, i64 %l3, i64 %l4, i64 %l5, i64 %l6, i64 %l7, i64 %l8, i64 %l9, i64 %l10, i64 %l11, i64 %l12, i64 %l13, i64 %l14, i64 %l15, i64 %l16, i64 %l17, i64 %l18, i64 %l19, i64 %l20, i64 %l21, i64 %l22, i64 %l23, i64 %l24, i64 %l25, i64 %l26, i64 %l27, i64 %l28, i64 %l29) {
 entry:
-  call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 12, i32 16, i64 %l0, i64 %l1, i64 %l2, i64 %l3, i64 %l4, i64 %l5, i64 %l6, i64 %l7, i64 %l8, i64 %l9, i64 %l10, i64 %l11, i64 %l12, i64 %l13, i64 %l14, i64 %l15, i64 %l16, i64 %l17, i64 %l18, i64 %l19, i64 %l20, i64 %l21, i64 %l22, i64 %l23, i64 %l24, i64 %l25, i64 %l26, i64 %l27, i64 %l28, i64 %l29)
+  call void (i64, i32, ...) @llvm.experimental.stackmap(i64 12, i32 16, i64 %l0, i64 %l1, i64 %l2, i64 %l3, i64 %l4, i64 %l5, i64 %l6, i64 %l7, i64 %l8, i64 %l9, i64 %l10, i64 %l11, i64 %l12, i64 %l13, i64 %l14, i64 %l15, i64 %l16, i64 %l17, i64 %l18, i64 %l19, i64 %l20, i64 %l21, i64 %l22, i64 %l23, i64 %l24, i64 %l25, i64 %l26, i64 %l27, i64 %l28, i64 %l29)
   ret void
 }
 
@@ -263,7 +263,7 @@ entry:
 ; CHECK-NEXT:   .long   33
 
 define void @liveConstant() {
-  tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 15, i32 8, i32 33)
+  tail call void (i64, i32, ...) @llvm.experimental.stackmap(i64 15, i32 8, i32 33)
   ret void
 }
 
@@ -280,7 +280,7 @@ define void @liveConstant() {
 ; CHECK-NEXT:   .long   -{{[0-9]+}}
 define void @clobberLR(i32 %a) {
   tail call void asm sideeffect "nop", "~{x0},~{x1},~{x2},~{x3},~{x4},~{x5},~{x6},~{x7},~{x8},~{x9},~{x10},~{x11},~{x12},~{x13},~{x14},~{x15},~{x16},~{x17},~{x18},~{x19},~{x20},~{x21},~{x22},~{x23},~{x24},~{x25},~{x26},~{x27},~{x28},~{x29},~{x31}"() nounwind
-  tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 16, i32 8, i32 %a)
+  tail call void (i64, i32, ...) @llvm.experimental.stackmap(i64 16, i32 8, i32 %a)
   ret void
 }
 
diff --git a/test/CodeGen/AArch64/arm64-vshuffle.ll b/test/CodeGen/AArch64/arm64-vshuffle.ll
index 75e0d80..15ea21b 100644
--- a/test/CodeGen/AArch64/arm64-vshuffle.ll
+++ b/test/CodeGen/AArch64/arm64-vshuffle.ll
@@ -1,22 +1,8 @@
 ; RUN: llc < %s -mtriple=arm64-apple-ios7.0 -mcpu=cyclone | FileCheck %s
 
 
-; The mask:
-; CHECK: lCPI0_0:
-; CHECK:  .byte   2                       ; 0x2
-; CHECK:  .byte   255                     ; 0xff
-; CHECK:  .byte   6                       ; 0x6
-; CHECK:  .byte   255                     ; 0xff
-; The second vector is legalized to undef and the elements of the first vector
-; are used instead.
-; CHECK:  .byte   2                       ; 0x2
-; CHECK:  .byte   4                       ; 0x4
-; CHECK:  .byte   6                       ; 0x6
-; CHECK:  .byte   0                       ; 0x0
 ; CHECK: test1
-; CHECK: ldr d[[REG0:[0-9]+]], [{{.*}}, lCPI0_0
-; CHECK: movi.8h v[[REG1:[0-9]+]], #0x1, lsl #8
-; CHECK: tbl.8b  v{{[0-9]+}}, { v[[REG1]] }, v[[REG0]]
+; CHECK: movi d[[REG0:[0-9]+]], #0000000000000000
 define <8 x i1> @test1() {
 entry:
   %Shuff = shufflevector <8 x i1> <i1 0, i1 1, i1 2, i1 3, i1 4, i1 5, i1 6,
@@ -30,18 +16,16 @@ entry:
 
 ; CHECK: lCPI1_0:
 ; CHECK:          .byte   0                       ; 0x0
-; CHECK:          .byte   255                     ; 0xff
-; CHECK:          .byte   2                       ; 0x2
-; CHECK:          .byte   255                     ; 0xff
-; CHECK:          .byte   10                      ; 0xa
-; CHECK:          .byte   12                      ; 0xc
-; CHECK:          .byte   14                      ; 0xe
-; CHECK:          .byte   7                       ; 0x7
+; CHECK:          .byte   0                       ; 0x0
+; CHECK:          .byte   0                       ; 0x0
+; CHECK:          .byte   0                       ; 0x0
+; CHECK:          .byte   1                       ; 0x1
+; CHECK:          .byte   0                       ; 0x0
+; CHECK:          .byte   0                       ; 0x0
+; CHECK:          .byte   0                       ; 0x0
 ; CHECK: test2
-; CHECK: ldr     d[[REG0:[0-9]+]], [{{.*}}, lCPI1_0@PAGEOFF]
-; CHECK: adrp    x[[REG2:[0-9]+]], lCPI1_1@PAGE
-; CHECK: ldr     q[[REG1:[0-9]+]], [x[[REG2]], lCPI1_1@PAGEOFF]
-; CHECK: tbl.8b  v{{[0-9]+}}, { v[[REG1]] }, v[[REG0]]
+; CHECK: adrp    x[[REG2:[0-9]+]], lCPI1_0@PAGE
+; CHECK: ldr     d[[REG1:[0-9]+]], [x[[REG2]], lCPI1_0@PAGEOFF]
 define <8 x i1>@test2() {
 bb:
   %Shuff = shufflevector <8 x i1> zeroinitializer,
@@ -51,28 +35,8 @@ bb:
   ret <8 x i1> %Shuff
 }
 
-; CHECK: lCPI2_0:
-; CHECK:         .byte   2                       ; 0x2
-; CHECK:         .byte   255                     ; 0xff
-; CHECK:         .byte   6                       ; 0x6
-; CHECK:         .byte   255                     ; 0xff
-; CHECK:         .byte   10                      ; 0xa
-; CHECK:         .byte   12                      ; 0xc
-; CHECK:         .byte   14                      ; 0xe
-; CHECK:         .byte   0                       ; 0x0
-; CHECK:         .byte   2                       ; 0x2
-; CHECK:         .byte   255                     ; 0xff
-; CHECK:         .byte   6                       ; 0x6
-; CHECK:         .byte   255                     ; 0xff
-; CHECK:         .byte   10                      ; 0xa
-; CHECK:         .byte   12                      ; 0xc
-; CHECK:         .byte   14                      ; 0xe
-; CHECK:         .byte   0                       ; 0x0
 ; CHECK: test3
-; CHECK: adrp    x[[REG3:[0-9]+]], lCPI2_0@PAGE
-; CHECK: ldr     q[[REG0:[0-9]+]], [x[[REG3]], lCPI2_0@PAGEOFF]
-; CHECK: ldr     q[[REG1:[0-9]+]], [x[[REG3]], lCPI2_1@PAGEOFF]
-; CHECK: tbl.16b v{{[0-9]+}}, { v[[REG1]] }, v[[REG0]]
+; CHECK: movi.4s v{{[0-9]+}}, #0x1
 define <16 x i1> @test3(i1* %ptr, i32 %v) {
 bb:
   %Shuff = shufflevector <16 x i1> <i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0>, <16 x i1> undef,
@@ -81,29 +45,26 @@ bb:
                  i32 14, i32 0>
   ret <16 x i1> %Shuff
 }
-; CHECK: lCPI3_1:
+; CHECK: lCPI3_0:
+; CHECK:         .byte   0                       ; 0x0
+; CHECK:         .byte   0                       ; 0x0
 ; CHECK:         .byte   0                       ; 0x0
 ; CHECK:         .byte   1                       ; 0x1
-; CHECK:         .byte   2                       ; 0x2
-; CHECK:         .byte   18                      ; 0x12
-; CHECK:         .byte   4                       ; 0x4
-; CHECK:         .byte   5                       ; 0x5
-; CHECK:         .byte   6                       ; 0x6
-; CHECK:         .byte   7                       ; 0x7
-; CHECK:         .byte   8                       ; 0x8
-; CHECK:         .byte   31                      ; 0x1f
-; CHECK:         .byte   10                      ; 0xa
-; CHECK:         .byte   30                      ; 0x1e
-; CHECK:         .byte   12                      ; 0xc
-; CHECK:         .byte   13                      ; 0xd
-; CHECK:         .byte   14                      ; 0xe
-; CHECK:         .byte   15                      ; 0xf
+; CHECK:         .byte   0                       ; 0x0
+; CHECK:         .byte   0                       ; 0x0
+; CHECK:         .byte   0                       ; 0x0
+; CHECK:         .byte   0                       ; 0x0
+; CHECK:         .byte   0                       ; 0x0
+; CHECK:         .byte   0                       ; 0x0
+; CHECK:         .byte   0                       ; 0x0
+; CHECK:         .byte   0                       ; 0x0
+; CHECK:         .byte   0                       ; 0x0
+; CHECK:         .byte   0                       ; 0x0
+; CHECK:         .byte   0                       ; 0x0
+; CHECK:         .byte   0                       ; 0x0
 ; CHECK: _test4:
-; CHECK:         ldr     q[[REG1:[0-9]+]]
-; CHECK:         movi.2d v[[REG0:[0-9]+]], #0000000000000000
-; CHECK:         adrp    x[[REG3:[0-9]+]], lCPI3_1@PAGE
-; CHECK:         ldr     q[[REG2:[0-9]+]], [x[[REG3]], lCPI3_1@PAGEOFF]
-; CHECK:         tbl.16b v{{[0-9]+}}, { v[[REG0]], v[[REG1]] }, v[[REG2]]
+; CHECK:         adrp    x[[REG3:[0-9]+]], lCPI3_0@PAGE
+; CHECK:         ldr     q[[REG2:[0-9]+]], [x[[REG3]], lCPI3_0@PAGEOFF]
 define <16 x i1> @test4(i1* %ptr, i32 %v) {
 bb:
   %Shuff = shufflevector <16 x i1> zeroinitializer,
diff --git a/test/CodeGen/AArch64/bitcast.ll b/test/CodeGen/AArch64/bitcast.ll
new file mode 100644
index 0000000..e88ea9e
--- /dev/null
+++ b/test/CodeGen/AArch64/bitcast.ll
@@ -0,0 +1,27 @@
+; RUN: llc < %s -mtriple=aarch64--linux-gnu | FileCheck %s
+
+; PR23065: SCALAR_TO_VECTOR implies the top elements 1 to N-1 of the N-element vector are undefined.
+
+define <4 x i16> @foo1(<2 x i32> %a) {
+; CHECK-LABEL: foo1:
+; CHECK:       movi	d0, #0000000000000000
+; CHECK-NEXT:  ret
+
+  %1 = shufflevector <2 x i32> <i32 58712, i32 undef>, <2 x i32> %a, <2 x i32> <i32 0, i32 2>
+; Can't optimize the following bitcast to scalar_to_vector.
+  %2 = bitcast <2 x i32> %1 to <4 x i16>
+  %3 = shufflevector <4 x i16> %2, <4 x i16> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+  ret <4 x i16> %3
+}
+
+define <4 x i16> @foo2(<2 x i32> %a) {
+; CHECK-LABEL: foo2:
+; CHECK:       movi	d0, #0000000000000000
+; CHECK-NEXT:  ret
+
+  %1 = shufflevector <2 x i32> <i32 712, i32 undef>, <2 x i32> %a, <2 x i32> <i32 0, i32 2>
+; Can't optimize the following bitcast to scalar_to_vector.
+  %2 = bitcast <2 x i32> %1 to <4 x i16>
+  %3 = shufflevector <4 x i16> %2, <4 x i16> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+  ret <4 x i16> %3
+}
diff --git a/test/CodeGen/AArch64/br-to-eh-lpad.ll b/test/CodeGen/AArch64/br-to-eh-lpad.ll
index e948b87..f304ba4 100644
--- a/test/CodeGen/AArch64/br-to-eh-lpad.ll
+++ b/test/CodeGen/AArch64/br-to-eh-lpad.ll
@@ -30,12 +30,12 @@ invoke.cont7:
   unreachable
 
 if.end50.thread:
-  tail call void (i8*, ...)* @printf(i8* getelementptr inbounds ([17 x i8], [17 x i8]* @.str1, i64 0, i64 0), i32 125)
-  tail call void (i8*, ...)* @printf(i8* getelementptr inbounds ([17 x i8], [17 x i8]* @.str1, i64 0, i64 0), i32 128)
+  tail call void (i8*, ...) @printf(i8* getelementptr inbounds ([17 x i8], [17 x i8]* @.str1, i64 0, i64 0), i32 125)
+  tail call void (i8*, ...) @printf(i8* getelementptr inbounds ([17 x i8], [17 x i8]* @.str1, i64 0, i64 0), i32 128)
   unreachable
 
 invoke.cont33:
-  tail call void (i8*, ...)* @printf(i8* getelementptr inbounds ([17 x i8], [17 x i8]* @.str1, i64 0, i64 0), i32 119)
+  tail call void (i8*, ...) @printf(i8* getelementptr inbounds ([17 x i8], [17 x i8]* @.str1, i64 0, i64 0), i32 119)
   unreachable
 
 invoke.cont41:
@@ -51,7 +51,7 @@ lpad40:
   br label %finally.catchall
 
 finally.catchall:
-  tail call void (i8*, ...)* @printf(i8* getelementptr inbounds ([17 x i8], [17 x i8]* @.str1, i64 0, i64 0), i32 125)
+  tail call void (i8*, ...) @printf(i8* getelementptr inbounds ([17 x i8], [17 x i8]* @.str1, i64 0, i64 0), i32 125)
   unreachable
 }
 
diff --git a/test/CodeGen/AArch64/concat_vector-scalar-combine.ll b/test/CodeGen/AArch64/concat_vector-scalar-combine.ll
new file mode 100644
index 0000000..1c64af6
--- /dev/null
+++ b/test/CodeGen/AArch64/concat_vector-scalar-combine.ll
@@ -0,0 +1,125 @@
+; RUN: llc < %s -mtriple aarch64-unknown-unknown -aarch64-neon-syntax=apple -asm-verbose=false | FileCheck %s
+
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+
+; Test the (concat_vectors (bitcast (scalar)), ..) pattern.
+
+define <8 x i8> @test_concat_scalar_v2i8_to_v8i8_dup(i32 %x) #0 {
+entry:
+; CHECK-LABEL: test_concat_scalar_v2i8_to_v8i8_dup:
+; CHECK-NEXT: dup.4h v0, w0
+; CHECK-NEXT: ret
+  %t = trunc i32 %x to i16
+  %0 = bitcast i16 %t to <2 x i8>
+  %1 = shufflevector <2 x i8> %0, <2 x i8> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
+  ret <8 x i8> %1
+}
+
+define <8 x i8> @test_concat_scalar_v4i8_to_v8i8_dup(i32 %x) #0 {
+entry:
+; CHECK-LABEL: test_concat_scalar_v4i8_to_v8i8_dup:
+; CHECK-NEXT: dup.2s v0, w0
+; CHECK-NEXT: ret
+  %0 = bitcast i32 %x to <4 x i8>
+  %1 = shufflevector <4 x i8> %0, <4 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+  ret <8 x i8> %1
+}
+
+define <8 x i16> @test_concat_scalar_v2i16_to_v8i16_dup(i32 %x) #0 {
+entry:
+; CHECK-LABEL: test_concat_scalar_v2i16_to_v8i16_dup:
+; CHECK-NEXT: dup.4s v0, w0
+; CHECK-NEXT: ret
+  %0 = bitcast i32 %x to <2 x i16>
+  %1 = shufflevector <2 x i16> %0, <2 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 2, i32 0, i32 1, i32 0, i32 1>
+  ret <8 x i16> %1
+}
+
+define <8 x i8> @test_concat_scalars_2x_v2i8_to_v8i8(i32 %x, i32 %y) #0 {
+entry:
+; CHECK-LABEL: test_concat_scalars_2x_v2i8_to_v8i8:
+; CHECK-NEXT: ins.h v0[0], w0
+; CHECK-NEXT: ins.h v0[1], w1
+; CHECK-NEXT: ins.h v0[3], w1
+; CHECK-NEXT: ret
+  %tx = trunc i32 %x to i16
+  %ty = trunc i32 %y to i16
+  %bx = bitcast i16 %tx to <2 x i8>
+  %by = bitcast i16 %ty to <2 x i8>
+  %r = shufflevector <2 x i8> %bx, <2 x i8> %by, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 2, i32 3>
+  ret <8 x i8> %r
+}
+
+define <8 x i8> @test_concat_scalars_2x_v4i8_to_v8i8_dup(i32 %x, i32 %y) #0 {
+entry:
+; CHECK-LABEL: test_concat_scalars_2x_v4i8_to_v8i8_dup:
+; CHECK-NEXT: fmov s0, w1
+; CHECK-NEXT: ins.s v0[1], w0
+; CHECK-NEXT: ret
+  %bx = bitcast i32 %x to <4 x i8>
+  %by = bitcast i32 %y to <4 x i8>
+  %r = shufflevector <4 x i8> %bx, <4 x i8> %by, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
+  ret <8 x i8> %r
+}
+
+define <8 x i16> @test_concat_scalars_2x_v2i16_to_v8i16_dup(i32 %x, i32 %y) #0 {
+entry:
+; CHECK-LABEL: test_concat_scalars_2x_v2i16_to_v8i16_dup:
+; CHECK-NEXT: fmov s0, w0
+; CHECK-NEXT: ins.s v0[1], w1
+; CHECK-NEXT: ins.s v0[2], w1
+; CHECK-NEXT: ins.s v0[3], w0
+; CHECK-NEXT: ret
+  %bx = bitcast i32 %x to <2 x i16>
+  %by = bitcast i32 %y to <2 x i16>
+  %r = shufflevector <2 x i16> %bx, <2 x i16> %by, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 0, i32 1>
+  ret <8 x i16> %r
+}
+
+; Also make sure we minimize bitcasts.
+
+; This is a pretty artificial testcase: make sure we bitcast to floating-point
+; if any of the scalars is floating-point.
+define <8 x i8> @test_concat_scalars_mixed_2x_v2i8_to_v8i8(float %dummy, i32 %x, half %y) #0 {
+entry:
+; CHECK-LABEL: test_concat_scalars_mixed_2x_v2i8_to_v8i8:
+; CHECK-NEXT: fmov s[[X:[0-9]+]], w0
+; CHECK-NEXT: ins.h v0[0], v[[X]][0]
+; CHECK-NEXT: ins.h v0[1], v1[0]
+; CHECK-NEXT: ins.h v0[2], v[[X]][0]
+; CHECK-NEXT: ins.h v0[3], v1[0]
+; CHECK-NEXT: ret
+  %t = trunc i32 %x to i16
+  %0 = bitcast i16 %t to <2 x i8>
+  %y0 = bitcast half %y to <2 x i8>
+  %1 = shufflevector <2 x i8> %0, <2 x i8> %y0, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+  ret <8 x i8> %1
+}
+
+define <2 x float> @test_concat_scalars_fp_2x_v2i8_to_v8i8(float %dummy, half %x, half %y) #0 {
+entry:
+; CHECK-LABEL: test_concat_scalars_fp_2x_v2i8_to_v8i8:
+; CHECK-NEXT: ins.h v0[0], v1[0]
+; CHECK-NEXT: ins.h v0[1], v2[0]
+; CHECK-NEXT: ins.h v0[2], v1[0]
+; CHECK-NEXT: ins.h v0[3], v2[0]
+; CHECK-NEXT: ret
+  %0 = bitcast half %x to <2 x i8>
+  %y0 = bitcast half %y to <2 x i8>
+  %1 = shufflevector <2 x i8> %0, <2 x i8> %y0, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+  %2 = bitcast <8 x i8> %1 to <2 x float>
+  ret <2 x float> %2
+}
+
+define <4 x float> @test_concat_scalar_fp_v2i16_to_v16i8_dup(float %x) #0 {
+entry:
+; CHECK-LABEL: test_concat_scalar_fp_v2i16_to_v16i8_dup:
+; CHECK-NEXT: dup.4s v0, v0[0]
+; CHECK-NEXT: ret
+  %0 = bitcast float %x to <2 x i16>
+  %1 = shufflevector <2 x i16> %0, <2 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 2, i32 0, i32 1, i32 0, i32 1>
+  %2 = bitcast <8 x i16> %1 to <4 x float>
+  ret <4 x float> %2
+}
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/AArch64/concat_vector-truncate-combine.ll b/test/CodeGen/AArch64/concat_vector-truncate-combine.ll
index c510e27..ee52786 100644
--- a/test/CodeGen/AArch64/concat_vector-truncate-combine.ll
+++ b/test/CodeGen/AArch64/concat_vector-truncate-combine.ll
@@ -2,6 +2,8 @@
 
 target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
 
+; Test the (concat_vectors (trunc), (trunc)) pattern.
+
 define <4 x i16> @test_concat_truncate_v2i64_to_v4i16(<2 x i64> %a, <2 x i64> %b) #0 {
 entry:
 ; CHECK-LABEL: test_concat_truncate_v2i64_to_v4i16:
diff --git a/test/CodeGen/AArch64/concat_vector-truncated-scalar-combine.ll b/test/CodeGen/AArch64/concat_vector-truncated-scalar-combine.ll
new file mode 100644
index 0000000..eb6c80d
--- /dev/null
+++ b/test/CodeGen/AArch64/concat_vector-truncated-scalar-combine.ll
@@ -0,0 +1,18 @@
+; RUN: llc < %s -mtriple aarch64-unknown-unknown -asm-verbose=false | FileCheck %s
+
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+
+; Test the (concat_vectors (bitcast (trunc (scalar))), undef..) pattern.
+
+define <8 x i8> @test_concat_from_truncated_scalar(i32 %x) #0 {
+entry:
+; CHECK-LABEL: test_concat_from_truncated_scalar:
+; CHECK-NEXT: fmov s0, w0
+; CHECK-NEXT: ret
+  %t = trunc i32 %x to i16
+  %0 = bitcast i16 %t to <2 x i8>
+  %1 = shufflevector <2 x i8> %0, <2 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
+  ret <8 x i8> %1
+}
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/AArch64/dag-combine-invaraints.ll b/test/CodeGen/AArch64/dag-combine-invaraints.ll
index 3614133..ac2d057 100644
--- a/test/CodeGen/AArch64/dag-combine-invaraints.ll
+++ b/test/CodeGen/AArch64/dag-combine-invaraints.ll
@@ -20,7 +20,7 @@ main_:
   %DHSelect = select i1 %tmp8, i32 %tmp9, i32 %tmp10
   store i32 %DHSelect, i32* %i32X, align 4
   %tmp15 = load i32, i32* %i32X, align 4
-  %tmp17 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([9 x i8], [9 x i8]* @.str2, i32 0, i32 0), i32 %tmp15)
+  %tmp17 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([9 x i8], [9 x i8]* @.str2, i32 0, i32 0), i32 %tmp15)
   ret i32 0
 
 ; CHECK: main:
diff --git a/test/CodeGen/AArch64/f16-instructions.ll b/test/CodeGen/AArch64/f16-instructions.ll
new file mode 100644
index 0000000..be5e2e5
--- /dev/null
+++ b/test/CodeGen/AArch64/f16-instructions.ll
@@ -0,0 +1,765 @@
+; RUN: llc < %s -mtriple aarch64-unknown-unknown -aarch64-neon-syntax=apple -asm-verbose=false | FileCheck %s
+
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+
+; CHECK-LABEL: test_fadd:
+; CHECK-NEXT: fcvt s1, h1
+; CHECK-NEXT: fcvt s0, h0
+; CHECK-NEXT: fadd s0, s0, s1
+; CHECK-NEXT: fcvt h0, s0
+; CHECK-NEXT: ret
+define half @test_fadd(half %a, half %b) #0 {
+  %r = fadd half %a, %b
+  ret half %r
+}
+
+; CHECK-LABEL: test_fsub:
+; CHECK-NEXT: fcvt s1, h1
+; CHECK-NEXT: fcvt s0, h0
+; CHECK-NEXT: fsub s0, s0, s1
+; CHECK-NEXT: fcvt h0, s0
+; CHECK-NEXT: ret
+define half @test_fsub(half %a, half %b) #0 {
+  %r = fsub half %a, %b
+  ret half %r
+}
+
+; CHECK-LABEL: test_fmul:
+; CHECK-NEXT: fcvt s1, h1
+; CHECK-NEXT: fcvt s0, h0
+; CHECK-NEXT: fmul s0, s0, s1
+; CHECK-NEXT: fcvt h0, s0
+; CHECK-NEXT: ret
+define half @test_fmul(half %a, half %b) #0 {
+  %r = fmul half %a, %b
+  ret half %r
+}
+
+; CHECK-LABEL: test_fdiv:
+; CHECK-NEXT: fcvt s1, h1
+; CHECK-NEXT: fcvt s0, h0
+; CHECK-NEXT: fdiv s0, s0, s1
+; CHECK-NEXT: fcvt h0, s0
+; CHECK-NEXT: ret
+define half @test_fdiv(half %a, half %b) #0 {
+  %r = fdiv half %a, %b
+  ret half %r
+}
+
+; CHECK-LABEL: test_frem:
+; CHECK-NEXT: stp x29, x30, [sp, #-16]!
+; CHECK-NEXT: mov  x29, sp
+; CHECK-NEXT: fcvt s0, h0
+; CHECK-NEXT: fcvt s1, h1
+; CHECK-NEXT: bl {{_?}}fmodf
+; CHECK-NEXT: fcvt h0, s0
+; CHECK-NEXT: ldp x29, x30, [sp], #16
+; CHECK-NEXT: ret
+define half @test_frem(half %a, half %b) #0 {
+  %r = frem half %a, %b
+  ret half %r
+}
+
+; CHECK-LABEL: test_store:
+; CHECK-NEXT: str  h0, [x0]
+; CHECK-NEXT: ret
+define void @test_store(half %a, half* %b) #0 {
+  store half %a, half* %b
+  ret void
+}
+
+; CHECK-LABEL: test_load:
+; CHECK-NEXT: ldr  h0, [x0]
+; CHECK-NEXT: ret
+define half @test_load(half* %a) #0 {
+  %r = load half, half* %a
+  ret half %r
+}
+
+
+declare half @test_callee(half %a, half %b) #0
+
+; CHECK-LABEL: test_call:
+; CHECK-NEXT: stp x29, x30, [sp, #-16]!
+; CHECK-NEXT: mov  x29, sp
+; CHECK-NEXT: bl {{_?}}test_callee
+; CHECK-NEXT: ldp x29, x30, [sp], #16
+; CHECK-NEXT: ret
+define half @test_call(half %a, half %b) #0 {
+  %r = call half @test_callee(half %a, half %b)
+  ret half %r
+}
+
+; CHECK-LABEL: test_call_flipped:
+; CHECK-NEXT: stp x29, x30, [sp, #-16]!
+; CHECK-NEXT: mov  x29, sp
+; CHECK-NEXT: mov.16b  v2, v0
+; CHECK-NEXT: mov.16b  v0, v1
+; CHECK-NEXT: mov.16b  v1, v2
+; CHECK-NEXT: bl {{_?}}test_callee
+; CHECK-NEXT: ldp x29, x30, [sp], #16
+; CHECK-NEXT: ret
+define half @test_call_flipped(half %a, half %b) #0 {
+  %r = call half @test_callee(half %b, half %a)
+  ret half %r
+}
+
+; CHECK-LABEL: test_tailcall_flipped:
+; CHECK-NEXT: mov.16b  v2, v0
+; CHECK-NEXT: mov.16b  v0, v1
+; CHECK-NEXT: mov.16b  v1, v2
+; CHECK-NEXT: b {{_?}}test_callee
+define half @test_tailcall_flipped(half %a, half %b) #0 {
+  %r = tail call half @test_callee(half %b, half %a)
+  ret half %r
+}
+
+; CHECK-LABEL: test_select:
+; CHECK-NEXT: fcvt s1, h1
+; CHECK-NEXT: fcvt s0, h0
+; CHECK-NEXT: cmp  w0, #0
+; CHECK-NEXT: fcsel s0, s0, s1, ne
+; CHECK-NEXT: fcvt h0, s0
+; CHECK-NEXT: ret
+define half @test_select(half %a, half %b, i1 zeroext %c) #0 {
+  %r = select i1 %c, half %a, half %b
+  ret half %r
+}
+
+; CHECK-LABEL: test_select_cc:
+; CHECK-DAG: fcvt s3, h3
+; CHECK-DAG: fcvt s2, h2
+; CHECK-DAG: fcvt s1, h1
+; CHECK-DAG: fcvt s0, h0
+; CHECK-DAG: fcmp s2, s3
+; CHECK-DAG: cset [[CC:w[0-9]+]], ne
+; CHECK-DAG: cmp [[CC]], #0
+; CHECK-NEXT: fcsel s0, s0, s1, ne
+; CHECK-NEXT: fcvt h0, s0
+; CHECK-NEXT: ret
+define half @test_select_cc(half %a, half %b, half %c, half %d) #0 {
+  %cc = fcmp une half %c, %d
+  %r = select i1 %cc, half %a, half %b
+  ret half %r
+}
+
+; CHECK-LABEL: test_fcmp_une:
+; CHECK-NEXT: fcvt s1, h1
+; CHECK-NEXT: fcvt s0, h0
+; CHECK-NEXT: fcmp s0, s1
+; CHECK-NEXT: cset  w0, ne
+; CHECK-NEXT: ret
+define i1 @test_fcmp_une(half %a, half %b) #0 {
+  %r = fcmp une half %a, %b
+  ret i1 %r
+}
+
+; CHECK-LABEL: test_fcmp_ueq:
+; CHECK-NEXT: fcvt s1, h1
+; CHECK-NEXT: fcvt s0, h0
+; CHECK-NEXT: fcmp s0, s1
+; CHECK-NEXT: orr [[TRUE:w[0-9]+]], wzr, #0x1
+; CHECK-NEXT: csel [[CC:w[0-9]+]], [[TRUE]], wzr, eq
+; CHECK-NEXT: csel w0, [[TRUE]], [[CC]], vs
+; CHECK-NEXT: ret
+define i1 @test_fcmp_ueq(half %a, half %b) #0 {
+  %r = fcmp ueq half %a, %b
+  ret i1 %r
+}
+
+; CHECK-LABEL: test_fcmp_ugt:
+; CHECK-NEXT: fcvt s1, h1
+; CHECK-NEXT: fcvt s0, h0
+; CHECK-NEXT: fcmp s0, s1
+; CHECK-NEXT: cset  w0, hi
+; CHECK-NEXT: ret
+define i1 @test_fcmp_ugt(half %a, half %b) #0 {
+  %r = fcmp ugt half %a, %b
+  ret i1 %r
+}
+
+; CHECK-LABEL: test_fcmp_uge:
+; CHECK-NEXT: fcvt s1, h1
+; CHECK-NEXT: fcvt s0, h0
+; CHECK-NEXT: fcmp s0, s1
+; CHECK-NEXT: cset  w0, pl
+; CHECK-NEXT: ret
+define i1 @test_fcmp_uge(half %a, half %b) #0 {
+  %r = fcmp uge half %a, %b
+  ret i1 %r
+}
+
+; CHECK-LABEL: test_fcmp_ult:
+; CHECK-NEXT: fcvt s1, h1
+; CHECK-NEXT: fcvt s0, h0
+; CHECK-NEXT: fcmp s0, s1
+; CHECK-NEXT: cset  w0, lt
+; CHECK-NEXT: ret
+define i1 @test_fcmp_ult(half %a, half %b) #0 {
+  %r = fcmp ult half %a, %b
+  ret i1 %r
+}
+
+; CHECK-LABEL: test_fcmp_ule:
+; CHECK-NEXT: fcvt s1, h1
+; CHECK-NEXT: fcvt s0, h0
+; CHECK-NEXT: fcmp s0, s1
+; CHECK-NEXT: cset  w0, le
+; CHECK-NEXT: ret
+define i1 @test_fcmp_ule(half %a, half %b) #0 {
+  %r = fcmp ule half %a, %b
+  ret i1 %r
+}
+
+
+; CHECK-LABEL: test_fcmp_uno:
+; CHECK-NEXT: fcvt s1, h1
+; CHECK-NEXT: fcvt s0, h0
+; CHECK-NEXT: fcmp s0, s1
+; CHECK-NEXT: cset  w0, vs
+; CHECK-NEXT: ret
+define i1 @test_fcmp_uno(half %a, half %b) #0 {
+  %r = fcmp uno half %a, %b
+  ret i1 %r
+}
+
+; CHECK-LABEL: test_fcmp_one:
+; CHECK-NEXT: fcvt s1, h1
+; CHECK-NEXT: fcvt s0, h0
+; CHECK-NEXT: fcmp s0, s1
+; CHECK-NEXT: orr [[TRUE:w[0-9]+]], wzr, #0x1
+; CHECK-NEXT: csel [[CC:w[0-9]+]], [[TRUE]], wzr, mi
+; CHECK-NEXT: csel w0, [[TRUE]], [[CC]], gt
+; CHECK-NEXT: ret
+define i1 @test_fcmp_one(half %a, half %b) #0 {
+  %r = fcmp one half %a, %b
+  ret i1 %r
+}
+
+; CHECK-LABEL: test_fcmp_oeq:
+; CHECK-NEXT: fcvt s1, h1
+; CHECK-NEXT: fcvt s0, h0
+; CHECK-NEXT: fcmp s0, s1
+; CHECK-NEXT: cset  w0, eq
+; CHECK-NEXT: ret
+define i1 @test_fcmp_oeq(half %a, half %b) #0 {
+  %r = fcmp oeq half %a, %b
+  ret i1 %r
+}
+
+; CHECK-LABEL: test_fcmp_ogt:
+; CHECK-NEXT: fcvt s1, h1
+; CHECK-NEXT: fcvt s0, h0
+; CHECK-NEXT: fcmp s0, s1
+; CHECK-NEXT: cset  w0, gt
+; CHECK-NEXT: ret
+define i1 @test_fcmp_ogt(half %a, half %b) #0 {
+  %r = fcmp ogt half %a, %b
+  ret i1 %r
+}
+
+; CHECK-LABEL: test_fcmp_oge:
+; CHECK-NEXT: fcvt s1, h1
+; CHECK-NEXT: fcvt s0, h0
+; CHECK-NEXT: fcmp s0, s1
+; CHECK-NEXT: cset  w0, ge
+; CHECK-NEXT: ret
+define i1 @test_fcmp_oge(half %a, half %b) #0 {
+  %r = fcmp oge half %a, %b
+  ret i1 %r
+}
+
+; CHECK-LABEL: test_fcmp_olt:
+; CHECK-NEXT: fcvt s1, h1
+; CHECK-NEXT: fcvt s0, h0
+; CHECK-NEXT: fcmp s0, s1
+; CHECK-NEXT: cset  w0, mi
+; CHECK-NEXT: ret
+define i1 @test_fcmp_olt(half %a, half %b) #0 {
+  %r = fcmp olt half %a, %b
+  ret i1 %r
+}
+
+; CHECK-LABEL: test_fcmp_ole:
+; CHECK-NEXT: fcvt s1, h1
+; CHECK-NEXT: fcvt s0, h0
+; CHECK-NEXT: fcmp s0, s1
+; CHECK-NEXT: cset  w0, ls
+; CHECK-NEXT: ret
+define i1 @test_fcmp_ole(half %a, half %b) #0 {
+  %r = fcmp ole half %a, %b
+  ret i1 %r
+}
+
+; CHECK-LABEL: test_fcmp_ord:
+; CHECK-NEXT: fcvt s1, h1
+; CHECK-NEXT: fcvt s0, h0
+; CHECK-NEXT: fcmp s0, s1
+; CHECK-NEXT: cset  w0, vc
+; CHECK-NEXT: ret
+define i1 @test_fcmp_ord(half %a, half %b) #0 {
+  %r = fcmp ord half %a, %b
+  ret i1 %r
+}
+
+; CHECK-LABEL: test_br_cc:
+; CHECK-NEXT: fcvt s1, h1
+; CHECK-NEXT: fcvt s0, h0
+; CHECK-NEXT: fcmp s0, s1
+; CHECK-NEXT: b.mi [[BRCC_ELSE:.?LBB[0-9_]+]]
+; CHECK-NEXT: str  wzr, [x0]
+; CHECK-NEXT: ret
+; CHECK-NEXT: [[BRCC_ELSE]]:
+; CHECK-NEXT: str  wzr, [x1]
+; CHECK-NEXT: ret
+define void @test_br_cc(half %a, half %b, i32* %p1, i32* %p2) #0 {
+  %c = fcmp uge half %a, %b
+  br i1 %c, label %then, label %else
+then:
+  store i32 0, i32* %p1
+  ret void
+else:
+  store i32 0, i32* %p2
+  ret void
+}
+
+; CHECK-LABEL: test_phi:
+; CHECK: mov  x[[PTR:[0-9]+]], x0
+; CHECK: ldr  h[[AB:[0-9]+]], [x[[PTR]]]
+; CHECK: [[LOOP:LBB[0-9_]+]]:
+; CHECK: mov.16b  v[[R:[0-9]+]], v[[AB]]
+; CHECK: ldr  h[[AB]], [x[[PTR]]]
+; CHECK: mov  x0, x[[PTR]]
+; CHECK: bl {{_?}}test_dummy
+; CHECK: mov.16b  v0, v[[R]]
+; CHECK: ret
+define half @test_phi(half* %p1) #0 {
+entry:
+  %a = load half, half* %p1
+  br label %loop
+loop:
+  %r = phi half [%a, %entry], [%b, %loop]
+  %b = load half, half* %p1
+  %c = call i1 @test_dummy(half* %p1)
+  br i1 %c, label %loop, label %return
+return:
+  ret half %r
+}
+declare i1 @test_dummy(half* %p1) #0
+
+; CHECK-LABEL: test_fptosi_i32:
+; CHECK-NEXT: fcvt s0, h0
+; CHECK-NEXT: fcvtzs w0, s0
+; CHECK-NEXT: ret
+define i32 @test_fptosi_i32(half %a) #0 {
+  %r = fptosi half %a to i32
+  ret i32 %r
+}
+
+; CHECK-LABEL: test_fptosi_i64:
+; CHECK-NEXT: fcvt s0, h0
+; CHECK-NEXT: fcvtzs x0, s0
+; CHECK-NEXT: ret
+define i64 @test_fptosi_i64(half %a) #0 {
+  %r = fptosi half %a to i64
+  ret i64 %r
+}
+
+; CHECK-LABEL: test_fptoui_i32:
+; CHECK-NEXT: fcvt s0, h0
+; CHECK-NEXT: fcvtzu w0, s0
+; CHECK-NEXT: ret
+define i32 @test_fptoui_i32(half %a) #0 {
+  %r = fptoui half %a to i32
+  ret i32 %r
+}
+
+; CHECK-LABEL: test_fptoui_i64:
+; CHECK-NEXT: fcvt s0, h0
+; CHECK-NEXT: fcvtzu x0, s0
+; CHECK-NEXT: ret
+define i64 @test_fptoui_i64(half %a) #0 {
+  %r = fptoui half %a to i64
+  ret i64 %r
+}
+
+; CHECK-LABEL: test_uitofp_i32:
+; CHECK-NEXT: ucvtf s0, w0
+; CHECK-NEXT: fcvt h0, s0
+; CHECK-NEXT: ret
+define half @test_uitofp_i32(i32 %a) #0 {
+  %r = uitofp i32 %a to half
+  ret half %r
+}
+
+; CHECK-LABEL: test_uitofp_i64:
+; CHECK-NEXT: ucvtf s0, x0
+; CHECK-NEXT: fcvt h0, s0
+; CHECK-NEXT: ret
+define half @test_uitofp_i64(i64 %a) #0 {
+  %r = uitofp i64 %a to half
+  ret half %r
+}
+
+; CHECK-LABEL: test_sitofp_i32:
+; CHECK-NEXT: scvtf s0, w0
+; CHECK-NEXT: fcvt h0, s0
+; CHECK-NEXT: ret
+define half @test_sitofp_i32(i32 %a) #0 {
+  %r = sitofp i32 %a to half
+  ret half %r
+}
+
+; CHECK-LABEL: test_sitofp_i64:
+; CHECK-NEXT: scvtf s0, x0
+; CHECK-NEXT: fcvt h0, s0
+; CHECK-NEXT: ret
+define half @test_sitofp_i64(i64 %a) #0 {
+  %r = sitofp i64 %a to half
+  ret half %r
+}
+
+; CHECK-LABEL: test_fptrunc_float:
+; CHECK-NEXT: fcvt h0, s0
+; CHECK-NEXT: ret
+
+define half @test_fptrunc_float(float %a) #0 {
+  %r = fptrunc float %a to half
+  ret half %r
+}
+
+; CHECK-LABEL: test_fptrunc_double:
+; CHECK-NEXT: fcvt h0, d0
+; CHECK-NEXT: ret
+define half @test_fptrunc_double(double %a) #0 {
+  %r = fptrunc double %a to half
+  ret half %r
+}
+
+; CHECK-LABEL: test_fpext_float:
+; CHECK-NEXT: fcvt s0, h0
+; CHECK-NEXT: ret
+define float @test_fpext_float(half %a) #0 {
+  %r = fpext half %a to float
+  ret float %r
+}
+
+; CHECK-LABEL: test_fpext_double:
+; CHECK-NEXT: fcvt d0, h0
+; CHECK-NEXT: ret
+define double @test_fpext_double(half %a) #0 {
+  %r = fpext half %a to double
+  ret double %r
+}
+
+
+; CHECK-LABEL: test_bitcast_halftoi16:
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
+define i16 @test_bitcast_halftoi16(half %a) #0 {
+  %r = bitcast half %a to i16
+  ret i16 %r
+}
+
+; CHECK-LABEL: test_bitcast_i16tohalf:
+; CHECK-NEXT: fmov s0, w0
+; CHECK-NEXT: ret
+define half @test_bitcast_i16tohalf(i16 %a) #0 {
+  %r = bitcast i16 %a to half
+  ret half %r
+}
+
+
+declare half @llvm.sqrt.f16(half %a) #0
+declare half @llvm.powi.f16(half %a, i32 %b) #0
+declare half @llvm.sin.f16(half %a) #0
+declare half @llvm.cos.f16(half %a) #0
+declare half @llvm.pow.f16(half %a, half %b) #0
+declare half @llvm.exp.f16(half %a) #0
+declare half @llvm.exp2.f16(half %a) #0
+declare half @llvm.log.f16(half %a) #0
+declare half @llvm.log10.f16(half %a) #0
+declare half @llvm.log2.f16(half %a) #0
+declare half @llvm.fma.f16(half %a, half %b, half %c) #0
+declare half @llvm.fabs.f16(half %a) #0
+declare half @llvm.minnum.f16(half %a, half %b) #0
+declare half @llvm.maxnum.f16(half %a, half %b) #0
+declare half @llvm.copysign.f16(half %a, half %b) #0
+declare half @llvm.floor.f16(half %a) #0
+declare half @llvm.ceil.f16(half %a) #0
+declare half @llvm.trunc.f16(half %a) #0
+declare half @llvm.rint.f16(half %a) #0
+declare half @llvm.nearbyint.f16(half %a) #0
+declare half @llvm.round.f16(half %a) #0
+declare half @llvm.fmuladd.f16(half %a, half %b, half %c) #0
+
+; CHECK-LABEL: test_sqrt:
+; CHECK-NEXT: fcvt s0, h0
+; CHECK-NEXT: fsqrt s0, s0
+; CHECK-NEXT: fcvt h0, s0
+; CHECK-NEXT: ret
+define half @test_sqrt(half %a) #0 {
+  %r = call half @llvm.sqrt.f16(half %a)
+  ret half %r
+}
+
+; CHECK-LABEL: test_powi:
+; CHECK-NEXT: stp x29, x30, [sp, #-16]!
+; CHECK-NEXT: mov  x29, sp
+; CHECK-NEXT: fcvt s0, h0
+; CHECK-NEXT: bl {{_?}}__powisf2
+; CHECK-NEXT: fcvt h0, s0
+; CHECK-NEXT: ldp x29, x30, [sp], #16
+; CHECK-NEXT: ret
+define half @test_powi(half %a, i32 %b) #0 {
+  %r = call half @llvm.powi.f16(half %a, i32 %b)
+  ret half %r
+}
+
+; CHECK-LABEL: test_sin:
+; CHECK-NEXT: stp x29, x30, [sp, #-16]!
+; CHECK-NEXT: mov  x29, sp
+; CHECK-NEXT: fcvt s0, h0
+; CHECK-NEXT: bl {{_?}}sinf
+; CHECK-NEXT: fcvt h0, s0
+; CHECK-NEXT: ldp x29, x30, [sp], #16
+; CHECK-NEXT: ret
+define half @test_sin(half %a) #0 {
+  %r = call half @llvm.sin.f16(half %a)
+  ret half %r
+}
+
+; CHECK-LABEL: test_cos:
+; CHECK-NEXT: stp x29, x30, [sp, #-16]!
+; CHECK-NEXT: mov  x29, sp
+; CHECK-NEXT: fcvt s0, h0
+; CHECK-NEXT: bl {{_?}}cosf
+; CHECK-NEXT: fcvt h0, s0
+; CHECK-NEXT: ldp x29, x30, [sp], #16
+; CHECK-NEXT: ret
+define half @test_cos(half %a) #0 {
+  %r = call half @llvm.cos.f16(half %a)
+  ret half %r
+}
+
+; CHECK-LABEL: test_pow:
+; CHECK-NEXT: stp x29, x30, [sp, #-16]!
+; CHECK-NEXT: mov  x29, sp
+; CHECK-NEXT: fcvt s0, h0
+; CHECK-NEXT: fcvt s1, h1
+; CHECK-NEXT: bl {{_?}}powf
+; CHECK-NEXT: fcvt h0, s0
+; CHECK-NEXT: ldp x29, x30, [sp], #16
+; CHECK-NEXT: ret
+define half @test_pow(half %a, half %b) #0 {
+  %r = call half @llvm.pow.f16(half %a, half %b)
+  ret half %r
+}
+
+; CHECK-LABEL: test_exp:
+; CHECK-NEXT: stp x29, x30, [sp, #-16]!
+; CHECK-NEXT: mov  x29, sp
+; CHECK-NEXT: fcvt s0, h0
+; CHECK-NEXT: bl {{_?}}expf
+; CHECK-NEXT: fcvt h0, s0
+; CHECK-NEXT: ldp x29, x30, [sp], #16
+; CHECK-NEXT: ret
+define half @test_exp(half %a) #0 {
+  %r = call half @llvm.exp.f16(half %a)
+  ret half %r
+}
+
+; CHECK-LABEL: test_exp2:
+; CHECK-NEXT: stp x29, x30, [sp, #-16]!
+; CHECK-NEXT: mov  x29, sp
+; CHECK-NEXT: fcvt s0, h0
+; CHECK-NEXT: bl {{_?}}exp2f
+; CHECK-NEXT: fcvt h0, s0
+; CHECK-NEXT: ldp x29, x30, [sp], #16
+; CHECK-NEXT: ret
+define half @test_exp2(half %a) #0 {
+  %r = call half @llvm.exp2.f16(half %a)
+  ret half %r
+}
+
+; CHECK-LABEL: test_log:
+; CHECK-NEXT: stp x29, x30, [sp, #-16]!
+; CHECK-NEXT: mov  x29, sp
+; CHECK-NEXT: fcvt s0, h0
+; CHECK-NEXT: bl {{_?}}logf
+; CHECK-NEXT: fcvt h0, s0
+; CHECK-NEXT: ldp x29, x30, [sp], #16
+; CHECK-NEXT: ret
+define half @test_log(half %a) #0 {
+  %r = call half @llvm.log.f16(half %a)
+  ret half %r
+}
+
+; CHECK-LABEL: test_log10:
+; CHECK-NEXT: stp x29, x30, [sp, #-16]!
+; CHECK-NEXT: mov  x29, sp
+; CHECK-NEXT: fcvt s0, h0
+; CHECK-NEXT: bl {{_?}}log10f
+; CHECK-NEXT: fcvt h0, s0
+; CHECK-NEXT: ldp x29, x30, [sp], #16
+; CHECK-NEXT: ret
+define half @test_log10(half %a) #0 {
+  %r = call half @llvm.log10.f16(half %a)
+  ret half %r
+}
+
+; CHECK-LABEL: test_log2:
+; CHECK-NEXT: stp x29, x30, [sp, #-16]!
+; CHECK-NEXT: mov  x29, sp
+; CHECK-NEXT: fcvt s0, h0
+; CHECK-NEXT: bl {{_?}}log2f
+; CHECK-NEXT: fcvt h0, s0
+; CHECK-NEXT: ldp x29, x30, [sp], #16
+; CHECK-NEXT: ret
+define half @test_log2(half %a) #0 {
+  %r = call half @llvm.log2.f16(half %a)
+  ret half %r
+}
+
+; CHECK-LABEL: test_fma:
+; CHECK-NEXT: fcvt s2, h2
+; CHECK-NEXT: fcvt s1, h1
+; CHECK-NEXT: fcvt s0, h0
+; CHECK-NEXT: fmadd s0, s0, s1, s2
+; CHECK-NEXT: fcvt h0, s0
+; CHECK-NEXT: ret
+define half @test_fma(half %a, half %b, half %c) #0 {
+  %r = call half @llvm.fma.f16(half %a, half %b, half %c)
+  ret half %r
+}
+
+; CHECK-LABEL: test_fabs:
+; CHECK-NEXT: fcvt s0, h0
+; CHECK-NEXT: fabs s0, s0
+; CHECK-NEXT: fcvt h0, s0
+; CHECK-NEXT: ret
+define half @test_fabs(half %a) #0 {
+  %r = call half @llvm.fabs.f16(half %a)
+  ret half %r
+}
+
+; CHECK-LABEL: test_minnum:
+; CHECK-NEXT: stp x29, x30, [sp, #-16]!
+; CHECK-NEXT: mov  x29, sp
+; CHECK-NEXT: fcvt s0, h0
+; CHECK-NEXT: fcvt s1, h1
+; CHECK-NEXT: bl {{_?}}fminf
+; CHECK-NEXT: fcvt h0, s0
+; CHECK-NEXT: ldp x29, x30, [sp], #16
+; CHECK-NEXT: ret
+define half @test_minnum(half %a, half %b) #0 {
+  %r = call half @llvm.minnum.f16(half %a, half %b)
+  ret half %r
+}
+
+; CHECK-LABEL: test_maxnum:
+; CHECK-NEXT: stp x29, x30, [sp, #-16]!
+; CHECK-NEXT: mov  x29, sp
+; CHECK-NEXT: fcvt s0, h0
+; CHECK-NEXT: fcvt s1, h1
+; CHECK-NEXT: bl {{_?}}fmaxf
+; CHECK-NEXT: fcvt h0, s0
+; CHECK-NEXT: ldp x29, x30, [sp], #16
+; CHECK-NEXT: ret
+define half @test_maxnum(half %a, half %b) #0 {
+  %r = call half @llvm.maxnum.f16(half %a, half %b)
+  ret half %r
+}
+
+; CHECK-LABEL: test_copysign:
+; CHECK-NEXT: fcvt s1, h1
+; CHECK-NEXT: fcvt s0, h0
+; CHECK-NEXT: movi.4s v2, #0x80, lsl #24
+; CHECK-NEXT: bit.16b v0, v1, v2
+; CHECK-NEXT: fcvt h0, s0
+; CHECK-NEXT: ret
+define half @test_copysign(half %a, half %b) #0 {
+  %r = call half @llvm.copysign.f16(half %a, half %b)
+  ret half %r
+}
+
+; CHECK-LABEL: test_floor:
+; CHECK-NEXT: fcvt s1, h0
+; CHECK-NEXT: frintm s0, s1
+; CHECK-NEXT: fcvt h0, s0
+; CHECK-NEXT: frintx s1, s1
+; CHECK-NEXT: ret
+define half @test_floor(half %a) #0 {
+  %r = call half @llvm.floor.f16(half %a)
+  ret half %r
+}
+
+; CHECK-LABEL: test_ceil:
+; CHECK-NEXT: fcvt s1, h0
+; CHECK-NEXT: frintp s0, s1
+; CHECK-NEXT: fcvt h0, s0
+; CHECK-NEXT: frintx s1, s1
+; CHECK-NEXT: ret
+define half @test_ceil(half %a) #0 {
+  %r = call half @llvm.ceil.f16(half %a)
+  ret half %r
+}
+
+; CHECK-LABEL: test_trunc:
+; CHECK-NEXT: fcvt s1, h0
+; CHECK-NEXT: frintz s0, s1
+; CHECK-NEXT: fcvt h0, s0
+; CHECK-NEXT: frintx s1, s1
+; CHECK-NEXT: ret
+define half @test_trunc(half %a) #0 {
+  %r = call half @llvm.trunc.f16(half %a)
+  ret half %r
+}
+
+; CHECK-LABEL: test_rint:
+; CHECK-NEXT: fcvt s0, h0
+; CHECK-NEXT: frintx s0, s0
+; CHECK-NEXT: fcvt h0, s0
+; CHECK-NEXT: ret
+define half @test_rint(half %a) #0 {
+  %r = call half @llvm.rint.f16(half %a)
+  ret half %r
+}
+
+; CHECK-LABEL: test_nearbyint:
+; CHECK-NEXT: fcvt s0, h0
+; CHECK-NEXT: frinti s0, s0
+; CHECK-NEXT: fcvt h0, s0
+; CHECK-NEXT: ret
+define half @test_nearbyint(half %a) #0 {
+  %r = call half @llvm.nearbyint.f16(half %a)
+  ret half %r
+}
+
+; CHECK-LABEL: test_round:
+; CHECK-NEXT: fcvt s1, h0
+; CHECK-NEXT: frinta s0, s1
+; CHECK-NEXT: fcvt h0, s0
+; CHECK-NEXT: frintx s1, s1
+; CHECK-NEXT: ret
+define half @test_round(half %a) #0 {
+  %r = call half @llvm.round.f16(half %a)
+  ret half %r
+}
+
+; CHECK-LABEL: test_fmuladd:
+; CHECK-NEXT: fcvt s1, h1
+; CHECK-NEXT: fcvt s0, h0
+; CHECK-NEXT: fmul s0, s0, s1
+; CHECK-NEXT: fcvt h0, s0
+; CHECK-NEXT: fcvt s0, h0
+; CHECK-NEXT: fcvt s1, h2
+; CHECK-NEXT: fadd s0, s0, s1
+; CHECK-NEXT: fcvt h0, s0
+; CHECK-NEXT: ret
+define half @test_fmuladd(half %a, half %b, half %c) #0 {
+  %r = call half @llvm.fmuladd.f16(half %a, half %b, half %c)
+  ret half %r
+}
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/AArch64/fast-isel-int-ext5.ll b/test/CodeGen/AArch64/fast-isel-int-ext5.ll
new file mode 100644
index 0000000..0f9ec62
--- /dev/null
+++ b/test/CodeGen/AArch64/fast-isel-int-ext5.ll
@@ -0,0 +1,19 @@
+; RUN: llc -mtriple=aarch64-apple-darwin -O0 -fast-isel -fast-isel-abort=1 -verify-machineinstrs < %s | FileCheck %s
+
+; CHECK-LABEL: int_ext_opt
+define i64 @int_ext_opt(i8* %addr, i1 %c1, i1 %c2) {
+entry:
+  %0 = load i8, i8* %addr
+  br i1 %c1, label %bb1, label %bb2
+
+bb1:
+  %1 = zext i8 %0 to i64
+  br i1 %c2, label %bb2, label %exit
+
+bb2:
+  %2 = phi i64 [1, %entry], [%1, %bb1]
+  ret i64 %2
+
+exit:
+  ret i64 0
+}
diff --git a/test/CodeGen/AArch64/fold-constants.ll b/test/CodeGen/AArch64/fold-constants.ll
new file mode 100644
index 0000000..2dd0d12
--- /dev/null
+++ b/test/CodeGen/AArch64/fold-constants.ll
@@ -0,0 +1,21 @@
+; RUN: llc -mtriple=aarch64-linux-gnu -o - %s | FileCheck %s
+
+define i64 @dotests_616() {
+; CHECK-LABEL: dotests_616
+; CHECK:       movi d0, #0000000000000000
+; CHECK-NEXT:  umov w8, v0.b[2]
+; CHECK-NEXT:  sbfx w8, w8, #0, #1
+; CHECK-NEXT:  fmov s0, w8
+; CHECK-NEXT:  fmov x0, d0
+; CHECK-NEXT:  ret
+entry:
+  %0 = bitcast <2 x i64> zeroinitializer to <8 x i16>
+  %1 = and <8 x i16> zeroinitializer, %0
+  %2 = icmp ne <8 x i16> %1, zeroinitializer
+  %3 = extractelement <8 x i1> %2, i32 2
+  %vgetq_lane285 = sext i1 %3 to i16
+  %vset_lane = insertelement <4 x i16> undef, i16 %vgetq_lane285, i32 0
+  %4 = bitcast <4 x i16> %vset_lane to <1 x i64>
+  %vget_lane = extractelement <1 x i64> %4, i32 0
+  ret i64 %vget_lane
+}
diff --git a/test/CodeGen/AArch64/fp16-instructions.ll b/test/CodeGen/AArch64/fp16-instructions.ll
deleted file mode 100644
index ba96694..0000000
--- a/test/CodeGen/AArch64/fp16-instructions.ll
+++ /dev/null
@@ -1,109 +0,0 @@
-; RUN: llc < %s -mtriple=aarch64-none-eabi | FileCheck %s
-
-define half @add_h(half %a, half %b) {
-entry:
-; CHECK-LABEL: add_h:
-; CHECK-DAG: fcvt [[OP1:s[0-9]+]], h0
-; CHECK-DAG: fcvt [[OP2:s[0-9]+]], h1
-; CHECK: fadd [[RES:s[0-9]+]], [[OP1]], [[OP2]]
-; CHECK: fcvt h0, [[RES]]
-  %0 = fadd half %a, %b
-  ret half %0
-}
-
-
-define half @sub_h(half %a, half %b) {
-entry:
-; CHECK-LABEL: sub_h:
-; CHECK-DAG: fcvt [[OP1:s[0-9]+]], h0
-; CHECK-DAG: fcvt [[OP2:s[0-9]+]], h1
-; CHECK: fsub [[RES:s[0-9]+]], [[OP1]], [[OP2]]
-; CHECK: fcvt h0, [[RES]]
-  %0 = fsub half %a, %b
-  ret half %0
-}
-
-
-define half @mul_h(half %a, half %b) {
-entry:
-; CHECK-LABEL: mul_h:
-; CHECK-DAG: fcvt [[OP1:s[0-9]+]], h0
-; CHECK-DAG: fcvt [[OP2:s[0-9]+]], h1
-; CHECK: fmul [[RES:s[0-9]+]], [[OP1]], [[OP2]]
-; CHECK: fcvt h0, [[RES]]
-  %0 = fmul half %a, %b
-  ret half %0
-}
-
-
-define half @div_h(half %a, half %b) {
-entry:
-; CHECK-LABEL: div_h:
-; CHECK-DAG: fcvt [[OP1:s[0-9]+]], h0
-; CHECK-DAG: fcvt [[OP2:s[0-9]+]], h1
-; CHECK: fdiv [[RES:s[0-9]+]], [[OP1]], [[OP2]]
-; CHECK: fcvt h0, [[RES]]
-  %0 = fdiv half %a, %b
-  ret half %0
-}
-
-
-define half @load_h(half* %a) {
-entry:
-; CHECK-LABEL: load_h:
-; CHECK: ldr h0, [x0]
-  %0 = load half, half* %a, align 4
-  ret half %0
-}
-
-
-define void @store_h(half* %a, half %b) {
-entry:
-; CHECK-LABEL: store_h:
-; CHECK: str h0, [x0]
-  store half %b, half* %a, align 4
-  ret void
-}
-
-define half @s_to_h(float %a) {
-; CHECK-LABEL: s_to_h:
-; CHECK: fcvt h0, s0
-  %1 = fptrunc float %a to half
-  ret half %1
-}
-
-define half @d_to_h(double %a) {
-; CHECK-LABEL: d_to_h:
-; CHECK: fcvt h0, d0
-  %1 = fptrunc double %a to half
-  ret half %1
-}
-
-define float @h_to_s(half %a) {
-; CHECK-LABEL: h_to_s:
-; CHECK: fcvt s0, h0
-  %1 = fpext half %a to float
-  ret float %1
-}
-
-define double @h_to_d(half %a) {
-; CHECK-LABEL: h_to_d:
-; CHECK: fcvt d0, h0
-  %1 = fpext half %a to double
-  ret double %1
-}
-
-define half @bitcast_i_to_h(i16 %a) {
-; CHECK-LABEL: bitcast_i_to_h:
-; CHECK: fmov s0, w0
-  %1 = bitcast i16 %a to half
-  ret half %1
-}
-
-
-define i16 @bitcast_h_to_i(half %a) {
-; CHECK-LABEL: bitcast_h_to_i:
-; CHECK: fmov w0, s0
-  %1 = bitcast half %a to i16
-  ret i16 %1
-}
diff --git a/test/CodeGen/AArch64/global-merge-1.ll b/test/CodeGen/AArch64/global-merge-1.ll
index b404389..14b0430 100644
--- a/test/CodeGen/AArch64/global-merge-1.ll
+++ b/test/CodeGen/AArch64/global-merge-1.ll
@@ -1,11 +1,11 @@
-; RUN: llc %s -mtriple=aarch64-none-linux-gnu -O3 -enable-global-merge -o - | FileCheck %s
-; RUN: llc %s -mtriple=aarch64-none-linux-gnu -O3 -enable-global-merge -global-merge-on-external -o - | FileCheck %s
+; RUN: llc %s -mtriple=aarch64-none-linux-gnu -aarch64-global-merge -o - | FileCheck %s
+; RUN: llc %s -mtriple=aarch64-none-linux-gnu -aarch64-global-merge -global-merge-on-external -o - | FileCheck %s
 
-; RUN: llc %s -mtriple=aarch64-linux-gnuabi -O3 -enable-global-merge -o - | FileCheck %s
-; RUN: llc %s -mtriple=aarch64-linux-gnuabi -O3 -enable-global-merge -global-merge-on-external -o - | FileCheck %s
+; RUN: llc %s -mtriple=aarch64-linux-gnuabi -aarch64-global-merge -o - | FileCheck %s
+; RUN: llc %s -mtriple=aarch64-linux-gnuabi -aarch64-global-merge -global-merge-on-external -o - | FileCheck %s
 
-; RUN: llc %s -mtriple=aarch64-apple-ios -O3 -enable-global-merge -o - | FileCheck %s --check-prefix=CHECK-APPLE-IOS
-; RUN: llc %s -mtriple=aarch64-apple-ios -O3 -enable-global-merge -global-merge-on-external -o - | FileCheck %s --check-prefix=CHECK-APPLE-IOS
+; RUN: llc %s -mtriple=aarch64-apple-ios -aarch64-global-merge -o - | FileCheck %s --check-prefix=CHECK-APPLE-IOS
+; RUN: llc %s -mtriple=aarch64-apple-ios -aarch64-global-merge -global-merge-on-external -o - | FileCheck %s --check-prefix=CHECK-APPLE-IOS
 
 @m = internal global i32 0, align 4
 @n = internal global i32 0, align 4
diff --git a/test/CodeGen/AArch64/global-merge-2.ll b/test/CodeGen/AArch64/global-merge-2.ll
index d5967b9..af68403 100644
--- a/test/CodeGen/AArch64/global-merge-2.ll
+++ b/test/CodeGen/AArch64/global-merge-2.ll
@@ -1,6 +1,6 @@
-; RUN: llc %s -mtriple=aarch64-none-linux-gnu -O3 -enable-global-merge -global-merge-on-external -o - | FileCheck %s
-; RUN: llc %s -mtriple=aarch64-linux-gnuabi -O3 -enable-global-merge -global-merge-on-external -o - | FileCheck %s
-; RUN: llc %s -mtriple=aarch64-apple-ios -O3 -enable-global-merge -global-merge-on-external -o - | FileCheck %s --check-prefix=CHECK-APPLE-IOS
+; RUN: llc %s -mtriple=aarch64-none-linux-gnu -aarch64-global-merge -global-merge-on-external -o - | FileCheck %s
+; RUN: llc %s -mtriple=aarch64-linux-gnuabi -aarch64-global-merge -global-merge-on-external -o - | FileCheck %s
+; RUN: llc %s -mtriple=aarch64-apple-ios -aarch64-global-merge -global-merge-on-external -o - | FileCheck %s --check-prefix=CHECK-APPLE-IOS
 
 @x = global i32 0, align 4
 @y = global i32 0, align 4
diff --git a/test/CodeGen/AArch64/global-merge-3.ll b/test/CodeGen/AArch64/global-merge-3.ll
index 15035c0..9251083 100644
--- a/test/CodeGen/AArch64/global-merge-3.ll
+++ b/test/CodeGen/AArch64/global-merge-3.ll
@@ -1,6 +1,6 @@
-; RUN: llc %s -mtriple=aarch64-none-linux-gnu -O3 -enable-global-merge -global-merge-on-external -o - | FileCheck %s
-; RUN: llc %s -mtriple=aarch64-linux-gnuabi -O3 -enable-global-merge -global-merge-on-external -o - | FileCheck %s
-; RUN: llc %s -mtriple=aarch64-apple-ios -O3 -enable-global-merge -global-merge-on-external -o - | FileCheck %s --check-prefix=CHECK-APPLE-IOS
+; RUN: llc %s -mtriple=aarch64-none-linux-gnu -aarch64-global-merge -global-merge-on-external -o - | FileCheck %s
+; RUN: llc %s -mtriple=aarch64-linux-gnuabi -aarch64-global-merge -global-merge-on-external -o - | FileCheck %s
+; RUN: llc %s -mtriple=aarch64-apple-ios -aarch64-global-merge -global-merge-on-external -o - | FileCheck %s --check-prefix=CHECK-APPLE-IOS
 
 @x = global [1000 x i32] zeroinitializer, align 1
 @y = global [1000 x i32] zeroinitializer, align 1
diff --git a/test/CodeGen/AArch64/global-merge-4.ll b/test/CodeGen/AArch64/global-merge-4.ll
index 8fb7747..bc6b68a 100644
--- a/test/CodeGen/AArch64/global-merge-4.ll
+++ b/test/CodeGen/AArch64/global-merge-4.ll
@@ -1,4 +1,4 @@
-; RUN: llc %s -mtriple=aarch64-linux-gnuabi -O3 -enable-global-merge -o - | FileCheck %s
+; RUN: llc %s -mtriple=aarch64-linux-gnuabi -aarch64-global-merge -o - | FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n32:64-S128"
 target triple = "arm64-apple-ios7.0.0"
diff --git a/test/CodeGen/AArch64/merge-store.ll b/test/CodeGen/AArch64/merge-store.ll
new file mode 100644
index 0000000..18dbad4
--- /dev/null
+++ b/test/CodeGen/AArch64/merge-store.ll
@@ -0,0 +1,20 @@
+; RUN: llc -march aarch64 %s -o - | FileCheck %s
+
+@g0 = external global <3 x float>, align 16
+@g1 = external global <3 x float>, align 4
+
+; CHECK: ldr s[[R0:[0-9]+]], {{\[}}[[R1:x[0-9]+]]{{\]}}, #4
+; CHECK: ld1{{\.?s?}} { v[[R0]]{{\.?s?}} }[1], {{\[}}[[R1]]{{\]}}
+; CHECK: str d[[R0]]
+
+define void @blam() {
+  %tmp4 = getelementptr inbounds <3 x float>, <3 x float>* @g1, i64 0, i64 0
+  %tmp5 = load <3 x float>, <3 x float>* @g0, align 16
+  %tmp6 = extractelement <3 x float> %tmp5, i64 0
+  store float %tmp6, float* %tmp4
+  %tmp7 = getelementptr inbounds float, float* %tmp4, i64 1
+  %tmp8 = load <3 x float>, <3 x float>* @g0, align 16
+  %tmp9 = extractelement <3 x float> %tmp8, i64 1
+  store float %tmp9, float* %tmp7
+  ret void;
+}
diff --git a/test/CodeGen/AArch64/print-mrs-system-register.ll b/test/CodeGen/AArch64/print-mrs-system-register.ll
new file mode 100644
index 0000000..3411ed6
--- /dev/null
+++ b/test/CodeGen/AArch64/print-mrs-system-register.ll
@@ -0,0 +1,11 @@
+; RUN: llc -mtriple=arm64-apple-darwin %s -o - | FileCheck %s
+
+; CHECK: mrs x0, CPM_IOACC_CTL_EL3
+
+define void @foo1() #0 {
+entry:
+  tail call void asm sideeffect "mrs x0, cpm_ioacc_ctl_el3", ""()
+  ret void
+}
+
+attributes #0 = { "target-cpu"="cyclone" }
diff --git a/test/CodeGen/AArch64/sibling-call.ll b/test/CodeGen/AArch64/sibling-call.ll
index 34d45d8..a68fdec 100644
--- a/test/CodeGen/AArch64/sibling-call.ll
+++ b/test/CodeGen/AArch64/sibling-call.ll
@@ -75,8 +75,8 @@ define void @caller_to16_from16([8 x i32], i64 %a, i64 %b) {
 
 ; CHECK: ldr [[VAL0:x[0-9]+]],
 ; CHECK: ldr [[VAL1:x[0-9]+]],
-; CHECK: str [[VAL1]],
 ; CHECK: str [[VAL0]],
+; CHECK: str [[VAL1]],
 
 ; CHECK-NOT: add sp, sp,
 ; CHECK: b callee_stack16
diff --git a/test/CodeGen/AArch64/stackmap-liveness.ll b/test/CodeGen/AArch64/stackmap-liveness.ll
new file mode 100644
index 0000000..6b37aac
--- /dev/null
+++ b/test/CodeGen/AArch64/stackmap-liveness.ll
@@ -0,0 +1,47 @@
+; RUN: llc < %s -mtriple=aarch64-apple-darwin | FileCheck %s
+
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+
+; CHECK-LABEL:  .section  __LLVM_STACKMAPS,__llvm_stackmaps
+; CHECK-NEXT:   __LLVM_StackMaps:
+; Header
+; CHECK-NEXT:   .byte 1
+; CHECK-NEXT:   .byte 0
+; CHECK-NEXT:   .short 0
+; Num Functions
+; CHECK-NEXT:   .long 1
+; Num LargeConstants
+; CHECK-NEXT:   .long   0
+; Num Callsites
+; CHECK-NEXT:   .long   1
+
+; Functions and stack size
+; CHECK-NEXT:   .quad _stackmap_liveness
+; CHECK-NEXT:   .quad 16
+
+; Test that the return register is recognized as an live-out.
+define i64 @stackmap_liveness(i1 %c) {
+; CHECK-LABEL:  .long L{{.*}}-_stackmap_liveness
+; CHECK-NEXT:   .short  0
+; CHECK-NEXT:   .short  0
+; Padding
+; CHECK-NEXT:   .short  0
+; Num LiveOut Entries: 1
+; CHECK-NEXT:   .short  2
+; LiveOut Entry 0: X0
+; CHECK-NEXT:   .short 0
+; CHECK-NEXT:   .byte 0
+; CHECK-NEXT:   .byte 8
+; LiveOut Entry 1: SP
+; CHECK-NEXT:   .short 31
+; CHECK-NEXT:   .byte 0
+; CHECK-NEXT:   .byte 8
+; Align
+; CHECK-NEXT:   .align  3
+  %1 = select i1 %c, i64 1, i64 2
+  call anyregcc void (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.void(i64 1, i32 32, i8* null, i32 0)
+  ret i64 %1
+}
+
+declare void @llvm.experimental.patchpoint.void(i64, i32, i8*, i32, ...)
+
diff --git a/test/CodeGen/AArch64/tailcall-explicit-sret.ll b/test/CodeGen/AArch64/tailcall-explicit-sret.ll
new file mode 100644
index 0000000..4d80f2a
--- /dev/null
+++ b/test/CodeGen/AArch64/tailcall-explicit-sret.ll
@@ -0,0 +1,106 @@
+; RUN: llc < %s -mtriple arm64-apple-darwin -aarch64-load-store-opt=false -asm-verbose=false | FileCheck %s
+; Disable the load/store optimizer to avoid having LDP/STPs and simplify checks.
+
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+
+; Check that we don't try to tail-call with a non-forwarded sret parameter.
+declare void @test_explicit_sret(i1024* sret) #0
+
+; This is the only OK case, where we forward the explicit sret pointer.
+
+; CHECK-LABEL: _test_tailcall_explicit_sret:
+; CHECK-NEXT: b _test_explicit_sret
+define void @test_tailcall_explicit_sret(i1024* sret %arg) #0 {
+  tail call void @test_explicit_sret(i1024* %arg)
+  ret void
+}
+
+; CHECK-LABEL: _test_call_explicit_sret:
+; CHECK-NOT: mov  x8
+; CHECK: bl _test_explicit_sret
+; CHECK: ret
+define void @test_call_explicit_sret(i1024* sret %arg) #0 {
+  call void @test_explicit_sret(i1024* %arg)
+  ret void
+}
+
+; CHECK-LABEL: _test_tailcall_explicit_sret_alloca_unused:
+; CHECK: mov  x8, sp
+; CHECK-NEXT: bl _test_explicit_sret
+; CHECK: ret
+define void @test_tailcall_explicit_sret_alloca_unused() #0 {
+  %l = alloca i1024, align 8
+  tail call void @test_explicit_sret(i1024* %l)
+  ret void
+}
+
+; CHECK-LABEL: _test_tailcall_explicit_sret_alloca_dummyusers:
+; CHECK: ldr [[PTRLOAD1:x[0-9]+]], [x0]
+; CHECK: str [[PTRLOAD1]], [sp]
+; CHECK: mov  x8, sp
+; CHECK-NEXT: bl _test_explicit_sret
+; CHECK: ret
+define void @test_tailcall_explicit_sret_alloca_dummyusers(i1024* %ptr) #0 {
+  %l = alloca i1024, align 8
+  %r = load i1024, i1024* %ptr, align 8
+  store i1024 %r, i1024* %l, align 8
+  tail call void @test_explicit_sret(i1024* %l)
+  ret void
+}
+
+; This is too conservative, but doesn't really happen in practice.
+
+; CHECK-LABEL: _test_tailcall_explicit_sret_gep:
+; CHECK: add  x8, x0, #128
+; CHECK-NEXT: bl _test_explicit_sret
+; CHECK: ret
+define void @test_tailcall_explicit_sret_gep(i1024* %ptr) #0 {
+  %ptr2 = getelementptr i1024, i1024* %ptr, i32 1
+  tail call void @test_explicit_sret(i1024* %ptr2)
+  ret void
+}
+
+; CHECK-LABEL: _test_tailcall_explicit_sret_alloca_returned:
+; CHECK: mov  x[[CALLERX8NUM:[0-9]+]], x8
+; CHECK: mov  x8, sp
+; CHECK-NEXT: bl _test_explicit_sret
+; CHECK-NEXT: ldr [[CALLERSRET1:x[0-9]+]], [sp]
+; CHECK: str [[CALLERSRET1:x[0-9]+]], [x[[CALLERX8NUM]]]
+; CHECK: ret
+define i1024 @test_tailcall_explicit_sret_alloca_returned() #0 {
+  %l = alloca i1024, align 8
+  tail call void @test_explicit_sret(i1024* %l)
+  %r = load i1024, i1024* %l, align 8
+  ret i1024 %r
+}
+
+; CHECK-LABEL: _test_indirect_tailcall_explicit_sret_nosret_arg:
+; CHECK-DAG: mov  x[[CALLERX8NUM:[0-9]+]], x8
+; CHECK-DAG: mov  [[FPTR:x[0-9]+]], x0
+; CHECK: mov  x0, sp
+; CHECK-NEXT: blr [[FPTR]]
+; CHECK-NEXT: ldr [[CALLERSRET1:x[0-9]+]], [sp]
+; CHECK: str [[CALLERSRET1:x[0-9]+]], [x[[CALLERX8NUM]]]
+; CHECK: ret
+define void @test_indirect_tailcall_explicit_sret_nosret_arg(i1024* sret %arg, void (i1024*)* %f) #0 {
+  %l = alloca i1024, align 8
+  tail call void %f(i1024* %l)
+  %r = load i1024, i1024* %l, align 8
+  store i1024 %r, i1024* %arg, align 8
+  ret void
+}
+
+; CHECK-LABEL: _test_indirect_tailcall_explicit_sret_:
+; CHECK: mov  x[[CALLERX8NUM:[0-9]+]], x8
+; CHECK: mov  x8, sp
+; CHECK-NEXT: blr x0
+; CHECK-NEXT: ldr [[CALLERSRET1:x[0-9]+]], [sp]
+; CHECK: str [[CALLERSRET1:x[0-9]+]], [x[[CALLERX8NUM]]]
+; CHECK: ret
+define void @test_indirect_tailcall_explicit_sret_(i1024* sret %arg, i1024 ()* %f) #0 {
+  %ret = tail call i1024 %f()
+  store i1024 %ret, i1024* %arg, align 8
+  ret void
+}
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/AArch64/tailcall-implicit-sret.ll b/test/CodeGen/AArch64/tailcall-implicit-sret.ll
new file mode 100644
index 0000000..5d68059
--- /dev/null
+++ b/test/CodeGen/AArch64/tailcall-implicit-sret.ll
@@ -0,0 +1,46 @@
+; RUN: llc < %s -mtriple arm64-apple-darwin -aarch64-load-store-opt=false -asm-verbose=false | FileCheck %s
+; Disable the load/store optimizer to avoid having LDP/STPs and simplify checks.
+
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+
+; Check that we don't try to tail-call with an sret-demoted return.
+
+declare i1024 @test_sret() #0
+
+; CHECK-LABEL: _test_call_sret:
+; CHECK: mov  x[[CALLERX8NUM:[0-9]+]], x8
+; CHECK: mov  x8, sp
+; CHECK-NEXT: bl _test_sret
+; CHECK-NEXT: ldr [[CALLERSRET1:x[0-9]+]], [sp]
+; CHECK: str [[CALLERSRET1:x[0-9]+]], [x[[CALLERX8NUM]]]
+; CHECK: ret
+define i1024 @test_call_sret() #0 {
+  %a = call i1024 @test_sret()
+  ret i1024 %a
+}
+
+; CHECK-LABEL: _test_tailcall_sret:
+; CHECK: mov  x[[CALLERX8NUM:[0-9]+]], x8
+; CHECK: mov  x8, sp
+; CHECK-NEXT: bl _test_sret
+; CHECK-NEXT: ldr [[CALLERSRET1:x[0-9]+]], [sp]
+; CHECK: str [[CALLERSRET1:x[0-9]+]], [x[[CALLERX8NUM]]]
+; CHECK: ret
+define i1024 @test_tailcall_sret() #0 {
+  %a = tail call i1024 @test_sret()
+  ret i1024 %a
+}
+
+; CHECK-LABEL: _test_indirect_tailcall_sret:
+; CHECK: mov  x[[CALLERX8NUM:[0-9]+]], x8
+; CHECK: mov  x8, sp
+; CHECK-NEXT: blr x0
+; CHECK-NEXT: ldr [[CALLERSRET1:x[0-9]+]], [sp]
+; CHECK: str [[CALLERSRET1:x[0-9]+]], [x[[CALLERX8NUM]]]
+; CHECK: ret
+define i1024 @test_indirect_tailcall_sret(i1024 ()* %f) #0 {
+  %a = tail call i1024 %f()
+  ret i1024 %a
+}
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/AArch64/tailcall-mem-intrinsics.ll b/test/CodeGen/AArch64/tailcall-mem-intrinsics.ll
new file mode 100644
index 0000000..b970fb1
--- /dev/null
+++ b/test/CodeGen/AArch64/tailcall-mem-intrinsics.ll
@@ -0,0 +1,31 @@
+; RUN: llc -mtriple=aarch64-unknown-unknown < %s | FileCheck %s
+
+; CHECK-LABEL: tail_memcpy:
+; CHECK: b memcpy
+define void @tail_memcpy(i8* nocapture %p, i8* nocapture readonly %q, i32 %n) #0 {
+entry:
+  tail call void @llvm.memcpy.p0i8.p0i8.i32(i8* %p, i8* %q, i32 %n, i32 1, i1 false)
+  ret void
+}
+
+; CHECK-LABEL: tail_memmove:
+; CHECK: b memmove
+define void @tail_memmove(i8* nocapture %p, i8* nocapture readonly %q, i32 %n) #0 {
+entry:
+  tail call void @llvm.memmove.p0i8.p0i8.i32(i8* %p, i8* %q, i32 %n, i32 1, i1 false)
+  ret void
+}
+
+; CHECK-LABEL: tail_memset:
+; CHECK: b memset
+define void @tail_memset(i8* nocapture %p, i8 %c, i32 %n) #0 {
+entry:
+  tail call void @llvm.memset.p0i8.i32(i8* %p, i8 %c, i32 %n, i32 1, i1 false)
+  ret void
+}
+
+declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture readonly, i32, i32, i1) #0
+declare void @llvm.memmove.p0i8.p0i8.i32(i8* nocapture, i8* nocapture readonly, i32, i32, i1) #0
+declare void @llvm.memset.p0i8.i32(i8* nocapture, i8, i32, i32, i1) #0
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/AArch64/vcvt-oversize.ll b/test/CodeGen/AArch64/vcvt-oversize.ll
new file mode 100644
index 0000000..066a4b6
--- /dev/null
+++ b/test/CodeGen/AArch64/vcvt-oversize.ll
@@ -0,0 +1,16 @@
+; RUN: llc -mtriple=aarch64 < %s | FileCheck %s
+
+define <8 x i8> @float_to_i8(<8 x float>* %in) {
+; CHECK-LABEL: float_to_i8:
+; CHECK-DAG: fadd v[[LSB:[0-9]+]].4s, v0.4s, v0.4s
+; CHECK-DAG: fadd v[[MSB:[0-9]+]].4s, v1.4s, v1.4s
+; CHECK-DAG: fcvtzu v[[LSB2:[0-9]+]].4s, v[[LSB]].4s
+; CHECK-DAG: fcvtzu v[[MSB2:[0-9]+]].4s, v[[MSB]].4s
+; CHECK-DAG: xtn v[[TMP:[0-9]+]].4h, v[[LSB]].4s
+; CHECK-DAG: xtn2 v[[TMP]].8h, v[[MSB]].4s
+; CHECK-DAG: xtn v0.8b, v[[TMP]].8h
+  %l = load <8 x float>, <8 x float>* %in
+  %scale = fmul <8 x float> %l, <float 2.0, float 2.0, float 2.0, float 2.0, float 2.0, float 2.0, float 2.0, float 2.0>
+  %conv = fptoui <8 x float> %scale to <8 x i8>
+  ret <8 x i8> %conv
+}
diff --git a/test/CodeGen/ARM/2007-03-21-JoinIntervalsCrash.ll b/test/CodeGen/ARM/2007-03-21-JoinIntervalsCrash.ll
index 0162d7f..7c42596 100644
--- a/test/CodeGen/ARM/2007-03-21-JoinIntervalsCrash.ll
+++ b/test/CodeGen/ARM/2007-03-21-JoinIntervalsCrash.ll
@@ -83,7 +83,7 @@ cond_next881:		; preds = %bb866
 	%tmp884885 = inttoptr i64 %tmp10959 to %struct.tree_identifier*		; <%struct.tree_identifier*> [#uses=1]
 	%tmp887 = getelementptr %struct.tree_identifier, %struct.tree_identifier* %tmp884885, i32 0, i32 1, i32 0		; <i8**> [#uses=1]
 	%tmp888 = load i8*, i8** %tmp887		; <i8*> [#uses=1]
-	tail call void (i32, ...)* @error( i32 undef, i8* %tmp888 )
+	tail call void (i32, ...) @error( i32 undef, i8* %tmp888 )
 	ret void
 
 cond_true918:		; preds = %cond_false841
diff --git a/test/CodeGen/ARM/2007-04-03-PEIBug.ll b/test/CodeGen/ARM/2007-04-03-PEIBug.ll
index cf5094f..87863bd 100644
--- a/test/CodeGen/ARM/2007-04-03-PEIBug.ll
+++ b/test/CodeGen/ARM/2007-04-03-PEIBug.ll
@@ -5,7 +5,7 @@ entry:
 	%A = alloca [1123 x i32], align 16		; <[1123 x i32]*> [#uses=1]
 	%B = alloca [3123 x i32], align 16		; <[3123 x i32]*> [#uses=1]
 	%C = alloca [12312 x i32], align 16		; <[12312 x i32]*> [#uses=1]
-	%tmp = call i32 (...)* @bar( [3123 x i32]* %B, [1123 x i32]* %A, [12312 x i32]* %C )		; <i32> [#uses=0]
+	%tmp = call i32 (...) @bar( [3123 x i32]* %B, [1123 x i32]* %A, [12312 x i32]* %C )		; <i32> [#uses=0]
 	ret i32 undef
 }
 
diff --git a/test/CodeGen/ARM/2007-04-03-UndefinedSymbol.ll b/test/CodeGen/ARM/2007-04-03-UndefinedSymbol.ll
index b687029..11f3003 100644
--- a/test/CodeGen/ARM/2007-04-03-UndefinedSymbol.ll
+++ b/test/CodeGen/ARM/2007-04-03-UndefinedSymbol.ll
@@ -10,7 +10,7 @@ define internal void @_ZN1B1iEv(%struct.B* %this) {
 entry:
 	%tmp1 = getelementptr %struct.B, %struct.B* %this, i32 0, i32 0		; <i32*> [#uses=1]
 	%tmp2 = load i32, i32* %tmp1		; <i32> [#uses=1]
-	%tmp4 = tail call i32 (i8*, ...)* @printf( i8* getelementptr ([7 x i8], [7 x i8]* @str, i32 0, i32 0), i32 %tmp2 )		; <i32> [#uses=0]
+	%tmp4 = tail call i32 (i8*, ...) @printf( i8* getelementptr ([7 x i8], [7 x i8]* @str, i32 0, i32 0), i32 %tmp2 )		; <i32> [#uses=0]
 	ret void
 }
 
@@ -20,7 +20,7 @@ define internal void @_ZN1B1jEv(%struct.B* %this) {
 entry:
 	%tmp1 = getelementptr %struct.B, %struct.B* %this, i32 0, i32 0		; <i32*> [#uses=1]
 	%tmp2 = load i32, i32* %tmp1		; <i32> [#uses=1]
-	%tmp4 = tail call i32 (i8*, ...)* @printf( i8* getelementptr ([7 x i8], [7 x i8]* @str1, i32 0, i32 0), i32 %tmp2 )		; <i32> [#uses=0]
+	%tmp4 = tail call i32 (i8*, ...) @printf( i8* getelementptr ([7 x i8], [7 x i8]* @str1, i32 0, i32 0), i32 %tmp2 )		; <i32> [#uses=0]
 	ret void
 }
 
diff --git a/test/CodeGen/ARM/2007-05-03-BadPostIndexedLd.ll b/test/CodeGen/ARM/2007-05-03-BadPostIndexedLd.ll
index ca168b6..50573b4 100644
--- a/test/CodeGen/ARM/2007-05-03-BadPostIndexedLd.ll
+++ b/test/CodeGen/ARM/2007-05-03-BadPostIndexedLd.ll
@@ -93,7 +93,7 @@ cond_true1272:		; preds = %cond_next1267
 	%tmp42.i348 = sub i32 0, %tmp2930.i		; <i32> [#uses=1]
 	%tmp45.i = getelementptr %struct.TestObj, %struct.TestObj* %tmp1273, i32 0, i32 0		; <i8**> [#uses=2]
 	%tmp48.i = load i8*, i8** %tmp45.i		; <i8*> [#uses=1]
-	%tmp50.i350 = call i32 (i8*, i8*, ...)* @sprintf( i8* getelementptr ([256 x i8], [256 x i8]* @Msg, i32 0, i32 0), i8* getelementptr ([48 x i8], [48 x i8]* @.str53615, i32 0, i32 0), i8* null, i8** %tmp45.i, i8* %tmp48.i )		; <i32> [#uses=0]
+	%tmp50.i350 = call i32 (i8*, i8*, ...) @sprintf( i8* getelementptr ([256 x i8], [256 x i8]* @Msg, i32 0, i32 0), i8* getelementptr ([48 x i8], [48 x i8]* @.str53615, i32 0, i32 0), i8* null, i8** %tmp45.i, i8* %tmp48.i )		; <i32> [#uses=0]
 	br i1 false, label %cond_true.i632.i, label %Ut_TraceMsg.exit648.i
 
 cond_true.i632.i:		; preds = %cond_true1272
diff --git a/test/CodeGen/ARM/2007-05-07-tailmerge-1.ll b/test/CodeGen/ARM/2007-05-07-tailmerge-1.ll
index 5895a32..f49c805 100644
--- a/test/CodeGen/ARM/2007-05-07-tailmerge-1.ll
+++ b/test/CodeGen/ARM/2007-05-07-tailmerge-1.ll
@@ -24,13 +24,13 @@ entry:
 	br i1 %toBool, label %cond_true, label %cond_false
 
 cond_true:		; preds = %entry
-	%tmp3 = call i32 (...)* @bar( )		; <i32> [#uses=0]
-	%tmp4 = call i32 (...)* @baz( i32 5, i32 6 )		; <i32> [#uses=0]
+	%tmp3 = call i32 (...) @bar( )		; <i32> [#uses=0]
+	%tmp4 = call i32 (...) @baz( i32 5, i32 6 )		; <i32> [#uses=0]
 	br label %cond_next
 
 cond_false:		; preds = %entry
-	%tmp5 = call i32 (...)* @foo( )		; <i32> [#uses=0]
-	%tmp6 = call i32 (...)* @baz( i32 5, i32 6 )		; <i32> [#uses=0]
+	%tmp5 = call i32 (...) @foo( )		; <i32> [#uses=0]
+	%tmp6 = call i32 (...) @baz( i32 5, i32 6 )		; <i32> [#uses=0]
 	br label %cond_next
 
 cond_next:		; preds = %cond_false, %cond_true
@@ -41,17 +41,17 @@ cond_next:		; preds = %cond_false, %cond_true
 	br i1 %toBool10, label %cond_true11, label %cond_false15
 
 cond_true11:		; preds = %cond_next
-	%tmp13 = call i32 (...)* @foo( )		; <i32> [#uses=0]
-	%tmp14 = call i32 (...)* @quux( i32 3, i32 4 )		; <i32> [#uses=0]
+	%tmp13 = call i32 (...) @foo( )		; <i32> [#uses=0]
+	%tmp14 = call i32 (...) @quux( i32 3, i32 4 )		; <i32> [#uses=0]
 	br label %cond_next18
 
 cond_false15:		; preds = %cond_next
-	%tmp16 = call i32 (...)* @bar( )		; <i32> [#uses=0]
-	%tmp17 = call i32 (...)* @quux( i32 3, i32 4 )		; <i32> [#uses=0]
+	%tmp16 = call i32 (...) @bar( )		; <i32> [#uses=0]
+	%tmp17 = call i32 (...) @quux( i32 3, i32 4 )		; <i32> [#uses=0]
 	br label %cond_next18
 
 cond_next18:		; preds = %cond_false15, %cond_true11
-	%tmp19 = call i32 (...)* @bar( )		; <i32> [#uses=0]
+	%tmp19 = call i32 (...) @bar( )		; <i32> [#uses=0]
 	br label %return
 
 return:		; preds = %cond_next18
diff --git a/test/CodeGen/ARM/2007-05-09-tailmerge-2.ll b/test/CodeGen/ARM/2007-05-09-tailmerge-2.ll
index abb6a33f..421d501 100644
--- a/test/CodeGen/ARM/2007-05-09-tailmerge-2.ll
+++ b/test/CodeGen/ARM/2007-05-09-tailmerge-2.ll
@@ -26,8 +26,8 @@ entry:
 	br i1 %toBool, label %cond_true, label %cond_false
 
 cond_true:		; preds = %entry
-	%tmp3 = call i32 (...)* @bar( )		; <i32> [#uses=0]
-	%tmp4 = call i32 (...)* @baz( i32 5, i32 6 )		; <i32> [#uses=0]
+	%tmp3 = call i32 (...) @bar( )		; <i32> [#uses=0]
+	%tmp4 = call i32 (...) @baz( i32 5, i32 6 )		; <i32> [#uses=0]
 	%tmp7 = load i32, i32* %q_addr		; <i32> [#uses=1]
 	%tmp8 = icmp ne i32 %tmp7, 0		; <i1> [#uses=1]
 	%tmp89 = zext i1 %tmp8 to i8		; <i8> [#uses=1]
@@ -35,8 +35,8 @@ cond_true:		; preds = %entry
 	br i1 %toBool10, label %cond_true11, label %cond_false15
 
 cond_false:		; preds = %entry
-	%tmp5 = call i32 (...)* @foo( )		; <i32> [#uses=0]
-	%tmp6 = call i32 (...)* @baz( i32 5, i32 6 )		; <i32> [#uses=0]
+	%tmp5 = call i32 (...) @foo( )		; <i32> [#uses=0]
+	%tmp6 = call i32 (...) @baz( i32 5, i32 6 )		; <i32> [#uses=0]
 	%tmp27 = load i32, i32* %q_addr		; <i32> [#uses=1]
 	%tmp28 = icmp ne i32 %tmp27, 0		; <i1> [#uses=1]
 	%tmp289 = zext i1 %tmp28 to i8		; <i8> [#uses=1]
@@ -44,17 +44,17 @@ cond_false:		; preds = %entry
 	br i1 %toBool210, label %cond_true11, label %cond_false15
 
 cond_true11:		; preds = %cond_next
-	%tmp13 = call i32 (...)* @foo( )		; <i32> [#uses=0]
-	%tmp14 = call i32 (...)* @quux( i32 3, i32 4 )		; <i32> [#uses=0]
+	%tmp13 = call i32 (...) @foo( )		; <i32> [#uses=0]
+	%tmp14 = call i32 (...) @quux( i32 3, i32 4 )		; <i32> [#uses=0]
 	br label %cond_next18
 
 cond_false15:		; preds = %cond_next
-	%tmp16 = call i32 (...)* @bar( )		; <i32> [#uses=0]
-	%tmp17 = call i32 (...)* @quux( i32 3, i32 4 )		; <i32> [#uses=0]
+	%tmp16 = call i32 (...) @bar( )		; <i32> [#uses=0]
+	%tmp17 = call i32 (...) @quux( i32 3, i32 4 )		; <i32> [#uses=0]
 	br label %cond_next18
 
 cond_next18:		; preds = %cond_false15, %cond_true11
-	%tmp19 = call i32 (...)* @bar( )		; <i32> [#uses=0]
+	%tmp19 = call i32 (...) @bar( )		; <i32> [#uses=0]
 	br label %return
 
 return:		; preds = %cond_next18
diff --git a/test/CodeGen/ARM/2007-05-22-tailmerge-3.ll b/test/CodeGen/ARM/2007-05-22-tailmerge-3.ll
index 1edaefb..52cc37e 100644
--- a/test/CodeGen/ARM/2007-05-22-tailmerge-3.ll
+++ b/test/CodeGen/ARM/2007-05-22-tailmerge-3.ll
@@ -36,8 +36,8 @@ entry:
 	br i1 %toBool, label %cond_true, label %cond_false
 
 cond_true:		; preds = %entry
-	%tmp3 = call i32 (...)* @bar( )		; <i32> [#uses=0]
-	%tmp4 = call i32 (...)* @baz( i32 5, i32 6 )		; <i32> [#uses=0]
+	%tmp3 = call i32 (...) @bar( )		; <i32> [#uses=0]
+	%tmp4 = call i32 (...) @baz( i32 5, i32 6 )		; <i32> [#uses=0]
 	%tmp7 = load i32, i32* %q_addr		; <i32> [#uses=1]
 	%tmp8 = icmp ne i32 %tmp7, 0		; <i1> [#uses=1]
 	%tmp89 = zext i1 %tmp8 to i8		; <i8> [#uses=1]
@@ -45,8 +45,8 @@ cond_true:		; preds = %entry
 	br i1 %toBool10, label %cond_true11, label %cond_false15
 
 cond_false:		; preds = %entry
-	%tmp5 = call i32 (...)* @foo( )		; <i32> [#uses=0]
-	%tmp6 = call i32 (...)* @baz( i32 5, i32 6 )		; <i32> [#uses=0]
+	%tmp5 = call i32 (...) @foo( )		; <i32> [#uses=0]
+	%tmp6 = call i32 (...) @baz( i32 5, i32 6 )		; <i32> [#uses=0]
 	%tmp27 = load i32, i32* %q_addr		; <i32> [#uses=1]
 	%tmp28 = icmp ne i32 %tmp27, 0		; <i1> [#uses=1]
 	%tmp289 = zext i1 %tmp28 to i8		; <i8> [#uses=1]
@@ -54,17 +54,17 @@ cond_false:		; preds = %entry
 	br i1 %toBool210, label %cond_true11, label %cond_false15
 
 cond_true11:		; preds = %cond_next
-	%tmp13 = call i32 (...)* @foo( )		; <i32> [#uses=0]
-	%tmp14 = call i32 (...)* @quux( i32 3, i32 4 )		; <i32> [#uses=0]
+	%tmp13 = call i32 (...) @foo( )		; <i32> [#uses=0]
+	%tmp14 = call i32 (...) @quux( i32 3, i32 4 )		; <i32> [#uses=0]
 	br label %cond_next18
 
 cond_false15:		; preds = %cond_next
-	%tmp16 = call i32 (...)* @bar( )		; <i32> [#uses=0]
-	%tmp17 = call i32 (...)* @quux( i32 3, i32 4 )		; <i32> [#uses=0]
+	%tmp16 = call i32 (...) @bar( )		; <i32> [#uses=0]
+	%tmp17 = call i32 (...) @quux( i32 3, i32 4 )		; <i32> [#uses=0]
 	br label %cond_next18
 
 cond_next18:		; preds = %cond_false15, %cond_true11
-	%tmp19 = call i32 (...)* @bar( )		; <i32> [#uses=0]
+	%tmp19 = call i32 (...) @bar( )		; <i32> [#uses=0]
 	br label %return
 
 return:		; preds = %cond_next18
diff --git a/test/CodeGen/ARM/2008-03-07-RegScavengerAssert.ll b/test/CodeGen/ARM/2008-03-07-RegScavengerAssert.ll
index 5ee8b46..753f9e3 100644
--- a/test/CodeGen/ARM/2008-03-07-RegScavengerAssert.ll
+++ b/test/CodeGen/ARM/2008-03-07-RegScavengerAssert.ll
@@ -13,7 +13,7 @@ bb88.i:		; preds = %bb74.i
 mandel.exit:		; preds = %bb88.i
 	%tmp2 = load volatile double, double* getelementptr ({ double, double }, { double, double }* @accum, i32 0, i32 0), align 8		; <double> [#uses=1]
 	%tmp23 = fptosi double %tmp2 to i32		; <i32> [#uses=1]
-	%tmp5 = tail call i32 (i8*, ...)* @printf( i8* getelementptr ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i32 %tmp23 )		; <i32> [#uses=0]
+	%tmp5 = tail call i32 (i8*, ...) @printf( i8* getelementptr ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i32 %tmp23 )		; <i32> [#uses=0]
 	ret i32 0
 }
 
diff --git a/test/CodeGen/ARM/2008-04-10-ScavengerAssert.ll b/test/CodeGen/ARM/2008-04-10-ScavengerAssert.ll
index bd1f174..1ededa3 100644
--- a/test/CodeGen/ARM/2008-04-10-ScavengerAssert.ll
+++ b/test/CodeGen/ARM/2008-04-10-ScavengerAssert.ll
@@ -73,10 +73,10 @@ bb609.i.i:		; preds = %cond_next602.i.i
 	br label %bb620.i.i
 bb620.i.i:		; preds = %bb620.i.i, %bb609.i.i
 	%indvar166.i465.i = phi i32 [ %indvar.next167.i.i, %bb620.i.i ], [ 0, %bb609.i.i ]		; <i32> [#uses=1]
-	%tmp640.i.i = call i32 (%struct.FILE*, i8*, ...)* @fscanf( %struct.FILE* %tmp61, i8* getelementptr ([5 x i8], [5 x i8]* @.str584, i32 0, i32 0), [1024 x i8]* null )		; <i32> [#uses=0]
+	%tmp640.i.i = call i32 (%struct.FILE*, i8*, ...) @fscanf( %struct.FILE* %tmp61, i8* getelementptr ([5 x i8], [5 x i8]* @.str584, i32 0, i32 0), [1024 x i8]* null )		; <i32> [#uses=0]
 	%tmp648.i.i = load i32, i32* null, align 4		; <i32> [#uses=1]
 	%tmp650.i468.i = icmp sgt i32 0, %tmp648.i.i		; <i1> [#uses=1]
-	%tmp624.i469.i = call i32 (%struct.FILE*, i8*, ...)* @fscanf( %struct.FILE* %tmp61, i8* getelementptr ([5 x i8], [5 x i8]* @.str584, i32 0, i32 0), [1024 x i8]* null )		; <i32> [#uses=0]
+	%tmp624.i469.i = call i32 (%struct.FILE*, i8*, ...) @fscanf( %struct.FILE* %tmp61, i8* getelementptr ([5 x i8], [5 x i8]* @.str584, i32 0, i32 0), [1024 x i8]* null )		; <i32> [#uses=0]
 	%indvar.next167.i.i = add i32 %indvar166.i465.i, 1		; <i32> [#uses=1]
 	br i1 %tmp650.i468.i, label %bb653.i.i.loopexit, label %bb620.i.i
 bb653.i.i.loopexit:		; preds = %bb620.i.i
diff --git a/test/CodeGen/ARM/2009-02-16-SpillerBug.ll b/test/CodeGen/ARM/2009-02-16-SpillerBug.ll
index d090da0..cad5440 100644
--- a/test/CodeGen/ARM/2009-02-16-SpillerBug.ll
+++ b/test/CodeGen/ARM/2009-02-16-SpillerBug.ll
@@ -81,7 +81,7 @@ bb244:		; preds = %bb122, %bb122, %bb122, %bb122, %bb122, %bb122, %bb122, %bb122
 	br i1 %0, label %bb435, label %bb433
 
 bb394:		; preds = %bb122
-	call void (i32, i32, i8*, i32, %struct.FILE_POS*, ...)* @Error(i32 1, i32 3, i8* getelementptr ([23 x i8], [23 x i8]* @"\01LC13423", i32 0, i32 0), i32 0, %struct.FILE_POS* @no_file_pos, i8* getelementptr ([13 x i8], [13 x i8]* @"\01LC18972", i32 0, i32 0), i8* null) nounwind
+	call void (i32, i32, i8*, i32, %struct.FILE_POS*, ...) @Error(i32 1, i32 3, i8* getelementptr ([23 x i8], [23 x i8]* @"\01LC13423", i32 0, i32 0), i32 0, %struct.FILE_POS* @no_file_pos, i8* getelementptr ([13 x i8], [13 x i8]* @"\01LC18972", i32 0, i32 0), i8* null) nounwind
 	br label %bb396
 
 bb396:		; preds = %bb394, %bb131, %bb122, %bb122, %bb122, %bb122, %RESUME
diff --git a/test/CodeGen/ARM/2009-04-08-FREM.ll b/test/CodeGen/ARM/2009-04-08-FREM.ll
index 606c6b1..e0f9485 100644
--- a/test/CodeGen/ARM/2009-04-08-FREM.ll
+++ b/test/CodeGen/ARM/2009-04-08-FREM.ll
@@ -4,6 +4,6 @@ declare i32 @printf(i8*, ...)
 
 define i32 @main() {
 	%rem_r = frem double 0.000000e+00, 0.000000e+00		; <double> [#uses=1]
-	%1 = call i32 (i8*, ...)* @printf(i8* null, double %rem_r)		; <i32> [#uses=0]
+	%1 = call i32 (i8*, ...) @printf(i8* null, double %rem_r)		; <i32> [#uses=0]
 	ret i32 0
 }
diff --git a/test/CodeGen/ARM/2009-05-07-RegAllocLocal.ll b/test/CodeGen/ARM/2009-05-07-RegAllocLocal.ll
index 887fb0b..ac641f9 100644
--- a/test/CodeGen/ARM/2009-05-07-RegAllocLocal.ll
+++ b/test/CodeGen/ARM/2009-05-07-RegAllocLocal.ll
@@ -5,7 +5,7 @@
 define i16 @fn16(i16 %arg0.0, <2 x i16> %arg1, i16 %arg2.0) nounwind {
 entry:
 	store <2 x i16> %arg1, <2 x i16>* null
-	%0 = call i32 (i8*, ...)* @printf(i8* getelementptr ([30 x i8], [30 x i8]* @.str, i32 0, i32 0), i32 0) nounwind		; <i32> [#uses=0]
+	%0 = call i32 (i8*, ...) @printf(i8* getelementptr ([30 x i8], [30 x i8]* @.str, i32 0, i32 0), i32 0) nounwind		; <i32> [#uses=0]
 	ret i16 0
 }
 
diff --git a/test/CodeGen/ARM/2009-05-11-CodePlacementCrash.ll b/test/CodeGen/ARM/2009-05-11-CodePlacementCrash.ll
index b616cb3..ae005db 100644
--- a/test/CodeGen/ARM/2009-05-11-CodePlacementCrash.ll
+++ b/test/CodeGen/ARM/2009-05-11-CodePlacementCrash.ll
@@ -19,7 +19,7 @@ bb1:		; preds = %bb
 
 bb3:		; preds = %bb1, %bb
 	%iftmp.0.0 = phi i32 [ 0, %bb1 ], [ -1, %bb ]		; <i32> [#uses=1]
-	%1 = tail call i32 (i8*, ...)* @printf(i8* getelementptr ([7 x i8], [7 x i8]* @"\01LC", i32 0, i32 0), i32 0, i32 %iftmp.0.0) nounwind		; <i32> [#uses=0]
+	%1 = tail call i32 (i8*, ...) @printf(i8* getelementptr ([7 x i8], [7 x i8]* @"\01LC", i32 0, i32 0), i32 0, i32 %iftmp.0.0) nounwind		; <i32> [#uses=0]
 	%2 = load %struct.List*, %struct.List** null, align 4		; <%struct.List*> [#uses=2]
 	%phitmp = icmp eq %struct.List* %2, null		; <i1> [#uses=1]
 	br i1 %phitmp, label %bb5, label %bb
diff --git a/test/CodeGen/ARM/2009-06-02-ISelCrash.ll b/test/CodeGen/ARM/2009-06-02-ISelCrash.ll
index 0612d51..7bbb809 100644
--- a/test/CodeGen/ARM/2009-06-02-ISelCrash.ll
+++ b/test/CodeGen/ARM/2009-06-02-ISelCrash.ll
@@ -57,6 +57,6 @@ Fft.exit.i:		; preds = %bb7.i.i
 	br i1 undef, label %bb5.i, label %bb1.outer2.i.i.outer
 
 bb5.i:		; preds = %Fft.exit.i
-	%0 = tail call i32 (i8*, ...)* @printf(i8* getelementptr ([15 x i8], [15 x i8]* @"\01LC", i32 0, i32 0), double undef, double undef) nounwind		; <i32> [#uses=0]
+	%0 = tail call i32 (i8*, ...) @printf(i8* getelementptr ([15 x i8], [15 x i8]* @"\01LC", i32 0, i32 0), double undef, double undef) nounwind		; <i32> [#uses=0]
 	unreachable
 }
diff --git a/test/CodeGen/ARM/2009-06-30-RegScavengerAssert.ll b/test/CodeGen/ARM/2009-06-30-RegScavengerAssert.ll
index 72a41f9..e9c4b03 100644
--- a/test/CodeGen/ARM/2009-06-30-RegScavengerAssert.ll
+++ b/test/CodeGen/ARM/2009-06-30-RegScavengerAssert.ll
@@ -47,14 +47,14 @@ bb11:		; preds = %bb9
 	tail call  void @diff(i8* undef, i8* %3, i32 undef, i32 undef, i32 undef, i32 undef) nounwind
 	%4 = sitofp i32 undef to double		; <double> [#uses=1]
 	%5 = fdiv double %4, 1.000000e+01		; <double> [#uses=1]
-	%6 = tail call  i32 (i8*, ...)* @printf(i8* getelementptr ([29 x i8], [29 x i8]* @"\01LC12", i32 0, i32 0), double %5) nounwind		; <i32> [#uses=0]
+	%6 = tail call  i32 (i8*, ...) @printf(i8* getelementptr ([29 x i8], [29 x i8]* @"\01LC12", i32 0, i32 0), double %5) nounwind		; <i32> [#uses=0]
 	%7 = load i32, i32* @al_len, align 4		; <i32> [#uses=1]
 	%8 = load i32, i32* @no_mat, align 4		; <i32> [#uses=1]
 	%9 = load i32, i32* @no_mis, align 4		; <i32> [#uses=1]
 	%10 = sub i32 %7, %8		; <i32> [#uses=1]
 	%11 = sub i32 %10, %9		; <i32> [#uses=1]
-	%12 = tail call  i32 (i8*, ...)* @printf(i8* getelementptr ([33 x i8], [33 x i8]* @"\01LC16", i32 0, i32 0), i32 %11) nounwind		; <i32> [#uses=0]
-	%13 = tail call  i32 (i8*, ...)* @printf(i8* getelementptr ([47 x i8], [47 x i8]* @"\01LC17", i32 0, i32 0), i32 undef, i32 %1, i32 undef, i32 undef) nounwind		; <i32> [#uses=0]
+	%12 = tail call  i32 (i8*, ...) @printf(i8* getelementptr ([33 x i8], [33 x i8]* @"\01LC16", i32 0, i32 0), i32 %11) nounwind		; <i32> [#uses=0]
+	%13 = tail call  i32 (i8*, ...) @printf(i8* getelementptr ([47 x i8], [47 x i8]* @"\01LC17", i32 0, i32 0), i32 undef, i32 %1, i32 undef, i32 undef) nounwind		; <i32> [#uses=0]
 	br i1 undef, label %bb15, label %bb12
 
 bb12:		; preds = %bb11
diff --git a/test/CodeGen/ARM/2009-06-30-RegScavengerAssert2.ll b/test/CodeGen/ARM/2009-06-30-RegScavengerAssert2.ll
index 92b1869..08291e6 100644
--- a/test/CodeGen/ARM/2009-06-30-RegScavengerAssert2.ll
+++ b/test/CodeGen/ARM/2009-06-30-RegScavengerAssert2.ll
@@ -42,10 +42,10 @@ bb11:		; preds = %bb9
 	store i32 0, i32* @no_mis, align 4
 	%4 = getelementptr i8, i8* %B, i32 %0		; <i8*> [#uses=1]
 	tail call  void @diff(i8* undef, i8* %4, i32 undef, i32 %3, i32 undef, i32 undef) nounwind
-	%5 = tail call  i32 (i8*, ...)* @printf(i8* getelementptr ([33 x i8], [33 x i8]* @"\01LC11", i32 0, i32 0), i32 %tmp13) nounwind		; <i32> [#uses=0]
+	%5 = tail call  i32 (i8*, ...) @printf(i8* getelementptr ([33 x i8], [33 x i8]* @"\01LC11", i32 0, i32 0), i32 %tmp13) nounwind		; <i32> [#uses=0]
 	%6 = load i32, i32* @no_mis, align 4		; <i32> [#uses=1]
-	%7 = tail call  i32 (i8*, ...)* @printf(i8* getelementptr ([33 x i8], [33 x i8]* @"\01LC15", i32 0, i32 0), i32 %6) nounwind		; <i32> [#uses=0]
-	%8 = tail call  i32 (i8*, ...)* @printf(i8* getelementptr ([47 x i8], [47 x i8]* @"\01LC17", i32 0, i32 0), i32 undef, i32 %1, i32 undef, i32 %2) nounwind		; <i32> [#uses=0]
+	%7 = tail call  i32 (i8*, ...) @printf(i8* getelementptr ([33 x i8], [33 x i8]* @"\01LC15", i32 0, i32 0), i32 %6) nounwind		; <i32> [#uses=0]
+	%8 = tail call  i32 (i8*, ...) @printf(i8* getelementptr ([47 x i8], [47 x i8]* @"\01LC17", i32 0, i32 0), i32 undef, i32 %1, i32 undef, i32 %2) nounwind		; <i32> [#uses=0]
 	br i1 undef, label %bb15, label %bb12
 
 bb12:		; preds = %bb11
diff --git a/test/CodeGen/ARM/2009-10-16-Scope.ll b/test/CodeGen/ARM/2009-10-16-Scope.ll
index 9caa785..b2b3bbe 100644
--- a/test/CodeGen/ARM/2009-10-16-Scope.ll
+++ b/test/CodeGen/ARM/2009-10-16-Scope.ll
@@ -9,7 +9,7 @@ entry:
   br label %do.body, !dbg !0
 
 do.body:                                          ; preds = %entry
-  call void @llvm.dbg.declare(metadata i32* %count_, metadata !4, metadata !MDExpression())
+  call void @llvm.dbg.declare(metadata i32* %count_, metadata !4, metadata !MDExpression()), !dbg !MDLocation(scope: !5)
   %conv = ptrtoint i32* %count_ to i32, !dbg !0   ; <i32> [#uses=1]
   %call = call i32 @foo(i32 %conv) ssp, !dbg !0   ; <i32> [#uses=0]
   br label %do.end, !dbg !0
diff --git a/test/CodeGen/ARM/2009-10-27-double-align.ll b/test/CodeGen/ARM/2009-10-27-double-align.ll
index b43f2a6..39f3292 100644
--- a/test/CodeGen/ARM/2009-10-27-double-align.ll
+++ b/test/CodeGen/ARM/2009-10-27-double-align.ll
@@ -8,7 +8,7 @@ entry:
 ;CHECK: [sp, #8]
 ;CHECK: [sp, #12]
 ;CHECK: [sp]
-        tail call  void (i8*, ...)* @f(i8* getelementptr ([1 x i8], [1 x i8]* @.str, i32 0, i32 0), i32 1, double 2.000000e+00, i32 3, double 4.000000e+00)
+        tail call  void (i8*, ...) @f(i8* getelementptr ([1 x i8], [1 x i8]* @.str, i32 0, i32 0), i32 1, double 2.000000e+00, i32 3, double 4.000000e+00)
         ret void
 }
 
diff --git a/test/CodeGen/ARM/2010-04-15-ScavengerDebugValue.ll b/test/CodeGen/ARM/2010-04-15-ScavengerDebugValue.ll
index 8df3aaf..312cccd 100644
--- a/test/CodeGen/ARM/2010-04-15-ScavengerDebugValue.ll
+++ b/test/CodeGen/ARM/2010-04-15-ScavengerDebugValue.ll
@@ -5,7 +5,7 @@ target triple = "armv4t-apple-darwin10"
 
 define hidden i32 @__addvsi3(i32 %a, i32 %b) nounwind {
 entry:
-  tail call void @llvm.dbg.value(metadata i32 %b, i64 0, metadata !0, metadata !MDExpression())
+  tail call void @llvm.dbg.value(metadata i32 %b, i64 0, metadata !0, metadata !MDExpression()), !dbg !MDLocation(scope: !1)
   %0 = add nsw i32 %b, %a, !dbg !9                ; <i32> [#uses=1]
   ret i32 %0, !dbg !11
 }
@@ -27,6 +27,6 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnon
 !9 = !MDLocation(line: 95, scope: !10)
 !10 = distinct !MDLexicalBlock(line: 94, column: 0, file: !12, scope: !1)
 !11 = !MDLocation(line: 100, scope: !10)
-!13 = !{i32 0}
+!13 = !{}
 !14 = !{!1}
 !15 = !{i32 1, !"Debug Info Version", i32 3}
diff --git a/test/CodeGen/ARM/2010-05-18-LocalAllocCrash.ll b/test/CodeGen/ARM/2010-05-18-LocalAllocCrash.ll
index 89ad5f5..deb5884 100644
--- a/test/CodeGen/ARM/2010-05-18-LocalAllocCrash.ll
+++ b/test/CodeGen/ARM/2010-05-18-LocalAllocCrash.ll
@@ -19,12 +19,12 @@ entry:
   %tmp21 = load i32, i32* undef                        ; <i32> [#uses=1]
   %0 = mul i32 1, %tmp21                          ; <i32> [#uses=1]
   %vla22 = alloca i8, i32 %0, align 1             ; <i8*> [#uses=1]
-  call  void (...)* @zz(i8* getelementptr inbounds ([1 x i8], [1 x i8]* @.str, i32 0, i32 0), i32 2, i32 1)
+  call  void (...) @zz(i8* getelementptr inbounds ([1 x i8], [1 x i8]* @.str, i32 0, i32 0), i32 2, i32 1)
   br i1 undef, label %if.then, label %if.end36
 
 if.then:                                          ; preds = %entry
-  %call = call  i32 (...)* @x(%struct.q* undef, i8* undef, i8* %vla6, i8* %vla10, i32 undef) ; <i32> [#uses=0]
-  %call35 = call  i32 (...)* @x(%struct.q* undef, i8* %vla14, i8* %vla18, i8* %vla22, i32 undef) ; <i32> [#uses=0]
+  %call = call  i32 (...) @x(%struct.q* undef, i8* undef, i8* %vla6, i8* %vla10, i32 undef) ; <i32> [#uses=0]
+  %call35 = call  i32 (...) @x(%struct.q* undef, i8* %vla14, i8* %vla18, i8* %vla22, i32 undef) ; <i32> [#uses=0]
   unreachable
 
 if.end36:                                         ; preds = %entry
diff --git a/test/CodeGen/ARM/2010-06-21-LdStMultipleBug.ll b/test/CodeGen/ARM/2010-06-21-LdStMultipleBug.ll
index 9cd61d3..6f55ac0 100644
--- a/test/CodeGen/ARM/2010-06-21-LdStMultipleBug.ll
+++ b/test/CodeGen/ARM/2010-06-21-LdStMultipleBug.ll
@@ -13,7 +13,7 @@
 define void @TW_oldinput(%struct.FILE* nocapture %fp) nounwind {
 entry:
   %xcenter = alloca i32, align 4                  ; <i32*> [#uses=2]
-  %0 = call i32 (%struct.FILE*, i8*, ...)* @fscanf(%struct.FILE* %fp, i8* getelementptr inbounds ([14 x i8], [14 x i8]* @.str2708, i32 0, i32 0), i32* undef, i32* undef, i32* %xcenter, i32* null) nounwind ; <i32> [#uses=1]
+  %0 = call i32 (%struct.FILE*, i8*, ...) @fscanf(%struct.FILE* %fp, i8* getelementptr inbounds ([14 x i8], [14 x i8]* @.str2708, i32 0, i32 0), i32* undef, i32* undef, i32* %xcenter, i32* null) nounwind ; <i32> [#uses=1]
   %1 = icmp eq i32 %0, 4                          ; <i1> [#uses=1]
   br i1 %1, label %bb, label %return
 
@@ -137,7 +137,7 @@ bb322:                                            ; preds = %bb248
   br i1 undef, label %bb248, label %bb445
 
 bb445:                                            ; preds = %bb322, %bb10, %bb
-  %49 = call i32 (%struct.FILE*, i8*, ...)* @fscanf(%struct.FILE* %fp, i8* getelementptr inbounds ([14 x i8], [14 x i8]* @.str2708, i32 0, i32 0), i32* undef, i32* undef, i32* %xcenter, i32* null) nounwind ; <i32> [#uses=1]
+  %49 = call i32 (%struct.FILE*, i8*, ...) @fscanf(%struct.FILE* %fp, i8* getelementptr inbounds ([14 x i8], [14 x i8]* @.str2708, i32 0, i32 0), i32* undef, i32* undef, i32* %xcenter, i32* null) nounwind ; <i32> [#uses=1]
   %50 = icmp eq i32 %49, 4                        ; <i1> [#uses=1]
   br i1 %50, label %bb, label %return
 
diff --git a/test/CodeGen/ARM/2010-07-26-GlobalMerge.ll b/test/CodeGen/ARM/2010-07-26-GlobalMerge.ll
index 4c5d8d9..b02efea 100644
--- a/test/CodeGen/ARM/2010-07-26-GlobalMerge.ll
+++ b/test/CodeGen/ARM/2010-07-26-GlobalMerge.ll
@@ -31,7 +31,7 @@ define internal void @_ZN1AD1Ev(%struct.A* nocapture %this) nounwind ssp align 2
 entry:
   %tmp.i = getelementptr inbounds %struct.A, %struct.A* %this, i32 0, i32 0 ; <i32*> [#uses=1]
   %tmp2.i = load i32, i32* %tmp.i                      ; <i32> [#uses=1]
-  %call.i = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([9 x i8], [9 x i8]* @.str4, i32 0, i32 0), i32 %tmp2.i) nounwind ; <i32> [#uses=0]
+  %call.i = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([9 x i8], [9 x i8]* @.str4, i32 0, i32 0), i32 %tmp2.i) nounwind ; <i32> [#uses=0]
   %tmp3.i = load i32, i32* @d                          ; <i32> [#uses=1]
   %inc.i = add nsw i32 %tmp3.i, 1                 ; <i32> [#uses=1]
   store i32 %inc.i, i32* @d
@@ -46,7 +46,7 @@ entry:
   %exception.i = tail call i8* @__cxa_allocate_exception(i32 4) nounwind ; <i8*> [#uses=2]
   %tmp2.i.i.i = bitcast i8* %exception.i to i32*  ; <i32*> [#uses=1]
   store i32 1, i32* %tmp2.i.i.i
-  %call.i.i.i = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([8 x i8], [8 x i8]* @.str5, i32 0, i32 0), i32 1) nounwind ; <i32> [#uses=0]
+  %call.i.i.i = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([8 x i8], [8 x i8]* @.str5, i32 0, i32 0), i32 1) nounwind ; <i32> [#uses=0]
   invoke void @__cxa_throw(i8* %exception.i, i8* bitcast (%0* @_ZTI1A to i8*), i8* bitcast (void (%struct.A*)* @_ZN1AD1Ev to i8*)) noreturn
           to label %.noexc unwind label %lpad
 
@@ -55,16 +55,16 @@ entry:
 
 try.cont:                                         ; preds = %lpad
   %0 = tail call i8* @__cxa_get_exception_ptr(i8* %exn) nounwind ; <i8*> [#uses=0]
-  %call.i.i = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str3, i32 0, i32 0), i32 2) nounwind ; <i32> [#uses=0]
+  %call.i.i = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str3, i32 0, i32 0), i32 2) nounwind ; <i32> [#uses=0]
   %1 = tail call i8* @__cxa_begin_catch(i8* %exn) nounwind ; <i8*> [#uses=0]
   %puts = tail call i32 @puts(i8* getelementptr inbounds ([8 x i8], [8 x i8]* @str1, i32 0, i32 0)) ; <i32> [#uses=0]
-  %call.i.i3 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([9 x i8], [9 x i8]* @.str4, i32 0, i32 0), i32 2) nounwind ; <i32> [#uses=0]
+  %call.i.i3 = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([9 x i8], [9 x i8]* @.str4, i32 0, i32 0), i32 2) nounwind ; <i32> [#uses=0]
   %tmp3.i.i = load i32, i32* @d                        ; <i32> [#uses=1]
   %inc.i.i4 = add nsw i32 %tmp3.i.i, 1            ; <i32> [#uses=1]
   store i32 %inc.i.i4, i32* @d
   tail call void @__cxa_end_catch()
   %tmp13 = load i32, i32* @d                           ; <i32> [#uses=1]
-  %call14 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([18 x i8], [18 x i8]* @.str2, i32 0, i32 0), i32 2, i32 %tmp13) ; <i32> [#uses=0]
+  %call14 = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([18 x i8], [18 x i8]* @.str2, i32 0, i32 0), i32 2, i32 %tmp13) ; <i32> [#uses=0]
   %tmp16 = load i32, i32* @d                           ; <i32> [#uses=1]
   %cmp = icmp ne i32 %tmp16, 2                    ; <i1> [#uses=1]
   %conv = zext i1 %cmp to i32                     ; <i32> [#uses=1]
diff --git a/test/CodeGen/ARM/2010-12-15-elf-lcomm.ll b/test/CodeGen/ARM/2010-12-15-elf-lcomm.ll
index d3c0fee..cb91890 100644
--- a/test/CodeGen/ARM/2010-12-15-elf-lcomm.ll
+++ b/test/CodeGen/ARM/2010-12-15-elf-lcomm.ll
@@ -15,11 +15,6 @@
 ; ASM-NEXT:     .type   _MergedGlobals,%object  @ @_MergedGlobals
 
 
-; OBJ:      Sections [
-; OBJ:        Section {
-; OBJ:          Index: 4
-; OBJ-NEXT:     Name: .bss
-
 ; OBJ:      Symbols [
 ; OBJ:        Symbol {
 ; OBJ:          Name: array00
diff --git a/test/CodeGen/ARM/2011-04-15-AndVFlagPeepholeBug.ll b/test/CodeGen/ARM/2011-04-15-AndVFlagPeepholeBug.ll
index e712e08..f17884e 100644
--- a/test/CodeGen/ARM/2011-04-15-AndVFlagPeepholeBug.ll
+++ b/test/CodeGen/ARM/2011-04-15-AndVFlagPeepholeBug.ll
@@ -12,7 +12,7 @@ entry:
   br i1 %cmp, label %if.then, label %if.end
 
 if.then:                                          ; preds = %entry
-  tail call void (...)* @g(i32 %a, i32 %b) nounwind
+  tail call void (...) @g(i32 %a, i32 %b) nounwind
   br label %if.end
 
 if.end:                                           ; preds = %if.then, %entry
diff --git a/test/CodeGen/ARM/2011-04-15-RegisterCmpPeephole.ll b/test/CodeGen/ARM/2011-04-15-RegisterCmpPeephole.ll
index 5404cf5..864e291 100644
--- a/test/CodeGen/ARM/2011-04-15-RegisterCmpPeephole.ll
+++ b/test/CodeGen/ARM/2011-04-15-RegisterCmpPeephole.ll
@@ -12,7 +12,7 @@ entry:
   br i1 %cmp, label %if.then, label %if.end
 
 if.then:                                          ; preds = %entry
-  tail call void (...)* @h(i32 %a, i32 %b) nounwind
+  tail call void (...) @h(i32 %a, i32 %b) nounwind
   br label %if.end
 
 if.end:                                           ; preds = %if.then, %entry
@@ -31,7 +31,7 @@ entry:
   br i1 %cmp, label %if.then, label %if.end
 
 if.then:                                          ; preds = %entry
-  tail call void (...)* @h(i32 %a, i32 %b) nounwind
+  tail call void (...) @h(i32 %a, i32 %b) nounwind
   br label %if.end
 
 if.end:                                           ; preds = %if.then, %entry
diff --git a/test/CodeGen/ARM/2011-10-26-ExpandUnalignedLoadCrash.ll b/test/CodeGen/ARM/2011-10-26-ExpandUnalignedLoadCrash.ll
index 9f2fa63..86596d6 100644
--- a/test/CodeGen/ARM/2011-10-26-ExpandUnalignedLoadCrash.ll
+++ b/test/CodeGen/ARM/2011-10-26-ExpandUnalignedLoadCrash.ll
@@ -4,7 +4,7 @@ target triple = "armv6-none-linux-gnueabi"
 
 define void @sample_test(i8* %.T0348, i16* nocapture %sourceA, i16* nocapture %destValues) {
 L.entry:
-  %0 = call i32 (...)* @get_index(i8* %.T0348, i32 0)
+  %0 = call i32 (...) @get_index(i8* %.T0348, i32 0)
   %1 = bitcast i16* %destValues to i8*
   %2 = mul i32 %0, 6
   %3 = getelementptr i8, i8* %1, i32 %2
diff --git a/test/CodeGen/ARM/2012-10-04-AAPCS-byval-align8.ll b/test/CodeGen/ARM/2012-10-04-AAPCS-byval-align8.ll
index b64b1bf..4a1341c 100644
--- a/test/CodeGen/ARM/2012-10-04-AAPCS-byval-align8.ll
+++ b/test/CodeGen/ARM/2012-10-04-AAPCS-byval-align8.ll
@@ -33,7 +33,7 @@ entry:
 ; CHECK: movw r0, #555
 define i32 @main() {
 entry:
-  call void (i32, ...)* @test_byval_8_bytes_alignment(i32 555, %struct_t* byval @static_val)
+  call void (i32, ...) @test_byval_8_bytes_alignment(i32 555, %struct_t* byval @static_val)
   ret i32 0
 }
 
@@ -48,7 +48,7 @@ define void @test_byval_8_bytes_alignment_fixed_arg(i32 %n1, %struct_t* byval %v
 entry:
   %a = getelementptr inbounds %struct_t, %struct_t* %val, i32 0, i32 0
   %0 = load double, double* %a
-  call void (double)* @f(double %0)
+  call void (double) @f(double %0)
   ret void
 }
 
@@ -60,6 +60,6 @@ entry:
 ; CHECK: movw r0, #555
 define i32 @main_fixed_arg() {
 entry:
-  call void (i32, %struct_t*)* @test_byval_8_bytes_alignment_fixed_arg(i32 555, %struct_t* byval @static_val)
+  call void (i32, %struct_t*) @test_byval_8_bytes_alignment_fixed_arg(i32 555, %struct_t* byval @static_val)
   ret i32 0
 }
diff --git a/test/CodeGen/ARM/2012-10-04-FixedFrame-vs-byval.ll b/test/CodeGen/ARM/2012-10-04-FixedFrame-vs-byval.ll
index ef06f59..34af902 100644
--- a/test/CodeGen/ARM/2012-10-04-FixedFrame-vs-byval.ll
+++ b/test/CodeGen/ARM/2012-10-04-FixedFrame-vs-byval.ll
@@ -14,6 +14,6 @@ define void @test_byval_usage_scheduling(i32 %n1, i32 %n2, %struct_t* byval %val
 entry:
   %a = getelementptr inbounds %struct_t, %struct_t* %val, i32 0, i32 0
   %0 = load double, double* %a
-  %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([12 x i8], [12 x i8]* @.str, i32 0, i32 0), double %0)
+  %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([12 x i8], [12 x i8]* @.str, i32 0, i32 0), double %0)
   ret void
 }
diff --git a/test/CodeGen/ARM/2013-04-21-AAPCS-VA-C.1.cp.ll b/test/CodeGen/ARM/2013-04-21-AAPCS-VA-C.1.cp.ll
index 427519f..d18dbd2 100644
--- a/test/CodeGen/ARM/2013-04-21-AAPCS-VA-C.1.cp.ll
+++ b/test/CodeGen/ARM/2013-04-21-AAPCS-VA-C.1.cp.ll
@@ -14,7 +14,7 @@ define void @printfn(i32 %a, i16 signext %b, double %C, i8 signext %E) {
 entry:
   %conv = sext i16 %b to i32
   %conv1 = sext i8 %E to i32
-  %call = tail call i32 (i8*, ...)* @printf(
+  %call = tail call i32 (i8*, ...) @printf(
 	i8* getelementptr inbounds ([13 x i8], [13 x i8]* @.str, i32 0, i32 0), ; --> R0
         i32 %a,                                          ; --> R1
         i32 %conv,                                       ; --> R2
diff --git a/test/CodeGen/ARM/2013-10-11-select-stalls.ll b/test/CodeGen/ARM/2013-10-11-select-stalls.ll
index 33c0587..d6045c7 100644
--- a/test/CodeGen/ARM/2013-10-11-select-stalls.ll
+++ b/test/CodeGen/ARM/2013-10-11-select-stalls.ll
@@ -7,7 +7,7 @@ define <16 x i8> @multiselect(i32 %avail, i8* %foo, i8* %bar) {
 entry:
   %vld1 = call <16 x i8> @llvm.arm.neon.vld1.v16i8(i8* %foo, i32 1)
   %vld2 = call <16 x i8> @llvm.arm.neon.vld1.v16i8(i8* %bar, i32 1)
-  %and = and i32 %avail, 1
+  %and = and i32 %avail, 3
   %tobool = icmp eq i32 %and, 0
   %retv = select i1 %tobool, <16 x i8> %vld1, <16 x i8> %vld2
   ret <16 x i8> %retv
diff --git a/test/CodeGen/ARM/aliases.ll b/test/CodeGen/ARM/aliases.ll
index c24d0d2..04ca3e8 100644
--- a/test/CodeGen/ARM/aliases.ll
+++ b/test/CodeGen/ARM/aliases.ll
@@ -38,7 +38,7 @@ entry:
    %tmp0 = load i32, i32* @bar_i
    %tmp2 = call i32 @foo_f()
    %tmp3 = add i32 %tmp, %tmp2
-   %tmp4 = call %FunTy* @bar_f()
+   %tmp4 = call i32 @bar_f()
    %tmp5 = add i32 %tmp3, %tmp4
    %tmp6 = add i32 %tmp1, %tmp5
    %tmp7 = add i32 %tmp6, %tmp0
diff --git a/test/CodeGen/ARM/arguments.ll b/test/CodeGen/ARM/arguments.ll
index e7fbf9f..3b1d8dd 100644
--- a/test/CodeGen/ARM/arguments.ll
+++ b/test/CodeGen/ARM/arguments.ll
@@ -18,7 +18,7 @@ define i32 @f2() nounwind optsize {
 ; DARWIN-LABEL: f2:
 ; DARWIN: mov	r3, #128
 entry:
-  %0 = tail call i32 (i32, ...)* @g2(i32 5, double 1.600000e+01, i32 128) nounwind optsize ; <i32> [#uses=1]
+  %0 = tail call i32 (i32, ...) @g2(i32 5, double 1.600000e+01, i32 128) nounwind optsize ; <i32> [#uses=1]
   %not. = icmp ne i32 %0, 128                     ; <i1> [#uses=1]
   %.0 = zext i1 %not. to i32                      ; <i32> [#uses=1]
   ret i32 %.0
diff --git a/test/CodeGen/ARM/arm-asm.ll b/test/CodeGen/ARM/arm-asm.ll
index e869abe..f9199ff 100644
--- a/test/CodeGen/ARM/arm-asm.ll
+++ b/test/CodeGen/ARM/arm-asm.ll
@@ -2,6 +2,6 @@
 
 define void @frame_dummy() {
 entry:
-        %tmp1 = tail call void (i8*)* (void (i8*)*)* asm "", "=r,0,~{dirflag},~{fpsr},~{flags}"( void (i8*)* null )           ; <void (i8*)*> [#uses=0]
+        %tmp1 = tail call void (i8*)* (void (i8*)*) asm "", "=r,0,~{dirflag},~{fpsr},~{flags}"( void (i8*)* null )           ; <void (i8*)*> [#uses=0]
         ret void
 }
diff --git a/test/CodeGen/ARM/build-attributes.ll b/test/CodeGen/ARM/build-attributes.ll
index 8540833..1982fa9 100644
--- a/test/CodeGen/ARM/build-attributes.ll
+++ b/test/CodeGen/ARM/build-attributes.ll
@@ -81,6 +81,8 @@
 ; RUN: llc < %s -mtriple=thumbv7em-linux-gnueabi -mcpu=cortex-m7 -mattr=+fp-only-sp  -enable-unsafe-fp-math -disable-fp-elim -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=CORTEX-M7-FAST
 ; RUN: llc < %s -mtriple=thumbv7em-linux-gnueabi -mcpu=cortex-m7 | FileCheck %s --check-prefix=CORTEX-M7-DOUBLE
 ; RUN: llc < %s -mtriple=thumbv7em-linux-gnueabi -mcpu=cortex-m7 -enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING
+; RUN: llc < %s -mtriple=armv7r-linux-gnueabi -mcpu=cortex-r4 | FileCheck %s --check-prefix=CORTEX-R4
+; RUN: llc < %s -mtriple=armv7r-linux-gnueabi -mcpu=cortex-r4f | FileCheck %s --check-prefix=CORTEX-R4F
 ; RUN: llc < %s -mtriple=armv7r-linux-gnueabi -mcpu=cortex-r5 | FileCheck %s --check-prefix=CORTEX-R5
 ; RUN: llc < %s -mtriple=armv7r-linux-gnueabi -mcpu=cortex-r5  -enable-unsafe-fp-math -disable-fp-elim -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=CORTEX-R5-FAST
 ; RUN: llc < %s -mtriple=armv7r-linux-gnueabi -mcpu=cortex-r5 -enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING
@@ -1012,6 +1014,49 @@
 ; CORTEX-M7-NOFPU-FAST-NOT:  .eabi_attribute 22
 ; CORTEX-M7-NOFPU-FAST:  .eabi_attribute 23, 1
 
+; CORTEX-R4:  .cpu cortex-r4
+; CORTEX-R4:  .eabi_attribute 6, 10
+; CORTEX-R4:  .eabi_attribute 7, 82
+; CORTEX-R4:  .eabi_attribute 8, 1
+; CORTEX-R4:  .eabi_attribute 9, 2
+; CORTEX-R4-NOT:  .fpu vfpv3-d16
+; CORTEX-R4-NOT:   .eabi_attribute 19
+;; We default to IEEE 754 compliance
+; CORTEX-R4:  .eabi_attribute 20, 1
+; CORTEX-R4:  .eabi_attribute 21, 1
+; CORTEX-R4-NOT:  .eabi_attribute 22
+; CORTEX-R4:  .eabi_attribute 23, 3
+; CORTEX-R4:  .eabi_attribute 24, 1
+; CORTEX-R4:  .eabi_attribute 25, 1
+; CORTEX-R4-NOT:  .eabi_attribute 28
+; CORTEX-R4-NOT:  .eabi_attribute 36
+; CORTEX-R4:  .eabi_attribute 38, 1
+; CORTEX-R4-NOT:  .eabi_attribute 42
+; CORTEX-R4-NOT:  .eabi_attribute 44
+; CORTEX-R4-NOT:  .eabi_attribute 68
+
+; CORTEX-R4F:  .cpu cortex-r4f
+; CORTEX-R4F:  .eabi_attribute 6, 10
+; CORTEX-R4F:  .eabi_attribute 7, 82
+; CORTEX-R4F:  .eabi_attribute 8, 1
+; CORTEX-R4F:  .eabi_attribute 9, 2
+; CORTEX-R4F:  .fpu vfpv3-d16
+; CORTEX-R4F-NOT:   .eabi_attribute 19
+;; We default to IEEE 754 compliance
+; CORTEX-R4F:  .eabi_attribute 20, 1
+; CORTEX-R4F:  .eabi_attribute 21, 1
+; CORTEX-R4F-NOT:  .eabi_attribute 22
+; CORTEX-R4F:  .eabi_attribute 23, 3
+; CORTEX-R4F:  .eabi_attribute 24, 1
+; CORTEX-R4F:  .eabi_attribute 25, 1
+; CORTEX-R4F:  .eabi_attribute 27, 1
+; CORTEX-R4F-NOT:  .eabi_attribute 28
+; CORTEX-R4F-NOT:  .eabi_attribute 36
+; CORTEX-R4F:  .eabi_attribute 38, 1
+; CORTEX-R4F-NOT:  .eabi_attribute 42
+; CORTEX-R4F-NOT:  .eabi_attribute 44
+; CORTEX-R4F-NOT:  .eabi_attribute 68
+
 ; CORTEX-R5:  .cpu cortex-r5
 ; CORTEX-R5:  .eabi_attribute 6, 10
 ; CORTEX-R5:  .eabi_attribute 7, 82
diff --git a/test/CodeGen/ARM/bx_fold.ll b/test/CodeGen/ARM/bx_fold.ll
index c1aac44..f6651ae 100644
--- a/test/CodeGen/ARM/bx_fold.ll
+++ b/test/CodeGen/ARM/bx_fold.ll
@@ -14,7 +14,7 @@ bb:		; preds = %bb1
 bb1:		; preds = %bb, %entry
 	%indvar = phi i32 [ 0, %entry ], [ %indvar.next, %bb ]		; <i32> [#uses=3]
 	%i.0 = bitcast i32 %indvar to i32		; <i32> [#uses=2]
-	%tmp = tail call i32 (...)* @bar( )		; <i32> [#uses=1]
+	%tmp = tail call i32 (...) @bar( )		; <i32> [#uses=1]
 	%tmp2 = add i32 %i.0, %tmp		; <i32> [#uses=1]
 	%Ptr_addr.0 = sub i32 %Ptr, %tmp2		; <i32> [#uses=0]
 	%tmp12 = icmp eq i32 %i.0, %Ptr		; <i1> [#uses=1]
diff --git a/test/CodeGen/ARM/cache-intrinsic.ll b/test/CodeGen/ARM/cache-intrinsic.ll
index a041d075..12b55c7 100644
--- a/test/CodeGen/ARM/cache-intrinsic.ll
+++ b/test/CodeGen/ARM/cache-intrinsic.ll
@@ -10,10 +10,10 @@ define i32 @main() {
 entry:
   %retval = alloca i32, align 4
   store i32 0, i32* %retval
-  %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i8* getelementptr inbounds ([32 x i8], [32 x i8]* @buffer, i32 0, i32 0))
+  %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i8* getelementptr inbounds ([32 x i8], [32 x i8]* @buffer, i32 0, i32 0))
   %call1 = call i8* @strcpy(i8* getelementptr inbounds ([32 x i8], [32 x i8]* @buffer, i32 0, i32 0), i8* getelementptr inbounds ([25 x i8], [25 x i8]* @.str1, i32 0, i32 0)) #3
   call void @llvm.clear_cache(i8* getelementptr inbounds ([32 x i8], [32 x i8]* @buffer, i32 0, i32 0), i8* getelementptr inbounds (i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @buffer, i32 0, i32 0), i32 32)) #3
-  %call2 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i8* getelementptr inbounds ([32 x i8], [32 x i8]* @buffer, i32 0, i32 0))
+  %call2 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i8* getelementptr inbounds ([32 x i8], [32 x i8]* @buffer, i32 0, i32 0))
   ret i32 0
 }
 
diff --git a/test/CodeGen/ARM/compare-call.ll b/test/CodeGen/ARM/compare-call.ll
index d4bd92b..f45ed73 100644
--- a/test/CodeGen/ARM/compare-call.ll
+++ b/test/CodeGen/ARM/compare-call.ll
@@ -9,7 +9,7 @@ entry:
         br i1 %tmp.upgrd.1, label %cond_true, label %UnifiedReturnBlock
 
 cond_true:              ; preds = %entry
-        %tmp.upgrd.2 = tail call i32 (...)* @bar( )             ; <i32> [#uses=0]
+        %tmp.upgrd.2 = tail call i32 (...) @bar( )             ; <i32> [#uses=0]
         ret void
 
 UnifiedReturnBlock:             ; preds = %entry
diff --git a/test/CodeGen/ARM/debug-info-arg.ll b/test/CodeGen/ARM/debug-info-arg.ll
index cb9520e..c75c630 100644
--- a/test/CodeGen/ARM/debug-info-arg.ll
+++ b/test/CodeGen/ARM/debug-info-arg.ll
@@ -32,7 +32,7 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnon
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!33}
 
-!0 = !MDCompileUnit(language: DW_LANG_C99, producer: "Apple clang version 3.0 (tags/Apple/clang-211.10.1) (based on LLVM 3.0svn)", isOptimized: true, emissionKind: 1, file: !32, enums: !4, retainedTypes: !4, subprograms: !30, imports:  null)
+!0 = !MDCompileUnit(language: DW_LANG_C99, producer: "Apple clang version 3.0 (tags/Apple/clang-211.10.1) (based on LLVM 3.0svn)", isOptimized: true, emissionKind: 1, file: !32, enums: !{}, retainedTypes: !{}, subprograms: !30, imports:  null)
 !1 = !MDSubprogram(name: "foo", line: 11, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 11, file: !2, scope: !2, type: !3, function: void (%struct.tag_s*, %struct.tag_s*, i64, i64, %struct.tag_s*, %struct.tag_s*)* @foo, variables: !31)
 !2 = !MDFile(filename: "one.c", directory: "/Volumes/Athwagate/R10048772")
 !3 = !MDSubroutineType(types: !4)
diff --git a/test/CodeGen/ARM/debug-info-blocks.ll b/test/CodeGen/ARM/debug-info-blocks.ll
index 96876b1..4e499c6 100644
--- a/test/CodeGen/ARM/debug-info-blocks.ll
+++ b/test/CodeGen/ARM/debug-info-blocks.ll
@@ -95,7 +95,7 @@ define hidden void @foobar_func_block_invoke_0(i8* %.block_descriptor, %0* %load
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!162}
 
-!0 = !MDCompileUnit(language: DW_LANG_ObjC, producer: "Apple clang version 2.1", isOptimized: false, runtimeVersion: 2, emissionKind: 1, file: !153, enums: !147, retainedTypes: !26, subprograms: !148)
+!0 = !MDCompileUnit(language: DW_LANG_ObjC, producer: "Apple clang version 2.1", isOptimized: false, runtimeVersion: 2, emissionKind: 1, file: !153, enums: !147, retainedTypes: !{}, subprograms: !148)
 !1 = !MDCompositeType(tag: DW_TAG_enumeration_type, line: 248, size: 32, align: 32, file: !160, scope: !0, elements: !3)
 !2 = !MDFile(filename: "header.h", directory: "/Volumes/Sandbox/llvm")
 !3 = !{!4}
@@ -158,7 +158,7 @@ define hidden void @foobar_func_block_invoke_0(i8* %.block_descriptor, %0* %load
 !60 = !MDCompositeType(tag: DW_TAG_structure_type, name: "UIMydata", line: 26, size: 128, align: 32, runtimeLang: DW_LANG_ObjC, file: !154, scope: !24, elements: !62)
 !61 = !MDFile(filename: "header11.h", directory: "/Volumes/Sandbox/llvm")
 !62 = !{!63, !71, !75, !79}
-!63 = !MDDerivedType(tag: DW_TAG_inheritance, file: !60, baseType: !64)
+!63 = !MDDerivedType(tag: DW_TAG_inheritance, file: !61, baseType: !64)
 !64 = !MDCompositeType(tag: DW_TAG_structure_type, name: "NSO", line: 66, size: 32, align: 32, runtimeLang: DW_LANG_ObjC, file: !155, scope: !40, elements: !66)
 !65 = !MDFile(filename: "NSO.h", directory: "/Volumes/Sandbox/llvm")
 !66 = !{!67}
@@ -192,7 +192,7 @@ define hidden void @foobar_func_block_invoke_0(i8* %.block_descriptor, %0* %load
 !94 = !MDCompositeType(tag: DW_TAG_structure_type, name: "twork", line: 43, size: 32, align: 32, runtimeLang: DW_LANG_ObjC, file: !157, scope: !40, elements: !96)
 !95 = !MDFile(filename: "header13.h", directory: "/Volumes/Sandbox/llvm")
 !96 = !{!97}
-!97 = !MDDerivedType(tag: DW_TAG_inheritance, file: !94, baseType: !64)
+!97 = !MDDerivedType(tag: DW_TAG_inheritance, file: !95, baseType: !64)
 !98 = !MDDerivedType(tag: DW_TAG_member, name: "_itemID", line: 38, size: 64, align: 32, offset: 32, flags: DIFlagPrivate, file: !152, scope: !24, baseType: !99, extraData: !"")
 !99 = !MDDerivedType(tag: DW_TAG_typedef, name: "uint64_t", line: 55, file: !153, scope: !0, baseType: !100)
 !100 = !MDBasicType(tag: DW_TAG_base_type, name: "long long unsigned int", size: 64, align: 32, encoding: DW_ATE_unsigned)
@@ -201,7 +201,7 @@ define hidden void @foobar_func_block_invoke_0(i8* %.block_descriptor, %0* %load
 !103 = !MDCompositeType(tag: DW_TAG_structure_type, name: "MyLibrary2", line: 22, size: 32, align: 32, runtimeLang: DW_LANG_ObjC, file: !158, scope: !40, elements: !105)
 !104 = !MDFile(filename: "header14.h", directory: "/Volumes/Sandbox/llvm")
 !105 = !{!106}
-!106 = !MDDerivedType(tag: DW_TAG_inheritance, file: !103, baseType: !64)
+!106 = !MDDerivedType(tag: DW_TAG_inheritance, file: !104, baseType: !64)
 !107 = !MDDerivedType(tag: DW_TAG_member, name: "_bounds", line: 40, size: 128, align: 32, offset: 128, flags: DIFlagPrivate, file: !152, scope: !24, baseType: !108, extraData: !"")
 !108 = !MDDerivedType(tag: DW_TAG_typedef, name: "CR", line: 33, file: !153, scope: !0, baseType: !109)
 !109 = !MDCompositeType(tag: DW_TAG_structure_type, name: "CR", line: 29, size: 128, align: 32, file: !156, scope: !0, elements: !110)
diff --git a/test/CodeGen/ARM/debug-info-branch-folding.ll b/test/CodeGen/ARM/debug-info-branch-folding.ll
index bd2ae80..cb57efa 100644
--- a/test/CodeGen/ARM/debug-info-branch-folding.ll
+++ b/test/CodeGen/ARM/debug-info-branch-folding.ll
@@ -28,10 +28,10 @@ for.body9:                                        ; preds = %for.body9, %entry
 for.end54:                                        ; preds = %for.body9
   %tmp115 = extractelement <4 x float> %add19, i32 1
   %conv6.i75 = fpext float %tmp115 to double, !dbg !45
-  %call.i82 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([13 x i8], [13 x i8]* @.str, i32 0, i32 0), double undef, double %conv6.i75, double undef, double undef) nounwind, !dbg !45
+  %call.i82 = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([13 x i8], [13 x i8]* @.str, i32 0, i32 0), double undef, double %conv6.i75, double undef, double undef) nounwind, !dbg !45
   %tmp116 = extractelement <4 x float> %add20, i32 1
   %conv6.i76 = fpext float %tmp116 to double, !dbg !45
-  %call.i83 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([13 x i8], [13 x i8]* @.str, i32 0, i32 0), double undef, double %conv6.i76, double undef, double undef) nounwind, !dbg !45
+  %call.i83 = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([13 x i8], [13 x i8]* @.str, i32 0, i32 0), double undef, double %conv6.i76, double undef, double undef) nounwind, !dbg !45
   ret i32 0, !dbg !49
 }
 
@@ -44,7 +44,7 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnon
 
 !0 = !MDSubprogram(name: "test0001", line: 3, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, file: !54, scope: null, type: !3, function: <4 x float> (float)* @test0001, variables: !51)
 !1 = !MDFile(filename: "build2.c", directory: "/private/tmp")
-!2 = !MDCompileUnit(language: DW_LANG_C99, producer: "clang version 3.0 (trunk 129915)", isOptimized: true, emissionKind: 1, file: !54, enums: !17, retainedTypes: !17, subprograms: !50, imports:  null)
+!2 = !MDCompileUnit(language: DW_LANG_C99, producer: "clang version 3.0 (trunk 129915)", isOptimized: true, emissionKind: 1, file: !54, enums: !{}, retainedTypes: !{}, subprograms: !50, imports:  null)
 !3 = !MDSubroutineType(types: !4)
 !4 = !{!5}
 !5 = !MDDerivedType(tag: DW_TAG_typedef, name: "v4f32", line: 14, file: !54, scope: !2, baseType: !6)
diff --git a/test/CodeGen/ARM/debug-info-d16-reg.ll b/test/CodeGen/ARM/debug-info-d16-reg.ll
index 0667e0f..034d0f4 100644
--- a/test/CodeGen/ARM/debug-info-d16-reg.ll
+++ b/test/CodeGen/ARM/debug-info-d16-reg.ll
@@ -16,7 +16,7 @@ entry:
   tail call void @llvm.dbg.value(metadata double %val, i64 0, metadata !20, metadata !MDExpression()), !dbg !26
   tail call void @llvm.dbg.value(metadata i8 %c, i64 0, metadata !21, metadata !MDExpression()), !dbg !26
   %0 = zext i8 %c to i32, !dbg !27
-  %1 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([11 x i8], [11 x i8]* @.str, i32 0, i32 0), i8* %ptr, double %val, i32 %0) nounwind, !dbg !27
+  %1 = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([11 x i8], [11 x i8]* @.str, i32 0, i32 0), i8* %ptr, double %val, i32 %0) nounwind, !dbg !27
   ret i32 0, !dbg !29
 }
 
@@ -26,7 +26,7 @@ entry:
   tail call void @llvm.dbg.value(metadata double %val, i64 0, metadata !17, metadata !MDExpression()), !dbg !30
   tail call void @llvm.dbg.value(metadata i8 %c, i64 0, metadata !18, metadata !MDExpression()), !dbg !30
   %0 = zext i8 %c to i32, !dbg !31
-  %1 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([11 x i8], [11 x i8]* @.str, i32 0, i32 0), i8* %ptr, double %val, i32 %0) nounwind, !dbg !31
+  %1 = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([11 x i8], [11 x i8]* @.str, i32 0, i32 0), i8* %ptr, double %val, i32 %0) nounwind, !dbg !31
   ret i32 0, !dbg !33
 }
 
@@ -45,11 +45,11 @@ entry:
   %3 = getelementptr inbounds i8, i8* bitcast (i32 (i32, i8**)* @main to i8*), i32 %argc, !dbg !37
   %4 = trunc i32 %argc to i8, !dbg !37
   %5 = add i8 %4, 97, !dbg !37
-  tail call void @llvm.dbg.value(metadata i8* %3, i64 0, metadata !19, metadata !MDExpression()) nounwind, !dbg !38
-  tail call void @llvm.dbg.value(metadata double %1, i64 0, metadata !20, metadata !MDExpression()) nounwind, !dbg !38
-  tail call void @llvm.dbg.value(metadata i8 %5, i64 0, metadata !21, metadata !MDExpression()) nounwind, !dbg !38
+  tail call void @llvm.dbg.value(metadata i8* %3, i64 0, metadata !49, metadata !MDExpression()) nounwind, !dbg !38
+  tail call void @llvm.dbg.value(metadata double %1, i64 0, metadata !50, metadata !MDExpression()) nounwind, !dbg !38
+  tail call void @llvm.dbg.value(metadata i8 %5, i64 0, metadata !51, metadata !MDExpression()) nounwind, !dbg !38
   %6 = zext i8 %5 to i32, !dbg !39
-  %7 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([11 x i8], [11 x i8]* @.str, i32 0, i32 0), i8* %3, double %1, i32 %6) nounwind, !dbg !39
+  %7 = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([11 x i8], [11 x i8]* @.str, i32 0, i32 0), i8* %3, double %1, i32 %6) nounwind, !dbg !39
   %8 = tail call i32 @printer(i8* %3, double %1, i8 zeroext %5) nounwind, !dbg !40
   ret i32 0, !dbg !41
 }
@@ -75,12 +75,17 @@ declare i32 @puts(i8* nocapture) nounwind
 !13 = !MDDerivedType(tag: DW_TAG_pointer_type, size: 32, align: 32, file: !46, scope: !1, baseType: !14)
 !14 = !MDDerivedType(tag: DW_TAG_pointer_type, size: 32, align: 32, file: !46, scope: !1, baseType: !15)
 !15 = !MDBasicType(tag: DW_TAG_base_type, name: "char", size: 8, align: 8, encoding: DW_ATE_signed_char)
-!16 = !MDLocalVariable(tag: DW_TAG_arg_variable, name: "ptr", line: 11, arg: 0, scope: !0, file: !1, type: !6)
-!17 = !MDLocalVariable(tag: DW_TAG_arg_variable, name: "val", line: 11, arg: 0, scope: !0, file: !1, type: !7)
-!18 = !MDLocalVariable(tag: DW_TAG_arg_variable, name: "c", line: 11, arg: 0, scope: !0, file: !1, type: !8)
-!19 = !MDLocalVariable(tag: DW_TAG_arg_variable, name: "ptr", line: 4, arg: 0, scope: !9, file: !1, type: !6)
-!20 = !MDLocalVariable(tag: DW_TAG_arg_variable, name: "val", line: 4, arg: 0, scope: !9, file: !1, type: !7)
-!21 = !MDLocalVariable(tag: DW_TAG_arg_variable, name: "c", line: 4, arg: 0, scope: !9, file: !1, type: !8)
+!16 = !MDLocalVariable(tag: DW_TAG_arg_variable, name: "ptr", line: 11, arg: 1, scope: !0, file: !1, type: !6)
+!17 = !MDLocalVariable(tag: DW_TAG_arg_variable, name: "val", line: 11, arg: 2, scope: !0, file: !1, type: !7)
+!18 = !MDLocalVariable(tag: DW_TAG_arg_variable, name: "c", line: 11, arg: 3, scope: !0, file: !1, type: !8)
+!19 = !MDLocalVariable(tag: DW_TAG_arg_variable, name: "ptr", line: 4, arg: 1, scope: !9, file: !1, type: !6)
+!20 = !MDLocalVariable(tag: DW_TAG_arg_variable, name: "val", line: 4, arg: 2, scope: !9, file: !1, type: !7)
+!21 = !MDLocalVariable(tag: DW_TAG_arg_variable, name: "c", line: 4, arg: 3, scope: !9, file: !1, type: !8)
+
+!49 = !MDLocalVariable(tag: DW_TAG_arg_variable, name: "ptr", line: 4, arg: 1, scope: !9, file: !1, type: !6)
+!50 = !MDLocalVariable(tag: DW_TAG_arg_variable, name: "val", line: 4, arg: 2, scope: !9, file: !1, type: !7)
+!51 = !MDLocalVariable(tag: DW_TAG_arg_variable, name: "c", line: 4, arg: 2, scope: !9, file: !1, type: !8)
+
 !22 = !MDLocalVariable(tag: DW_TAG_arg_variable, name: "argc", line: 17, arg: 0, scope: !10, file: !1, type: !5)
 !23 = !MDLocalVariable(tag: DW_TAG_arg_variable, name: "argv", line: 17, arg: 0, scope: !10, file: !1, type: !13)
 !24 = !MDLocalVariable(tag: DW_TAG_auto_variable, name: "dval", line: 19, scope: !25, file: !1, type: !7)
@@ -106,5 +111,5 @@ declare i32 @puts(i8* nocapture) nounwind
 !44 = !{!19, !20, !21}
 !45 = !{!22, !23, !24}
 !46 = !MDFile(filename: "a.c", directory: "/tmp/")
-!47 = !{i32 0}
+!47 = !{}
 !48 = !{i32 1, !"Debug Info Version", i32 3}
diff --git a/test/CodeGen/ARM/debug-info-no-frame.ll b/test/CodeGen/ARM/debug-info-no-frame.ll
new file mode 100644
index 0000000..418a074
--- /dev/null
+++ b/test/CodeGen/ARM/debug-info-no-frame.ll
@@ -0,0 +1,36 @@
+; RUN: llc -mtriple=armv7-none-linux-gnueabihf < %s -o - | FileCheck %s
+
+; Function Attrs: nounwind
+define void @need_cfi_def_cfa_offset() #0 {
+; CHECK-LABEL: need_cfi_def_cfa_offset:
+; CHECK: sub	sp, sp, #4
+; CHECK: .cfi_def_cfa_offset 4
+entry:
+  %Depth = alloca i32, align 4
+  call void @llvm.dbg.declare(metadata i32* %Depth, metadata !9, metadata !10), !dbg !11
+  store i32 2, i32* %Depth, align 4, !dbg !11
+  ret void, !dbg !12
+}
+
+; Function Attrs: nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!7, !8}
+
+!0 = !MDCompileUnit(language: DW_LANG_C99, file: !1, producer: "", isOptimized: false)
+!1 = !MDFile(filename: "file.c", directory: "/dir")
+!2 = !{}
+!3 = !MDSubprogram(name: "need_cfi_def_cfa_offset", scope: !1, file: !1, line: 1, type: !4, isLocal: false, isDefinition: true, scopeLine: 2, isOptimized: false, function: void ()* @need_cfi_def_cfa_offset, variables: !2)
+!4 = !MDSubroutineType(types: !5)
+!5 = !{null}
+!6 = !MDBasicType(name: "int", size: 32, align: 32, encoding: DW_ATE_signed)
+!7 = !{i32 2, !"Dwarf Version", i32 4}
+!8 = !{i32 2, !"Debug Info Version", i32 3}
+!9 = !MDLocalVariable(tag: DW_TAG_auto_variable, name: "Depth", scope: !3, file: !1, line: 3, type: !6)
+!10 = !MDExpression()
+!11 = !MDLocation(line: 3, column: 9, scope: !3)
+!12 = !MDLocation(line: 7, column: 5, scope: !3)
diff --git a/test/CodeGen/ARM/debug-info-qreg.ll b/test/CodeGen/ARM/debug-info-qreg.ll
index 398f652..9cfd67d 100644
--- a/test/CodeGen/ARM/debug-info-qreg.ll
+++ b/test/CodeGen/ARM/debug-info-qreg.ll
@@ -27,7 +27,7 @@ for.end54:                                        ; preds = %for.body9
   tail call void @llvm.dbg.value(metadata <4 x float> %add19, i64 0, metadata !27, metadata !MDExpression()), !dbg !39
   %tmp115 = extractelement <4 x float> %add19, i32 1
   %conv6.i75 = fpext float %tmp115 to double, !dbg !45
-  %call.i82 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([13 x i8], [13 x i8]* @.str, i32 0, i32 0), double undef, double %conv6.i75, double undef, double undef) nounwind, !dbg !45
+  %call.i82 = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([13 x i8], [13 x i8]* @.str, i32 0, i32 0), double undef, double %conv6.i75, double undef, double undef) nounwind, !dbg !45
   ret i32 0, !dbg !49
 }
 
@@ -40,11 +40,11 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnon
 
 !0 = !MDSubprogram(name: "test0001", line: 3, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 3, file: !54, scope: !1, type: !3, function: <4 x float> (float)* @test0001, variables: !51)
 !1 = !MDFile(filename: "build2.c", directory: "/private/tmp")
-!2 = !MDCompileUnit(language: DW_LANG_C99, producer: "clang version 3.0 (trunk 129915)", isOptimized: true, emissionKind: 1, file: !54, enums: !17, retainedTypes: !17, subprograms: !50, imports:  null)
+!2 = !MDCompileUnit(language: DW_LANG_C99, producer: "clang version 3.0 (trunk 129915)", isOptimized: true, emissionKind: 1, file: !54, enums: !{}, retainedTypes: !{}, subprograms: !50, imports:  null)
 !3 = !MDSubroutineType(types: !4)
 !4 = !{!5}
 !5 = !MDDerivedType(tag: DW_TAG_typedef, name: "v4f32", line: 14, file: !54, scope: !2, baseType: !6)
-!6 = !MDCompositeType(tag: DW_TAG_array_type, size: 128, align: 128, file: !2, baseType: !7, elements: !8)
+!6 = !MDCompositeType(tag: DW_TAG_array_type, size: 128, align: 128, file: !1, baseType: !7, elements: !8)
 !7 = !MDBasicType(tag: DW_TAG_base_type, name: "float", size: 32, align: 32, encoding: DW_ATE_float)
 !8 = !{!9}
 !9 = !MDSubrange(count: 4)
diff --git a/test/CodeGen/ARM/debug-info-s16-reg.ll b/test/CodeGen/ARM/debug-info-s16-reg.ll
index d08ec03..3cd2837 100644
--- a/test/CodeGen/ARM/debug-info-s16-reg.ll
+++ b/test/CodeGen/ARM/debug-info-s16-reg.ll
@@ -19,7 +19,7 @@ entry:
   tail call void @llvm.dbg.value(metadata i8 %c, i64 0, metadata !12, metadata !MDExpression()), !dbg !26
   %conv = fpext float %val to double, !dbg !27
   %conv3 = zext i8 %c to i32, !dbg !27
-  %call = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([11 x i8], [11 x i8]* @.str, i32 0, i32 0), i8* %ptr, double %conv, i32 %conv3) nounwind optsize, !dbg !27
+  %call = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([11 x i8], [11 x i8]* @.str, i32 0, i32 0), i8* %ptr, double %conv, i32 %conv3) nounwind optsize, !dbg !27
   ret i32 0, !dbg !29
 }
 
@@ -32,7 +32,7 @@ entry:
   tail call void @llvm.dbg.value(metadata i8 %c, i64 0, metadata !16, metadata !MDExpression()), !dbg !32
   %conv = fpext float %val to double, !dbg !33
   %conv3 = zext i8 %c to i32, !dbg !33
-  %call = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([11 x i8], [11 x i8]* @.str, i32 0, i32 0), i8* %ptr, double %conv, i32 %conv3) nounwind optsize, !dbg !33
+  %call = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([11 x i8], [11 x i8]* @.str, i32 0, i32 0), i8* %ptr, double %conv, i32 %conv3) nounwind optsize, !dbg !33
   ret i32 0, !dbg !35
 }
 
@@ -48,12 +48,12 @@ entry:
   %add.ptr = getelementptr i8, i8* bitcast (i32 (i32, i8**)* @main to i8*), i32 %argc, !dbg !40
   %add5 = add nsw i32 %argc, 97, !dbg !40
   %conv6 = trunc i32 %add5 to i8, !dbg !40
-  tail call void @llvm.dbg.value(metadata i8* %add.ptr, i64 0, metadata !8, metadata !MDExpression()) nounwind, !dbg !41
-  tail call void @llvm.dbg.value(metadata float %conv1, i64 0, metadata !10, metadata !MDExpression()) nounwind, !dbg !42
-  tail call void @llvm.dbg.value(metadata i8 %conv6, i64 0, metadata !12, metadata !MDExpression()) nounwind, !dbg !43
+  tail call void @llvm.dbg.value(metadata i8* %add.ptr, i64 0, metadata !58, metadata !MDExpression()) nounwind, !dbg !41
+  tail call void @llvm.dbg.value(metadata float %conv1, i64 0, metadata !60, metadata !MDExpression()) nounwind, !dbg !42
+  tail call void @llvm.dbg.value(metadata i8 %conv6, i64 0, metadata !62, metadata !MDExpression()) nounwind, !dbg !43
   %conv.i = fpext float %conv1 to double, !dbg !44
   %conv3.i = and i32 %add5, 255, !dbg !44
-  %call.i = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([11 x i8], [11 x i8]* @.str, i32 0, i32 0), i8* %add.ptr, double %conv.i, i32 %conv3.i) nounwind optsize, !dbg !44
+  %call.i = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([11 x i8], [11 x i8]* @.str, i32 0, i32 0), i8* %add.ptr, double %conv.i, i32 %conv3.i) nounwind optsize, !dbg !44
   %call14 = tail call i32 @printer(i8* %add.ptr, float %conv1, i8 zeroext %conv6) optsize, !dbg !45
   ret i32 0, !dbg !46
 }
@@ -79,6 +79,11 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnon
 !11 = !MDBasicType(tag: DW_TAG_base_type, name: "float", size: 32, align: 32, encoding: DW_ATE_float)
 !12 = !MDLocalVariable(tag: DW_TAG_arg_variable, name: "c", line: 4, arg: 3, scope: !0, file: !1, type: !13)
 !13 = !MDBasicType(tag: DW_TAG_base_type, name: "unsigned char", size: 8, align: 8, encoding: DW_ATE_unsigned_char)
+
+!58 = !MDLocalVariable(tag: DW_TAG_arg_variable, name: "ptr", line: 4, arg: 1, scope: !0, file: !1, type: !9)
+!60 = !MDLocalVariable(tag: DW_TAG_arg_variable, name: "val", line: 4, arg: 2, scope: !0, file: !1, type: !11)
+!62 = !MDLocalVariable(tag: DW_TAG_arg_variable, name: "c", line: 4, arg: 3, scope: !0, file: !1, type: !13)
+
 !14 = !MDLocalVariable(tag: DW_TAG_arg_variable, name: "ptr", line: 11, arg: 1, scope: !6, file: !1, type: !9)
 !15 = !MDLocalVariable(tag: DW_TAG_arg_variable, name: "val", line: 11, arg: 2, scope: !6, file: !1, type: !11)
 !16 = !MDLocalVariable(tag: DW_TAG_arg_variable, name: "c", line: 11, arg: 3, scope: !6, file: !1, type: !13)
@@ -117,5 +122,5 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnon
 !49 = !{!14, !15, !16}
 !50 = !{!17, !18, !22}
 !51 = !MDFile(filename: "a.c", directory: "/private/tmp")
-!52 = !{i32 0}
+!52 = !{}
 !53 = !{i32 1, !"Debug Info Version", i32 3}
diff --git a/test/CodeGen/ARM/debug-info-sreg2.ll b/test/CodeGen/ARM/debug-info-sreg2.ll
index df578fd..e5f7a27 100644
--- a/test/CodeGen/ARM/debug-info-sreg2.ll
+++ b/test/CodeGen/ARM/debug-info-sreg2.ll
@@ -62,5 +62,5 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnon
 !16 = !{!1}
 !17 = !{!5, !8}
 !18 = !MDFile(filename: "k.cc", directory: "/private/tmp")
-!19 = !{i32 0}
+!19 = !{}
 !20 = !{i32 1, !"Debug Info Version", i32 3}
diff --git a/test/CodeGen/ARM/div.ll b/test/CodeGen/ARM/div.ll
index a339c81..7b298fe 100644
--- a/test/CodeGen/ARM/div.ll
+++ b/test/CodeGen/ARM/div.ll
@@ -1,11 +1,13 @@
-; RUN: llc < %s -mtriple=arm-apple-ios -mcpu=cortex-a8 | FileCheck %s -check-prefix=CHECK-ARM
-; RUN: llc < %s -mtriple=arm-apple-ios -mcpu=swift     | FileCheck %s -check-prefix=CHECK-HWDIV
-; RUN: llc < %s -mtriple=arm-apple-ios -mcpu=cortex-r5 | FileCheck %s -check-prefix=CHECK-HWDIV
+; RUN: llc < %s -mtriple=arm-apple-ios -mcpu=cortex-a8    | FileCheck %s -check-prefix=CHECK-SWDIV
+; RUN: llc < %s -mtriple=arm-apple-ios -mcpu=swift        | FileCheck %s -check-prefix=CHECK-HWDIV
+; RUN: llc < %s -mtriple=arm-apple-ios -mcpu=cortex-r4    | FileCheck %s -check-prefix=CHECK-SWDIV
+; RUN: llc < %s -mtriple=arm-apple-ios -mcpu=cortex-r4f   | FileCheck %s -check-prefix=CHECK-SWDIV
+; RUN: llc < %s -mtriple=arm-apple-ios -mcpu=cortex-r5    | FileCheck %s -check-prefix=CHECK-HWDIV
 
 define i32 @f1(i32 %a, i32 %b) {
 entry:
-; CHECK-ARM: f1
-; CHECK-ARM: __divsi3
+; CHECK-SWDIV: f1
+; CHECK-SWDIV: __divsi3
 
 ; CHECK-HWDIV: f1
 ; CHECK-HWDIV: sdiv
@@ -15,8 +17,8 @@ entry:
 
 define i32 @f2(i32 %a, i32 %b) {
 entry:
-; CHECK-ARM: f2
-; CHECK-ARM: __udivsi3
+; CHECK-SWDIV: f2
+; CHECK-SWDIV: __udivsi3
 
 ; CHECK-HWDIV: f2
 ; CHECK-HWDIV: udiv
@@ -26,8 +28,8 @@ entry:
 
 define i32 @f3(i32 %a, i32 %b) {
 entry:
-; CHECK-ARM: f3
-; CHECK-ARM: __modsi3
+; CHECK-SWDIV: f3
+; CHECK-SWDIV: __modsi3
 
 ; CHECK-HWDIV: f3
 ; CHECK-HWDIV: sdiv
@@ -38,8 +40,8 @@ entry:
 
 define i32 @f4(i32 %a, i32 %b) {
 entry:
-; CHECK-ARM: f4
-; CHECK-ARM: __umodsi3
+; CHECK-SWDIV: f4
+; CHECK-SWDIV: __umodsi3
 
 ; CHECK-HWDIV: f4
 ; CHECK-HWDIV: udiv
diff --git a/test/CodeGen/ARM/fast-isel-vararg.ll b/test/CodeGen/ARM/fast-isel-vararg.ll
index aa37e7d..35442ee 100644
--- a/test/CodeGen/ARM/fast-isel-vararg.ll
+++ b/test/CodeGen/ARM/fast-isel-vararg.ll
@@ -37,7 +37,7 @@ entry:
 ; THUMB: str.w {{[a-z0-9]+}}, [sp]
 ; THUMB: str.w {{[a-z0-9]+}}, [sp, #4]
 ; THUMB: bl {{_?}}CallVariadic
-  %call = call i32 (i32, ...)* @CallVariadic(i32 5, i32 %0, i32 %1, i32 %2, i32 %3, i32 %4)
+  %call = call i32 (i32, ...) @CallVariadic(i32 5, i32 %0, i32 %1, i32 %2, i32 %3, i32 %4)
   store i32 %call, i32* %tmp, align 4
   %5 = load i32, i32* %tmp, align 4
   ret i32 %5
diff --git a/test/CodeGen/ARM/fcopysign.ll b/test/CodeGen/ARM/fcopysign.ll
index 1de0572..d013fbf 100644
--- a/test/CodeGen/ARM/fcopysign.ll
+++ b/test/CodeGen/ARM/fcopysign.ll
@@ -48,7 +48,7 @@ entry:
 ; SOFT: vmov.i32 [[REG6:(d[0-9]+)]], #0x80000000
 ; SOFT: vshr.u64 [[REG7]], [[REG7]], #32
 ; SOFT: vbsl [[REG6]], [[REG7]], 
-  %0 = tail call double (...)* @bar() nounwind
+  %0 = tail call double (...) @bar() nounwind
   %1 = fptrunc double %0 to float
   %2 = tail call float @copysignf(float 5.000000e-01, float %1) nounwind readnone
   %3 = fadd float %1, %2
diff --git a/test/CodeGen/ARM/ghc-tcreturn-lowered.ll b/test/CodeGen/ARM/ghc-tcreturn-lowered.ll
index 9731b3d..f34f8f1 100644
--- a/test/CodeGen/ARM/ghc-tcreturn-lowered.ll
+++ b/test/CodeGen/ARM/ghc-tcreturn-lowered.ll
@@ -16,6 +16,6 @@ define ghccc void @test_indirect_tail() {
 ; CHECK-LABEL: test_indirect_tail:
 ; CHECK: bx {{r[0-9]+}}
   %func = load void()*, void()** @ind_func
-  tail call ghccc void()* %func()
+  tail call ghccc void() %func()
   ret void
 }
diff --git a/test/CodeGen/ARM/global-merge-1.ll b/test/CodeGen/ARM/global-merge-1.ll
index 03a9d33..d4d9b0f 100644
--- a/test/CodeGen/ARM/global-merge-1.ll
+++ b/test/CodeGen/ARM/global-merge-1.ll
@@ -1,10 +1,12 @@
 ; RUN: llc %s -O0 -o - | FileCheck -check-prefix=NO-MERGE %s
+; RUN: llc %s -O0 -o - -arm-global-merge=false | FileCheck -check-prefix=NO-MERGE %s
+; RUN: llc %s -O0 -o - -arm-global-merge=true | FileCheck -check-prefix=MERGE %s
 ; RUN: llc %s -O1 -o - | FileCheck -check-prefix=NO-MERGE %s
-; RUN: llc %s -O1 -o - -enable-global-merge=false | FileCheck -check-prefix=NO-MERGE %s
-; RUN: llc %s -O1 -o - -enable-global-merge=true | FileCheck -check-prefix=NO-MERGE %s
+; RUN: llc %s -O1 -o - -arm-global-merge=false | FileCheck -check-prefix=NO-MERGE %s
+; RUN: llc %s -O1 -o - -arm-global-merge=true | FileCheck -check-prefix=MERGE %s
 ; RUN: llc %s -O3 -o - | FileCheck -check-prefix=MERGE %s
-; RUN: llc %s -O3 -o - -enable-global-merge=false | FileCheck -check-prefix=NO-MERGE %s
-; RUN: llc %s -O3 -o - -enable-global-merge=true | FileCheck -check-prefix=MERGE %s
+; RUN: llc %s -O3 -o - -arm-global-merge=false | FileCheck -check-prefix=NO-MERGE %s
+; RUN: llc %s -O3 -o - -arm-global-merge=true | FileCheck -check-prefix=MERGE %s
 
 ; MERGE-NOT: .zerofill __DATA,__bss,_bar,20,2
 ; MERGE-NOT: .zerofill __DATA,__bss,_baz,20,2
diff --git a/test/CodeGen/ARM/ifcvt6.ll b/test/CodeGen/ARM/ifcvt6.ll
index a00deda..7890193 100644
--- a/test/CodeGen/ARM/ifcvt6.ll
+++ b/test/CodeGen/ARM/ifcvt6.ll
@@ -10,7 +10,7 @@ entry:
 	br i1 %tmp7, label %cond_true, label %UnifiedReturnBlock
 
 cond_true:		; preds = %entry
-	%tmp10 = call i32 (...)* @bar( )		; <i32> [#uses=0]
+	%tmp10 = call i32 (...) @bar( )		; <i32> [#uses=0]
 	ret void
 
 UnifiedReturnBlock:		; preds = %entry
diff --git a/test/CodeGen/ARM/indirectbr-2.ll b/test/CodeGen/ARM/indirectbr-2.ll
index 044fb56..ca068db 100644
--- a/test/CodeGen/ARM/indirectbr-2.ll
+++ b/test/CodeGen/ARM/indirectbr-2.ll
@@ -27,7 +27,7 @@ define i32 @func() nounwind ssp {
   %9 = load i32, i32* %8
   %10 = add i32 %9, ptrtoint (i8* blockaddress(@func, %4) to i32)
   %11 = inttoptr i32 %10 to i8*
-  %12 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([45 x i8], [45 x i8]* @0, i32 0, i32 0))
+  %12 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([45 x i8], [45 x i8]* @0, i32 0, i32 0))
   indirectbr i8* %11, [label %13, label %14]
 
 ; <label>:13                                      ; preds = %4
diff --git a/test/CodeGen/ARM/memcpy-inline.ll b/test/CodeGen/ARM/memcpy-inline.ll
index 78d2228..4ea26e1 100644
--- a/test/CodeGen/ARM/memcpy-inline.ll
+++ b/test/CodeGen/ARM/memcpy-inline.ll
@@ -30,7 +30,7 @@ entry:
 define void @t1(i8* nocapture %C) nounwind {
 entry:
 ; CHECK-LABEL: t1:
-; CHECK: vld1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r1]
+; CHECK: vld1.64 {d{{[0-9]+}}, d{{[0-9]+}}}, [r1]
 ; CHECK: vst1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r0]
 ; CHECK: adds r0, #15
 ; CHECK: adds r1, #15
@@ -48,7 +48,7 @@ entry:
 ; CHECK: str [[REG2]], [r0, #32]
 ; CHECK: vld1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r1]!
 ; CHECK: vst1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r0]!
-; CHECK: vld1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r1]
+; CHECK: vld1.64 {d{{[0-9]+}}, d{{[0-9]+}}}, [r1]
 ; CHECK: vst1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r0]
   tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %C, i8* getelementptr inbounds ([36 x i8], [36 x i8]* @.str2, i64 0, i64 0), i64 36, i32 1, i1 false)
   ret void
@@ -59,7 +59,7 @@ entry:
 ; CHECK-LABEL: t3:
 ; CHECK: vld1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r1]!
 ; CHECK: vst1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r0]!
-; CHECK: vld1.8 {d{{[0-9]+}}}, [r1]
+; CHECK: vldr d{{[0-9]+}}, [r1]
 ; CHECK: vst1.8 {d{{[0-9]+}}}, [r0]
   tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %C, i8* getelementptr inbounds ([24 x i8], [24 x i8]* @.str3, i64 0, i64 0), i64 24, i32 1, i1 false)
   ret void
@@ -68,7 +68,7 @@ entry:
 define void @t4(i8* nocapture %C) nounwind {
 entry:
 ; CHECK-LABEL: t4:
-; CHECK: vld1.8 {[[REG3:d[0-9]+]], [[REG4:d[0-9]+]]}, [r1]
+; CHECK: vld1.64 {[[REG3:d[0-9]+]], [[REG4:d[0-9]+]]}, [r1]
 ; CHECK: vst1.8 {[[REG3]], [[REG4]]}, [r0]!
 ; CHECK: strh [[REG5:r[0-9]+]], [r0]
   tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %C, i8* getelementptr inbounds ([18 x i8], [18 x i8]* @.str4, i64 0, i64 0), i64 18, i32 1, i1 false)
@@ -97,11 +97,11 @@ entry:
 define void @t6() nounwind {
 entry:
 ; CHECK-LABEL: t6:
-; CHECK: vld1.8 {[[REG9:d[0-9]+]]}, [r0]
+; CHECK: vldr [[REG9:d[0-9]+]], [r0]
 ; CHECK: vstr [[REG9]], [r1]
 ; CHECK: adds r1, #6
 ; CHECK: adds r0, #6
-; CHECK: vld1.8
+; CHECK: vld1.16
 ; CHECK: vst1.16
 ; CHECK-T1-LABEL: t6:
 ; CHECK-T1: movs [[TREG5:r[0-9]]],
diff --git a/test/CodeGen/ARM/memfunc.ll b/test/CodeGen/ARM/memfunc.ll
index 160096a..c214336 100644
--- a/test/CodeGen/ARM/memfunc.ll
+++ b/test/CodeGen/ARM/memfunc.ll
@@ -3,22 +3,19 @@
 ; RUN: llc < %s -mtriple=arm-none-eabi -disable-post-ra -o - | FileCheck %s --check-prefix=CHECK-EABI --check-prefix=CHECK
 ; RUN: llc < %s -mtriple=arm-none-eabihf -disable-post-ra -o - | FileCheck %s --check-prefix=CHECK-EABI --check-prefix=CHECK
 
-@from = common global [500 x i32] zeroinitializer, align 4
-@to = common global [500 x i32] zeroinitializer, align 4
-
-define void @f1() {
+define void @f1(i8* %dest, i8* %src) {
 entry:
   ; CHECK-LABEL: f1
 
   ; CHECK-IOS: memmove
   ; CHECK-DARWIN: memmove
   ; CHECK-EABI: __aeabi_memmove
-  call void @llvm.memmove.p0i8.p0i8.i32(i8* bitcast ([500 x i32]* @from to i8*), i8* bitcast ([500 x i32]* @to to i8*), i32 500, i32 0, i1 false)
+  call void @llvm.memmove.p0i8.p0i8.i32(i8* %dest, i8* %src, i32 500, i32 0, i1 false)
 
   ; CHECK-IOS: memcpy
   ; CHECK-DARWIN: memcpy
   ; CHECK-EABI: __aeabi_memcpy
-  call void @llvm.memcpy.p0i8.p0i8.i32(i8* bitcast ([500 x i32]* @from to i8*), i8* bitcast ([500 x i32]* @to to i8*), i32 500, i32 0, i1 false)
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dest, i8* %src, i32 500, i32 0, i1 false)
 
   ; EABI memset swaps arguments
   ; CHECK-IOS: mov r1, #0
@@ -27,7 +24,7 @@ entry:
   ; CHECK-DARWIN: memset
   ; CHECK-EABI: mov r2, #0
   ; CHECK-EABI: __aeabi_memset
-  call void @llvm.memset.p0i8.i32(i8* bitcast ([500 x i32]* @from to i8*), i8 0, i32 500, i32 0, i1 false)
+  call void @llvm.memset.p0i8.i32(i8* %dest, i8 0, i32 500, i32 0, i1 false)
   unreachable
 }
 
@@ -281,6 +278,47 @@ entry:
   unreachable
 }
 
+; Check that global variables are aligned if they are large enough, but only if
+; they are defined in this object and don't have an explicit section.
+@arr1 = global [7 x i8] c"\01\02\03\04\05\06\07", align 1
+@arr2 = global [8 x i8] c"\01\02\03\04\05\06\07\08", align 1
+@arr3 = global [7 x i8] c"\01\02\03\04\05\06\07", section "foo,bar", align 1
+@arr4 = global [8 x i8] c"\01\02\03\04\05\06\07\08", section "foo,bar", align 1
+@arr5 = weak global [7 x i8] c"\01\02\03\04\05\06\07", align 1
+@arr6 = weak_odr global [7 x i8] c"\01\02\03\04\05\06\07", align 1
+@arr7 = external global [7 x i8], align 1
+define void @f9(i8* %dest, i32 %n) {
+entry:
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dest, i8* getelementptr inbounds ([7 x i8], [7 x i8]* @arr1, i32 0, i32 0), i32 %n, i32 1, i1 false)
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dest, i8* getelementptr inbounds ([8 x i8], [8 x i8]* @arr2, i32 0, i32 0), i32 %n, i32 1, i1 false)
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dest, i8* getelementptr inbounds ([7 x i8], [7 x i8]* @arr3, i32 0, i32 0), i32 %n, i32 1, i1 false)
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dest, i8* getelementptr inbounds ([8 x i8], [8 x i8]* @arr4, i32 0, i32 0), i32 %n, i32 1, i1 false)
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dest, i8* getelementptr inbounds ([7 x i8], [7 x i8]* @arr5, i32 0, i32 0), i32 %n, i32 1, i1 false)
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dest, i8* getelementptr inbounds ([7 x i8], [7 x i8]* @arr6, i32 0, i32 0), i32 %n, i32 1, i1 false)
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dest, i8* getelementptr inbounds ([7 x i8], [7 x i8]* @arr7, i32 0, i32 0), i32 %n, i32 1, i1 false)
+
+  unreachable
+}
+
+; CHECK: {{\.data|\.section.+data}}
+; CHECK-NOT: .align
+; CHECK: arr1:
+; CHECK-IOS: .align 3
+; CHECK-DARWIN: .align 2
+; CHECK-EABI: .align 2
+; CHECK: arr2:
+; CHECK: {{\.section.+foo,bar}}
+; CHECK-NOT: .align
+; CHECK: arr3:
+; CHECK-NOT: .align
+; CHECK: arr4:
+; CHECK: {{\.data|\.section.+data}}
+; CHECK-NOT: .align
+; CHECK: arr5:
+; CHECK-NOT: .align
+; CHECK: arr6:
+; CHECK-NOT: arr7:
+
 declare void @llvm.memmove.p0i8.p0i8.i32(i8* nocapture, i8* nocapture, i32, i32, i1) nounwind
 declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture, i32, i32, i1) nounwind
 declare void @llvm.memset.p0i8.i32(i8* nocapture, i8, i32, i32, i1) nounwind
diff --git a/test/CodeGen/ARM/neon-spfp.ll b/test/CodeGen/ARM/neon-spfp.ll
index 8f0f3a8..4eeaa8a 100644
--- a/test/CodeGen/ARM/neon-spfp.ll
+++ b/test/CodeGen/ARM/neon-spfp.ll
@@ -64,7 +64,7 @@ for.body:                                         ; preds = %for.body, %entry
 ; CHECK-DARWINA15: vmul.f32 s{{[0-9]*}}
 ; CHECK-DARWINSWIFT: vmul.f32 d{{[0-9]*}}
   %conv = fpext float %mul to double
-  %call = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([12 x i8], [12 x i8]* @.str, i32 0, i32 0), double %conv) #1
+  %call = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([12 x i8], [12 x i8]* @.str, i32 0, i32 0), double %conv) #1
   %inc = add nsw i32 %i.04, 1
   %exitcond = icmp eq i32 %inc, 16000
   br i1 %exitcond, label %for.end, label %for.body
diff --git a/test/CodeGen/ARM/optselect-regclass.ll b/test/CodeGen/ARM/optselect-regclass.ll
index f921127..4c5d44c 100644
--- a/test/CodeGen/ARM/optselect-regclass.ll
+++ b/test/CodeGen/ARM/optselect-regclass.ll
@@ -17,7 +17,7 @@ entry:
   %or = or i32 %cond13, %bf.clear10
   %shl = shl nuw i32 %or, 2
   %add = add i32 0, %shl
-  tail call void (i8*, i32, i32, i8*, ...)* @__sprintf_chk(i8* getelementptr inbounds ([50 x i8], [50 x i8]* @operands, i32 0, i32 0), i32 0, i32 50, i8* getelementptr inbounds ([13 x i8], [13 x i8]* @.str86, i32 0, i32 0), i32 undef, i32 undef, i32 %add)
+  tail call void (i8*, i32, i32, i8*, ...) @__sprintf_chk(i8* getelementptr inbounds ([50 x i8], [50 x i8]* @operands, i32 0, i32 0), i32 0, i32 50, i8* getelementptr inbounds ([13 x i8], [13 x i8]* @.str86, i32 0, i32 0), i32 undef, i32 undef, i32 %add)
   ret void
 }
 
diff --git a/test/CodeGen/ARM/print-memb-operand.ll b/test/CodeGen/ARM/print-memb-operand.ll
new file mode 100644
index 0000000..7748efb
--- /dev/null
+++ b/test/CodeGen/ARM/print-memb-operand.ll
@@ -0,0 +1,12 @@
+; RUN: llc -mtriple=armv7 %s -o - | FileCheck %s
+
+; CHECK: dmb ld
+
+define void @test2() #0 {
+  call void @llvm.arm.dmb(i32 13)
+  ret void
+}
+
+declare void @llvm.arm.dmb(i32)
+
+attributes #0 = { "target-cpu"="cyclone" }
diff --git a/test/CodeGen/ARM/regpair_hint_phys.ll b/test/CodeGen/ARM/regpair_hint_phys.ll
new file mode 100644
index 0000000..8585a4c
--- /dev/null
+++ b/test/CodeGen/ARM/regpair_hint_phys.ll
@@ -0,0 +1,22 @@
+; RUN: llc -o - %s
+; ARM target used to fail an assertion if RegPair{Odd|Even} hint pointed to a
+; physreg.
+target datalayout = "e-m:o-p:32:32-f64:32:64-v64:32:64-v128:32:128-a:0:32-n32-S32"
+target triple = "thumbv7-apple-tvos8.3.0"
+
+declare i8* @llvm.frameaddress(i32) #1
+declare i8* @llvm.returnaddress(i32) #1
+
+@somevar = global [2 x i32] [i32 0, i32 0]
+
+define void @__ubsan_handle_shift_out_of_bounds() #0 {
+entry:
+  %0 = tail call i8* @llvm.frameaddress(i32 0)
+  %1 = ptrtoint i8* %0 to i32
+  %2 = tail call i8* @llvm.returnaddress(i32 0)
+  %3 = ptrtoint i8* %2 to i32
+  %val0 = insertvalue [2 x i32] [i32 undef, i32 undef], i32 %3, 0
+  %val1 = insertvalue [2 x i32] %val0, i32 %1, 1
+  store [2 x i32] %val1, [2 x i32]* @somevar, align 8
+  ret void
+}
diff --git a/test/CodeGen/ARM/saxpy10-a9.ll b/test/CodeGen/ARM/saxpy10-a9.ll
index af7b7ad..91610f1 100644
--- a/test/CodeGen/ARM/saxpy10-a9.ll
+++ b/test/CodeGen/ARM/saxpy10-a9.ll
@@ -14,15 +14,12 @@ target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-
 ; CHECK: vldr
 ; CHECK: vldr
 ; CHECK: vldr
-; CHECK: vldr
+; CHECK-NEXT: vldr
+; CHECK-NEXT: vmul
 ; CHECK-NEXT: vadd
 ; CHECK-NEXT: vadd
 ; CHECK-NEXT: vldr
 ; CHECK-NEXT: vldr
-; CHECK-NEXT: vldr
-; CHECK-NEXT: vadd
-; CHECK-NEXT: vmul
-; CHECK-NEXT: vldr
 ; CHECK-NEXT: vadd
 ; CHECK-NEXT: vadd
 ; CHECK-NEXT: vmul
@@ -31,6 +28,7 @@ target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-
 ; CHECK-NEXT: vadd
 ; CHECK-NEXT: vldr
 ; CHECK-NEXT: vmul
+; CHECK-NEXT: vldr
 ; CHECK-NEXT: vadd
 ; CHECK-NEXT: vldr
 ; CHECK-NEXT: vadd
@@ -48,6 +46,8 @@ target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-
 ; CHECK-NEXT: vmul
 ; CHECK-NEXT: vadd
 ; CHECK-NEXT: vldr
+; CHECK-NEXT: vadd
+; CHECK-NEXT: vldr
 ; CHECK-NEXT: vmul
 ; CHECK-NEXT: vadd
 ; CHECK-NEXT: vldr
diff --git a/test/CodeGen/ARM/shifter_operand.ll b/test/CodeGen/ARM/shifter_operand.ll
index 3999168..6f5c0e8 100644
--- a/test/CodeGen/ARM/shifter_operand.ll
+++ b/test/CodeGen/ARM/shifter_operand.ll
@@ -64,7 +64,7 @@ entry:
 ; A9-NOT: ldr [[REG:r[0-9]+]], [r0, r1, lsl #2]!
 ; A9: str [[REG]], [r0, r1, lsl #2]
 ; A9-NOT: str [[REG]], [r0]
-  %0 = tail call i8* (...)* @malloc(i32 undef) nounwind
+  %0 = tail call i8* (...) @malloc(i32 undef) nounwind
   %1 = bitcast i8* %0 to i32*
   %2 = sext i16 %addr to i32
   %3 = getelementptr inbounds i32, i32* %1, i32 %2
diff --git a/test/CodeGen/ARM/stack-protector-bmovpcb_call.ll b/test/CodeGen/ARM/stack-protector-bmovpcb_call.ll
index 15f8ec2..2a7a82d 100644
--- a/test/CodeGen/ARM/stack-protector-bmovpcb_call.ll
+++ b/test/CodeGen/ARM/stack-protector-bmovpcb_call.ll
@@ -16,7 +16,7 @@ entry:
   %title = alloca [15 x i8], align 1
   %0 = getelementptr inbounds [15 x i8], [15 x i8]* %title, i32 0, i32 0
   call void @llvm.memcpy.p0i8.p0i8.i32(i8* %0, i8* getelementptr inbounds ([15 x i8], [15 x i8]* @main.title, i32 0, i32 0), i32 15, i32 1, i1 false)
-  %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str, i32 0, i32 0), i8* %0) #3
+  %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str, i32 0, i32 0), i8* %0) #3
   ret i32 0
 }
 
diff --git a/test/CodeGen/ARM/stm.ll b/test/CodeGen/ARM/stm.ll
index 31c6ecd..88207e6 100644
--- a/test/CodeGen/ARM/stm.ll
+++ b/test/CodeGen/ARM/stm.ll
@@ -10,7 +10,7 @@ entry:
 ; CHECK: main
 ; CHECK: push
 ; CHECK: stm
-	%0 = tail call i32 (i8*, ...)* @printf(i8* getelementptr ([26 x i8], [26 x i8]* @"\01LC1", i32 0, i32 0), i32 -2, i32 -3, i32 2, i32 -6) nounwind		; <i32> [#uses=0]
-	%1 = tail call i32 (i8*, ...)* @printf(i8* getelementptr ([32 x i8], [32 x i8]* @"\01LC", i32 0, i32 0), i32 0, i32 1, i32 0, i32 1, i32 0, i32 1) nounwind		; <i32> [#uses=0]
+	%0 = tail call i32 (i8*, ...) @printf(i8* getelementptr ([26 x i8], [26 x i8]* @"\01LC1", i32 0, i32 0), i32 -2, i32 -3, i32 2, i32 -6) nounwind		; <i32> [#uses=0]
+	%1 = tail call i32 (i8*, ...) @printf(i8* getelementptr ([32 x i8], [32 x i8]* @"\01LC", i32 0, i32 0), i32 0, i32 1, i32 0, i32 1, i32 0, i32 1) nounwind		; <i32> [#uses=0]
 	ret i32 0
 }
diff --git a/test/CodeGen/ARM/uint64tof64.ll b/test/CodeGen/ARM/uint64tof64.ll
index f77603e..cd35ce7 100644
--- a/test/CodeGen/ARM/uint64tof64.ll
+++ b/test/CodeGen/ARM/uint64tof64.ll
@@ -10,7 +10,7 @@ entry:
 	%0 = load i64, i64* null, align 4		; <i64> [#uses=1]
 	%1 = uitofp i64 %0 to double		; <double> [#uses=1]
 	%2 = fdiv double 0.000000e+00, %1		; <double> [#uses=1]
-	%3 = call i32 (%struct.FILE*, i8*, ...)* @fprintf(%struct.FILE* null, i8* getelementptr ([54 x i8], [54 x i8]* @"\01LC10", i32 0, i32 0), i64 0, double %2)		; <i32> [#uses=0]
+	%3 = call i32 (%struct.FILE*, i8*, ...) @fprintf(%struct.FILE* null, i8* getelementptr ([54 x i8], [54 x i8]* @"\01LC10", i32 0, i32 0), i64 0, double %2)		; <i32> [#uses=0]
 	ret void
 }
 
diff --git a/test/CodeGen/ARM/vargs.ll b/test/CodeGen/ARM/vargs.ll
index 78d8448..41ec038 100644
--- a/test/CodeGen/ARM/vargs.ll
+++ b/test/CodeGen/ARM/vargs.ll
@@ -4,8 +4,8 @@
 
 define i32 @main() {
 entry:
-        %tmp = call i32 (i8*, ...)* @printf( i8* getelementptr ([43 x i8], [43 x i8]* @str, i32 0, i64 0), i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10 )         ; <i32> [#uses=0]
-        %tmp2 = call i32 (i8*, ...)* @printf( i8* getelementptr ([43 x i8], [43 x i8]* @str, i32 0, i64 0), i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1 )                ; <i32> [#uses=0]
+        %tmp = call i32 (i8*, ...) @printf( i8* getelementptr ([43 x i8], [43 x i8]* @str, i32 0, i64 0), i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10 )         ; <i32> [#uses=0]
+        %tmp2 = call i32 (i8*, ...) @printf( i8* getelementptr ([43 x i8], [43 x i8]* @str, i32 0, i64 0), i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1 )                ; <i32> [#uses=0]
         ret i32 11
 }
 
diff --git a/test/CodeGen/ARM/vcvt.ll b/test/CodeGen/ARM/vcvt.ll
index 0b7ffb8..78105f7 100644
--- a/test/CodeGen/ARM/vcvt.ll
+++ b/test/CodeGen/ARM/vcvt.ll
@@ -180,8 +180,8 @@ define <2 x i64> @fix_float_to_i64(<2 x float> %in) {
 
 define <4 x i16> @fix_double_to_i16(<4 x double> %in) {
 ; CHECK-LABEL: fix_double_to_i16:
-; CHECK: vcvt.s32.f64
-; CHECK: vcvt.s32.f64
+; CHECK: vcvt.u32.f64
+; CHECK: vcvt.u32.f64
 
   %scale = fmul <4 x double> %in, <double 2.0, double 2.0, double 2.0, double 2.0>
   %conv = fptoui <4 x double> %scale to <4 x i16>
diff --git a/test/CodeGen/ARM/vector-spilling.ll b/test/CodeGen/ARM/vector-spilling.ll
index b8058c8..9e3225e 100644
--- a/test/CodeGen/ARM/vector-spilling.ll
+++ b/test/CodeGen/ARM/vector-spilling.ll
@@ -25,7 +25,7 @@ entry:
   %8 = shufflevector <8 x i64> %1, <8 x i64> %3, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
   %9 = shufflevector <8 x i64> %1, <8 x i64> %3, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
 
-  tail call void(<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>)* @foo(<8 x i64> %1, <8 x i64> %3, <8 x i64> %5, <8 x i64> %7, <8 x i64> %8, <8 x i64> %9)
+  tail call void(<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>) @foo(<8 x i64> %1, <8 x i64> %3, <8 x i64> %5, <8 x i64> %7, <8 x i64> %8, <8 x i64> %9)
   ret void
 }
 
diff --git a/test/CodeGen/ARM/vfp.ll b/test/CodeGen/ARM/vfp.ll
index 31b55e8..03c0354 100644
--- a/test/CodeGen/ARM/vfp.ll
+++ b/test/CodeGen/ARM/vfp.ll
@@ -124,11 +124,11 @@ entry:
 	br i1 %tmp6, label %cond_true, label %cond_false
 
 cond_true:		; preds = %entry
-	%tmp.upgrd.2 = tail call i32 (...)* @bar( )		; <i32> [#uses=0]
+	%tmp.upgrd.2 = tail call i32 (...) @bar( )		; <i32> [#uses=0]
 	ret void
 
 cond_false:		; preds = %entry
-	%tmp7 = tail call i32 (...)* @baz( )		; <i32> [#uses=0]
+	%tmp7 = tail call i32 (...) @baz( )		; <i32> [#uses=0]
 	ret void
 }
 
@@ -147,10 +147,10 @@ entry:
 	br i1 %tmp.upgrd.3, label %cond_true, label %cond_false
 
 cond_true:		; preds = %entry
-	%tmp.upgrd.4 = tail call i32 (...)* @bar( )		; <i32> [#uses=0]
+	%tmp.upgrd.4 = tail call i32 (...) @bar( )		; <i32> [#uses=0]
 	ret void
 
 cond_false:		; preds = %entry
-	%tmp1 = tail call i32 (...)* @baz( )		; <i32> [#uses=0]
+	%tmp1 = tail call i32 (...) @baz( )		; <i32> [#uses=0]
 	ret void
 }
diff --git a/test/CodeGen/ARM/vminmaxnm.ll b/test/CodeGen/ARM/vminmaxnm.ll
index a183284..2e2648d 100644
--- a/test/CodeGen/ARM/vminmaxnm.ll
+++ b/test/CodeGen/ARM/vminmaxnm.ll
@@ -1,5 +1,8 @@
-; RUN: llc < %s -mtriple armv8 -mattr=+neon | FileCheck %s
-; RUN: llc < %s -mtriple armv8 -mattr=+neon,+fp-armv8 -enable-unsafe-fp-math | FileCheck %s --check-prefix=CHECK-FAST
+; RUN: llc < %s -mtriple armv8 -mattr=+neon,+fp-armv8 | FileCheck %s
+; RUN: llc < %s -mtriple armv8 -mattr=+neon,+fp-armv8 \
+; RUN:          -enable-no-nans-fp-math -enable-unsafe-fp-math | FileCheck %s --check-prefix=CHECK-FAST
+
+; vectors
 
 define <4 x float> @vmaxnmq(<4 x float>* %A, <4 x float>* %B) nounwind {
 ; CHECK-LABEL: vmaxnmq:
@@ -37,6 +40,8 @@ define <2 x float> @vminnmd(<2 x float>* %A, <2 x float>* %B) nounwind {
   ret <2 x float> %tmp3
 }
 
+; scalars
+
 define float @fp-armv8_vminnm_o(float %a, float %b) {
 ; CHECK-FAST-LABEL: "fp-armv8_vminnm_o":
 ; CHECK-FAST-NOT: vcmp
@@ -48,6 +53,17 @@ define float @fp-armv8_vminnm_o(float %a, float %b) {
   ret float %cond
 }
 
+define double @fp-armv8_vminnm_ole(double %a, double %b) {
+; CHECK-FAST-LABEL: "fp-armv8_vminnm_ole":
+; CHECK-FAST-NOT: vcmp
+; CHECK-FAST: vminnm.f64
+; CHECK-LABEL: "fp-armv8_vminnm_ole":
+; CHECK-NOT: vminnm.f64
+  %cmp = fcmp ole double %a, %b
+  %cond = select i1 %cmp, double %a, double %b
+  ret double %cond
+}
+
 define float @fp-armv8_vminnm_o_rev(float %a, float %b) {
 ; CHECK-FAST-LABEL: "fp-armv8_vminnm_o_rev":
 ; CHECK-FAST-NOT: vcmp
@@ -59,6 +75,17 @@ define float @fp-armv8_vminnm_o_rev(float %a, float %b) {
   ret float %cond
 }
 
+define double @fp-armv8_vminnm_oge_rev(double %a, double %b) {
+; CHECK-FAST-LABEL: "fp-armv8_vminnm_oge_rev":
+; CHECK-FAST-NOT: vcmp
+; CHECK-FAST: vminnm.f64
+; CHECK-LABEL: "fp-armv8_vminnm_oge_rev":
+; CHECK-NOT: vminnm.f64
+  %cmp = fcmp oge double %a, %b
+  %cond = select i1 %cmp, double %b, double %a
+  ret double %cond
+}
+
 define float @fp-armv8_vminnm_u(float %a, float %b) {
 ; CHECK-FAST-LABEL: "fp-armv8_vminnm_u":
 ; CHECK-FAST-NOT: vcmp
@@ -70,6 +97,17 @@ define float @fp-armv8_vminnm_u(float %a, float %b) {
   ret float %cond
 }
 
+define float @fp-armv8_vminnm_ule(float %a, float %b) {
+; CHECK-FAST-LABEL: "fp-armv8_vminnm_ule":
+; CHECK-FAST-NOT: vcmp
+; CHECK-FAST: vminnm.f32
+; CHECK-LABEL: "fp-armv8_vminnm_ule":
+; CHECK-NOT: vminnm.f32
+  %cmp = fcmp ule float %a, %b
+  %cond = select i1 %cmp, float %a, float %b
+  ret float %cond
+}
+
 define float @fp-armv8_vminnm_u_rev(float %a, float %b) {
 ; CHECK-FAST-LABEL: "fp-armv8_vminnm_u_rev":
 ; CHECK-FAST-NOT: vcmp
@@ -81,6 +119,17 @@ define float @fp-armv8_vminnm_u_rev(float %a, float %b) {
   ret float %cond
 }
 
+define double @fp-armv8_vminnm_uge_rev(double %a, double %b) {
+; CHECK-FAST-LABEL: "fp-armv8_vminnm_uge_rev":
+; CHECK-FAST-NOT: vcmp
+; CHECK-FAST: vminnm.f64
+; CHECK-LABEL: "fp-armv8_vminnm_uge_rev":
+; CHECK-NOT: vminnm.f64
+  %cmp = fcmp uge double %a, %b
+  %cond = select i1 %cmp, double %b, double %a
+  ret double %cond
+}
+
 define float @fp-armv8_vmaxnm_o(float %a, float %b) {
 ; CHECK-FAST-LABEL: "fp-armv8_vmaxnm_o":
 ; CHECK-FAST-NOT: vcmp
@@ -92,6 +141,17 @@ define float @fp-armv8_vmaxnm_o(float %a, float %b) {
   ret float %cond
 }
 
+define float @fp-armv8_vmaxnm_oge(float %a, float %b) {
+; CHECK-FAST-LABEL: "fp-armv8_vmaxnm_oge":
+; CHECK-FAST-NOT: vcmp
+; CHECK-FAST: vmaxnm.f32
+; CHECK-LABEL: "fp-armv8_vmaxnm_oge":
+; CHECK-NOT: vmaxnm.f32
+  %cmp = fcmp oge float %a, %b
+  %cond = select i1 %cmp, float %a, float %b
+  ret float %cond
+}
+
 define float @fp-armv8_vmaxnm_o_rev(float %a, float %b) {
 ; CHECK-FAST-LABEL: "fp-armv8_vmaxnm_o_rev":
 ; CHECK-FAST-NOT: vcmp
@@ -103,6 +163,17 @@ define float @fp-armv8_vmaxnm_o_rev(float %a, float %b) {
   ret float %cond
 }
 
+define float @fp-armv8_vmaxnm_ole_rev(float %a, float %b) {
+; CHECK-FAST-LABEL: "fp-armv8_vmaxnm_ole_rev":
+; CHECK-FAST-NOT: vcmp
+; CHECK-FAST: vmaxnm.f32
+; CHECK-LABEL: "fp-armv8_vmaxnm_ole_rev":
+; CHECK-NOT: vmaxnm.f32
+  %cmp = fcmp ole float %a, %b
+  %cond = select i1 %cmp, float %b, float %a
+  ret float %cond
+}
+
 define float @fp-armv8_vmaxnm_u(float %a, float %b) {
 ; CHECK-FAST-LABEL: "fp-armv8_vmaxnm_u":
 ; CHECK-FAST-NOT: vcmp
@@ -114,6 +185,17 @@ define float @fp-armv8_vmaxnm_u(float %a, float %b) {
   ret float %cond
 }
 
+define float @fp-armv8_vmaxnm_uge(float %a, float %b) {
+; CHECK-FAST-LABEL: "fp-armv8_vmaxnm_uge":
+; CHECK-FAST-NOT: vcmp
+; CHECK-FAST: vmaxnm.f32
+; CHECK-LABEL: "fp-armv8_vmaxnm_uge":
+; CHECK-NOT: vmaxnm.f32
+  %cmp = fcmp uge float %a, %b
+  %cond = select i1 %cmp, float %a, float %b
+  ret float %cond
+}
+
 define float @fp-armv8_vmaxnm_u_rev(float %a, float %b) {
 ; CHECK-FAST-LABEL: "fp-armv8_vmaxnm_u_rev":
 ; CHECK-FAST-NOT: vcmp
@@ -125,6 +207,17 @@ define float @fp-armv8_vmaxnm_u_rev(float %a, float %b) {
   ret float %cond
 }
 
+define double @fp-armv8_vmaxnm_ule_rev(double %a, double %b) {
+; CHECK-FAST-LABEL: "fp-armv8_vmaxnm_ule_rev":
+; CHECK-FAST-NOT: vcmp
+; CHECK-FAST: vmaxnm.f64
+; CHECK-LABEL: "fp-armv8_vmaxnm_ule_rev":
+; CHECK-NOT: vmaxnm.f64
+  %cmp = fcmp ule double %a, %b
+  %cond = select i1 %cmp, double %b, double %a
+  ret double %cond
+}
+
 
 declare <4 x float> @llvm.arm.neon.vminnm.v4f32(<4 x float>, <4 x float>) nounwind readnone
 declare <2 x float> @llvm.arm.neon.vminnm.v2f32(<2 x float>, <2 x float>) nounwind readnone
diff --git a/test/CodeGen/ARM/weak2.ll b/test/CodeGen/ARM/weak2.ll
index 82ab90e..a2911d7 100644
--- a/test/CodeGen/ARM/weak2.ll
+++ b/test/CodeGen/ARM/weak2.ll
@@ -8,7 +8,7 @@ entry:
 	br i1 %tmp5, label %UnifiedReturnBlock, label %cond_true8
 
 cond_true8:		; preds = %entry
-	%tmp10 = tail call i32 (...)* %t.0( )		; <i32> [#uses=1]
+	%tmp10 = tail call i32 (...) %t.0( )		; <i32> [#uses=1]
 	ret i32 %tmp10
 
 UnifiedReturnBlock:		; preds = %entry
diff --git a/test/CodeGen/BPF/basictest.ll b/test/CodeGen/BPF/basictest.ll
index c0b6af4..2a2d498 100644
--- a/test/CodeGen/BPF/basictest.ll
+++ b/test/CodeGen/BPF/basictest.ll
@@ -8,8 +8,8 @@ define i32 @test0(i32 %X) {
 }
 
 ; CHECK-LABEL: store_imm:
-; CHECK: stw  0(r1), r0
-; CHECK: stw  4(r2), r0
+; CHECK: stw  0(r1), r{{[03]}}
+; CHECK: stw  4(r2), r{{[03]}}
 define i32 @store_imm(i32* %a, i32* %b) {
 entry:
   store i32 0, i32* %a, align 4
diff --git a/test/CodeGen/BPF/ex1.ll b/test/CodeGen/BPF/ex1.ll
index 366bc17..be038e9 100644
--- a/test/CodeGen/BPF/ex1.ll
+++ b/test/CodeGen/BPF/ex1.ll
@@ -26,7 +26,7 @@ define i32 @bpf_prog1(%struct.bpf_context* nocapture %ctx) #0 section "events/ne
 ; <label>:10                                      ; preds = %0
   %11 = getelementptr inbounds [15 x i8], [15 x i8]* %fmt, i64 0, i64 0
   call void @llvm.memcpy.p0i8.p0i8.i64(i8* %11, i8* getelementptr inbounds ([15 x i8], [15 x i8]* @bpf_prog1.fmt, i64 0, i64 0), i64 15, i32 1, i1 false)
-  %12 = call i32 (i8*, i32, ...)* inttoptr (i64 11 to i32 (i8*, i32, ...)*)(i8* %11, i32 15, %struct.sk_buff* %4, i8* %7) #1
+  %12 = call i32 (i8*, i32, ...) inttoptr (i64 11 to i32 (i8*, i32, ...)*)(i8* %11, i32 15, %struct.sk_buff* %4, i8* %7) #1
 ; CHECK-LABEL: bpf_prog1:
 ; CHECK: call 4
 ; CHECK: call 9
diff --git a/test/CodeGen/BPF/intrinsics.ll b/test/CodeGen/BPF/intrinsics.ll
index e0f050e..98b57de 100644
--- a/test/CodeGen/BPF/intrinsics.ll
+++ b/test/CodeGen/BPF/intrinsics.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=bpf | FileCheck %s
+; RUN: llc < %s -march=bpf -show-mc-encoding | FileCheck %s
 
 ; Function Attrs: nounwind uwtable
 define i32 @ld_b(i64 %foo, i64* nocapture %bar, i8* %ctx, i8* %ctx2) #0 {
@@ -48,3 +48,41 @@ define i32 @ld_w(i8* %ctx, i8* %ctx2, i32 %foo) #0 {
 }
 
 declare i64 @llvm.bpf.load.word(i8*, i64) #1
+
+define i32 @ld_pseudo() #0 {
+entry:
+  %call = tail call i64 @llvm.bpf.pseudo(i64 2, i64 3)
+  tail call void @bar(i64 %call, i32 4) #2
+  ret i32 0
+; CHECK-LABEL: ld_pseudo:
+; CHECK: ld_pseudo r1, 2, 3 # encoding: [0x18,0x21,0x00,0x00,0x03,0x00
+}
+
+declare void @bar(i64, i32) #1
+
+declare i64 @llvm.bpf.pseudo(i64, i64) #2
+
+define i32 @bswap(i64 %a, i64 %b, i64 %c) #0 {
+entry:
+  %0 = tail call i64 @llvm.bswap.i64(i64 %a)
+  %conv = trunc i64 %b to i32
+  %1 = tail call i32 @llvm.bswap.i32(i32 %conv)
+  %conv1 = zext i32 %1 to i64
+  %add = add i64 %conv1, %0
+  %conv2 = trunc i64 %c to i16
+  %2 = tail call i16 @llvm.bswap.i16(i16 %conv2)
+  %conv3 = zext i16 %2 to i64
+  %add4 = add i64 %add, %conv3
+  %conv5 = trunc i64 %add4 to i32
+  ret i32 %conv5
+; CHECK-LABEL: bswap:
+; CHECK: bswap64 r1     # encoding: [0xdc,0x01,0x00,0x00,0x40,0x00,0x00,0x00]
+; CHECK: bswap32 r2     # encoding: [0xdc,0x02,0x00,0x00,0x20,0x00,0x00,0x00]
+; CHECK: add     r2, r1 # encoding: [0x0f,0x12,0x00,0x00,0x00,0x00,0x00,0x00]
+; CHECK: bswap16 r3     # encoding: [0xdc,0x03,0x00,0x00,0x10,0x00,0x00,0x00]
+; CHECK: add     r2, r3 # encoding: [0x0f,0x32,0x00,0x00,0x00,0x00,0x00,0x00]
+}
+
+declare i64 @llvm.bswap.i64(i64) #1
+declare i32 @llvm.bswap.i32(i32) #1
+declare i16 @llvm.bswap.i16(i16) #1
diff --git a/test/CodeGen/BPF/sanity.ll b/test/CodeGen/BPF/sanity.ll
index b9040ef..09a6b65 100644
--- a/test/CodeGen/BPF/sanity.ll
+++ b/test/CodeGen/BPF/sanity.ll
@@ -106,7 +106,7 @@ define void @foo_printf() #1 {
   call void @llvm.memcpy.p0i8.p0i8.i64(i8* %1, i8* getelementptr inbounds ([9 x i8], [9 x i8]* @foo_printf.fmt, i64 0, i64 0), i64 9, i32 1, i1 false)
 ; CHECK-LABEL: foo_printf:
 ; CHECK: ld_64 r1, 729618802566522216
-  %2 = call i32 (i8*, ...)* @printf(i8* %1) #3
+  %2 = call i32 (i8*, ...) @printf(i8* %1) #3
   ret void
 }
 
diff --git a/test/CodeGen/CPP/2009-05-01-Long-Double.ll b/test/CodeGen/CPP/2009-05-01-Long-Double.ll
index ae18582..470303d 100644
--- a/test/CodeGen/CPP/2009-05-01-Long-Double.ll
+++ b/test/CodeGen/CPP/2009-05-01-Long-Double.ll
@@ -3,7 +3,7 @@
 define x86_fp80 @some_func() nounwind {
 entry:
 	%retval = alloca x86_fp80		; <x86_fp80*> [#uses=2]
-	%call = call i32 (...)* @other_func()		; <i32> [#uses=1]
+	%call = call i32 (...) @other_func()		; <i32> [#uses=1]
 	%conv = sitofp i32 %call to x86_fp80		; <x86_fp80> [#uses=1]
 	store x86_fp80 %conv, x86_fp80* %retval
 	%0 = load x86_fp80, x86_fp80* %retval		; <x86_fp80> [#uses=1]
diff --git a/test/CodeGen/Generic/2003-07-06-BadIntCmp.ll b/test/CodeGen/Generic/2003-07-06-BadIntCmp.ll
index a130085..7e402f5 100644
--- a/test/CodeGen/Generic/2003-07-06-BadIntCmp.ll
+++ b/test/CodeGen/Generic/2003-07-06-BadIntCmp.ll
@@ -31,11 +31,11 @@ entry:
         br i1 %tmp.8, label %then, label %else
 
 then:           ; preds = %entry
-        %tmp.11 = call i32 (i8*, ...)* @printf( i8* getelementptr ([6 x i8], [6 x i8]* @.str_1, i64 0, i64 0) )           ; <i32> [#uses=0]
+        %tmp.11 = call i32 (i8*, ...) @printf( i8* getelementptr ([6 x i8], [6 x i8]* @.str_1, i64 0, i64 0) )           ; <i32> [#uses=0]
         br label %UnifiedExitNode
 
 else:           ; preds = %entry
-        %tmp.13 = call i32 (i8*, ...)* @printf( i8* getelementptr ([7 x i8], [7 x i8]* @.str_2, i64 0, i64 0) )           ; <i32> [#uses=0]
+        %tmp.13 = call i32 (i8*, ...) @printf( i8* getelementptr ([7 x i8], [7 x i8]* @.str_2, i64 0, i64 0) )           ; <i32> [#uses=0]
         br label %UnifiedExitNode
 
 UnifiedExitNode:                ; preds = %else, %then
diff --git a/test/CodeGen/Generic/2003-07-07-BadLongConst.ll b/test/CodeGen/Generic/2003-07-07-BadLongConst.ll
index e58fc97..928b57e 100644
--- a/test/CodeGen/Generic/2003-07-07-BadLongConst.ll
+++ b/test/CodeGen/Generic/2003-07-07-BadLongConst.ll
@@ -14,7 +14,7 @@ entry:
         %tmp.11 = call i64 @getL( )             ; <i64> [#uses=2]
         %tmp.5 = trunc i64 %tmp.11 to i32               ; <i32> [#uses=2]
         %tmp.23 = and i64 %tmp.11, -4294967296          ; <i64> [#uses=2]
-        %tmp.16 = call i32 (i8*, ...)* @printf( i8* getelementptr ([42 x i8], [42 x i8]* @.str_1, i64 0, i64 0), i32 %tmp.5, i32 %tmp.5, i64 %tmp.23, i64 %tmp.23 )              ; <i32> [#uses=0]
+        %tmp.16 = call i32 (i8*, ...) @printf( i8* getelementptr ([42 x i8], [42 x i8]* @.str_1, i64 0, i64 0), i32 %tmp.5, i32 %tmp.5, i64 %tmp.23, i64 %tmp.23 )              ; <i32> [#uses=0]
         ret i32 0
 }
 
diff --git a/test/CodeGen/Generic/2003-07-08-BadCastToBool.ll b/test/CodeGen/Generic/2003-07-08-BadCastToBool.ll
index 72968d7..73ad186 100644
--- a/test/CodeGen/Generic/2003-07-08-BadCastToBool.ll
+++ b/test/CodeGen/Generic/2003-07-08-BadCastToBool.ll
@@ -28,7 +28,7 @@ entry:
 define i32 @main() {
 entry:
         %result = call i32 @adj( i32 3, i32 2 )         ; <i32> [#uses=1]
-        %tmp.0 = call i32 (i8*, ...)* @printf( i8* getelementptr ([30 x i8], [30 x i8]* @.str_1, i64 0, i64 0), i32 3, i32 2, i32 %result )              ; <i32> [#uses=0]
+        %tmp.0 = call i32 (i8*, ...) @printf( i8* getelementptr ([30 x i8], [30 x i8]* @.str_1, i64 0, i64 0), i32 3, i32 2, i32 %result )              ; <i32> [#uses=0]
         ret i32 0
 }
 
diff --git a/test/CodeGen/Generic/2003-07-29-BadConstSbyte.ll b/test/CodeGen/Generic/2003-07-29-BadConstSbyte.ll
index 0d0c37b..010c0c5 100644
--- a/test/CodeGen/Generic/2003-07-29-BadConstSbyte.ll
+++ b/test/CodeGen/Generic/2003-07-29-BadConstSbyte.ll
@@ -28,8 +28,8 @@ loopentry:              ; preds = %loopentry, %entry
         %i = phi i64 [ 0, %entry ], [ %inc.i, %loopentry ]              ; <i64> [#uses=3]
         %cptr = getelementptr [6 x i8], [6 x i8]* @yy_ec, i64 0, i64 %i           ; <i8*> [#uses=1]
         %c = load i8, i8* %cptr             ; <i8> [#uses=1]
-        %ignore = call i32 (i8*, ...)* @printf( i8* getelementptr ([8 x i8], [8 x i8]* @.str_3, i64 0, i64 0), i64 %i )        ; <i32> [#uses=0]
-        %ignore2 = call i32 (i8*, ...)* @printf( i8* getelementptr ([4 x i8], [4 x i8]* @.str_4, i64 0, i64 0), i8 %c )        ; <i32> [#uses=0]
+        %ignore = call i32 (i8*, ...) @printf( i8* getelementptr ([8 x i8], [8 x i8]* @.str_3, i64 0, i64 0), i64 %i )        ; <i32> [#uses=0]
+        %ignore2 = call i32 (i8*, ...) @printf( i8* getelementptr ([4 x i8], [4 x i8]* @.str_4, i64 0, i64 0), i8 %c )        ; <i32> [#uses=0]
         %inc.i = add i64 %i, 1          ; <i64> [#uses=2]
         %done = icmp sle i64 %inc.i, 5          ; <i1> [#uses=1]
         br i1 %done, label %loopentry, label %exit.1
diff --git a/test/CodeGen/Generic/2005-12-01-Crash.ll b/test/CodeGen/Generic/2005-12-01-Crash.ll
index 45d6204..e6ab9d2 100644
--- a/test/CodeGen/Generic/2005-12-01-Crash.ll
+++ b/test/CodeGen/Generic/2005-12-01-Crash.ll
@@ -11,7 +11,7 @@
 define void @printArgsNoRet(i32 %a1, float %a2, i8 %a3, double %a4, i8* %a5, i32 %a6, float %a7, i8 %a8, double %a9, i8* %a10, i32 %a11, float %a12, i8 %a13, double %a14, i8* %a15) {
 entry:
 	%tmp17 = sext i8 %a13 to i32		; <i32> [#uses=1]
-	%tmp23 = call i32 (i8*, ...)* @printf( i8* getelementptr ([29 x i8], [29 x i8]* @str2, i32 0, i64 0), i32 %a11, double 0.000000e+00, i32 %tmp17, double %a14, i32 0 )		; <i32> [#uses=0]
+	%tmp23 = call i32 (i8*, ...) @printf( i8* getelementptr ([29 x i8], [29 x i8]* @str2, i32 0, i64 0), i32 %a11, double 0.000000e+00, i32 %tmp17, double %a14, i32 0 )		; <i32> [#uses=0]
 	ret void
 }
 
diff --git a/test/CodeGen/Generic/2006-05-06-GEP-Cast-Sink-Crash.ll b/test/CodeGen/Generic/2006-05-06-GEP-Cast-Sink-Crash.ll
index 57673bb..12a4011 100644
--- a/test/CodeGen/Generic/2006-05-06-GEP-Cast-Sink-Crash.ll
+++ b/test/CodeGen/Generic/2006-05-06-GEP-Cast-Sink-Crash.ll
@@ -18,7 +18,7 @@ bb.i:		; preds = %cond_next.i, %entry
 	br i1 false, label %cond_true.i31, label %cond_next.i
 
 cond_true.i31:		; preds = %bb.i
-	call void (i32, ...)* @fprintf( i32 0, i8* %tmp11, i8* null )
+	call void (i32, ...) @fprintf( i32 0, i8* %tmp11, i8* null )
 	ret void
 
 cond_next.i:		; preds = %bb.i
diff --git a/test/CodeGen/Generic/2008-02-04-Ctlz.ll b/test/CodeGen/Generic/2008-02-04-Ctlz.ll
index 27a37a9..3244e5c 100644
--- a/test/CodeGen/Generic/2008-02-04-Ctlz.ll
+++ b/test/CodeGen/Generic/2008-02-04-Ctlz.ll
@@ -10,7 +10,7 @@ entry:
 	%tmp38 = trunc i64 %tmp37 to i32		; <i32>:0 [#uses=1]
 	%tmp48 = trunc i64 %tmp47 to i32		; <i32>:0 [#uses=1]
 	%tmp58 = trunc i64 %tmp57 to i32		; <i32>:0 [#uses=1]
-	%tmp40 = tail call i32 (i8*, ...)* @printf( i8* noalias  getelementptr ([14 x i8], [14 x i8]* @.str, i32 0, i32 0), i64 %arg, i32 %tmp38, i32 %tmp48, i32 %tmp58 ) nounwind 		; <i32> [#uses=0]
+	%tmp40 = tail call i32 (i8*, ...) @printf( i8* noalias  getelementptr ([14 x i8], [14 x i8]* @.str, i32 0, i32 0), i64 %arg, i32 %tmp38, i32 %tmp48, i32 %tmp58 ) nounwind 		; <i32> [#uses=0]
 	ret i32 0
 }
 
diff --git a/test/CodeGen/Generic/2008-02-25-NegateZero.ll b/test/CodeGen/Generic/2008-02-25-NegateZero.ll
index 35c0f20..14800ce 100644
--- a/test/CodeGen/Generic/2008-02-25-NegateZero.ll
+++ b/test/CodeGen/Generic/2008-02-25-NegateZero.ll
@@ -7,7 +7,7 @@ entry:
 	%tmp106 = load float, float* null, align 4		; <float> [#uses=1]
 	%tmp113 = fadd float %tmp98, %tmp106		; <float> [#uses=1]
 	%tmp119 = fsub float %tmp113, 0.000000e+00		; <float> [#uses=1]
-	call void (i32, ...)* @foo( i32 0, float 0.000000e+00, float %tmp119 ) nounwind 
+	call void (i32, ...) @foo( i32 0, float 0.000000e+00, float %tmp119 ) nounwind 
 	ret void
 }
 
diff --git a/test/CodeGen/Generic/ConstantExprLowering.ll b/test/CodeGen/Generic/ConstantExprLowering.ll
index 5c57b47..3119dfa 100644
--- a/test/CodeGen/Generic/ConstantExprLowering.ll
+++ b/test/CodeGen/Generic/ConstantExprLowering.ll
@@ -16,7 +16,7 @@ less:           ; preds = %entry
 
 not_less:               ; preds = %less, %entry
         %t2 = phi i32 [ sub (i32 ptrtoint (i32* @XA to i32), i32 ptrtoint (i32* @XB to i32)), %less ], [ sub (i32 ptrtoint (i32* @XA to i32), i32 ptrtoint (i32* @XB to i32)), %entry ]               ; <i32> [#uses=1]
-        %tmp.39 = call i32 (i8*, ...)* @printf( i8* getelementptr ([16 x i8], [16 x i8]* @.str_1, i64 0, i64 0), i32 %t2 )      ; <i32> [#uses=0]
+        %tmp.39 = call i32 (i8*, ...) @printf( i8* getelementptr ([16 x i8], [16 x i8]* @.str_1, i64 0, i64 0), i32 %t2 )      ; <i32> [#uses=0]
         ret void
 }
 
diff --git a/test/CodeGen/Generic/PBQP.ll b/test/CodeGen/Generic/PBQP.ll
index 91fcfba..31fc4e6 100644
--- a/test/CodeGen/Generic/PBQP.ll
+++ b/test/CodeGen/Generic/PBQP.ll
@@ -2,23 +2,23 @@
 
 define i32 @foo() {
 entry:
-  %call = tail call i32 (...)* @baz()
-  %call1 = tail call i32 (...)* @baz()
-  %call2 = tail call i32 (...)* @baz()
-  %call3 = tail call i32 (...)* @baz()
-  %call4 = tail call i32 (...)* @baz()
-  %call5 = tail call i32 (...)* @baz()
-  %call6 = tail call i32 (...)* @baz()
-  %call7 = tail call i32 (...)* @baz()
-  %call8 = tail call i32 (...)* @baz()
-  %call9 = tail call i32 (...)* @baz()
-  %call10 = tail call i32 (...)* @baz()
-  %call11 = tail call i32 (...)* @baz()
-  %call12 = tail call i32 (...)* @baz()
-  %call13 = tail call i32 (...)* @baz()
-  %call14 = tail call i32 (...)* @baz()
-  %call15 = tail call i32 (...)* @baz()
-  %call16 = tail call i32 (...)* @baz()
+  %call = tail call i32 (...) @baz()
+  %call1 = tail call i32 (...) @baz()
+  %call2 = tail call i32 (...) @baz()
+  %call3 = tail call i32 (...) @baz()
+  %call4 = tail call i32 (...) @baz()
+  %call5 = tail call i32 (...) @baz()
+  %call6 = tail call i32 (...) @baz()
+  %call7 = tail call i32 (...) @baz()
+  %call8 = tail call i32 (...) @baz()
+  %call9 = tail call i32 (...) @baz()
+  %call10 = tail call i32 (...) @baz()
+  %call11 = tail call i32 (...) @baz()
+  %call12 = tail call i32 (...) @baz()
+  %call13 = tail call i32 (...) @baz()
+  %call14 = tail call i32 (...) @baz()
+  %call15 = tail call i32 (...) @baz()
+  %call16 = tail call i32 (...) @baz()
   %call17 = tail call i32 @bar(i32 %call, i32 %call1, i32 %call2, i32 %call3, i32 %call4, i32 %call5, i32 %call6, i32 %call7, i32 %call8, i32 %call9, i32 %call10, i32 %call11, i32 %call12, i32 %call13, i32 %call14, i32 %call15, i32 %call16)
   ret i32 %call17
 }
diff --git a/test/CodeGen/Generic/add-with-overflow-128.ll b/test/CodeGen/Generic/add-with-overflow-128.ll
index b091915..2a7456c 100644
--- a/test/CodeGen/Generic/add-with-overflow-128.ll
+++ b/test/CodeGen/Generic/add-with-overflow-128.ll
@@ -14,11 +14,11 @@ entry:
   br i1 %obit, label %carry, label %normal
 
 normal:
-  %t1 = tail call i32 (i8*, ...)* @printf( i8* getelementptr ([4 x i8], [4 x i8]* @ok, i32 0, i32 0), i32 %sum32 ) nounwind
+  %t1 = tail call i32 (i8*, ...) @printf( i8* getelementptr ([4 x i8], [4 x i8]* @ok, i32 0, i32 0), i32 %sum32 ) nounwind
   ret i1 true
 
 carry:
-  %t2 = tail call i32 (i8*, ...)* @printf( i8* getelementptr ([4 x i8], [4 x i8]* @no, i32 0, i32 0) ) nounwind
+  %t2 = tail call i32 (i8*, ...) @printf( i8* getelementptr ([4 x i8], [4 x i8]* @no, i32 0, i32 0) ) nounwind
   ret i1 false
 }
 
diff --git a/test/CodeGen/Generic/add-with-overflow-24.ll b/test/CodeGen/Generic/add-with-overflow-24.ll
index 7edc1f8..6f06ae6 100644
--- a/test/CodeGen/Generic/add-with-overflow-24.ll
+++ b/test/CodeGen/Generic/add-with-overflow-24.ll
@@ -12,11 +12,11 @@ entry:
   br i1 %obit, label %overflow, label %normal
 
 normal:
-  %t1 = tail call i32 (i8*, ...)* @printf( i8* getelementptr ([4 x i8], [4 x i8]* @ok, i32 0, i32 0), i32 %sum32 ) nounwind
+  %t1 = tail call i32 (i8*, ...) @printf( i8* getelementptr ([4 x i8], [4 x i8]* @ok, i32 0, i32 0), i32 %sum32 ) nounwind
   ret i1 true
 
 overflow:
-  %t2 = tail call i32 (i8*, ...)* @printf( i8* getelementptr ([4 x i8], [4 x i8]* @no, i32 0, i32 0) ) nounwind
+  %t2 = tail call i32 (i8*, ...) @printf( i8* getelementptr ([4 x i8], [4 x i8]* @no, i32 0, i32 0) ) nounwind
   ret i1 false
 }
 
@@ -29,11 +29,11 @@ entry:
   br i1 %obit, label %carry, label %normal
 
 normal:
-  %t1 = tail call i32 (i8*, ...)* @printf( i8* getelementptr ([4 x i8], [4 x i8]* @ok, i32 0, i32 0), i32 %sum32 ) nounwind
+  %t1 = tail call i32 (i8*, ...) @printf( i8* getelementptr ([4 x i8], [4 x i8]* @ok, i32 0, i32 0), i32 %sum32 ) nounwind
   ret i1 true
 
 carry:
-  %t2 = tail call i32 (i8*, ...)* @printf( i8* getelementptr ([4 x i8], [4 x i8]* @no, i32 0, i32 0) ) nounwind
+  %t2 = tail call i32 (i8*, ...) @printf( i8* getelementptr ([4 x i8], [4 x i8]* @no, i32 0, i32 0) ) nounwind
   ret i1 false
 }
 
diff --git a/test/CodeGen/Generic/add-with-overflow.ll b/test/CodeGen/Generic/add-with-overflow.ll
index 2204055..b6bbaa1 100644
--- a/test/CodeGen/Generic/add-with-overflow.ll
+++ b/test/CodeGen/Generic/add-with-overflow.ll
@@ -12,11 +12,11 @@ entry:
   br i1 %obit, label %overflow, label %normal
 
 normal:
-  %t1 = tail call i32 (i8*, ...)* @printf( i8* getelementptr ([4 x i8], [4 x i8]* @ok, i32 0, i32 0), i32 %sum ) nounwind
+  %t1 = tail call i32 (i8*, ...) @printf( i8* getelementptr ([4 x i8], [4 x i8]* @ok, i32 0, i32 0), i32 %sum ) nounwind
   ret i1 true
 
 overflow:
-  %t2 = tail call i32 (i8*, ...)* @printf( i8* getelementptr ([4 x i8], [4 x i8]* @no, i32 0, i32 0) ) nounwind
+  %t2 = tail call i32 (i8*, ...) @printf( i8* getelementptr ([4 x i8], [4 x i8]* @no, i32 0, i32 0) ) nounwind
   ret i1 false
 }
 
@@ -28,11 +28,11 @@ entry:
   br i1 %obit, label %overflow, label %normal
 
 normal:
-  %t1 = tail call i32 (i8*, ...)* @printf( i8* getelementptr ([4 x i8], [4 x i8]* @ok, i32 0, i32 0), i32 %sum ) nounwind
+  %t1 = tail call i32 (i8*, ...) @printf( i8* getelementptr ([4 x i8], [4 x i8]* @ok, i32 0, i32 0), i32 %sum ) nounwind
   ret i1 true
 
 overflow:
-  %t2 = tail call i32 (i8*, ...)* @printf( i8* getelementptr ([4 x i8], [4 x i8]* @no, i32 0, i32 0) ) nounwind
+  %t2 = tail call i32 (i8*, ...) @printf( i8* getelementptr ([4 x i8], [4 x i8]* @no, i32 0, i32 0) ) nounwind
   ret i1 false
 }
 
diff --git a/test/CodeGen/Generic/badarg6.ll b/test/CodeGen/Generic/badarg6.ll
index 7388bb4..34736ec 100644
--- a/test/CodeGen/Generic/badarg6.ll
+++ b/test/CodeGen/Generic/badarg6.ll
@@ -27,6 +27,6 @@ bb43:		; preds = %bb42, %bb25
 	%reg323 = phi double [ -1.000000e+00, %bb25 ], [ %reg317, %bb42 ]		; <double> [#uses=1]
 	%reg324 = phi double [ -1.000000e+00, %bb25 ], [ %reg318, %bb42 ]		; <double> [#uses=1]
 	%reg325 = phi double [ 1.000000e+00, %bb25 ], [ %reg319, %bb42 ]		; <double> [#uses=1]
-	%reg609 = call i32 (i8*, ...)* @printf( i8* getelementptr ([44 x i8], [44 x i8]* @.LC12, i64 0, i64 0), double %reg325, double %reg324, double %reg323, double %reg322, double %reg321 )		; <i32> [#uses=0]
+	%reg609 = call i32 (i8*, ...) @printf( i8* getelementptr ([44 x i8], [44 x i8]* @.LC12, i64 0, i64 0), double %reg325, double %reg324, double %reg323, double %reg322, double %reg321 )		; <i32> [#uses=0]
 	ret i32 0
 }
diff --git a/test/CodeGen/Generic/builtin-expect.ll b/test/CodeGen/Generic/builtin-expect.ll
index 2f76acf..def687e 100644
--- a/test/CodeGen/Generic/builtin-expect.ll
+++ b/test/CodeGen/Generic/builtin-expect.ll
@@ -14,7 +14,7 @@ entry:
   br i1 %tobool, label %if.then, label %if.end
 
 if.then:                                          ; preds = %entry
-  %call = call i32 (...)* @f()
+  %call = call i32 (...) @f()
   store i32 %call, i32* %retval
   br label %return
 
@@ -43,7 +43,7 @@ entry:
   br i1 %tobool, label %if.then, label %if.end
 
 if.then:                                          ; preds = %entry
-  %call = call i32 (...)* @f()
+  %call = call i32 (...) @f()
   store i32 %call, i32* %retval
   br label %return
 
@@ -71,7 +71,7 @@ entry:
   br i1 %tobool1, label %if.then, label %if.end
 
 if.then:                                          ; preds = %entry
-  %call = call i32 (...)* @f()
+  %call = call i32 (...) @f()
   store i32 %call, i32* %retval
   br label %return
 
@@ -100,7 +100,7 @@ entry:
   br i1 %tobool2, label %if.then, label %if.end
 
 if.then:                                          ; preds = %entry
-  %call = call i32 (...)* @f()
+  %call = call i32 (...) @f()
   store i32 %call, i32* %retval
   br label %return
 
@@ -127,7 +127,7 @@ entry:
   br i1 %tobool, label %if.then, label %if.end
 
 if.then:                                          ; preds = %entry
-  %call = call i32 (...)* @f()
+  %call = call i32 (...) @f()
   store i32 %call, i32* %retval
   br label %return
 
@@ -206,7 +206,7 @@ entry:
   br i1 %tobool, label %if.then, label %if.end
 
 if.then:                                          ; preds = %entry
-  %call = call i32 (...)* @f()
+  %call = call i32 (...) @f()
   store i32 %call, i32* %retval
   br label %return
 
diff --git a/test/CodeGen/Generic/cast-fp.ll b/test/CodeGen/Generic/cast-fp.ll
index 3b03096..a2611f5 100644
--- a/test/CodeGen/Generic/cast-fp.ll
+++ b/test/CodeGen/Generic/cast-fp.ll
@@ -12,22 +12,22 @@ declare i32 @printf(i8*, ...)
 define i32 @main() {
 	%a = load double, double* @A		; <double> [#uses=4]
 	%a_fs = getelementptr [8 x i8], [8 x i8]* @a_fstr, i64 0, i64 0		; <i8*> [#uses=1]
-	call i32 (i8*, ...)* @printf( i8* %a_fs, double %a )		; <i32>:1 [#uses=0]
+	call i32 (i8*, ...) @printf( i8* %a_fs, double %a )		; <i32>:1 [#uses=0]
 	%a_d2l = fptosi double %a to i64		; <i64> [#uses=1]
 	%a_ls = getelementptr [10 x i8], [10 x i8]* @a_lstr, i64 0, i64 0		; <i8*> [#uses=1]
-	call i32 (i8*, ...)* @printf( i8* %a_ls, i64 %a_d2l )		; <i32>:2 [#uses=0]
+	call i32 (i8*, ...) @printf( i8* %a_ls, i64 %a_d2l )		; <i32>:2 [#uses=0]
 	%a_d2i = fptosi double %a to i32		; <i32> [#uses=2]
 	%a_ds = getelementptr [8 x i8], [8 x i8]* @a_dstr, i64 0, i64 0		; <i8*> [#uses=3]
-	call i32 (i8*, ...)* @printf( i8* %a_ds, i32 %a_d2i )		; <i32>:3 [#uses=0]
+	call i32 (i8*, ...) @printf( i8* %a_ds, i32 %a_d2i )		; <i32>:3 [#uses=0]
 	%a_d2sb = fptosi double %a to i8		; <i8> [#uses=1]
-	call i32 (i8*, ...)* @printf( i8* %a_ds, i8 %a_d2sb )		; <i32>:4 [#uses=0]
+	call i32 (i8*, ...) @printf( i8* %a_ds, i8 %a_d2sb )		; <i32>:4 [#uses=0]
 	%a_d2i2sb = trunc i32 %a_d2i to i8		; <i8> [#uses=1]
-	call i32 (i8*, ...)* @printf( i8* %a_ds, i8 %a_d2i2sb )		; <i32>:5 [#uses=0]
+	call i32 (i8*, ...) @printf( i8* %a_ds, i8 %a_d2i2sb )		; <i32>:5 [#uses=0]
 	%b = load i32, i32* @B		; <i32> [#uses=2]
 	%b_ds = getelementptr [8 x i8], [8 x i8]* @b_dstr, i64 0, i64 0		; <i8*> [#uses=1]
-	call i32 (i8*, ...)* @printf( i8* %b_ds, i32 %b )		; <i32>:6 [#uses=0]
+	call i32 (i8*, ...) @printf( i8* %b_ds, i32 %b )		; <i32>:6 [#uses=0]
 	%b_i2d = sitofp i32 %b to double		; <double> [#uses=1]
 	%b_fs = getelementptr [8 x i8], [8 x i8]* @b_fstr, i64 0, i64 0		; <i8*> [#uses=1]
-	call i32 (i8*, ...)* @printf( i8* %b_fs, double %b_i2d )		; <i32>:7 [#uses=0]
+	call i32 (i8*, ...) @printf( i8* %b_fs, double %b_i2d )		; <i32>:7 [#uses=0]
 	ret i32 0
 }
diff --git a/test/CodeGen/Generic/constindices.ll b/test/CodeGen/Generic/constindices.ll
index 3b43db0..837836f 100644
--- a/test/CodeGen/Generic/constindices.ll
+++ b/test/CodeGen/Generic/constindices.ll
@@ -39,6 +39,6 @@ define i32 @main() {
         %dpi = fpext float %pi to double                ; <double> [#uses=1]
         %dfive = fpext float %five to double            ; <double> [#uses=1]
         %castFmt = getelementptr [44 x i8], [44 x i8]* @fmtArg, i64 0, i64 0               ; <i8*> [#uses=1]
-        call i32 (i8*, ...)* @printf( i8* %castFmt, double %dsqrtTwo, double %dexp, double %dpi, double %dfive )     ; <i32>:1 [#uses=0]
+        call i32 (i8*, ...) @printf( i8* %castFmt, double %dsqrtTwo, double %dexp, double %dpi, double %dfive )     ; <i32>:1 [#uses=0]
         ret i32 0
 }
diff --git a/test/CodeGen/Generic/dbg_value.ll b/test/CodeGen/Generic/dbg_value.ll
index c5200d7..2ee667c 100644
--- a/test/CodeGen/Generic/dbg_value.ll
+++ b/test/CodeGen/Generic/dbg_value.ll
@@ -4,11 +4,11 @@
 %0 = type { i32, i32 }
 
 define void @t(%0*, i32, i32, i32, i32) nounwind {
-  tail call void @llvm.dbg.value(metadata %0* %0, i64 0, metadata !0, metadata !MDExpression())
+  tail call void @llvm.dbg.value(metadata %0* %0, i64 0, metadata !0, metadata !MDExpression()), !dbg !MDLocation(scope: !MDSubprogram())
   unreachable
 }
 
 declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnone
 
 ; !0 should conform to the format of DIVariable.
-!0 = !MDLocalVariable(tag: DW_TAG_arg_variable, name: "a", arg: 0, scope: null)
+!0 = !MDLocalVariable(tag: DW_TAG_arg_variable, name: "a", arg: 0, scope: !MDSubprogram())
diff --git a/test/CodeGen/Generic/hello.ll b/test/CodeGen/Generic/hello.ll
index dff4791..a8147da 100644
--- a/test/CodeGen/Generic/hello.ll
+++ b/test/CodeGen/Generic/hello.ll
@@ -6,6 +6,6 @@ declare i32 @printf(i8*, ...)
 
 define i32 @main() {
         %s = getelementptr [7 x i8], [7 x i8]* @.str_1, i64 0, i64 0              ; <i8*> [#uses=1]
-        call i32 (i8*, ...)* @printf( i8* %s )          ; <i32>:1 [#uses=0]
+        call i32 (i8*, ...) @printf( i8* %s )          ; <i32>:1 [#uses=0]
         ret i32 0
 }
diff --git a/test/CodeGen/Generic/negintconst.ll b/test/CodeGen/Generic/negintconst.ll
index 1cf69de..4c0a654 100644
--- a/test/CodeGen/Generic/negintconst.ll
+++ b/test/CodeGen/Generic/negintconst.ll
@@ -41,7 +41,7 @@ define i32 @main() {
         %ioff.upgrd.1 = zext i32 %ioff to i64           ; <i64> [#uses=1]
         %fptr = getelementptr %Results, %Results* %fval, i64 %ioff.upgrd.1                ; <%Results*> [#uses=1]
         %castFmt = getelementptr [39 x i8], [39 x i8]* @fmtArg, i64 0, i64 0               ; <i8*> [#uses=1]
-        call i32 (i8*, ...)* @printf( i8* %castFmt, i32 %ioff, %Results* %fval, %Results* %fptr )               ; <i32>:1 [#uses=0]
+        call i32 (i8*, ...) @printf( i8* %castFmt, i32 %ioff, %Results* %fval, %Results* %fptr )               ; <i32>:1 [#uses=0]
         ret i32 0
 }
 
diff --git a/test/CodeGen/Generic/overloaded-intrinsic-name.ll b/test/CodeGen/Generic/overloaded-intrinsic-name.ll
index aa6a031..915ba9f 100644
--- a/test/CodeGen/Generic/overloaded-intrinsic-name.ll
+++ b/test/CodeGen/Generic/overloaded-intrinsic-name.ll
@@ -12,29 +12,29 @@
 ; will serve the purpose.
 
 ; function and integer
-define i32* @test_iAny(i32* %v) {
-       %tok = call i32 (i1 ()*, i32, i32, ...)* @llvm.experimental.gc.statepoint.p0f_i1f(i1 ()* @return_i1, i32 0, i32 0, i32 0, i32* %v)
+define i32* @test_iAny(i32* %v) gc "statepoint-example" {
+       %tok = call i32 (i1 ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_i1f(i1 ()* @return_i1, i32 0, i32 0, i32 0, i32* %v)
        %v-new = call i32* @llvm.experimental.gc.relocate.p0i32(i32 %tok, i32 4, i32 4)
        ret i32* %v-new
 }
 
 ; float
-define float* @test_fAny(float* %v) {
-       %tok = call i32 (i1 ()*, i32, i32, ...)* @llvm.experimental.gc.statepoint.p0f_i1f(i1 ()* @return_i1, i32 0, i32 0, i32 0, float* %v)
+define float* @test_fAny(float* %v) gc "statepoint-example" {
+       %tok = call i32 (i1 ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_i1f(i1 ()* @return_i1, i32 0, i32 0, i32 0, float* %v)
        %v-new = call float* @llvm.experimental.gc.relocate.p0f32(i32 %tok, i32 4, i32 4)
        ret float* %v-new
 }
 
 ; array of integers
-define [3 x i32]* @test_aAny([3 x i32]* %v) {
-       %tok = call i32 (i1 ()*, i32, i32, ...)* @llvm.experimental.gc.statepoint.p0f_i1f(i1 ()* @return_i1, i32 0, i32 0, i32 0, [3 x i32]* %v)
+define [3 x i32]* @test_aAny([3 x i32]* %v) gc "statepoint-example" {
+       %tok = call i32 (i1 ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_i1f(i1 ()* @return_i1, i32 0, i32 0, i32 0, [3 x i32]* %v)
        %v-new = call [3 x i32]* @llvm.experimental.gc.relocate.p0a3i32(i32 %tok, i32 4, i32 4)
        ret [3 x i32]* %v-new
 }
 
 ; vector of integers
-define <3 x i32>* @test_vAny(<3 x i32>* %v) {
-       %tok = call i32 (i1 ()*, i32, i32, ...)* @llvm.experimental.gc.statepoint.p0f_i1f(i1 ()* @return_i1, i32 0, i32 0, i32 0, <3 x i32>* %v)
+define <3 x i32>* @test_vAny(<3 x i32>* %v) gc "statepoint-example" {
+       %tok = call i32 (i1 ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_i1f(i1 ()* @return_i1, i32 0, i32 0, i32 0, <3 x i32>* %v)
        %v-new = call <3 x i32>* @llvm.experimental.gc.relocate.p0v3i32(i32 %tok, i32 4, i32 4)
        ret <3 x i32>* %v-new
 }
@@ -42,8 +42,8 @@ define <3 x i32>* @test_vAny(<3 x i32>* %v) {
 %struct.test = type { i32, i1 }
 
 ; struct
-define %struct.test* @test_struct(%struct.test* %v) {
-       %tok = call i32 (i1 ()*, i32, i32, ...)* @llvm.experimental.gc.statepoint.p0f_i1f(i1 ()* @return_i1, i32 0, i32 0, i32 0, %struct.test* %v)
+define %struct.test* @test_struct(%struct.test* %v) gc "statepoint-example" {
+       %tok = call i32 (i1 ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_i1f(i1 ()* @return_i1, i32 0, i32 0, i32 0, %struct.test* %v)
        %v-new = call %struct.test* @llvm.experimental.gc.relocate.p0struct.test(i32 %tok, i32 4, i32 4)
        ret %struct.test* %v-new
 }
diff --git a/test/CodeGen/Generic/print-add.ll b/test/CodeGen/Generic/print-add.ll
index 553438a..0507aba 100644
--- a/test/CodeGen/Generic/print-add.ll
+++ b/test/CodeGen/Generic/print-add.ll
@@ -7,12 +7,12 @@ declare i32 @printf(i8*, ...)
 define i32 @main() {
         %f = getelementptr [4 x i8], [4 x i8]* @.str_1, i64 0, i64 0              ; <i8*> [#uses=3]
         %d = add i32 1, 0               ; <i32> [#uses=3]
-        call i32 (i8*, ...)* @printf( i8* %f, i32 %d )          ; <i32>:1 [#uses=0]
+        call i32 (i8*, ...) @printf( i8* %f, i32 %d )          ; <i32>:1 [#uses=0]
         %e = add i32 38, 2              ; <i32> [#uses=2]
-        call i32 (i8*, ...)* @printf( i8* %f, i32 %e )          ; <i32>:2 [#uses=0]
+        call i32 (i8*, ...) @printf( i8* %f, i32 %e )          ; <i32>:2 [#uses=0]
         %g = add i32 %d, %d             ; <i32> [#uses=1]
         %h = add i32 %e, %g             ; <i32> [#uses=1]
-        call i32 (i8*, ...)* @printf( i8* %f, i32 %h )          ; <i32>:3 [#uses=0]
+        call i32 (i8*, ...) @printf( i8* %f, i32 %h )          ; <i32>:3 [#uses=0]
         ret i32 0
 }
 
diff --git a/test/CodeGen/Generic/print-arith-fp.ll b/test/CodeGen/Generic/print-arith-fp.ll
index b00229c..93b158e 100644
--- a/test/CodeGen/Generic/print-arith-fp.ll
+++ b/test/CodeGen/Generic/print-arith-fp.ll
@@ -22,8 +22,8 @@ define i32 @main() {
 	%b = load double, double* @B		; <double> [#uses=12]
 	%a_s = getelementptr [8 x i8], [8 x i8]* @a_str, i64 0, i64 0		; <i8*> [#uses=1]
 	%b_s = getelementptr [8 x i8], [8 x i8]* @b_str, i64 0, i64 0		; <i8*> [#uses=1]
-	call i32 (i8*, ...)* @printf( i8* %a_s, double %a )		; <i32>:1 [#uses=0]
-	call i32 (i8*, ...)* @printf( i8* %b_s, double %b )		; <i32>:2 [#uses=0]
+	call i32 (i8*, ...) @printf( i8* %a_s, double %a )		; <i32>:1 [#uses=0]
+	call i32 (i8*, ...) @printf( i8* %b_s, double %b )		; <i32>:2 [#uses=0]
 	%add_r = fadd double %a, %b		; <double> [#uses=1]
 	%sub_r = fsub double %a, %b		; <double> [#uses=1]
 	%mul_r = fmul double %a, %b		; <double> [#uses=1]
@@ -34,11 +34,11 @@ define i32 @main() {
 	%mul_s = getelementptr [12 x i8], [12 x i8]* @mul_str, i64 0, i64 0		; <i8*> [#uses=1]
 	%div_s = getelementptr [12 x i8], [12 x i8]* @div_str, i64 0, i64 0		; <i8*> [#uses=1]
 	%rem_s = getelementptr [13 x i8], [13 x i8]* @rem_str, i64 0, i64 0		; <i8*> [#uses=1]
-	call i32 (i8*, ...)* @printf( i8* %add_s, double %add_r )		; <i32>:3 [#uses=0]
-	call i32 (i8*, ...)* @printf( i8* %sub_s, double %sub_r )		; <i32>:4 [#uses=0]
-	call i32 (i8*, ...)* @printf( i8* %mul_s, double %mul_r )		; <i32>:5 [#uses=0]
-	call i32 (i8*, ...)* @printf( i8* %div_s, double %div_r )		; <i32>:6 [#uses=0]
-	call i32 (i8*, ...)* @printf( i8* %rem_s, double %rem_r )		; <i32>:7 [#uses=0]
+	call i32 (i8*, ...) @printf( i8* %add_s, double %add_r )		; <i32>:3 [#uses=0]
+	call i32 (i8*, ...) @printf( i8* %sub_s, double %sub_r )		; <i32>:4 [#uses=0]
+	call i32 (i8*, ...) @printf( i8* %mul_s, double %mul_r )		; <i32>:5 [#uses=0]
+	call i32 (i8*, ...) @printf( i8* %div_s, double %div_r )		; <i32>:6 [#uses=0]
+	call i32 (i8*, ...) @printf( i8* %rem_s, double %rem_r )		; <i32>:7 [#uses=0]
 	%lt_r = fcmp olt double %a, %b		; <i1> [#uses=1]
 	%le_r = fcmp ole double %a, %b		; <i1> [#uses=1]
 	%gt_r = fcmp ogt double %a, %b		; <i1> [#uses=1]
@@ -51,11 +51,11 @@ define i32 @main() {
 	%ge_s = getelementptr [13 x i8], [13 x i8]* @ge_str, i64 0, i64 0		; <i8*> [#uses=1]
 	%eq_s = getelementptr [13 x i8], [13 x i8]* @eq_str, i64 0, i64 0		; <i8*> [#uses=1]
 	%ne_s = getelementptr [13 x i8], [13 x i8]* @ne_str, i64 0, i64 0		; <i8*> [#uses=1]
-	call i32 (i8*, ...)* @printf( i8* %lt_s, i1 %lt_r )		; <i32>:8 [#uses=0]
-	call i32 (i8*, ...)* @printf( i8* %le_s, i1 %le_r )		; <i32>:9 [#uses=0]
-	call i32 (i8*, ...)* @printf( i8* %gt_s, i1 %gt_r )		; <i32>:10 [#uses=0]
-	call i32 (i8*, ...)* @printf( i8* %ge_s, i1 %ge_r )		; <i32>:11 [#uses=0]
-	call i32 (i8*, ...)* @printf( i8* %eq_s, i1 %eq_r )		; <i32>:12 [#uses=0]
-	call i32 (i8*, ...)* @printf( i8* %ne_s, i1 %ne_r )		; <i32>:13 [#uses=0]
+	call i32 (i8*, ...) @printf( i8* %lt_s, i1 %lt_r )		; <i32>:8 [#uses=0]
+	call i32 (i8*, ...) @printf( i8* %le_s, i1 %le_r )		; <i32>:9 [#uses=0]
+	call i32 (i8*, ...) @printf( i8* %gt_s, i1 %gt_r )		; <i32>:10 [#uses=0]
+	call i32 (i8*, ...) @printf( i8* %ge_s, i1 %ge_r )		; <i32>:11 [#uses=0]
+	call i32 (i8*, ...) @printf( i8* %eq_s, i1 %eq_r )		; <i32>:12 [#uses=0]
+	call i32 (i8*, ...) @printf( i8* %ne_s, i1 %ne_r )		; <i32>:13 [#uses=0]
 	ret i32 0
 }
diff --git a/test/CodeGen/Generic/print-arith-int.ll b/test/CodeGen/Generic/print-arith-int.ll
index 2e176e4..a5c519c 100644
--- a/test/CodeGen/Generic/print-arith-int.ll
+++ b/test/CodeGen/Generic/print-arith-int.ll
@@ -27,8 +27,8 @@ define i32 @main() {
 	%b = load i32, i32* @B		; <i32> [#uses=17]
 	%a_s = getelementptr [8 x i8], [8 x i8]* @a_str, i64 0, i64 0		; <i8*> [#uses=1]
 	%b_s = getelementptr [8 x i8], [8 x i8]* @b_str, i64 0, i64 0		; <i8*> [#uses=1]
-	call i32 (i8*, ...)* @printf( i8* %a_s, i32 %a )		; <i32>:1 [#uses=0]
-	call i32 (i8*, ...)* @printf( i8* %b_s, i32 %b )		; <i32>:2 [#uses=0]
+	call i32 (i8*, ...) @printf( i8* %a_s, i32 %a )		; <i32>:1 [#uses=0]
+	call i32 (i8*, ...) @printf( i8* %b_s, i32 %b )		; <i32>:2 [#uses=0]
 	%add_r = add i32 %a, %b		; <i32> [#uses=1]
 	%sub_r = sub i32 %a, %b		; <i32> [#uses=1]
 	%mul_r = mul i32 %a, %b		; <i32> [#uses=1]
@@ -39,11 +39,11 @@ define i32 @main() {
 	%mul_s = getelementptr [12 x i8], [12 x i8]* @mul_str, i64 0, i64 0		; <i8*> [#uses=1]
 	%div_s = getelementptr [12 x i8], [12 x i8]* @div_str, i64 0, i64 0		; <i8*> [#uses=1]
 	%rem_s = getelementptr [13 x i8], [13 x i8]* @rem_str, i64 0, i64 0		; <i8*> [#uses=1]
-	call i32 (i8*, ...)* @printf( i8* %add_s, i32 %add_r )		; <i32>:3 [#uses=0]
-	call i32 (i8*, ...)* @printf( i8* %sub_s, i32 %sub_r )		; <i32>:4 [#uses=0]
-	call i32 (i8*, ...)* @printf( i8* %mul_s, i32 %mul_r )		; <i32>:5 [#uses=0]
-	call i32 (i8*, ...)* @printf( i8* %div_s, i32 %div_r )		; <i32>:6 [#uses=0]
-	call i32 (i8*, ...)* @printf( i8* %rem_s, i32 %rem_r )		; <i32>:7 [#uses=0]
+	call i32 (i8*, ...) @printf( i8* %add_s, i32 %add_r )		; <i32>:3 [#uses=0]
+	call i32 (i8*, ...) @printf( i8* %sub_s, i32 %sub_r )		; <i32>:4 [#uses=0]
+	call i32 (i8*, ...) @printf( i8* %mul_s, i32 %mul_r )		; <i32>:5 [#uses=0]
+	call i32 (i8*, ...) @printf( i8* %div_s, i32 %div_r )		; <i32>:6 [#uses=0]
+	call i32 (i8*, ...) @printf( i8* %rem_s, i32 %rem_r )		; <i32>:7 [#uses=0]
 	%lt_r = icmp slt i32 %a, %b		; <i1> [#uses=1]
 	%le_r = icmp sle i32 %a, %b		; <i1> [#uses=1]
 	%gt_r = icmp sgt i32 %a, %b		; <i1> [#uses=1]
@@ -56,12 +56,12 @@ define i32 @main() {
 	%ge_s = getelementptr [13 x i8], [13 x i8]* @ge_str, i64 0, i64 0		; <i8*> [#uses=1]
 	%eq_s = getelementptr [13 x i8], [13 x i8]* @eq_str, i64 0, i64 0		; <i8*> [#uses=1]
 	%ne_s = getelementptr [13 x i8], [13 x i8]* @ne_str, i64 0, i64 0		; <i8*> [#uses=1]
-	call i32 (i8*, ...)* @printf( i8* %lt_s, i1 %lt_r )		; <i32>:8 [#uses=0]
-	call i32 (i8*, ...)* @printf( i8* %le_s, i1 %le_r )		; <i32>:9 [#uses=0]
-	call i32 (i8*, ...)* @printf( i8* %gt_s, i1 %gt_r )		; <i32>:10 [#uses=0]
-	call i32 (i8*, ...)* @printf( i8* %ge_s, i1 %ge_r )		; <i32>:11 [#uses=0]
-	call i32 (i8*, ...)* @printf( i8* %eq_s, i1 %eq_r )		; <i32>:12 [#uses=0]
-	call i32 (i8*, ...)* @printf( i8* %ne_s, i1 %ne_r )		; <i32>:13 [#uses=0]
+	call i32 (i8*, ...) @printf( i8* %lt_s, i1 %lt_r )		; <i32>:8 [#uses=0]
+	call i32 (i8*, ...) @printf( i8* %le_s, i1 %le_r )		; <i32>:9 [#uses=0]
+	call i32 (i8*, ...) @printf( i8* %gt_s, i1 %gt_r )		; <i32>:10 [#uses=0]
+	call i32 (i8*, ...) @printf( i8* %ge_s, i1 %ge_r )		; <i32>:11 [#uses=0]
+	call i32 (i8*, ...) @printf( i8* %eq_s, i1 %eq_r )		; <i32>:12 [#uses=0]
+	call i32 (i8*, ...) @printf( i8* %ne_s, i1 %ne_r )		; <i32>:13 [#uses=0]
 	%and_r = and i32 %a, %b		; <i32> [#uses=1]
 	%or_r = or i32 %a, %b		; <i32> [#uses=1]
 	%xor_r = xor i32 %a, %b		; <i32> [#uses=1]
@@ -75,10 +75,10 @@ define i32 @main() {
 	%xor_s = getelementptr [12 x i8], [12 x i8]* @xor_str, i64 0, i64 0		; <i8*> [#uses=1]
 	%shl_s = getelementptr [13 x i8], [13 x i8]* @shl_str, i64 0, i64 0		; <i8*> [#uses=1]
 	%shr_s = getelementptr [13 x i8], [13 x i8]* @shr_str, i64 0, i64 0		; <i8*> [#uses=1]
-	call i32 (i8*, ...)* @printf( i8* %and_s, i32 %and_r )		; <i32>:14 [#uses=0]
-	call i32 (i8*, ...)* @printf( i8* %or_s, i32 %or_r )		; <i32>:15 [#uses=0]
-	call i32 (i8*, ...)* @printf( i8* %xor_s, i32 %xor_r )		; <i32>:16 [#uses=0]
-	call i32 (i8*, ...)* @printf( i8* %shl_s, i32 %shl_r )		; <i32>:17 [#uses=0]
-	call i32 (i8*, ...)* @printf( i8* %shr_s, i32 %shr_r )		; <i32>:18 [#uses=0]
+	call i32 (i8*, ...) @printf( i8* %and_s, i32 %and_r )		; <i32>:14 [#uses=0]
+	call i32 (i8*, ...) @printf( i8* %or_s, i32 %or_r )		; <i32>:15 [#uses=0]
+	call i32 (i8*, ...) @printf( i8* %xor_s, i32 %xor_r )		; <i32>:16 [#uses=0]
+	call i32 (i8*, ...) @printf( i8* %shl_s, i32 %shl_r )		; <i32>:17 [#uses=0]
+	call i32 (i8*, ...) @printf( i8* %shr_s, i32 %shr_r )		; <i32>:18 [#uses=0]
 	ret i32 0
 }
diff --git a/test/CodeGen/Generic/print-int.ll b/test/CodeGen/Generic/print-int.ll
index 0afc0e9..85b40c0 100644
--- a/test/CodeGen/Generic/print-int.ll
+++ b/test/CodeGen/Generic/print-int.ll
@@ -7,7 +7,7 @@ declare i32 @printf(i8*, ...)
 define i32 @main() {
         %f = getelementptr [4 x i8], [4 x i8]* @.str_1, i64 0, i64 0              ; <i8*> [#uses=1]
         %d = add i32 0, 0               ; <i32> [#uses=1]
-        %tmp.0 = call i32 (i8*, ...)* @printf( i8* %f, i32 %d )         ; <i32> [#uses=0]
+        %tmp.0 = call i32 (i8*, ...) @printf( i8* %f, i32 %d )         ; <i32> [#uses=0]
         ret i32 0
 }
 
diff --git a/test/CodeGen/Generic/print-mul-exp.ll b/test/CodeGen/Generic/print-mul-exp.ll
index a08333d..91c8147 100644
--- a/test/CodeGen/Generic/print-mul-exp.ll
+++ b/test/CodeGen/Generic/print-mul-exp.ll
@@ -10,7 +10,7 @@ define i32 @main() {
 	%a = load i32, i32* @A		; <i32> [#uses=21]
 	%a_s = getelementptr [8 x i8], [8 x i8]* @a_str, i64 0, i64 0		; <i8*> [#uses=1]
 	%a_mul_s = getelementptr [13 x i8], [13 x i8]* @a_mul_str, i64 0, i64 0		; <i8*> [#uses=20]
-	call i32 (i8*, ...)* @printf( i8* %a_s, i32 %a )		; <i32>:1 [#uses=0]
+	call i32 (i8*, ...) @printf( i8* %a_s, i32 %a )		; <i32>:1 [#uses=0]
 	%r_0 = mul i32 %a, 0		; <i32> [#uses=1]
 	%r_1 = mul i32 %a, 1		; <i32> [#uses=1]
 	%r_2 = mul i32 %a, 2		; <i32> [#uses=1]
@@ -31,25 +31,25 @@ define i32 @main() {
 	%r_17 = mul i32 %a, 17		; <i32> [#uses=1]
 	%r_18 = mul i32 %a, 18		; <i32> [#uses=1]
 	%r_19 = mul i32 %a, 19		; <i32> [#uses=1]
-	call i32 (i8*, ...)* @printf( i8* %a_mul_s, i32 0, i32 %r_0 )		; <i32>:2 [#uses=0]
-	call i32 (i8*, ...)* @printf( i8* %a_mul_s, i32 1, i32 %r_1 )		; <i32>:3 [#uses=0]
-	call i32 (i8*, ...)* @printf( i8* %a_mul_s, i32 2, i32 %r_2 )		; <i32>:4 [#uses=0]
-	call i32 (i8*, ...)* @printf( i8* %a_mul_s, i32 3, i32 %r_3 )		; <i32>:5 [#uses=0]
-	call i32 (i8*, ...)* @printf( i8* %a_mul_s, i32 4, i32 %r_4 )		; <i32>:6 [#uses=0]
-	call i32 (i8*, ...)* @printf( i8* %a_mul_s, i32 5, i32 %r_5 )		; <i32>:7 [#uses=0]
-	call i32 (i8*, ...)* @printf( i8* %a_mul_s, i32 6, i32 %r_6 )		; <i32>:8 [#uses=0]
-	call i32 (i8*, ...)* @printf( i8* %a_mul_s, i32 7, i32 %r_7 )		; <i32>:9 [#uses=0]
-	call i32 (i8*, ...)* @printf( i8* %a_mul_s, i32 8, i32 %r_8 )		; <i32>:10 [#uses=0]
-	call i32 (i8*, ...)* @printf( i8* %a_mul_s, i32 9, i32 %r_9 )		; <i32>:11 [#uses=0]
-	call i32 (i8*, ...)* @printf( i8* %a_mul_s, i32 10, i32 %r_10 )		; <i32>:12 [#uses=0]
-	call i32 (i8*, ...)* @printf( i8* %a_mul_s, i32 11, i32 %r_11 )		; <i32>:13 [#uses=0]
-	call i32 (i8*, ...)* @printf( i8* %a_mul_s, i32 12, i32 %r_12 )		; <i32>:14 [#uses=0]
-	call i32 (i8*, ...)* @printf( i8* %a_mul_s, i32 13, i32 %r_13 )		; <i32>:15 [#uses=0]
-	call i32 (i8*, ...)* @printf( i8* %a_mul_s, i32 14, i32 %r_14 )		; <i32>:16 [#uses=0]
-	call i32 (i8*, ...)* @printf( i8* %a_mul_s, i32 15, i32 %r_15 )		; <i32>:17 [#uses=0]
-	call i32 (i8*, ...)* @printf( i8* %a_mul_s, i32 16, i32 %r_16 )		; <i32>:18 [#uses=0]
-	call i32 (i8*, ...)* @printf( i8* %a_mul_s, i32 17, i32 %r_17 )		; <i32>:19 [#uses=0]
-	call i32 (i8*, ...)* @printf( i8* %a_mul_s, i32 18, i32 %r_18 )		; <i32>:20 [#uses=0]
-	call i32 (i8*, ...)* @printf( i8* %a_mul_s, i32 19, i32 %r_19 )		; <i32>:21 [#uses=0]
+	call i32 (i8*, ...) @printf( i8* %a_mul_s, i32 0, i32 %r_0 )		; <i32>:2 [#uses=0]
+	call i32 (i8*, ...) @printf( i8* %a_mul_s, i32 1, i32 %r_1 )		; <i32>:3 [#uses=0]
+	call i32 (i8*, ...) @printf( i8* %a_mul_s, i32 2, i32 %r_2 )		; <i32>:4 [#uses=0]
+	call i32 (i8*, ...) @printf( i8* %a_mul_s, i32 3, i32 %r_3 )		; <i32>:5 [#uses=0]
+	call i32 (i8*, ...) @printf( i8* %a_mul_s, i32 4, i32 %r_4 )		; <i32>:6 [#uses=0]
+	call i32 (i8*, ...) @printf( i8* %a_mul_s, i32 5, i32 %r_5 )		; <i32>:7 [#uses=0]
+	call i32 (i8*, ...) @printf( i8* %a_mul_s, i32 6, i32 %r_6 )		; <i32>:8 [#uses=0]
+	call i32 (i8*, ...) @printf( i8* %a_mul_s, i32 7, i32 %r_7 )		; <i32>:9 [#uses=0]
+	call i32 (i8*, ...) @printf( i8* %a_mul_s, i32 8, i32 %r_8 )		; <i32>:10 [#uses=0]
+	call i32 (i8*, ...) @printf( i8* %a_mul_s, i32 9, i32 %r_9 )		; <i32>:11 [#uses=0]
+	call i32 (i8*, ...) @printf( i8* %a_mul_s, i32 10, i32 %r_10 )		; <i32>:12 [#uses=0]
+	call i32 (i8*, ...) @printf( i8* %a_mul_s, i32 11, i32 %r_11 )		; <i32>:13 [#uses=0]
+	call i32 (i8*, ...) @printf( i8* %a_mul_s, i32 12, i32 %r_12 )		; <i32>:14 [#uses=0]
+	call i32 (i8*, ...) @printf( i8* %a_mul_s, i32 13, i32 %r_13 )		; <i32>:15 [#uses=0]
+	call i32 (i8*, ...) @printf( i8* %a_mul_s, i32 14, i32 %r_14 )		; <i32>:16 [#uses=0]
+	call i32 (i8*, ...) @printf( i8* %a_mul_s, i32 15, i32 %r_15 )		; <i32>:17 [#uses=0]
+	call i32 (i8*, ...) @printf( i8* %a_mul_s, i32 16, i32 %r_16 )		; <i32>:18 [#uses=0]
+	call i32 (i8*, ...) @printf( i8* %a_mul_s, i32 17, i32 %r_17 )		; <i32>:19 [#uses=0]
+	call i32 (i8*, ...) @printf( i8* %a_mul_s, i32 18, i32 %r_18 )		; <i32>:20 [#uses=0]
+	call i32 (i8*, ...) @printf( i8* %a_mul_s, i32 19, i32 %r_19 )		; <i32>:21 [#uses=0]
 	ret i32 0
 }
diff --git a/test/CodeGen/Generic/print-mul.ll b/test/CodeGen/Generic/print-mul.ll
index 06f2b40..4b60d75 100644
--- a/test/CodeGen/Generic/print-mul.ll
+++ b/test/CodeGen/Generic/print-mul.ll
@@ -15,14 +15,14 @@ entry:
 	%a_s = getelementptr [8 x i8], [8 x i8]* @a_str, i64 0, i64 0		; <i8*> [#uses=1]
 	%b_s = getelementptr [8 x i8], [8 x i8]* @b_str, i64 0, i64 0		; <i8*> [#uses=1]
 	%a_mul_s = getelementptr [13 x i8], [13 x i8]* @a_mul_str, i64 0, i64 0		; <i8*> [#uses=1]
-	call i32 (i8*, ...)* @printf( i8* %a_s, i32 %a )		; <i32>:0 [#uses=0]
-	call i32 (i8*, ...)* @printf( i8* %b_s, i32 %b )		; <i32>:1 [#uses=0]
+	call i32 (i8*, ...) @printf( i8* %a_s, i32 %a )		; <i32>:0 [#uses=0]
+	call i32 (i8*, ...) @printf( i8* %b_s, i32 %b )		; <i32>:1 [#uses=0]
 	br label %shl_test
 
 shl_test:		; preds = %shl_test, %entry
 	%s = phi i32 [ 0, %entry ], [ %s_inc, %shl_test ]		; <i32> [#uses=4]
 	%result = mul i32 %a, %s		; <i32> [#uses=1]
-	call i32 (i8*, ...)* @printf( i8* %a_mul_s, i32 %s, i32 %result )		; <i32>:2 [#uses=0]
+	call i32 (i8*, ...) @printf( i8* %a_mul_s, i32 %s, i32 %result )		; <i32>:2 [#uses=0]
 	%s_inc = add i32 %s, 1		; <i32> [#uses=1]
 	%done = icmp eq i32 %s, 256		; <i1> [#uses=1]
 	br i1 %done, label %fini, label %shl_test
diff --git a/test/CodeGen/Generic/print-shift.ll b/test/CodeGen/Generic/print-shift.ll
index af14f77..56b3ec1 100644
--- a/test/CodeGen/Generic/print-shift.ll
+++ b/test/CodeGen/Generic/print-shift.ll
@@ -15,15 +15,15 @@ entry:
         %a_s = getelementptr [8 x i8], [8 x i8]* @a_str, i64 0, i64 0             ; <i8*> [#uses=1]
         %b_s = getelementptr [8 x i8], [8 x i8]* @b_str, i64 0, i64 0             ; <i8*> [#uses=1]
         %a_shl_s = getelementptr [14 x i8], [14 x i8]* @a_shl_str, i64 0, i64 0            ; <i8*> [#uses=1]
-        call i32 (i8*, ...)* @printf( i8* %a_s, i32 %a )                ; <i32>:0 [#uses=0]
-        call i32 (i8*, ...)* @printf( i8* %b_s, i32 %b )                ; <i32>:1 [#uses=0]
+        call i32 (i8*, ...) @printf( i8* %a_s, i32 %a )                ; <i32>:0 [#uses=0]
+        call i32 (i8*, ...) @printf( i8* %b_s, i32 %b )                ; <i32>:1 [#uses=0]
         br label %shl_test
 
 shl_test:               ; preds = %shl_test, %entry
         %s = phi i8 [ 0, %entry ], [ %s_inc, %shl_test ]                ; <i8> [#uses=4]
         %shift.upgrd.1 = zext i8 %s to i32              ; <i32> [#uses=1]
         %result = shl i32 %a, %shift.upgrd.1            ; <i32> [#uses=1]
-        call i32 (i8*, ...)* @printf( i8* %a_shl_s, i8 %s, i32 %result )                ; <i32>:2 [#uses=0]
+        call i32 (i8*, ...) @printf( i8* %a_shl_s, i8 %s, i32 %result )                ; <i32>:2 [#uses=0]
         %s_inc = add i8 %s, 1           ; <i8> [#uses=1]
         %done = icmp eq i8 %s, 32               ; <i1> [#uses=1]
         br i1 %done, label %fini, label %shl_test
diff --git a/test/CodeGen/Hexagon/adde.ll b/test/CodeGen/Hexagon/adde.ll
index 6d060c1..5a8345c 100644
--- a/test/CodeGen/Hexagon/adde.ll
+++ b/test/CodeGen/Hexagon/adde.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=hexagon < %s | FileCheck %s
+; RUN: llc -march=hexagon -hexagon-expand-condsets=0 < %s | FileCheck %s
 
 ; CHECK: r{{[0-9]+:[0-9]+}} = #0
 ; CHECK: r{{[0-9]+:[0-9]+}} = #1
diff --git a/test/CodeGen/Hexagon/expand-condsets-basic.ll b/test/CodeGen/Hexagon/expand-condsets-basic.ll
new file mode 100644
index 0000000..16fe8af
--- /dev/null
+++ b/test/CodeGen/Hexagon/expand-condsets-basic.ll
@@ -0,0 +1,11 @@
+; RUN: llc -march=hexagon < %s | FileCheck %s
+; CHECK: if{{.*}}add
+; CHECK: if{{.*}}sub
+
+define i32 @foo (i1 %a, i32 %b, i32 %c, i32 %d) nounwind {
+  %1 = add i32 %b, %d
+  %2 = sub i32 %c, %d
+  %3 = select i1 %a, i32 %1, i32 %2
+  ret i32 %3
+}
+
diff --git a/test/CodeGen/Hexagon/expand-condsets-rm-segment.ll b/test/CodeGen/Hexagon/expand-condsets-rm-segment.ll
new file mode 100644
index 0000000..cde7e6a
--- /dev/null
+++ b/test/CodeGen/Hexagon/expand-condsets-rm-segment.ll
@@ -0,0 +1,131 @@
+; RUN: llc -O2 < %s
+; REQUIRES: asserts
+
+target datalayout = "e-m:e-p:32:32-i1:32-i64:64-a:0-v32:32-n16:32"
+target triple = "hexagon-unknown--elf"
+
+%struct.cpumask = type { [1 x i32] }
+%struct.load_weight = type { i32, i32 }
+
+@sysctl_sched_latency = global i32 6000000, align 4
+@normalized_sysctl_sched_latency = global i32 6000000, align 4
+@sysctl_sched_tunable_scaling = global i8 1, align 1
+@sysctl_sched_min_granularity = global i32 750000, align 4
+@normalized_sysctl_sched_min_granularity = global i32 750000, align 4
+@sysctl_sched_wakeup_granularity = global i32 1000000, align 4
+@normalized_sysctl_sched_wakeup_granularity = global i32 1000000, align 4
+@sysctl_sched_migration_cost = constant i32 500000, align 4
+@sysctl_sched_shares_window = global i32 10000000, align 4
+@sysctl_sched_child_runs_first = common global i32 0, align 4
+@cpu_online_mask = external constant %struct.cpumask*
+
+; Function Attrs: noinline nounwind
+define void @sched_init_granularity() #0 {
+entry:
+  tail call fastcc void @update_sysctl()
+  ret void
+}
+
+; Function Attrs: noinline nounwind
+define internal fastcc void @update_sysctl() #0 {
+entry:
+  %call = tail call i32 @get_update_sysctl_factor()
+  %0 = load i32, i32* @normalized_sysctl_sched_min_granularity, align 4, !tbaa !1
+  %mul = mul i32 %0, %call
+  store i32 %mul, i32* @sysctl_sched_min_granularity, align 4, !tbaa !1
+  %1 = load i32, i32* @normalized_sysctl_sched_latency, align 4, !tbaa !1
+  %mul1 = mul i32 %1, %call
+  store i32 %mul1, i32* @sysctl_sched_latency, align 4, !tbaa !1
+  %2 = load i32, i32* @normalized_sysctl_sched_wakeup_granularity, align 4, !tbaa !1
+  %mul2 = mul i32 %2, %call
+  store i32 %mul2, i32* @sysctl_sched_wakeup_granularity, align 4, !tbaa !1
+  ret void
+}
+
+; Function Attrs: noinline nounwind
+define i32 @calc_delta_mine(i32 %delta_exec, i32 %weight, %struct.load_weight* nocapture %lw) #0 {
+entry:
+  %cmp = icmp ugt i32 %weight, 1
+  %conv = zext i32 %delta_exec to i64
+  br i1 %cmp, label %if.then, label %if.end, !prof !5
+
+if.then:                                          ; preds = %entry
+  %conv2 = zext i32 %weight to i64
+  %mul = mul i64 %conv2, %conv
+  br label %if.end
+
+if.end:                                           ; preds = %entry, %if.then
+  %tmp.0 = phi i64 [ %mul, %if.then ], [ %conv, %entry ]
+  %inv_weight = getelementptr inbounds %struct.load_weight, %struct.load_weight* %lw, i32 0, i32 1
+  %0 = load i32, i32* %inv_weight, align 4, !tbaa !6
+  %tobool4 = icmp eq i32 %0, 0
+  br i1 %tobool4, label %if.then5, label %if.end22
+
+if.then5:                                         ; preds = %if.end
+  %weight7 = getelementptr inbounds %struct.load_weight, %struct.load_weight* %lw, i32 0, i32 0
+  %1 = load i32, i32* %weight7, align 4, !tbaa !9
+  %lnot9 = icmp eq i32 %1, 0
+  br i1 %lnot9, label %if.then17, label %if.else19, !prof !10
+
+if.then17:                                        ; preds = %if.then5
+  store i32 -1, i32* %inv_weight, align 4, !tbaa !6
+  br label %if.end22
+
+if.else19:                                        ; preds = %if.then5
+  %div = udiv i32 -1, %1
+  store i32 %div, i32* %inv_weight, align 4, !tbaa !6
+  br label %if.end22
+
+if.end22:                                         ; preds = %if.end, %if.then17, %if.else19
+  %2 = phi i32 [ %0, %if.end ], [ -1, %if.then17 ], [ %div, %if.else19 ]
+  %cmp23 = icmp ugt i64 %tmp.0, 4294967295
+  br i1 %cmp23, label %if.then31, label %if.else37, !prof !10
+
+if.then31:                                        ; preds = %if.end22
+  %add = add i64 %tmp.0, 32768
+  %shr = lshr i64 %add, 16
+  %conv33 = zext i32 %2 to i64
+  %mul34 = mul i64 %conv33, %shr
+  %add35 = add i64 %mul34, 32768
+  %shr36 = lshr i64 %add35, 16
+  br label %if.end43
+
+if.else37:                                        ; preds = %if.end22
+  %conv39 = zext i32 %2 to i64
+  %mul40 = mul i64 %conv39, %tmp.0
+  %add41 = add i64 %mul40, 2147483648
+  %shr42 = lshr i64 %add41, 32
+  br label %if.end43
+
+if.end43:                                         ; preds = %if.else37, %if.then31
+  %tmp.1 = phi i64 [ %shr36, %if.then31 ], [ %shr42, %if.else37 ]
+  %cmp49 = icmp ult i64 %tmp.1, 2147483647
+  %3 = trunc i64 %tmp.1 to i32
+  %conv51 = select i1 %cmp49, i32 %3, i32 2147483647
+  ret i32 %conv51
+}
+
+declare i32 @get_update_sysctl_factor() #0
+declare i32 @__bitmap_weight(i32*, i32) #1
+
+attributes #0 = { noinline nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nounwind }
+
+!llvm.ident = !{!0}
+
+!0 = !{!"Clang 3.1"}
+!1 = !{!2, !2, i64 0}
+!2 = !{!"int", !3, i64 0}
+!3 = !{!"omnipotent char", !4, i64 0}
+!4 = !{!"Simple C/C++ TBAA"}
+!5 = !{!"branch_weights", i32 64, i32 4}
+!6 = !{!7, !8, i64 4}
+!7 = !{!"load_weight", !8, i64 0, !8, i64 4}
+!8 = !{!"long", !3, i64 0}
+!9 = !{!7, !8, i64 0}
+!10 = !{!"branch_weights", i32 4, i32 64}
+!11 = !{!12, !12, i64 0}
+!12 = !{!"any pointer", !3, i64 0}
+!13 = !{!3, !3, i64 0}
+!14 = !{i32 45854, i32 45878}
diff --git a/test/CodeGen/Hexagon/expand-condsets-undef.ll b/test/CodeGen/Hexagon/expand-condsets-undef.ll
new file mode 100644
index 0000000..85e72aa
--- /dev/null
+++ b/test/CodeGen/Hexagon/expand-condsets-undef.ll
@@ -0,0 +1,28 @@
+; RUN: llc -O2 < %s
+; REQUIRES: asserts
+
+target datalayout = "e-m:e-p:32:32-i64:64-a:0-v32:32-n16:32"
+target triple = "hexagon"
+
+; Function Attrs: nounwind optsize ssp
+define internal fastcc void @foo() nounwind {
+if.else473:
+  %0 = load i64, i64* undef, align 8
+  %sub = sub nsw i64 undef, %0
+  %conv476 = sitofp i64 %sub to double
+  %mul477 = fmul double %conv476, 0x3F50624DE0000000
+  br i1 undef, label %cond.true540, label %cond.end548
+
+cond.true540:
+  %1 = fptrunc double %mul477 to float
+  %2 = fptosi float %1 to i32
+  br label %cond.end548
+
+cond.end548:
+  %cond549 = phi i32 [ %2, %cond.true540 ], [ undef, %if.else473 ]
+  call void @bar(i32 %cond549) nounwind
+  unreachable
+}
+
+declare void @bar(i32) nounwind
+
diff --git a/test/CodeGen/Hexagon/i16_VarArg.ll b/test/CodeGen/Hexagon/i16_VarArg.ll
index 41cecec..ba98f62 100644
--- a/test/CodeGen/Hexagon/i16_VarArg.ll
+++ b/test/CodeGen/Hexagon/i16_VarArg.ll
@@ -35,6 +35,6 @@ define i32 @main() {
         %ge_s = getelementptr [13 x i8], [13 x i8]* @ge_str, i64 0, i64 0
         %eq_s = getelementptr [13 x i8], [13 x i8]* @eq_str, i64 0, i64 0
         %ne_s = getelementptr [13 x i8], [13 x i8]* @ne_str, i64 0, i64 0
-        call i32 (i8*, ...)* @printf( i8* %lt_s, i16 %val1 )
+        call i32 (i8*, ...) @printf( i8* %lt_s, i16 %val1 )
         ret i32 0
 }
diff --git a/test/CodeGen/Hexagon/i1_VarArg.ll b/test/CodeGen/Hexagon/i1_VarArg.ll
index 8b5625c..1908b3c 100644
--- a/test/CodeGen/Hexagon/i1_VarArg.ll
+++ b/test/CodeGen/Hexagon/i1_VarArg.ll
@@ -34,11 +34,11 @@ define i32 @main() {
         %ge_s = getelementptr [13 x i8], [13 x i8]* @ge_str, i64 0, i64 0
         %eq_s = getelementptr [13 x i8], [13 x i8]* @eq_str, i64 0, i64 0
         %ne_s = getelementptr [13 x i8], [13 x i8]* @ne_str, i64 0, i64 0
-        call i32 (i8*, ...)* @printf( i8* %lt_s, i1 %lt_r )
-        call i32 (i8*, ...)* @printf( i8* %le_s, i1 %le_r )
-        call i32 (i8*, ...)* @printf( i8* %gt_s, i1 %gt_r )
-        call i32 (i8*, ...)* @printf( i8* %ge_s, i1 %ge_r )
-        call i32 (i8*, ...)* @printf( i8* %eq_s, i1 %eq_r )
-        call i32 (i8*, ...)* @printf( i8* %ne_s, i1 %ne_r )
+        call i32 (i8*, ...) @printf( i8* %lt_s, i1 %lt_r )
+        call i32 (i8*, ...) @printf( i8* %le_s, i1 %le_r )
+        call i32 (i8*, ...) @printf( i8* %gt_s, i1 %gt_r )
+        call i32 (i8*, ...) @printf( i8* %ge_s, i1 %ge_r )
+        call i32 (i8*, ...) @printf( i8* %eq_s, i1 %eq_r )
+        call i32 (i8*, ...) @printf( i8* %ne_s, i1 %ne_r )
         ret i32 0
 }
diff --git a/test/CodeGen/Hexagon/i8_VarArg.ll b/test/CodeGen/Hexagon/i8_VarArg.ll
index 7283ba4..c40a6a9 100644
--- a/test/CodeGen/Hexagon/i8_VarArg.ll
+++ b/test/CodeGen/Hexagon/i8_VarArg.ll
@@ -35,6 +35,6 @@ define i32 @main() {
         %ge_s = getelementptr [13 x i8], [13 x i8]* @ge_str, i64 0, i64 0
         %eq_s = getelementptr [13 x i8], [13 x i8]* @eq_str, i64 0, i64 0
         %ne_s = getelementptr [13 x i8], [13 x i8]* @ne_str, i64 0, i64 0
-        call i32 (i8*, ...)* @printf( i8* %lt_s, i8 %val1 )
+        call i32 (i8*, ...) @printf( i8* %lt_s, i8 %val1 )
         ret i32 0
 }
diff --git a/test/CodeGen/Hexagon/sube.ll b/test/CodeGen/Hexagon/sube.ll
index 735ac9e..1a78822 100644
--- a/test/CodeGen/Hexagon/sube.ll
+++ b/test/CodeGen/Hexagon/sube.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=hexagon < %s | FileCheck %s
+; RUN: llc -march=hexagon -hexagon-expand-condsets=0 < %s | FileCheck %s
 
 ; CHECK: r{{[0-9]+:[0-9]+}} = #0
 ; CHECK: r{{[0-9]+:[0-9]+}} = #1
diff --git a/test/CodeGen/Hexagon/tail-call-mem-intrinsics.ll b/test/CodeGen/Hexagon/tail-call-mem-intrinsics.ll
new file mode 100644
index 0000000..90fb75e
--- /dev/null
+++ b/test/CodeGen/Hexagon/tail-call-mem-intrinsics.ll
@@ -0,0 +1,31 @@
+; RUN: llc -march=hexagon < %s | FileCheck %s
+
+; CHECK-LABEL: tail_memcpy:
+; CHECK: jump memcpy
+define void @tail_memcpy(i8* nocapture %p, i8* nocapture readonly %q, i32 %n) #0 {
+entry:
+  tail call void @llvm.memcpy.p0i8.p0i8.i32(i8* %p, i8* %q, i32 %n, i32 1, i1 false)
+  ret void
+}
+
+; CHECK-LABEL: tail_memmove:
+; CHECK: jump memmove
+define void @tail_memmove(i8* nocapture %p, i8* nocapture readonly %q, i32 %n) #0 {
+entry:
+  tail call void @llvm.memmove.p0i8.p0i8.i32(i8* %p, i8* %q, i32 %n, i32 1, i1 false)
+  ret void
+}
+
+; CHECK-LABEL: tail_memset:
+; CHECK: jump memset
+define void @tail_memset(i8* nocapture %p, i8 %c, i32 %n) #0 {
+entry:
+  tail call void @llvm.memset.p0i8.i32(i8* %p, i8 %c, i32 %n, i32 1, i1 false)
+  ret void
+}
+
+declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture readonly, i32, i32, i1) #0
+declare void @llvm.memmove.p0i8.p0i8.i32(i8* nocapture, i8* nocapture readonly, i32, i32, i1) #0
+declare void @llvm.memset.p0i8.i32(i8* nocapture, i8, i32, i32, i1) #0
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/Inputs/DbgValueOtherTargets.ll b/test/CodeGen/Inputs/DbgValueOtherTargets.ll
index fc44a98..0af0293 100644
--- a/test/CodeGen/Inputs/DbgValueOtherTargets.ll
+++ b/test/CodeGen/Inputs/DbgValueOtherTargets.ll
@@ -20,7 +20,7 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnon
 !3 = !MDSubroutineType(types: !4)
 !4 = !{!5}
 !5 = !MDBasicType(tag: DW_TAG_base_type, name: "int", size: 32, align: 32, encoding: DW_ATE_signed)
-!6 = !{i32 0}
+!6 = !{}
 !7 = !MDLocalVariable(tag: DW_TAG_auto_variable, name: "i", line: 3, scope: !8, file: !1, type: !5)
 !8 = distinct !MDLexicalBlock(line: 2, column: 12, file: !12, scope: !0)
 !9 = !MDLocation(line: 3, column: 11, scope: !8)
diff --git a/test/CodeGen/Mips/Fast-ISel/logopm.ll b/test/CodeGen/Mips/Fast-ISel/logopm.ll
index cfb751f..0f0c3bf 100644
--- a/test/CodeGen/Mips/Fast-ISel/logopm.ll
+++ b/test/CodeGen/Mips/Fast-ISel/logopm.ll
@@ -68,11 +68,12 @@ entry:
 
 ; Function Attrs: noinline nounwind
 define void @andUb1() #0 {
+; clang uses i8 constants for booleans, so we test with an i8 1.
 entry:
-  %0 = load i8, i8* @ub1, align 1, !tbaa !2
-  %conv = trunc i8 %0 to i1
-  %and = and i1 %conv, 1
-  %conv1 = zext i1 %and to i8
+  %x = load i8, i8* @ub1, align 1, !tbaa !2
+  %and = and i8 %x, 1
+  %conv = trunc i8 %and to i1
+  %conv1 = zext i1 %conv to i8
   store i8 %conv1, i8* @ub, align 1, !tbaa !2
 ; CHECK-LABEL:  .ent    andUb1
 ; CHECK:        lui     $[[REG_GPa:[0-9]+]], %hi(_gp_disp)
@@ -138,10 +139,10 @@ entry:
 ; Function Attrs: noinline nounwind
 define void @orUb1() #0 {
 entry:
-  %0 = load i8, i8* @ub1, align 1, !tbaa !2
-  %conv = trunc i8 %0 to i1
-  %or = or i1 %conv, 1
-  %conv1 = zext i1 %or to i8
+  %x = load i8, i8* @ub1, align 1, !tbaa !2
+  %or = or i8 %x, 1
+  %conv = trunc i8 %or to i1
+  %conv1 = zext i1 %conv to i8
   store i8 %conv1, i8* @ub, align 1, !tbaa !2
 ; CHECK-LABEL:  .ent    orUb1
 ; CHECK:        lui     $[[REG_GPa:[0-9]+]], %hi(_gp_disp)
@@ -208,10 +209,10 @@ entry:
 ; Function Attrs: noinline nounwind
 define void @xorUb1() #0 {
 entry:
-  %0 = load i8, i8* @ub1, align 1, !tbaa !2
-  %conv = trunc i8 %0 to i1
-  %xor = xor i1 %conv, 1
-  %conv1 = zext i1 %xor to i8
+  %x = load i8, i8* @ub1, align 1, !tbaa !2
+  %xor = xor i8 1, %x
+  %conv = trunc i8 %xor to i1
+  %conv1 = zext i1 %conv to i8
   store i8 %conv1, i8* @ub, align 1, !tbaa !2
 ; CHECK-LABEL:  .ent    xorUb1
 ; CHECK:        lui     $[[REG_GPa:[0-9]+]], %hi(_gp_disp)
diff --git a/test/CodeGen/Mips/adjust-callstack-sp.ll b/test/CodeGen/Mips/adjust-callstack-sp.ll
new file mode 100644
index 0000000..8c61a65
--- /dev/null
+++ b/test/CodeGen/Mips/adjust-callstack-sp.ll
@@ -0,0 +1,20 @@
+; RUN: llc < %s -march=mips -mcpu=mips16 | FileCheck %s -check-prefix=M16
+; RUN: llc < %s -march=mips -mcpu=mips2 | FileCheck %s -check-prefix=GP32
+; RUN: llc < %s -march=mips -mcpu=mips32 | FileCheck %s -check-prefix=GP32
+; RUN: llc < %s -march=mips -mcpu=mips32r6 | FileCheck %s -check-prefix=GP32
+; RUN: llc < %s -march=mips -mcpu=mips3 | FileCheck %s -check-prefix=GP64
+; RUN: llc < %s -march=mips -mcpu=mips64 | FileCheck %s -check-prefix=GP64
+; RUN: llc < %s -march=mips -mcpu=mips64r6 | FileCheck %s -check-prefix=GP64
+
+declare void @bar(i32*)
+
+define void @foo(i32 %sz) {
+  ; ALL-LABEL: foo:
+
+    ; M16-NOT:        addiu     $sp, 0 # 16 bit inst
+    ; GP32-NOT:       addiu     $sp, $sp, 0
+    ; GP64-NOT:       daddiu    $sp, $sp, 0
+  %a = alloca i32, i32 %sz
+  call void @bar(i32* %a)
+  ret void
+}
diff --git a/test/CodeGen/Mips/alloca.ll b/test/CodeGen/Mips/alloca.ll
index 8967d57..747a136 100644
--- a/test/CodeGen/Mips/alloca.ll
+++ b/test/CodeGen/Mips/alloca.ll
@@ -76,7 +76,7 @@ if.end:                                           ; preds = %if.else, %if.then
   %arrayidx24 = getelementptr inbounds i8, i8* %tmp1, i32 24
   %7 = bitcast i8* %arrayidx24 to i32*
   %tmp25 = load i32, i32* %7, align 4
-  %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([22 x i8], [22 x i8]* @.str, i32 0, i32 0), i32 %tmp7, i32 %tmp10, i32 %tmp13, i32 %tmp16, i32 %tmp19, i32 %tmp22, i32 %tmp25) nounwind
+  %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([22 x i8], [22 x i8]* @.str, i32 0, i32 0), i32 %tmp7, i32 %tmp10, i32 %tmp13, i32 %tmp16, i32 %tmp19, i32 %tmp22, i32 %tmp25) nounwind
   ret i32 0
 }
 
diff --git a/test/CodeGen/Mips/analyzebranch.ll b/test/CodeGen/Mips/analyzebranch.ll
index 4b5d097..d5ecaae 100644
--- a/test/CodeGen/Mips/analyzebranch.ll
+++ b/test/CodeGen/Mips/analyzebranch.ll
@@ -60,7 +60,7 @@ if.then:                                          ; preds = %entry
   unreachable
 
 if.end:                                           ; preds = %entry
-  tail call void (...)* @f2() nounwind
+  tail call void (...) @f2() nounwind
   ret void
 }
 
diff --git a/test/CodeGen/Mips/and1.ll b/test/CodeGen/Mips/and1.ll
index be9ba3e..57076a4 100644
--- a/test/CodeGen/Mips/and1.ll
+++ b/test/CodeGen/Mips/and1.ll
@@ -10,7 +10,7 @@ entry:
   %1 = load i32, i32* @y, align 4
   %and = and i32 %0, %1
 ; 16:	and	${{[0-9]+}}, ${{[0-9]+}}
-  %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([7 x i8], [7 x i8]* @.str, i32 0, i32 0), i32 %and)
+  %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([7 x i8], [7 x i8]* @.str, i32 0, i32 0), i32 %and)
   ret i32 0
 }
 
diff --git a/test/CodeGen/Mips/atomicops.ll b/test/CodeGen/Mips/atomicops.ll
index 920357d..0ff9f5c 100644
--- a/test/CodeGen/Mips/atomicops.ll
+++ b/test/CodeGen/Mips/atomicops.ll
@@ -19,14 +19,14 @@ entry:
   %0 = atomicrmw add i32* %x, i32 1 seq_cst
   %add.i = add nsw i32 %0, 2
   %1 = load volatile i32, i32* %x, align 4
-  %call1 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([8 x i8], [8 x i8]* @.str, i32 0, i32 0), i32 %add.i, i32 %1) nounwind
+  %call1 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([8 x i8], [8 x i8]* @.str, i32 0, i32 0), i32 %add.i, i32 %1) nounwind
   %pair = cmpxchg i32* %x, i32 1, i32 2 seq_cst seq_cst
   %2 = extractvalue { i32, i1 } %pair, 0
   %3 = load volatile i32, i32* %x, align 4
-  %call2 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([8 x i8], [8 x i8]* @.str, i32 0, i32 0), i32 %2, i32 %3) nounwind
+  %call2 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([8 x i8], [8 x i8]* @.str, i32 0, i32 0), i32 %2, i32 %3) nounwind
   %4 = atomicrmw xchg i32* %x, i32 1 seq_cst
   %5 = load volatile i32, i32* %x, align 4
-  %call3 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([8 x i8], [8 x i8]* @.str, i32 0, i32 0), i32 %4, i32 %5) nounwind
+  %call3 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([8 x i8], [8 x i8]* @.str, i32 0, i32 0), i32 %4, i32 %5) nounwind
 ; 16-LABEL: main:
 ; 16:	lw	${{[0-9]+}}, %call16(__sync_synchronize)(${{[0-9]+}})
 ; 16: 	lw	${{[0-9]+}}, %call16(__sync_fetch_and_add_4)(${{[0-9]+}})
diff --git a/test/CodeGen/Mips/cache-intrinsic.ll b/test/CodeGen/Mips/cache-intrinsic.ll
index 461c181..987032e 100644
--- a/test/CodeGen/Mips/cache-intrinsic.ll
+++ b/test/CodeGen/Mips/cache-intrinsic.ll
@@ -10,10 +10,10 @@ define i32 @main() {
 entry:
   %retval = alloca i32, align 4
   store i32 0, i32* %retval
-  %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i8* getelementptr inbounds ([32 x i8], [32 x i8]* @buffer, i32 0, i32 0))
+  %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i8* getelementptr inbounds ([32 x i8], [32 x i8]* @buffer, i32 0, i32 0))
   %call1 = call i8* @strcpy(i8* getelementptr inbounds ([32 x i8], [32 x i8]* @buffer, i32 0, i32 0), i8* getelementptr inbounds ([25 x i8], [25 x i8]* @.str1, i32 0, i32 0)) #3
   call void @llvm.clear_cache(i8* getelementptr inbounds ([32 x i8], [32 x i8]* @buffer, i32 0, i32 0), i8* getelementptr inbounds (i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @buffer, i32 0, i32 0), i32 32)) #3
-  %call2 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i8* getelementptr inbounds ([32 x i8], [32 x i8]* @buffer, i32 0, i32 0))
+  %call2 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i8* getelementptr inbounds ([32 x i8], [32 x i8]* @buffer, i32 0, i32 0))
   ret i32 0
 }
 
diff --git a/test/CodeGen/Mips/cconv/arguments-varargs-small-structs-byte.ll b/test/CodeGen/Mips/cconv/arguments-varargs-small-structs-byte.ll
index abb3601..ba3aeb5 100644
--- a/test/CodeGen/Mips/cconv/arguments-varargs-small-structs-byte.ll
+++ b/test/CodeGen/Mips/cconv/arguments-varargs-small-structs-byte.ll
@@ -144,7 +144,7 @@ entry:
   %1 = bitcast %struct.SmallStruct_1b* %0 to { i8 }*
   %2 = getelementptr { i8 }, { i8 }* %1, i32 0, i32 0
   %3 = load i8, i8* %2, align 1
-  call void (i8*, ...)* @varArgF_SmallStruct(i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str, i32 0, i32 0), i8 inreg %3)
+  call void (i8*, ...) @varArgF_SmallStruct(i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str, i32 0, i32 0), i8 inreg %3)
   ret void
  ; CHECK-LABEL: smallStruct_1b: 
  ; CHECK: dsll $[[R1:[0-9]+]], $[[R2:[0-9]+]], 56
@@ -158,7 +158,7 @@ entry:
   %1 = bitcast %struct.SmallStruct_2b* %0 to { i16 }*
   %2 = getelementptr { i16 }, { i16 }* %1, i32 0, i32 0
   %3 = load i16, i16* %2, align 1
-  call void (i8*, ...)* @varArgF_SmallStruct(i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str, i32 0, i32 0), i16 inreg %3)
+  call void (i8*, ...) @varArgF_SmallStruct(i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str, i32 0, i32 0), i16 inreg %3)
   ret void
  ; CHECK-LABEL: smallStruct_2b:
  ; CHECK: dsll $[[R1:[0-9]+]], $[[R2:[0-9]+]], 48
@@ -175,7 +175,7 @@ entry:
   call void @llvm.memcpy.p0i8.p0i8.i64(i8* %1, i8* %2, i64 3, i32 0, i1 false)
   %3 = getelementptr { i24 }, { i24 }* %.coerce, i32 0, i32 0
   %4 = load i24, i24* %3, align 1
-  call void (i8*, ...)* @varArgF_SmallStruct(i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str, i32 0, i32 0), i24 inreg %4)
+  call void (i8*, ...) @varArgF_SmallStruct(i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str, i32 0, i32 0), i24 inreg %4)
   ret void
  ; CHECK-LABEL: smallStruct_3b:
  ; CHECK: dsll $[[R1:[0-9]+]], $[[R2:[0-9]+]], 40
@@ -191,7 +191,7 @@ entry:
   %1 = bitcast %struct.SmallStruct_4b* %0 to { i32 }*
   %2 = getelementptr { i32 }, { i32 }* %1, i32 0, i32 0
   %3 = load i32, i32* %2, align 1
-  call void (i8*, ...)* @varArgF_SmallStruct(i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str, i32 0, i32 0), i32 inreg %3)
+  call void (i8*, ...) @varArgF_SmallStruct(i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str, i32 0, i32 0), i32 inreg %3)
   ret void
  ; CHECK-LABEL: smallStruct_4b:
  ; CHECK: dsll $[[R1:[0-9]+]], $[[R2:[0-9]+]], 32
@@ -208,7 +208,7 @@ entry:
   call void @llvm.memcpy.p0i8.p0i8.i64(i8* %1, i8* %2, i64 5, i32 0, i1 false)
   %3 = getelementptr { i40 }, { i40 }* %.coerce, i32 0, i32 0
   %4 = load i40, i40* %3, align 1
-  call void (i8*, ...)* @varArgF_SmallStruct(i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str, i32 0, i32 0), i40 inreg %4)
+  call void (i8*, ...) @varArgF_SmallStruct(i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str, i32 0, i32 0), i40 inreg %4)
   ret void
  ; CHECK-LABEL: smallStruct_5b:
  ; CHECK: dsll $[[R1:[0-9]+]], $[[R2:[0-9]+]], 24
@@ -225,7 +225,7 @@ entry:
   call void @llvm.memcpy.p0i8.p0i8.i64(i8* %1, i8* %2, i64 6, i32 0, i1 false)
   %3 = getelementptr { i48 }, { i48 }* %.coerce, i32 0, i32 0
   %4 = load i48, i48* %3, align 1
-  call void (i8*, ...)* @varArgF_SmallStruct(i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str, i32 0, i32 0), i48 inreg %4)
+  call void (i8*, ...) @varArgF_SmallStruct(i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str, i32 0, i32 0), i48 inreg %4)
   ret void
  ; CHECK-LABEL: smallStruct_6b:
  ; CHECK: dsll $[[R1:[0-9]+]], $[[R2:[0-9]+]], 16
@@ -242,7 +242,7 @@ entry:
   call void @llvm.memcpy.p0i8.p0i8.i64(i8* %1, i8* %2, i64 7, i32 0, i1 false)
   %3 = getelementptr { i56 }, { i56 }* %.coerce, i32 0, i32 0
   %4 = load i56, i56* %3, align 1
-  call void (i8*, ...)* @varArgF_SmallStruct(i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str, i32 0, i32 0), i56 inreg %4)
+  call void (i8*, ...) @varArgF_SmallStruct(i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str, i32 0, i32 0), i56 inreg %4)
   ret void
  ; CHECK-LABEL: smallStruct_7b:
  ; CHECK: dsll $[[R1:[0-9]+]], $[[R2:[0-9]+]], 8
@@ -256,7 +256,7 @@ entry:
   %1 = bitcast %struct.SmallStruct_8b* %0 to { i64 }*
   %2 = getelementptr { i64 }, { i64 }* %1, i32 0, i32 0
   %3 = load i64, i64* %2, align 1
-  call void (i8*, ...)* @varArgF_SmallStruct(i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str, i32 0, i32 0), i64 inreg %3)
+  call void (i8*, ...) @varArgF_SmallStruct(i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str, i32 0, i32 0), i64 inreg %3)
   ret void
  ; CHECK-LABEL: smallStruct_8b:
  ; CHECK-NOT: dsll
@@ -275,7 +275,7 @@ entry:
   %4 = load i64, i64* %3, align 1
   %5 = getelementptr { i64, i8 }, { i64, i8 }* %.coerce, i32 0, i32 1
   %6 = load i8, i8* %5, align 1
-  call void (i8*, ...)* @varArgF_SmallStruct(i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str, i32 0, i32 0), i64 inreg %4, i8 inreg %6)
+  call void (i8*, ...) @varArgF_SmallStruct(i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str, i32 0, i32 0), i64 inreg %4, i8 inreg %6)
   ret void
  ; CHECK-LABEL: smallStruct_9b:
  ; CHECK: dsll $[[R1:[0-9]+]], $[[R2:[0-9]+]], 56
diff --git a/test/CodeGen/Mips/cconv/arguments-varargs-small-structs-combinations.ll b/test/CodeGen/Mips/cconv/arguments-varargs-small-structs-combinations.ll
index 7da6ab1..74d3d85 100644
--- a/test/CodeGen/Mips/cconv/arguments-varargs-small-structs-combinations.ll
+++ b/test/CodeGen/Mips/cconv/arguments-varargs-small-structs-combinations.ll
@@ -78,7 +78,7 @@ entry:
   %1 = bitcast %struct.SmallStruct_1b1s* %0 to { i32 }*
   %2 = getelementptr { i32 }, { i32 }* %1, i32 0, i32 0
   %3 = load i32, i32* %2, align 1
-  call void (i8*, ...)* @varArgF_SmallStruct(i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str, i32 0, i32 0), i32 inreg %3)
+  call void (i8*, ...) @varArgF_SmallStruct(i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str, i32 0, i32 0), i32 inreg %3)
   ret void
  ; CHECK-LABEL: smallStruct_1b1s:
  ; CHECK: dsll $[[R1:[0-9]+]], $[[R2:[0-9]+]], 32
@@ -92,7 +92,7 @@ entry:
   %1 = bitcast %struct.SmallStruct_1b1i* %0 to { i64 }*
   %2 = getelementptr { i64 }, { i64 }* %1, i32 0, i32 0
   %3 = load i64, i64* %2, align 1
-  call void (i8*, ...)* @varArgF_SmallStruct(i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str, i32 0, i32 0), i64 inreg %3)
+  call void (i8*, ...) @varArgF_SmallStruct(i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str, i32 0, i32 0), i64 inreg %3)
   ret void
  ; CHECK-LABEL: smallStruct_1b1i:
  ; CHECK-NOT: dsll
@@ -109,7 +109,7 @@ entry:
   call void @llvm.memcpy.p0i8.p0i8.i64(i8* %1, i8* %2, i64 6, i32 0, i1 false)
   %3 = getelementptr { i48 }, { i48 }* %.coerce, i32 0, i32 0
   %4 = load i48, i48* %3, align 1
-  call void (i8*, ...)* @varArgF_SmallStruct(i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str, i32 0, i32 0), i48 inreg %4)
+  call void (i8*, ...) @varArgF_SmallStruct(i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str, i32 0, i32 0), i48 inreg %4)
   ret void
  ; CHECK-LABEL: smallStruct_1b1s1b:
  ; CHECK: dsll $[[R1:[0-9]+]], $[[R2:[0-9]+]], 16
@@ -125,7 +125,7 @@ entry:
   %1 = bitcast %struct.SmallStruct_1s1i* %0 to { i64 }*
   %2 = getelementptr { i64 }, { i64 }* %1, i32 0, i32 0
   %3 = load i64, i64* %2, align 1
-  call void (i8*, ...)* @varArgF_SmallStruct(i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str, i32 0, i32 0), i64 inreg %3)
+  call void (i8*, ...) @varArgF_SmallStruct(i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str, i32 0, i32 0), i64 inreg %3)
   ret void
  ; CHECK-LABEL: smallStruct_1s1i:
  ; CHECK-NOT: dsll
@@ -142,7 +142,7 @@ entry:
   call void @llvm.memcpy.p0i8.p0i8.i64(i8* %1, i8* %2, i64 6, i32 0, i1 false)
   %3 = getelementptr { i48 }, { i48 }* %.coerce, i32 0, i32 0
   %4 = load i48, i48* %3, align 1
-  call void (i8*, ...)* @varArgF_SmallStruct(i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str, i32 0, i32 0), i48 inreg %4)
+  call void (i8*, ...) @varArgF_SmallStruct(i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str, i32 0, i32 0), i48 inreg %4)
   ret void
  ; CHECK-LABEL: smallStruct_3b1s:
  ; CHECK: dsll $[[R1:[0-9]+]], $[[R2:[0-9]+]], 16
diff --git a/test/CodeGen/Mips/cconv/arguments-varargs-small-structs-multiple-args.ll b/test/CodeGen/Mips/cconv/arguments-varargs-small-structs-multiple-args.ll
index f70b75f..a4ac5e7 100644
--- a/test/CodeGen/Mips/cconv/arguments-varargs-small-structs-multiple-args.ll
+++ b/test/CodeGen/Mips/cconv/arguments-varargs-small-structs-multiple-args.ll
@@ -146,7 +146,7 @@ entry:
   %33 = bitcast %struct.SmallStruct_1b* %8 to { i8 }*
   %34 = getelementptr { i8 }, { i8 }* %33, i32 0, i32 0
   %35 = load i8, i8* %34, align 1
-  call void (i8*, ...)* @varArgF_SmallStruct(i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str, i32 0, i32 0), i8 inreg %11, i8 inreg %14, i8 inreg %17, i8 inreg %20, i8 inreg %23, i8 inreg %26, i8 inreg %29, i8 inreg %32, i8 inreg %35)
+  call void (i8*, ...) @varArgF_SmallStruct(i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str, i32 0, i32 0), i8 inreg %11, i8 inreg %14, i8 inreg %17, i8 inreg %20, i8 inreg %23, i8 inreg %26, i8 inreg %29, i8 inreg %32, i8 inreg %35)
   ret void
  ; CHECK-LABEL: smallStruct_1b_x9:
  ; CHECK: dsll $[[R1:[0-9]+]], $[[R2:[0-9]+]], 56
diff --git a/test/CodeGen/Mips/cfi_offset.ll b/test/CodeGen/Mips/cfi_offset.ll
index 6e78344..9723332 100644
--- a/test/CodeGen/Mips/cfi_offset.ll
+++ b/test/CodeGen/Mips/cfi_offset.ll
@@ -34,7 +34,7 @@ define void @bar() {
 
     %val1 = load volatile double, double* @var
     %val2 = load volatile double, double* @var
-    call void (...)* @foo() nounwind
+    call void (...) @foo() nounwind
     store volatile double %val1, double* @var
     store volatile double %val2, double* @var
     ret void
diff --git a/test/CodeGen/Mips/dagcombine_crash.ll b/test/CodeGen/Mips/dagcombine_crash.ll
new file mode 100644
index 0000000..6fcf2b4
--- /dev/null
+++ b/test/CodeGen/Mips/dagcombine_crash.ll
@@ -0,0 +1,25 @@
+; RUN: llc -o - %s | FileCheck %s
+; The selection DAG select(select()) normalisation crashed for different types
+; on the condition inputs.
+target datalayout = "E-m:m-p:32:32-i8:8:32-i16:16:32-i64:64-n32-S64"
+target triple = "mips--"
+
+; CHECK-LABEL: foobar
+; CHECK: sltiu ${{[0-9]*}}, ${{[0-9]*}}, 42
+; CHECK: sltiu ${{[0-9]*}}, ${{[0-9]*}}, 23
+; CHECK: and ${{[0-9]*}}, ${{[0-9]*}}, ${{[0-9]*}}
+; CHECK: sltu ${{[0-9]*}}, ${{[0-9]*}}, ${{[0-9]*}}
+; CHECK: addiu ${{[0-9]*}}, ${{[0-9]*}}, -1
+; CHECK: movn ${{[0-9]*}}, ${{[0-9]*}}, ${{[0-9]*}}
+; CHECK: jr $ra
+; CHECK: move ${{[0-9]*}}, ${{[0-9]*}}
+define i64 @foobar(i32 %arg) #0 {
+entry:
+  %cmp0 = icmp ult i32 %arg, 23
+  %cmp1 = icmp ult i32 %arg, 42
+  %and = and i1 %cmp0, %cmp1
+  %cmp2 = icmp ugt i32 %arg, 0
+  %sext = sext i1 %cmp1 to i64
+  %retval.0 = select i1 %and, i64 %sext, i64 0
+  ret i64 %retval.0
+}
diff --git a/test/CodeGen/Mips/eh-return32.ll b/test/CodeGen/Mips/eh-return32.ll
index 748050c..542c5bf 100644
--- a/test/CodeGen/Mips/eh-return32.ll
+++ b/test/CodeGen/Mips/eh-return32.ll
@@ -7,7 +7,7 @@ declare void @foo(...)
 
 define i8* @f1(i32 %offset, i8* %handler) {
 entry:
-  call void (...)* @foo()
+  call void (...) @foo()
   call void @llvm.eh.return.i32(i32 %offset, i8* %handler)
   unreachable
 
diff --git a/test/CodeGen/Mips/eh-return64.ll b/test/CodeGen/Mips/eh-return64.ll
index 74a4323..2f8203d 100644
--- a/test/CodeGen/Mips/eh-return64.ll
+++ b/test/CodeGen/Mips/eh-return64.ll
@@ -8,7 +8,7 @@ declare void @foo(...)
 
 define void @f1(i64 %offset, i8* %handler) {
 entry:
-  call void (...)* @foo()
+  call void (...) @foo()
   call void @llvm.eh.return.i64(i64 %offset, i8* %handler)
   unreachable
 
diff --git a/test/CodeGen/Mips/fpbr.ll b/test/CodeGen/Mips/fpbr.ll
index 311b830..27d7094 100644
--- a/test/CodeGen/Mips/fpbr.ll
+++ b/test/CodeGen/Mips/fpbr.ll
@@ -24,11 +24,11 @@ entry:
   br i1 %cmp, label %if.then, label %if.else
 
 if.then:                                          ; preds = %entry
-  tail call void (...)* @g0() nounwind
+  tail call void (...) @g0() nounwind
   br label %if.end
 
 if.else:                                          ; preds = %entry
-  tail call void (...)* @g1() nounwind
+  tail call void (...) @g1() nounwind
   br label %if.end
 
 if.end:                                           ; preds = %if.else, %if.then
@@ -57,11 +57,11 @@ entry:
   br i1 %cmp, label %if.then, label %if.else
 
 if.then:                                          ; preds = %entry
-  tail call void (...)* @g0() nounwind
+  tail call void (...) @g0() nounwind
   br label %if.end
 
 if.else:                                          ; preds = %entry
-  tail call void (...)* @g1() nounwind
+  tail call void (...) @g1() nounwind
   br label %if.end
 
 if.end:                                           ; preds = %if.else, %if.then
@@ -86,11 +86,11 @@ entry:
   br i1 %cmp, label %if.else, label %if.then
 
 if.then:                                          ; preds = %entry
-  tail call void (...)* @g0() nounwind
+  tail call void (...) @g0() nounwind
   br label %if.end
 
 if.else:                                          ; preds = %entry
-  tail call void (...)* @g1() nounwind
+  tail call void (...) @g1() nounwind
   br label %if.end
 
 if.end:                                           ; preds = %if.else, %if.then
@@ -116,11 +116,11 @@ entry:
   br i1 %cmp, label %if.then, label %if.else
 
 if.then:                                          ; preds = %entry
-  tail call void (...)* @g0() nounwind
+  tail call void (...) @g0() nounwind
   br label %if.end
 
 if.else:                                          ; preds = %entry
-  tail call void (...)* @g1() nounwind
+  tail call void (...) @g1() nounwind
   br label %if.end
 
 if.end:                                           ; preds = %if.else, %if.then
@@ -145,11 +145,11 @@ entry:
   br i1 %cmp, label %if.then, label %if.else
 
 if.then:                                          ; preds = %entry
-  tail call void (...)* @g0() nounwind
+  tail call void (...) @g0() nounwind
   br label %if.end
 
 if.else:                                          ; preds = %entry
-  tail call void (...)* @g1() nounwind
+  tail call void (...) @g1() nounwind
   br label %if.end
 
 if.end:                                           ; preds = %if.else, %if.then
@@ -174,11 +174,11 @@ entry:
   br i1 %cmp, label %if.else, label %if.then
 
 if.then:                                          ; preds = %entry
-  tail call void (...)* @g0() nounwind
+  tail call void (...) @g0() nounwind
   br label %if.end
 
 if.else:                                          ; preds = %entry
-  tail call void (...)* @g1() nounwind
+  tail call void (...) @g1() nounwind
   br label %if.end
 
 if.end:                                           ; preds = %if.else, %if.then
diff --git a/test/CodeGen/Mips/gprestore.ll b/test/CodeGen/Mips/gprestore.ll
index 0b005ab..b1c2ad1 100644
--- a/test/CodeGen/Mips/gprestore.ll
+++ b/test/CodeGen/Mips/gprestore.ll
@@ -17,7 +17,7 @@ entry:
 ; CHECK: jalr
 ; CHECK-NOT: got({{.*}})($gp)
 ; CHECK: lw $gp
-  tail call void (...)* @f1() nounwind
+  tail call void (...) @f1() nounwind
   %tmp = load i32, i32* @p, align 4
   tail call void @f2(i32 %tmp) nounwind
   %tmp1 = load i32, i32* @q, align 4
diff --git a/test/CodeGen/Mips/helloworld.ll b/test/CodeGen/Mips/helloworld.ll
index 768abc2..a0dbdf3 100644
--- a/test/CodeGen/Mips/helloworld.ll
+++ b/test/CodeGen/Mips/helloworld.ll
@@ -12,7 +12,7 @@
 
 define i32 @main() nounwind {
 entry:
-  %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([13 x i8], [13 x i8]* @.str, i32 0, i32 0))
+  %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([13 x i8], [13 x i8]* @.str, i32 0, i32 0))
   ret i32 0
 
 ; SR: 	.set	mips16
diff --git a/test/CodeGen/Mips/hf16call32.ll b/test/CodeGen/Mips/hf16call32.ll
index 59cf413..3b3f8f7 100644
--- a/test/CodeGen/Mips/hf16call32.ll
+++ b/test/CodeGen/Mips/hf16call32.ll
@@ -77,7 +77,7 @@ entry:
   %4 = load float, float* @lx, align 4
   %cmp = fcmp oeq float %3, %4
   %conv2 = zext i1 %cmp to i32
-  %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([10 x i8], [10 x i8]* @.str, i32 0, i32 0), double %conv, double %conv1, i32 %conv2)
+  %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([10 x i8], [10 x i8]* @.str, i32 0, i32 0), double %conv, double %conv1, i32 %conv2)
   call void @clear()
   store double 0x41678C29C0000000, double* @lxd, align 8
   %5 = load double, double* @lxd, align 8
@@ -88,7 +88,7 @@ entry:
   %9 = load double, double* @lxd, align 8
   %cmp3 = fcmp oeq double %8, %9
   %conv4 = zext i1 %cmp3 to i32
-  %call5 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([10 x i8], [10 x i8]* @.str, i32 0, i32 0), double %6, double %7, i32 %conv4)
+  %call5 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([10 x i8], [10 x i8]* @.str, i32 0, i32 0), double %6, double %7, i32 %conv4)
   call void @clear()
   store float 9.000000e+00, float* @lx, align 4
   store float 1.000000e+01, float* @ly, align 4
@@ -117,7 +117,7 @@ land.rhs:                                         ; preds = %entry
 land.end:                                         ; preds = %land.rhs, %entry
   %20 = phi i1 [ false, %entry ], [ %cmp12, %land.rhs ]
   %land.ext = zext i1 %20 to i32
-  %call14 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str1, i32 0, i32 0), double %conv6, double %conv7, double %conv8, double %conv9, i32 %land.ext)
+  %call14 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str1, i32 0, i32 0), double %conv6, double %conv7, double %conv8, double %conv9, i32 %land.ext)
   call void @clear()
   store float 0x3FFE666660000000, float* @lx, align 4
   store double 0x4007E613249FF279, double* @lyd, align 8
@@ -139,7 +139,7 @@ land.end:                                         ; preds = %land.rhs, %entry
   %cmp19 = fcmp oeq double %29, %30
   %conv20 = zext i1 %cmp19 to i32
   %and = and i32 %conv18, %conv20
-  %call21 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str1, i32 0, i32 0), double %conv15, double %conv16, double %25, double %26, i32 %and)
+  %call21 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str1, i32 0, i32 0), double %conv15, double %conv16, double %25, double %26, i32 %and)
   call void @clear()
   store double 0x4194E54F94000000, double* @lxd, align 8
   store float 7.600000e+01, float* @ly, align 4
@@ -161,7 +161,7 @@ land.end:                                         ; preds = %land.rhs, %entry
   %cmp26 = fcmp oeq float %39, %40
   %conv27 = zext i1 %cmp26 to i32
   %and28 = and i32 %conv25, %conv27
-  %call29 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str1, i32 0, i32 0), double %33, double %34, double %conv22, double %conv23, i32 %and28)
+  %call29 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str1, i32 0, i32 0), double %33, double %34, double %conv22, double %conv23, i32 %and28)
   call void @clear()
   store double 7.365198e+07, double* @lxd, align 8
   store double 0x416536CD80000000, double* @lyd, align 8
@@ -181,7 +181,7 @@ land.end:                                         ; preds = %land.rhs, %entry
   %cmp32 = fcmp oeq double %49, %50
   %conv33 = zext i1 %cmp32 to i32
   %and34 = and i32 %conv31, %conv33
-  %call35 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str1, i32 0, i32 0), double %43, double %44, double %45, double %46, i32 %and34)
+  %call35 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str1, i32 0, i32 0), double %43, double %44, double %45, double %46, i32 %and34)
   call void @clear()
   store float 0x4016666660000000, float* @ret_sf, align 4
   %call36 = call float @sf_v()
@@ -194,7 +194,7 @@ land.end:                                         ; preds = %land.rhs, %entry
   %54 = load float, float* @lret_sf, align 4
   %cmp39 = fcmp oeq float %53, %54
   %conv40 = zext i1 %cmp39 to i32
-  %call41 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([10 x i8], [10 x i8]* @.str, i32 0, i32 0), double %conv37, double %conv38, i32 %conv40)
+  %call41 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([10 x i8], [10 x i8]* @.str, i32 0, i32 0), double %conv37, double %conv38, i32 %conv40)
   call void @clear()
   store float 4.587300e+06, float* @ret_sf, align 4
   store float 3.420000e+02, float* @lx, align 4
@@ -218,7 +218,7 @@ land.end:                                         ; preds = %land.rhs, %entry
   %cmp49 = fcmp oeq float %62, %63
   %conv50 = zext i1 %cmp49 to i32
   %and51 = and i32 %conv48, %conv50
-  %call52 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str1, i32 0, i32 0), double %conv43, double %conv44, double %conv45, double %conv46, i32 %and51)
+  %call52 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str1, i32 0, i32 0), double %conv43, double %conv44, double %conv45, double %conv46, i32 %and51)
   call void @clear()
   store float 4.445910e+06, float* @ret_sf, align 4
   store double 0x419A7DB294000000, double* @lxd, align 8
@@ -240,7 +240,7 @@ land.end:                                         ; preds = %land.rhs, %entry
   %cmp58 = fcmp oeq double %71, %72
   %conv59 = zext i1 %cmp58 to i32
   %and60 = and i32 %conv57, %conv59
-  %call61 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str1, i32 0, i32 0), double %conv54, double %conv55, double %67, double %68, i32 %and60)
+  %call61 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str1, i32 0, i32 0), double %conv54, double %conv55, double %67, double %68, i32 %and60)
   call void @clear()
   store float 0x3FFF4BC6A0000000, float* @ret_sf, align 4
   store float 4.445500e+03, float* @lx, align 4
@@ -281,7 +281,7 @@ land.rhs73:                                       ; preds = %land.lhs.true
 land.end76:                                       ; preds = %land.rhs73, %land.lhs.true, %land.end
   %87 = phi i1 [ false, %land.lhs.true ], [ false, %land.end ], [ %cmp74, %land.rhs73 ]
   %land.ext77 = zext i1 %87 to i32
-  %call78 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([22 x i8], [22 x i8]* @.str2, i32 0, i32 0), double %conv63, double %conv64, double %conv65, double %conv66, double %conv67, double %conv68, i32 %land.ext77)
+  %call78 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([22 x i8], [22 x i8]* @.str2, i32 0, i32 0), double %conv63, double %conv64, double %conv65, double %conv66, double %conv67, double %conv68, i32 %land.ext77)
   call void @clear()
   store float 9.991300e+04, float* @ret_sf, align 4
   store float 1.114500e+04, float* @lx, align 4
@@ -320,7 +320,7 @@ land.rhs89:                                       ; preds = %land.lhs.true86
 land.end92:                                       ; preds = %land.rhs89, %land.lhs.true86, %land.end76
   %102 = phi i1 [ false, %land.lhs.true86 ], [ false, %land.end76 ], [ %cmp90, %land.rhs89 ]
   %land.ext93 = zext i1 %102 to i32
-  %call94 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([22 x i8], [22 x i8]* @.str2, i32 0, i32 0), double %conv80, double %conv81, double %conv82, double %conv83, double %94, double %95, i32 %land.ext93)
+  %call94 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([22 x i8], [22 x i8]* @.str2, i32 0, i32 0), double %conv80, double %conv81, double %conv82, double %conv83, double %94, double %95, i32 %land.ext93)
   call void @clear()
   store float 0x417CCC7A00000000, float* @ret_sf, align 4
   store double 0x4172034530000000, double* @lxd, align 8
@@ -359,7 +359,7 @@ land.rhs105:                                      ; preds = %land.lhs.true102
 land.end108:                                      ; preds = %land.rhs105, %land.lhs.true102, %land.end92
   %117 = phi i1 [ false, %land.lhs.true102 ], [ false, %land.end92 ], [ %cmp106, %land.rhs105 ]
   %land.ext109 = zext i1 %117 to i32
-  %call110 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([22 x i8], [22 x i8]* @.str2, i32 0, i32 0), double %conv96, double %conv97, double %107, double %108, double %conv98, double %conv99, i32 %land.ext109)
+  %call110 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([22 x i8], [22 x i8]* @.str2, i32 0, i32 0), double %conv96, double %conv97, double %107, double %108, double %conv98, double %conv99, i32 %land.ext109)
   call void @clear()
   store float 3.987721e+06, float* @ret_sf, align 4
   store double 0x3FF1F49F6DDDC2D8, double* @lxd, align 8
@@ -396,7 +396,7 @@ land.rhs119:                                      ; preds = %land.lhs.true116
 land.end122:                                      ; preds = %land.rhs119, %land.lhs.true116, %land.end108
   %132 = phi i1 [ false, %land.lhs.true116 ], [ false, %land.end108 ], [ %cmp120, %land.rhs119 ]
   %land.ext123 = zext i1 %132 to i32
-  %call124 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([22 x i8], [22 x i8]* @.str2, i32 0, i32 0), double %conv112, double %conv113, double %122, double %123, double %124, double %125, i32 %land.ext123)
+  %call124 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([22 x i8], [22 x i8]* @.str2, i32 0, i32 0), double %conv112, double %conv113, double %122, double %123, double %124, double %125, i32 %land.ext123)
   call void @clear()
   store double 1.561234e+01, double* @ret_df, align 8
   %call125 = call double @df_v()
@@ -407,7 +407,7 @@ land.end122:                                      ; preds = %land.rhs119, %land.
   %136 = load double, double* @lret_df, align 8
   %cmp126 = fcmp oeq double %135, %136
   %conv127 = zext i1 %cmp126 to i32
-  %call128 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([10 x i8], [10 x i8]* @.str, i32 0, i32 0), double %133, double %134, i32 %conv127)
+  %call128 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([10 x i8], [10 x i8]* @.str, i32 0, i32 0), double %133, double %134, i32 %conv127)
   call void @clear()
   store double 1.345873e+01, double* @ret_df, align 8
   store float 3.434520e+05, float* @lx, align 4
@@ -429,7 +429,7 @@ land.end122:                                      ; preds = %land.rhs119, %land.
   %cmp134 = fcmp oeq float %144, %145
   %conv135 = zext i1 %cmp134 to i32
   %and136 = and i32 %conv133, %conv135
-  %call137 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str1, i32 0, i32 0), double %138, double %139, double %conv130, double %conv131, i32 %and136)
+  %call137 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str1, i32 0, i32 0), double %138, double %139, double %conv130, double %conv131, i32 %and136)
   call void @clear()
   store double 0x4084F3AB7AA25D8D, double* @ret_df, align 8
   store double 0x4114F671D2F1A9FC, double* @lxd, align 8
@@ -449,7 +449,7 @@ land.end122:                                      ; preds = %land.rhs119, %land.
   %cmp141 = fcmp oeq double %153, %154
   %conv142 = zext i1 %cmp141 to i32
   %and143 = and i32 %conv140, %conv142
-  %call144 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str1, i32 0, i32 0), double %147, double %148, double %149, double %150, i32 %and143)
+  %call144 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str1, i32 0, i32 0), double %147, double %148, double %149, double %150, i32 %and143)
   call void @clear()
   store double 6.781956e+03, double* @ret_df, align 8
   store float 4.445500e+03, float* @lx, align 4
@@ -488,7 +488,7 @@ land.rhs155:                                      ; preds = %land.lhs.true152
 land.end158:                                      ; preds = %land.rhs155, %land.lhs.true152, %land.end122
   %169 = phi i1 [ false, %land.lhs.true152 ], [ false, %land.end122 ], [ %cmp156, %land.rhs155 ]
   %land.ext159 = zext i1 %169 to i32
-  %call160 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([22 x i8], [22 x i8]* @.str2, i32 0, i32 0), double %157, double %158, double %conv146, double %conv147, double %conv148, double %conv149, i32 %land.ext159)
+  %call160 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([22 x i8], [22 x i8]* @.str2, i32 0, i32 0), double %157, double %158, double %conv146, double %conv147, double %conv148, double %conv149, i32 %land.ext159)
   call void @clear()
   store double 1.889130e+05, double* @ret_df, align 8
   store float 9.111450e+05, float* @lx, align 4
@@ -525,7 +525,7 @@ land.rhs169:                                      ; preds = %land.lhs.true166
 land.end172:                                      ; preds = %land.rhs169, %land.lhs.true166, %land.end158
   %184 = phi i1 [ false, %land.lhs.true166 ], [ false, %land.end158 ], [ %cmp170, %land.rhs169 ]
   %land.ext173 = zext i1 %184 to i32
-  %call174 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([22 x i8], [22 x i8]* @.str2, i32 0, i32 0), double %172, double %173, double %conv162, double %conv163, double %176, double %177, i32 %land.ext173)
+  %call174 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([22 x i8], [22 x i8]* @.str2, i32 0, i32 0), double %172, double %173, double %conv162, double %conv163, double %176, double %177, i32 %land.ext173)
   call void @clear()
   store double 0x418B2DB900000000, double* @ret_df, align 8
   store double 0x41B1EF2ED3000000, double* @lxd, align 8
@@ -562,7 +562,7 @@ land.rhs183:                                      ; preds = %land.lhs.true180
 land.end186:                                      ; preds = %land.rhs183, %land.lhs.true180, %land.end172
   %199 = phi i1 [ false, %land.lhs.true180 ], [ false, %land.end172 ], [ %cmp184, %land.rhs183 ]
   %land.ext187 = zext i1 %199 to i32
-  %call188 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([22 x i8], [22 x i8]* @.str2, i32 0, i32 0), double %187, double %188, double %189, double %190, double %conv176, double %conv177, i32 %land.ext187)
+  %call188 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([22 x i8], [22 x i8]* @.str2, i32 0, i32 0), double %187, double %188, double %189, double %190, double %conv176, double %conv177, i32 %land.ext187)
   call void @clear()
   store double 3.987721e+06, double* @ret_df, align 8
   store double 5.223560e+00, double* @lxd, align 8
@@ -597,7 +597,7 @@ land.rhs195:                                      ; preds = %land.lhs.true192
 land.end198:                                      ; preds = %land.rhs195, %land.lhs.true192, %land.end186
   %214 = phi i1 [ false, %land.lhs.true192 ], [ false, %land.end186 ], [ %cmp196, %land.rhs195 ]
   %land.ext199 = zext i1 %214 to i32
-  %call200 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([22 x i8], [22 x i8]* @.str2, i32 0, i32 0), double %202, double %203, double %204, double %205, double %206, double %207, i32 %land.ext199)
+  %call200 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([22 x i8], [22 x i8]* @.str2, i32 0, i32 0), double %202, double %203, double %204, double %205, double %206, double %207, i32 %land.ext199)
   call void @clear()
   store float 4.500000e+00, float* getelementptr inbounds ({ float, float }, { float, float }* @ret_sc, i32 0, i32 0)
   store float 7.000000e+00, float* getelementptr inbounds ({ float, float }, { float, float }* @ret_sc, i32 0, i32 1)
@@ -630,7 +630,7 @@ land.end198:                                      ; preds = %land.rhs195, %land.
   %cmp.i = fcmp oeq float %ret_sc.imag215, %lret_sc.imag217
   %and.ri = and i1 %cmp.r, %cmp.i
   %conv218 = zext i1 %and.ri to i32
-  %call219 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([18 x i8], [18 x i8]* @.str3, i32 0, i32 0), double %conv202, double %conv207, double %conv208, double %conv213, i32 %conv218)
+  %call219 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([18 x i8], [18 x i8]* @.str3, i32 0, i32 0), double %conv202, double %conv207, double %conv208, double %conv213, i32 %conv218)
   call void @clear()
   store float 0x3FF7A99300000000, float* @lx, align 4
   store float 4.500000e+00, float* getelementptr inbounds ({ float, float }, { float, float }* @ret_sc, i32 0, i32 0)
@@ -679,7 +679,7 @@ land.rhs247:                                      ; preds = %land.end198
 land.end250:                                      ; preds = %land.rhs247, %land.end198
   %224 = phi i1 [ false, %land.end198 ], [ %cmp248, %land.rhs247 ]
   %land.ext251 = zext i1 %224 to i32
-  %call252 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([24 x i8], [24 x i8]* @.str4, i32 0, i32 0), double %conv223, double %conv228, double %conv231, double %conv236, double %conv237, double %conv238, i32 %land.ext251)
+  %call252 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([24 x i8], [24 x i8]* @.str4, i32 0, i32 0), double %conv223, double %conv228, double %conv231, double %conv236, double %conv237, double %conv238, i32 %land.ext251)
   call void @clear()
   store double 1.234500e+03, double* getelementptr inbounds ({ double, double }, { double, double }* @ret_dc, i32 0, i32 0)
   store double 7.677000e+03, double* getelementptr inbounds ({ double, double }, { double, double }* @ret_dc, i32 0, i32 1)
@@ -704,7 +704,7 @@ land.end250:                                      ; preds = %land.rhs247, %land.
   %cmp.i263 = fcmp oeq double %ret_dc.imag259, %lret_dc.imag261
   %and.ri264 = and i1 %cmp.r262, %cmp.i263
   %conv265 = zext i1 %and.ri264 to i32
-  %call266 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([18 x i8], [18 x i8]* @.str3, i32 0, i32 0), double %ret_dc.real, double %ret_dc.imag255, double %lret_dc.real, double %lret_dc.imag257, i32 %conv265)
+  %call266 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([18 x i8], [18 x i8]* @.str3, i32 0, i32 0), double %ret_dc.real, double %ret_dc.imag255, double %lret_dc.real, double %lret_dc.imag257, i32 %conv265)
   call void @clear()
   store double 0x40AAF6F532617C1C, double* @lxd, align 8
   store double 4.444500e+03, double* getelementptr inbounds ({ double, double }, { double, double }* @ret_dc, i32 0, i32 0)
@@ -745,7 +745,7 @@ land.rhs286:                                      ; preds = %land.end250
 land.end289:                                      ; preds = %land.rhs286, %land.end250
   %234 = phi i1 [ false, %land.end250 ], [ %cmp287, %land.rhs286 ]
   %land.ext290 = zext i1 %234 to i32
-  %call291 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([24 x i8], [24 x i8]* @.str4, i32 0, i32 0), double %ret_dc.real268, double %ret_dc.imag271, double %lret_dc.real272, double %lret_dc.imag275, double %conv276, double %conv277, i32 %land.ext290)
+  %call291 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([24 x i8], [24 x i8]* @.str4, i32 0, i32 0), double %ret_dc.real268, double %ret_dc.imag271, double %lret_dc.real272, double %lret_dc.imag275, double %conv276, double %conv277, i32 %land.ext290)
   %235 = load i32, i32* %retval
   ret i32 %235
 }
diff --git a/test/CodeGen/Mips/hfptrcall.ll b/test/CodeGen/Mips/hfptrcall.ll
index de809f1..c9f1fe9 100644
--- a/test/CodeGen/Mips/hfptrcall.ll
+++ b/test/CodeGen/Mips/hfptrcall.ll
@@ -70,12 +70,12 @@ entry:
   store float %call, float* @x, align 4
   %1 = load float, float* @x, align 4
   %conv = fpext float %1 to double
-  %call1 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), double %conv)
+  %call1 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), double %conv)
   %2 = load double ()*, double ()** @ptrdv, align 4
   %call2 = call double %2()
   store double %call2, double* @xd, align 8
   %3 = load double, double* @xd, align 8
-  %call3 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), double %3)
+  %call3 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), double %3)
   %4 = load { float, float } ()*, { float, float } ()** @ptrscv, align 4
   %call4 = call { float, float } %4()
   %5 = extractvalue { float, float } %call4, 0
@@ -90,7 +90,7 @@ entry:
   %xy.imag8 = load float, float* getelementptr inbounds ({ float, float }, { float, float }* @xy, i32 0, i32 1)
   %conv9 = fpext float %xy.real7 to double
   %conv10 = fpext float %xy.imag8 to double
-  %call11 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([10 x i8], [10 x i8]* @.str1, i32 0, i32 0), double %conv5, double %conv10)
+  %call11 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([10 x i8], [10 x i8]* @.str1, i32 0, i32 0), double %conv5, double %conv10)
   %7 = load { double, double } ()*, { double, double } ()** @ptrdcv, align 4
   %call12 = call { double, double } %7()
   %8 = extractvalue { double, double } %call12, 0
@@ -101,7 +101,7 @@ entry:
   %xyd.imag = load double, double* getelementptr inbounds ({ double, double }, { double, double }* @xyd, i32 0, i32 1)
   %xyd.real13 = load double, double* getelementptr inbounds ({ double, double }, { double, double }* @xyd, i32 0, i32 0)
   %xyd.imag14 = load double, double* getelementptr inbounds ({ double, double }, { double, double }* @xyd, i32 0, i32 1)
-  %call15 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([10 x i8], [10 x i8]* @.str1, i32 0, i32 0), double %xyd.real, double %xyd.imag14)
+  %call15 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([10 x i8], [10 x i8]* @.str1, i32 0, i32 0), double %xyd.real, double %xyd.imag14)
   ret i32 0
 }
 
diff --git a/test/CodeGen/Mips/i32k.ll b/test/CodeGen/Mips/i32k.ll
index 5c5761f..ba9cf73 100644
--- a/test/CodeGen/Mips/i32k.ll
+++ b/test/CodeGen/Mips/i32k.ll
@@ -4,14 +4,14 @@
 
 define i32 @main() nounwind {
 entry:
-  %call = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i32 1075344593) nounwind
+  %call = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i32 1075344593) nounwind
 ; 16:	lw	${{[0-9]+}}, 1f
 ; 16:	b	2f
 ; 16:	.align	2
 ; 16: 1: 	.word	1075344593
 ; 16: 2:
 
-  %call1 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i32 -1075344593) nounwind
+  %call1 = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i32 -1075344593) nounwind
 
 ; 16:	lw	${{[0-9]+}}, 1f
 ; 16:	b	2f
diff --git a/test/CodeGen/Mips/inlineasm_constraint.ll b/test/CodeGen/Mips/inlineasm_constraint.ll
index 76b73dc..868433e 100644
--- a/test/CodeGen/Mips/inlineasm_constraint.ll
+++ b/test/CodeGen/Mips/inlineasm_constraint.ll
@@ -51,14 +51,5 @@ entry:
 ; CHECK: #NO_APP	
   tail call i32 asm sideeffect "addiu $0,$1,$2", "=r,r,P"(i32 7, i32 65535) nounwind
 
-; Now R Which takes the address of c
-  %c = alloca i32, align 4
-  store i32 -4469539, i32* %c, align 4
-  %8 = call i32 asm sideeffect "lw $0, 1 + $1\0A\09lw $0, 2 + $1\0A\09", "=r,*R"(i32* %c) #1
-; CHECK: #APP
-; CHECK: lw ${{[0-9]+}}, 1 + 0(${{[0-9]+}})
-; CHECK: lw ${{[0-9]+}}, 2 + 0(${{[0-9]+}})
-; CHECK: #NO_APP	
-
   ret i32 0
 }
diff --git a/test/CodeGen/Mips/inlineasm_constraint_R.ll b/test/CodeGen/Mips/inlineasm_constraint_R.ll
new file mode 100644
index 0000000..c4105ae
--- /dev/null
+++ b/test/CodeGen/Mips/inlineasm_constraint_R.ll
@@ -0,0 +1,60 @@
+; RUN: llc -march=mipsel < %s | FileCheck %s
+
+@data = global [8193 x i32] zeroinitializer
+
+define void @R(i32 *%p) nounwind {
+entry:
+  ; CHECK-LABEL: R:
+
+  call void asm sideeffect "lw $$1, $0", "*R,~{$1}"(i32* getelementptr inbounds ([8193 x i32], [8193 x i32]* @data, i32 0, i32 0))
+
+  ; CHECK: lw $[[BASEPTR:[0-9]+]], %got(data)(
+  ; CHECK: #APP
+  ; CHECK: lw $1, 0($[[BASEPTR]])
+  ; CHECK: #NO_APP
+
+  ret void
+}
+
+define void @R_offset_4(i32 *%p) nounwind {
+entry:
+  ; CHECK-LABEL: R_offset_4:
+
+  call void asm sideeffect "lw $$1, $0", "*R,~{$1}"(i32* getelementptr inbounds ([8193 x i32], [8193 x i32]* @data, i32 0, i32 1))
+
+  ; CHECK: lw $[[BASEPTR:[0-9]+]], %got(data)(
+  ; CHECK: #APP
+  ; CHECK: lw $1, 4($[[BASEPTR]])
+  ; CHECK: #NO_APP
+
+  ret void
+}
+
+define void @R_offset_254(i32 *%p) nounwind {
+entry:
+  ; CHECK-LABEL: R_offset_254:
+
+  call void asm sideeffect "lw $$1, $0", "*R,~{$1}"(i32* getelementptr inbounds ([8193 x i32], [8193 x i32]* @data, i32 0, i32 63))
+
+  ; CHECK-DAG: lw $[[BASEPTR:[0-9]+]], %got(data)(
+  ; CHECK: #APP
+  ; CHECK: lw $1, 252($[[BASEPTR]])
+  ; CHECK: #NO_APP
+
+  ret void
+}
+
+define void @R_offset_256(i32 *%p) nounwind {
+entry:
+  ; CHECK-LABEL: R_offset_256:
+
+  call void asm sideeffect "lw $$1, $0", "*R,~{$1}"(i32* getelementptr inbounds ([8193 x i32], [8193 x i32]* @data, i32 0, i32 64))
+
+  ; CHECK-DAG: lw $[[BASEPTR:[0-9]+]], %got(data)(
+  ; CHECK: addiu $[[BASEPTR2:[0-9]+]], $[[BASEPTR]], 256
+  ; CHECK: #APP
+  ; CHECK: lw $1, 0($[[BASEPTR2]])
+  ; CHECK: #NO_APP
+
+  ret void
+}
diff --git a/test/CodeGen/Mips/internalfunc.ll b/test/CodeGen/Mips/internalfunc.ll
index bde7357..2b4a039 100644
--- a/test/CodeGen/Mips/internalfunc.ll
+++ b/test/CodeGen/Mips/internalfunc.ll
@@ -21,7 +21,7 @@ entry:
 
 if.then:                                          ; preds = %entry
   %tmp1 = load void (...)*, void (...)** @caller.sf1, align 4
-  tail call void (...)* %tmp1() nounwind
+  tail call void (...) %tmp1() nounwind
   br label %if.end
 
 if.end:                                           ; preds = %entry, %if.then
@@ -38,7 +38,7 @@ if.end:                                           ; preds = %entry, %if.then
 
 define internal void @sf2() nounwind {
 entry:
-  %call = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str, i32 0, i32 0)) nounwind
+  %call = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str, i32 0, i32 0)) nounwind
   ret void
 }
 
@@ -46,7 +46,7 @@ declare i32 @printf(i8* nocapture, ...) nounwind
 
 define internal fastcc void @f2() nounwind noinline {
 entry:
-  %call = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str, i32 0, i32 0)) nounwind
+  %call = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str, i32 0, i32 0)) nounwind
   ret void
 }
 
diff --git a/test/CodeGen/Mips/lb1.ll b/test/CodeGen/Mips/lb1.ll
index ad94c5f..21648d7 100644
--- a/test/CodeGen/Mips/lb1.ll
+++ b/test/CodeGen/Mips/lb1.ll
@@ -11,7 +11,7 @@ entry:
   %conv = sext i8 %0 to i32
   store i32 %conv, i32* %i, align 4
   %1 = load i32, i32* %i, align 4
-  %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([5 x i8], [5 x i8]* @.str, i32 0, i32 0), i32 %1)
+  %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([5 x i8], [5 x i8]* @.str, i32 0, i32 0), i32 %1)
   ret i32 0
 }
 
diff --git a/test/CodeGen/Mips/lbu1.ll b/test/CodeGen/Mips/lbu1.ll
index a8ef2ff..28ca271 100644
--- a/test/CodeGen/Mips/lbu1.ll
+++ b/test/CodeGen/Mips/lbu1.ll
@@ -12,7 +12,7 @@ entry:
   store i32 %conv, i32* %i, align 4
   %1 = load i8, i8* @c, align 1
   %conv1 = zext i8 %1 to i32
-  %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([5 x i8], [5 x i8]* @.str, i32 0, i32 0), i32 %conv1)
+  %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([5 x i8], [5 x i8]* @.str, i32 0, i32 0), i32 %conv1)
   ret i32 0
 }
 
diff --git a/test/CodeGen/Mips/lh1.ll b/test/CodeGen/Mips/lh1.ll
index 3b245b1..31967e5 100644
--- a/test/CodeGen/Mips/lh1.ll
+++ b/test/CodeGen/Mips/lh1.ll
@@ -11,7 +11,7 @@ entry:
 ; 16:	lh	${{[0-9]+}}, 0(${{[0-9]+}})
   store i32 %conv, i32* %i, align 4
   %1 = load i32, i32* %i, align 4
-  %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([5 x i8], [5 x i8]* @.str, i32 0, i32 0), i32 %1)
+  %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([5 x i8], [5 x i8]* @.str, i32 0, i32 0), i32 %1)
   ret i32 0
 }
 
diff --git a/test/CodeGen/Mips/lhu1.ll b/test/CodeGen/Mips/lhu1.ll
index 02abfb7..413da46 100644
--- a/test/CodeGen/Mips/lhu1.ll
+++ b/test/CodeGen/Mips/lhu1.ll
@@ -12,7 +12,7 @@ entry:
 ; 16:	lhu	${{[0-9]+}}, 0(${{[0-9]+}})
   store i32 %conv, i32* %i, align 4
   %1 = load i32, i32* %i, align 4
-  %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([5 x i8], [5 x i8]* @.str, i32 0, i32 0), i32 %1)
+  %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([5 x i8], [5 x i8]* @.str, i32 0, i32 0), i32 %1)
   ret i32 0
 }
 
diff --git a/test/CodeGen/Mips/mbrsize4a.ll b/test/CodeGen/Mips/mbrsize4a.ll
index e7ca776..a99facc 100644
--- a/test/CodeGen/Mips/mbrsize4a.ll
+++ b/test/CodeGen/Mips/mbrsize4a.ll
@@ -17,7 +17,7 @@ z:                                                ; preds = %y, %entry
   br label %y
 
 y:                                                ; preds = %z
-  %call1 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([11 x i8], [11 x i8]* @.str, i32 0, i32 0))
+  %call1 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([11 x i8], [11 x i8]* @.str, i32 0, i32 0))
   br label %z
 
 return:                                           ; No predecessors!
diff --git a/test/CodeGen/Mips/micromips-addiu.ll b/test/CodeGen/Mips/micromips-addiu.ll
index 3035782..e0743c9 100644
--- a/test/CodeGen/Mips/micromips-addiu.ll
+++ b/test/CodeGen/Mips/micromips-addiu.ll
@@ -10,17 +10,17 @@ define i32 @main() nounwind {
 entry:
   %0 = load i32, i32* @x, align 4
   %addiu1 = add i32 %0, -7
-  %call1 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds
+  %call1 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds
                                   ([7 x i8], [7 x i8]* @.str, i32 0, i32 0), i32 %addiu1)
 
   %1 = load i32, i32* @y, align 4
   %addiu2 = add i32 %1, 55
-  %call2 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds
+  %call2 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds
                                   ([7 x i8], [7 x i8]* @.str, i32 0, i32 0), i32 %addiu2)
 
   %2 = load i32, i32* @z, align 4
   %addiu3 = add i32 %2, 24
-  %call3 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds
+  %call3 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds
                                   ([7 x i8], [7 x i8]* @.str, i32 0, i32 0), i32 %addiu3)
   ret i32 0
 }
diff --git a/test/CodeGen/Mips/micromips-andi.ll b/test/CodeGen/Mips/micromips-andi.ll
index cec30e2..cd7a794 100644
--- a/test/CodeGen/Mips/micromips-andi.ll
+++ b/test/CodeGen/Mips/micromips-andi.ll
@@ -9,12 +9,12 @@ define i32 @main() nounwind {
 entry:
   %0 = load i32, i32* @x, align 4
   %and1 = and i32 %0, 4
-  %call1 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds
+  %call1 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds
                                   ([7 x i8], [7 x i8]* @.str, i32 0, i32 0), i32 %and1)
 
   %1 = load i32, i32* @y, align 4
   %and2 = and i32 %1, 5
-  %call2 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds
+  %call2 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds
                                   ([7 x i8], [7 x i8]* @.str, i32 0, i32 0), i32 %and2)
   ret i32 0
 }
diff --git a/test/CodeGen/Mips/mips16_32_8.ll b/test/CodeGen/Mips/mips16_32_8.ll
index e79cda5..5e03928 100644
--- a/test/CodeGen/Mips/mips16_32_8.ll
+++ b/test/CodeGen/Mips/mips16_32_8.ll
@@ -28,7 +28,7 @@ entry:
   store float %add, float* @f, align 4
   %2 = load float, float* @f, align 4
   %conv = fpext float %2 to double
-  %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([8 x i8], [8 x i8]* @.str, i32 0, i32 0), double %conv)
+  %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([8 x i8], [8 x i8]* @.str, i32 0, i32 0), double %conv)
   ret void
 }
 
@@ -49,10 +49,10 @@ define i32 @main() #3 {
 entry:
   call void @foo()
   %0 = load i32, i32* @i, align 4
-  %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([11 x i8], [11 x i8]* @.str1, i32 0, i32 0), i32 %0)
+  %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([11 x i8], [11 x i8]* @.str1, i32 0, i32 0), i32 %0)
   call void @nofoo()
   %1 = load i32, i32* @i, align 4
-  %call1 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([13 x i8], [13 x i8]* @.str2, i32 0, i32 0), i32 %1)
+  %call1 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([13 x i8], [13 x i8]* @.str2, i32 0, i32 0), i32 %1)
   ret i32 0
 }
 
diff --git a/test/CodeGen/Mips/mips16ex.ll b/test/CodeGen/Mips/mips16ex.ll
index 25957fb..0f4dc7e 100644
--- a/test/CodeGen/Mips/mips16ex.ll
+++ b/test/CodeGen/Mips/mips16ex.ll
@@ -16,7 +16,7 @@ entry:
   %ehselector.slot = alloca i32
   %e = alloca i32, align 4
   store i32 0, i32* %retval
-  %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([7 x i8], [7 x i8]* @.str, i32 0, i32 0))
+  %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([7 x i8], [7 x i8]* @.str, i32 0, i32 0))
   %exception = call i8* @__cxa_allocate_exception(i32 4) nounwind
   %0 = bitcast i8* %exception to i32*
   store i32 20, i32* %0
diff --git a/test/CodeGen/Mips/neg1.ll b/test/CodeGen/Mips/neg1.ll
index e2b10e0..36275a2 100644
--- a/test/CodeGen/Mips/neg1.ll
+++ b/test/CodeGen/Mips/neg1.ll
@@ -8,7 +8,7 @@ entry:
   %0 = load i32, i32* @i, align 4
   %sub = sub nsw i32 0, %0
 ; 16:	neg	${{[0-9]+}}, ${{[0-9]+}}
-  %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([5 x i8], [5 x i8]* @.str, i32 0, i32 0), i32 %sub)
+  %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([5 x i8], [5 x i8]* @.str, i32 0, i32 0), i32 %sub)
   ret i32 0
 }
 
diff --git a/test/CodeGen/Mips/not1.ll b/test/CodeGen/Mips/not1.ll
index bf5d06e..f5ec5b6 100644
--- a/test/CodeGen/Mips/not1.ll
+++ b/test/CodeGen/Mips/not1.ll
@@ -9,7 +9,7 @@ entry:
   %0 = load i32, i32* @x, align 4
   %neg = xor i32 %0, -1
 ; 16:	not	${{[0-9]+}}, ${{[0-9]+}}
-  %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([7 x i8], [7 x i8]* @.str, i32 0, i32 0), i32 %neg)
+  %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([7 x i8], [7 x i8]* @.str, i32 0, i32 0), i32 %neg)
   ret i32 0
 }
 
diff --git a/test/CodeGen/Mips/octeon.ll b/test/CodeGen/Mips/octeon.ll
index 97e12e7..499ce3c 100644
--- a/test/CodeGen/Mips/octeon.ll
+++ b/test/CodeGen/Mips/octeon.ll
@@ -93,7 +93,7 @@ entry:
 ; ALL-LABEL: bbit0:
 ; OCTEON: bbit0   $4, 3, $[[BB0:BB[0-9_]+]]
 ; MIPS64: andi  $[[T0:[0-9]+]], $4, 8
-; MIPS64: beqz  $[[T0]], $[[BB0:BB[0-9_]+]]
+; MIPS64: bnez  $[[T0]], $[[BB0:BB[0-9_]+]]
   %bit = and i64 %a, 8
   %res = icmp eq i64 %bit, 0
   br i1 %res, label %endif, label %if
@@ -111,7 +111,7 @@ entry:
 ; MIPS64: daddiu  $[[T0:[0-9]+]], $zero, 1
 ; MIPS64: dsll    $[[T1:[0-9]+]], $[[T0]], 35
 ; MIPS64: and     $[[T2:[0-9]+]], $4, $[[T1]]
-; MIPS64: beqz    $[[T2]], $[[BB0:BB[0-9_]+]]
+; MIPS64: bnez    $[[T2]], $[[BB0:BB[0-9_]+]]
   %bit = and i64 %a, 34359738368
   %res = icmp eq i64 %bit, 0
   br i1 %res, label %endif, label %if
diff --git a/test/CodeGen/Mips/or1.ll b/test/CodeGen/Mips/or1.ll
index 66dd070..51b6ebf 100644
--- a/test/CodeGen/Mips/or1.ll
+++ b/test/CodeGen/Mips/or1.ll
@@ -10,7 +10,7 @@ entry:
   %1 = load i32, i32* @y, align 4
   %or = or i32 %0, %1
 ; 16:	or	${{[0-9]+}}, ${{[0-9]+}}
-  %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([7 x i8], [7 x i8]* @.str, i32 0, i32 0), i32 %or)
+  %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([7 x i8], [7 x i8]* @.str, i32 0, i32 0), i32 %or)
   ret i32 0
 }
 
diff --git a/test/CodeGen/Mips/return-vector.ll b/test/CodeGen/Mips/return-vector.ll
index 0e0d515..3870fe0 100644
--- a/test/CodeGen/Mips/return-vector.ll
+++ b/test/CodeGen/Mips/return-vector.ll
@@ -12,7 +12,7 @@ declare <4 x double> @d4(...)
 
 define i32 @call_i8() {
 entry:
-  %call = call <8 x i32> (...)* @i8()
+  %call = call <8 x i32> (...) @i8()
   %v0 = extractelement <8 x i32> %call, i32 0
   %v1 = extractelement <8 x i32> %call, i32 1
   %v2 = extractelement <8 x i32> %call, i32 2
@@ -46,7 +46,7 @@ entry:
 
 define float @call_f4() {
 entry:
-  %call = call <4 x float> (...)* @f4()
+  %call = call <4 x float> (...) @f4()
   %v0 = extractelement <4 x float> %call, i32 0
   %v1 = extractelement <4 x float> %call, i32 1
   %v2 = extractelement <4 x float> %call, i32 2
@@ -68,7 +68,7 @@ entry:
 
 define double @call_d4() {
 entry:
-  %call = call <4 x double> (...)* @d4()
+  %call = call <4 x double> (...) @d4()
   %v0 = extractelement <4 x double> %call, i32 0
   %v1 = extractelement <4 x double> %call, i32 1
   %v2 = extractelement <4 x double> %call, i32 2
@@ -99,7 +99,7 @@ declare <2 x double> @d2(...)
 
 define i32 @call_i4() {
 entry:
-  %call = call <4 x i32> (...)* @i4()
+  %call = call <4 x i32> (...) @i4()
   %v0 = extractelement <4 x i32> %call, i32 0
   %v1 = extractelement <4 x i32> %call, i32 1
   %v2 = extractelement <4 x i32> %call, i32 2
@@ -120,7 +120,7 @@ entry:
 
 define float @call_f2() {
 entry:
-  %call = call <2 x float> (...)* @f2()
+  %call = call <2 x float> (...) @f2()
   %v0 = extractelement <2 x float> %call, i32 0
   %v1 = extractelement <2 x float> %call, i32 1
   %add1 = fadd float %v0, %v1
@@ -135,7 +135,7 @@ entry:
 
 define double @call_d2() {
 entry:
-  %call = call <2 x double> (...)* @d2()
+  %call = call <2 x double> (...) @d2()
   %v0 = extractelement <2 x double> %call, i32 0
   %v1 = extractelement <2 x double> %call, i32 1
   %add1 = fadd double %v0, %v1
diff --git a/test/CodeGen/Mips/sb1.ll b/test/CodeGen/Mips/sb1.ll
index 4c17a50..d2e8510 100644
--- a/test/CodeGen/Mips/sb1.ll
+++ b/test/CodeGen/Mips/sb1.ll
@@ -13,7 +13,7 @@ entry:
   %2 = load i8, i8* @c, align 1
   %conv1 = sext i8 %2 to i32
 ; 16:	sb	${{[0-9]+}}, 0(${{[0-9]+}})
-  %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([8 x i8], [8 x i8]* @.str, i32 0, i32 0), i32 %1, i32 %conv1)
+  %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([8 x i8], [8 x i8]* @.str, i32 0, i32 0), i32 %1, i32 %conv1)
   ret i32 0
 }
 
diff --git a/test/CodeGen/Mips/selnek.ll b/test/CodeGen/Mips/selnek.ll
index e8a5105..5b6aa2a 100644
--- a/test/CodeGen/Mips/selnek.ll
+++ b/test/CodeGen/Mips/selnek.ll
@@ -79,13 +79,13 @@ define i32 @main() nounwind "target-cpu"="mips16" "target-features"="+mips16,+o3
 entry:
   call void @calc_z() "target-cpu"="mips16" "target-features"="+mips16,+o32"
   %0 = load i32, i32* @z1, align 4
-  %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([5 x i8], [5 x i8]* @.str, i32 0, i32 0), i32 %0) "target-cpu"="mips16" "target-features"="+mips16,+o32"
+  %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([5 x i8], [5 x i8]* @.str, i32 0, i32 0), i32 %0) "target-cpu"="mips16" "target-features"="+mips16,+o32"
   %1 = load i32, i32* @z2, align 4
-  %call1 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([5 x i8], [5 x i8]* @.str, i32 0, i32 0), i32 %1) "target-cpu"="mips16" "target-features"="+mips16,+o32"
+  %call1 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([5 x i8], [5 x i8]* @.str, i32 0, i32 0), i32 %1) "target-cpu"="mips16" "target-features"="+mips16,+o32"
   %2 = load i32, i32* @z3, align 4
-  %call2 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([5 x i8], [5 x i8]* @.str, i32 0, i32 0), i32 %2) "target-cpu"="mips16" "target-features"="+mips16,+o32"
+  %call2 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([5 x i8], [5 x i8]* @.str, i32 0, i32 0), i32 %2) "target-cpu"="mips16" "target-features"="+mips16,+o32"
   %3 = load i32, i32* @z4, align 4
-  %call3 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([5 x i8], [5 x i8]* @.str, i32 0, i32 0), i32 %3) "target-cpu"="mips16" "target-features"="+mips16,+o32"
+  %call3 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([5 x i8], [5 x i8]* @.str, i32 0, i32 0), i32 %3) "target-cpu"="mips16" "target-features"="+mips16,+o32"
   ret i32 0
 }
 
diff --git a/test/CodeGen/Mips/sh1.ll b/test/CodeGen/Mips/sh1.ll
index 1ab7779..3f70b9b 100644
--- a/test/CodeGen/Mips/sh1.ll
+++ b/test/CodeGen/Mips/sh1.ll
@@ -13,7 +13,7 @@ entry:
   %2 = load i16, i16* @s, align 2
   %conv1 = sext i16 %2 to i32
 ; 16:	sh	${{[0-9]+}}, 0(${{[0-9]+}})
-  %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([9 x i8], [9 x i8]* @.str, i32 0, i32 0), i32 %1, i32 %conv1)
+  %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([9 x i8], [9 x i8]* @.str, i32 0, i32 0), i32 %1, i32 %conv1)
   ret i32 0
 }
 
diff --git a/test/CodeGen/Mips/sll1.ll b/test/CodeGen/Mips/sll1.ll
index 52173b8..4d35b64 100644
--- a/test/CodeGen/Mips/sll1.ll
+++ b/test/CodeGen/Mips/sll1.ll
@@ -12,7 +12,7 @@ entry:
 ; 16:	sll	${{[0-9]+}}, ${{[0-9]+}}, {{[0-9]+}}
   store i32 %shl, i32* @j, align 4
   %1 = load i32, i32* @j, align 4
-  %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([5 x i8], [5 x i8]* @.str, i32 0, i32 0), i32 %1)
+  %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([5 x i8], [5 x i8]* @.str, i32 0, i32 0), i32 %1)
   ret i32 0
 }
 
diff --git a/test/CodeGen/Mips/sll2.ll b/test/CodeGen/Mips/sll2.ll
index 0e7194e..dc2236b 100644
--- a/test/CodeGen/Mips/sll2.ll
+++ b/test/CodeGen/Mips/sll2.ll
@@ -12,7 +12,7 @@ entry:
 ; 16:	sllv	${{[0-9]+}}, ${{[0-9]+}}
   store i32 %shl, i32* @i, align 4
   %2 = load i32, i32* @j, align 4
-  %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([5 x i8], [5 x i8]* @.str, i32 0, i32 0), i32 %2)
+  %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([5 x i8], [5 x i8]* @.str, i32 0, i32 0), i32 %2)
   ret i32 0
 }
 
diff --git a/test/CodeGen/Mips/sra1.ll b/test/CodeGen/Mips/sra1.ll
index ecaba2c..1c7d417 100644
--- a/test/CodeGen/Mips/sra1.ll
+++ b/test/CodeGen/Mips/sra1.ll
@@ -8,7 +8,7 @@ entry:
   %0 = load i32, i32* @i, align 4
   %shr = ashr i32 %0, 3
 ; 16:	sra	${{[0-9]+}}, ${{[0-9]+}}, {{[0-9]+}}
-  %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([5 x i8], [5 x i8]* @.str, i32 0, i32 0), i32 %shr)
+  %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([5 x i8], [5 x i8]* @.str, i32 0, i32 0), i32 %shr)
   ret i32 0
 }
 
diff --git a/test/CodeGen/Mips/sra2.ll b/test/CodeGen/Mips/sra2.ll
index d5fac8d..771d0f4 100644
--- a/test/CodeGen/Mips/sra2.ll
+++ b/test/CodeGen/Mips/sra2.ll
@@ -10,7 +10,7 @@ entry:
   %1 = load i32, i32* @j, align 4
   %shr = ashr i32 %0, %1
 ; 16:	srav	${{[0-9]+}}, ${{[0-9]+}}
-  %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([5 x i8], [5 x i8]* @.str, i32 0, i32 0), i32 %shr)
+  %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([5 x i8], [5 x i8]* @.str, i32 0, i32 0), i32 %shr)
   ret i32 0
 }
 
diff --git a/test/CodeGen/Mips/srl1.ll b/test/CodeGen/Mips/srl1.ll
index dc4d88a..a748eab 100644
--- a/test/CodeGen/Mips/srl1.ll
+++ b/test/CodeGen/Mips/srl1.ll
@@ -11,7 +11,7 @@ entry:
 ; 16:	srl	${{[0-9]+}}, ${{[0-9]+}}, {{[0-9]+}}
   store i32 %shr, i32* @j, align 4
   %1 = load i32, i32* @j, align 4
-  %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([5 x i8], [5 x i8]* @.str, i32 0, i32 0), i32 %1)
+  %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([5 x i8], [5 x i8]* @.str, i32 0, i32 0), i32 %1)
   ret i32 0
 }
 
diff --git a/test/CodeGen/Mips/srl2.ll b/test/CodeGen/Mips/srl2.ll
index 8fe088c..6e338b3 100644
--- a/test/CodeGen/Mips/srl2.ll
+++ b/test/CodeGen/Mips/srl2.ll
@@ -13,7 +13,7 @@ entry:
 ; 16:	srlv	${{[0-9]+}}, ${{[0-9]+}}
   store i32 %shr, i32* @j, align 4
   %2 = load i32, i32* @j, align 4
-  %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([5 x i8], [5 x i8]* @.str, i32 0, i32 0), i32 %2)
+  %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([5 x i8], [5 x i8]* @.str, i32 0, i32 0), i32 %2)
   ret i32 0
 }
 
diff --git a/test/CodeGen/Mips/stchar.ll b/test/CodeGen/Mips/stchar.ll
index ad58794..6bc4889 100644
--- a/test/CodeGen/Mips/stchar.ll
+++ b/test/CodeGen/Mips/stchar.ll
@@ -9,7 +9,7 @@ define void @p1(i16 signext %s, i8 signext %c) nounwind {
 entry:
   %conv = sext i16 %s to i32
   %conv1 = sext i8 %c to i32
-  %call = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([9 x i8], [9 x i8]* @.str, i32 0, i32 0), i32 %conv, i32 %conv1) nounwind
+  %call = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([9 x i8], [9 x i8]* @.str, i32 0, i32 0), i32 %conv, i32 %conv1) nounwind
   ret void
 }
 
@@ -23,7 +23,7 @@ entry:
   %3 = load i8, i8* %2, align 1
   %conv.i = sext i16 %1 to i32
   %conv1.i = sext i8 %3 to i32
-  %call.i = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([9 x i8], [9 x i8]* @.str, i32 0, i32 0), i32 %conv.i, i32 %conv1.i) nounwind
+  %call.i = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([9 x i8], [9 x i8]* @.str, i32 0, i32 0), i32 %conv.i, i32 %conv1.i) nounwind
   %4 = load i16*, i16** @sp, align 4
   store i16 32, i16* %4, align 2
   %5 = load i8*, i8** @cp, align 4
@@ -39,7 +39,7 @@ entry:
   store i8 99, i8* %c, align 4
   store i16* %s, i16** @sp, align 4
   store i8* %c, i8** @cp, align 4
-  %call.i.i = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([9 x i8], [9 x i8]* @.str, i32 0, i32 0), i32 16, i32 99) nounwind
+  %call.i.i = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([9 x i8], [9 x i8]* @.str, i32 0, i32 0), i32 16, i32 99) nounwind
   %0 = load i16*, i16** @sp, align 4
   store i16 32, i16* %0, align 2
   %1 = load i8*, i8** @cp, align 4
@@ -48,7 +48,7 @@ entry:
   %3 = load i8, i8* %c, align 4
   %conv.i = sext i16 %2 to i32
   %conv1.i = sext i8 %3 to i32
-  %call.i = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([9 x i8], [9 x i8]* @.str, i32 0, i32 0), i32 %conv.i, i32 %conv1.i) nounwind
+  %call.i = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([9 x i8], [9 x i8]* @.str, i32 0, i32 0), i32 %conv.i, i32 %conv1.i) nounwind
   ret void
 ; 16_b-LABEL: test:
 ; 16_h-LABEL: test:
@@ -69,7 +69,7 @@ entry:
   store i8 99, i8* %c.i, align 4
   store i16* %s.i, i16** @sp, align 4
   store i8* %c.i, i8** @cp, align 4
-  %call.i.i.i = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([9 x i8], [9 x i8]* @.str, i32 0, i32 0), i32 16, i32 99) nounwind
+  %call.i.i.i = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([9 x i8], [9 x i8]* @.str, i32 0, i32 0), i32 16, i32 99) nounwind
   %1 = load i16*, i16** @sp, align 4
   store i16 32, i16* %1, align 2
   %2 = load i8*, i8** @cp, align 4
@@ -78,7 +78,7 @@ entry:
   %4 = load i8, i8* %c.i, align 4
   %conv.i.i = sext i16 %3 to i32
   %conv1.i.i = sext i8 %4 to i32
-  %call.i.i = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([9 x i8], [9 x i8]* @.str, i32 0, i32 0), i32 %conv.i.i, i32 %conv1.i.i) nounwind
+  %call.i.i = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([9 x i8], [9 x i8]* @.str, i32 0, i32 0), i32 %conv.i.i, i32 %conv1.i.i) nounwind
   call void @llvm.lifetime.end(i64 -1, i8* %0) nounwind
   call void @llvm.lifetime.end(i64 -1, i8* %c.i) nounwind
   ret i32 0
diff --git a/test/CodeGen/Mips/stldst.ll b/test/CodeGen/Mips/stldst.ll
index 63e1e14..4eef5ec 100644
--- a/test/CodeGen/Mips/stldst.ll
+++ b/test/CodeGen/Mips/stldst.ll
@@ -29,8 +29,8 @@ entry:
   %7 = load i32, i32* @rrrr, align 4
   %add6 = add nsw i32 %7, 6
 
-  %call = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([32 x i8], [32 x i8]* @.str, i32 0, i32 0), i32 %sub5, i32 %add6, i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7) nounwind
-  %call7 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([32 x i8], [32 x i8]* @.str, i32 0, i32 0), i32 %0, i32 %1, i32 %add, i32 %add1, i32 %sub, i32 %add2, i32 %add3, i32 %sub4, i32 %sub5, i32 %add6) nounwind
+  %call = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([32 x i8], [32 x i8]* @.str, i32 0, i32 0), i32 %sub5, i32 %add6, i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7) nounwind
+  %call7 = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([32 x i8], [32 x i8]* @.str, i32 0, i32 0), i32 %0, i32 %1, i32 %add, i32 %add1, i32 %sub, i32 %add2, i32 %add3, i32 %sub4, i32 %sub5, i32 %add6) nounwind
   ret i32 0
 }
 ; 16:	sw	${{[0-9]+}}, {{[0-9]+}} ( $sp );         # 4-byte Folded Spill
diff --git a/test/CodeGen/Mips/sub1.ll b/test/CodeGen/Mips/sub1.ll
index 4c91252..636ab8f 100644
--- a/test/CodeGen/Mips/sub1.ll
+++ b/test/CodeGen/Mips/sub1.ll
@@ -8,7 +8,7 @@ entry:
   %0 = load i32, i32* @i, align 4
   %sub = sub nsw i32 %0, 5
 ; 16:	addiu	${{[0-9]+}}, -{{[0-9]+}}
-  %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i32 %sub)
+  %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i32 %sub)
   ret i32 0
 }
 
diff --git a/test/CodeGen/Mips/sub2.ll b/test/CodeGen/Mips/sub2.ll
index e978d45..a97f5e9 100644
--- a/test/CodeGen/Mips/sub2.ll
+++ b/test/CodeGen/Mips/sub2.ll
@@ -10,7 +10,7 @@ entry:
   %1 = load i32, i32* @i, align 4
   %sub = sub nsw i32 %0, %1
 ; 16:	subu	${{[0-9]+}}, ${{[0-9]+}}, ${{[0-9]+}}
-  %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i32 %sub)
+  %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i32 %sub)
   ret i32 0
 }
 
diff --git a/test/CodeGen/Mips/tailcall.ll b/test/CodeGen/Mips/tailcall.ll
index 01b2d73..6a0d64b 100644
--- a/test/CodeGen/Mips/tailcall.ll
+++ b/test/CodeGen/Mips/tailcall.ll
@@ -136,7 +136,7 @@ entry:
 ; PIC16: jalrc
 ; PIC16: .end caller8_1
 
-  %call = tail call i32 (i32, ...)* @callee8(i32 2, i32 1) nounwind
+  %call = tail call i32 (i32, ...) @callee8(i32 2, i32 1) nounwind
   ret i32 %call
 }
 
@@ -239,7 +239,7 @@ entry:
 ; PIC16: .ent caller13
 ; PIC16: jalrc
 
-  %call = tail call i32 (i32, ...)* @callee13(i32 1, i32 2) nounwind
+  %call = tail call i32 (i32, ...) @callee13(i32 1, i32 2) nounwind
   ret i32 %call
 }
 
diff --git a/test/CodeGen/Mips/xor1.ll b/test/CodeGen/Mips/xor1.ll
index 4fcfc45..dd51f14 100644
--- a/test/CodeGen/Mips/xor1.ll
+++ b/test/CodeGen/Mips/xor1.ll
@@ -10,7 +10,7 @@ entry:
   %1 = load i32, i32* @y, align 4
   %xor = xor i32 %0, %1
 ; 16:	xor	${{[0-9]+}}, ${{[0-9]+}}
-  %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([7 x i8], [7 x i8]* @.str, i32 0, i32 0), i32 %xor)
+  %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([7 x i8], [7 x i8]* @.str, i32 0, i32 0), i32 %xor)
   ret i32 0
 }
 
diff --git a/test/CodeGen/NVPTX/ptx-version-30.ll b/test/CodeGen/NVPTX/ptx-version-30.ll
deleted file mode 100644
index 0422b01..0000000
--- a/test/CodeGen/NVPTX/ptx-version-30.ll
+++ /dev/null
@@ -1,6 +0,0 @@
-; RUN: llc < %s -march=nvptx -mcpu=sm_20 -mattr=ptx30 | FileCheck %s
-; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 -mattr=ptx30 | FileCheck %s
-
-
-; CHECK: .version 3.0
-
diff --git a/test/CodeGen/NVPTX/ptx-version-31.ll b/test/CodeGen/NVPTX/ptx-version-31.ll
deleted file mode 100644
index d6e5730..0000000
--- a/test/CodeGen/NVPTX/ptx-version-31.ll
+++ /dev/null
@@ -1,6 +0,0 @@
-; RUN: llc < %s -march=nvptx -mcpu=sm_20 -mattr=ptx31 | FileCheck %s
-; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 -mattr=ptx31 | FileCheck %s
-
-
-; CHECK: .version 3.1
-
diff --git a/test/CodeGen/NVPTX/sm-version-30.ll b/test/CodeGen/NVPTX/sm-version-30.ll
index 692b49a..4f35cf0 100644
--- a/test/CodeGen/NVPTX/sm-version-30.ll
+++ b/test/CodeGen/NVPTX/sm-version-30.ll
@@ -2,5 +2,6 @@
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_30 | FileCheck %s
 
 
+; CHECK: .version 3.2
 ; CHECK: .target sm_30
 
diff --git a/test/CodeGen/NVPTX/sm-version-32.ll b/test/CodeGen/NVPTX/sm-version-32.ll
new file mode 100644
index 0000000..d6a5082
--- /dev/null
+++ b/test/CodeGen/NVPTX/sm-version-32.ll
@@ -0,0 +1,7 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_32 | FileCheck %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_32 | FileCheck %s
+
+
+; CHECK: .version 4.0
+; CHECK: .target sm_32
+
diff --git a/test/CodeGen/NVPTX/sm-version-35.ll b/test/CodeGen/NVPTX/sm-version-35.ll
index 25368a0..8456c66 100644
--- a/test/CodeGen/NVPTX/sm-version-35.ll
+++ b/test/CodeGen/NVPTX/sm-version-35.ll
@@ -2,5 +2,6 @@
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_35 | FileCheck %s
 
 
+; CHECK: .version 3.2
 ; CHECK: .target sm_35
 
diff --git a/test/CodeGen/NVPTX/sm-version-37.ll b/test/CodeGen/NVPTX/sm-version-37.ll
new file mode 100644
index 0000000..fd51a9c
--- /dev/null
+++ b/test/CodeGen/NVPTX/sm-version-37.ll
@@ -0,0 +1,7 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_37 | FileCheck %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_37 | FileCheck %s
+
+
+; CHECK: .version 4.1
+; CHECK: .target sm_37
+
diff --git a/test/CodeGen/NVPTX/sm-version-50.ll b/test/CodeGen/NVPTX/sm-version-50.ll
new file mode 100644
index 0000000..374c6ea
--- /dev/null
+++ b/test/CodeGen/NVPTX/sm-version-50.ll
@@ -0,0 +1,7 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_50 | FileCheck %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_50 | FileCheck %s
+
+
+; CHECK: .version 4.0
+; CHECK: .target sm_50
+
diff --git a/test/CodeGen/NVPTX/sm-version-52.ll b/test/CodeGen/NVPTX/sm-version-52.ll
new file mode 100644
index 0000000..18881b2
--- /dev/null
+++ b/test/CodeGen/NVPTX/sm-version-52.ll
@@ -0,0 +1,7 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_52 | FileCheck %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_52 | FileCheck %s
+
+
+; CHECK: .version 4.1
+; CHECK: .target sm_52
+
diff --git a/test/CodeGen/NVPTX/sm-version-53.ll b/test/CodeGen/NVPTX/sm-version-53.ll
new file mode 100644
index 0000000..50d2dec
--- /dev/null
+++ b/test/CodeGen/NVPTX/sm-version-53.ll
@@ -0,0 +1,7 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_53 | FileCheck %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_53 | FileCheck %s
+
+
+; CHECK: .version 4.2
+; CHECK: .target sm_53
+
diff --git a/test/CodeGen/NVPTX/symbol-naming.ll b/test/CodeGen/NVPTX/symbol-naming.ll
index f8e6bf1..0f17693 100644
--- a/test/CodeGen/NVPTX/symbol-naming.ll
+++ b/test/CodeGen/NVPTX/symbol-naming.ll
@@ -24,7 +24,7 @@ target triple = "nvptx64-unknown-unknown"
 ; Function Attrs: nounwind
 define void @foo(i32 %a, float %b, i8 signext %c, i32 %e) {
 entry:
-  %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([13 x i8], [13 x i8]* @.str, i32 0, i32 0))
+  %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([13 x i8], [13 x i8]* @.str, i32 0, i32 0))
   ret void
 }
 
diff --git a/test/CodeGen/PowerPC/2006-10-13-Miscompile.ll b/test/CodeGen/PowerPC/2006-10-13-Miscompile.ll
index 002a064..5992ad4 100644
--- a/test/CodeGen/PowerPC/2006-10-13-Miscompile.ll
+++ b/test/CodeGen/PowerPC/2006-10-13-Miscompile.ll
@@ -6,7 +6,7 @@ entry:
         %tmp = icmp sgt i64 %tmp1, 2            ; <i1> [#uses=1]
         br i1 %tmp, label %UnifiedReturnBlock, label %cond_true
 cond_true:              ; preds = %entry
-        %tmp.upgrd.1 = tail call i32 (...)* @bar( )             ; <i32> [#uses=0]
+        %tmp.upgrd.1 = tail call i32 (...) @bar( )             ; <i32> [#uses=0]
         ret void
 UnifiedReturnBlock:             ; preds = %entry
         ret void
diff --git a/test/CodeGen/PowerPC/2006-10-17-brcc-miscompile.ll b/test/CodeGen/PowerPC/2006-10-17-brcc-miscompile.ll
index 3d462b4..ab5f37d 100644
--- a/test/CodeGen/PowerPC/2006-10-17-brcc-miscompile.ll
+++ b/test/CodeGen/PowerPC/2006-10-17-brcc-miscompile.ll
@@ -10,7 +10,7 @@ entry:
         %tmp = icmp eq i32 %tmp2, 0             ; <i1> [#uses=1]
         br i1 %tmp, label %UnifiedReturnBlock, label %cond_true
 cond_true:              ; preds = %entry
-        tail call i32 (...)* @bar( )            ; <i32>:0 [#uses=0]
+        tail call i32 (...) @bar( )            ; <i32>:0 [#uses=0]
         ret void
 UnifiedReturnBlock:             ; preds = %entry
         ret void
diff --git a/test/CodeGen/PowerPC/2007-02-23-lr-saved-twice.ll b/test/CodeGen/PowerPC/2007-02-23-lr-saved-twice.ll
index 6e8b5b4..5a6fbf0 100644
--- a/test/CodeGen/PowerPC/2007-02-23-lr-saved-twice.ll
+++ b/test/CodeGen/PowerPC/2007-02-23-lr-saved-twice.ll
@@ -7,7 +7,7 @@ target triple = "powerpc-apple-darwin8"
 
 define i32 @main() {
 entry:
-        %tmp = tail call i32 (i8*, ...)* @printf( i8* getelementptr ([18 x i8], [18 x i8]* @str, i32 0, i32 0) )                ; <i32> [#uses=0]
+        %tmp = tail call i32 (i8*, ...) @printf( i8* getelementptr ([18 x i8], [18 x i8]* @str, i32 0, i32 0) )                ; <i32> [#uses=0]
         ret i32 0
 }
 
diff --git a/test/CodeGen/PowerPC/2007-05-22-tailmerge-3.ll b/test/CodeGen/PowerPC/2007-05-22-tailmerge-3.ll
index bba3707..b3b7323 100644
--- a/test/CodeGen/PowerPC/2007-05-22-tailmerge-3.ll
+++ b/test/CodeGen/PowerPC/2007-05-22-tailmerge-3.ll
@@ -22,8 +22,8 @@ entry:
 	br i1 %toBool, label %cond_true, label %cond_false
 
 cond_true:		; preds = %entry
-	%tmp3 = call i32 (...)* @bar( )		; <i32> [#uses=0]
-	%tmp4 = call i32 (...)* @baz( i32 5, i32 6 )		; <i32> [#uses=0]
+	%tmp3 = call i32 (...) @bar( )		; <i32> [#uses=0]
+	%tmp4 = call i32 (...) @baz( i32 5, i32 6 )		; <i32> [#uses=0]
 	%tmp7 = load i32, i32* %q_addr		; <i32> [#uses=1]
 	%tmp8 = icmp ne i32 %tmp7, 0		; <i1> [#uses=1]
 	%tmp89 = zext i1 %tmp8 to i8		; <i8> [#uses=1]
@@ -31,8 +31,8 @@ cond_true:		; preds = %entry
 	br i1 %toBool10, label %cond_true11, label %cond_false15
 
 cond_false:		; preds = %entry
-	%tmp5 = call i32 (...)* @foo( )		; <i32> [#uses=0]
-	%tmp6 = call i32 (...)* @baz( i32 5, i32 6 )		; <i32> [#uses=0]
+	%tmp5 = call i32 (...) @foo( )		; <i32> [#uses=0]
+	%tmp6 = call i32 (...) @baz( i32 5, i32 6 )		; <i32> [#uses=0]
 	%tmp27 = load i32, i32* %q_addr		; <i32> [#uses=1]
 	%tmp28 = icmp ne i32 %tmp27, 0		; <i1> [#uses=1]
 	%tmp289 = zext i1 %tmp28 to i8		; <i8> [#uses=1]
@@ -40,17 +40,17 @@ cond_false:		; preds = %entry
 	br i1 %toBool210, label %cond_true11, label %cond_false15
 
 cond_true11:		; preds = %cond_next
-	%tmp13 = call i32 (...)* @foo( )		; <i32> [#uses=0]
-	%tmp14 = call i32 (...)* @quux( i32 3, i32 4 )		; <i32> [#uses=0]
+	%tmp13 = call i32 (...) @foo( )		; <i32> [#uses=0]
+	%tmp14 = call i32 (...) @quux( i32 3, i32 4 )		; <i32> [#uses=0]
 	br label %cond_next18
 
 cond_false15:		; preds = %cond_next
-	%tmp16 = call i32 (...)* @bar( )		; <i32> [#uses=0]
-	%tmp17 = call i32 (...)* @quux( i32 3, i32 4 )		; <i32> [#uses=0]
+	%tmp16 = call i32 (...) @bar( )		; <i32> [#uses=0]
+	%tmp17 = call i32 (...) @quux( i32 3, i32 4 )		; <i32> [#uses=0]
 	br label %cond_next18
 
 cond_next18:		; preds = %cond_false15, %cond_true11
-	%tmp19 = call i32 (...)* @bar( )		; <i32> [#uses=0]
+	%tmp19 = call i32 (...) @bar( )		; <i32> [#uses=0]
 	br label %return
 
 return:		; preds = %cond_next18
diff --git a/test/CodeGen/PowerPC/2007-09-07-LoadStoreIdxForms.ll b/test/CodeGen/PowerPC/2007-09-07-LoadStoreIdxForms.ll
index ee1e233..aae914e 100644
--- a/test/CodeGen/PowerPC/2007-09-07-LoadStoreIdxForms.ll
+++ b/test/CodeGen/PowerPC/2007-09-07-LoadStoreIdxForms.ll
@@ -9,7 +9,7 @@ entry:
         %ttype = alloca i32, align 4            ; <i32*> [#uses=1]
         %regs = alloca [1024 x %struct.__db_region], align 16           ; <[1024 x %struct.__db_region]*> [#uses=0]
         %tmp = load i32, i32* %ttype, align 4                ; <i32> [#uses=1]
-        %tmp1 = call i32 (...)* @bork( i32 %tmp )               ; <i32> [#uses=0]
+        %tmp1 = call i32 (...) @bork( i32 %tmp )               ; <i32> [#uses=0]
         ret void
 
 ; CHECK: @foo
diff --git a/test/CodeGen/PowerPC/2007-09-08-unaligned.ll b/test/CodeGen/PowerPC/2007-09-08-unaligned.ll
index 341b632..ccbadb4 100644
--- a/test/CodeGen/PowerPC/2007-09-08-unaligned.ll
+++ b/test/CodeGen/PowerPC/2007-09-08-unaligned.ll
@@ -42,7 +42,7 @@ entry:
 	%tmp4 = getelementptr <{ i8, double }>, <{ i8, double }>* @v, i32 0, i32 1		; <double*> [#uses=1]
 	%tmp5 = load double, double* %tmp4, align 1		; <double> [#uses=1]
 	%tmp6 = getelementptr [8 x i8], [8 x i8]* @.str, i32 0, i32 0		; <i8*> [#uses=1]
-	%tmp7 = call i32 (i8*, ...)* @printf( i8* %tmp6, double %tmp23, double %tmp5 )		; <i32> [#uses=0]
+	%tmp7 = call i32 (i8*, ...) @printf( i8* %tmp6, double %tmp23, double %tmp5 )		; <i32> [#uses=0]
 	br label %return
 
 return:		; preds = %entry
diff --git a/test/CodeGen/PowerPC/2007-10-21-LocalRegAllocAssert.ll b/test/CodeGen/PowerPC/2007-10-21-LocalRegAllocAssert.ll
index 09f331f..13b9be3 100644
--- a/test/CodeGen/PowerPC/2007-10-21-LocalRegAllocAssert.ll
+++ b/test/CodeGen/PowerPC/2007-10-21-LocalRegAllocAssert.ll
@@ -17,11 +17,11 @@
 define %struct.NSManagedObjectContext* @"+[ListGenerator(Private) managedObjectContextWithModelURL:storeURL:]"(%struct.objc_object* %self, %struct._message_ref_t* %_cmd, %struct.NSURL* %modelURL, %struct.NSURL* %storeURL) {
 entry:
 	%storeCoordinator = alloca %struct.NSPersistentStoreCoordinator*		; <%struct.NSPersistentStoreCoordinator**> [#uses=0]
-	%tmp29 = call %struct.objc_object* (%struct.objc_object*, %struct._message_ref_t*, ...)* null( %struct.objc_object* null, %struct._message_ref_t* @"\01L_OBJC_MESSAGE_REF_2" )		; <%struct.objc_object*> [#uses=0]
+	%tmp29 = call %struct.objc_object* (%struct.objc_object*, %struct._message_ref_t*, ...) null( %struct.objc_object* null, %struct._message_ref_t* @"\01L_OBJC_MESSAGE_REF_2" )		; <%struct.objc_object*> [#uses=0]
 	%tmp34 = load %struct.NSString*, %struct.NSString** @NSXMLStoreType, align 8		; <%struct.NSString*> [#uses=1]
 	%tmp37 = load %struct.objc_object* (%struct.objc_object*, %struct._message_ref_t*, ...)*, %struct.objc_object* (%struct.objc_object*, %struct._message_ref_t*, ...)** getelementptr (%struct._message_ref_t, %struct._message_ref_t* @"\01L_OBJC_MESSAGE_REF_5", i32 0, i32 0), align 8		; <%struct.objc_object* (%struct.objc_object*, %struct._message_ref_t*, ...)*> [#uses=1]
-	%tmp42 = call %struct.objc_object* (%struct.objc_object*, %struct._message_ref_t*, ...)* null( %struct.objc_object* null, %struct._message_ref_t* @"\01L_OBJC_MESSAGE_REF_4", i32 1 )		; <%struct.objc_object*> [#uses=1]
-	%tmp45 = call %struct.objc_object* (%struct.objc_object*, %struct._message_ref_t*, ...)* %tmp37( %struct.objc_object* null, %struct._message_ref_t* @"\01L_OBJC_MESSAGE_REF_5", %struct.objc_object* %tmp42, %struct.NSString* null )		; <%struct.objc_object*> [#uses=1]
-	%tmp48 = call %struct.objc_object* (%struct.objc_object*, %struct._message_ref_t*, ...)* null( %struct.objc_object* null, %struct._message_ref_t* @"\01L_OBJC_MESSAGE_REF_6", %struct.NSString* %tmp34, i8* null, %struct.NSURL* null, %struct.objc_object* %tmp45, %struct.NSError** null )		; <%struct.objc_object*> [#uses=0]
+	%tmp42 = call %struct.objc_object* (%struct.objc_object*, %struct._message_ref_t*, ...) null( %struct.objc_object* null, %struct._message_ref_t* @"\01L_OBJC_MESSAGE_REF_4", i32 1 )		; <%struct.objc_object*> [#uses=1]
+	%tmp45 = call %struct.objc_object* (%struct.objc_object*, %struct._message_ref_t*, ...) %tmp37( %struct.objc_object* null, %struct._message_ref_t* @"\01L_OBJC_MESSAGE_REF_5", %struct.objc_object* %tmp42, %struct.NSString* null )		; <%struct.objc_object*> [#uses=1]
+	%tmp48 = call %struct.objc_object* (%struct.objc_object*, %struct._message_ref_t*, ...) null( %struct.objc_object* null, %struct._message_ref_t* @"\01L_OBJC_MESSAGE_REF_6", %struct.NSString* %tmp34, i8* null, %struct.NSURL* null, %struct.objc_object* %tmp45, %struct.NSError** null )		; <%struct.objc_object*> [#uses=0]
 	unreachable
 }
diff --git a/test/CodeGen/PowerPC/2007-10-21-LocalRegAllocAssert2.ll b/test/CodeGen/PowerPC/2007-10-21-LocalRegAllocAssert2.ll
index b27eec4..ff5f835 100644
--- a/test/CodeGen/PowerPC/2007-10-21-LocalRegAllocAssert2.ll
+++ b/test/CodeGen/PowerPC/2007-10-21-LocalRegAllocAssert2.ll
@@ -15,11 +15,11 @@
 define %struct.NSManagedObjectContext* @"+[ListGenerator(Private) managedObjectContextWithModelURL:storeURL:]"(%struct.objc_object* %self, %struct._message_ref_t* %_cmd, %struct.NSURL* %modelURL, %struct.NSURL* %storeURL) {
 entry:
 	%tmp27 = load %struct.objc_object* (%struct.objc_object*, %struct._message_ref_t*, ...)*, %struct.objc_object* (%struct.objc_object*, %struct._message_ref_t*, ...)** getelementptr (%struct._message_ref_t, %struct._message_ref_t* @"\01L_OBJC_MESSAGE_REF_2", i32 0, i32 0), align 8		; <%struct.objc_object* (%struct.objc_object*, %struct._message_ref_t*, ...)*> [#uses=1]
-	%tmp29 = call %struct.objc_object* (%struct.objc_object*, %struct._message_ref_t*, ...)* %tmp27( %struct.objc_object* null, %struct._message_ref_t* @"\01L_OBJC_MESSAGE_REF_2" )		; <%struct.objc_object*> [#uses=0]
+	%tmp29 = call %struct.objc_object* (%struct.objc_object*, %struct._message_ref_t*, ...) %tmp27( %struct.objc_object* null, %struct._message_ref_t* @"\01L_OBJC_MESSAGE_REF_2" )		; <%struct.objc_object*> [#uses=0]
 	%tmp33 = load %struct.objc_object* (%struct.objc_object*, %struct._message_ref_t*, ...)*, %struct.objc_object* (%struct.objc_object*, %struct._message_ref_t*, ...)** getelementptr (%struct._message_ref_t, %struct._message_ref_t* @"\01L_OBJC_MESSAGE_REF_6", i32 0, i32 0), align 8		; <%struct.objc_object* (%struct.objc_object*, %struct._message_ref_t*, ...)*> [#uses=1]
 	%tmp34 = load %struct.NSString*, %struct.NSString** @NSXMLStoreType, align 8		; <%struct.NSString*> [#uses=1]
 	%tmp40 = load %struct.objc_object* (%struct.objc_object*, %struct._message_ref_t*, ...)*, %struct.objc_object* (%struct.objc_object*, %struct._message_ref_t*, ...)** getelementptr (%struct._message_ref_t, %struct._message_ref_t* @"\01L_OBJC_MESSAGE_REF_4", i32 0, i32 0), align 8		; <%struct.objc_object* (%struct.objc_object*, %struct._message_ref_t*, ...)*> [#uses=1]
-	%tmp42 = call %struct.objc_object* (%struct.objc_object*, %struct._message_ref_t*, ...)* %tmp40( %struct.objc_object* null, %struct._message_ref_t* @"\01L_OBJC_MESSAGE_REF_4", i32 1 )		; <%struct.objc_object*> [#uses=0]
-	%tmp48 = call %struct.objc_object* (%struct.objc_object*, %struct._message_ref_t*, ...)* %tmp33( %struct.objc_object* null, %struct._message_ref_t* @"\01L_OBJC_MESSAGE_REF_6", %struct.NSString* %tmp34, i8* null, %struct.NSURL* null, %struct.objc_object* null, %struct.NSError** null )		; <%struct.objc_object*> [#uses=0]
+	%tmp42 = call %struct.objc_object* (%struct.objc_object*, %struct._message_ref_t*, ...) %tmp40( %struct.objc_object* null, %struct._message_ref_t* @"\01L_OBJC_MESSAGE_REF_4", i32 1 )		; <%struct.objc_object*> [#uses=0]
+	%tmp48 = call %struct.objc_object* (%struct.objc_object*, %struct._message_ref_t*, ...) %tmp33( %struct.objc_object* null, %struct._message_ref_t* @"\01L_OBJC_MESSAGE_REF_6", %struct.NSString* %tmp34, i8* null, %struct.NSURL* null, %struct.objc_object* null, %struct.NSError** null )		; <%struct.objc_object*> [#uses=0]
 	unreachable
 }
diff --git a/test/CodeGen/PowerPC/2008-04-23-CoalescerCrash.ll b/test/CodeGen/PowerPC/2008-04-23-CoalescerCrash.ll
index 9158084..45d4399 100644
--- a/test/CodeGen/PowerPC/2008-04-23-CoalescerCrash.ll
+++ b/test/CodeGen/PowerPC/2008-04-23-CoalescerCrash.ll
@@ -39,7 +39,7 @@ bb41:		; preds = %bb31
 	ret i32 0
 
 bb68:		; preds = %bb31
-	tail call void (i8*, ...)* @IOLog( i8* getelementptr ([68 x i8], [68 x i8]* @.str34, i32 0, i32 0), i64 %tmp34, i64 0, i32 131072 ) nounwind 
+	tail call void (i8*, ...) @IOLog( i8* getelementptr ([68 x i8], [68 x i8]* @.str34, i32 0, i32 0), i64 %tmp34, i64 0, i32 131072 ) nounwind 
 	%tmp2021.i = trunc i64 %Pos.0.reg2mem.0 to i32		; <i32> [#uses=1]
 	%tmp202122.i = inttoptr i32 %tmp2021.i to i8*		; <i8*> [#uses=1]
 	tail call void @IODelay( i32 500 ) nounwind 
@@ -55,7 +55,7 @@ bb65.i:		; preds = %bb68
 	br i1 %tmp67.i, label %_Z24unlock_then_erase_sectory.exit, label %bb70.i
 
 bb70.i:		; preds = %bb65.i
-	tail call void (i8*, ...)* @IOLog( i8* getelementptr ([64 x i8], [64 x i8]* @.str19, i32 0, i32 0), i32 %tmp5455.i ) nounwind 
+	tail call void (i8*, ...) @IOLog( i8* getelementptr ([64 x i8], [64 x i8]* @.str19, i32 0, i32 0), i32 %tmp5455.i ) nounwind 
 	ret i32 0
 
 _Z24unlock_then_erase_sectory.exit:		; preds = %bb65.i
@@ -66,11 +66,11 @@ _Z24unlock_then_erase_sectory.exit:		; preds = %bb65.i
 	br i1 %tmp100, label %bb31, label %bb103
 
 bb103:		; preds = %_Z24unlock_then_erase_sectory.exit, %bb
-	tail call void (i8*, ...)* @IOLog( i8* getelementptr ([37 x i8], [37 x i8]* @.str35, i32 0, i32 0) ) nounwind 
+	tail call void (i8*, ...) @IOLog( i8* getelementptr ([37 x i8], [37 x i8]* @.str35, i32 0, i32 0) ) nounwind 
 	ret i32 0
 
 bb107:		; preds = %entry
-	tail call void (i8*, ...)* @IOLog( i8* getelementptr ([48 x i8], [48 x i8]* @.str36, i32 0, i32 0) ) nounwind 
+	tail call void (i8*, ...) @IOLog( i8* getelementptr ([48 x i8], [48 x i8]* @.str36, i32 0, i32 0) ) nounwind 
 	%tmp114115 = bitcast i8* %buffer to i16*		; <i16*> [#uses=1]
 	%tmp256 = lshr i64 %bufferSize, 1		; <i64> [#uses=1]
 	%tmp256257 = trunc i64 %tmp256 to i32		; <i32> [#uses=1]
diff --git a/test/CodeGen/PowerPC/2008-07-24-PPC64-CCBug.ll b/test/CodeGen/PowerPC/2008-07-24-PPC64-CCBug.ll
index 9de1c7f..ee3d0f4 100644
--- a/test/CodeGen/PowerPC/2008-07-24-PPC64-CCBug.ll
+++ b/test/CodeGen/PowerPC/2008-07-24-PPC64-CCBug.ll
@@ -4,7 +4,7 @@
 
 define void @llvm_static_func(i32 %a0, i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, i32 %a7, i32 %a8, i32 %a9, i32 %a10, i32 %a11, i32 %a12, i32 %a13, i32 %a14, i32 %a15) nounwind  {
 entry:
-	tail call i32 (i8*, ...)* @printf( i8* getelementptr ([4 x i8], [4 x i8]* @"\01LC", i32 0, i64 0), i32 %a8 ) nounwind 		; <i32>:0 [#uses=0]
+	tail call i32 (i8*, ...) @printf( i8* getelementptr ([4 x i8], [4 x i8]* @"\01LC", i32 0, i64 0), i32 %a8 ) nounwind 		; <i32>:0 [#uses=0]
 	ret void
 }
 
diff --git a/test/CodeGen/PowerPC/2008-09-12-CoalescerBug.ll b/test/CodeGen/PowerPC/2008-09-12-CoalescerBug.ll
index d98080b..b107600 100644
--- a/test/CodeGen/PowerPC/2008-09-12-CoalescerBug.ll
+++ b/test/CodeGen/PowerPC/2008-09-12-CoalescerBug.ll
@@ -169,19 +169,19 @@ bb2315:		; preds = %bb2295
 
 bb2317:		; preds = %bb2315
 	%115 = load i64, i64* %2, align 16		; <i64> [#uses=1]
-	%116 = call i32 (...)* @_u16a_cm( i64 %115, %struct.xx_t* %159, double 0.000000e+00, double 1.047551e+06 ) nounwind		; <i32> [#uses=1]
+	%116 = call i32 (...) @_u16a_cm( i64 %115, %struct.xx_t* %159, double 0.000000e+00, double 1.047551e+06 ) nounwind		; <i32> [#uses=1]
 	%117 = sext i32 %116 to i64		; <i64> [#uses=1]
 	store i64 %117, i64* %2, align 16
 	%118 = load i64, i64* %3, align 8		; <i64> [#uses=1]
-	%119 = call i32 (...)* @_u16a_cm( i64 %118, %struct.xx_t* %159, double 0.000000e+00, double 1.047551e+06 ) nounwind		; <i32> [#uses=1]
+	%119 = call i32 (...) @_u16a_cm( i64 %118, %struct.xx_t* %159, double 0.000000e+00, double 1.047551e+06 ) nounwind		; <i32> [#uses=1]
 	%120 = sext i32 %119 to i64		; <i64> [#uses=1]
 	store i64 %120, i64* %3, align 8
 	%121 = load i64, i64* %4, align 16		; <i64> [#uses=1]
-	%122 = call i32 (...)* @_u16a_cm( i64 %121, %struct.xx_t* %159, double 0.000000e+00, double 1.047551e+06 ) nounwind		; <i32> [#uses=1]
+	%122 = call i32 (...) @_u16a_cm( i64 %121, %struct.xx_t* %159, double 0.000000e+00, double 1.047551e+06 ) nounwind		; <i32> [#uses=1]
 	%123 = sext i32 %122 to i64		; <i64> [#uses=1]
 	store i64 %123, i64* %4, align 16
 	%124 = load i64, i64* %5, align 8		; <i64> [#uses=1]
-	%125 = call i32 (...)* @_u16a_cm( i64 %124, %struct.xx_t* %159, double 0.000000e+00, double 1.047551e+06 ) nounwind		; <i32> [#uses=0]
+	%125 = call i32 (...) @_u16a_cm( i64 %124, %struct.xx_t* %159, double 0.000000e+00, double 1.047551e+06 ) nounwind		; <i32> [#uses=0]
 	unreachable
 
 bb2318:		; preds = %bb2315
@@ -190,29 +190,29 @@ bb2318:		; preds = %bb2315
 	%128 = load i64, i64* %127, align 8		; <i64> [#uses=1]
 	%129 = trunc i64 %128 to i32		; <i32> [#uses=4]
 	%130 = load i64, i64* %2, align 16		; <i64> [#uses=1]
-	%131 = call i32 (...)* @_u16_ff( i64 %130, i32 %129 ) nounwind		; <i32> [#uses=1]
+	%131 = call i32 (...) @_u16_ff( i64 %130, i32 %129 ) nounwind		; <i32> [#uses=1]
 	%132 = sext i32 %131 to i64		; <i64> [#uses=1]
 	store i64 %132, i64* %2, align 16
 	%133 = load i64, i64* %3, align 8		; <i64> [#uses=1]
-	%134 = call i32 (...)* @_u16_ff( i64 %133, i32 %129 ) nounwind		; <i32> [#uses=1]
+	%134 = call i32 (...) @_u16_ff( i64 %133, i32 %129 ) nounwind		; <i32> [#uses=1]
 	%135 = sext i32 %134 to i64		; <i64> [#uses=1]
 	store i64 %135, i64* %3, align 8
 	%136 = load i64, i64* %4, align 16		; <i64> [#uses=1]
-	%137 = call i32 (...)* @_u16_ff( i64 %136, i32 %129 ) nounwind		; <i32> [#uses=1]
+	%137 = call i32 (...) @_u16_ff( i64 %136, i32 %129 ) nounwind		; <i32> [#uses=1]
 	%138 = sext i32 %137 to i64		; <i64> [#uses=1]
 	store i64 %138, i64* %4, align 16
 	%139 = load i64, i64* %5, align 8		; <i64> [#uses=1]
-	%140 = call i32 (...)* @_u16_ff( i64 %139, i32 %129 ) nounwind		; <i32> [#uses=0]
+	%140 = call i32 (...) @_u16_ff( i64 %139, i32 %129 ) nounwind		; <i32> [#uses=0]
 	unreachable
 
 bb2319:		; preds = %bb2326
 	%141 = getelementptr %struct.CGLSI, %struct.CGLSI* %src, i32 %indvar5021, i32 2		; <i8**> [#uses=1]
 	%142 = load i8*, i8** %141, align 4		; <i8*> [#uses=4]
 	%143 = getelementptr i8, i8* %142, i32 0		; <i8*> [#uses=1]
-	%144 = call i32 (...)* @_u16_sf32( double 0.000000e+00, double 6.553500e+04, double 5.000000e-01, i8* %143 ) nounwind		; <i32> [#uses=1]
+	%144 = call i32 (...) @_u16_sf32( double 0.000000e+00, double 6.553500e+04, double 5.000000e-01, i8* %143 ) nounwind		; <i32> [#uses=1]
 	%145 = sext i32 %144 to i64		; <i64> [#uses=2]
 	%146 = getelementptr i8, i8* %142, i32 0		; <i8*> [#uses=1]
-	%147 = call i32 (...)* @_u16_sf32( double 0.000000e+00, double 6.553500e+04, double 5.000000e-01, i8* %146 ) nounwind		; <i32> [#uses=1]
+	%147 = call i32 (...) @_u16_sf32( double 0.000000e+00, double 6.553500e+04, double 5.000000e-01, i8* %146 ) nounwind		; <i32> [#uses=1]
 	%148 = sext i32 %147 to i64		; <i64> [#uses=2]
 	%149 = shl i64 %145, 48		; <i64> [#uses=0]
 	%150 = shl i64 %148, 32		; <i64> [#uses=1]
@@ -220,10 +220,10 @@ bb2319:		; preds = %bb2326
 	store i64 %145, i64* %2, align 16
 	store i64 %148, i64* %3, align 8
 	%152 = getelementptr i8, i8* %142, i32 0		; <i8*> [#uses=1]
-	%153 = call i32 (...)* @_u16_sf32( double 0.000000e+00, double 6.553500e+04, double 5.000000e-01, i8* %152 ) nounwind		; <i32> [#uses=1]
+	%153 = call i32 (...) @_u16_sf32( double 0.000000e+00, double 6.553500e+04, double 5.000000e-01, i8* %152 ) nounwind		; <i32> [#uses=1]
 	%154 = sext i32 %153 to i64		; <i64> [#uses=0]
 	%155 = getelementptr i8, i8* %142, i32 0		; <i8*> [#uses=1]
-	%156 = call i32 (...)* @_u16_sf32( double 0.000000e+00, double 6.553500e+04, double 5.000000e-01, i8* %155 ) nounwind		; <i32> [#uses=0]
+	%156 = call i32 (...) @_u16_sf32( double 0.000000e+00, double 6.553500e+04, double 5.000000e-01, i8* %155 ) nounwind		; <i32> [#uses=0]
 	unreachable
 
 bb2325:		; preds = %bb2326, %bb2295
diff --git a/test/CodeGen/PowerPC/2010-03-09-indirect-call.ll b/test/CodeGen/PowerPC/2010-03-09-indirect-call.ll
index d4972a9..0599b74 100644
--- a/test/CodeGen/PowerPC/2010-03-09-indirect-call.ll
+++ b/test/CodeGen/PowerPC/2010-03-09-indirect-call.ll
@@ -12,7 +12,7 @@ entry:
 ; CHECK: mtctr r12
 ; CHECK: bctrl
   %0 = load void (...)*, void (...)** @p, align 4              ; <void (...)*> [#uses=1]
-  call void (...)* %0() nounwind
+  call void (...) %0() nounwind
   br label %return
 
 return:                                           ; preds = %entry
diff --git a/test/CodeGen/PowerPC/2011-12-05-NoSpillDupCR.ll b/test/CodeGen/PowerPC/2011-12-05-NoSpillDupCR.ll
index a488e68..e592091 100644
--- a/test/CodeGen/PowerPC/2011-12-05-NoSpillDupCR.ll
+++ b/test/CodeGen/PowerPC/2011-12-05-NoSpillDupCR.ll
@@ -66,7 +66,7 @@ for.end12:                                        ; preds = %for.end.7, %for.end
   %sub14 = sub nsw i64 %call13, %call1
   %conv = sitofp i64 %sub14 to double
   %div = fdiv double %conv, 1.000000e+06
-  %call15 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([14 x i8], [14 x i8]* @.str152, i64 0, i64 0), double %div) nounwind
+  %call15 = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([14 x i8], [14 x i8]* @.str152, i64 0, i64 0), double %div) nounwind
   tail call void @check(i32 1)
   ret i32 0
 
diff --git a/test/CodeGen/PowerPC/2011-12-06-SpillAndRestoreCR.ll b/test/CodeGen/PowerPC/2011-12-06-SpillAndRestoreCR.ll
index 84c9989..9347682 100644
--- a/test/CodeGen/PowerPC/2011-12-06-SpillAndRestoreCR.ll
+++ b/test/CodeGen/PowerPC/2011-12-06-SpillAndRestoreCR.ll
@@ -204,7 +204,7 @@ for.end23:                                        ; preds = %for.end17
   %sub = sub nsw i64 %call24, %call1
   %conv25 = sitofp i64 %sub to double
   %div = fdiv double %conv25, 1.000000e+06
-  %call26 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str235, i64 0, i64 0), double %div) nounwind
+  %call26 = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str235, i64 0, i64 0), double %div) nounwind
   %add29 = fadd float %add, 1.000000e+00
   %add31 = fadd float %add29, %conv18
   %add32 = fadd float %add31, 1.000000e+00
diff --git a/test/CodeGen/PowerPC/and-branch.ll b/test/CodeGen/PowerPC/and-branch.ll
index 0484f88..1543205 100644
--- a/test/CodeGen/PowerPC/and-branch.ll
+++ b/test/CodeGen/PowerPC/and-branch.ll
@@ -7,7 +7,7 @@ entry:
         %tmp4 = and i1 %tmp3, %tmp              ; <i1> [#uses=1]
         br i1 %tmp4, label %cond_true, label %UnifiedReturnBlock
 cond_true:              ; preds = %entry
-        %tmp5 = tail call i32 (...)* @bar( )            ; <i32> [#uses=0]
+        %tmp5 = tail call i32 (...) @bar( )            ; <i32> [#uses=0]
         ret void
 UnifiedReturnBlock:             ; preds = %entry
         ret void
diff --git a/test/CodeGen/PowerPC/atomic-2.ll b/test/CodeGen/PowerPC/atomic-2.ll
index 9130921..b4b95a2 100644
--- a/test/CodeGen/PowerPC/atomic-2.ll
+++ b/test/CodeGen/PowerPC/atomic-2.ll
@@ -1,6 +1,6 @@
 ; RUN: llc < %s -march=ppc64 | FileCheck %s
-; RUN: llc < %s -march=ppc64 -mcpu=pwr7 | FileCheck %s -check-prefix=CHECK-P7U
-; RUN: llc < %s -march=ppc64 -mcpu=pwr8 | FileCheck %s -check-prefix=CHECK-P7U
+; RUN: llc < %s -march=ppc64 -mcpu=pwr7 | FileCheck %s
+; RUN: llc < %s -march=ppc64 -mcpu=pwr8 | FileCheck %s -check-prefix=CHECK-P8U
 
 define i64 @exchange_and_add(i64* %mem, i64 %val) nounwind {
 ; CHECK-LABEL: exchange_and_add:
@@ -12,17 +12,17 @@ define i64 @exchange_and_add(i64* %mem, i64 %val) nounwind {
 
 define i8 @exchange_and_add8(i8* %mem, i8 %val) nounwind {
 ; CHECK-LABEL: exchange_and_add8:
-; CHECK-P7U: lbarx
+; CHECK-P8U: lbarx
   %tmp = atomicrmw add i8* %mem, i8 %val monotonic
-; CHECK-P7U: stbcx.
+; CHECK-P8U: stbcx.
   ret i8 %tmp
 }
 
 define i16 @exchange_and_add16(i16* %mem, i16 %val) nounwind {
 ; CHECK-LABEL: exchange_and_add16:
-; CHECK-P7U: lharx
+; CHECK-P8U: lharx
   %tmp = atomicrmw add i16* %mem, i16 %val monotonic
-; CHECK-P7U: sthcx.
+; CHECK-P8U: sthcx.
   ret i16 %tmp
 }
 
@@ -38,21 +38,21 @@ define i64 @exchange_and_cmp(i64* %mem) nounwind {
 
 define i8 @exchange_and_cmp8(i8* %mem) nounwind {
 ; CHECK-LABEL: exchange_and_cmp8:
-; CHECK-P7U: lbarx
+; CHECK-P8U: lbarx
   %tmppair = cmpxchg i8* %mem, i8 0, i8 1 monotonic monotonic
   %tmp = extractvalue { i8, i1 } %tmppair, 0
-; CHECK-P7U: stbcx.
-; CHECK-P7U: stbcx.
+; CHECK-P8U: stbcx.
+; CHECK-P8U: stbcx.
   ret i8 %tmp
 }
 
 define i16 @exchange_and_cmp16(i16* %mem) nounwind {
 ; CHECK-LABEL: exchange_and_cmp16:
-; CHECK-P7U: lharx
+; CHECK-P8U: lharx
   %tmppair = cmpxchg i16* %mem, i16 0, i16 1 monotonic monotonic
   %tmp = extractvalue { i16, i1 } %tmppair, 0
-; CHECK-P7U: sthcx.
-; CHECK-P7U: sthcx.
+; CHECK-P8U: sthcx.
+; CHECK-P8U: sthcx.
   ret i16 %tmp
 }
 
@@ -66,17 +66,17 @@ define i64 @exchange(i64* %mem, i64 %val) nounwind {
 
 define i8 @exchange8(i8* %mem, i8 %val) nounwind {
 ; CHECK-LABEL: exchange8:
-; CHECK-P7U: lbarx
+; CHECK-P8U: lbarx
   %tmp = atomicrmw xchg i8* %mem, i8 1 monotonic
-; CHECK-P7U: stbcx.
+; CHECK-P8U: stbcx.
   ret i8 %tmp
 }
 
 define i16 @exchange16(i16* %mem, i16 %val) nounwind {
 ; CHECK-LABEL: exchange16:
-; CHECK-P7U: lharx
+; CHECK-P8U: lharx
   %tmp = atomicrmw xchg i16* %mem, i16 1 monotonic
-; CHECK-P7U: sthcx.
+; CHECK-P8U: sthcx.
   ret i16 %tmp
 }
 
diff --git a/test/CodeGen/PowerPC/branch-opt.ll b/test/CodeGen/PowerPC/branch-opt.ll
index dda1538..d6928dd 100644
--- a/test/CodeGen/PowerPC/branch-opt.ll
+++ b/test/CodeGen/PowerPC/branch-opt.ll
@@ -11,7 +11,7 @@ entry:
 	br i1 %tmp1.upgrd.1, label %cond_false, label %bb5
 bb:		; preds = %bb5, %bb
 	%indvar77 = phi i32 [ %indvar.next78, %bb ], [ 0, %bb5 ]		; <i32> [#uses=1]
-	%tmp2 = tail call i32 (...)* @bar( )		; <i32> [#uses=0]
+	%tmp2 = tail call i32 (...) @bar( )		; <i32> [#uses=0]
 	%indvar.next78 = add i32 %indvar77, 1		; <i32> [#uses=2]
 	%exitcond79 = icmp eq i32 %indvar.next78, %X		; <i1> [#uses=1]
 	br i1 %exitcond79, label %cond_next48, label %bb
@@ -24,7 +24,7 @@ cond_false:		; preds = %entry
 	br i1 %tmp10.upgrd.2, label %cond_false20, label %bb16
 bb12:		; preds = %bb16, %bb12
 	%indvar72 = phi i32 [ %indvar.next73, %bb12 ], [ 0, %bb16 ]		; <i32> [#uses=1]
-	%tmp13 = tail call i32 (...)* @bar( )		; <i32> [#uses=0]
+	%tmp13 = tail call i32 (...) @bar( )		; <i32> [#uses=0]
 	%indvar.next73 = add i32 %indvar72, 1		; <i32> [#uses=2]
 	%exitcond74 = icmp eq i32 %indvar.next73, %Y		; <i1> [#uses=1]
 	br i1 %exitcond74, label %cond_next48, label %bb12
@@ -37,7 +37,7 @@ cond_false20:		; preds = %cond_false
 	br i1 %tmp23.upgrd.3, label %cond_false33, label %bb29
 bb25:		; preds = %bb29, %bb25
 	%indvar67 = phi i32 [ %indvar.next68, %bb25 ], [ 0, %bb29 ]		; <i32> [#uses=1]
-	%tmp26 = tail call i32 (...)* @bar( )		; <i32> [#uses=0]
+	%tmp26 = tail call i32 (...) @bar( )		; <i32> [#uses=0]
 	%indvar.next68 = add i32 %indvar67, 1		; <i32> [#uses=2]
 	%exitcond69 = icmp eq i32 %indvar.next68, %Z		; <i1> [#uses=1]
 	br i1 %exitcond69, label %cond_next48, label %bb25
@@ -49,7 +49,7 @@ cond_false33:		; preds = %cond_false20
 	%tmp36.upgrd.4 = icmp eq i32 %tmp36, 0		; <i1> [#uses=1]
 	br i1 %tmp36.upgrd.4, label %cond_next48, label %bb42
 bb38:		; preds = %bb42
-	%tmp39 = tail call i32 (...)* @bar( )		; <i32> [#uses=0]
+	%tmp39 = tail call i32 (...) @bar( )		; <i32> [#uses=0]
 	%indvar.next = add i32 %indvar, 1		; <i32> [#uses=1]
 	br label %bb42
 bb42:		; preds = %bb38, %cond_false33
@@ -62,7 +62,7 @@ cond_next48:		; preds = %bb42, %cond_false33, %bb29, %bb25, %bb16, %bb12, %bb5,
 	%tmp50 = icmp eq i32 %W_addr.1, 0		; <i1> [#uses=1]
 	br i1 %tmp50, label %UnifiedReturnBlock, label %cond_true51
 cond_true51:		; preds = %cond_next48
-	%tmp52 = tail call i32 (...)* @bar( )		; <i32> [#uses=0]
+	%tmp52 = tail call i32 (...) @bar( )		; <i32> [#uses=0]
 	ret void
 UnifiedReturnBlock:		; preds = %cond_next48
 	ret void
diff --git a/test/CodeGen/PowerPC/cr1eq-no-extra-moves.ll b/test/CodeGen/PowerPC/cr1eq-no-extra-moves.ll
index 477cf2f..2b3ab9b 100644
--- a/test/CodeGen/PowerPC/cr1eq-no-extra-moves.ll
+++ b/test/CodeGen/PowerPC/cr1eq-no-extra-moves.ll
@@ -7,14 +7,14 @@ target triple = "powerpc-unknown-linux"
 define void @test(i32 %count) nounwind {
 entry:
 ; CHECK: crxor 6, 6, 6
-  %call = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str, i32 0, i32 0), i32 1) nounwind
+  %call = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str, i32 0, i32 0), i32 1) nounwind
   %cmp2 = icmp sgt i32 %count, 0
   br i1 %cmp2, label %for.body, label %for.end
 
 for.body:                                         ; preds = %entry, %for.body
   %i.03 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
 ; CHECK: crxor 6, 6, 6
-  %call1 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str, i32 0, i32 0), i32 1) nounwind
+  %call1 = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str, i32 0, i32 0), i32 1) nounwind
   %inc = add nsw i32 %i.03, 1
   %exitcond = icmp eq i32 %inc, %count
   br i1 %exitcond, label %for.end, label %for.body
diff --git a/test/CodeGen/PowerPC/cr1eq.ll b/test/CodeGen/PowerPC/cr1eq.ll
index 5ffc1a2..43cd454 100644
--- a/test/CodeGen/PowerPC/cr1eq.ll
+++ b/test/CodeGen/PowerPC/cr1eq.ll
@@ -9,9 +9,9 @@ target triple = "powerpc-unknown-freebsd"
 define void @foo() nounwind {
 entry:
 ; CHECK: crxor 6, 6, 6
-  %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i32 1)
+  %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i32 1)
 ; CHECK: creqv 6, 6, 6
-  %call1 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str1, i32 0, i32 0), double 1.100000e+00)
+  %call1 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str1, i32 0, i32 0), double 1.100000e+00)
   ret void
 }
 
diff --git a/test/CodeGen/PowerPC/cr_spilling.ll b/test/CodeGen/PowerPC/cr_spilling.ll
index fdbcf4b..8ac4e72 100644
--- a/test/CodeGen/PowerPC/cr_spilling.ll
+++ b/test/CodeGen/PowerPC/cr_spilling.ll
@@ -10,7 +10,7 @@ entry:
 	br i1 false, label %cond_true94, label %cond_next99
 
 cond_true94:		; preds = %entry
-	%tmp98 = call i32 (i8*, ...)* @printf(i8* getelementptr ([3 x i8], [3 x i8]* @.str242, i32 0, i32 0), i8* null)		; <i32> [#uses=0]
+	%tmp98 = call i32 (i8*, ...) @printf(i8* getelementptr ([3 x i8], [3 x i8]* @.str242, i32 0, i32 0), i8* null)		; <i32> [#uses=0]
 	%tmp20971 = icmp sgt i32 %tmp86, 0		; <i1> [#uses=1]
 	br i1 %tmp20971, label %bb101, label %bb212
 
diff --git a/test/CodeGen/PowerPC/ctrloop-sums.ll b/test/CodeGen/PowerPC/ctrloop-sums.ll
index 4d8488c..056ee34 100644
--- a/test/CodeGen/PowerPC/ctrloop-sums.ll
+++ b/test/CodeGen/PowerPC/ctrloop-sums.ll
@@ -119,7 +119,7 @@ for.body3.lr.ph.us.i:                             ; preds = %for.inc17, %for.inc
   br label %for.body3.us.i
 
 SumArray.exit:                                    ; preds = %for.inc6.us.i
-  %call20 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i64 0, i64 0), i32 100, i32 100, i32 %add.us.i) nounwind
+  %call20 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i64 0, i64 0), i32 100, i32 100, i32 %add.us.i) nounwind
   ret i32 0
 
 ; CHECK: @main
diff --git a/test/CodeGen/PowerPC/dbg.ll b/test/CodeGen/PowerPC/dbg.ll
index a64d58f..f09fc4f 100644
--- a/test/CodeGen/PowerPC/dbg.ll
+++ b/test/CodeGen/PowerPC/dbg.ll
@@ -17,7 +17,7 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnon
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!22}
 
-!0 = !MDCompileUnit(language: DW_LANG_C99, producer: "clang version 3.1", isOptimized: true, emissionKind: 0, file: !21, enums: !1, retainedTypes: !1, subprograms: !3, globals: !1, imports: !"")
+!0 = !MDCompileUnit(language: DW_LANG_C99, producer: "clang version 3.1", isOptimized: true, emissionKind: 0, file: !21, enums: !1, retainedTypes: !1, subprograms: !3, globals: !1, imports: !1)
 !1 = !{}
 !3 = !{!5}
 !5 = !MDSubprogram(name: "main", line: 1, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, file: !21, scope: null, type: !7, function: i32 (i32, i8**)* @main, variables: !13)
diff --git a/test/CodeGen/PowerPC/div-e-32.ll b/test/CodeGen/PowerPC/div-e-32.ll
new file mode 100644
index 0000000..588756b
--- /dev/null
+++ b/test/CodeGen/PowerPC/div-e-32.ll
@@ -0,0 +1,31 @@
+; RUN: llc -mtriple=powerpc-unknown-linux-gnu -mcpu=pwr7 < %s | FileCheck %s
+; RUN: llc -mtriple=powerpc-unknown-linux-gnu -mcpu=pwr8 < %s | FileCheck %s
+
+; Function Attrs: nounwind
+define signext i32 @test1() #0 {
+entry:
+  %0 = call i32 @llvm.ppc.divwe(i32 32, i32 16)
+  ret i32 %0
+; CHECK: divwe 3, 4, 3
+}
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.ppc.divwe(i32, i32) #1
+
+; Function Attrs: nounwind
+define signext i32 @test2() #0 {
+entry:
+  %0 = call i32 @llvm.ppc.divweu(i32 32, i32 16)
+  ret i32 %0
+; CHECK: divweu 3, 4, 3
+}
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.ppc.divweu(i32, i32) #1
+
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind readnone }
+
+!llvm.ident = !{!0}
+
+!0 = !{!"clang version 3.7.0 (trunk 231831) (llvm/trunk 231828:231843M)"}
diff --git a/test/CodeGen/PowerPC/div-e-all.ll b/test/CodeGen/PowerPC/div-e-all.ll
new file mode 100644
index 0000000..912deeb
--- /dev/null
+++ b/test/CodeGen/PowerPC/div-e-all.ll
@@ -0,0 +1,54 @@
+; RUN: llc -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 < %s | FileCheck %s
+; RUN: llc -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr8 < %s | FileCheck %s
+; RUN: llc -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr8 < %s | FileCheck %s
+
+; Function Attrs: nounwind
+define signext i32 @test1() #0 {
+entry:
+  %0 = call i32 @llvm.ppc.divwe(i32 32, i32 16)
+  ret i32 %0
+; CHECK: divwe 3, 4, 3
+}
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.ppc.divwe(i32, i32) #1
+
+; Function Attrs: nounwind
+define signext i32 @test2() #0 {
+entry:
+  %0 = call i32 @llvm.ppc.divweu(i32 32, i32 16)
+  ret i32 %0
+; CHECK: divweu 3, 4, 3
+}
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.ppc.divweu(i32, i32) #1
+
+; Function Attrs: nounwind
+define i64 @test3() #0 {
+entry:
+  %0 = call i64 @llvm.ppc.divde(i64 32, i64 16)
+  ret i64 %0
+; CHECK: divde 3, 4, 3
+}
+
+; Function Attrs: nounwind readnone
+declare i64 @llvm.ppc.divde(i64, i64) #1
+
+; Function Attrs: nounwind
+define i64 @test4() #0 {
+entry:
+  %0 = call i64 @llvm.ppc.divdeu(i64 32, i64 16)
+  ret i64 %0
+; CHECK: divdeu 3, 4, 3
+}
+
+; Function Attrs: nounwind readnone
+declare i64 @llvm.ppc.divdeu(i64, i64) #1
+
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind readnone }
+
+!llvm.ident = !{!0}
+
+!0 = !{!"clang version 3.7.0 (trunk 231831) (llvm/trunk 231828:231843M)"}
diff --git a/test/CodeGen/PowerPC/f32-to-i64.ll b/test/CodeGen/PowerPC/f32-to-i64.ll
new file mode 100644
index 0000000..c138188
--- /dev/null
+++ b/test/CodeGen/PowerPC/f32-to-i64.ll
@@ -0,0 +1,23 @@
+; RUN: llc < %s | FileCheck %s
+target datalayout = "E-m:e-p:32:32-i64:64-n32"
+target triple = "powerpc-unknown-unknown"
+
+; Function Attrs: nounwind
+define i64 @testullf(float %arg) #0 {
+entry:
+  %arg.addr = alloca float, align 4
+  store float %arg, float* %arg.addr, align 4
+  %0 = load float, float* %arg.addr, align 4
+  %conv = fptoui float %0 to i64
+  ret i64 %conv
+
+; CHECK-LABEL: @testullf
+; CHECK: fctiduz [[REG1:[0-9]+]], 1
+; CHECK: stfd [[REG1]], [[OFF:[0-9]+]](1)
+; CHECK-DAG: lwz 3, [[OFF]](1)
+; CHECK-DAG: lwz 4, {{[0-9]+}}(1)
+; CHECK: blr
+}
+
+attributes #0 = { nounwind "target-cpu"="a2" }
+
diff --git a/test/CodeGen/PowerPC/fast-isel-load-store-vsx.ll b/test/CodeGen/PowerPC/fast-isel-load-store-vsx.ll
index ab439cd..8a873da 100644
--- a/test/CodeGen/PowerPC/fast-isel-load-store-vsx.ll
+++ b/test/CodeGen/PowerPC/fast-isel-load-store-vsx.ll
@@ -1,15 +1,12 @@
-;; There are some known limitations in the VSX support during FastIsel 
-;; (see fast-isel-load-store.ll header). Nevertheless, we are adding some 
-;; regressions here for bugs we fix in the meantime
 ; RUN: llc < %s -O0 -fast-isel -mattr=+vsx -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 | FileCheck %s --check-prefix=ELF64VSX
 
 ;; The semantics of VSX stores for when R0 is used is different depending on
 ;; whether it is used as base or offset. If used as base, the effective
-;; address computation will use zero regardless the content of R0. If used as
-;; offset, the content will be used in the effective address. We observed that
+;; address computation will use zero regardless of the content of R0. If used as
+;; an offset the content will be used in the effective address. We observed that
 ;; for some constructors, the initialization values were being stored without
-;; any offset register being specified which was causing R0 to be used as offset
-;; in regions where it contained the value in the link register. This regression
+;; an offset register being specified which was causing R0 to be used as offset
+;; in regions where it contained the value in the link register. This test
 ;; verifies that R0 is used as base in these situations.
 
 %SomeStruct = type { double }
@@ -28,6 +25,4 @@ entry:
   %0 = load double, double* %V.addr, align 8
   store double %0, double* %Val, align 8
   ret void
- } 
-
-
+ }
diff --git a/test/CodeGen/PowerPC/fp-int-conversions-direct-moves.ll b/test/CodeGen/PowerPC/fp-int-conversions-direct-moves.ll
new file mode 100644
index 0000000..1d9b648
--- /dev/null
+++ b/test/CodeGen/PowerPC/fp-int-conversions-direct-moves.ll
@@ -0,0 +1,426 @@
+; RUN: llc -mcpu=pwr8 -mtriple=powerpc64-unknown-unknown < %s | FileCheck %s
+; RUN: llc -mcpu=pwr8 -mtriple=powerpc64le-unknown-unknown < %s | FileCheck %s
+
+; Function Attrs: nounwind
+define zeroext i8 @_Z6testcff(float %arg) {
+entry:
+  %arg.addr = alloca float, align 4
+  store float %arg, float* %arg.addr, align 4
+  %0 = load float, float* %arg.addr, align 4
+  %conv = fptoui float %0 to i8
+  ret i8 %conv
+; CHECK-LABEL: @_Z6testcff
+; CHECK: xscvdpsxws [[CONVREG01:[0-9]+]], 1
+; CHECK: mfvsrwz 3, [[CONVREG01]]
+}
+
+; Function Attrs: nounwind
+define float @_Z6testfcc(i8 zeroext %arg) {
+entry:
+  %arg.addr = alloca i8, align 1
+  store i8 %arg, i8* %arg.addr, align 1
+  %0 = load i8, i8* %arg.addr, align 1
+  %conv = uitofp i8 %0 to float
+  ret float %conv
+; CHECK-LABEL: @_Z6testfcc
+; CHECK: mtvsrwz [[MOVEREG01:[0-9]+]], 3
+; FIXME: Once we have XSCVUXDSP implemented, this will change
+; CHECK: fcfidus 1, [[MOVEREG01]]
+}
+
+; Function Attrs: nounwind
+define zeroext i8 @_Z6testcdd(double %arg) {
+entry:
+  %arg.addr = alloca double, align 8
+  store double %arg, double* %arg.addr, align 8
+  %0 = load double, double* %arg.addr, align 8
+  %conv = fptoui double %0 to i8
+  ret i8 %conv
+; CHECK-LABEL: @_Z6testcdd
+; CHECK: xscvdpsxws [[CONVREG02:[0-9]+]], 1
+; CHECK: mfvsrwz 3, [[CONVREG02]]
+}
+
+; Function Attrs: nounwind
+define double @_Z6testdcc(i8 zeroext %arg) {
+entry:
+  %arg.addr = alloca i8, align 1
+  store i8 %arg, i8* %arg.addr, align 1
+  %0 = load i8, i8* %arg.addr, align 1
+  %conv = uitofp i8 %0 to double
+  ret double %conv
+; CHECK-LABEL: @_Z6testdcc
+; CHECK: mtvsrwz [[MOVEREG02:[0-9]+]], 3
+; CHECK: xscvuxddp 1, [[MOVEREG02]]
+}
+
+; Function Attrs: nounwind
+define zeroext i8 @_Z7testucff(float %arg) {
+entry:
+  %arg.addr = alloca float, align 4
+  store float %arg, float* %arg.addr, align 4
+  %0 = load float, float* %arg.addr, align 4
+  %conv = fptoui float %0 to i8
+  ret i8 %conv
+; CHECK-LABEL: @_Z7testucff
+; CHECK: xscvdpsxws [[CONVREG03:[0-9]+]], 1
+; CHECK: mfvsrwz 3, [[CONVREG03]]
+}
+
+; Function Attrs: nounwind
+define float @_Z7testfuch(i8 zeroext %arg) {
+entry:
+  %arg.addr = alloca i8, align 1
+  store i8 %arg, i8* %arg.addr, align 1
+  %0 = load i8, i8* %arg.addr, align 1
+  %conv = uitofp i8 %0 to float
+  ret float %conv
+; CHECK-LABEL: @_Z7testfuch
+; CHECK: mtvsrwz [[MOVEREG03:[0-9]+]], 3
+; FIXME: Once we have XSCVUXDSP implemented, this will change
+; CHECK: fcfidus 1, [[MOVEREG03]]
+}
+
+; Function Attrs: nounwind
+define zeroext i8 @_Z7testucdd(double %arg) {
+entry:
+  %arg.addr = alloca double, align 8
+  store double %arg, double* %arg.addr, align 8
+  %0 = load double, double* %arg.addr, align 8
+  %conv = fptoui double %0 to i8
+  ret i8 %conv
+; CHECK-LABEL: @_Z7testucdd
+; CHECK: xscvdpsxws [[CONVREG04:[0-9]+]], 1
+; CHECK: mfvsrwz 3, [[CONVREG04]]
+}
+
+; Function Attrs: nounwind
+define double @_Z7testduch(i8 zeroext %arg) {
+entry:
+  %arg.addr = alloca i8, align 1
+  store i8 %arg, i8* %arg.addr, align 1
+  %0 = load i8, i8* %arg.addr, align 1
+  %conv = uitofp i8 %0 to double
+  ret double %conv
+; CHECK-LABEL: @_Z7testduch
+; CHECK: mtvsrwz [[MOVEREG04:[0-9]+]], 3
+; CHECK: xscvuxddp 1, [[MOVEREG04]]
+}
+
+; Function Attrs: nounwind
+define signext i16 @_Z6testsff(float %arg) {
+entry:
+  %arg.addr = alloca float, align 4
+  store float %arg, float* %arg.addr, align 4
+  %0 = load float, float* %arg.addr, align 4
+  %conv = fptosi float %0 to i16
+  ret i16 %conv
+; CHECK-LABEL: @_Z6testsff
+; CHECK: xscvdpsxws [[CONVREG05:[0-9]+]], 1
+; CHECK: mfvsrwz 3, [[CONVREG05]]
+}
+
+; Function Attrs: nounwind
+define float @_Z6testfss(i16 signext %arg) {
+entry:
+  %arg.addr = alloca i16, align 2
+  store i16 %arg, i16* %arg.addr, align 2
+  %0 = load i16, i16* %arg.addr, align 2
+  %conv = sitofp i16 %0 to float
+  ret float %conv
+; CHECK-LABEL: @_Z6testfss
+; CHECK: mtvsrwa [[MOVEREG05:[0-9]+]], 3
+; FIXME: Once we have XSCVSXDSP implemented, this will change
+; CHECK: fcfids 1, [[MOVEREG05]]
+}
+
+; Function Attrs: nounwind
+define signext i16 @_Z6testsdd(double %arg) {
+entry:
+  %arg.addr = alloca double, align 8
+  store double %arg, double* %arg.addr, align 8
+  %0 = load double, double* %arg.addr, align 8
+  %conv = fptosi double %0 to i16
+  ret i16 %conv
+; CHECK-LABEL: @_Z6testsdd
+; CHECK: xscvdpsxws [[CONVREG06:[0-9]+]], 1
+; CHECK: mfvsrwz 3, [[CONVREG06]]
+}
+
+; Function Attrs: nounwind
+define double @_Z6testdss(i16 signext %arg) {
+entry:
+  %arg.addr = alloca i16, align 2
+  store i16 %arg, i16* %arg.addr, align 2
+  %0 = load i16, i16* %arg.addr, align 2
+  %conv = sitofp i16 %0 to double
+  ret double %conv
+; CHECK-LABEL: @_Z6testdss
+; CHECK: mtvsrwa [[MOVEREG06:[0-9]+]], 3
+; CHECK: xscvsxddp 1, [[MOVEREG06]]
+}
+
+; Function Attrs: nounwind
+define zeroext i16 @_Z7testusff(float %arg) {
+entry:
+  %arg.addr = alloca float, align 4
+  store float %arg, float* %arg.addr, align 4
+  %0 = load float, float* %arg.addr, align 4
+  %conv = fptoui float %0 to i16
+  ret i16 %conv
+; CHECK-LABEL: @_Z7testusff
+; CHECK: xscvdpsxws [[CONVREG07:[0-9]+]], 1
+; CHECK: mfvsrwz 3, [[CONVREG07]]
+}
+
+; Function Attrs: nounwind
+define float @_Z7testfust(i16 zeroext %arg) {
+entry:
+  %arg.addr = alloca i16, align 2
+  store i16 %arg, i16* %arg.addr, align 2
+  %0 = load i16, i16* %arg.addr, align 2
+  %conv = uitofp i16 %0 to float
+  ret float %conv
+; CHECK-LABEL: @_Z7testfust
+; CHECK: mtvsrwz [[MOVEREG07:[0-9]+]], 3
+; FIXME: Once we have XSCVUXDSP implemented, this will change
+; CHECK: fcfidus 1, [[MOVEREG07]]
+}
+
+; Function Attrs: nounwind
+define zeroext i16 @_Z7testusdd(double %arg) {
+entry:
+  %arg.addr = alloca double, align 8
+  store double %arg, double* %arg.addr, align 8
+  %0 = load double, double* %arg.addr, align 8
+  %conv = fptoui double %0 to i16
+  ret i16 %conv
+; CHECK-LABEL: @_Z7testusdd
+; CHECK: xscvdpsxws [[CONVREG08:[0-9]+]], 1
+; CHECK: mfvsrwz 3, [[CONVREG08]]
+}
+
+; Function Attrs: nounwind
+define double @_Z7testdust(i16 zeroext %arg) {
+entry:
+  %arg.addr = alloca i16, align 2
+  store i16 %arg, i16* %arg.addr, align 2
+  %0 = load i16, i16* %arg.addr, align 2
+  %conv = uitofp i16 %0 to double
+  ret double %conv
+; CHECK-LABEL: @_Z7testdust
+; CHECK: mtvsrwz [[MOVEREG08:[0-9]+]], 3
+; CHECK: xscvuxddp 1, [[MOVEREG08]]
+}
+
+; Function Attrs: nounwind
+define signext i32 @_Z6testiff(float %arg) {
+entry:
+  %arg.addr = alloca float, align 4
+  store float %arg, float* %arg.addr, align 4
+  %0 = load float, float* %arg.addr, align 4
+  %conv = fptosi float %0 to i32
+  ret i32 %conv
+; CHECK-LABEL: @_Z6testiff
+; CHECK: xscvdpsxws [[CONVREG09:[0-9]+]], 1
+; CHECK: mfvsrwz 3, [[CONVREG09]]
+}
+
+; Function Attrs: nounwind
+define float @_Z6testfii(i32 signext %arg) {
+entry:
+  %arg.addr = alloca i32, align 4
+  store i32 %arg, i32* %arg.addr, align 4
+  %0 = load i32, i32* %arg.addr, align 4
+  %conv = sitofp i32 %0 to float
+  ret float %conv
+; CHECK-LABEL: @_Z6testfii
+; CHECK: mtvsrwa [[MOVEREG09:[0-9]+]], 3
+; FIXME: Once we have XSCVSXDSP implemented, this will change
+; CHECK: fcfids 1, [[MOVEREG09]]
+}
+
+; Function Attrs: nounwind
+define signext i32 @_Z6testidd(double %arg) {
+entry:
+  %arg.addr = alloca double, align 8
+  store double %arg, double* %arg.addr, align 8
+  %0 = load double, double* %arg.addr, align 8
+  %conv = fptosi double %0 to i32
+  ret i32 %conv
+; CHECK-LABEL: @_Z6testidd
+; CHECK: xscvdpsxws [[CONVREG10:[0-9]+]], 1
+; CHECK: mfvsrwz 3, [[CONVREG10]]
+}
+
+; Function Attrs: nounwind
+define double @_Z6testdii(i32 signext %arg) {
+entry:
+  %arg.addr = alloca i32, align 4
+  store i32 %arg, i32* %arg.addr, align 4
+  %0 = load i32, i32* %arg.addr, align 4
+  %conv = sitofp i32 %0 to double
+  ret double %conv
+; CHECK-LABEL: @_Z6testdii
+; CHECK: mtvsrwa [[MOVEREG10:[0-9]+]], 3
+; CHECK: xscvsxddp 1, [[MOVEREG10]]
+}
+
+; Function Attrs: nounwind
+define zeroext i32 @_Z7testuiff(float %arg) {
+entry:
+  %arg.addr = alloca float, align 4
+  store float %arg, float* %arg.addr, align 4
+  %0 = load float, float* %arg.addr, align 4
+  %conv = fptoui float %0 to i32
+  ret i32 %conv
+; CHECK-LABEL: @_Z7testuiff
+; CHECK: xscvdpuxws [[CONVREG11:[0-9]+]], 1
+; CHECK: mfvsrwz 3, [[CONVREG11]]
+}
+
+; Function Attrs: nounwind
+define float @_Z7testfuij(i32 zeroext %arg) {
+entry:
+  %arg.addr = alloca i32, align 4
+  store i32 %arg, i32* %arg.addr, align 4
+  %0 = load i32, i32* %arg.addr, align 4
+  %conv = uitofp i32 %0 to float
+  ret float %conv
+; CHECK-LABEL: @_Z7testfuij
+; CHECK: mtvsrwz [[MOVEREG11:[0-9]+]], 3
+; FIXME: Once we have XSCVUXDSP implemented, this will change
+; CHECK: fcfidus 1, [[MOVEREG11]]
+}
+
+; Function Attrs: nounwind
+define zeroext i32 @_Z7testuidd(double %arg) {
+entry:
+  %arg.addr = alloca double, align 8
+  store double %arg, double* %arg.addr, align 8
+  %0 = load double, double* %arg.addr, align 8
+  %conv = fptoui double %0 to i32
+  ret i32 %conv
+; CHECK-LABEL: @_Z7testuidd
+; CHECK: xscvdpuxws [[CONVREG12:[0-9]+]], 1
+; CHECK: mfvsrwz 3, [[CONVREG12]]
+}
+
+; Function Attrs: nounwind
+define double @_Z7testduij(i32 zeroext %arg) {
+entry:
+  %arg.addr = alloca i32, align 4
+  store i32 %arg, i32* %arg.addr, align 4
+  %0 = load i32, i32* %arg.addr, align 4
+  %conv = uitofp i32 %0 to double
+  ret double %conv
+; CHECK-LABEL: @_Z7testduij
+; CHECK: mtvsrwz [[MOVEREG12:[0-9]+]], 3
+; CHECK: xscvuxddp 1, [[MOVEREG12]]
+}
+
+; Function Attrs: nounwind
+define i64 @_Z7testllff(float %arg) {
+entry:
+  %arg.addr = alloca float, align 4
+  store float %arg, float* %arg.addr, align 4
+  %0 = load float, float* %arg.addr, align 4
+  %conv = fptosi float %0 to i64
+  ret i64 %conv
+; CHECK-LABEL: @_Z7testllff
+; CHECK: xscvdpsxds [[CONVREG13:[0-9]+]], 1
+; CHECK: mfvsrd 3, [[CONVREG13]]
+}
+
+; Function Attrs: nounwind
+define float @_Z7testfllx(i64 %arg) {
+entry:
+  %arg.addr = alloca i64, align 8
+  store i64 %arg, i64* %arg.addr, align 8
+  %0 = load i64, i64* %arg.addr, align 8
+  %conv = sitofp i64 %0 to float
+  ret float %conv
+; CHECK-LABEL:@_Z7testfllx
+; CHECK: mtvsrd [[MOVEREG13:[0-9]+]], 3
+; FIXME: Once we have XSCVSXDSP implemented, this will change
+; CHECK: fcfids 1, [[MOVEREG13]]
+}
+
+; Function Attrs: nounwind
+define i64 @_Z7testlldd(double %arg) {
+entry:
+  %arg.addr = alloca double, align 8
+  store double %arg, double* %arg.addr, align 8
+  %0 = load double, double* %arg.addr, align 8
+  %conv = fptosi double %0 to i64
+  ret i64 %conv
+; CHECK-LABEL: @_Z7testlldd
+; CHECK: xscvdpsxds [[CONVREG14:[0-9]+]], 1
+; CHECK: mfvsrd 3, [[CONVREG14]]
+}
+
+; Function Attrs: nounwind
+define double @_Z7testdllx(i64 %arg) {
+entry:
+  %arg.addr = alloca i64, align 8
+  store i64 %arg, i64* %arg.addr, align 8
+  %0 = load i64, i64* %arg.addr, align 8
+  %conv = sitofp i64 %0 to double
+  ret double %conv
+; CHECK-LABEL: @_Z7testdllx
+; CHECK: mtvsrd [[MOVEREG14:[0-9]+]], 3
+; CHECK: xscvsxddp 1, [[MOVEREG14]]
+}
+
+; Function Attrs: nounwind
+define i64 @_Z8testullff(float %arg) {
+entry:
+  %arg.addr = alloca float, align 4
+  store float %arg, float* %arg.addr, align 4
+  %0 = load float, float* %arg.addr, align 4
+  %conv = fptoui float %0 to i64
+  ret i64 %conv
+; CHECK-LABEL: @_Z8testullff
+; CHECK: xscvdpuxds [[CONVREG15:[0-9]+]], 1
+; CHECK: mfvsrd 3, [[CONVREG15]]
+}
+
+; Function Attrs: nounwind
+define float @_Z8testfully(i64 %arg) {
+entry:
+  %arg.addr = alloca i64, align 8
+  store i64 %arg, i64* %arg.addr, align 8
+  %0 = load i64, i64* %arg.addr, align 8
+  %conv = uitofp i64 %0 to float
+  ret float %conv
+; CHECK-LABEL: @_Z8testfully
+; CHECK: mtvsrd [[MOVEREG15:[0-9]+]], 3
+; FIXME: Once we have XSCVUXDSP implemented, this will change
+; CHECK: fcfidus 1, [[MOVEREG15]]
+}
+
+; Function Attrs: nounwind
+define i64 @_Z8testulldd(double %arg) {
+entry:
+  %arg.addr = alloca double, align 8
+  store double %arg, double* %arg.addr, align 8
+  %0 = load double, double* %arg.addr, align 8
+  %conv = fptoui double %0 to i64
+  ret i64 %conv
+; CHECK-LABEL: @_Z8testulldd
+; CHECK: xscvdpuxds [[CONVREG16:[0-9]+]], 1
+; CHECK: mfvsrd 3, [[CONVREG16]]
+}
+
+; Function Attrs: nounwind
+define double @_Z8testdully(i64 %arg) {
+entry:
+  %arg.addr = alloca i64, align 8
+  store i64 %arg, i64* %arg.addr, align 8
+  %0 = load i64, i64* %arg.addr, align 8
+  %conv = uitofp i64 %0 to double
+  ret double %conv
+; CHECK-LABEL: @_Z8testdully
+; CHECK: mtvsrd [[MOVEREG16:[0-9]+]], 3
+; CHECK: xscvuxddp 1, [[MOVEREG16]]
+}
diff --git a/test/CodeGen/PowerPC/loop-data-prefetch-inner.ll b/test/CodeGen/PowerPC/loop-data-prefetch-inner.ll
new file mode 100644
index 0000000..adcc7b9
--- /dev/null
+++ b/test/CodeGen/PowerPC/loop-data-prefetch-inner.ll
@@ -0,0 +1,66 @@
+; RUN: llc < %s | FileCheck %s
+target datalayout = "E-m:e-i64:64-n32:64"
+target triple = "powerpc64-bgq-linux"
+
+; Function Attrs: nounwind
+define void @foo(double* %x, double* nocapture readonly %y) #0 {
+entry:
+  %scevgep = getelementptr double, double* %x, i64 1599
+  %scevgep20 = getelementptr double, double* %y, i64 1599
+  br label %vector.memcheck
+
+vector.memcheck:                                  ; preds = %for.end, %entry
+  %j.015 = phi i32 [ 0, %entry ], [ %inc7, %for.end ]
+  %bound0 = icmp uge double* %scevgep20, %x
+  %bound1 = icmp uge double* %scevgep, %y
+  %memcheck.conflict = and i1 %bound0, %bound1
+  br i1 %memcheck.conflict, label %middle.block, label %vector.body
+
+vector.body:                                      ; preds = %vector.memcheck, %vector.body
+  %index = phi i64 [ %index.next, %vector.body ], [ 0, %vector.memcheck ]
+  %0 = getelementptr inbounds double, double* %y, i64 %index
+  %1 = bitcast double* %0 to <4 x double>*
+  %wide.load = load <4 x double>, <4 x double>* %1, align 8
+  %2 = fadd <4 x double> %wide.load, <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>
+  %3 = getelementptr inbounds double, double* %x, i64 %index
+  %4 = bitcast double* %3 to <4 x double>*
+  store <4 x double> %2, <4 x double>* %4, align 8
+  %index.next = add i64 %index, 4
+  %5 = icmp eq i64 %index.next, 1600
+  br i1 %5, label %middle.block, label %vector.body
+
+middle.block:                                     ; preds = %vector.body, %vector.memcheck
+  %resume.val = phi i1 [ false, %vector.memcheck ], [ true, %vector.body ]
+  %trunc.resume.val = phi i64 [ 0, %vector.memcheck ], [ 1600, %vector.body ]
+  br i1 %resume.val, label %for.end, label %for.body3
+
+for.body3:                                        ; preds = %middle.block, %for.body3
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body3 ], [ %trunc.resume.val, %middle.block ]
+  %arrayidx = getelementptr inbounds double, double* %y, i64 %indvars.iv
+  %6 = load double, double* %arrayidx, align 8
+  %add = fadd double %6, 1.000000e+00
+  %arrayidx5 = getelementptr inbounds double, double* %x, i64 %indvars.iv
+  store double %add, double* %arrayidx5, align 8
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 1600
+  br i1 %exitcond, label %for.end, label %for.body3
+
+for.end:                                          ; preds = %middle.block, %for.body3
+  tail call void @bar(double* %x) #2
+  %inc7 = add nuw nsw i32 %j.015, 1
+  %exitcond16 = icmp eq i32 %inc7, 100
+  br i1 %exitcond16, label %for.end8, label %vector.memcheck
+
+for.end8:                                         ; preds = %for.end
+  ret void
+
+; CHECK-LABEL: @foo
+; CHECK: dcbt
+}
+
+declare void @bar(double*) #1
+
+attributes #0 = { nounwind "target-cpu"="a2q" }
+attributes #1 = { "target-cpu"="a2q" }
+attributes #2 = { nounwind }
+
diff --git a/test/CodeGen/PowerPC/loop-prep-all.ll b/test/CodeGen/PowerPC/loop-prep-all.ll
new file mode 100644
index 0000000..895daff
--- /dev/null
+++ b/test/CodeGen/PowerPC/loop-prep-all.ll
@@ -0,0 +1,48 @@
+; RUN: llc < %s | FileCheck %s
+; RUN: llc -mtriple=powerpc64-bgq-linux < %s | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-BGQ
+target datalayout = "E-m:e-i64:64-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+; Function Attrs: nounwind
+define void @foo(double* nocapture %x, double* nocapture readonly %y) #0 {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds double, double* %y, i64 %indvars.iv
+  %0 = load double, double* %arrayidx, align 8
+  %add = fadd double %0, 1.000000e+00
+  %arrayidx2 = getelementptr inbounds double, double* %x, i64 %indvars.iv
+  store double %add, double* %arrayidx2, align 8
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond19 = icmp eq i64 %indvars.iv.next, 1600
+  br i1 %exitcond19, label %for.body7, label %for.body
+
+; CHECK-LABEL: @foo
+
+; CHECK-BGQ-DAG: dcbt 4, 5
+; CHECK-DAG: lfdu [[REG1:[0-9]+]], 8({{[0-9]+}})
+; CHECK-DAG: fadd [[REG2:[0-9]+]], [[REG1]], 0
+; CHECK-DAG: stfdu [[REG2]], 8({{[0-9]+}})
+; CHECK: bdnz
+
+; CHECK: blr
+
+for.cond.cleanup6:                                ; preds = %for.body7
+  ret void
+
+for.body7:                                        ; preds = %for.body, %for.body7
+  %i3.017 = phi i32 [ %inc9, %for.body7 ], [ 0, %for.body ]
+  tail call void bitcast (void (...)* @bar to void ()*)() #2
+  %inc9 = add nuw nsw i32 %i3.017, 1
+  %exitcond = icmp eq i32 %inc9, 1024
+  br i1 %exitcond, label %for.cond.cleanup6, label %for.body7
+}
+
+declare void @bar(...) #1
+
+attributes #0 = { nounwind "target-cpu"="a2q" }
+attributes #1 = { "target-cpu"="a2q" }
+attributes #2 = { nounwind }
+
diff --git a/test/CodeGen/PowerPC/mcm-obj-2.ll b/test/CodeGen/PowerPC/mcm-obj-2.ll
index f31d852..36c5856 100644
--- a/test/CodeGen/PowerPC/mcm-obj-2.ll
+++ b/test/CodeGen/PowerPC/mcm-obj-2.ll
@@ -20,7 +20,7 @@ entry:
 ; accessing function-scoped variable si.
 ;
 ; CHECK: Relocations [
-; CHECK:   Section (2) .rela.text {
+; CHECK:   Section {{.*}} .rela.text {
 ; CHECK:     0x{{[0-9,A-F]+}} R_PPC64_TOC16_HA [[SYM2:[^ ]+]]
 ; CHECK:     0x{{[0-9,A-F]+}} R_PPC64_TOC16_LO_DS [[SYM2]]
 ; CHECK:     0x{{[0-9,A-F]+}} R_PPC64_TOC16_LO [[SYM2]]
diff --git a/test/CodeGen/PowerPC/mcm-obj.ll b/test/CodeGen/PowerPC/mcm-obj.ll
index 770ef35..46295cf 100644
--- a/test/CodeGen/PowerPC/mcm-obj.ll
+++ b/test/CodeGen/PowerPC/mcm-obj.ll
@@ -22,12 +22,12 @@ entry:
 ; accessing external variable ei.
 ;
 ; MEDIUM:      Relocations [
-; MEDIUM:        Section (2) .rela.text {
+; MEDIUM:        Section {{.*}} .rela.text {
 ; MEDIUM-NEXT:     0x{{[0-9,A-F]+}} R_PPC64_TOC16_HA [[SYM1:[^ ]+]]
 ; MEDIUM-NEXT:     0x{{[0-9,A-F]+}} R_PPC64_TOC16_LO_DS [[SYM1]]
 ;
 ; LARGE:       Relocations [
-; LARGE:         Section (2) .rela.text {
+; LARGE:         Section {{.*}} .rela.text {
 ; LARGE-NEXT:      0x{{[0-9,A-F]+}} R_PPC64_TOC16_HA [[SYM1:[^ ]+]]
 ; LARGE-NEXT:      0x{{[0-9,A-F]+}} R_PPC64_TOC16_LO_DS [[SYM1]]
 
diff --git a/test/CodeGen/PowerPC/memset-nc-le.ll b/test/CodeGen/PowerPC/memset-nc-le.ll
new file mode 100644
index 0000000..af8e9c3
--- /dev/null
+++ b/test/CodeGen/PowerPC/memset-nc-le.ll
@@ -0,0 +1,24 @@
+; RUN: llc < %s | FileCheck %s
+target datalayout = "e-m:e-i64:64-n32:64"
+target triple = "powerpc64le"
+
+; Function Attrs: nounwind
+define void @test_vsx() unnamed_addr #0 align 2 {
+entry:
+  %0 = load i32, i32* undef, align 4
+  %1 = trunc i32 %0 to i8
+  call void @llvm.memset.p0i8.i64(i8* null, i8 %1, i64 32, i32 1, i1 false)
+  ret void
+
+; CHECK-LABEL: @test_vsx
+; CHECK: stxvd2x
+; CHECK: stxvd2x
+; CHECK: blr
+}
+
+; Function Attrs: nounwind
+declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) #1
+
+attributes #0 = { nounwind "target-cpu"="pwr8" }
+attributes #1 = { nounwind }
+
diff --git a/test/CodeGen/PowerPC/memset-nc.ll b/test/CodeGen/PowerPC/memset-nc.ll
new file mode 100644
index 0000000..414a987
--- /dev/null
+++ b/test/CodeGen/PowerPC/memset-nc.ll
@@ -0,0 +1,48 @@
+; RUN: llc < %s | FileCheck %s
+; RUN: llc -O0 < %s | FileCheck %s -check-prefix=CHECK-O0
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
+target triple = "powerpc64-bgq-linux"
+
+; Function Attrs: nounwind
+define void @test_qpx() unnamed_addr #0 align 2 {
+entry:
+  %0 = load i32, i32* undef, align 4
+  %1 = trunc i32 %0 to i8
+  call void @llvm.memset.p0i8.i64(i8* null, i8 %1, i64 64, i32 32, i1 false)
+  ret void
+
+; CHECK-LABEL: @test_qpx
+; CHECK: qvstfdx
+; CHECK: qvstfdx
+; CHECK: blr
+
+; CHECK-O0-LABEL: @test_qpx
+; CHECK-O0-NOT: qvstfdx
+; CHECK-O0: blr
+}
+
+; Function Attrs: nounwind
+declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) #1
+
+; Function Attrs: nounwind
+define void @test_vsx() unnamed_addr #2 align 2 {
+entry:
+  %0 = load i32, i32* undef, align 4
+  %1 = trunc i32 %0 to i8
+  call void @llvm.memset.p0i8.i64(i8* null, i8 %1, i64 32, i32 1, i1 false)
+  ret void
+
+; CHECK-LABEL: @test_vsx
+; CHECK: stxvw4x
+; CHECK: stxvw4x
+; CHECK: blr
+
+; CHECK-O0-LABEL: @test_vsx
+; CHECK-O0-NOT: stxvw4x
+; CHECK-O0: blr
+}
+
+attributes #0 = { nounwind "target-cpu"="a2q" }
+attributes #1 = { nounwind }
+attributes #2 = { nounwind "target-cpu"="pwr7" }
+
diff --git a/test/CodeGen/PowerPC/optnone-crbits-i1-ret.ll b/test/CodeGen/PowerPC/optnone-crbits-i1-ret.ll
new file mode 100644
index 0000000..745a038
--- /dev/null
+++ b/test/CodeGen/PowerPC/optnone-crbits-i1-ret.ll
@@ -0,0 +1,37 @@
+; RUN: llc < %s | FileCheck %s
+target datalayout = "E-m:e-i64:64-n32:64"
+target triple = "powerpc64-bgq-linux"
+
+declare zeroext i1 @ri1()
+declare void @se1()
+declare void @se2()
+
+define void @test() #0 {
+entry:
+  %b = call zeroext i1 @ri1()
+  br label %next
+
+; CHECK-LABEL: @test
+; CHECK: bl ri1
+; CHECK-NEXT: nop
+; CHECK: andi. 3, 3, 1
+
+next:
+  br i1 %b, label %case1, label %case2
+
+case1:
+  call void @se1()
+  br label %end
+
+case2:
+  call void @se2()
+  br label %end
+
+end:
+  ret void
+
+; CHECK: blr
+}
+
+attributes #0 = { noinline optnone }
+
diff --git a/test/CodeGen/PowerPC/pip-inner.ll b/test/CodeGen/PowerPC/pip-inner.ll
new file mode 100644
index 0000000..930f0d3
--- /dev/null
+++ b/test/CodeGen/PowerPC/pip-inner.ll
@@ -0,0 +1,52 @@
+; RUN: llc < %s | FileCheck %s
+target datalayout = "E-m:e-i64:64-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+; Function Attrs: nounwind
+define void @foo(double* %x, double* nocapture readonly %y) #0 {
+entry:
+  br label %for.cond1.preheader
+
+for.cond1.preheader:                              ; preds = %for.end, %entry
+  %i.015 = phi i32 [ 0, %entry ], [ %inc7, %for.end ]
+  br label %for.body3
+
+for.body3:                                        ; preds = %for.body3, %for.cond1.preheader
+  %indvars.iv = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next, %for.body3 ]
+  %arrayidx = getelementptr inbounds double, double* %y, i64 %indvars.iv
+  %0 = load double, double* %arrayidx, align 8
+  %add = fadd double %0, 1.000000e+00
+  %arrayidx5 = getelementptr inbounds double, double* %x, i64 %indvars.iv
+  store double %add, double* %arrayidx5, align 8
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 16000
+  br i1 %exitcond, label %for.end, label %for.body3
+
+for.end:                                          ; preds = %for.body3
+  tail call void @bar(double* %x) #2
+  %inc7 = add nuw nsw i32 %i.015, 1
+  %exitcond16 = icmp eq i32 %inc7, 1000
+  br i1 %exitcond16, label %for.end8, label %for.cond1.preheader
+
+for.end8:                                         ; preds = %for.end
+  ret void
+
+; CHECK-LABEL: @foo
+
+; CHECK: lfdu [[REG1:[0-9]+]], 8({{[0-9]+}})
+; CHECK: fadd [[REG2:[0-9]+]], [[REG1]], {{[0-9]+}}
+; CHECK: stfdu [[REG2]], 8({{[0-9]+}})
+; CHECK: bdnz
+
+; CHECK: bl bar
+; CHECK-NEXT: nop
+
+; CHECK: blr
+}
+
+declare void @bar(double*) #1
+
+attributes #0 = { nounwind "target-cpu"="a2" }
+attributes #1 = { "target-cpu"="a2" }
+attributes #2 = { nounwind }
+
diff --git a/test/CodeGen/PowerPC/ppc32-i1-vaarg.ll b/test/CodeGen/PowerPC/ppc32-i1-vaarg.ll
index 6e0aec2..ad8ed38 100644
--- a/test/CodeGen/PowerPC/ppc32-i1-vaarg.ll
+++ b/test/CodeGen/PowerPC/ppc32-i1-vaarg.ll
@@ -5,7 +5,7 @@ target triple = "powerpc-unknown-linux-gnu"
 declare void @printf(i8*, ...)
 
 define void @main() {
-  call void (i8*, ...)* @printf(i8* undef, i1 false)
+  call void (i8*, ...) @printf(i8* undef, i1 false)
   ret void
 }
 
diff --git a/test/CodeGen/PowerPC/ppc32-pic-large.ll b/test/CodeGen/PowerPC/ppc32-pic-large.ll
index 4c85ab9..6a06945 100644
--- a/test/CodeGen/PowerPC/ppc32-pic-large.ll
+++ b/test/CodeGen/PowerPC/ppc32-pic-large.ll
@@ -6,7 +6,7 @@ declare i32 @call_foo(i32, ...)
 define i32 @foo() {
 entry:
   %0 = load i32, i32* @bar, align 4
-  %call = call i32 (i32, ...)* @call_foo(i32 %0, i32 0, i32 1, i32 2, i32 4, i32 8, i32 16, i32 32, i32 64)
+  %call = call i32 (i32, ...) @call_foo(i32 %0, i32 0, i32 1, i32 2, i32 4, i32 8, i32 16, i32 32, i32 64)
   ret i32 %0
 }
 
diff --git a/test/CodeGen/PowerPC/ppc32-pic.ll b/test/CodeGen/PowerPC/ppc32-pic.ll
index 74f9394..63f521c 100644
--- a/test/CodeGen/PowerPC/ppc32-pic.ll
+++ b/test/CodeGen/PowerPC/ppc32-pic.ll
@@ -6,7 +6,7 @@ declare i32 @call_foo(i32, ...)
 define i32 @foo() {
 entry:
   %0 = load i32, i32* @bar, align 4
-  %call = call i32 (i32, ...)* @call_foo(i32 %0, i32 0, i32 1, i32 2, i32 4, i32 8, i32 16, i32 32, i32 64)
+  %call = call i32 (i32, ...) @call_foo(i32 %0, i32 0, i32 1, i32 2, i32 4, i32 8, i32 16, i32 32, i32 64)
   ret i32 0
 }
 
diff --git a/test/CodeGen/PowerPC/ppc64-anyregcc-crash.ll b/test/CodeGen/PowerPC/ppc64-anyregcc-crash.ll
index 479c7a7..dfa6ec0 100644
--- a/test/CodeGen/PowerPC/ppc64-anyregcc-crash.ll
+++ b/test/CodeGen/PowerPC/ppc64-anyregcc-crash.ll
@@ -8,7 +8,7 @@ define i64 @anyreglimit(i64 %v1, i64 %v2, i64 %v3, i64 %v4, i64 %v5, i64 %v6, i6
                         i64 %v17, i64 %v18, i64 %v19, i64 %v20, i64 %v21, i64 %v22, i64 %v23, i64 %v24,
                         i64 %v25, i64 %v26, i64 %v27, i64 %v28, i64 %v29, i64 %v30, i64 %v31, i64 %v32) {
 entry:
-  %result = tail call anyregcc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 12, i32 15, i8* inttoptr (i64 0 to i8*), i32 32,
+  %result = tail call anyregcc i64 (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.i64(i64 12, i32 15, i8* inttoptr (i64 0 to i8*), i32 32,
                 i64 %v1, i64 %v2, i64 %v3, i64 %v4, i64 %v5, i64 %v6, i64 %v7, i64 %v8,
                 i64 %v9, i64 %v10, i64 %v11, i64 %v12, i64 %v13, i64 %v14, i64 %v15, i64 %v16,
                 i64 %v17, i64 %v18, i64 %v19, i64 %v20, i64 %v21, i64 %v22, i64 %v23, i64 %v24,
diff --git a/test/CodeGen/PowerPC/ppc64-anyregcc.ll b/test/CodeGen/PowerPC/ppc64-anyregcc.ll
index ab2feb6..66f6a2c 100644
--- a/test/CodeGen/PowerPC/ppc64-anyregcc.ll
+++ b/test/CodeGen/PowerPC/ppc64-anyregcc.ll
@@ -82,7 +82,7 @@ target triple = "powerpc64-unknown-linux-gnu"
 ; CHECK-NEXT:   .long 3
 define i64 @test() nounwind ssp uwtable {
 entry:
-  call anyregcc void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 0, i32 24, i8* null, i32 2, i32 1, i32 2, i64 3)
+  call anyregcc void (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.void(i64 0, i32 24, i8* null, i32 2, i32 1, i32 2, i64 3)
   ret i64 0
 }
 
@@ -104,7 +104,7 @@ entry:
 define i64 @property_access1(i8* %obj) nounwind ssp uwtable {
 entry:
   %f = inttoptr i64 281474417671919 to i8*
-  %ret = call anyregcc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 1, i32 24, i8* %f, i32 1, i8* %obj)
+  %ret = call anyregcc i64 (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.i64(i64 1, i32 24, i8* %f, i32 1, i8* %obj)
   ret i64 %ret
 }
 
@@ -127,7 +127,7 @@ define i64 @property_access2() nounwind ssp uwtable {
 entry:
   %obj = alloca i64, align 8
   %f = inttoptr i64 281474417671919 to i8*
-  %ret = call anyregcc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 2, i32 24, i8* %f, i32 1, i64* %obj)
+  %ret = call anyregcc i64 (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.i64(i64 2, i32 24, i8* %f, i32 1, i64* %obj)
   ret i64 %ret
 }
 
@@ -150,7 +150,7 @@ define i64 @property_access3() nounwind ssp uwtable {
 entry:
   %obj = alloca i64, align 8
   %f = inttoptr i64 281474417671919 to i8*
-  %ret = call anyregcc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 3, i32 24, i8* %f, i32 0, i64* %obj)
+  %ret = call anyregcc i64 (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.i64(i64 3, i32 24, i8* %f, i32 0, i64* %obj)
   ret i64 %ret
 }
 
@@ -232,7 +232,7 @@ entry:
 define i64 @anyreg_test1(i8* %a1, i8* %a2, i8* %a3, i8* %a4, i8* %a5, i8* %a6, i8* %a7, i8* %a8, i8* %a9, i8* %a10, i8* %a11, i8* %a12, i8* %a13) nounwind ssp uwtable {
 entry:
   %f = inttoptr i64 281474417671919 to i8*
-  %ret = call anyregcc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 4, i32 24, i8* %f, i32 13, i8* %a1, i8* %a2, i8* %a3, i8* %a4, i8* %a5, i8* %a6, i8* %a7, i8* %a8, i8* %a9, i8* %a10, i8* %a11, i8* %a12, i8* %a13)
+  %ret = call anyregcc i64 (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.i64(i64 4, i32 24, i8* %f, i32 13, i8* %a1, i8* %a2, i8* %a3, i8* %a4, i8* %a5, i8* %a6, i8* %a7, i8* %a8, i8* %a9, i8* %a10, i8* %a11, i8* %a12, i8* %a13)
   ret i64 %ret
 }
 
@@ -314,7 +314,7 @@ entry:
 define i64 @anyreg_test2(i8* %a1, i8* %a2, i8* %a3, i8* %a4, i8* %a5, i8* %a6, i8* %a7, i8* %a8, i8* %a9, i8* %a10, i8* %a11, i8* %a12, i8* %a13) nounwind ssp uwtable {
 entry:
   %f = inttoptr i64 281474417671919 to i8*
-  %ret = call anyregcc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 5, i32 24, i8* %f, i32 8, i8* %a1, i8* %a2, i8* %a3, i8* %a4, i8* %a5, i8* %a6, i8* %a7, i8* %a8, i8* %a9, i8* %a10, i8* %a11, i8* %a12, i8* %a13)
+  %ret = call anyregcc i64 (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.i64(i64 5, i32 24, i8* %f, i32 8, i8* %a1, i8* %a2, i8* %a3, i8* %a4, i8* %a5, i8* %a6, i8* %a7, i8* %a8, i8* %a9, i8* %a10, i8* %a11, i8* %a12, i8* %a13)
   ret i64 %ret
 }
 
@@ -342,7 +342,7 @@ entry:
 ; CHECK-NEXT: .long  0
 define i64 @patchpoint_spilldef(i64 %p1, i64 %p2, i64 %p3, i64 %p4) {
 entry:
-  %result = tail call anyregcc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 12, i32 24, i8* inttoptr (i64 0 to i8*), i32 2, i64 %p1, i64 %p2)
+  %result = tail call anyregcc i64 (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.i64(i64 12, i32 24, i8* inttoptr (i64 0 to i8*), i32 2, i64 %p1, i64 %p2)
   tail call void asm sideeffect "nop", "~{r0},~{r3},~{r4},~{r5},~{r6},~{r7},~{r8},~{r9},~{r10},~{r11},~{r12},~{r14},~{r15},~{r16},~{r17
 },~{r18},~{r19},~{r20},~{r21},~{r22},~{r23},~{r24},~{r25},~{r26},~{r27},~{r28},~{r29},~{r30},~{r31}"() nounwind
   ret i64 %result
@@ -384,7 +384,7 @@ define i64 @patchpoint_spillargs(i64 %p1, i64 %p2, i64 %p3, i64 %p4) {
 entry:
   tail call void asm sideeffect "nop", "~{r0},~{r3},~{r4},~{r5},~{r6},~{r7},~{r8},~{r9},~{r10},~{r11},~{r12},~{r14},~{r15},~{r16},~{r17
 },~{r18},~{r19},~{r20},~{r21},~{r22},~{r23},~{r24},~{r25},~{r26},~{r27},~{r28},~{r29},~{r30},~{r31}"() nounwind
-  %result = tail call anyregcc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 13, i32 24, i8* inttoptr (i64 0 to i8*), i32 2, i64 %p1, i64 %p2, i64 %p3, i64 %p4)
+  %result = tail call anyregcc i64 (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.i64(i64 13, i32 24, i8* inttoptr (i64 0 to i8*), i32 2, i64 %p1, i64 %p2, i64 %p3, i64 %p4)
   ret i64 %result
 }
 
diff --git a/test/CodeGen/PowerPC/ppc64-patchpoint.ll b/test/CodeGen/PowerPC/ppc64-patchpoint.ll
index 48ffb6a..67b2626 100644
--- a/test/CodeGen/PowerPC/ppc64-patchpoint.ll
+++ b/test/CodeGen/PowerPC/ppc64-patchpoint.ll
@@ -28,9 +28,9 @@ entry:
 ; CHECK: blr
 
   %resolveCall2 = inttoptr i64 244837814094590 to i8*
-  %result = tail call i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 2, i32 24, i8* %resolveCall2, i32 4, i64 %p1, i64 %p2, i64 %p3, i64 %p4)
+  %result = tail call i64 (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.i64(i64 2, i32 24, i8* %resolveCall2, i32 4, i64 %p1, i64 %p2, i64 %p3, i64 %p4)
   %resolveCall3 = inttoptr i64 244837814094591 to i8*
-  tail call void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 3, i32 24, i8* %resolveCall3, i32 2, i64 %p1, i64 %result)
+  tail call void (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.void(i64 3, i32 24, i8* %resolveCall3, i32 2, i64 %p1, i64 %result)
   ret i64 %result
 }
 
@@ -51,7 +51,7 @@ entry:
   store i64 11, i64* %metadata
   store i64 12, i64* %metadata
   store i64 13, i64* %metadata
-  call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 4, i32 0, i64* %metadata)
+  call void (i64, i32, ...) @llvm.experimental.stackmap(i64 4, i32 0, i64* %metadata)
   ret void
 }
 
@@ -64,14 +64,14 @@ entry:
   %tmp80 = add i64 %tmp79, -16
   %tmp81 = inttoptr i64 %tmp80 to i64*
   %tmp82 = load i64, i64* %tmp81, align 8
-  tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 14, i32 8, i64 %arg, i64 %tmp2, i64 %tmp10, i64 %tmp82)
-  tail call void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 15, i32 32, i8* null, i32 3, i64 %arg, i64 %tmp10, i64 %tmp82)
+  tail call void (i64, i32, ...) @llvm.experimental.stackmap(i64 14, i32 8, i64 %arg, i64 %tmp2, i64 %tmp10, i64 %tmp82)
+  tail call void (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.void(i64 15, i32 32, i8* null, i32 3, i64 %arg, i64 %tmp10, i64 %tmp82)
   %tmp83 = load i64, i64* %tmp33, align 8
   %tmp84 = add i64 %tmp83, -24
   %tmp85 = inttoptr i64 %tmp84 to i64*
   %tmp86 = load i64, i64* %tmp85, align 8
-  tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 17, i32 8, i64 %arg, i64 %tmp10, i64 %tmp86)
-  tail call void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 18, i32 32, i8* null, i32 3, i64 %arg, i64 %tmp10, i64 %tmp86)
+  tail call void (i64, i32, ...) @llvm.experimental.stackmap(i64 17, i32 8, i64 %arg, i64 %tmp10, i64 %tmp86)
+  tail call void (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.void(i64 18, i32 32, i8* null, i32 3, i64 %arg, i64 %tmp10, i64 %tmp86)
   ret i64 10
 }
 
@@ -87,7 +87,7 @@ entry:
 ; CHECK-NEXT: nop
 ; CHECK-NOT:  nop
 ; CHECK: blr
-  %result = tail call i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 5, i32 20, i8* null, i32 2, i64 %p1, i64 %p2)
+  %result = tail call i64 (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.i64(i64 5, i32 20, i8* null, i32 2, i64 %p1, i64 %p2)
   ret void
 }
 
diff --git a/test/CodeGen/PowerPC/ppc64-stackmap-nops.ll b/test/CodeGen/PowerPC/ppc64-stackmap-nops.ll
index 368ddc5..19d65b9 100644
--- a/test/CodeGen/PowerPC/ppc64-stackmap-nops.ll
+++ b/test/CodeGen/PowerPC/ppc64-stackmap-nops.ll
@@ -16,7 +16,7 @@ entry:
 ; CHECK: mtlr [[REG1]]
 ; CHECK: blr
 
-  tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64  0, i32  32)
+  tail call void (i64, i32, ...) @llvm.experimental.stackmap(i64  0, i32  32)
   ret void
 }
 
diff --git a/test/CodeGen/PowerPC/ppc64-stackmap.ll b/test/CodeGen/PowerPC/ppc64-stackmap.ll
index bc974a0..917fa74 100644
--- a/test/CodeGen/PowerPC/ppc64-stackmap.ll
+++ b/test/CodeGen/PowerPC/ppc64-stackmap.ll
@@ -112,7 +112,7 @@ target triple = "powerpc64-unknown-linux-gnu"
 define void @constantargs() {
 entry:
   %0 = inttoptr i64 244837814094590 to i8*
-  tail call void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 1, i32 24, i8* %0, i32 0, i64 65535, i64 65536, i64 4294967295, i64 4294967296)
+  tail call void (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.void(i64 1, i32 24, i8* %0, i32 0, i64 65535, i64 65536, i64 4294967295, i64 4294967296)
   ret void
 }
 
@@ -134,7 +134,7 @@ entry:
   ; Runtime void->void call.
   call void inttoptr (i64 244837814094590 to void ()*)()
   ; Followed by inline OSR patchpoint with 12-byte shadow and 2 live vars.
-  call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 3, i32 12, i64 %a, i64 %b)
+  call void (i64, i32, ...) @llvm.experimental.stackmap(i64 3, i32 12, i64 %a, i64 %b)
   ret void
 }
 
@@ -160,7 +160,7 @@ entry:
 cold:
   ; OSR patchpoint with 12-byte nop-slide and 2 live vars.
   %thunk = inttoptr i64 244837814094590 to i8*
-  call void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 4, i32 24, i8* %thunk, i32 0, i64 %a, i64 %b)
+  call void (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.void(i64 4, i32 24, i8* %thunk, i32 0, i64 %a, i64 %b)
   unreachable
 ret:
   ret void
@@ -176,7 +176,7 @@ ret:
 define i64 @propertyRead(i64* %obj) {
 entry:
   %resolveRead = inttoptr i64 244837814094590 to i8*
-  %result = call i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 5, i32 24, i8* %resolveRead, i32 1, i64* %obj)
+  %result = call i64 (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.i64(i64 5, i32 24, i8* %resolveRead, i32 1, i64* %obj)
   %add = add i64 %result, 3
   ret i64 %add
 }
@@ -196,7 +196,7 @@ entry:
 define void @propertyWrite(i64 %dummy1, i64* %obj, i64 %dummy2, i64 %a) {
 entry:
   %resolveWrite = inttoptr i64 244837814094590 to i8*
-  call anyregcc void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 6, i32 24, i8* %resolveWrite, i32 2, i64* %obj, i64 %a)
+  call anyregcc void (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.void(i64 6, i32 24, i8* %resolveWrite, i32 2, i64* %obj, i64 %a)
   ret void
 }
 
@@ -218,7 +218,7 @@ entry:
 define void @jsVoidCall(i64 %dummy1, i64* %obj, i64 %arg, i64 %l1, i64 %l2) {
 entry:
   %resolveCall = inttoptr i64 244837814094590 to i8*
-  call void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 7, i32 24, i8* %resolveCall, i32 2, i64* %obj, i64 %arg, i64 %l1, i64 %l2)
+  call void (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.void(i64 7, i32 24, i8* %resolveCall, i32 2, i64* %obj, i64 %arg, i64 %l1, i64 %l2)
   ret void
 }
 
@@ -240,7 +240,7 @@ entry:
 define i64 @jsIntCall(i64 %dummy1, i64* %obj, i64 %arg, i64 %l1, i64 %l2) {
 entry:
   %resolveCall = inttoptr i64 244837814094590 to i8*
-  %result = call i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 8, i32 24, i8* %resolveCall, i32 2, i64* %obj, i64 %arg, i64 %l1, i64 %l2)
+  %result = call i64 (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.i64(i64 8, i32 24, i8* %resolveCall, i32 2, i64* %obj, i64 %arg, i64 %l1, i64 %l2)
   %add = add i64 %result, 3
   ret i64 %add
 }
@@ -260,7 +260,7 @@ entry:
 ; CHECK-NEXT:   .short 31
 define void @spilledValue(i64 %arg0, i64 %arg1, i64 %arg2, i64 %arg3, i64 %arg4, i64 %l0, i64 %l1, i64 %l2, i64 %l3, i64 %l4, i64 %l5, i64 %l6, i64 %l7, i64 %l8, i64 %l9, i64 %l10, i64 %l11, i64 %l12, i64 %l13, i64 %l14, i64 %l15, i64 %l16, i64 %l17, i64 %l18, i64 %l19, i64 %l20, i64 %l21, i64 %l22, i64 %l23, i64 %l24, i64 %l25, i64 %l26, i64 %l27) {
 entry:
-  call void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 11, i32 24, i8* null, i32 5, i64 %arg0, i64 %arg1, i64 %arg2, i64 %arg3, i64 %arg4, i64 %l0, i64 %l1, i64 %l2, i64 %l3, i64 %l4, i64 %l5, i64 %l6, i64 %l7, i64 %l8, i64 %l9, i64 %l10, i64 %l11, i64 %l12, i64 %l13, i64 %l14, i64 %l15, i64 %l16, i64 %l17, i64 %l18, i64 %l19, i64 %l20, i64 %l21, i64 %l22, i64 %l23, i64 %l24, i64 %l25, i64 %l26, i64 %l27)
+  call void (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.void(i64 11, i32 24, i8* null, i32 5, i64 %arg0, i64 %arg1, i64 %arg2, i64 %arg3, i64 %arg4, i64 %l0, i64 %l1, i64 %l2, i64 %l3, i64 %l4, i64 %l5, i64 %l6, i64 %l7, i64 %l8, i64 %l9, i64 %l10, i64 %l11, i64 %l12, i64 %l13, i64 %l14, i64 %l15, i64 %l16, i64 %l17, i64 %l18, i64 %l19, i64 %l20, i64 %l21, i64 %l22, i64 %l23, i64 %l24, i64 %l25, i64 %l26, i64 %l27)
   ret void
 }
 
@@ -279,7 +279,7 @@ entry:
 ; CHECK-NEXT:   .short 31
 define webkit_jscc void @spilledStackMapValue(i64 %l0, i64 %l1, i64 %l2, i64 %l3, i64 %l4, i64 %l5, i64 %l6, i64 %l7, i64 %l8, i64 %l9, i64 %l10, i64 %l11, i64 %l12, i64 %l13, i64 %l14, i64 %l15, i64 %l16, i64 %l17, i64 %l18, i64 %l19, i64 %l20, i64 %l21, i64 %l22, i64 %l23, i64 %l24, i64 %l25, i64 %l26, i64 %l27, i64 %l28, i64 %l29) {
 entry:
-  call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 12, i32 16, i64 %l0, i64 %l1, i64 %l2, i64 %l3, i64 %l4, i64 %l5, i64 %l6, i64 %l7, i64 %l8, i64 %l9, i64 %l10, i64 %l11, i64 %l12, i64 %l13, i64 %l14, i64 %l15, i64 %l16, i64 %l17, i64 %l18, i64 %l19, i64 %l20, i64 %l21, i64 %l22, i64 %l23, i64 %l24, i64 %l25, i64 %l26, i64 %l27, i64 %l28, i64 %l29)
+  call void (i64, i32, ...) @llvm.experimental.stackmap(i64 12, i32 16, i64 %l0, i64 %l1, i64 %l2, i64 %l3, i64 %l4, i64 %l5, i64 %l6, i64 %l7, i64 %l8, i64 %l9, i64 %l10, i64 %l11, i64 %l12, i64 %l13, i64 %l14, i64 %l15, i64 %l16, i64 %l17, i64 %l18, i64 %l19, i64 %l20, i64 %l21, i64 %l22, i64 %l23, i64 %l24, i64 %l25, i64 %l26, i64 %l27, i64 %l28, i64 %l29)
   ret void
 }
 
@@ -297,7 +297,7 @@ entry:
 ; CHECK-NEXT:   .long   33
 
 define void @liveConstant() {
-  tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 15, i32 8, i32 33)
+  tail call void (i64, i32, ...) @llvm.experimental.stackmap(i64 15, i32 8, i32 33)
   ret void
 }
 
@@ -314,7 +314,7 @@ define void @liveConstant() {
 ; CHECK-NEXT:   .long   {{[0-9]+}}
 define void @clobberLR(i32 %a) {
   tail call void asm sideeffect "nop", "~{r0},~{r3},~{r4},~{r5},~{r6},~{r7},~{r8},~{r9},~{r10},~{r11},~{r12},~{r14},~{r15},~{r16},~{r17},~{r18},~{r19},~{r20},~{r21},~{r22},~{r23},~{r24},~{r25},~{r26},~{r27},~{r28},~{r29},~{r30},~{r31}"() nounwind
-  tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 16, i32 8, i32 %a)
+  tail call void (i64, i32, ...) @llvm.experimental.stackmap(i64 16, i32 8, i32 %a)
   ret void
 }
 
diff --git a/test/CodeGen/PowerPC/ppcf128-3.ll b/test/CodeGen/PowerPC/ppcf128-3.ll
index 5043b62..fe3b418 100644
--- a/test/CodeGen/PowerPC/ppcf128-3.ll
+++ b/test/CodeGen/PowerPC/ppcf128-3.ll
@@ -4,28 +4,28 @@
 define i32 @stp_sequence_set_short_data(%struct.stp_sequence* %sequence, i32 %count, i16* %data) {
 entry:
 	%tmp1112 = sitofp i16 0 to ppc_fp128		; <ppc_fp128> [#uses=1]
-	%tmp13 = call i32 (...)* @__inline_isfinite( ppc_fp128 %tmp1112 ) nounwind 		; <i32> [#uses=0]
+	%tmp13 = call i32 (...) @__inline_isfinite( ppc_fp128 %tmp1112 ) nounwind 		; <i32> [#uses=0]
 	ret i32 0
 }
 
 define i32 @stp_sequence_set_short_data2(%struct.stp_sequence* %sequence, i32 %count, i16* %data) {
 entry:
 	%tmp1112 = sitofp i8 0 to ppc_fp128		; <ppc_fp128> [#uses=1]
-	%tmp13 = call i32 (...)* @__inline_isfinite( ppc_fp128 %tmp1112 ) nounwind 		; <i32> [#uses=0]
+	%tmp13 = call i32 (...) @__inline_isfinite( ppc_fp128 %tmp1112 ) nounwind 		; <i32> [#uses=0]
 	ret i32 0
 }
 
 define i32 @stp_sequence_set_short_data3(%struct.stp_sequence* %sequence, i32 %count, i16* %data) {
 entry:
 	%tmp1112 = uitofp i16 0 to ppc_fp128		; <ppc_fp128> [#uses=1]
-	%tmp13 = call i32 (...)* @__inline_isfinite( ppc_fp128 %tmp1112 ) nounwind 		; <i32> [#uses=0]
+	%tmp13 = call i32 (...) @__inline_isfinite( ppc_fp128 %tmp1112 ) nounwind 		; <i32> [#uses=0]
 	ret i32 0
 }
 
 define i32 @stp_sequence_set_short_data4(%struct.stp_sequence* %sequence, i32 %count, i16* %data) {
 entry:
 	%tmp1112 = uitofp i8 0 to ppc_fp128		; <ppc_fp128> [#uses=1]
-	%tmp13 = call i32 (...)* @__inline_isfinite( ppc_fp128 %tmp1112 ) nounwind 		; <i32> [#uses=0]
+	%tmp13 = call i32 (...) @__inline_isfinite( ppc_fp128 %tmp1112 ) nounwind 		; <i32> [#uses=0]
 	ret i32 0
 }
 
diff --git a/test/CodeGen/PowerPC/remat-imm.ll b/test/CodeGen/PowerPC/remat-imm.ll
index 486495e..ffae8a9 100644
--- a/test/CodeGen/PowerPC/remat-imm.ll
+++ b/test/CodeGen/PowerPC/remat-imm.ll
@@ -9,7 +9,7 @@ define i32 @main() nounwind {
 entry:
 ; CHECK: li 4, 128
 ; CHECK-NOT: mr 4, {{.*}}
-  %call = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([6 x i8], [6 x i8]* @.str, i32 0, i32 0), i32 128, i32 128) nounwind
+  %call = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([6 x i8], [6 x i8]* @.str, i32 0, i32 0), i32 128, i32 128) nounwind
   ret i32 0
 }
 
diff --git a/test/CodeGen/PowerPC/resolvefi-basereg.ll b/test/CodeGen/PowerPC/resolvefi-basereg.ll
index 2b22f95..a613c33 100644
--- a/test/CodeGen/PowerPC/resolvefi-basereg.ll
+++ b/test/CodeGen/PowerPC/resolvefi-basereg.ll
@@ -340,7 +340,7 @@ if.end:                                           ; preds = %if.then, %entry
   call void @llvm.memcpy.p0i8.p0i8.i64(i8* %65, i8* bitcast (%struct.S1998* getelementptr inbounds ([5 x %struct.S1998], [5 x %struct.S1998]* @a1998, i32 0, i64 2) to i8*), i64 5168, i32 16, i1 false)
   %66 = bitcast %struct.S1998* %agg.tmp115 to i8*
   call void @llvm.memcpy.p0i8.p0i8.i64(i8* %66, i8* bitcast (%struct.S1998* getelementptr inbounds ([5 x %struct.S1998], [5 x %struct.S1998]* @a1998, i32 0, i64 2) to i8*), i64 5168, i32 16, i1 false)
-  call void (i32, ...)* @check1998va(i32 signext 1, double 1.000000e+00, %struct.S1998* byval align 16 %agg.tmp113, i64 2, %struct.S1998* byval align 16 %agg.tmp114, %struct.S1998* byval align 16 %agg.tmp115)
+  call void (i32, ...) @check1998va(i32 signext 1, double 1.000000e+00, %struct.S1998* byval align 16 %agg.tmp113, i64 2, %struct.S1998* byval align 16 %agg.tmp114, %struct.S1998* byval align 16 %agg.tmp115)
   %67 = bitcast %struct.S1998* %agg.tmp116 to i8*
   call void @llvm.memcpy.p0i8.p0i8.i64(i8* %67, i8* bitcast (%struct.S1998* @s1998 to i8*), i64 5168, i32 16, i1 false)
   %68 = bitcast %struct.S1998* %agg.tmp117 to i8*
@@ -349,7 +349,7 @@ if.end:                                           ; preds = %if.then, %entry
   call void @llvm.memcpy.p0i8.p0i8.i64(i8* %69, i8* bitcast (%struct.S1998* getelementptr inbounds ([5 x %struct.S1998], [5 x %struct.S1998]* @a1998, i32 0, i64 2) to i8*), i64 5168, i32 16, i1 false)
   %70 = bitcast %struct.S1998* %agg.tmp119 to i8*
   call void @llvm.memcpy.p0i8.p0i8.i64(i8* %70, i8* bitcast (%struct.S1998* @s1998 to i8*), i64 5168, i32 16, i1 false)
-  call void (i32, ...)* @check1998va(i32 signext 2, %struct.S1998* byval align 16 %agg.tmp116, %struct.S1998* byval align 16 %agg.tmp117, ppc_fp128 0xM40000000000000000000000000000000, %struct.S1998* byval align 16 %agg.tmp118, %struct.S1998* byval align 16 %agg.tmp119)
+  call void (i32, ...) @check1998va(i32 signext 2, %struct.S1998* byval align 16 %agg.tmp116, %struct.S1998* byval align 16 %agg.tmp117, ppc_fp128 0xM40000000000000000000000000000000, %struct.S1998* byval align 16 %agg.tmp118, %struct.S1998* byval align 16 %agg.tmp119)
   ret void
 }
 
diff --git a/test/CodeGen/PowerPC/s000-alias-misched.ll b/test/CodeGen/PowerPC/s000-alias-misched.ll
index 3a0c897..2e34c65 100644
--- a/test/CodeGen/PowerPC/s000-alias-misched.ll
+++ b/test/CodeGen/PowerPC/s000-alias-misched.ll
@@ -87,7 +87,7 @@ for.end10:                                        ; preds = %for.end
   %sub = sub nsw i64 %call11, %call1
   %conv = sitofp i64 %sub to double
   %div = fdiv double %conv, 1.000000e+06
-  %call12 = tail call signext i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([14 x i8], [14 x i8]* @.str137, i64 0, i64 0), double %div) nounwind
+  %call12 = tail call signext i32 (i8*, ...) @printf(i8* getelementptr inbounds ([14 x i8], [14 x i8]* @.str137, i64 0, i64 0), double %div) nounwind
   tail call void @check(i32 signext 1)
   ret i32 0
 }
diff --git a/test/CodeGen/PowerPC/stack-protector.ll b/test/CodeGen/PowerPC/stack-protector.ll
index 48bfbe6..8d255bd 100644
--- a/test/CodeGen/PowerPC/stack-protector.ll
+++ b/test/CodeGen/PowerPC/stack-protector.ll
@@ -14,7 +14,7 @@ entry:
 	%0 = load i8*, i8** %a_addr, align 4		; <i8*> [#uses=1]
 	%1 = call i8* @strcpy(i8* %buf1, i8* %0) nounwind		; <i8*> [#uses=0]
   %buf2 = bitcast [8 x i8]* %buf to i8*		; <i8*> [#uses=1]
-	%2 = call i32 (i8*, ...)* @printf(i8* getelementptr ([11 x i8], [11 x i8]* @"\01LC", i32 0, i32 0), i8* %buf2) nounwind		; <i32> [#uses=0]
+	%2 = call i32 (i8*, ...) @printf(i8* getelementptr ([11 x i8], [11 x i8]* @"\01LC", i32 0, i32 0), i8* %buf2) nounwind		; <i32> [#uses=0]
 	br label %return
 
 return:		; preds = %entry
diff --git a/test/CodeGen/PowerPC/trampoline.ll b/test/CodeGen/PowerPC/trampoline.ll
index bc7bee8..e1a26da 100644
--- a/test/CodeGen/PowerPC/trampoline.ll
+++ b/test/CodeGen/PowerPC/trampoline.ll
@@ -106,7 +106,7 @@ entry:
 	%27 = load %struct.objc_selector*, %struct.objc_selector** @"\01L_OBJC_SELECTOR_REFERENCES_1", align 4		; <%struct.objc_selector*> [#uses=1]
 	%__block_holder_tmp_1.03 = bitcast %struct.__block_1* %__block_holder_tmp_1.0 to void (%struct.CGImage*)*		; <void (%struct.CGImage*)*> [#uses=1]
 	%28 = load %struct.objc_object*, %struct.objc_object** %self.1, align 4		; <%struct.objc_object*> [#uses=1]
-	%29 = call %struct.objc_object* (%struct.objc_object*, %struct.objc_selector*, ...)* inttoptr (i64 4294901504 to %struct.objc_object* (%struct.objc_object*, %struct.objc_selector*, ...)*)(%struct.objc_object* %28, %struct.objc_selector* %27, void (%struct.CGImage*)* %__block_holder_tmp_1.03) nounwind		; <%struct.objc_object*> [#uses=0]
+	%29 = call %struct.objc_object* (%struct.objc_object*, %struct.objc_selector*, ...) inttoptr (i64 4294901504 to %struct.objc_object* (%struct.objc_object*, %struct.objc_selector*, ...)*)(%struct.objc_object* %28, %struct.objc_selector* %27, void (%struct.CGImage*)* %__block_holder_tmp_1.03) nounwind		; <%struct.objc_object*> [#uses=0]
 	br label %return
 
 return:		; preds = %entry
@@ -155,7 +155,7 @@ entry:
 	%15 = load %struct.objc_selector*, %struct.objc_selector** @"\01L_OBJC_SELECTOR_REFERENCES_0", align 4		; <%struct.objc_selector*> [#uses=1]
 	%16 = load %struct.objc_super*, %struct.objc_super** %objc_super.5, align 4		; <%struct.objc_super*> [#uses=1]
 	%17 = load %struct.NSZone*, %struct.NSZone** %zone, align 4		; <%struct.NSZone*> [#uses=1]
-	%18 = call %struct.objc_object* (%struct.objc_super*, %struct.objc_selector*, ...)* @objc_msgSendSuper(%struct.objc_super* %16, %struct.objc_selector* %15, %struct.NSZone* %17) nounwind		; <%struct.objc_object*> [#uses=1]
+	%18 = call %struct.objc_object* (%struct.objc_super*, %struct.objc_selector*, ...) @objc_msgSendSuper(%struct.objc_super* %16, %struct.objc_selector* %15, %struct.NSZone* %17) nounwind		; <%struct.objc_object*> [#uses=1]
 	%19 = bitcast %struct.objc_object* %18 to %struct.NSBitmapImageRep*		; <%struct.NSBitmapImageRep*> [#uses=1]
 	%20 = load %struct.NSBitmapImageRep**, %struct.NSBitmapImageRep*** %new, align 4		; <%struct.NSBitmapImageRep**> [#uses=1]
 	store %struct.NSBitmapImageRep* %19, %struct.NSBitmapImageRep** %20, align 4
diff --git a/test/CodeGen/PowerPC/varargs-struct-float.ll b/test/CodeGen/PowerPC/varargs-struct-float.ll
index dbdda05..7bb5a34 100644
--- a/test/CodeGen/PowerPC/varargs-struct-float.ll
+++ b/test/CodeGen/PowerPC/varargs-struct-float.ll
@@ -12,7 +12,7 @@ entry:
   store float %s.coerce, float* %coerce.dive, align 1
   %coerce.dive1 = getelementptr %struct.Sf1, %struct.Sf1* %s, i32 0, i32 0
   %0 = load float, float* %coerce.dive1, align 1
-  call void (i32, ...)* @testvaSf1(i32 1, float inreg %0)
+  call void (i32, ...) @testvaSf1(i32 1, float inreg %0)
   ret void
 }
 
diff --git a/test/CodeGen/PowerPC/vec_constants.ll b/test/CodeGen/PowerPC/vec_constants.ll
index 45df814..858b85d 100644
--- a/test/CodeGen/PowerPC/vec_constants.ll
+++ b/test/CodeGen/PowerPC/vec_constants.ll
@@ -1,7 +1,5 @@
-; RUN: llc -O0 -mcpu=pwr7 < %s | FileCheck %s
-
-target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
-target triple = "powerpc64-unknown-linux-gnu"
+; RUN: llc -O0 -mcpu=pwr7 -mtriple=powerpc64-unknown-linux-gnu < %s | FileCheck %s
+; RUN: llc -O0 -mcpu=pwr7 -mtriple=powerpc64le-unknown-linux-gnu < %s | FileCheck %s
 
 define void @test1(<4 x i32>* %P1, <4 x i32>* %P2, <4 x float>* %P3) nounwind {
 	%tmp = load <4 x i32>, <4 x i32>* %P1		; <<4 x i32>> [#uses=1]
diff --git a/test/CodeGen/PowerPC/vperm-lowering.ll b/test/CodeGen/PowerPC/vperm-lowering.ll
index d55d26c..c78ffdd 100644
--- a/test/CodeGen/PowerPC/vperm-lowering.ll
+++ b/test/CodeGen/PowerPC/vperm-lowering.ll
@@ -9,58 +9,23 @@ define <16 x i8> @foo() nounwind ssp {
 }
 
 ; CHECK: .LCPI0_0:
-; CHECK: .byte 31
-; CHECK: .byte 26
-; CHECK: .byte 21
-; CHECK: .byte 16
-; CHECK: .byte 11
-; CHECK: .byte 6
-; CHECK: .byte 1
-; CHECK: .byte 28
-; CHECK: .byte 23
-; CHECK: .byte 18
-; CHECK: .byte 13
-; CHECK: .byte 8
-; CHECK: .byte 3
-; CHECK: .byte 30
-; CHECK: .byte 25
-; CHECK: .byte 20
-; CHECK: .LCPI0_1:
 ; CHECK: .byte 0
-; CHECK: .byte 1
-; CHECK: .byte 2
-; CHECK: .byte 3
-; CHECK: .byte 4
 ; CHECK: .byte 5
-; CHECK: .byte 6
-; CHECK: .byte 7
-; CHECK: .byte 8
-; CHECK: .byte 9
 ; CHECK: .byte 10
-; CHECK: .byte 11
-; CHECK: .byte 12
-; CHECK: .byte 13
-; CHECK: .byte 14
 ; CHECK: .byte 15
-; CHECK: .LCPI0_2:
-; CHECK: .byte 16
-; CHECK: .byte 17
-; CHECK: .byte 18
-; CHECK: .byte 19
 ; CHECK: .byte 20
-; CHECK: .byte 21
-; CHECK: .byte 22
-; CHECK: .byte 23
-; CHECK: .byte 24
 ; CHECK: .byte 25
-; CHECK: .byte 26
-; CHECK: .byte 27
-; CHECK: .byte 28
-; CHECK: .byte 29
 ; CHECK: .byte 30
-; CHECK: .byte 31
+; CHECK: .byte 3
+; CHECK: .byte 8
+; CHECK: .byte 13
+; CHECK: .byte 18
+; CHECK: .byte 23
+; CHECK: .byte 28
+; CHECK: .byte 1
+; CHECK: .byte 6
+; CHECK: .byte 11
 ; CHECK: foo:
-; CHECK: addis [[REG1:[0-9]+]], 2, .LCPI0_2@toc@ha
-; CHECK: addi [[REG2:[0-9]+]], [[REG1]], .LCPI0_2@toc@l
+; CHECK: addis [[REG1:[0-9]+]], 2, .LCPI0_0@toc@ha
+; CHECK: addi [[REG2:[0-9]+]], [[REG1]], .LCPI0_0@toc@l
 ; CHECK: lvx [[REG3:[0-9]+]], 0, [[REG2]]
-; CHECK: vperm {{[0-9]+}}, [[REG3]], {{[0-9]+}}, {{[0-9]+}}
diff --git a/test/CodeGen/PowerPC/vsx-fma-m.ll b/test/CodeGen/PowerPC/vsx-fma-m.ll
index 64185a4..d859273 100644
--- a/test/CodeGen/PowerPC/vsx-fma-m.ll
+++ b/test/CodeGen/PowerPC/vsx-fma-m.ll
@@ -98,9 +98,9 @@ entry:
 ; re-ordering the instructions.
 ; CHECK-DAG: xsmaddadp [[F1]], 2, 3
 
-; CHECK-DAG: xsmaddmdp 2, 3, 4
+; CHECK-DAG: xsmaddmdp 3, 2, 4
 ; CHECK-DAG: stxsdx [[F1]], 0, 8
-; CHECK-DAG: stxsdx 2, 8, [[C1]]
+; CHECK-DAG: stxsdx 3, 8, [[C1]]
 ; CHECK-DAG: stxsdx 1, 8, [[C2]]
 ; CHECK-DAG: stxsdx 4, 8, [[C3]]
 ; CHECK: blr
@@ -269,10 +269,10 @@ entry:
 ; re-ordering the instructions.
 ; CHECK-DAG: xvmaddadp [[V1]], 35, 36
 
-; CHECK-DAG: xvmaddmdp 35, 36, 37
+; CHECK-DAG: xvmaddmdp 36, 35, 37
 ; CHECK-DAG: xvmaddadp 34, 35, 38
 ; CHECK-DAG: stxvd2x 32, 0, 3
-; CHECK-DAG: stxvd2x 35, 3, [[C1]]
+; CHECK-DAG: stxvd2x 36, 3, [[C1]]
 ; CHECK-DAG: stxvd2x 34, 3, [[C2]]
 ; CHECK-DAG: stxvd2x 37, 3, [[C3]]
 ; CHECK: blr
diff --git a/test/CodeGen/PowerPC/vsx-spill-norwstore.ll b/test/CodeGen/PowerPC/vsx-spill-norwstore.ll
index c135a00..77b6cb2 100644
--- a/test/CodeGen/PowerPC/vsx-spill-norwstore.ll
+++ b/test/CodeGen/PowerPC/vsx-spill-norwstore.ll
@@ -23,7 +23,7 @@ check.exit69.i:                                   ; preds = %entry
   br i1 undef, label %if.then.i63.i, label %check.exit64.i
 
 if.then.i63.i:                                    ; preds = %check.exit69.i
-  tail call void (i8*, ...)* @printf(i8* getelementptr inbounds ([9 x i8], [9 x i8]* @.str10, i64 0, i64 0), i8* getelementptr inbounds ([5 x i8], [5 x i8]* @.str1, i64 0, i64 0)) #0
+  tail call void (i8*, ...) @printf(i8* getelementptr inbounds ([9 x i8], [9 x i8]* @.str10, i64 0, i64 0), i8* getelementptr inbounds ([5 x i8], [5 x i8]* @.str1, i64 0, i64 0)) #0
   br label %check.exit64.i
 
 check.exit64.i:                                   ; preds = %if.then.i63.i, %check.exit69.i
diff --git a/test/CodeGen/R600/ds-negative-offset-addressing-mode-loop.ll b/test/CodeGen/R600/ds-negative-offset-addressing-mode-loop.ll
index c381fc4..e7e13d6 100644
--- a/test/CodeGen/R600/ds-negative-offset-addressing-mode-loop.ll
+++ b/test/CodeGen/R600/ds-negative-offset-addressing-mode-loop.ll
@@ -18,7 +18,7 @@ declare void @llvm.AMDGPU.barrier.local() #1
 ; SI-DAG: v_add_i32_e32 [[VADDR0x100:v[0-9]+]], 0x100, [[VADDR]]
 ; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[VADDR0x100]]
 
-; CI-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[VADDR]] offset0:0 offset1:1
+; CI-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[VADDR]] offset1:1
 ; CI-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[VADDR]] offset0:32 offset1:33
 ; CI-DAG: ds_read_b32 v{{[0-9]+}}, [[VADDR]] offset:256
 ; CHECK: s_endpgm
diff --git a/test/CodeGen/R600/ds_read2.ll b/test/CodeGen/R600/ds_read2.ll
index 7110a90..5929898 100644
--- a/test/CodeGen/R600/ds_read2.ll
+++ b/test/CodeGen/R600/ds_read2.ll
@@ -7,7 +7,7 @@
  @lds.f64 = addrspace(3) global [512 x double] undef, align 8
 
 ; SI-LABEL: @simple_read2_f32
-; SI: ds_read2_b32 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:0 offset1:8
+; SI: ds_read2_b32 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:8
 ; SI: s_waitcnt lgkmcnt(0)
 ; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[HI_VREG]], v[[LO_VREG]]
 ; SI: buffer_store_dword [[RESULT]]
@@ -26,7 +26,7 @@ define void @simple_read2_f32(float addrspace(1)* %out) #0 {
 }
 
 ; SI-LABEL: @simple_read2_f32_max_offset
-; SI: ds_read2_b32 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:0 offset1:255
+; SI: ds_read2_b32 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:255
 ; SI: s_waitcnt lgkmcnt(0)
 ; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[HI_VREG]], v[[LO_VREG]]
 ; SI: buffer_store_dword [[RESULT]]
@@ -63,7 +63,7 @@ define void @simple_read2_f32_too_far(float addrspace(1)* %out) #0 {
 }
 
 ; SI-LABEL: @simple_read2_f32_x2
-; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR:v[0-9]+]] offset0:0 offset1:8
+; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR:v[0-9]+]] offset1:8
 ; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR]] offset0:11 offset1:27
 ; SI: s_endpgm
 define void @simple_read2_f32_x2(float addrspace(1)* %out) #0 {
@@ -94,7 +94,7 @@ define void @simple_read2_f32_x2(float addrspace(1)* %out) #0 {
 
 ; Make sure there is an instruction between the two sets of reads.
 ; SI-LABEL: @simple_read2_f32_x2_barrier
-; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR:v[0-9]+]] offset0:0 offset1:8
+; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR:v[0-9]+]] offset1:8
 ; SI: s_barrier
 ; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR]] offset0:11 offset1:27
 ; SI: s_endpgm
@@ -313,7 +313,7 @@ define void @misaligned_2_simple_read2_f32(float addrspace(1)* %out, float addrs
 
 ; SI-LABEL: @simple_read2_f64
 ; SI: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 3, {{v[0-9]+}}
-; SI: ds_read2_b64 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, [[VPTR]] offset0:0 offset1:8
+; SI: ds_read2_b64 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, [[VPTR]] offset1:8
 ; SI: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO_VREG]]:{{[0-9]+\]}}, v{{\[[0-9]+}}:[[HI_VREG]]{{\]}}
 ; SI: buffer_store_dwordx2 [[RESULT]]
 ; SI: s_endpgm
@@ -331,7 +331,7 @@ define void @simple_read2_f64(double addrspace(1)* %out) #0 {
 }
 
 ; SI-LABEL: @simple_read2_f64_max_offset
-; SI: ds_read2_b64 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:0 offset1:255
+; SI: ds_read2_b64 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:255
 ; SI: s_endpgm
 define void @simple_read2_f64_max_offset(double addrspace(1)* %out) #0 {
   %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
@@ -366,7 +366,7 @@ define void @simple_read2_f64_too_far(double addrspace(1)* %out) #0 {
 
 ; Alignment only 4
 ; SI-LABEL: @misaligned_read2_f64
-; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset0:0 offset1:1
+; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset1:1
 ; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset0:14 offset1:15
 ; SI: s_endpgm
 define void @misaligned_read2_f64(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
@@ -386,7 +386,7 @@ define void @misaligned_read2_f64(double addrspace(1)* %out, double addrspace(3)
 
 ; SI-LABEL: @load_constant_adjacent_offsets
 ; SI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
-; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset0:0 offset1:1
+; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset1:1
 define void @load_constant_adjacent_offsets(i32 addrspace(1)* %out) {
   %val0 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4
   %val1 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 1), align 4
@@ -397,7 +397,7 @@ define void @load_constant_adjacent_offsets(i32 addrspace(1)* %out) {
 
 ; SI-LABEL: @load_constant_disjoint_offsets
 ; SI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
-; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset0:0 offset1:2
+; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset1:2
 define void @load_constant_disjoint_offsets(i32 addrspace(1)* %out) {
   %val0 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4
   %val1 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 2), align 4
@@ -410,7 +410,7 @@ define void @load_constant_disjoint_offsets(i32 addrspace(1)* %out) {
 
 ; SI-LABEL: @load_misaligned64_constant_offsets
 ; SI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
-; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset0:0 offset1:1
+; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset1:1
 ; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset0:2 offset1:3
 define void @load_misaligned64_constant_offsets(i64 addrspace(1)* %out) {
   %val0 = load i64, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 0), align 4
@@ -425,8 +425,8 @@ define void @load_misaligned64_constant_offsets(i64 addrspace(1)* %out) {
 ; SI-LABEL: @load_misaligned64_constant_large_offsets
 ; SI-DAG: v_mov_b32_e32 [[BASE0:v[0-9]+]], 0x7ff8{{$}}
 ; SI-DAG: v_mov_b32_e32 [[BASE1:v[0-9]+]], 0x4000
-; SI-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASE0]] offset0:0 offset1:1
-; SI-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASE1]] offset0:0 offset1:1
+; SI-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASE0]] offset1:1
+; SI-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASE1]] offset1:1
 ; SI: s_endpgm
 define void @load_misaligned64_constant_large_offsets(i64 addrspace(1)* %out) {
   %val0 = load i64, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 2048), align 4
diff --git a/test/CodeGen/R600/ds_read2st64.ll b/test/CodeGen/R600/ds_read2st64.ll
index 482debb..54b3b45 100644
--- a/test/CodeGen/R600/ds_read2st64.ll
+++ b/test/CodeGen/R600/ds_read2st64.ll
@@ -5,7 +5,7 @@
 
 
 ; SI-LABEL: @simple_read2st64_f32_0_1
-; SI: ds_read2st64_b32 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:0 offset1:1
+; SI: ds_read2st64_b32 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:1
 ; SI: s_waitcnt lgkmcnt(0)
 ; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[HI_VREG]], v[[LO_VREG]]
 ; SI: buffer_store_dword [[RESULT]]
@@ -117,7 +117,7 @@ define void @odd_invalid_read2st64_f32_1(float addrspace(1)* %out) #0 {
 }
 
 ; SI-LABEL: @simple_read2st64_f64_0_1
-; SI: ds_read2st64_b64 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:0 offset1:1
+; SI: ds_read2st64_b64 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:1
 ; SI: s_waitcnt lgkmcnt(0)
 ; SI: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO_VREG]]:{{[0-9]+\]}}, v{{\[[0-9]+}}:[[HI_VREG]]{{\]}}
 ; SI: buffer_store_dwordx2 [[RESULT]]
@@ -158,7 +158,7 @@ define void @simple_read2st64_f64_1_2(double addrspace(1)* %out, double addrspac
 ; Alignment only
 
 ; SI-LABEL: @misaligned_read2st64_f64
-; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset0:0 offset1:1
+; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset1:1
 ; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset0:128 offset1:129
 ; SI: s_endpgm
 define void @misaligned_read2st64_f64(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
@@ -237,7 +237,7 @@ define void @invalid_read2st64_f64_odd_offset(double addrspace(1)* %out, double
 
 ; SI-LABEL: @byte_size_only_divisible_64_read2_f64
 ; SI-NOT: ds_read2st_b64
-; SI: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:0 offset1:8
+; SI: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:8
 ; SI: s_endpgm
 define void @byte_size_only_divisible_64_read2_f64(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
   %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
diff --git a/test/CodeGen/R600/ds_write2.ll b/test/CodeGen/R600/ds_write2.ll
index 1d94c57..b553d34 100644
--- a/test/CodeGen/R600/ds_write2.ll
+++ b/test/CodeGen/R600/ds_write2.ll
@@ -7,7 +7,7 @@
 ; SI-LABEL: @simple_write2_one_val_f32
 ; SI-DAG: buffer_load_dword [[VAL:v[0-9]+]]
 ; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
-; SI: ds_write2_b32 [[VPTR]], [[VAL]], [[VAL]] offset0:0 offset1:8
+; SI: ds_write2_b32 [[VPTR]], [[VAL]], [[VAL]] offset1:8
 ; SI: s_endpgm
 define void @simple_write2_one_val_f32(float addrspace(1)* %C, float addrspace(1)* %in) #0 {
   %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
@@ -25,7 +25,7 @@ define void @simple_write2_one_val_f32(float addrspace(1)* %C, float addrspace(1
 ; SI-DAG: buffer_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
 ; SI-DAG: buffer_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
 ; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
-; SI: ds_write2_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset0:0 offset1:8 
+; SI: ds_write2_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset1:8 
 ; SI: s_endpgm
 define void @simple_write2_two_val_f32(float addrspace(1)* %C, float addrspace(1)* %in) #0 {
   %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
@@ -84,7 +84,7 @@ define void @simple_write2_two_val_f32_volatile_1(float addrspace(1)* %C, float
 ; SI: buffer_load_dwordx2 v{{\[}}[[VAL0:[0-9]+]]:{{[0-9]+\]}}
 ; SI: buffer_load_dwordx2 v{{\[[0-9]+}}:[[VAL1:[0-9]+]]{{\]}}
 ; SI: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
-; SI: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset0:0 offset1:8
+; SI: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:8
 ; SI: s_endpgm
 define void @simple_write2_two_val_subreg2_mixed_f32(float addrspace(1)* %C, <2 x float> addrspace(1)* %in) #0 {
   %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
@@ -105,7 +105,7 @@ define void @simple_write2_two_val_subreg2_mixed_f32(float addrspace(1)* %C, <2
 ; SI-LABEL: @simple_write2_two_val_subreg2_f32
 ; SI-DAG: buffer_load_dwordx2 v{{\[}}[[VAL0:[0-9]+]]:[[VAL1:[0-9]+]]{{\]}}
 ; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
-; SI: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset0:0 offset1:8
+; SI: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:8
 ; SI: s_endpgm
 define void @simple_write2_two_val_subreg2_f32(float addrspace(1)* %C, <2 x float> addrspace(1)* %in) #0 {
   %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
@@ -124,7 +124,7 @@ define void @simple_write2_two_val_subreg2_f32(float addrspace(1)* %C, <2 x floa
 ; SI-LABEL: @simple_write2_two_val_subreg4_f32
 ; SI-DAG: buffer_load_dwordx4 v{{\[}}[[VAL0:[0-9]+]]:[[VAL1:[0-9]+]]{{\]}}
 ; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
-; SI: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset0:0 offset1:8
+; SI: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:8
 ; SI: s_endpgm
 define void @simple_write2_two_val_subreg4_f32(float addrspace(1)* %C, <4 x float> addrspace(1)* %in) #0 {
   %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
@@ -144,7 +144,7 @@ define void @simple_write2_two_val_subreg4_f32(float addrspace(1)* %C, <4 x floa
 ; SI-DAG: buffer_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
 ; SI-DAG: buffer_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
 ; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
-; SI: ds_write2_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset0:0 offset1:255
+; SI: ds_write2_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset1:255
 ; SI: s_endpgm
 define void @simple_write2_two_val_max_offset_f32(float addrspace(1)* %C, float addrspace(1)* %in) #0 {
   %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
@@ -179,7 +179,7 @@ define void @simple_write2_two_val_too_far_f32(float addrspace(1)* %C, float add
 }
 
 ; SI-LABEL: @simple_write2_two_val_f32_x2
-; SI: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0:v[0-9]+]], [[VAL1:v[0-9]+]] offset0:0 offset1:8
+; SI: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0:v[0-9]+]], [[VAL1:v[0-9]+]] offset1:8
 ; SI-NEXT: ds_write2_b32 [[BASEADDR]], [[VAL0]], [[VAL1]] offset0:11 offset1:27
 ; SI: s_endpgm
 define void @simple_write2_two_val_f32_x2(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
@@ -268,7 +268,7 @@ define void @write2_ptr_subreg_arg_two_val_f32(float addrspace(1)* %C, float add
 ; SI-LABEL: @simple_write2_one_val_f64
 ; SI: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]],
 ; SI: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 3, v{{[0-9]+}}
-; SI: ds_write2_b64 [[VPTR]], [[VAL]], [[VAL]] offset0:0 offset1:8
+; SI: ds_write2_b64 [[VPTR]], [[VAL]], [[VAL]] offset1:8
 ; SI: s_endpgm
 define void @simple_write2_one_val_f64(double addrspace(1)* %C, double addrspace(1)* %in) #0 {
   %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
@@ -285,7 +285,7 @@ define void @simple_write2_one_val_f64(double addrspace(1)* %C, double addrspace
 ; SI-LABEL: @misaligned_simple_write2_one_val_f64
 ; SI-DAG: buffer_load_dwordx2 v{{\[}}[[VAL0:[0-9]+]]:[[VAL1:[0-9]+]]{{\]}}
 ; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 3, v{{[0-9]+}}
-; SI: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset0:0 offset1:1
+; SI: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:1
 ; SI: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset0:14 offset1:15
 ; SI: s_endpgm
 define void @misaligned_simple_write2_one_val_f64(double addrspace(1)* %C, double addrspace(1)* %in, double addrspace(3)* %lds) #0 {
@@ -304,7 +304,7 @@ define void @misaligned_simple_write2_one_val_f64(double addrspace(1)* %C, doubl
 ; SI-DAG: buffer_load_dwordx2 [[VAL0:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
 ; SI-DAG: buffer_load_dwordx2 [[VAL1:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8
 ; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 3, v{{[0-9]+}}
-; SI: ds_write2_b64 [[VPTR]], [[VAL0]], [[VAL1]] offset0:0 offset1:8
+; SI: ds_write2_b64 [[VPTR]], [[VAL0]], [[VAL1]] offset1:8
 ; SI: s_endpgm
 define void @simple_write2_two_val_f64(double addrspace(1)* %C, double addrspace(1)* %in) #0 {
   %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
@@ -324,7 +324,7 @@ define void @simple_write2_two_val_f64(double addrspace(1)* %C, double addrspace
 
 ; SI-LABEL: @store_constant_adjacent_offsets
 ; SI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
-; SI: ds_write2_b32 [[ZERO]], v{{[0-9]+}}, v{{[0-9]+}} offset0:0 offset1:1
+; SI: ds_write2_b32 [[ZERO]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1
 define void @store_constant_adjacent_offsets() {
   store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4
   store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 1), align 4
@@ -334,7 +334,7 @@ define void @store_constant_adjacent_offsets() {
 ; SI-LABEL: @store_constant_disjoint_offsets
 ; SI-DAG: v_mov_b32_e32 [[VAL:v[0-9]+]], 0x7b{{$}}
 ; SI-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
-; SI: ds_write2_b32 [[ZERO]], [[VAL]], [[VAL]] offset0:0 offset1:2
+; SI: ds_write2_b32 [[ZERO]], [[VAL]], [[VAL]] offset1:2
 define void @store_constant_disjoint_offsets() {
   store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4
   store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 2), align 4
@@ -345,7 +345,7 @@ define void @store_constant_disjoint_offsets() {
 
 ; SI-LABEL: @store_misaligned64_constant_offsets
 ; SI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
-; SI: ds_write2_b32 [[ZERO]], v{{[0-9]+}}, v{{[0-9]+}} offset0:0 offset1:1
+; SI: ds_write2_b32 [[ZERO]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1
 ; SI: ds_write2_b32 [[ZERO]], v{{[0-9]+}}, v{{[0-9]+}} offset0:2 offset1:3
 define void @store_misaligned64_constant_offsets() {
   store i64 123, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 0), align 4
@@ -358,8 +358,8 @@ define void @store_misaligned64_constant_offsets() {
 ; SI-LABEL: @store_misaligned64_constant_large_offsets
 ; SI-DAG: v_mov_b32_e32 [[BASE0:v[0-9]+]], 0x7ff8{{$}}
 ; SI-DAG: v_mov_b32_e32 [[BASE1:v[0-9]+]], 0x4000{{$}}
-; SI-DAG: ds_write2_b32 [[BASE0]], v{{[0-9]+}}, v{{[0-9]+}} offset0:0 offset1:1
-; SI-DAG: ds_write2_b32 [[BASE1]], v{{[0-9]+}}, v{{[0-9]+}} offset0:0 offset1:1
+; SI-DAG: ds_write2_b32 [[BASE0]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1
+; SI-DAG: ds_write2_b32 [[BASE1]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1
 ; SI: s_endpgm
 define void @store_misaligned64_constant_large_offsets() {
   store i64 123, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 2048), align 4
diff --git a/test/CodeGen/R600/ds_write2st64.ll b/test/CodeGen/R600/ds_write2st64.ll
index 2044df2..1d9d881 100644
--- a/test/CodeGen/R600/ds_write2st64.ll
+++ b/test/CodeGen/R600/ds_write2st64.ll
@@ -7,7 +7,7 @@
 ; SI-LABEL: @simple_write2st64_one_val_f32_0_1
 ; SI-DAG: buffer_load_dword [[VAL:v[0-9]+]]
 ; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
-; SI: ds_write2st64_b32 [[VPTR]], [[VAL]], [[VAL]] offset0:0 offset1:1
+; SI: ds_write2st64_b32 [[VPTR]], [[VAL]], [[VAL]] offset1:1
 ; SI: s_endpgm
 define void @simple_write2st64_one_val_f32_0_1(float addrspace(1)* %C, float addrspace(1)* %in) #0 {
   %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
@@ -46,7 +46,7 @@ define void @simple_write2st64_two_val_f32_2_5(float addrspace(1)* %C, float add
 ; SI-DAG: buffer_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
 ; SI-DAG: buffer_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
 ; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
-; SI: ds_write2st64_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset0:0 offset1:255
+; SI: ds_write2st64_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset1:255
 ; SI: s_endpgm
 define void @simple_write2st64_two_val_max_offset_f32(float addrspace(1)* %C, float addrspace(1)* %in, float addrspace(3)* %lds) #0 {
   %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
@@ -85,7 +85,7 @@ define void @simple_write2st64_two_val_max_offset_f64(double addrspace(1)* %C, d
 
 ; SI-LABEL: @byte_size_only_divisible_64_write2st64_f64
 ; SI-NOT: ds_write2st64_b64
-; SI: ds_write2_b64 {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset0:0 offset1:8
+; SI: ds_write2_b64 {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset1:8
 ; SI: s_endpgm
 define void @byte_size_only_divisible_64_write2st64_f64(double addrspace(1)* %C, double addrspace(1)* %in, double addrspace(3)* %lds) #0 {
   %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
diff --git a/test/CodeGen/R600/fmaxnum.ll b/test/CodeGen/R600/fmaxnum.ll
index c105598..3029bd02 100644
--- a/test/CodeGen/R600/fmaxnum.ll
+++ b/test/CodeGen/R600/fmaxnum.ll
@@ -11,6 +11,9 @@ declare double @llvm.maxnum.f64(double, double)
 
 ; FUNC-LABEL: @test_fmax_f32
 ; SI: v_max_f32_e32
+
+; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
+; EG: MAX_DX10 {{.*}}[[OUT]]
 define void @test_fmax_f32(float addrspace(1)* %out, float %a, float %b) nounwind {
   %val = call float @llvm.maxnum.f32(float %a, float %b) #0
   store float %val, float addrspace(1)* %out, align 4
@@ -20,6 +23,10 @@ define void @test_fmax_f32(float addrspace(1)* %out, float %a, float %b) nounwin
 ; FUNC-LABEL: @test_fmax_v2f32
 ; SI: v_max_f32_e32
 ; SI: v_max_f32_e32
+
+; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+]]
+; EG: MAX_DX10 {{.*}}[[OUT]]
+; EG: MAX_DX10 {{.*}}[[OUT]]
 define void @test_fmax_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) nounwind {
   %val = call <2 x float> @llvm.maxnum.v2f32(<2 x float> %a, <2 x float> %b) #0
   store <2 x float> %val, <2 x float> addrspace(1)* %out, align 8
@@ -31,6 +38,12 @@ define void @test_fmax_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2
 ; SI: v_max_f32_e32
 ; SI: v_max_f32_e32
 ; SI: v_max_f32_e32
+
+; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+]]
+; EG: MAX_DX10 {{.*}}[[OUT]]
+; EG: MAX_DX10 {{.*}}[[OUT]]
+; EG: MAX_DX10 {{.*}}[[OUT]]
+; EG: MAX_DX10 {{.*}}[[OUT]]
 define void @test_fmax_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b) nounwind {
   %val = call <4 x float> @llvm.maxnum.v4f32(<4 x float> %a, <4 x float> %b) #0
   store <4 x float> %val, <4 x float> addrspace(1)* %out, align 16
@@ -46,6 +59,17 @@ define void @test_fmax_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, <4
 ; SI: v_max_f32_e32
 ; SI: v_max_f32_e32
 ; SI: v_max_f32_e32
+
+; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT1:T[0-9]+]]
+; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT2:T[0-9]+]]
+; EG-DAG: MAX_DX10 {{.*}}[[OUT1]].X
+; EG-DAG: MAX_DX10 {{.*}}[[OUT1]].Y
+; EG-DAG: MAX_DX10 {{.*}}[[OUT1]].Z
+; EG-DAG: MAX_DX10 {{.*}}[[OUT1]].W
+; EG-DAG: MAX_DX10 {{.*}}[[OUT2]].X
+; EG-DAG: MAX_DX10 {{.*}}[[OUT2]].Y
+; EG-DAG: MAX_DX10 {{.*}}[[OUT2]].Z
+; EG-DAG: MAX_DX10 {{.*}}[[OUT2]].W
 define void @test_fmax_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, <8 x float> %b) nounwind {
   %val = call <8 x float> @llvm.maxnum.v8f32(<8 x float> %a, <8 x float> %b) #0
   store <8 x float> %val, <8 x float> addrspace(1)* %out, align 32
@@ -69,6 +93,27 @@ define void @test_fmax_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, <8
 ; SI: v_max_f32_e32
 ; SI: v_max_f32_e32
 ; SI: v_max_f32_e32
+
+; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT1:T[0-9]+]]
+; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT2:T[0-9]+]]
+; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT3:T[0-9]+]]
+; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT4:T[0-9]+]]
+; EG-DAG: MAX_DX10 {{.*}}[[OUT1]].X
+; EG-DAG: MAX_DX10 {{.*}}[[OUT1]].Y
+; EG-DAG: MAX_DX10 {{.*}}[[OUT1]].Z
+; EG-DAG: MAX_DX10 {{.*}}[[OUT1]].W
+; EG-DAG: MAX_DX10 {{.*}}[[OUT2]].X
+; EG-DAG: MAX_DX10 {{.*}}[[OUT2]].Y
+; EG-DAG: MAX_DX10 {{.*}}[[OUT2]].Z
+; EG-DAG: MAX_DX10 {{.*}}[[OUT2]].W
+; EG-DAG: MAX_DX10 {{.*}}[[OUT3]].X
+; EG-DAG: MAX_DX10 {{.*}}[[OUT3]].Y
+; EG-DAG: MAX_DX10 {{.*}}[[OUT3]].Z
+; EG-DAG: MAX_DX10 {{.*}}[[OUT3]].W
+; EG-DAG: MAX_DX10 {{.*}}[[OUT4]].X
+; EG-DAG: MAX_DX10 {{.*}}[[OUT4]].Y
+; EG-DAG: MAX_DX10 {{.*}}[[OUT4]].Z
+; EG-DAG: MAX_DX10 {{.*}}[[OUT4]].W
 define void @test_fmax_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %a, <16 x float> %b) nounwind {
   %val = call <16 x float> @llvm.maxnum.v16f32(<16 x float> %a, <16 x float> %b) #0
   store <16 x float> %val, <16 x float> addrspace(1)* %out, align 64
@@ -79,6 +124,10 @@ define void @test_fmax_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %a,
 ; SI-NOT: v_max_f32_e32
 ; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 2.0
 ; SI: buffer_store_dword [[REG]]
+
+; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
+; EG-NOT: MAX_DX10
+; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}}
 define void @constant_fold_fmax_f32(float addrspace(1)* %out) nounwind {
   %val = call float @llvm.maxnum.f32(float 1.0, float 2.0) #0
   store float %val, float addrspace(1)* %out, align 4
@@ -89,6 +138,11 @@ define void @constant_fold_fmax_f32(float addrspace(1)* %out) nounwind {
 ; SI-NOT: v_max_f32_e32
 ; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7fc00000
 ; SI: buffer_store_dword [[REG]]
+
+; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
+; EG-NOT: MAX_DX10
+; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}}
+; EG: 2143289344(nan)
 define void @constant_fold_fmax_f32_nan_nan(float addrspace(1)* %out) nounwind {
   %val = call float @llvm.maxnum.f32(float 0x7FF8000000000000, float 0x7FF8000000000000) #0
   store float %val, float addrspace(1)* %out, align 4
@@ -99,6 +153,10 @@ define void @constant_fold_fmax_f32_nan_nan(float addrspace(1)* %out) nounwind {
 ; SI-NOT: v_max_f32_e32
 ; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 1.0
 ; SI: buffer_store_dword [[REG]]
+
+; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
+; EG-NOT: MAX_DX10
+; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}}
 define void @constant_fold_fmax_f32_val_nan(float addrspace(1)* %out) nounwind {
   %val = call float @llvm.maxnum.f32(float 1.0, float 0x7FF8000000000000) #0
   store float %val, float addrspace(1)* %out, align 4
@@ -109,6 +167,10 @@ define void @constant_fold_fmax_f32_val_nan(float addrspace(1)* %out) nounwind {
 ; SI-NOT: v_max_f32_e32
 ; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 1.0
 ; SI: buffer_store_dword [[REG]]
+
+; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
+; EG-NOT: MAX_DX10
+; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}}
 define void @constant_fold_fmax_f32_nan_val(float addrspace(1)* %out) nounwind {
   %val = call float @llvm.maxnum.f32(float 0x7FF8000000000000, float 1.0) #0
   store float %val, float addrspace(1)* %out, align 4
@@ -119,6 +181,10 @@ define void @constant_fold_fmax_f32_nan_val(float addrspace(1)* %out) nounwind {
 ; SI-NOT: v_max_f32_e32
 ; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0
 ; SI: buffer_store_dword [[REG]]
+
+; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
+; EG-NOT: MAX_DX10
+; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}}
 define void @constant_fold_fmax_f32_p0_p0(float addrspace(1)* %out) nounwind {
   %val = call float @llvm.maxnum.f32(float 0.0, float 0.0) #0
   store float %val, float addrspace(1)* %out, align 4
@@ -129,6 +195,10 @@ define void @constant_fold_fmax_f32_p0_p0(float addrspace(1)* %out) nounwind {
 ; SI-NOT: v_max_f32_e32
 ; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0
 ; SI: buffer_store_dword [[REG]]
+
+; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
+; EG-NOT: MAX_DX10
+; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}}
 define void @constant_fold_fmax_f32_p0_n0(float addrspace(1)* %out) nounwind {
   %val = call float @llvm.maxnum.f32(float 0.0, float -0.0) #0
   store float %val, float addrspace(1)* %out, align 4
@@ -139,6 +209,10 @@ define void @constant_fold_fmax_f32_p0_n0(float addrspace(1)* %out) nounwind {
 ; SI-NOT: v_max_f32_e32
 ; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x80000000
 ; SI: buffer_store_dword [[REG]]
+
+; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
+; EG-NOT: MAX_DX10
+; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}}
 define void @constant_fold_fmax_f32_n0_p0(float addrspace(1)* %out) nounwind {
   %val = call float @llvm.maxnum.f32(float -0.0, float 0.0) #0
   store float %val, float addrspace(1)* %out, align 4
@@ -149,6 +223,10 @@ define void @constant_fold_fmax_f32_n0_p0(float addrspace(1)* %out) nounwind {
 ; SI-NOT: v_max_f32_e32
 ; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x80000000
 ; SI: buffer_store_dword [[REG]]
+
+; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
+; EG-NOT: MAX_DX10
+; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}}
 define void @constant_fold_fmax_f32_n0_n0(float addrspace(1)* %out) nounwind {
   %val = call float @llvm.maxnum.f32(float -0.0, float -0.0) #0
   store float %val, float addrspace(1)* %out, align 4
@@ -157,6 +235,10 @@ define void @constant_fold_fmax_f32_n0_n0(float addrspace(1)* %out) nounwind {
 
 ; FUNC-LABEL: @fmax_var_immediate_f32
 ; SI: v_max_f32_e64 {{v[0-9]+}}, 2.0, {{s[0-9]+}}
+
+; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
+; EG-NOT: MAX_DX10
+; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}}
 define void @fmax_var_immediate_f32(float addrspace(1)* %out, float %a) nounwind {
   %val = call float @llvm.maxnum.f32(float %a, float 2.0) #0
   store float %val, float addrspace(1)* %out, align 4
@@ -165,6 +247,9 @@ define void @fmax_var_immediate_f32(float addrspace(1)* %out, float %a) nounwind
 
 ; FUNC-LABEL: @fmax_immediate_var_f32
 ; SI: v_max_f32_e64 {{v[0-9]+}}, 2.0, {{s[0-9]+}}
+
+; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
+; EG: MAX_DX10 {{.*}}[[OUT]], {{KC0\[[0-9]\].[XYZW]}}, literal.{{[xy]}}
 define void @fmax_immediate_var_f32(float addrspace(1)* %out, float %a) nounwind {
   %val = call float @llvm.maxnum.f32(float 2.0, float %a) #0
   store float %val, float addrspace(1)* %out, align 4
@@ -174,6 +259,9 @@ define void @fmax_immediate_var_f32(float addrspace(1)* %out, float %a) nounwind
 ; FUNC-LABEL: @fmax_var_literal_f32
 ; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x42c60000
 ; SI: v_max_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, [[REG]]
+
+; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
+; EG: MAX_DX10 {{.*}}[[OUT]], {{KC0\[[0-9]\].[XYZW]}}, literal.{{[xy]}}
 define void @fmax_var_literal_f32(float addrspace(1)* %out, float %a) nounwind {
   %val = call float @llvm.maxnum.f32(float %a, float 99.0) #0
   store float %val, float addrspace(1)* %out, align 4
@@ -183,6 +271,9 @@ define void @fmax_var_literal_f32(float addrspace(1)* %out, float %a) nounwind {
 ; FUNC-LABEL: @fmax_literal_var_f32
 ; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x42c60000
 ; SI: v_max_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, [[REG]]
+
+; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
+; EG: MAX_DX10 {{.*}}[[OUT]], {{KC0\[[0-9]\].[XYZW]}}, literal.{{[xy]}}
 define void @fmax_literal_var_f32(float addrspace(1)* %out, float %a) nounwind {
   %val = call float @llvm.maxnum.f32(float 99.0, float %a) #0
   store float %val, float addrspace(1)* %out, align 4
diff --git a/test/CodeGen/R600/fminnum.ll b/test/CodeGen/R600/fminnum.ll
index 6b93b83..4d7b525 100644
--- a/test/CodeGen/R600/fminnum.ll
+++ b/test/CodeGen/R600/fminnum.ll
@@ -1,5 +1,6 @@
 ; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 ; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 
 declare float @llvm.minnum.f32(float, float) #0
 declare <2 x float> @llvm.minnum.v2f32(<2 x float>, <2 x float>) #0
@@ -9,6 +10,9 @@ declare <16 x float> @llvm.minnum.v16f32(<16 x float>, <16 x float>) #0
 
 ; FUNC-LABEL: @test_fmin_f32
 ; SI: v_min_f32_e32
+
+; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
+; EG: MIN_DX10 {{.*}}[[OUT]]
 define void @test_fmin_f32(float addrspace(1)* %out, float %a, float %b) nounwind {
   %val = call float @llvm.minnum.f32(float %a, float %b) #0
   store float %val, float addrspace(1)* %out, align 4
@@ -18,6 +22,10 @@ define void @test_fmin_f32(float addrspace(1)* %out, float %a, float %b) nounwin
 ; FUNC-LABEL: @test_fmin_v2f32
 ; SI: v_min_f32_e32
 ; SI: v_min_f32_e32
+
+; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+]]
+; EG: MIN_DX10 {{.*}}[[OUT]]
+; EG: MIN_DX10 {{.*}}[[OUT]]
 define void @test_fmin_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) nounwind {
   %val = call <2 x float> @llvm.minnum.v2f32(<2 x float> %a, <2 x float> %b) #0
   store <2 x float> %val, <2 x float> addrspace(1)* %out, align 8
@@ -29,6 +37,12 @@ define void @test_fmin_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2
 ; SI: v_min_f32_e32
 ; SI: v_min_f32_e32
 ; SI: v_min_f32_e32
+
+; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+]]
+; EG: MIN_DX10 {{.*}}[[OUT]]
+; EG: MIN_DX10 {{.*}}[[OUT]]
+; EG: MIN_DX10 {{.*}}[[OUT]]
+; EG: MIN_DX10 {{.*}}[[OUT]]
 define void @test_fmin_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b) nounwind {
   %val = call <4 x float> @llvm.minnum.v4f32(<4 x float> %a, <4 x float> %b) #0
   store <4 x float> %val, <4 x float> addrspace(1)* %out, align 16
@@ -44,6 +58,17 @@ define void @test_fmin_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, <4
 ; SI: v_min_f32_e32
 ; SI: v_min_f32_e32
 ; SI: v_min_f32_e32
+
+; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT1:T[0-9]+]]
+; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT2:T[0-9]+]]
+; EG-DAG: MIN_DX10 {{.*}}[[OUT1]].X
+; EG-DAG: MIN_DX10 {{.*}}[[OUT1]].Y
+; EG-DAG: MIN_DX10 {{.*}}[[OUT1]].Z
+; EG-DAG: MIN_DX10 {{.*}}[[OUT1]].W
+; EG-DAG: MIN_DX10 {{.*}}[[OUT2]].X
+; EG-DAG: MIN_DX10 {{.*}}[[OUT2]].Y
+; EG-DAG: MIN_DX10 {{.*}}[[OUT2]].Z
+; EG-DAG: MIN_DX10 {{.*}}[[OUT2]].W
 define void @test_fmin_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, <8 x float> %b) nounwind {
   %val = call <8 x float> @llvm.minnum.v8f32(<8 x float> %a, <8 x float> %b) #0
   store <8 x float> %val, <8 x float> addrspace(1)* %out, align 32
@@ -67,6 +92,27 @@ define void @test_fmin_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, <8
 ; SI: v_min_f32_e32
 ; SI: v_min_f32_e32
 ; SI: v_min_f32_e32
+
+; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT1:T[0-9]+]]
+; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT2:T[0-9]+]]
+; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT3:T[0-9]+]]
+; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT4:T[0-9]+]]
+; EG-DAG: MIN_DX10 {{.*}}[[OUT1]].X
+; EG-DAG: MIN_DX10 {{.*}}[[OUT1]].Y
+; EG-DAG: MIN_DX10 {{.*}}[[OUT1]].Z
+; EG-DAG: MIN_DX10 {{.*}}[[OUT1]].W
+; EG-DAG: MIN_DX10 {{.*}}[[OUT2]].X
+; EG-DAG: MIN_DX10 {{.*}}[[OUT2]].Y
+; EG-DAG: MIN_DX10 {{.*}}[[OUT2]].Z
+; EG-DAG: MIN_DX10 {{.*}}[[OUT2]].W
+; EG-DAG: MIN_DX10 {{.*}}[[OUT3]].X
+; EG-DAG: MIN_DX10 {{.*}}[[OUT3]].Y
+; EG-DAG: MIN_DX10 {{.*}}[[OUT3]].Z
+; EG-DAG: MIN_DX10 {{.*}}[[OUT3]].W
+; EG-DAG: MIN_DX10 {{.*}}[[OUT4]].X
+; EG-DAG: MIN_DX10 {{.*}}[[OUT4]].Y
+; EG-DAG: MIN_DX10 {{.*}}[[OUT4]].Z
+; EG-DAG: MIN_DX10 {{.*}}[[OUT4]].W
 define void @test_fmin_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %a, <16 x float> %b) nounwind {
   %val = call <16 x float> @llvm.minnum.v16f32(<16 x float> %a, <16 x float> %b) #0
   store <16 x float> %val, <16 x float> addrspace(1)* %out, align 64
@@ -77,6 +123,10 @@ define void @test_fmin_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %a,
 ; SI-NOT: v_min_f32_e32
 ; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 1.0
 ; SI: buffer_store_dword [[REG]]
+
+; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
+; EG-NOT: MIN_DX10
+; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}}
 define void @constant_fold_fmin_f32(float addrspace(1)* %out) nounwind {
   %val = call float @llvm.minnum.f32(float 1.0, float 2.0) #0
   store float %val, float addrspace(1)* %out, align 4
@@ -87,6 +137,11 @@ define void @constant_fold_fmin_f32(float addrspace(1)* %out) nounwind {
 ; SI-NOT: v_min_f32_e32
 ; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7fc00000
 ; SI: buffer_store_dword [[REG]]
+
+; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
+; EG-NOT: MIN_DX10
+; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}}
+; EG: 2143289344({{nan|1\.#QNAN0e\+00}})
 define void @constant_fold_fmin_f32_nan_nan(float addrspace(1)* %out) nounwind {
   %val = call float @llvm.minnum.f32(float 0x7FF8000000000000, float 0x7FF8000000000000) #0
   store float %val, float addrspace(1)* %out, align 4
@@ -97,6 +152,10 @@ define void @constant_fold_fmin_f32_nan_nan(float addrspace(1)* %out) nounwind {
 ; SI-NOT: v_min_f32_e32
 ; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 1.0
 ; SI: buffer_store_dword [[REG]]
+
+; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
+; EG-NOT: MIN_DX10
+; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}}
 define void @constant_fold_fmin_f32_val_nan(float addrspace(1)* %out) nounwind {
   %val = call float @llvm.minnum.f32(float 1.0, float 0x7FF8000000000000) #0
   store float %val, float addrspace(1)* %out, align 4
@@ -107,6 +166,10 @@ define void @constant_fold_fmin_f32_val_nan(float addrspace(1)* %out) nounwind {
 ; SI-NOT: v_min_f32_e32
 ; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 1.0
 ; SI: buffer_store_dword [[REG]]
+
+; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
+; EG-NOT: MIN_DX10
+; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}}
 define void @constant_fold_fmin_f32_nan_val(float addrspace(1)* %out) nounwind {
   %val = call float @llvm.minnum.f32(float 0x7FF8000000000000, float 1.0) #0
   store float %val, float addrspace(1)* %out, align 4
@@ -117,6 +180,10 @@ define void @constant_fold_fmin_f32_nan_val(float addrspace(1)* %out) nounwind {
 ; SI-NOT: v_min_f32_e32
 ; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0
 ; SI: buffer_store_dword [[REG]]
+
+; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
+; EG-NOT: MIN_DX10
+; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}}
 define void @constant_fold_fmin_f32_p0_p0(float addrspace(1)* %out) nounwind {
   %val = call float @llvm.minnum.f32(float 0.0, float 0.0) #0
   store float %val, float addrspace(1)* %out, align 4
@@ -127,6 +194,10 @@ define void @constant_fold_fmin_f32_p0_p0(float addrspace(1)* %out) nounwind {
 ; SI-NOT: v_min_f32_e32
 ; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0
 ; SI: buffer_store_dword [[REG]]
+
+; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
+; EG-NOT: MIN_DX10
+; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}}
 define void @constant_fold_fmin_f32_p0_n0(float addrspace(1)* %out) nounwind {
   %val = call float @llvm.minnum.f32(float 0.0, float -0.0) #0
   store float %val, float addrspace(1)* %out, align 4
@@ -137,6 +208,10 @@ define void @constant_fold_fmin_f32_p0_n0(float addrspace(1)* %out) nounwind {
 ; SI-NOT: v_min_f32_e32
 ; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x80000000
 ; SI: buffer_store_dword [[REG]]
+
+; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
+; EG-NOT: MIN_DX10
+; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}}
 define void @constant_fold_fmin_f32_n0_p0(float addrspace(1)* %out) nounwind {
   %val = call float @llvm.minnum.f32(float -0.0, float 0.0) #0
   store float %val, float addrspace(1)* %out, align 4
@@ -147,6 +222,10 @@ define void @constant_fold_fmin_f32_n0_p0(float addrspace(1)* %out) nounwind {
 ; SI-NOT: v_min_f32_e32
 ; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x80000000
 ; SI: buffer_store_dword [[REG]]
+
+; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
+; EG-NOT: MIN_DX10
+; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}}
 define void @constant_fold_fmin_f32_n0_n0(float addrspace(1)* %out) nounwind {
   %val = call float @llvm.minnum.f32(float -0.0, float -0.0) #0
   store float %val, float addrspace(1)* %out, align 4
@@ -155,6 +234,9 @@ define void @constant_fold_fmin_f32_n0_n0(float addrspace(1)* %out) nounwind {
 
 ; FUNC-LABEL: @fmin_var_immediate_f32
 ; SI: v_min_f32_e64 {{v[0-9]+}}, 2.0, {{s[0-9]+}}
+
+; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
+; EG: MIN_DX10 {{.*}}[[OUT]], {{KC0\[[0-9]\].[XYZW]}}, literal.{{[xy]}}
 define void @fmin_var_immediate_f32(float addrspace(1)* %out, float %a) nounwind {
   %val = call float @llvm.minnum.f32(float %a, float 2.0) #0
   store float %val, float addrspace(1)* %out, align 4
@@ -163,6 +245,9 @@ define void @fmin_var_immediate_f32(float addrspace(1)* %out, float %a) nounwind
 
 ; FUNC-LABEL: @fmin_immediate_var_f32
 ; SI: v_min_f32_e64 {{v[0-9]+}}, 2.0, {{s[0-9]+}}
+
+; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
+; EG: MIN_DX10 {{.*}}[[OUT]], {{KC0\[[0-9]\].[XYZW]}}, literal.{{[xy]}}
 define void @fmin_immediate_var_f32(float addrspace(1)* %out, float %a) nounwind {
   %val = call float @llvm.minnum.f32(float 2.0, float %a) #0
   store float %val, float addrspace(1)* %out, align 4
@@ -172,6 +257,9 @@ define void @fmin_immediate_var_f32(float addrspace(1)* %out, float %a) nounwind
 ; FUNC-LABEL: @fmin_var_literal_f32
 ; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x42c60000
 ; SI: v_min_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, [[REG]]
+
+; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
+; EG: MIN_DX10 {{.*}}[[OUT]], {{KC0\[[0-9]\].[XYZW]}}, literal.{{[xy]}}
 define void @fmin_var_literal_f32(float addrspace(1)* %out, float %a) nounwind {
   %val = call float @llvm.minnum.f32(float %a, float 99.0) #0
   store float %val, float addrspace(1)* %out, align 4
@@ -181,6 +269,9 @@ define void @fmin_var_literal_f32(float addrspace(1)* %out, float %a) nounwind {
 ; FUNC-LABEL: @fmin_literal_var_f32
 ; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x42c60000
 ; SI: v_min_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, [[REG]]
+
+; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
+; EG: MIN_DX10 {{.*}}[[OUT]], {{KC0\[[0-9]\].[XYZW]}}, literal.{{[xy]}}
 define void @fmin_literal_var_f32(float addrspace(1)* %out, float %a) nounwind {
   %val = call float @llvm.minnum.f32(float 99.0, float %a) #0
   store float %val, float addrspace(1)* %out, align 4
diff --git a/test/CodeGen/R600/ftrunc.f64.ll b/test/CodeGen/R600/ftrunc.f64.ll
index 4ea84a7..6618d8b 100644
--- a/test/CodeGen/R600/ftrunc.f64.ll
+++ b/test/CodeGen/R600/ftrunc.f64.ll
@@ -27,9 +27,9 @@ define void @v_ftrunc_f64(double addrspace(1)* %out, double addrspace(1)* %in) {
 ; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000
 ; SI: s_add_i32 s{{[0-9]+}}, [[SEXP]], 0xfffffc01
 ; SI: s_lshr_b64
-; SI: cmp_gt_i32
 ; SI: s_not_b64
 ; SI: s_and_b64
+; SI: cmp_gt_i32
 ; SI: cndmask_b32
 ; SI: cndmask_b32
 ; SI: cmp_lt_i32
diff --git a/test/CodeGen/R600/local-memory-two-objects.ll b/test/CodeGen/R600/local-memory-two-objects.ll
index caa4b19..06a8b12 100644
--- a/test/CodeGen/R600/local-memory-two-objects.ll
+++ b/test/CodeGen/R600/local-memory-two-objects.ll
@@ -32,8 +32,8 @@
 ; EG-NOT: LDS_READ_RET {{[*]*}} OQAP, T[[ADDRR]]
 ; SI: v_add_i32_e32 [[SIPTR:v[0-9]+]], 16, v{{[0-9]+}}
 ; SI: ds_read_b32 {{v[0-9]+}}, [[SIPTR]]
-; CI: ds_read_b32 {{v[0-9]+}}, [[ADDRR:v[0-9]+]]
-; CI: ds_read_b32 {{v[0-9]+}}, [[ADDRR]] offset:16
+; CI: ds_read_b32 {{v[0-9]+}}, [[ADDRR:v[0-9]+]] offset:16
+; CI: ds_read_b32 {{v[0-9]+}}, [[ADDRR]]
 
 define void @local_memory_two_objects(i32 addrspace(1)* %out) {
 entry:
diff --git a/test/CodeGen/R600/operand-folding.ll b/test/CodeGen/R600/operand-folding.ll
index 4bf748e..816755e 100644
--- a/test/CodeGen/R600/operand-folding.ll
+++ b/test/CodeGen/R600/operand-folding.ll
@@ -19,7 +19,7 @@ endif:
 }
 
 ; CHECK-LABEL: {{^}}fold_imm:
-; CHECK v_or_i32_e32 v{{[0-9]+}}, 5
+; CHECK: v_or_b32_e32 v{{[0-9]+}}, 5
 define void @fold_imm(i32 addrspace(1)* %out, i32 %cmp) {
 entry:
   %fold = add i32 3, 2
diff --git a/test/CodeGen/R600/si-annotate-cf.ll b/test/CodeGen/R600/si-annotate-cf.ll
new file mode 100644
index 0000000..3f30988
--- /dev/null
+++ b/test/CodeGen/R600/si-annotate-cf.ll
@@ -0,0 +1,25 @@
+; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s
+; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s
+
+; FUNC-LABEL: {{^}}break_inserted_outside_of_loop:
+
+; SI: [[LOOP_LABEL:[A-Z0-9]+]]:
+; Lowered break instructin:
+; SI: s_or_b64
+; Lowered Loop instruction:
+; SI: s_andn2_b64
+; s_cbranch_execnz [[LOOP_LABEL]]
+; SI: s_endpgm
+define void @break_inserted_outside_of_loop(i32 addrspace(1)* %out, i32 %a, i32 %b) {
+main_body:
+  %0 = and i32 %a, %b
+  %1 = trunc i32 %0 to i1
+  br label %ENDIF
+
+ENDLOOP:
+  store i32 0, i32 addrspace(1)* %out
+  ret void
+
+ENDIF:
+  br i1 %1, label %ENDLOOP, label %ENDIF
+}
diff --git a/test/CodeGen/R600/unaligned-load-store.ll b/test/CodeGen/R600/unaligned-load-store.ll
index efb1de2..82d88eb 100644
--- a/test/CodeGen/R600/unaligned-load-store.ll
+++ b/test/CodeGen/R600/unaligned-load-store.ll
@@ -195,7 +195,7 @@ define void @load_lds_i64_align_4_with_offset(i64 addrspace(1)* nocapture %out,
 
 ; SI-LABEL: {{^}}load_lds_i64_align_4_with_split_offset:
 ; The tests for the case where the lo offset is 8-bits, but the hi offset is 9-bits
-; SI: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]}} offset0:0 offset1:1
+; SI: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]}} offset1:1
 ; SI: s_endpgm
 define void @load_lds_i64_align_4_with_split_offset(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 {
   %ptr = bitcast i64 addrspace(3)* %in to i32 addrspace(3)*
@@ -243,7 +243,7 @@ define void @store_lds_i64_align_4_with_offset(i64 addrspace(3)* %out) #0 {
 
 ; SI-LABEL: {{^}}store_lds_i64_align_4_with_split_offset:
 ; The tests for the case where the lo offset is 8-bits, but the hi offset is 9-bits
-; SI: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset0:0 offset1:1
+; SI: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset1:1
 ; SI: s_endpgm
 define void @store_lds_i64_align_4_with_split_offset(i64 addrspace(3)* %out) #0 {
   %ptr = bitcast i64 addrspace(3)* %out to i32 addrspace(3)*
diff --git a/test/CodeGen/SPARC/2008-10-10-InlineAsmRegOperand.ll b/test/CodeGen/SPARC/2008-10-10-InlineAsmRegOperand.ll
index c12e9c1..7975ee4 100644
--- a/test/CodeGen/SPARC/2008-10-10-InlineAsmRegOperand.ll
+++ b/test/CodeGen/SPARC/2008-10-10-InlineAsmRegOperand.ll
@@ -7,7 +7,7 @@ module asm "\09.section\09\22.dtors\22,#alloc,#write"
 
 define void @frame_dummy() nounwind {
 entry:
-	%asmtmp = tail call void (i8*)* (void (i8*)*)* asm "", "=r,0"(void (i8*)* @_Jv_RegisterClasses) nounwind		; <void (i8*)*> [#uses=0]
+	%asmtmp = tail call void (i8*)* (void (i8*)*) asm "", "=r,0"(void (i8*)* @_Jv_RegisterClasses) nounwind		; <void (i8*)*> [#uses=0]
 	unreachable
 }
 
diff --git a/test/CodeGen/SPARC/2011-01-11-Call.ll b/test/CodeGen/SPARC/2011-01-11-Call.ll
index 067bade..8097e49 100644
--- a/test/CodeGen/SPARC/2011-01-11-Call.ll
+++ b/test/CodeGen/SPARC/2011-01-11-Call.ll
@@ -22,8 +22,8 @@
 
 define void @test() nounwind {
 entry:
- %0 = tail call i32 (...)* @foo() nounwind
- tail call void (...)* @bar() nounwind
+ %0 = tail call i32 (...) @foo() nounwind
+ tail call void (...) @bar() nounwind
  ret void
 }
 
@@ -48,6 +48,6 @@ declare void @bar(...)
 
 define i32 @test_tail_call_with_return() nounwind {
 entry:
- %0 = tail call i32 (...)* @foo() nounwind
+ %0 = tail call i32 (...) @foo() nounwind
  ret i32 %0
 }
diff --git a/test/CodeGen/SPARC/2011-01-19-DelaySlot.ll b/test/CodeGen/SPARC/2011-01-19-DelaySlot.ll
index 8a3edc6..29bca67 100644
--- a/test/CodeGen/SPARC/2011-01-19-DelaySlot.ll
+++ b/test/CodeGen/SPARC/2011-01-19-DelaySlot.ll
@@ -66,7 +66,7 @@ entry:
   br i1 %0, label %bb, label %bb1
 
 bb:                                               ; preds = %entry
-  %1 = tail call i32 (...)* @foo(i32 %a) nounwind
+  %1 = tail call i32 (...) @foo(i32 %a) nounwind
   ret i32 %1
 
 bb1:                                              ; preds = %entry
diff --git a/test/CodeGen/SPARC/64abi.ll b/test/CodeGen/SPARC/64abi.ll
index a7e482c..7c08998 100644
--- a/test/CodeGen/SPARC/64abi.ll
+++ b/test/CodeGen/SPARC/64abi.ll
@@ -436,7 +436,7 @@ declare i64 @receive_fp128(i64 %a, ...)
 ; CHECK:       call receive_fp128
 define i64 @test_fp128_variable_args(i64 %a, fp128 %b) {
 entry:
-  %0 = call i64 (i64, ...)* @receive_fp128(i64 %a, fp128 %b)
+  %0 = call i64 (i64, ...) @receive_fp128(i64 %a, fp128 %b)
   ret i64 %0
 }
 
diff --git a/test/CodeGen/SPARC/setjmp.ll b/test/CodeGen/SPARC/setjmp.ll
index e75ef96..17519c5 100644
--- a/test/CodeGen/SPARC/setjmp.ll
+++ b/test/CodeGen/SPARC/setjmp.ll
@@ -47,7 +47,7 @@ entry:
 
 bar.exit:                                         ; preds = %entry
   %8 = load i32, i32* %0, align 4, !tbaa !4
-  %9 = call i32 (i8*, ...)* @printf(i8* noalias getelementptr inbounds ([30 x i8], [30 x i8]* @.cst, i32 0, i32 0), i32 %8) #0
+  %9 = call i32 (i8*, ...) @printf(i8* noalias getelementptr inbounds ([30 x i8], [30 x i8]* @.cst, i32 0, i32 0), i32 %8) #0
   ret i32 0
 }
 
diff --git a/test/CodeGen/SPARC/tls.ll b/test/CodeGen/SPARC/tls.ll
index d54cf60..a70637b 100644
--- a/test/CodeGen/SPARC/tls.ll
+++ b/test/CodeGen/SPARC/tls.ll
@@ -99,7 +99,7 @@ entry:
 ; v9abs-obj: ]
 
 ; pic-obj: Relocations [
-; pic-obj:  Section (2) .rela.text {
+; pic-obj:  Section {{.*}} .rela.text {
 ; pic-obj:    0x{{[0-9,A-F]+}} R_SPARC_PC22 _GLOBAL_OFFSET_TABLE_ 0x4
 ; pic-obj:    0x{{[0-9,A-F]+}} R_SPARC_PC10 _GLOBAL_OFFSET_TABLE_ 0x8
 ; pic-obj:    0x{{[0-9,A-F]+}} R_SPARC_TLS_LDO_HIX22 local_symbol 0x0
diff --git a/test/CodeGen/SPARC/varargs.ll b/test/CodeGen/SPARC/varargs.ll
index 9f18644..c2d1e98 100644
--- a/test/CodeGen/SPARC/varargs.ll
+++ b/test/CodeGen/SPARC/varargs.ll
@@ -71,6 +71,6 @@ declare void @llvm.va_start(i8*)
 ; CHECK: , %f2
 define i32 @call_1d() #0 {
 entry:
-  %call = call double (i8*, double, ...)* @varargsfunc(i8* undef, double 1.000000e+00, double 2.000000e+00)
+  %call = call double (i8*, double, ...) @varargsfunc(i8* undef, double 1.000000e+00, double 2.000000e+00)
   ret i32 1
 }
diff --git a/test/CodeGen/SystemZ/ctpop-01.ll b/test/CodeGen/SystemZ/ctpop-01.ll
new file mode 100644
index 0000000..ad80f9f
--- /dev/null
+++ b/test/CodeGen/SystemZ/ctpop-01.ll
@@ -0,0 +1,96 @@
+; Test population-count instruction
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z196 | FileCheck %s
+
+declare i32 @llvm.ctpop.i32(i32 %a)
+declare i64 @llvm.ctpop.i64(i64 %a)
+
+define i32 @f1(i32 %a) {
+; CHECK-LABEL: f1:
+; CHECK: popcnt  %r0, %r2
+; CHECK: sllk    %r1, %r0, 16
+; CHECK: ar      %r1, %r0
+; CHECK: sllk    %r2, %r1, 8
+; CHECK: ar      %r2, %r1
+; CHECK: srl     %r2, 24
+; CHECK: br      %r14
+
+  %popcnt = call i32 @llvm.ctpop.i32(i32 %a)
+  ret i32 %popcnt
+}
+
+define i32 @f2(i32 %a) {
+; CHECK-LABEL: f2:
+; CHECK: llhr    %r0, %r2
+; CHECK: popcnt  %r0, %r0
+; CHECK: risblg  %r2, %r0, 16, 151, 8
+; CHECK: ar      %r2, %r0
+; CHECK: srl     %r2, 8
+; CHECK: br      %r14
+  %and = and i32 %a, 65535
+  %popcnt = call i32 @llvm.ctpop.i32(i32 %and)
+  ret i32 %popcnt
+}
+
+define i32 @f3(i32 %a) {
+; CHECK-LABEL: f3:
+; CHECK: llcr    %r0, %r2
+; CHECK: popcnt  %r2, %r0
+; CHECK: br      %r14
+  %and = and i32 %a, 255
+  %popcnt = call i32 @llvm.ctpop.i32(i32 %and)
+  ret i32 %popcnt
+}
+
+define i64 @f4(i64 %a) {
+; CHECK-LABEL: f4:
+; CHECK: popcnt  %r0, %r2
+; CHECK: sllg    %r1, %r0, 32
+; CHECK: agr     %r1, %r0
+; CHECK: sllg    %r0, %r1, 16
+; CHECK: agr     %r0, %r1
+; CHECK: sllg    %r1, %r0, 8
+; CHECK: agr     %r1, %r0
+; CHECK: srlg    %r2, %r1, 56
+; CHECK: br      %r14
+  %popcnt = call i64 @llvm.ctpop.i64(i64 %a)
+  ret i64 %popcnt
+}
+
+define i64 @f5(i64 %a) {
+; CHECK-LABEL: f5:
+; CHECK: llgfr   %r0, %r2
+; CHECK: popcnt  %r0, %r0
+; CHECK: sllg    %r1, %r0, 16
+; CHECK: algfr   %r0, %r1
+; CHECK: sllg    %r1, %r0, 8
+; CHECK: algfr   %r0, %r1
+; CHECK: srlg    %r2, %r0, 24
+  %and = and i64 %a, 4294967295
+  %popcnt = call i64 @llvm.ctpop.i64(i64 %and)
+  ret i64 %popcnt
+}
+
+define i64 @f6(i64 %a) {
+; CHECK-LABEL: f6:
+; CHECK: llghr   %r0, %r2
+; CHECK: popcnt  %r0, %r0
+; CHECK: risbg   %r1, %r0, 48, 183, 8
+; CHECK: agr     %r1, %r0
+; CHECK: srlg    %r2, %r1, 8
+; CHECK: br      %r14
+  %and = and i64 %a, 65535
+  %popcnt = call i64 @llvm.ctpop.i64(i64 %and)
+  ret i64 %popcnt
+}
+
+define i64 @f7(i64 %a) {
+; CHECK-LABEL: f7:
+; CHECK: llgcr   %r0, %r2
+; CHECK: popcnt  %r2, %r0
+; CHECK: br      %r14
+  %and = and i64 %a, 255
+  %popcnt = call i64 @llvm.ctpop.i64(i64 %and)
+  ret i64 %popcnt
+}
+
diff --git a/test/CodeGen/SystemZ/htm-intrinsics.ll b/test/CodeGen/SystemZ/htm-intrinsics.ll
new file mode 100644
index 0000000..6441ef9
--- /dev/null
+++ b/test/CodeGen/SystemZ/htm-intrinsics.ll
@@ -0,0 +1,352 @@
+; Test transactional-execution intrinsics.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=zEC12 | FileCheck %s
+
+declare i32 @llvm.s390.tbegin(i8 *, i32)
+declare i32 @llvm.s390.tbegin.nofloat(i8 *, i32)
+declare void @llvm.s390.tbeginc(i8 *, i32)
+declare i32 @llvm.s390.tend()
+declare void @llvm.s390.tabort(i64)
+declare void @llvm.s390.ntstg(i64, i64 *)
+declare i32 @llvm.s390.etnd()
+declare void @llvm.s390.ppa.txassist(i32)
+
+; TBEGIN.
+define void @test_tbegin() {
+; CHECK-LABEL: test_tbegin:
+; CHECK-NOT: stmg
+; CHECK: std %f8,
+; CHECK: std %f9,
+; CHECK: std %f10,
+; CHECK: std %f11,
+; CHECK: std %f12,
+; CHECK: std %f13,
+; CHECK: std %f14,
+; CHECK: std %f15,
+; CHECK: tbegin 0, 65292
+; CHECK: ld %f8,
+; CHECK: ld %f9,
+; CHECK: ld %f10,
+; CHECK: ld %f11,
+; CHECK: ld %f12,
+; CHECK: ld %f13,
+; CHECK: ld %f14,
+; CHECK: ld %f15,
+; CHECK: br %r14
+  call i32 @llvm.s390.tbegin(i8 *null, i32 65292)
+  ret void
+}
+
+; TBEGIN (nofloat).
+define void @test_tbegin_nofloat1() {
+; CHECK-LABEL: test_tbegin_nofloat1:
+; CHECK-NOT: stmg
+; CHECK-NOT: std
+; CHECK: tbegin 0, 65292
+; CHECK: br %r14
+  call i32 @llvm.s390.tbegin.nofloat(i8 *null, i32 65292)
+  ret void
+}
+
+; TBEGIN (nofloat) with integer CC return value.
+define i32 @test_tbegin_nofloat2() {
+; CHECK-LABEL: test_tbegin_nofloat2:
+; CHECK-NOT: stmg
+; CHECK-NOT: std
+; CHECK: tbegin 0, 65292
+; CHECK: ipm %r2
+; CHECK: srl %r2, 28
+; CHECK: br %r14
+  %res = call i32 @llvm.s390.tbegin.nofloat(i8 *null, i32 65292)
+  ret i32 %res
+}
+
+; TBEGIN (nofloat) with implicit CC check.
+define void @test_tbegin_nofloat3(i32 *%ptr) {
+; CHECK-LABEL: test_tbegin_nofloat3:
+; CHECK-NOT: stmg
+; CHECK-NOT: std
+; CHECK: tbegin 0, 65292
+; CHECK: jnh  {{\.L*}}
+; CHECK: mvhi 0(%r2), 0
+; CHECK: br %r14
+  %res = call i32 @llvm.s390.tbegin.nofloat(i8 *null, i32 65292)
+  %cmp = icmp eq i32 %res, 2
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:                                          ; preds = %entry
+  store i32 0, i32* %ptr, align 4
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %entry
+  ret void
+}
+
+; TBEGIN (nofloat) with dual CC use.
+define i32 @test_tbegin_nofloat4(i32 %pad, i32 *%ptr) {
+; CHECK-LABEL: test_tbegin_nofloat4:
+; CHECK-NOT: stmg
+; CHECK-NOT: std
+; CHECK: tbegin 0, 65292
+; CHECK: ipm %r2
+; CHECK: srl %r2, 28
+; CHECK: cijlh %r2, 2,  {{\.L*}}
+; CHECK: mvhi 0(%r3), 0
+; CHECK: br %r14
+  %res = call i32 @llvm.s390.tbegin.nofloat(i8 *null, i32 65292)
+  %cmp = icmp eq i32 %res, 2
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:                                          ; preds = %entry
+  store i32 0, i32* %ptr, align 4
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %entry
+  ret i32 %res
+}
+
+; TBEGIN (nofloat) with register.
+define void @test_tbegin_nofloat5(i8 *%ptr) {
+; CHECK-LABEL: test_tbegin_nofloat5:
+; CHECK-NOT: stmg
+; CHECK-NOT: std
+; CHECK: tbegin 0(%r2), 65292
+; CHECK: br %r14
+  call i32 @llvm.s390.tbegin.nofloat(i8 *%ptr, i32 65292)
+  ret void
+}
+
+; TBEGIN (nofloat) with GRSM 0x0f00.
+define void @test_tbegin_nofloat6() {
+; CHECK-LABEL: test_tbegin_nofloat6:
+; CHECK: stmg %r6, %r15,
+; CHECK-NOT: std
+; CHECK: tbegin 0, 3840
+; CHECK: br %r14
+  call i32 @llvm.s390.tbegin.nofloat(i8 *null, i32 3840)
+  ret void
+}
+
+; TBEGIN (nofloat) with GRSM 0xf100.
+define void @test_tbegin_nofloat7() {
+; CHECK-LABEL: test_tbegin_nofloat7:
+; CHECK: stmg %r8, %r15,
+; CHECK-NOT: std
+; CHECK: tbegin 0, 61696
+; CHECK: br %r14
+  call i32 @llvm.s390.tbegin.nofloat(i8 *null, i32 61696)
+  ret void
+}
+
+; TBEGIN (nofloat) with GRSM 0xfe00 -- stack pointer added automatically.
+define void @test_tbegin_nofloat8() {
+; CHECK-LABEL: test_tbegin_nofloat8:
+; CHECK-NOT: stmg
+; CHECK-NOT: std
+; CHECK: tbegin 0, 65280
+; CHECK: br %r14
+  call i32 @llvm.s390.tbegin.nofloat(i8 *null, i32 65024)
+  ret void
+}
+
+; TBEGIN (nofloat) with GRSM 0xfb00 -- no frame pointer needed.
+define void @test_tbegin_nofloat9() {
+; CHECK-LABEL: test_tbegin_nofloat9:
+; CHECK: stmg %r10, %r15,
+; CHECK-NOT: std
+; CHECK: tbegin 0, 64256
+; CHECK: br %r14
+  call i32 @llvm.s390.tbegin.nofloat(i8 *null, i32 64256)
+  ret void
+}
+
+; TBEGIN (nofloat) with GRSM 0xfb00 -- frame pointer added automatically.
+define void @test_tbegin_nofloat10(i64 %n) {
+; CHECK-LABEL: test_tbegin_nofloat10:
+; CHECK: stmg %r11, %r15,
+; CHECK-NOT: std
+; CHECK: tbegin 0, 65280
+; CHECK: br %r14
+  %buf = alloca i8, i64 %n
+  call i32 @llvm.s390.tbegin.nofloat(i8 *null, i32 64256)
+  ret void
+}
+
+; TBEGINC.
+define void @test_tbeginc() {
+; CHECK-LABEL: test_tbeginc:
+; CHECK-NOT: stmg
+; CHECK-NOT: std
+; CHECK: tbeginc 0, 65288
+; CHECK: br %r14
+  call void @llvm.s390.tbeginc(i8 *null, i32 65288)
+  ret void
+}
+
+; TEND with integer CC return value.
+define i32 @test_tend1() {
+; CHECK-LABEL: test_tend1:
+; CHECK: tend
+; CHECK: ipm %r2
+; CHECK: srl %r2, 28
+; CHECK: br %r14
+  %res = call i32 @llvm.s390.tend()
+  ret i32 %res
+}
+
+; TEND with implicit CC check.
+define void @test_tend3(i32 *%ptr) {
+; CHECK-LABEL: test_tend3:
+; CHECK: tend
+; CHECK: je  {{\.L*}}
+; CHECK: mvhi 0(%r2), 0
+; CHECK: br %r14
+  %res = call i32 @llvm.s390.tend()
+  %cmp = icmp eq i32 %res, 2
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:                                          ; preds = %entry
+  store i32 0, i32* %ptr, align 4
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %entry
+  ret void
+}
+
+; TEND with dual CC use.
+define i32 @test_tend2(i32 %pad, i32 *%ptr) {
+; CHECK-LABEL: test_tend2:
+; CHECK: tend
+; CHECK: ipm %r2
+; CHECK: srl %r2, 28
+; CHECK: cijlh %r2, 2,  {{\.L*}}
+; CHECK: mvhi 0(%r3), 0
+; CHECK: br %r14
+  %res = call i32 @llvm.s390.tend()
+  %cmp = icmp eq i32 %res, 2
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:                                          ; preds = %entry
+  store i32 0, i32* %ptr, align 4
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %entry
+  ret i32 %res
+}
+
+; TABORT with register only.
+define void @test_tabort1(i64 %val) {
+; CHECK-LABEL: test_tabort1:
+; CHECK: tabort 0(%r2)
+; CHECK: br %r14
+  call void @llvm.s390.tabort(i64 %val)
+  ret void
+}
+
+; TABORT with immediate only.
+define void @test_tabort2(i64 %val) {
+; CHECK-LABEL: test_tabort2:
+; CHECK: tabort 1234
+; CHECK: br %r14
+  call void @llvm.s390.tabort(i64 1234)
+  ret void
+}
+
+; TABORT with register + immediate.
+define void @test_tabort3(i64 %val) {
+; CHECK-LABEL: test_tabort3:
+; CHECK: tabort 1234(%r2)
+; CHECK: br %r14
+  %sum = add i64 %val, 1234
+  call void @llvm.s390.tabort(i64 %sum)
+  ret void
+}
+
+; TABORT with out-of-range immediate.
+define void @test_tabort4(i64 %val) {
+; CHECK-LABEL: test_tabort4:
+; CHECK: tabort 0({{%r[1-5]}})
+; CHECK: br %r14
+  call void @llvm.s390.tabort(i64 4096)
+  ret void
+}
+
+; NTSTG with base pointer only.
+define void @test_ntstg1(i64 *%ptr, i64 %val) {
+; CHECK-LABEL: test_ntstg1:
+; CHECK: ntstg %r3, 0(%r2)
+; CHECK: br %r14
+  call void @llvm.s390.ntstg(i64 %val, i64 *%ptr)
+  ret void
+}
+
+; NTSTG with base and index.
+; Check that VSTL doesn't allow an index.
+define void @test_ntstg2(i64 *%base, i64 %index, i64 %val) {
+; CHECK-LABEL: test_ntstg2:
+; CHECK: sllg [[REG:%r[1-5]]], %r3, 3
+; CHECK: ntstg %r4, 0([[REG]],%r2)
+; CHECK: br %r14
+  %ptr = getelementptr i64, i64 *%base, i64 %index
+  call void @llvm.s390.ntstg(i64 %val, i64 *%ptr)
+  ret void
+}
+
+; NTSTG with the highest in-range displacement.
+define void @test_ntstg3(i64 *%base, i64 %val) {
+; CHECK-LABEL: test_ntstg3:
+; CHECK: ntstg %r3, 524280(%r2)
+; CHECK: br %r14
+  %ptr = getelementptr i64, i64 *%base, i64 65535
+  call void @llvm.s390.ntstg(i64 %val, i64 *%ptr)
+  ret void
+}
+
+; NTSTG with an out-of-range positive displacement.
+define void @test_ntstg4(i64 *%base, i64 %val) {
+; CHECK-LABEL: test_ntstg4:
+; CHECK: ntstg %r3, 0({{%r[1-5]}})
+; CHECK: br %r14
+  %ptr = getelementptr i64, i64 *%base, i64 65536
+  call void @llvm.s390.ntstg(i64 %val, i64 *%ptr)
+  ret void
+}
+
+; NTSTG with the lowest in-range displacement.
+define void @test_ntstg5(i64 *%base, i64 %val) {
+; CHECK-LABEL: test_ntstg5:
+; CHECK: ntstg %r3, -524288(%r2)
+; CHECK: br %r14
+  %ptr = getelementptr i64, i64 *%base, i64 -65536
+  call void @llvm.s390.ntstg(i64 %val, i64 *%ptr)
+  ret void
+}
+
+; NTSTG with an out-of-range negative displacement.
+define void @test_ntstg6(i64 *%base, i64 %val) {
+; CHECK-LABEL: test_ntstg6:
+; CHECK: ntstg %r3, 0({{%r[1-5]}})
+; CHECK: br %r14
+  %ptr = getelementptr i64, i64 *%base, i64 -65537
+  call void @llvm.s390.ntstg(i64 %val, i64 *%ptr)
+  ret void
+}
+
+; ETND.
+define i32 @test_etnd() {
+; CHECK-LABEL: test_etnd:
+; CHECK: etnd %r2
+; CHECK: br %r14
+  %res = call i32 @llvm.s390.etnd()
+  ret i32 %res
+}
+
+; PPA (Transaction-Abort Assist)
+define void @test_ppa_txassist(i32 %val) {
+; CHECK-LABEL: test_ppa_txassist:
+; CHECK: ppa %r2, 0, 1
+; CHECK: br %r14
+  call void @llvm.s390.ppa.txassist(i32 %val)
+  ret void
+}
+
diff --git a/test/CodeGen/SystemZ/int-cmp-12.ll b/test/CodeGen/SystemZ/int-cmp-12.ll
index 077b224..d9c6a9f 100644
--- a/test/CodeGen/SystemZ/int-cmp-12.ll
+++ b/test/CodeGen/SystemZ/int-cmp-12.ll
@@ -49,13 +49,24 @@ define double @f4(double %a, double %b, i64 %i1) {
   ret double %res
 }
 
-; Check the next value up, which must use a register comparison.
+; Check the next value up, which can use a shifted comparison
 define double @f5(double %a, double %b, i64 %i1) {
 ; CHECK-LABEL: f5:
-; CHECK: clgrjl %r2,
+; CHECK: srlg [[REG:%r[0-5]]], %r2, 32
+; CHECK: cgije [[REG]], 0
 ; CHECK: ldr %f0, %f2
 ; CHECK: br %r14
   %cond = icmp ult i64 %i1, 4294967296
   %res = select i1 %cond, double %a, double %b
   ret double %res
 }
+; Check the next value up, which must use a register comparison.
+define double @f6(double %a, double %b, i64 %i1) {
+; CHECK-LABEL: f6:
+; CHECK: clgrjl %r2,
+; CHECK: ldr %f0, %f2
+; CHECK: br %r14
+  %cond = icmp ult i64 %i1, 4294967297
+  %res = select i1 %cond, double %a, double %b
+  ret double %res
+}
diff --git a/test/CodeGen/SystemZ/int-cmp-47.ll b/test/CodeGen/SystemZ/int-cmp-47.ll
index 038a25b..274350d 100644
--- a/test/CodeGen/SystemZ/int-cmp-47.ll
+++ b/test/CodeGen/SystemZ/int-cmp-47.ll
@@ -309,7 +309,8 @@ exit:
 define void @f17(i64 %a) {
 ; CHECK-LABEL: f17:
 ; CHECK-NOT: tmhh
-; CHECK: llihh {{%r[0-5]}}, 49151
+; CHECK: srlg [[REG:%r[0-5]]], %r2, 48
+; CHECK: cgfi [[REG]], 49151
 ; CHECK-NOT: tmhh
 ; CHECK: br %r14
 entry:
diff --git a/test/CodeGen/SystemZ/int-cmp-50.ll b/test/CodeGen/SystemZ/int-cmp-50.ll
new file mode 100644
index 0000000..287ac2c
--- /dev/null
+++ b/test/CodeGen/SystemZ/int-cmp-50.ll
@@ -0,0 +1,30 @@
+; Verify that we do not crash on always-true conditions
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 -O0
+;
+; This test was compiled using clang -O0 from the following source code:
+;
+; int test(unsigned long x)
+; {
+;   return x >= 0 && x <= 15;
+; }
+
+define signext i32 @test(i64 %x) {
+entry:
+  %x.addr = alloca i64, align 8
+  store i64 %x, i64* %x.addr, align 8
+  %0 = load i64, i64* %x.addr, align 8
+  %cmp = icmp uge i64 %0, 0
+  br i1 %cmp, label %land.rhs, label %land.end
+
+land.rhs:                                         ; preds = %entry
+  %1 = load i64, i64* %x.addr, align 8
+  %cmp1 = icmp ule i64 %1, 15
+  br label %land.end
+
+land.end:                                         ; preds = %land.rhs, %entry
+  %2 = phi i1 [ false, %entry ], [ %cmp1, %land.rhs ]
+  %land.ext = zext i1 %2 to i32
+  ret i32 %land.ext
+}
+
diff --git a/test/CodeGen/SystemZ/risbg-03.ll b/test/CodeGen/SystemZ/risbg-03.ll
new file mode 100644
index 0000000..c3c08ad
--- /dev/null
+++ b/test/CodeGen/SystemZ/risbg-03.ll
@@ -0,0 +1,30 @@
+; Test use of RISBG vs RISBGN on zEC12.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=zEC12 | FileCheck %s
+
+; On zEC12, we generally prefer RISBGN.
+define i64 @f1(i64 %a, i64 %b) {
+; CHECK-LABEL: f1:
+; CHECK: risbgn %r2, %r3, 60, 62, 0
+; CHECK: br %r14
+  %anda = and i64 %a, -15
+  %andb = and i64 %b, 14
+  %or = or i64 %anda, %andb
+  ret i64 %or
+}
+
+; But we may fall back to RISBG if we can use the condition code.
+define i64 @f2(i64 %a, i64 %b, i32* %c) {
+; CHECK-LABEL: f2:
+; CHECK: risbg %r2, %r3, 60, 62, 0
+; CHECK-NEXT: ipm
+; CHECK: br %r14
+  %anda = and i64 %a, -15
+  %andb = and i64 %b, 14
+  %or = or i64 %anda, %andb
+  %cmp = icmp sgt i64 %or, 0
+  %conv = zext i1 %cmp to i32
+  store i32 %conv, i32* %c, align 4
+  ret i64 %or
+}
+
diff --git a/test/CodeGen/SystemZ/tail-call-mem-intrinsics.ll b/test/CodeGen/SystemZ/tail-call-mem-intrinsics.ll
new file mode 100644
index 0000000..65cc394
--- /dev/null
+++ b/test/CodeGen/SystemZ/tail-call-mem-intrinsics.ll
@@ -0,0 +1,31 @@
+; RUN: llc -march=systemz < %s | FileCheck %s
+
+; CHECK-LABEL: tail_memcpy:
+; CHECK: jg memcpy
+define void @tail_memcpy(i8* nocapture %p, i8* nocapture readonly %q, i32 %n) #0 {
+entry:
+  tail call void @llvm.memcpy.p0i8.p0i8.i32(i8* %p, i8* %q, i32 %n, i32 1, i1 false)
+  ret void
+}
+
+; CHECK-LABEL: tail_memmove:
+; CHECK: jg memmove
+define void @tail_memmove(i8* nocapture %p, i8* nocapture readonly %q, i32 %n) #0 {
+entry:
+  tail call void @llvm.memmove.p0i8.p0i8.i32(i8* %p, i8* %q, i32 %n, i32 1, i1 false)
+  ret void
+}
+
+; CHECK-LABEL: tail_memset:
+; CHECK: jg memset
+define void @tail_memset(i8* nocapture %p, i8 %c, i32 %n) #0 {
+entry:
+  tail call void @llvm.memset.p0i8.i32(i8* %p, i8 %c, i32 %n, i32 1, i1 false)
+  ret void
+}
+
+declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture readonly, i32, i32, i1) #0
+declare void @llvm.memmove.p0i8.p0i8.i32(i8* nocapture, i8* nocapture readonly, i32, i32, i1) #0
+declare void @llvm.memset.p0i8.i32(i8* nocapture, i8, i32, i32, i1) #0
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/Thumb/2007-01-31-RegInfoAssert.ll b/test/CodeGen/Thumb/2007-01-31-RegInfoAssert.ll
index 37bcc36..2d2ac9c 100644
--- a/test/CodeGen/Thumb/2007-01-31-RegInfoAssert.ll
+++ b/test/CodeGen/Thumb/2007-01-31-RegInfoAssert.ll
@@ -9,7 +9,7 @@ define void @f1() {
 	%tmp7 = load i32, i32* %tmp1
 	%tmp14 = lshr i32 %tmp7, 1
 	%tmp1415 = and i32 %tmp14, 1
-	call void (i32, ...)* @printf( i32 undef, i32 0, i32 %tmp1415 )
+	call void (i32, ...) @printf( i32 undef, i32 0, i32 %tmp1415 )
 	ret void
 }
 
diff --git a/test/CodeGen/Thumb/2007-05-05-InvalidPushPop.ll b/test/CodeGen/Thumb/2007-05-05-InvalidPushPop.ll
index 71fb005..079ab87 100644
--- a/test/CodeGen/Thumb/2007-05-05-InvalidPushPop.ll
+++ b/test/CodeGen/Thumb/2007-05-05-InvalidPushPop.ll
@@ -25,12 +25,12 @@ entry:
 	%ret3 = bitcast i32* %ret to i8**		; <i8**> [#uses=2]
 	%tmp4 = call i32 @pthread_join( i32 %tmp2, i8** %ret3 )		; <i32> [#uses=0]
 	%tmp5 = load i32, i32* %ret		; <i32> [#uses=1]
-	%tmp7 = call i32 (i8*, ...)* @printf( i8* getelementptr ([14 x i8], [14 x i8]* @.str, i32 0, i32 0), i32 %tmp5 )		; <i32> [#uses=0]
+	%tmp7 = call i32 (i8*, ...) @printf( i8* getelementptr ([14 x i8], [14 x i8]* @.str, i32 0, i32 0), i32 %tmp5 )		; <i32> [#uses=0]
 	%tmp8 = call i32 @pthread_create( i32* %t, %struct.pthread_attr_t* null, i8* (i8*)* @f, i8* null )		; <i32> [#uses=0]
 	%tmp9 = load i32, i32* %t		; <i32> [#uses=1]
 	%tmp11 = call i32 @pthread_join( i32 %tmp9, i8** %ret3 )		; <i32> [#uses=0]
 	%tmp12 = load i32, i32* %ret		; <i32> [#uses=1]
-	%tmp14 = call i32 (i8*, ...)* @printf( i8* getelementptr ([14 x i8], [14 x i8]* @.str1, i32 0, i32 0), i32 %tmp12 )		; <i32> [#uses=0]
+	%tmp14 = call i32 (i8*, ...) @printf( i8* getelementptr ([14 x i8], [14 x i8]* @.str1, i32 0, i32 0), i32 %tmp12 )		; <i32> [#uses=0]
 	ret i32 0
 }
 
diff --git a/test/CodeGen/Thumb/2010-07-15-debugOrdering.ll b/test/CodeGen/Thumb/2010-07-15-debugOrdering.ll
index 58b15c8..9235830 100644
--- a/test/CodeGen/Thumb/2010-07-15-debugOrdering.ll
+++ b/test/CodeGen/Thumb/2010-07-15-debugOrdering.ll
@@ -150,6 +150,6 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnon
 !97 = !MDLocation(line: 227, scope: !94, inlinedAt: !96)
 !98 = !MDLocation(line: 52, scope: !1)
 !101 = !MDFile(filename: "ggEdgeDiscrepancy.cc", directory: "/Volumes/Home/grosbaj/sources/llvm-externals/speccpu2000/benchspec/CINT2000/252.eon/src")
-!102 = !{i32 0}
+!102 = !{}
 !103 = !{!3, !77}
 !104 = !{i32 1, !"Debug Info Version", i32 3}
diff --git a/test/CodeGen/Thumb/2011-05-11-DAGLegalizer.ll b/test/CodeGen/Thumb/2011-05-11-DAGLegalizer.ll
index ff574c2..d8e1651 100644
--- a/test/CodeGen/Thumb/2011-05-11-DAGLegalizer.ll
+++ b/test/CodeGen/Thumb/2011-05-11-DAGLegalizer.ll
@@ -48,7 +48,7 @@ do.body:                                          ; preds = %entry
   %tmp20 = bitcast %struct.RRRRRRRR* %agg.tmp16 to i8*
   %tmp21 = bitcast %struct.RRRRRRRR* %arrayidx19 to i8*
   call void @llvm.memcpy.p0i8.p0i8.i32(i8* %tmp20, i8* %tmp21, i32 312, i32 4, i1 false)
-  call void (i8*, i32, i8*, i8*, ...)* @CLLoggingLog(i8* %tmp, i32 2, i8* getelementptr inbounds ([62 x i8], [62 x i8]* @__PRETTY_FUNCTION__._ZN12CLGll, i32 0, i32 0), i8* getelementptr inbounds ([75 x i8], [75 x i8]* @.str, i32 0, i32 0), %struct.RRRRRRRR* byval %agg.tmp, %struct.RRRRRRRR* byval %agg.tmp4, %struct.RRRRRRRR* byval %agg.tmp10, %struct.RRRRRRRR* byval %agg.tmp16)
+  call void (i8*, i32, i8*, i8*, ...) @CLLoggingLog(i8* %tmp, i32 2, i8* getelementptr inbounds ([62 x i8], [62 x i8]* @__PRETTY_FUNCTION__._ZN12CLGll, i32 0, i32 0), i8* getelementptr inbounds ([75 x i8], [75 x i8]* @.str, i32 0, i32 0), %struct.RRRRRRRR* byval %agg.tmp, %struct.RRRRRRRR* byval %agg.tmp4, %struct.RRRRRRRR* byval %agg.tmp10, %struct.RRRRRRRR* byval %agg.tmp16)
   br label %do.end
 
 do.end:                                           ; preds = %do.body
diff --git a/test/CodeGen/Thumb/2011-06-16-NoGPRs.ll b/test/CodeGen/Thumb/2011-06-16-NoGPRs.ll
index d39a760..accb82c 100644
--- a/test/CodeGen/Thumb/2011-06-16-NoGPRs.ll
+++ b/test/CodeGen/Thumb/2011-06-16-NoGPRs.ll
@@ -16,7 +16,7 @@ declare i8* @f2(i8*, i8*, ...)
 
 define internal void @f(i8* %self, i8* %_cmd, %0* %inObjects, %0* %inIndexes) optsize ssp {
 entry:
-  %call14 = tail call i8* (i8*, i8*, ...)* (i8*, i8*)* @f1(i8* undef, i8* %_cmd) optsize
+  %call14 = tail call i8* (i8*, i8*, ...)* (i8*, i8*) @f1(i8* undef, i8* %_cmd) optsize
   %0 = bitcast i8* (i8*, i8*, ...)* %call14 to void (i8*, i8*, %0*, %0*)*
   tail call void %0(i8* %self, i8* %_cmd, %0* %inObjects, %0* %inIndexes) optsize
   tail call void bitcast (i8* (i8*, i8*, ...)* @f2 to void (i8*, i8*, i32, %0*, %0*)*)(i8* %self, i8* undef, i32 2, %0* %inIndexes, %0* undef) optsize
diff --git a/test/CodeGen/Thumb/asmprinter-bug.ll b/test/CodeGen/Thumb/asmprinter-bug.ll
index 0024d08..e12fcb1 100644
--- a/test/CodeGen/Thumb/asmprinter-bug.ll
+++ b/test/CodeGen/Thumb/asmprinter-bug.ll
@@ -250,7 +250,7 @@ entry:
 	br label %bb
 
 bb:		; preds = %bb3, %entry
-	%0 = tail call  i32 (...)* @read(i32 0, i8* getelementptr ([500 x i8], [500 x i8]* @abuf, i32 0, i32 0), i32 500) nounwind		; <i32> [#uses=4]
+	%0 = tail call  i32 (...) @read(i32 0, i8* getelementptr ([500 x i8], [500 x i8]* @abuf, i32 0, i32 0), i32 500) nounwind		; <i32> [#uses=4]
 	%1 = icmp slt i32 %0, 0		; <i1> [#uses=1]
 	br i1 %1, label %bb1, label %bb2
 
@@ -266,7 +266,7 @@ bb3:		; preds = %bb2
 	%3 = shl i32 %0, 1		; <i32> [#uses=1]
 	tail call  void @adpcm_decoder(i8* getelementptr ([500 x i8], [500 x i8]* @abuf, i32 0, i32 0), i16* getelementptr ([1000 x i16], [1000 x i16]* @sbuf, i32 0, i32 0), i32 %3, %struct.adpcm_state* @state) nounwind
 	%4 = shl i32 %0, 2		; <i32> [#uses=1]
-	%5 = tail call  i32 (...)* @write(i32 1, i16* getelementptr ([1000 x i16], [1000 x i16]* @sbuf, i32 0, i32 0), i32 %4) nounwind		; <i32> [#uses=0]
+	%5 = tail call  i32 (...) @write(i32 1, i16* getelementptr ([1000 x i16], [1000 x i16]* @sbuf, i32 0, i32 0), i32 %4) nounwind		; <i32> [#uses=0]
 	br label %bb
 
 bb4:		; preds = %bb2
@@ -275,7 +275,7 @@ bb4:		; preds = %bb2
 	%8 = sext i16 %7 to i32		; <i32> [#uses=1]
 	%9 = load i8, i8* getelementptr (%struct.adpcm_state, %struct.adpcm_state* @state, i32 0, i32 1), align 2		; <i8> [#uses=1]
 	%10 = sext i8 %9 to i32		; <i32> [#uses=1]
-	%11 = tail call  i32 (%struct.FILE*, i8*, ...)* @fprintf(%struct.FILE* %6, i8* getelementptr ([28 x i8], [28 x i8]* @.str1, i32 0, i32 0), i32 %8, i32 %10) nounwind		; <i32> [#uses=0]
+	%11 = tail call  i32 (%struct.FILE*, i8*, ...) @fprintf(%struct.FILE* %6, i8* getelementptr ([28 x i8], [28 x i8]* @.str1, i32 0, i32 0), i32 %8, i32 %10) nounwind		; <i32> [#uses=0]
 	ret i32 0
 }
 
diff --git a/test/CodeGen/Thumb/vargs.ll b/test/CodeGen/Thumb/vargs.ll
index 7137429..1c7b631 100644
--- a/test/CodeGen/Thumb/vargs.ll
+++ b/test/CodeGen/Thumb/vargs.ll
@@ -27,7 +27,7 @@ bb:             ; preds = %bb, %entry
 bb7:            ; preds = %bb
         %tmp3 = bitcast i8* %tmp to i32*                ; <i32*> [#uses=1]
         %tmp.upgrd.3 = load i32, i32* %tmp3          ; <i32> [#uses=1]
-        %tmp10 = call i32 (i8*, ...)* @printf( i8* getelementptr ([4 x i8], [4 x i8]* @str, i32 0, i64 0), i32 %tmp.upgrd.3 )                ; <i32> [#uses=0]
+        %tmp10 = call i32 (i8*, ...) @printf( i8* getelementptr ([4 x i8], [4 x i8]* @str, i32 0, i64 0), i32 %tmp.upgrd.3 )                ; <i32> [#uses=0]
         %va.upgrd.4 = bitcast i8** %va to i8*           ; <i8*> [#uses=1]
         call void @llvm.va_end( i8* %va.upgrd.4 )
         ret void
diff --git a/test/CodeGen/Thumb2/2009-07-21-ISelBug.ll b/test/CodeGen/Thumb2/2009-07-21-ISelBug.ll
index 0dc04e0..e363a34 100644
--- a/test/CodeGen/Thumb2/2009-07-21-ISelBug.ll
+++ b/test/CodeGen/Thumb2/2009-07-21-ISelBug.ll
@@ -24,7 +24,7 @@ entry:
 	%15 = sext i8 %6 to i32		; <i32> [#uses=2]
 	%16 = sext i16 %10 to i32		; <i32> [#uses=2]
 	%17 = sext i16 %13 to i32		; <i32> [#uses=2]
-	%18 = call  i32 (i8*, ...)* @printf(i8* getelementptr ([36 x i8], [36 x i8]* @"\01LC", i32 0, i32 0), i32 -128, i32 0, i32 %15, i32 %16, i32 %17, i32 0, i32 %14) nounwind		; <i32> [#uses=0]
+	%18 = call  i32 (i8*, ...) @printf(i8* getelementptr ([36 x i8], [36 x i8]* @"\01LC", i32 0, i32 0), i32 -128, i32 0, i32 %15, i32 %16, i32 %17, i32 0, i32 %14) nounwind		; <i32> [#uses=0]
 	%19 = add i32 0, %15		; <i32> [#uses=1]
 	%20 = add i32 %19, %16		; <i32> [#uses=1]
 	%21 = add i32 %20, %14		; <i32> [#uses=1]
diff --git a/test/CodeGen/Thumb2/2009-08-04-CoalescerAssert.ll b/test/CodeGen/Thumb2/2009-08-04-CoalescerAssert.ll
index e980bdb..b75a14b 100644
--- a/test/CodeGen/Thumb2/2009-08-04-CoalescerAssert.ll
+++ b/test/CodeGen/Thumb2/2009-08-04-CoalescerAssert.ll
@@ -17,7 +17,7 @@ bb1:		; preds = %entry
 
 bb2:		; preds = %bb1
 	%0 = call i8* @llvm.frameaddress(i32 0)		; <i8*> [#uses=1]
-	%1 = call  i32 (%struct.FILE*, i8*, ...)* @fprintf(%struct.FILE* noalias undef, i8* noalias getelementptr ([30 x i8], [30 x i8]* @.str2, i32 0, i32 0), i8* %0, i8* null) nounwind		; <i32> [#uses=0]
+	%1 = call  i32 (%struct.FILE*, i8*, ...) @fprintf(%struct.FILE* noalias undef, i8* noalias getelementptr ([30 x i8], [30 x i8]* @.str2, i32 0, i32 0), i8* %0, i8* null) nounwind		; <i32> [#uses=0]
 	unreachable
 
 bb9:		; preds = %bb1
diff --git a/test/CodeGen/Thumb2/2009-08-04-CoalescerBug.ll b/test/CodeGen/Thumb2/2009-08-04-CoalescerBug.ll
index a670782..ccec979 100644
--- a/test/CodeGen/Thumb2/2009-08-04-CoalescerBug.ll
+++ b/test/CodeGen/Thumb2/2009-08-04-CoalescerBug.ll
@@ -67,22 +67,22 @@ FontSize.exit:		; preds = %bb.i1, %FontHalfXHeight.exit
 	br i1 %2, label %bb.i5, label %FontName.exit
 
 bb.i5:		; preds = %FontSize.exit
-	call  void (i32, i32, i8*, i32, %struct.FILE_POS*, ...)* @Error(i32 1, i32 2, i8* getelementptr ([20 x i8], [20 x i8]* @.str24239, i32 0, i32 0), i32 0, %struct.FILE_POS* bitcast (%4* @no_file_pos to %struct.FILE_POS*), i8* getelementptr ([10 x i8], [10 x i8]* @.str81872, i32 0, i32 0)) nounwind
+	call  void (i32, i32, i8*, i32, %struct.FILE_POS*, ...) @Error(i32 1, i32 2, i8* getelementptr ([20 x i8], [20 x i8]* @.str24239, i32 0, i32 0), i32 0, %struct.FILE_POS* bitcast (%4* @no_file_pos to %struct.FILE_POS*), i8* getelementptr ([10 x i8], [10 x i8]* @.str81872, i32 0, i32 0)) nounwind
 	br label %FontName.exit
 
 FontName.exit:		; preds = %bb.i5, %FontSize.exit
-	%3 = call  i32 (%struct.FILE*, i8*, ...)* @fprintf(%struct.FILE* undef, i8* getelementptr ([8 x i8], [8 x i8]* @.str1822946, i32 0, i32 0), i32 %1, i8* undef) nounwind		; <i32> [#uses=0]
+	%3 = call  i32 (%struct.FILE*, i8*, ...) @fprintf(%struct.FILE* undef, i8* getelementptr ([8 x i8], [8 x i8]* @.str1822946, i32 0, i32 0), i32 %1, i8* undef) nounwind		; <i32> [#uses=0]
 	%4 = call  i32 @"\01_fwrite"(i8* getelementptr ([11 x i8], [11 x i8]* @.str1842948, i32 0, i32 0), i32 1, i32 10, i8* undef) nounwind		; <i32> [#uses=0]
 	%5 = sub i32 %colmark, undef		; <i32> [#uses=1]
 	%6 = sub i32 %rowmark, undef		; <i32> [#uses=1]
 	%7 = load %struct.FILE*, %struct.FILE** @out_fp, align 4		; <%struct.FILE*> [#uses=1]
-	%8 = call  i32 (%struct.FILE*, i8*, ...)* @fprintf(%struct.FILE* %7, i8* getelementptr ([17 x i8], [17 x i8]* @.str212784, i32 0, i32 0), i32 %5, i32 %6) nounwind		; <i32> [#uses=0]
+	%8 = call  i32 (%struct.FILE*, i8*, ...) @fprintf(%struct.FILE* %7, i8* getelementptr ([17 x i8], [17 x i8]* @.str212784, i32 0, i32 0), i32 %5, i32 %6) nounwind		; <i32> [#uses=0]
 	store i32 0, i32* @cpexists, align 4
 	%9 = getelementptr %struct.rec, %struct.rec* %y.0, i32 0, i32 0, i32 3, i32 0, i32 0, i32 1		; <i32*> [#uses=1]
 	%10 = load i32, i32* %9, align 4		; <i32> [#uses=1]
 	%11 = sub i32 0, %10		; <i32> [#uses=1]
 	%12 = load %struct.FILE*, %struct.FILE** @out_fp, align 4		; <%struct.FILE*> [#uses=1]
-	%13 = call  i32 (%struct.FILE*, i8*, ...)* @fprintf(%struct.FILE* %12, i8* getelementptr ([17 x i8], [17 x i8]* @.str212784, i32 0, i32 0), i32 undef, i32 %11) nounwind		; <i32> [#uses=0]
+	%13 = call  i32 (%struct.FILE*, i8*, ...) @fprintf(%struct.FILE* %12, i8* getelementptr ([17 x i8], [17 x i8]* @.str212784, i32 0, i32 0), i32 undef, i32 %11) nounwind		; <i32> [#uses=0]
 	store i32 0, i32* @cpexists, align 4
 	br label %bb100.outer.outer
 
diff --git a/test/CodeGen/Thumb2/2009-08-04-ScavengerAssert.ll b/test/CodeGen/Thumb2/2009-08-04-ScavengerAssert.ll
index 0f277e4..89f47d9 100644
--- a/test/CodeGen/Thumb2/2009-08-04-ScavengerAssert.ll
+++ b/test/CodeGen/Thumb2/2009-08-04-ScavengerAssert.ll
@@ -94,7 +94,7 @@ bb1:		; preds = %bb, %entry
 	br i1 %8, label %bb2, label %bb3
 
 bb2:		; preds = %bb1
-	call  void (i32, i32, i8*, i32, %struct.FILE_POS*, ...)* @Error(i32 1, i32 2, i8* getelementptr ([20 x i8], [20 x i8]* @.str24239, i32 0, i32 0), i32 0, %struct.FILE_POS* bitcast (%4* @no_file_pos to %struct.FILE_POS*), i8* getelementptr ([40 x i8], [40 x i8]* @.str1802944, i32 0, i32 0)) nounwind
+	call  void (i32, i32, i8*, i32, %struct.FILE_POS*, ...) @Error(i32 1, i32 2, i8* getelementptr ([20 x i8], [20 x i8]* @.str24239, i32 0, i32 0), i32 0, %struct.FILE_POS* bitcast (%4* @no_file_pos to %struct.FILE_POS*), i8* getelementptr ([40 x i8], [40 x i8]* @.str1802944, i32 0, i32 0)) nounwind
 	br label %bb3
 
 bb3:		; preds = %bb2, %bb1
@@ -124,7 +124,7 @@ bb9:		; preds = %bb8
 	br i1 %15, label %bb.i, label %FontHalfXHeight.exit
 
 bb.i:		; preds = %bb9
-	call  void (i32, i32, i8*, i32, %struct.FILE_POS*, ...)* @Error(i32 1, i32 2, i8* getelementptr ([20 x i8], [20 x i8]* @.str24239, i32 0, i32 0), i32 0, %struct.FILE_POS* bitcast (%4* @no_file_pos to %struct.FILE_POS*), i8* getelementptr ([17 x i8], [17 x i8]* @.str111875, i32 0, i32 0)) nounwind
+	call  void (i32, i32, i8*, i32, %struct.FILE_POS*, ...) @Error(i32 1, i32 2, i8* getelementptr ([20 x i8], [20 x i8]* @.str24239, i32 0, i32 0), i32 0, %struct.FILE_POS* bitcast (%4* @no_file_pos to %struct.FILE_POS*), i8* getelementptr ([17 x i8], [17 x i8]* @.str111875, i32 0, i32 0)) nounwind
 	%.pre186 = load i32, i32* @currentfont, align 4		; <i32> [#uses=1]
 	br label %FontHalfXHeight.exit
 
@@ -139,7 +139,7 @@ bb1.i:		; preds = %bb.i1, %FontHalfXHeight.exit
 	br i1 undef, label %bb2.i, label %FontSize.exit
 
 bb2.i:		; preds = %bb1.i
-	call  void (i32, i32, i8*, i32, %struct.FILE_POS*, ...)* @Error(i32 37, i32 61, i8* getelementptr ([30 x i8], [30 x i8]* @.str101874, i32 0, i32 0), i32 1, %struct.FILE_POS* null) nounwind
+	call  void (i32, i32, i8*, i32, %struct.FILE_POS*, ...) @Error(i32 37, i32 61, i8* getelementptr ([30 x i8], [30 x i8]* @.str101874, i32 0, i32 0), i32 1, %struct.FILE_POS* null) nounwind
 	unreachable
 
 FontSize.exit:		; preds = %bb1.i
@@ -151,33 +151,33 @@ FontSize.exit:		; preds = %bb1.i
 	br i1 %21, label %bb.i5, label %FontName.exit
 
 bb.i5:		; preds = %FontSize.exit
-	call  void (i32, i32, i8*, i32, %struct.FILE_POS*, ...)* @Error(i32 1, i32 2, i8* getelementptr ([20 x i8], [20 x i8]* @.str24239, i32 0, i32 0), i32 0, %struct.FILE_POS* bitcast (%4* @no_file_pos to %struct.FILE_POS*), i8* getelementptr ([10 x i8], [10 x i8]* @.str81872, i32 0, i32 0)) nounwind
+	call  void (i32, i32, i8*, i32, %struct.FILE_POS*, ...) @Error(i32 1, i32 2, i8* getelementptr ([20 x i8], [20 x i8]* @.str24239, i32 0, i32 0), i32 0, %struct.FILE_POS* bitcast (%4* @no_file_pos to %struct.FILE_POS*), i8* getelementptr ([10 x i8], [10 x i8]* @.str81872, i32 0, i32 0)) nounwind
 	br label %FontName.exit
 
 FontName.exit:		; preds = %bb.i5, %FontSize.exit
 	%22 = phi %struct.FONT_INFO* [ undef, %bb.i5 ], [ undef, %FontSize.exit ]		; <%struct.FONT_INFO*> [#uses=1]
 	%23 = getelementptr %struct.FONT_INFO, %struct.FONT_INFO* %22, i32 %19, i32 5		; <%struct.rec**> [#uses=0]
-	%24 = call  i32 (%struct.FILE*, i8*, ...)* @fprintf(%struct.FILE* undef, i8* getelementptr ([8 x i8], [8 x i8]* @.str1822946, i32 0, i32 0), i32 %18, i8* null) nounwind		; <i32> [#uses=0]
+	%24 = call  i32 (%struct.FILE*, i8*, ...) @fprintf(%struct.FILE* undef, i8* getelementptr ([8 x i8], [8 x i8]* @.str1822946, i32 0, i32 0), i32 %18, i8* null) nounwind		; <i32> [#uses=0]
 	br label %bb10
 
 bb10:		; preds = %FontName.exit, %bb8
 	%25 = call  i32 @"\01_fwrite"(i8* getelementptr ([11 x i8], [11 x i8]* @.str1842948, i32 0, i32 0), i32 1, i32 10, i8* undef) nounwind		; <i32> [#uses=0]
 	%26 = sub i32 %rowmark, undef		; <i32> [#uses=1]
 	%27 = load %struct.FILE*, %struct.FILE** @out_fp, align 4		; <%struct.FILE*> [#uses=1]
-	%28 = call  i32 (%struct.FILE*, i8*, ...)* @fprintf(%struct.FILE* %27, i8* getelementptr ([17 x i8], [17 x i8]* @.str212784, i32 0, i32 0), i32 undef, i32 %26) nounwind		; <i32> [#uses=0]
+	%28 = call  i32 (%struct.FILE*, i8*, ...) @fprintf(%struct.FILE* %27, i8* getelementptr ([17 x i8], [17 x i8]* @.str212784, i32 0, i32 0), i32 undef, i32 %26) nounwind		; <i32> [#uses=0]
 	store i32 0, i32* @cpexists, align 4
-	%29 = call  i32 (%struct.FILE*, i8*, ...)* @fprintf(%struct.FILE* undef, i8* getelementptr ([17 x i8], [17 x i8]* @.str192782, i32 0, i32 0), double 2.000000e+01, double 2.000000e+01) nounwind		; <i32> [#uses=0]
+	%29 = call  i32 (%struct.FILE*, i8*, ...) @fprintf(%struct.FILE* undef, i8* getelementptr ([17 x i8], [17 x i8]* @.str192782, i32 0, i32 0), double 2.000000e+01, double 2.000000e+01) nounwind		; <i32> [#uses=0]
 	%30 = getelementptr %struct.rec, %struct.rec* %y.0, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0		; <i32*> [#uses=1]
 	%31 = load i32, i32* %30, align 4		; <i32> [#uses=1]
 	%32 = sub i32 0, %31		; <i32> [#uses=1]
 	%33 = load i32, i32* undef, align 4		; <i32> [#uses=1]
 	%34 = sub i32 0, %33		; <i32> [#uses=1]
 	%35 = load %struct.FILE*, %struct.FILE** @out_fp, align 4		; <%struct.FILE*> [#uses=1]
-	%36 = call  i32 (%struct.FILE*, i8*, ...)* @fprintf(%struct.FILE* %35, i8* getelementptr ([17 x i8], [17 x i8]* @.str212784, i32 0, i32 0), i32 %32, i32 %34) nounwind		; <i32> [#uses=0]
+	%36 = call  i32 (%struct.FILE*, i8*, ...) @fprintf(%struct.FILE* %35, i8* getelementptr ([17 x i8], [17 x i8]* @.str212784, i32 0, i32 0), i32 %32, i32 %34) nounwind		; <i32> [#uses=0]
 	store i32 0, i32* @cpexists, align 4
 	%37 = load %struct.rec*, %struct.rec** null, align 4		; <%struct.rec*> [#uses=1]
 	%38 = getelementptr %struct.rec, %struct.rec* %37, i32 0, i32 0, i32 4		; <%struct.FOURTH_UNION*> [#uses=1]
-	%39 = call  i32 (%struct.FILE*, i8*, ...)* @fprintf(%struct.FILE* undef, i8* getelementptr ([23 x i8], [23 x i8]* @.str1852949, i32 0, i32 0), %struct.FOURTH_UNION* %38) nounwind		; <i32> [#uses=0]
+	%39 = call  i32 (%struct.FILE*, i8*, ...) @fprintf(%struct.FILE* undef, i8* getelementptr ([23 x i8], [23 x i8]* @.str1852949, i32 0, i32 0), %struct.FOURTH_UNION* %38) nounwind		; <i32> [#uses=0]
 	%buff14 = getelementptr [512 x i8], [512 x i8]* %buff, i32 0, i32 0		; <i8*> [#uses=5]
 	%40 = call  i8* @fgets(i8* %buff14, i32 512, %struct.FILE* %12) nounwind		; <i8*> [#uses=0]
 	%iftmp.506.0 = select i1 undef, i32 2, i32 0		; <i32> [#uses=1]
@@ -245,7 +245,7 @@ bb.i56:		; preds = %bb27
 	br i1 undef, label %bb1.i58, label %bb2.i60
 
 bb1.i58:		; preds = %bb.i56
-	call  void (i32, i32, i8*, i32, %struct.FILE_POS*, ...)* @Error(i32 31, i32 1, i8* getelementptr ([32 x i8], [32 x i8]* @.str1575, i32 0, i32 0), i32 1, %struct.FILE_POS* bitcast (%4* @no_file_pos to %struct.FILE_POS*)) nounwind
+	call  void (i32, i32, i8*, i32, %struct.FILE_POS*, ...) @Error(i32 31, i32 1, i8* getelementptr ([32 x i8], [32 x i8]* @.str1575, i32 0, i32 0), i32 1, %struct.FILE_POS* bitcast (%4* @no_file_pos to %struct.FILE_POS*)) nounwind
 	br label %bb2.i60
 
 bb2.i60:		; preds = %bb1.i58, %bb.i56
@@ -385,7 +385,7 @@ bb.i2:		; preds = %bb69
 	br i1 undef, label %bb1.i3, label %bb2.i4
 
 bb1.i3:		; preds = %bb.i2
-	call  void (i32, i32, i8*, i32, %struct.FILE_POS*, ...)* @Error(i32 31, i32 1, i8* getelementptr ([32 x i8], [32 x i8]* @.str1575, i32 0, i32 0), i32 1, %struct.FILE_POS* bitcast (%4* @no_file_pos to %struct.FILE_POS*)) nounwind
+	call  void (i32, i32, i8*, i32, %struct.FILE_POS*, ...) @Error(i32 31, i32 1, i8* getelementptr ([32 x i8], [32 x i8]* @.str1575, i32 0, i32 0), i32 1, %struct.FILE_POS* bitcast (%4* @no_file_pos to %struct.FILE_POS*)) nounwind
 	br label %bb2.i4
 
 bb2.i4:		; preds = %bb1.i3, %bb.i2
@@ -502,7 +502,7 @@ bb102:		; preds = %bb101.split
 
 bb103:		; preds = %bb101.split
 	%99 = load %struct.FILE*, %struct.FILE** @out_fp, align 4		; <%struct.FILE*> [#uses=1]
-	%100 = call  i32 (%struct.FILE*, i8*, ...)* @fprintf(%struct.FILE* %99, i8* getelementptr ([26 x i8], [26 x i8]* @.str1932957, i32 0, i32 0)) nounwind		; <i32> [#uses=0]
+	%100 = call  i32 (%struct.FILE*, i8*, ...) @fprintf(%struct.FILE* %99, i8* getelementptr ([26 x i8], [26 x i8]* @.str1932957, i32 0, i32 0)) nounwind		; <i32> [#uses=0]
 	store i32 0, i32* @wordcount, align 4
 	ret void
 }
diff --git a/test/CodeGen/Thumb2/2009-08-21-PostRAKill4.ll b/test/CodeGen/Thumb2/2009-08-21-PostRAKill4.ll
index 13cf135..04dcb9d 100644
--- a/test/CodeGen/Thumb2/2009-08-21-PostRAKill4.ll
+++ b/test/CodeGen/Thumb2/2009-08-21-PostRAKill4.ll
@@ -18,9 +18,9 @@ declare i32 @printf(i8* nocapture, ...) nounwind
 
 define i32 @main() nounwind {
 entry:
-  %0 = tail call  i32 (i8*, ...)* @printf(i8* getelementptr ([31 x i8], [31 x i8]* @.str1, i32 0, i32 0), i32 1, i32 1, i32 1, i32 1, i32 1, i32 1) nounwind ; <i32> [#uses=0]
-  %1 = tail call  i32 (i8*, ...)* @printf(i8* getelementptr ([31 x i8], [31 x i8]* @.str1, i32 0, i32 0), i32 -128, i32 116, i32 116, i32 -3852, i32 -31232, i32 -1708916736) nounwind ; <i32> [#uses=0]
-  %2 = tail call  i32 (i32, ...)* @getUnknown(i32 undef, i32 116, i32 116, i32 -3852, i32 -31232, i32 30556, i32 -1708916736) nounwind ; <i32> [#uses=1]
-  %3 = tail call  i32 (i8*, ...)* @printf(i8* getelementptr ([4 x i8], [4 x i8]* @.str2, i32 0, i32 0), i32 %2) nounwind ; <i32> [#uses=0]
+  %0 = tail call  i32 (i8*, ...) @printf(i8* getelementptr ([31 x i8], [31 x i8]* @.str1, i32 0, i32 0), i32 1, i32 1, i32 1, i32 1, i32 1, i32 1) nounwind ; <i32> [#uses=0]
+  %1 = tail call  i32 (i8*, ...) @printf(i8* getelementptr ([31 x i8], [31 x i8]* @.str1, i32 0, i32 0), i32 -128, i32 116, i32 116, i32 -3852, i32 -31232, i32 -1708916736) nounwind ; <i32> [#uses=0]
+  %2 = tail call  i32 (i32, ...) @getUnknown(i32 undef, i32 116, i32 116, i32 -3852, i32 -31232, i32 30556, i32 -1708916736) nounwind ; <i32> [#uses=1]
+  %3 = tail call  i32 (i8*, ...) @printf(i8* getelementptr ([4 x i8], [4 x i8]* @.str2, i32 0, i32 0), i32 %2) nounwind ; <i32> [#uses=0]
   ret i32 0
 }
diff --git a/test/CodeGen/Thumb2/2009-12-01-LoopIVUsers.ll b/test/CodeGen/Thumb2/2009-12-01-LoopIVUsers.ll
index aa61c28..e59e84d 100644
--- a/test/CodeGen/Thumb2/2009-12-01-LoopIVUsers.ll
+++ b/test/CodeGen/Thumb2/2009-12-01-LoopIVUsers.ll
@@ -47,7 +47,7 @@ entry:
   store i8* %bp, i8** %bp_addr
   %0 = load i8*, i8** %in_addr, align 4                ; <i8*> [#uses=1]
   store i8* %0, i8** %out, align 4
-  %1 = call  i32 (...)* @foo() nounwind ; <i32> [#uses=1]
+  %1 = call  i32 (...) @foo() nounwind ; <i32> [#uses=1]
   store i32 %1, i32* %i, align 4
   %2 = load i32, i32* %three_by_three_addr, align 4    ; <i32> [#uses=1]
   %3 = icmp eq i32 %2, 0                          ; <i1> [#uses=1]
diff --git a/test/CodeGen/Thumb2/2010-06-14-NEONCoalescer.ll b/test/CodeGen/Thumb2/2010-06-14-NEONCoalescer.ll
index 735e724..24a995a 100644
--- a/test/CodeGen/Thumb2/2010-06-14-NEONCoalescer.ll
+++ b/test/CodeGen/Thumb2/2010-06-14-NEONCoalescer.ll
@@ -32,10 +32,10 @@ entry:
   %tmp7 = extractelement <2 x double> %5, i32 0   ; <double> [#uses=1]
   %tmp5 = extractelement <2 x double> %5, i32 1   ; <double> [#uses=1]
 ; CHECK: printf
-  %7 = tail call  i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([7 x i8], [7 x i8]* @.str, i32 0, i32 0), double %tmp7, double %tmp5) nounwind ; <i32> [#uses=0]
+  %7 = tail call  i32 (i8*, ...) @printf(i8* getelementptr inbounds ([7 x i8], [7 x i8]* @.str, i32 0, i32 0), double %tmp7, double %tmp5) nounwind ; <i32> [#uses=0]
   %tmp3 = extractelement <2 x double> %6, i32 0   ; <double> [#uses=1]
   %tmp1 = extractelement <2 x double> %6, i32 1   ; <double> [#uses=1]
-  %8 = tail call  i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([7 x i8], [7 x i8]* @.str, i32 0, i32 0), double %tmp3, double %tmp1) nounwind ; <i32> [#uses=0]
+  %8 = tail call  i32 (i8*, ...) @printf(i8* getelementptr inbounds ([7 x i8], [7 x i8]* @.str, i32 0, i32 0), double %tmp3, double %tmp1) nounwind ; <i32> [#uses=0]
   ret i32 0
 }
 
diff --git a/test/CodeGen/Thumb2/2010-08-10-VarSizedAllocaBug.ll b/test/CodeGen/Thumb2/2010-08-10-VarSizedAllocaBug.ll
index 1efbe4c..3b14d22 100644
--- a/test/CodeGen/Thumb2/2010-08-10-VarSizedAllocaBug.ll
+++ b/test/CodeGen/Thumb2/2010-08-10-VarSizedAllocaBug.ll
@@ -15,7 +15,7 @@ entry:
 bb:                                               ; preds = %entry
   %1 = alloca [1000 x i8], align 4                ; <[1000 x i8]*> [#uses=1]
   %.sub = getelementptr inbounds [1000 x i8], [1000 x i8]* %1, i32 0, i32 0 ; <i8*> [#uses=2]
-  %2 = call i32 (i8*, i32, i32, i8*, ...)* @__sprintf_chk(i8* %.sub, i32 0, i32 1000, i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i32 %i) nounwind ; <i32> [#uses=0]
+  %2 = call i32 (i8*, i32, i32, i8*, ...) @__sprintf_chk(i8* %.sub, i32 0, i32 1000, i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i32 %i) nounwind ; <i32> [#uses=0]
   %3 = load i8, i8* %.sub, align 4                    ; <i8> [#uses=1]
   %4 = sext i8 %3 to i32                          ; <i32> [#uses=1]
   ret i32 %4
@@ -52,7 +52,7 @@ bb2:                                              ; preds = %bb
 ; CHECK-NOT: mov sp, r7
 ; CHECK-NOT: sub sp, #12
 ; CHECK: pop
-  %4 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i32 %2) nounwind ; <i32> [#uses=0]
+  %4 = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i32 %2) nounwind ; <i32> [#uses=0]
   ret i32 0
 }
 
diff --git a/test/CodeGen/Thumb2/div.ll b/test/CodeGen/Thumb2/div.ll
index b273a89..80a842d 100644
--- a/test/CodeGen/Thumb2/div.ll
+++ b/test/CodeGen/Thumb2/div.ll
@@ -4,6 +4,10 @@
 ; RUN:    | FileCheck %s -check-prefix=CHECK-THUMBV7M
 ; RUN: llc -mtriple=thumb-apple-darwin -mcpu=swift %s -o - \
 ; RUN:    | FileCheck %s -check-prefix=CHECK-HWDIV
+; RUN: llc -mtriple=thumb-apple-darwin -mcpu=cortex-r4 %s -o - \
+; RUN:    | FileCheck %s -check-prefix=CHECK-HWDIV
+; RUN: llc -mtriple=thumb-apple-darwin -mcpu=cortex-r4f %s -o - \
+; RUN:    | FileCheck %s -check-prefix=CHECK-HWDIV
 ; RUN: llc -mtriple=thumb-apple-darwin -mcpu=cortex-r5 %s -o - \
 ; RUN:    | FileCheck %s -check-prefix=CHECK-HWDIV
 
diff --git a/test/CodeGen/Thumb2/large-call.ll b/test/CodeGen/Thumb2/large-call.ll
index ca94980..f6a5a60 100644
--- a/test/CodeGen/Thumb2/large-call.ll
+++ b/test/CodeGen/Thumb2/large-call.ll
@@ -22,7 +22,7 @@ entry:
   %d = alloca double, align 8
   store double 1.000000e+00, double* %d, align 8
   %0 = load double, double* %d, align 8
-  call void (i8*, i8*, i8*, ...)* @variadic(i8* null, i8* null, i8* null, i32 1, double 1.234800e+03, double 2.363450e+03, double %0, i32 1, double 1.234560e+03, double 2.345670e+03, double 4.6334563e+03, double 2.423440e+03, double 4.234330e+03, double 2.965430e+03, i32 1, double 4.669300e+03, double 2.927500e+03, double 4.663100e+03, double 2.921000e+03, double 4.663100e+03, double 2.345100e+03, i32 1, double 3.663100e+03, double 2.905100e+03, double 4.669300e+03, double 2.898600e+03, double 4.676900e+03, double 2.898600e+03, i32 1, double 4.684600e+03, double 2.898600e+03, double 1.234800e+03, double 2.905100e+03, double 1.234800e+03, double 2.345100e+03, i32 1, double 7.719700e+03, double 2.920500e+03, double 4.713500e+03, double 2.927000e+03, double 4.705800e+03, double 2.927000e+03, i32 1, double 8.698200e+03, double 2.927000e+03, double 4.692000e+03, double 2.920500e+03, double 4.692000e+03, double 2.912500e+03, i32 1, double 4.692000e+03, double 2.945600e+03, double 4.698200e+03, double 2.898100e+03, double 4.705800e+03, double 2.898100e+03, i32 1, double 4.713500e+03, double 2.898100e+03, double 4.719700e+03, double 2.945600e+03, double 4.719700e+03, double 2.912500e+03, i32 1, double 4.749200e+03, double 2.920100e+03, double 4.743000e+03, double 2.926600e+03, double 4.735300e+03, double 2.926600e+03, i32 1, double 4.727700e+03, double 2.926600e+03, double 4.721500e+03, double 2.920100e+03, double 4.721500e+03, double 2.912100e+03, i32 1, double 4.721500e+03, double 2.945100e+03, double 4.727700e+03, double 2.897700e+03, double 4.735300e+03, double 2.897700e+03, i32 1, double 4.743000e+03, double 2.897700e+03, double 4.749200e+03, double 2.945100e+03, double 4.749200e+03, double 2.912100e+03, i32 1, double 4.778200e+03, double 2.920100e+03, double 4.772000e+03, double 2.926600e+03, double 4.764300e+03, double 2.926600e+03, i32 1, double 4.756700e+03, double 2.926600e+03, double 4.750500e+03, double 2.920100e+03, double 4.750500e+03, double 2.912100e+03, i32 1, double 4.750500e+03, double 2.945100e+03, double 4.756700e+03, double 2.897700e+03, double 4.764300e+03, double 2.897700e+03, i32 1, double 4.772000e+03, double 2.897700e+03, double 4.778200e+03, double 2.945100e+03, double 4.778200e+03, double 2.912100e+03, i32 1, double 4.801900e+03, double 2.942100e+03, double 4.795700e+03, double 2.948500e+03, double 4.788100e+03, double 2.948500e+03, i32 1, double 4.780500e+03, double 2.948500e+03, double 4.774300e+03, double 2.942100e+03, double 4.774300e+03, double 2.934100e+03, i32 1, double 4.774300e+03, double 2.926100e+03, double 4.780500e+03, double 2.919600e+03, double 4.788100e+03, double 2.919600e+03, i32 1, double 4.795700e+03, double 2.919600e+03, double 4.801900e+03, double 2.926100e+03, double 4.801900e+03, double 2.934100e+03, i32 1, double 4.801500e+03, double 2.972500e+03, double 4.795300e+03, double 2.978900e+03, double 4.787700e+03, double 2.978900e+03, i32 1, double 4.780000e+03, double 2.978900e+03, double 4.773800e+03, double 2.972500e+03, double 4.773800e+03, double 2.964500e+03, i32 1, double 4.773800e+03, double 2.956500e+03, double 4.780000e+03, double 2.950000e+03, double 4.787700e+03, double 2.950000e+03, i32 1, double 4.795300e+03, double 2.950000e+03, double 4.801500e+03, double 2.956500e+03, double 4.801500e+03, double 2.964500e+03, i32 1, double 4.802400e+03, double 3.010200e+03, double 4.796200e+03, double 3.016600e+03, double 4.788500e+03, double 3.016600e+03, i32 1, double 4.780900e+03, double 3.016600e+03, double 4.774700e+03, double 3.010200e+03, double 4.774700e+03, double 3.002200e+03, i32 1, double 4.774700e+03, double 2.994200e+03, double 4.780900e+03, double 2.987700e+03, double 4.788500e+03, double 2.987700e+03, i32 1, double 4.796200e+03, double 2.987700e+03, double 4.802400e+03, double 2.994200e+03, double 4.802400e+03, double 3.002200e+03, i32 1, double 4.802400e+03, double 3.039400e+03, double 4.796200e+03, double 3.455800e+03, double 4.788500e+03, double 3.455800e+03, i32 1, double 4.780900e+03, double 3.455800e+03, double 4.774700e+03, double 3.039400e+03, double 4.774700e+03, double 3.031400e+03, i32 1, double 4.774700e+03, double 3.023400e+03, double 4.780900e+03, double 3.016900e+03, double 4.788500e+03, double 3.016900e+03, i32 1, double 4.796200e+03, double 3.016900e+03, double 4.802400e+03, double 3.023400e+03, double 4.802400e+03, double 3.031400e+03, i32 1, double 4.778600e+03, double 3.063100e+03, double 4.772400e+03, double 3.069600e+03, double 4.764700e+03, double 3.069600e+03, i32 1, double 4.757100e+03, double 3.069600e+03, double 4.750900e+03, double 3.063100e+03, double 4.750900e+03, double 3.055100e+03, i32 1, double 4.750900e+03, double 3.457100e+03, double 4.757100e+03, double 3.450700e+03, double 4.764700e+03, double 3.450700e+03, i32 1, double 4.772400e+03, double 3.450700e+03, double 4.778600e+03, double 3.457100e+03, double 4.778600e+03, double 3.055100e+03, i32 1, double 4.748600e+03, double 3.063600e+03, double 4.742400e+03, double 3.070000e+03, double 4.734700e+03, double 3.070000e+03, i32 1, double 4.727100e+03, double 3.070000e+03, double 4.720900e+03, double 3.063600e+03, double 4.720900e+03, double 3.055600e+03, i32 1, double 4.720900e+03, double 3.457600e+03, double 4.727100e+03, double 3.451100e+03, double 4.734700e+03, double 3.451100e+03, i32 1, double 4.742400e+03, double 3.451100e+03, double 4.748600e+03, double 3.457600e+03, double 4.748600e+03, double 3.055600e+03, i32 1, double 4.719500e+03, double 3.063600e+03, double 4.713300e+03, double 3.070000e+03, double 4.705700e+03, double 3.070000e+03, i32 1, double 4.698000e+03, double 3.070000e+03, double 4.691900e+03, double 3.063600e+03, double 4.691900e+03, double 3.055600e+03, i32 1, double 4.691900e+03, double 3.457600e+03, double 4.698000e+03, double 3.451100e+03, double 4.705700e+03, double 3.451100e+03, i32 1, double 4.713300e+03, double 3.451100e+03, double 4.719500e+03, double 3.457600e+03, double 4.719500e+03, double 3.055600e+03, i32 1, double 4.691300e+03, double 3.064000e+03, double 4.685100e+03, double 3.070500e+03, double 4.677500e+03, double 3.070500e+03, i32 1, double 4.669900e+03, double 3.070500e+03, double 4.663700e+03, double 3.064000e+03, double 4.663700e+03, double 3.056000e+03, i32 1, double 4.663700e+03, double 3.458000e+03, double 4.669900e+03, double 3.451600e+03, double 4.677500e+03, double 3.451600e+03, i32 1, double 4.685100e+03, double 3.451600e+03, double 4.691300e+03, double 3.458000e+03, double 4.691300e+03, double 3.056000e+03, i32 1, double 4.668500e+03, double 3.453000e+03, double 4.662300e+03, double 3.459400e+03, double 4.654700e+03, double 3.459400e+03, i32 1, double 4.647000e+03, double 3.459400e+03, double 4.640900e+03, double 3.453000e+03, double 4.640900e+03, double 3.035000e+03, i32 1, double 4.640900e+03, double 3.027000e+03, double 4.647000e+03, double 3.020500e+03, double 4.654700e+03, double 3.020500e+03, i32 1, double 4.662300e+03, double 3.020500e+03, double 4.668500e+03, double 3.027000e+03, double 4.668500e+03, double 3.035000e+03, i32 1, double 4.668500e+03, double 3.014300e+03, double 4.662300e+03, double 3.020800e+03, double 4.654700e+03, double 3.020800e+03, i32 1, double 4.647000e+03, double 3.020800e+03, double 4.640900e+03, double 3.014300e+03, double 4.640900e+03, double 3.006400e+03, i32 1, double 4.640900e+03, double 2.998400e+03, double 4.647000e+03, double 2.991900e+03, double 4.654700e+03, double 2.991900e+03, i32 1, double 4.662300e+03, double 2.991900e+03, double 4.668500e+03, double 2.998400e+03, double 4.668500e+03, double 3.006400e+03, i32 1, double 4.668100e+03, double 2.941100e+03, double 4.661900e+03, double 2.947600e+03, double 4.654200e+03, double 2.947600e+03, i32 1, double 4.646600e+03, double 2.947600e+03, double 4.640400e+03, double 2.941100e+03, double 4.640400e+03, double 2.933100e+03, i32 1, double 4.640400e+03, double 2.925200e+03, double 4.646600e+03, double 2.918700e+03, double 4.654200e+03, double 2.918700e+03, i32 1, double 4.661900e+03, double 2.918700e+03, double 4.668100e+03, double 2.925200e+03, double 4.668100e+03, double 2.933100e+03, i32 1, double 4.668500e+03, double 2.971600e+03, double 4.662300e+03, double 2.978100e+03, double 4.654700e+03, double 2.978100e+03, i32 1, double 4.647000e+03, double 2.978100e+03, double 4.640900e+03, double 2.971600e+03, double 4.640900e+03, double 2.963600e+03, i32 1, double 4.640900e+03, double 2.955700e+03, double 4.647000e+03, double 2.949200e+03, double 4.654700e+03, double 2.949200e+03, i32 1, double 4.662300e+03, double 2.949200e+03, double 4.668500e+03, double 2.955700e+03, double 4.668500e+03, double 2.963600e+03, i32 2, i32 1, double 4.691300e+03, double 3.056000e+03, i32 2, i32 1, double 4.748600e+03, double 3.055600e+03, i32 2, i32 1, double 4.778200e+03, double 2.912100e+03, i32 2, i32 1, double 4.749200e+03, double 2.912100e+03, i32 2, i32 1, double 4.802400e+03, double 3.031400e+03, i32 2, i32 1, double 4.778600e+03, double 3.055100e+03, i32 2, i32 1, double 4.801500e+03, double 2.964500e+03, i32 2, i32 1, double 4.802400e+03, double 3.002200e+03, i32 2, i32 1, double 4.719700e+03, double 2.912500e+03, i32 2, i32 1, double 4.801900e+03, double 2.934100e+03, i32 2, i32 1, double 4.719500e+03, double 3.055600e+03, i32 2, i32 1, double 4.668500e+03, double 3.006400e+03, i32 2, i32 1, double 4.668500e+03, double 3.035000e+03, i32 2, i32 1, double 4.668100e+03, double 2.933100e+03, i32 2, i32 1, double 4.668500e+03, double 2.963600e+03, i32 2, i32 48)
+  call void (i8*, i8*, i8*, ...) @variadic(i8* null, i8* null, i8* null, i32 1, double 1.234800e+03, double 2.363450e+03, double %0, i32 1, double 1.234560e+03, double 2.345670e+03, double 4.6334563e+03, double 2.423440e+03, double 4.234330e+03, double 2.965430e+03, i32 1, double 4.669300e+03, double 2.927500e+03, double 4.663100e+03, double 2.921000e+03, double 4.663100e+03, double 2.345100e+03, i32 1, double 3.663100e+03, double 2.905100e+03, double 4.669300e+03, double 2.898600e+03, double 4.676900e+03, double 2.898600e+03, i32 1, double 4.684600e+03, double 2.898600e+03, double 1.234800e+03, double 2.905100e+03, double 1.234800e+03, double 2.345100e+03, i32 1, double 7.719700e+03, double 2.920500e+03, double 4.713500e+03, double 2.927000e+03, double 4.705800e+03, double 2.927000e+03, i32 1, double 8.698200e+03, double 2.927000e+03, double 4.692000e+03, double 2.920500e+03, double 4.692000e+03, double 2.912500e+03, i32 1, double 4.692000e+03, double 2.945600e+03, double 4.698200e+03, double 2.898100e+03, double 4.705800e+03, double 2.898100e+03, i32 1, double 4.713500e+03, double 2.898100e+03, double 4.719700e+03, double 2.945600e+03, double 4.719700e+03, double 2.912500e+03, i32 1, double 4.749200e+03, double 2.920100e+03, double 4.743000e+03, double 2.926600e+03, double 4.735300e+03, double 2.926600e+03, i32 1, double 4.727700e+03, double 2.926600e+03, double 4.721500e+03, double 2.920100e+03, double 4.721500e+03, double 2.912100e+03, i32 1, double 4.721500e+03, double 2.945100e+03, double 4.727700e+03, double 2.897700e+03, double 4.735300e+03, double 2.897700e+03, i32 1, double 4.743000e+03, double 2.897700e+03, double 4.749200e+03, double 2.945100e+03, double 4.749200e+03, double 2.912100e+03, i32 1, double 4.778200e+03, double 2.920100e+03, double 4.772000e+03, double 2.926600e+03, double 4.764300e+03, double 2.926600e+03, i32 1, double 4.756700e+03, double 2.926600e+03, double 4.750500e+03, double 2.920100e+03, double 4.750500e+03, double 2.912100e+03, i32 1, double 4.750500e+03, double 2.945100e+03, double 4.756700e+03, double 2.897700e+03, double 4.764300e+03, double 2.897700e+03, i32 1, double 4.772000e+03, double 2.897700e+03, double 4.778200e+03, double 2.945100e+03, double 4.778200e+03, double 2.912100e+03, i32 1, double 4.801900e+03, double 2.942100e+03, double 4.795700e+03, double 2.948500e+03, double 4.788100e+03, double 2.948500e+03, i32 1, double 4.780500e+03, double 2.948500e+03, double 4.774300e+03, double 2.942100e+03, double 4.774300e+03, double 2.934100e+03, i32 1, double 4.774300e+03, double 2.926100e+03, double 4.780500e+03, double 2.919600e+03, double 4.788100e+03, double 2.919600e+03, i32 1, double 4.795700e+03, double 2.919600e+03, double 4.801900e+03, double 2.926100e+03, double 4.801900e+03, double 2.934100e+03, i32 1, double 4.801500e+03, double 2.972500e+03, double 4.795300e+03, double 2.978900e+03, double 4.787700e+03, double 2.978900e+03, i32 1, double 4.780000e+03, double 2.978900e+03, double 4.773800e+03, double 2.972500e+03, double 4.773800e+03, double 2.964500e+03, i32 1, double 4.773800e+03, double 2.956500e+03, double 4.780000e+03, double 2.950000e+03, double 4.787700e+03, double 2.950000e+03, i32 1, double 4.795300e+03, double 2.950000e+03, double 4.801500e+03, double 2.956500e+03, double 4.801500e+03, double 2.964500e+03, i32 1, double 4.802400e+03, double 3.010200e+03, double 4.796200e+03, double 3.016600e+03, double 4.788500e+03, double 3.016600e+03, i32 1, double 4.780900e+03, double 3.016600e+03, double 4.774700e+03, double 3.010200e+03, double 4.774700e+03, double 3.002200e+03, i32 1, double 4.774700e+03, double 2.994200e+03, double 4.780900e+03, double 2.987700e+03, double 4.788500e+03, double 2.987700e+03, i32 1, double 4.796200e+03, double 2.987700e+03, double 4.802400e+03, double 2.994200e+03, double 4.802400e+03, double 3.002200e+03, i32 1, double 4.802400e+03, double 3.039400e+03, double 4.796200e+03, double 3.455800e+03, double 4.788500e+03, double 3.455800e+03, i32 1, double 4.780900e+03, double 3.455800e+03, double 4.774700e+03, double 3.039400e+03, double 4.774700e+03, double 3.031400e+03, i32 1, double 4.774700e+03, double 3.023400e+03, double 4.780900e+03, double 3.016900e+03, double 4.788500e+03, double 3.016900e+03, i32 1, double 4.796200e+03, double 3.016900e+03, double 4.802400e+03, double 3.023400e+03, double 4.802400e+03, double 3.031400e+03, i32 1, double 4.778600e+03, double 3.063100e+03, double 4.772400e+03, double 3.069600e+03, double 4.764700e+03, double 3.069600e+03, i32 1, double 4.757100e+03, double 3.069600e+03, double 4.750900e+03, double 3.063100e+03, double 4.750900e+03, double 3.055100e+03, i32 1, double 4.750900e+03, double 3.457100e+03, double 4.757100e+03, double 3.450700e+03, double 4.764700e+03, double 3.450700e+03, i32 1, double 4.772400e+03, double 3.450700e+03, double 4.778600e+03, double 3.457100e+03, double 4.778600e+03, double 3.055100e+03, i32 1, double 4.748600e+03, double 3.063600e+03, double 4.742400e+03, double 3.070000e+03, double 4.734700e+03, double 3.070000e+03, i32 1, double 4.727100e+03, double 3.070000e+03, double 4.720900e+03, double 3.063600e+03, double 4.720900e+03, double 3.055600e+03, i32 1, double 4.720900e+03, double 3.457600e+03, double 4.727100e+03, double 3.451100e+03, double 4.734700e+03, double 3.451100e+03, i32 1, double 4.742400e+03, double 3.451100e+03, double 4.748600e+03, double 3.457600e+03, double 4.748600e+03, double 3.055600e+03, i32 1, double 4.719500e+03, double 3.063600e+03, double 4.713300e+03, double 3.070000e+03, double 4.705700e+03, double 3.070000e+03, i32 1, double 4.698000e+03, double 3.070000e+03, double 4.691900e+03, double 3.063600e+03, double 4.691900e+03, double 3.055600e+03, i32 1, double 4.691900e+03, double 3.457600e+03, double 4.698000e+03, double 3.451100e+03, double 4.705700e+03, double 3.451100e+03, i32 1, double 4.713300e+03, double 3.451100e+03, double 4.719500e+03, double 3.457600e+03, double 4.719500e+03, double 3.055600e+03, i32 1, double 4.691300e+03, double 3.064000e+03, double 4.685100e+03, double 3.070500e+03, double 4.677500e+03, double 3.070500e+03, i32 1, double 4.669900e+03, double 3.070500e+03, double 4.663700e+03, double 3.064000e+03, double 4.663700e+03, double 3.056000e+03, i32 1, double 4.663700e+03, double 3.458000e+03, double 4.669900e+03, double 3.451600e+03, double 4.677500e+03, double 3.451600e+03, i32 1, double 4.685100e+03, double 3.451600e+03, double 4.691300e+03, double 3.458000e+03, double 4.691300e+03, double 3.056000e+03, i32 1, double 4.668500e+03, double 3.453000e+03, double 4.662300e+03, double 3.459400e+03, double 4.654700e+03, double 3.459400e+03, i32 1, double 4.647000e+03, double 3.459400e+03, double 4.640900e+03, double 3.453000e+03, double 4.640900e+03, double 3.035000e+03, i32 1, double 4.640900e+03, double 3.027000e+03, double 4.647000e+03, double 3.020500e+03, double 4.654700e+03, double 3.020500e+03, i32 1, double 4.662300e+03, double 3.020500e+03, double 4.668500e+03, double 3.027000e+03, double 4.668500e+03, double 3.035000e+03, i32 1, double 4.668500e+03, double 3.014300e+03, double 4.662300e+03, double 3.020800e+03, double 4.654700e+03, double 3.020800e+03, i32 1, double 4.647000e+03, double 3.020800e+03, double 4.640900e+03, double 3.014300e+03, double 4.640900e+03, double 3.006400e+03, i32 1, double 4.640900e+03, double 2.998400e+03, double 4.647000e+03, double 2.991900e+03, double 4.654700e+03, double 2.991900e+03, i32 1, double 4.662300e+03, double 2.991900e+03, double 4.668500e+03, double 2.998400e+03, double 4.668500e+03, double 3.006400e+03, i32 1, double 4.668100e+03, double 2.941100e+03, double 4.661900e+03, double 2.947600e+03, double 4.654200e+03, double 2.947600e+03, i32 1, double 4.646600e+03, double 2.947600e+03, double 4.640400e+03, double 2.941100e+03, double 4.640400e+03, double 2.933100e+03, i32 1, double 4.640400e+03, double 2.925200e+03, double 4.646600e+03, double 2.918700e+03, double 4.654200e+03, double 2.918700e+03, i32 1, double 4.661900e+03, double 2.918700e+03, double 4.668100e+03, double 2.925200e+03, double 4.668100e+03, double 2.933100e+03, i32 1, double 4.668500e+03, double 2.971600e+03, double 4.662300e+03, double 2.978100e+03, double 4.654700e+03, double 2.978100e+03, i32 1, double 4.647000e+03, double 2.978100e+03, double 4.640900e+03, double 2.971600e+03, double 4.640900e+03, double 2.963600e+03, i32 1, double 4.640900e+03, double 2.955700e+03, double 4.647000e+03, double 2.949200e+03, double 4.654700e+03, double 2.949200e+03, i32 1, double 4.662300e+03, double 2.949200e+03, double 4.668500e+03, double 2.955700e+03, double 4.668500e+03, double 2.963600e+03, i32 2, i32 1, double 4.691300e+03, double 3.056000e+03, i32 2, i32 1, double 4.748600e+03, double 3.055600e+03, i32 2, i32 1, double 4.778200e+03, double 2.912100e+03, i32 2, i32 1, double 4.749200e+03, double 2.912100e+03, i32 2, i32 1, double 4.802400e+03, double 3.031400e+03, i32 2, i32 1, double 4.778600e+03, double 3.055100e+03, i32 2, i32 1, double 4.801500e+03, double 2.964500e+03, i32 2, i32 1, double 4.802400e+03, double 3.002200e+03, i32 2, i32 1, double 4.719700e+03, double 2.912500e+03, i32 2, i32 1, double 4.801900e+03, double 2.934100e+03, i32 2, i32 1, double 4.719500e+03, double 3.055600e+03, i32 2, i32 1, double 4.668500e+03, double 3.006400e+03, i32 2, i32 1, double 4.668500e+03, double 3.035000e+03, i32 2, i32 1, double 4.668100e+03, double 2.933100e+03, i32 2, i32 1, double 4.668500e+03, double 2.963600e+03, i32 2, i32 48)
   ret i32 0
 }
 
diff --git a/test/CodeGen/Thumb2/thumb2-ifcvt2.ll b/test/CodeGen/Thumb2/thumb2-ifcvt2.ll
index 91efc5d..1d2ba00 100644
--- a/test/CodeGen/Thumb2/thumb2-ifcvt2.ll
+++ b/test/CodeGen/Thumb2/thumb2-ifcvt2.ll
@@ -15,7 +15,7 @@ entry:
 	br i1 %tmp7, label %cond_true, label %UnifiedReturnBlock
 
 cond_true:		; preds = %entry
-	%tmp10 = call i32 (...)* @bar( )		; <i32> [#uses=0]
+	%tmp10 = call i32 (...) @bar( )		; <i32> [#uses=0]
 	ret void
 
 UnifiedReturnBlock:		; preds = %entry
diff --git a/test/CodeGen/Thumb2/thumb2-tbb.ll b/test/CodeGen/Thumb2/thumb2-tbb.ll
index d57638b..758f792 100644
--- a/test/CodeGen/Thumb2/thumb2-tbb.ll
+++ b/test/CodeGen/Thumb2/thumb2-tbb.ll
@@ -11,43 +11,43 @@ entry:
 
     switch i32 %n.u, label %bb12 [i32 1, label %bb i32 2, label %bb6 i32 4, label %bb7 i32 5, label %bb8 i32 6, label %bb10 i32 7, label %bb1 i32 8, label %bb3 i32 9, label %bb4 i32 10, label %bb9 i32 11, label %bb2 i32 12, label %bb5 i32 13, label %bb11 ]
 bb:
-    tail call void(...)* @foo1()
+    tail call void(...) @foo1()
     ret void
 bb1:
-    tail call void(...)* @foo2()
+    tail call void(...) @foo2()
     ret void
 bb2:
-    tail call void(...)* @foo6()
+    tail call void(...) @foo6()
     ret void
 bb3:
-    tail call void(...)* @foo3()
+    tail call void(...) @foo3()
     ret void
 bb4:
-    tail call void(...)* @foo4()
+    tail call void(...) @foo4()
     ret void
 bb5:
-    tail call void(...)* @foo5()
+    tail call void(...) @foo5()
     ret void
 bb6:
-    tail call void(...)* @foo1()
+    tail call void(...) @foo1()
     ret void
 bb7:
-    tail call void(...)* @foo2()
+    tail call void(...) @foo2()
     ret void
 bb8:
-    tail call void(...)* @foo6()
+    tail call void(...) @foo6()
     ret void
 bb9:
-    tail call void(...)* @foo3()
+    tail call void(...) @foo3()
     ret void
 bb10:
-    tail call void(...)* @foo4()
+    tail call void(...) @foo4()
     ret void
 bb11:
-    tail call void(...)* @foo5()
+    tail call void(...) @foo5()
     ret void
 bb12:
-    tail call void(...)* @foo6()
+    tail call void(...) @foo6()
     ret void
 }
 
diff --git a/test/CodeGen/WinEH/cppeh-alloca-sink.ll b/test/CodeGen/WinEH/cppeh-alloca-sink.ll
new file mode 100644
index 0000000..d50237f
--- /dev/null
+++ b/test/CodeGen/WinEH/cppeh-alloca-sink.ll
@@ -0,0 +1,180 @@
+; RUN: opt -mtriple=x86_64-pc-windows-msvc -winehprepare -S -o - < %s | FileCheck %s
+
+; This test describes two difficult cases in sinking allocas into child frames.
+; We don't currently do this optimization, but we'll need to tweak these tests
+; when we do.
+
+; This test is based on the following code:
+;
+; // In this case we can sink the alloca from the parent into the catch because
+; // the lifetime is limited to the catch.
+; extern "C" void may_throw();
+; extern "C" void sink_alloca_to_catch() {
+;   try {
+;     may_throw();
+;   } catch (int) {
+;     volatile int only_used_in_catch = 42;
+;   }
+; }
+;
+; // In this case we cannot. The variable should live as long as the parent
+; // frame lives.
+; extern "C" void use_catch_var(int *);
+; extern "C" void dont_sink_alloca_to_catch(int n) {
+;   int live_in_out_catch = 0;
+;   while (n > 0) {
+;     try {
+;       may_throw();
+;     } catch (int) {
+;       use_catch_var(&live_in_out_catch);
+;     }
+;     n--;
+;   }
+; }
+
+target datalayout = "e-m:w-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-pc-windows-msvc"
+
+declare void @may_throw() #1
+declare i32 @__CxxFrameHandler3(...)
+declare i32 @llvm.eh.typeid.for(i8*) #2
+declare void @llvm.eh.begincatch(i8* nocapture, i8* nocapture) #3
+declare void @llvm.eh.endcatch() #3
+
+%rtti.TypeDescriptor2 = type { i8**, i8*, [3 x i8] }
+%eh.CatchHandlerType = type { i32, i8* }
+
+$"\01??_R0H@8" = comdat any
+
+@"\01??_7type_info@@6B@" = external constant i8*
+@"\01??_R0H@8" = linkonce_odr global %rtti.TypeDescriptor2 { i8** @"\01??_7type_info@@6B@", i8* null, [3 x i8] c".H\00" }, comdat
+@llvm.eh.handlertype.H.0 = private unnamed_addr constant %eh.CatchHandlerType { i32 0, i8* bitcast (%rtti.TypeDescriptor2* @"\01??_R0H@8" to i8*) }, section "llvm.metadata"
+
+; Function Attrs: uwtable
+define void @sink_alloca_to_catch() #0 {
+entry:
+  %0 = alloca i32
+  %only_used_in_catch = alloca i32, align 4
+  invoke void @may_throw()
+          to label %try.cont unwind label %lpad
+
+lpad:                                             ; preds = %entry
+  %1 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__CxxFrameHandler3 to i8*)
+          catch %eh.CatchHandlerType* @llvm.eh.handlertype.H.0
+  %2 = extractvalue { i8*, i32 } %1, 1
+  %3 = tail call i32 @llvm.eh.typeid.for(i8* bitcast (%eh.CatchHandlerType* @llvm.eh.handlertype.H.0 to i8*)) #3
+  %matches = icmp eq i32 %2, %3
+  br i1 %matches, label %catch, label %eh.resume
+
+catch:                                            ; preds = %lpad
+  %4 = extractvalue { i8*, i32 } %1, 0
+  call void @llvm.eh.begincatch(i8* %4, i8* null) #3
+  store volatile i32 42, i32* %only_used_in_catch, align 4
+  tail call void @llvm.eh.endcatch() #3
+  br label %try.cont
+
+try.cont:                                         ; preds = %entry, %catch
+  ret void
+
+eh.resume:                                        ; preds = %lpad
+  resume { i8*, i32 } %1
+}
+
+; CHECK-LABEL: define void @sink_alloca_to_catch()
+; CHECK: call void (...) @llvm.frameescape(i32* %only_used_in_catch)
+
+declare void @use_catch_var(i32*) #1
+
+; Function Attrs: uwtable
+define void @dont_sink_alloca_to_catch(i32 %n) #0 {
+entry:
+  %0 = alloca i32
+  %n.addr = alloca i32, align 4
+  %live_in_out_catch = alloca i32, align 4
+  %exn.slot = alloca i8*
+  %ehselector.slot = alloca i32
+  store i32 %n, i32* %n.addr, align 4
+  br label %while.cond
+
+while.cond:                                       ; preds = %try.cont, %entry
+  %1 = load i32, i32* %n.addr, align 4
+  %cmp = icmp sgt i32 %1, 0
+  br i1 %cmp, label %while.body, label %while.end
+
+while.body:                                       ; preds = %while.cond
+  invoke void @may_throw()
+          to label %invoke.cont unwind label %lpad
+
+invoke.cont:                                      ; preds = %while.body
+  br label %try.cont
+
+lpad:                                             ; preds = %while.body
+  %2 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__CxxFrameHandler3 to i8*)
+          catch i8* bitcast (%eh.CatchHandlerType* @llvm.eh.handlertype.H.0 to i8*)
+  %3 = extractvalue { i8*, i32 } %2, 0
+  store i8* %3, i8** %exn.slot
+  %4 = extractvalue { i8*, i32 } %2, 1
+  store i32 %4, i32* %ehselector.slot
+  br label %catch.dispatch
+
+catch.dispatch:                                   ; preds = %lpad
+  %sel = load i32, i32* %ehselector.slot
+  %5 = call i32 @llvm.eh.typeid.for(i8* bitcast (%eh.CatchHandlerType* @llvm.eh.handlertype.H.0 to i8*)) #3
+  %matches = icmp eq i32 %sel, %5
+  br i1 %matches, label %catch, label %eh.resume
+
+catch:                                            ; preds = %catch.dispatch
+  %exn = load i8*, i8** %exn.slot
+  call void @llvm.eh.begincatch(i8* %exn, i8* null) #3
+  invoke void @use_catch_var(i32* %live_in_out_catch)
+          to label %invoke.cont2 unwind label %lpad1
+
+invoke.cont2:                                     ; preds = %catch
+  call void @llvm.eh.endcatch() #3
+  br label %try.cont
+
+try.cont:                                         ; preds = %invoke.cont2, %invoke.cont
+  %6 = load i32, i32* %0
+  %7 = load i32, i32* %n.addr, align 4
+  %dec = add nsw i32 %7, -1
+  store i32 %dec, i32* %n.addr, align 4
+  br label %while.cond
+
+lpad1:                                            ; preds = %catch
+  %8 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__CxxFrameHandler3 to i8*)
+          cleanup
+  %9 = extractvalue { i8*, i32 } %8, 0
+  store i8* %9, i8** %exn.slot
+  %10 = extractvalue { i8*, i32 } %8, 1
+  store i32 %10, i32* %ehselector.slot
+  call void @llvm.eh.endcatch() #3
+  br label %eh.resume
+
+while.end:                                        ; preds = %while.cond
+  ret void
+
+eh.resume:                                        ; preds = %lpad1, %catch.dispatch
+  %exn3 = load i8*, i8** %exn.slot
+  %sel4 = load i32, i32* %ehselector.slot
+  %lpad.val = insertvalue { i8*, i32 } undef, i8* %exn3, 0
+  %lpad.val5 = insertvalue { i8*, i32 } %lpad.val, i32 %sel4, 1
+  resume { i8*, i32 } %lpad.val5
+}
+
+; CHECK-LABEL: define void @dont_sink_alloca_to_catch(i32 %n)
+; CHECK: call void (...) @llvm.frameescape(i32* %live_in_out_catch)
+
+; CHECK-LABEL: define internal i8* @sink_alloca_to_catch.catch(i8*, i8*)
+; CHECK: %only_used_in_catch.i8 = call i8* @llvm.framerecover({{.*}}, i32 0)
+; CHECK: %only_used_in_catch = bitcast
+
+; CHECK-LABEL: define internal i8* @dont_sink_alloca_to_catch.catch(i8*, i8*)
+; CHECK: %live_in_out_catch.i8 = call i8* @llvm.framerecover({{.*}}, i32 0)
+; CHECK: %live_in_out_catch = bitcast
+
+
+
+attributes #0 = { uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nounwind readnone }
+attributes #3 = { nounwind }
diff --git a/test/CodeGen/WinEH/cppeh-catch-all.ll b/test/CodeGen/WinEH/cppeh-catch-all.ll
index 6e69862..4d017fd 100644
--- a/test/CodeGen/WinEH/cppeh-catch-all.ll
+++ b/test/CodeGen/WinEH/cppeh-catch-all.ll
@@ -19,7 +19,7 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-pc-windows-msvc"
 
 ; The function entry in this case remains unchanged.
-; CHECK: define void @_Z4testv() #0 {
+; CHECK: define void @_Z4testv()
 ; CHECK: entry:
 ; CHECK:   invoke void @_Z9may_throwv()
 ; CHECK:           to label %invoke.cont unwind label %[[LPAD_LABEL:lpad[0-9]+]]
@@ -38,7 +38,7 @@ invoke.cont:                                      ; preds = %entry
 ; CHECK: [[LPAD_LABEL]]:{{[ ]+}}; preds = %entry
 ; CHECK:   landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__CxxFrameHandler3 to i8*)
 ; CHECK-NEXT:           catch i8* null
-; CHECK-NEXT:   [[RECOVER:\%.+]] = call i8* (...)* @llvm.eh.actions(i32 1, i8* null, i8* null, i8* (i8*, i8*)* @_Z4testv.catch)
+; CHECK-NEXT:   [[RECOVER:\%.+]] = call i8* (...) @llvm.eh.actions(i32 1, i8* null, i32 -1, i8* (i8*, i8*)* @_Z4testv.catch)
 ; CHECK-NEXT:   indirectbr i8* [[RECOVER]], [label %try.cont]
 
 lpad:                                             ; preds = %entry
@@ -70,7 +70,7 @@ try.cont:                                         ; preds = %invoke.cont2, %invo
 ; CHECK: }
 }
 
-; CHECK: define internal i8* @_Z4testv.catch(i8*, i8*) {
+; CHECK: define internal i8* @_Z4testv.catch(i8*, i8*)
 ; CHECK: entry:
 ; CHECK:   call void @_Z16handle_exceptionv()
 ; CHECK:   ret i8* blockaddress(@_Z4testv, %try.cont)
diff --git a/test/CodeGen/WinEH/cppeh-catch-scalar.ll b/test/CodeGen/WinEH/cppeh-catch-scalar.ll
index 0f8a7a8..4ea3838 100644
--- a/test/CodeGen/WinEH/cppeh-catch-scalar.ll
+++ b/test/CodeGen/WinEH/cppeh-catch-scalar.ll
@@ -21,10 +21,10 @@ target triple = "x86_64-pc-windows-msvc"
 @_ZTIi = external constant i8*
 
 ; The function entry will be rewritten like this.
-; CHECK: define void @_Z4testv() #0 {
+; CHECK: define void @_Z4testv()
 ; CHECK: entry:
 ; CHECK:   [[I_PTR:\%.+]] = alloca i32, align 4
-; CHECK:   call void (...)* @llvm.frameescape(i32* [[I_PTR]])
+; CHECK:   call void (...) @llvm.frameescape(i32* [[I_PTR]])
 ; CHECK:   invoke void @_Z9may_throwv()
 ; CHECK:           to label %invoke.cont unwind label %[[LPAD_LABEL:lpad[0-9]+]]
 
@@ -43,7 +43,7 @@ invoke.cont:                                      ; preds = %entry
 ; CHECK: [[LPAD_LABEL]]:{{[ ]+}}; preds = %entry
 ; CHECK:   landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__CxxFrameHandler3 to i8*)
 ; CHECK-NEXT:           catch i8* bitcast (i8** @_ZTIi to i8*)
-; CHECK-NEXT:   [[RECOVER:\%.+]] = call i8* (...)* @llvm.eh.actions(i32 1, i8* bitcast (i8** @_ZTIi to i8*), i32* %i, i8* (i8*, i8*)* @_Z4testv.catch)
+; CHECK-NEXT:   [[RECOVER:\%.+]] = call i8* (...) @llvm.eh.actions(i32 1, i8* bitcast (i8** @_ZTIi to i8*), i32 0, i8* (i8*, i8*)* @_Z4testv.catch)
 ; CHECK-NEXT:   indirectbr i8* [[RECOVER]], [label %try.cont]
 
 lpad:                                             ; preds = %entry
@@ -94,7 +94,7 @@ eh.resume:                                        ; preds = %catch.dispatch
 ; CHECK: }
 }
 
-; CHECK: define internal i8* @_Z4testv.catch(i8*, i8*) {
+; CHECK: define internal i8* @_Z4testv.catch(i8*, i8*)
 ; CHECK: entry:
 ; CHECK:   [[RECOVER_I:\%.+]] = call i8* @llvm.framerecover(i8* bitcast (void ()* @_Z4testv to i8*), i8* %1, i32 0)
 ; CHECK:   [[I_PTR1:\%.+]] = bitcast i8* [[RECOVER_I]] to i32*
diff --git a/test/CodeGen/WinEH/cppeh-catch-unwind.ll b/test/CodeGen/WinEH/cppeh-catch-unwind.ll
index 3db1635..37a5429 100644
--- a/test/CodeGen/WinEH/cppeh-catch-unwind.ll
+++ b/test/CodeGen/WinEH/cppeh-catch-unwind.ll
@@ -33,13 +33,10 @@ $"\01??_R0H@8" = comdat any
 
 ; CHECK-LABEL: define void @"\01?test@@YAXXZ"() #0 {
 ; CHECK: entry:
-; CHECK:   [[UNWIND_HELP:\%.+]] = alloca i64
 ; CHECK:   [[OBJ_PTR:\%.+]] = alloca %class.SomeClass
 ; CHECK:   [[TMP0:\%.+]] = alloca i32, align 4
 ; CHECK:   [[TMP1:\%.+]] = alloca i32, align 4
-; CHECK:   call void (...)* @llvm.frameescape(i32* [[TMP1]], %class.SomeClass* [[OBJ_PTR]], i32* [[TMP0]])
-; CHECK:   [[UNWIND_HELP_i8:\%.+]] = bitcast i64* [[UNWIND_HELP]] to i8*
-; CHECK:   call void @llvm.eh.unwindhelp(i8* [[UNWIND_HELP_i8]])
+; CHECK:   call void (...) @llvm.frameescape(i32* [[TMP1]], %class.SomeClass* [[OBJ_PTR]], i32* [[TMP0]])
 ; CHECK:   %call = invoke %class.SomeClass* @"\01??0SomeClass@@QEAA@XZ"(%class.SomeClass* %obj)
 ; CHECK:           to label %invoke.cont unwind label %[[LPAD_LABEL:lpad[0-9]+]]
 
@@ -71,7 +68,7 @@ invoke.cont2:                                     ; preds = %invoke.cont
 ; CHECK: [[LPAD_LABEL]]:{{[ ]+}}; preds = %entry
 ; CHECK:   [[LPAD_VAL:\%.+]] = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__CxxFrameHandler3 to i8*)
 ; CHECK-NEXT:           catch i8* bitcast (%rtti.TypeDescriptor2* @"\01??_R0H@8" to i8*)
-; CHECK-NEXT:   [[RECOVER:\%.+]] = call i8* (...)* @llvm.eh.actions(i32 1, i8* bitcast (%rtti.TypeDescriptor2* @"\01??_R0H@8" to i8*), i32* [[TMP1]], i8* (i8*, i8*)* @"\01?test@@YAXXZ.catch")
+; CHECK-NEXT:   [[RECOVER:\%.+]] = call i8* (...) @llvm.eh.actions(i32 1, i8* bitcast (%rtti.TypeDescriptor2* @"\01??_R0H@8" to i8*), i32 0, i8* (i8*, i8*)* @"\01?test@@YAXXZ.catch")
 ; CHECK-NEXT:   indirectbr i8* [[RECOVER]], [label %try.cont15]
 
 lpad:                                             ; preds = %entry
@@ -85,7 +82,7 @@ lpad:                                             ; preds = %entry
 ; CHECK:   [[LPAD1_VAL:\%.+]] = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__CxxFrameHandler3 to i8*)
 ; CHECK-NEXT:           cleanup
 ; CHECK-NEXT:           catch i8* bitcast (%rtti.TypeDescriptor2* @"\01??_R0H@8" to i8*)
-; CHECK-NEXT:   [[RECOVER1:\%.+]] = call i8* (...)* @llvm.eh.actions(i32 0, void (i8*, i8*)* @"\01?test@@YAXXZ.cleanup", i32 1, i8* bitcast (%rtti.TypeDescriptor2* @"\01??_R0H@8" to i8*), i32* [[TMP1]], i8* (i8*, i8*)* @"\01?test@@YAXXZ.catch")
+; CHECK-NEXT:   [[RECOVER1:\%.+]] = call i8* (...) @llvm.eh.actions(i32 0, void (i8*, i8*)* @"\01?test@@YAXXZ.cleanup", i32 1, i8* bitcast (%rtti.TypeDescriptor2* @"\01??_R0H@8" to i8*), i32 0, i8* (i8*, i8*)* @"\01?test@@YAXXZ.catch")
 ; CHECK-NEXT:   indirectbr i8* [[RECOVER1]], [label %try.cont15]
 
 lpad1:                                            ; preds = %invoke.cont
@@ -100,7 +97,7 @@ lpad1:                                            ; preds = %invoke.cont
 ; CHECK:   [[LPAD3_VAL:\%.+]] = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__CxxFrameHandler3 to i8*)
 ; CHECK-NEXT:           cleanup
 ; CHECK-NEXT:           catch i8* bitcast (%rtti.TypeDescriptor2* @"\01??_R0H@8" to i8*)
-; CHECK-NEXT:   [[RECOVER3:\%.+]] = call i8* (...)* @llvm.eh.actions(i32 1, i8* bitcast (%rtti.TypeDescriptor2* @"\01??_R0H@8" to i8*), i32* [[TMP0]], i8* (i8*, i8*)* @"\01?test@@YAXXZ.catch1", i32 0, void (i8*, i8*)* @"\01?test@@YAXXZ.cleanup")
+; CHECK-NEXT:   [[RECOVER3:\%.+]] = call i8* (...) @llvm.eh.actions(i32 1, i8* bitcast (%rtti.TypeDescriptor2* @"\01??_R0H@8" to i8*), i32 2, i8* (i8*, i8*)* @"\01?test@@YAXXZ.catch1", i32 0, void (i8*, i8*)* @"\01?test@@YAXXZ.cleanup")
 ; CHECK-NEXT:   indirectbr i8* [[RECOVER3]], [label %try.cont]
 
 lpad3:                                            ; preds = %invoke.cont2
@@ -178,7 +175,7 @@ eh.resume:                                        ; preds = %catch.dispatch7
 ; CHECK: }
 }
 
-; CHECK-LABEL: define internal i8* @"\01?test@@YAXXZ.catch"(i8*, i8*) {
+; CHECK-LABEL: define internal i8* @"\01?test@@YAXXZ.catch"(i8*, i8*)
 ; CHECK: entry:
 ; CHECK:   [[RECOVER_TMP1:\%.+]] = call i8* @llvm.framerecover(i8* bitcast (void ()* @"\01?test@@YAXXZ" to i8*), i8* %1, i32 0)
 ; CHECK:   [[TMP1_PTR:\%.+]] = bitcast i8* [[RECOVER_TMP1]] to i32*
@@ -186,15 +183,15 @@ eh.resume:                                        ; preds = %catch.dispatch7
 ; CHECK:   ret i8* blockaddress(@"\01?test@@YAXXZ", %try.cont15)
 ; CHECK: }
 
-; CHECK-LABEL: define internal void @"\01?test@@YAXXZ.cleanup"(i8*, i8*) {
+; CHECK-LABEL: define internal void @"\01?test@@YAXXZ.cleanup"(i8*, i8*)
 ; CHECK: entry:
 ; CHECK:   [[RECOVER_OBJ:\%.+]] = call i8* @llvm.framerecover(i8* bitcast (void ()* @"\01?test@@YAXXZ" to i8*), i8* %1, i32 1)
 ; CHECK:   [[OBJ_PTR:\%.+]] = bitcast i8* %obj.i8 to %class.SomeClass*
-; CHECK:   call void @"\01??1SomeClass@@QEAA@XZ"(%class.SomeClass* [[OBJ_PTR]]) #3
+; CHECK:   call void @"\01??1SomeClass@@QEAA@XZ"(%class.SomeClass* [[OBJ_PTR]])
 ; CHECK:   ret void
 ; CHECK: }
 
-; CHECK-LABEL: define internal i8* @"\01?test@@YAXXZ.catch1"(i8*, i8*) {
+; CHECK-LABEL: define internal i8* @"\01?test@@YAXXZ.catch1"(i8*, i8*)
 ; CHECK: entry:
 ; CHECK:   [[RECOVER_TMP0:\%.+]] = call i8* @llvm.framerecover(i8* bitcast (void ()* @"\01?test@@YAXXZ" to i8*), i8* %1, i32 2)
 ; CHECK:   [[TMP0_PTR:\%.+]] = bitcast i8* [[RECOVER_TMP0]] to i32*
@@ -208,7 +205,6 @@ eh.resume:                                        ; preds = %catch.dispatch7
 ; CHECK:   [[LPAD5_VAL:\%.+]] = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__CxxFrameHandler3 to i8*)
 ; CHECK:           cleanup
 ; CHECK:           catch i8* bitcast (%rtti.TypeDescriptor2* @"\01??_R0H@8" to i8*)
-; CHECK:   unreachable
 ; CHECK: }
 
 declare %class.SomeClass* @"\01??0SomeClass@@QEAA@XZ"(%class.SomeClass* returned) #1
diff --git a/test/CodeGen/WinEH/cppeh-cleanup-invoke.ll b/test/CodeGen/WinEH/cppeh-cleanup-invoke.ll
new file mode 100644
index 0000000..5a57043
--- /dev/null
+++ b/test/CodeGen/WinEH/cppeh-cleanup-invoke.ll
@@ -0,0 +1,91 @@
+; RUN: opt -winehprepare -S < %s | FileCheck %s
+
+target datalayout = "e-m:w-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-pc-windows-msvc"
+
+; Modified based on this code:
+; struct HasDtor {
+;   ~HasDtor();
+; };
+; extern "C" void may_throw();
+; int main() {
+;   try {
+;     HasDtor o;
+;     may_throw();
+;   } catch (int) {
+;   }
+; }
+
+%rtti.TypeDescriptor2 = type { i8**, i8*, [3 x i8] }
+%eh.CatchHandlerType = type { i32, i8* }
+%struct.HasDtor = type { i8 }
+
+$"\01??_R0H@8" = comdat any
+
+@"\01??_7type_info@@6B@" = external constant i8*
+@"\01??_R0H@8" = linkonce_odr global %rtti.TypeDescriptor2 { i8** @"\01??_7type_info@@6B@", i8* null, [3 x i8] c".H\00" }, comdat
+@llvm.eh.handlertype.H.0 = private unnamed_addr constant %eh.CatchHandlerType { i32 0, i8* bitcast (%rtti.TypeDescriptor2* @"\01??_R0H@8" to i8*) }, section "llvm.metadata"
+
+define i32 @main() {
+entry:
+  %o = alloca %struct.HasDtor, align 1
+  invoke void @may_throw()
+          to label %invoke.cont2 unwind label %lpad1
+
+invoke.cont2:                                     ; preds = %invoke.cont
+  call void @"\01??1HasDtor@@QEAA@XZ"(%struct.HasDtor* %o)
+  br label %try.cont
+
+lpad:                                             ; preds = %entry
+  %0 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__CxxFrameHandler3 to i8*)
+          catch %eh.CatchHandlerType* @llvm.eh.handlertype.H.0
+  %1 = extractvalue { i8*, i32 } %0, 0
+  %2 = extractvalue { i8*, i32 } %0, 1
+  br label %catch.dispatch
+
+lpad1:                                            ; preds = %invoke.cont
+  %3 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__CxxFrameHandler3 to i8*)
+          cleanup
+          catch %eh.CatchHandlerType* @llvm.eh.handlertype.H.0
+  %4 = extractvalue { i8*, i32 } %3, 0
+  %5 = extractvalue { i8*, i32 } %3, 1
+  invoke void @"\01??1HasDtor@@QEAA@XZ"(%struct.HasDtor* %o)
+	  to label %catch.dispatch unwind label %lpad
+
+catch.dispatch:                                   ; preds = %lpad1, %lpad
+  %exn.slot.0 = phi i8* [ %4, %lpad1 ], [ %1, %lpad ]
+  %ehselector.slot.0 = phi i32 [ %5, %lpad1 ], [ %2, %lpad ]
+  %6 = call i32 @llvm.eh.typeid.for(i8* bitcast (%eh.CatchHandlerType* @llvm.eh.handlertype.H.0 to i8*))
+  %matches = icmp eq i32 %ehselector.slot.0, %6
+  br i1 %matches, label %catch, label %eh.resume
+
+catch:                                            ; preds = %catch.dispatch
+  call void @llvm.eh.begincatch(i8* %exn.slot.0, i8* null)
+  call void @llvm.eh.endcatch()
+  br label %try.cont
+
+try.cont:                                         ; preds = %catch, %invoke.cont2
+  ret i32 0
+
+eh.resume:                                        ; preds = %catch.dispatch
+  %lpad.val = insertvalue { i8*, i32 } undef, i8* %exn.slot.0, 0
+  %lpad.val5 = insertvalue { i8*, i32 } %lpad.val, i32 %ehselector.slot.0, 1
+  resume { i8*, i32 } %lpad.val5
+}
+
+; CHECK-LABEL: define i32 @main()
+; CHECK: @llvm.eh.actions(i32 0, void (i8*, i8*)* @main.cleanup, i32 1, i8* bitcast (%eh.CatchHandlerType* @llvm.eh.handlertype.H.0 to i8*), i32 -1, i8* (i8*, i8*)* @main.catch)
+
+; CHECK-LABEL: define internal void @main.cleanup(i8*, i8*)
+; CHECK: call void @"\01??1HasDtor@@QEAA@XZ"(%struct.HasDtor* %{{.*}})
+; CHECK: ret void
+
+declare void @may_throw()
+
+declare i32 @__CxxFrameHandler3(...)
+
+declare void @"\01??1HasDtor@@QEAA@XZ"(%struct.HasDtor*)
+
+declare i32 @llvm.eh.typeid.for(i8*)
+declare void @llvm.eh.begincatch(i8* nocapture, i8* nocapture)
+declare void @llvm.eh.endcatch()
diff --git a/test/CodeGen/WinEH/cppeh-frame-vars.ll b/test/CodeGen/WinEH/cppeh-frame-vars.ll
index dc5ed1c..773dc94 100644
--- a/test/CodeGen/WinEH/cppeh-frame-vars.ll
+++ b/test/CodeGen/WinEH/cppeh-frame-vars.ll
@@ -47,7 +47,7 @@ $"\01??_R0H@8" = comdat any
 @"\01??_R0H@8" = linkonce_odr global %rtti.TypeDescriptor2 { i8** @"\01??_7type_info@@6B@", i8* null, [3 x i8] c".H\00" }, comdat
 
 ; The function entry should be rewritten like this.
-; CHECK: define void @"\01?test@@YAXXZ"() #0 {
+; CHECK: define void @"\01?test@@YAXXZ"()
 ; CHECK: entry:
 ; CHECK:   [[NUMEXCEPTIONS_PTR:\%.+]] = alloca i32, align 4
 ; CHECK:   [[EXCEPTIONVAL_PTR:\%.+]] = alloca [10 x i32], align 16
@@ -58,7 +58,7 @@ $"\01??_R0H@8" = comdat any
 ; CHECK:   [[TMP:\%.+]] = bitcast %struct.SomeData* [[DATA_PTR]] to i8*
 ; CHECK:   call void @llvm.memset(i8* [[TMP]], i8 0, i64 8, i32 4, i1 false)
 ; CHECK:   store i32 0, i32* [[I_PTR]], align 4
-; CHECK:   call void (...)* @llvm.frameescape(i32* [[E_PTR]], i32* [[NUMEXCEPTIONS_PTR]], [10 x i32]* [[EXCEPTIONVAL_PTR]], i32* [[I_PTR]], %struct.SomeData* [[DATA_PTR]])
+; CHECK:   call void (...) @llvm.frameescape(i32* [[E_PTR]], i32* [[NUMEXCEPTIONS_PTR]], [10 x i32]* [[EXCEPTIONVAL_PTR]], i32* [[I_PTR]], %struct.SomeData* [[DATA_PTR]])
 ; CHECK:   br label %for.cond
 
 ; Function Attrs: uwtable
@@ -101,7 +101,7 @@ invoke.cont:                                      ; preds = %for.body
 ; CHECK: [[LPAD_LABEL]]:{{[ ]+}}; preds = %for.body
 ; CHECK:   landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__CxxFrameHandler3 to i8*)
 ; CHECK-NEXT:           catch i8* bitcast (%rtti.TypeDescriptor2* @"\01??_R0H@8" to i8*)
-; CHECK-NEXT:   [[RECOVER:\%.+]] = call i8* (...)* @llvm.eh.actions(i32 1, i8* bitcast (%rtti.TypeDescriptor2* @"\01??_R0H@8" to i8*), i32* %e, i8* (i8*, i8*)* @"\01?test@@YAXXZ.catch")
+; CHECK-NEXT:   [[RECOVER:\%.+]] = call i8* (...) @llvm.eh.actions(i32 1, i8* bitcast (%rtti.TypeDescriptor2* @"\01??_R0H@8" to i8*), i32 0, i8* (i8*, i8*)* @"\01?test@@YAXXZ.catch")
 ; CHECK-NEXT:   indirectbr i8* [[RECOVER]], [label %try.cont]
 
 lpad:                                             ; preds = %for.body
@@ -196,7 +196,7 @@ eh.resume:                                        ; preds = %catch.dispatch
 }
 
 ; The following catch handler should be outlined.
-; CHECK-LABEL: define internal i8* @"\01?test@@YAXXZ.catch"(i8*, i8*) {
+; CHECK-LABEL: define internal i8* @"\01?test@@YAXXZ.catch"(i8*, i8*)
 ; CHECK: entry:
 ; CHECK:   [[RECOVER_E:\%.+]] = call i8* @llvm.framerecover(i8* bitcast (void ()* @"\01?test@@YAXXZ" to i8*), i8* %1, i32 0)
 ; CHECK:   [[E_PTR1:\%.+]] = bitcast i8* [[RECOVER_E]] to i32*
diff --git a/test/CodeGen/WinEH/cppeh-inalloca.ll b/test/CodeGen/WinEH/cppeh-inalloca.ll
index f3f3631..93ec600 100644
--- a/test/CodeGen/WinEH/cppeh-inalloca.ll
+++ b/test/CodeGen/WinEH/cppeh-inalloca.ll
@@ -36,7 +36,7 @@ $"\01??_R0H@8" = comdat any
 @"\01??_R0H@8" = linkonce_odr global %rtti.TypeDescriptor2 { i8** @"\01??_7type_info@@6B@", i8* null, [3 x i8] c".H\00" }, comdat
 
 ; The function entry should be rewritten like this.
-; CHECK: define i32 @"\01?test@@YAHUA@@@Z"(<{ %struct.A }>* inalloca) #0 {
+; CHECK: define i32 @"\01?test@@YAHUA@@@Z"(<{ %struct.A }>* inalloca)
 ; CHECK: entry:
 ; CHECK:   [[TMP_REGMEM:\%.+]] = alloca <{ %struct.A }>*
 ; CHECK:   [[TMP:\%.+]] = select i1 true, <{ %struct.A }>* %0, <{ %struct.A }>* undef
@@ -44,7 +44,7 @@ $"\01??_R0H@8" = comdat any
 ; CHECK:   [[RETVAL:\%.+]] = alloca i32, align 4
 ; CHECK:   [[E_PTR:\%.+]] = alloca i32, align 4
 ; CHECK:   [[CLEANUP_SLOT:\%.+]] = alloca i32
-; CHECK:   call void (...)* @llvm.frameescape(i32* %e, <{ %struct.A }>** [[TMP_REGMEM]], i32* [[RETVAL]], i32* [[CLEANUP_SLOT]])
+; CHECK:   call void (...) @llvm.frameescape(i32* %e, <{ %struct.A }>** [[TMP_REGMEM]], i32* [[RETVAL]], i32* [[CLEANUP_SLOT]])
 ; CHECK:   invoke void @"\01?may_throw@@YAXXZ"()
 ; CHECK:           to label %invoke.cont unwind label %[[LPAD_LABEL:lpad[0-9]*]]
 
@@ -65,7 +65,7 @@ invoke.cont:                                      ; preds = %entry
 ; CHECK:   landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__CxxFrameHandler3 to i8*)
 ; CHECK-NEXT:           cleanup
 ; CHECK-NEXT:           catch i8* bitcast (%rtti.TypeDescriptor2* @"\01??_R0H@8" to i8*)
-; CHECK-NEXT:   [[RECOVER:\%recover.*]] = call i8* (...)* @llvm.eh.actions(i32 1, i8* bitcast (%rtti.TypeDescriptor2* @"\01??_R0H@8" to i8*), i32* %e, i8* (i8*, i8*)* @"\01?test@@YAHUA@@@Z.catch", i32 0, void (i8*, i8*)* @"\01?test@@YAHUA@@@Z.cleanup")
+; CHECK-NEXT:   [[RECOVER:\%recover.*]] = call i8* (...) @llvm.eh.actions(i32 1, i8* bitcast (%rtti.TypeDescriptor2* @"\01??_R0H@8" to i8*), i32 0, i8* (i8*, i8*)* @"\01?test@@YAHUA@@@Z.catch", i32 0, void (i8*, i8*)* @"\01?test@@YAHUA@@@Z.cleanup")
 ; CHECK-NEXT:   indirectbr i8* [[RECOVER]], [label %cleanup]
 
 lpad:                                             ; preds = %entry
@@ -142,7 +142,7 @@ eh.resume:                                        ; preds = %ehcleanup
 }
 
 ; The following catch handler should be outlined.
-; CHECK: define internal i8* @"\01?test@@YAHUA@@@Z.catch"(i8*, i8*) {
+; CHECK: define internal i8* @"\01?test@@YAHUA@@@Z.catch"(i8*, i8*)
 ; CHECK: entry:
 ; CHECK:   [[RECOVER_E:\%.+]] = call i8* @llvm.framerecover(i8* bitcast (i32 (<{ %struct.A }>*)* @"\01?test@@YAHUA@@@Z" to i8*), i8* %1, i32 0)
 ; CHECK:   [[E_PTR:\%.+]] = bitcast i8* [[RECOVER_E]] to i32*
@@ -165,7 +165,7 @@ eh.resume:                                        ; preds = %ehcleanup
 ; CHECK: }
 
 ; The following cleanup handler should be outlined.
-; CHECK: define internal void @"\01?test@@YAHUA@@@Z.cleanup"(i8*, i8*) {
+; CHECK: define internal void @"\01?test@@YAHUA@@@Z.cleanup"(i8*, i8*)
 ; CHECK: entry:
 ; CHECK:   [[RECOVER_EH_TEMP1:\%.+]] = call i8* @llvm.framerecover(i8* bitcast (i32 (<{ %struct.A }>*)* @"\01?test@@YAHUA@@@Z" to i8*), i8* %1, i32 1)
 ; CHECK:   [[EH_TEMP1:\%.+]] = bitcast i8* [[RECOVER_EH_TEMP]] to <{ %struct.A }>**
diff --git a/test/CodeGen/WinEH/cppeh-min-unwind.ll b/test/CodeGen/WinEH/cppeh-min-unwind.ll
index a1e97d5..7868b33 100644
--- a/test/CodeGen/WinEH/cppeh-min-unwind.ll
+++ b/test/CodeGen/WinEH/cppeh-min-unwind.ll
@@ -21,11 +21,11 @@ target triple = "x86_64-pc-windows-msvc"
 %class.SomeClass = type { [28 x i32] }
 
 ; The function entry should be rewritten like this.
-; CHECK: define void @_Z4testv() #0 {
+; CHECK: define void @_Z4testv()
 ; CHECK: entry:
 ; CHECK:   [[OBJ_PTR:\%.+]] = alloca %class.SomeClass, align 4
 ; CHECK:   call void @_ZN9SomeClassC1Ev(%class.SomeClass* [[OBJ_PTR]])
-; CHECK:   call void (...)* @llvm.frameescape(%class.SomeClass* [[OBJ_PTR]])
+; CHECK:   call void (...) @llvm.frameescape(%class.SomeClass* [[OBJ_PTR]])
 ; CHECK:   invoke void @_Z9may_throwv()
 ; CHECK:           to label %invoke.cont unwind label %[[LPAD_LABEL:lpad[0-9]+]]
 
@@ -46,7 +46,7 @@ invoke.cont:                                      ; preds = %entry
 ; CHECK: [[LPAD_LABEL]]:{{[ ]+}}; preds = %entry
 ; CHECK:   landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__CxxFrameHandler3 to i8*)
 ; CHECK-NEXT:           cleanup
-; CHECK-NEXT:   [[RECOVER:\%.+]] = call i8* (...)* @llvm.eh.actions(i32 0, void (i8*, i8*)* @_Z4testv.cleanup)
+; CHECK-NEXT:   [[RECOVER:\%.+]] = call i8* (...) @llvm.eh.actions(i32 0, void (i8*, i8*)* @_Z4testv.cleanup)
 ; CHECK-NEXT:   indirectbr i8* [[RECOVER]], []
 
 lpad:                                             ; preds = %entry
@@ -72,7 +72,7 @@ eh.resume:                                        ; preds = %lpad
 }
 
 ; This cleanup handler should be outlined.
-; CHECK: define internal void @_Z4testv.cleanup(i8*, i8*) {
+; CHECK: define internal void @_Z4testv.cleanup(i8*, i8*)
 ; CHECK: entry:
 ; CHECK:   [[RECOVER_OBJ:\%.+]] = call i8* @llvm.framerecover(i8* bitcast (void ()* @_Z4testv to i8*), i8* %1, i32 0)
 ; CHECK:   [[OBJ_PTR1:\%.+]] = bitcast i8* [[RECOVER_OBJ]] to %class.SomeClass*
diff --git a/test/CodeGen/WinEH/cppeh-multi-catch.ll b/test/CodeGen/WinEH/cppeh-multi-catch.ll
new file mode 100644
index 0000000..6052629
--- /dev/null
+++ b/test/CodeGen/WinEH/cppeh-multi-catch.ll
@@ -0,0 +1,229 @@
+; RUN: opt -mtriple=x86_64-pc-windows-msvc -winehprepare -S -o - < %s | FileCheck %s
+
+; This test is based on the following code:
+;
+; void test()
+; {
+;   try {
+;     may_throw();
+;   } catch (int i) {
+;     handle_int(i);
+;   } catch (long long ll) {
+;     handle_long_long(ll);
+;   } catch (SomeClass &obj) {
+;     handle_obj(&obj);
+;   } catch (...) {
+;     handle_exception();
+;   }
+; }
+;
+; The catch handlers were edited to insert 'ret void' after the endcatch call.
+
+; ModuleID = 'catch-with-type.cpp'
+target datalayout = "e-m:w-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-pc-windows-msvc"
+
+%rtti.TypeDescriptor2 = type { i8**, i8*, [3 x i8] }
+%eh.HandlerMapEntry = type { i32, i32 }
+%rtti.TypeDescriptor3 = type { i8**, i8*, [4 x i8] }
+%rtti.TypeDescriptor15 = type { i8**, i8*, [16 x i8] }
+%class.SomeClass = type { i8 }
+
+$"\01??_R0H@8" = comdat any
+
+$"\01??_R0_J@8" = comdat any
+
+$"\01??_R0?AVSomeClass@@@8" = comdat any
+
+@"\01??_7type_info@@6B@" = external constant i8*
+@"\01??_R0H@8" = linkonce_odr global %rtti.TypeDescriptor2 { i8** @"\01??_7type_info@@6B@", i8* null, [3 x i8] c".H\00" }, comdat
+@__ImageBase = external constant i8
+@llvm.eh.handlermapentry.H = private unnamed_addr constant %eh.HandlerMapEntry { i32 0, i32 trunc (i64 sub nuw nsw (i64 ptrtoint (%rtti.TypeDescriptor2* @"\01??_R0H@8" to i64), i64 ptrtoint (i8* @__ImageBase to i64)) to i32) }, section "llvm.metadata"
+@"\01??_R0_J@8" = linkonce_odr global %rtti.TypeDescriptor3 { i8** @"\01??_7type_info@@6B@", i8* null, [4 x i8] c"._J\00" }, comdat
+@llvm.eh.handlermapentry._J = private unnamed_addr constant %eh.HandlerMapEntry { i32 0, i32 trunc (i64 sub nuw nsw (i64 ptrtoint (%rtti.TypeDescriptor3* @"\01??_R0_J@8" to i64), i64 ptrtoint (i8* @__ImageBase to i64)) to i32) }, section "llvm.metadata"
+@"\01??_R0?AVSomeClass@@@8" = linkonce_odr global %rtti.TypeDescriptor15 { i8** @"\01??_7type_info@@6B@", i8* null, [16 x i8] c".?AVSomeClass@@\00" }, comdat
+@"llvm.eh.handlermapentry.reference.?AVSomeClass@@" = private unnamed_addr constant %eh.HandlerMapEntry { i32 8, i32 trunc (i64 sub nuw nsw (i64 ptrtoint (%rtti.TypeDescriptor15* @"\01??_R0?AVSomeClass@@@8" to i64), i64 ptrtoint (i8* @__ImageBase to i64)) to i32) }, section "llvm.metadata"
+
+
+; CHECK: define void @"\01?test@@YAXXZ"() #0 {
+; CHECK: entry:
+; CHECK:   [[OBJ_PTR:\%.+]] = alloca %class.SomeClass*, align 8
+; CHECK:   [[LL_PTR:\%.+]] = alloca i64, align 8
+; CHECK:   [[I_PTR:\%.+]] = alloca i32, align 4
+; CHECK:   call void (...) @llvm.frameescape(i32* [[I_PTR]], i64* [[LL_PTR]], %class.SomeClass** [[OBJ_PTR]])
+; CHECK:   invoke void @"\01?may_throw@@YAXXZ"()
+; CHECK:           to label %invoke.cont unwind label %[[LPAD_LABEL:lpad[0-9]+]]
+
+; Function Attrs: uwtable
+define void @"\01?test@@YAXXZ"() #0 {
+entry:
+  %exn.slot = alloca i8*
+  %ehselector.slot = alloca i32
+  %obj = alloca %class.SomeClass*, align 8
+  %ll = alloca i64, align 8
+  %i = alloca i32, align 4
+  invoke void @"\01?may_throw@@YAXXZ"()
+          to label %invoke.cont unwind label %lpad
+
+invoke.cont:                                      ; preds = %entry
+  br label %try.cont
+
+; CHECK: [[LPAD_LABEL]]:{{[ ]+}}; preds = %entry
+; CHECK:   landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__CxxFrameHandler3 to i8*)
+; CHECK-NEXT:           catch %eh.HandlerMapEntry* @llvm.eh.handlermapentry.H
+; CHECK-NEXT:           catch %eh.HandlerMapEntry* @llvm.eh.handlermapentry._J
+; CHECK-NEXT:           catch %eh.HandlerMapEntry* @"llvm.eh.handlermapentry.reference.?AVSomeClass@@"
+; CHECK-NEXT:           catch i8* null
+; CHECK-NEXT:   [[RECOVER:\%.+]] = call i8* (...) @llvm.eh.actions(
+; CHECK-SAME:	     i32 1, i8* bitcast (%eh.HandlerMapEntry* @llvm.eh.handlermapentry.H to i8*), i32 0, i8* (i8*, i8*)* @"\01?test@@YAXXZ.catch",
+; CHECK-SAME:        i32 1, i8* bitcast (%eh.HandlerMapEntry* @llvm.eh.handlermapentry._J to i8*), i32 1, i8* (i8*, i8*)* @"\01?test@@YAXXZ.catch1",
+; CHECK-SAME:        i32 1, i8* bitcast (%eh.HandlerMapEntry* @"llvm.eh.handlermapentry.reference.?AVSomeClass@@" to i8*), i32 2, i8* (i8*, i8*)* @"\01?test@@YAXXZ.catch2",
+; CHECK-SAME:        i32 1, i8* null, i32 -1, i8* (i8*, i8*)* @"\01?test@@YAXXZ.catch3")
+; CHECK-NEXT:   indirectbr i8* [[RECOVER]], [label %catch14.split, label %catch10.split, label %catch6.split, label %catch.split]
+
+lpad:                                             ; preds = %entry
+  %0 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__CxxFrameHandler3 to i8*)
+          catch %eh.HandlerMapEntry* @llvm.eh.handlermapentry.H
+          catch %eh.HandlerMapEntry* @llvm.eh.handlermapentry._J
+          catch %eh.HandlerMapEntry* @"llvm.eh.handlermapentry.reference.?AVSomeClass@@"
+          catch i8* null
+  %1 = extractvalue { i8*, i32 } %0, 0
+  store i8* %1, i8** %exn.slot
+  %2 = extractvalue { i8*, i32 } %0, 1
+  store i32 %2, i32* %ehselector.slot
+  br label %catch.dispatch
+
+; CHECK-NOT: catch.dispatch:
+catch.dispatch:                                   ; preds = %lpad
+  %sel = load i32, i32* %ehselector.slot
+  %3 = call i32 @llvm.eh.typeid.for(i8* bitcast (%eh.HandlerMapEntry* @llvm.eh.handlermapentry.H to i8*)) #3
+  %matches = icmp eq i32 %sel, %3
+  br i1 %matches, label %catch14, label %catch.fallthrough
+
+; CHECK-NOT: catch14:
+; CHECK: catch14.split:
+; CHECK-NEXT:   ret void
+catch14:                                          ; preds = %catch.dispatch
+  %exn15 = load i8*, i8** %exn.slot
+  %4 = bitcast i32* %i to i8*
+  call void @llvm.eh.begincatch(i8* %exn15, i8* %4) #3
+  %5 = load i32, i32* %i, align 4
+  call void @"\01?handle_int@@YAXH@Z"(i32 %5)
+  call void @llvm.eh.endcatch() #3
+  ret void
+
+try.cont:                                         ; preds = %invoke.cont
+  ret void
+
+; CHECK-NOT: catch.fallthrough:
+catch.fallthrough:                                ; preds = %catch.dispatch
+  %6 = call i32 @llvm.eh.typeid.for(i8* bitcast (%eh.HandlerMapEntry* @llvm.eh.handlermapentry._J to i8*)) #3
+  %matches1 = icmp eq i32 %sel, %6
+  br i1 %matches1, label %catch10, label %catch.fallthrough2
+
+; CHECK-NOT: catch10:
+; CHECK: catch10.split:
+; CHECK-NEXT:   ret void
+catch10:                                          ; preds = %catch.fallthrough
+  %exn11 = load i8*, i8** %exn.slot
+  %7 = bitcast i64* %ll to i8*
+  call void @llvm.eh.begincatch(i8* %exn11, i8* %7) #3
+  %8 = load i64, i64* %ll, align 8
+  call void @"\01?handle_long_long@@YAX_J@Z"(i64 %8)
+  call void @llvm.eh.endcatch() #3
+  ret void
+
+; CHECK-NOT: catch.fallthrough2:
+catch.fallthrough2:                               ; preds = %catch.fallthrough
+  %9 = call i32 @llvm.eh.typeid.for(i8* bitcast (%eh.HandlerMapEntry* @"llvm.eh.handlermapentry.reference.?AVSomeClass@@" to i8*)) #3
+  %matches3 = icmp eq i32 %sel, %9
+  br i1 %matches3, label %catch6, label %catch
+
+; CHECK-NOT: catch6:
+; CHECK: catch6.split:
+; CHECK-NEXT:   ret void
+catch6:                                           ; preds = %catch.fallthrough2
+  %exn7 = load i8*, i8** %exn.slot
+  %10 = bitcast %class.SomeClass** %obj to i8*
+  call void @llvm.eh.begincatch(i8* %exn7, i8* %10) #3
+  %11 = load %class.SomeClass*, %class.SomeClass** %obj, align 8
+  call void @"\01?handle_obj@@YAXPEAVSomeClass@@@Z"(%class.SomeClass* %11)
+  call void @llvm.eh.endcatch() #3
+  ret void
+
+; CHECK-NOT: catch:
+; CHECK: catch.split:
+; CHECK-NEXT:   ret void
+catch:                                            ; preds = %catch.fallthrough2
+  %exn = load i8*, i8** %exn.slot
+  call void @llvm.eh.begincatch(i8* %exn, i8* null) #3
+  call void @"\01?handle_exception@@YAXXZ"()  call void @llvm.eh.endcatch() #3
+  ret void
+; CHECK: }
+}
+
+; CHECK-LABEL: define internal i8* @"\01?test@@YAXXZ.catch"(i8*, i8*)
+; CHECK: entry:
+; CHECK:   [[RECOVER_I:\%.+]] = call i8* @llvm.framerecover(i8* bitcast (void ()* @"\01?test@@YAXXZ" to i8*), i8* %1, i32 0)
+; CHECK:   [[I_PTR:\%.+]] = bitcast i8* [[RECOVER_I]] to i32*
+; CHECK:   [[TMP1:\%.+]] = load i32, i32* [[I_PTR]], align 4
+; CHECK:   call void @"\01?handle_int@@YAXH@Z"(i32 [[TMP1]])
+; CHECK:   ret i8* blockaddress(@"\01?test@@YAXXZ", %catch14.split)
+; CHECK: }
+
+; CHECK-LABEL: define internal i8* @"\01?test@@YAXXZ.catch1"(i8*, i8*)
+; CHECK: entry:
+; CHECK:   [[RECOVER_LL:\%.+]] = call i8* @llvm.framerecover(i8* bitcast (void ()* @"\01?test@@YAXXZ" to i8*), i8* %1, i32 1)
+; CHECK:   [[LL_PTR:\%.+]] = bitcast i8* [[RECOVER_LL]] to i64*
+; CHECK:   [[TMP2:\%.+]] = load i64, i64* [[LL_PTR]], align 8
+; CHECK:   call void @"\01?handle_long_long@@YAX_J@Z"(i64 [[TMP2]])
+; CHECK:   ret i8* blockaddress(@"\01?test@@YAXXZ", %catch10.split)
+; CHECK: }
+
+; CHECK-LABEL: define internal i8* @"\01?test@@YAXXZ.catch2"(i8*, i8*)
+; CHECK: entry:
+; CHECK:   [[RECOVER_OBJ:\%.+]] = call i8* @llvm.framerecover(i8* bitcast (void ()* @"\01?test@@YAXXZ" to i8*), i8* %1, i32 2)
+; CHECK:   [[OBJ_PTR:\%.+]] = bitcast i8* [[RECOVER_OBJ]] to %class.SomeClass**
+; CHECK:   [[TMP3:\%.+]] = load %class.SomeClass*, %class.SomeClass** [[OBJ_PTR]], align 8
+; CHECK:   call void @"\01?handle_obj@@YAXPEAVSomeClass@@@Z"(%class.SomeClass* [[TMP3]])
+; CHECK:   ret i8* blockaddress(@"\01?test@@YAXXZ", %catch6.split)
+; CHECK: }
+
+; CHECK-LABEL: define internal i8* @"\01?test@@YAXXZ.catch3"(i8*, i8*)
+; CHECK: entry:
+; CHECK:   call void @"\01?handle_exception@@YAXXZ"()
+; CHECK:   ret i8* blockaddress(@"\01?test@@YAXXZ", %catch.split)
+; CHECK: }
+
+
+declare void @"\01?may_throw@@YAXXZ"() #1
+
+declare i32 @__CxxFrameHandler3(...)
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.eh.typeid.for(i8*) #2
+
+; Function Attrs: nounwind
+declare void @llvm.eh.begincatch(i8* nocapture, i8* nocapture) #3
+
+declare void @"\01?handle_exception@@YAXXZ"() #1
+
+; Function Attrs: nounwind
+declare void @llvm.eh.endcatch() #3
+
+declare void @"\01?handle_obj@@YAXPEAVSomeClass@@@Z"(%class.SomeClass*) #1
+
+declare void @"\01?handle_long_long@@YAX_J@Z"(i64) #1
+
+declare void @"\01?handle_int@@YAXH@Z"(i32) #1
+
+attributes #0 = { uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nounwind readnone }
+attributes #3 = { nounwind }
+
+!llvm.module.flags = !{!0}
+!llvm.ident = !{!1}
+
+!0 = !{i32 1, !"PIC Level", i32 2}
+!1 = !{!"clang version 3.7.0 (trunk 233155) (llvm/trunk 233153)"}
diff --git a/test/CodeGen/WinEH/cppeh-nested-1.ll b/test/CodeGen/WinEH/cppeh-nested-1.ll
index e94e771..5cd2b18 100644
--- a/test/CodeGen/WinEH/cppeh-nested-1.ll
+++ b/test/CodeGen/WinEH/cppeh-nested-1.ll
@@ -1,5 +1,4 @@
 ; RUN: opt -mtriple=x86_64-pc-windows-msvc -winehprepare -S -o - < %s | FileCheck %s
-; XFAIL: *
 
 ; This test is based on the following code:
 ;
@@ -31,11 +30,11 @@ $"\01??_R0H@8" = comdat any
 @"\01??_R0M@8" = linkonce_odr global %rtti.TypeDescriptor2 { i8** @"\01??_7type_info@@6B@", i8* null, [3 x i8] c".M\00" }, comdat
 @"\01??_R0H@8" = linkonce_odr global %rtti.TypeDescriptor2 { i8** @"\01??_7type_info@@6B@", i8* null, [3 x i8] c".H\00" }, comdat
 
-; CHECK: define void @"\01?test@@YAXXZ"() #0 {
+; CHECK: define void @"\01?test@@YAXXZ"()
 ; CHECK: entry:
 ; CHECK:   %i = alloca i32, align 4
 ; CHECK:   %f = alloca float, align 4
-; CHECK:   call void (...)* @llvm.frameescape(i32* %i, float* %f)
+; CHECK:   call void (...) @llvm.frameescape(i32* %i, float* %f)
 ; CHECK:   invoke void @"\01?may_throw@@YAXXZ"()
 ; CHECK:           to label %invoke.cont unwind label %[[LPAD_LABEL:lpad[0-9]*]]
 
@@ -56,7 +55,7 @@ invoke.cont:                                      ; preds = %entry
 ; CHECK:   landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__CxxFrameHandler3 to i8*)
 ; CHECK:           catch i8* bitcast (%rtti.TypeDescriptor2* @"\01??_R0H@8" to i8*)
 ; CHECK:           catch i8* bitcast (%rtti.TypeDescriptor2* @"\01??_R0M@8" to i8*)
-; CHECK:   [[RECOVER:\%.+]] = call i8* (...)* @llvm.eh.actions(i32 0, i8* bitcast (%rtti.TypeDescriptor2* @"\01??_R0H@8" to i8*), i32* %i, i8* (i8*, i8*)* @"\01?test@@YAXXZ.catch" to i8*), i32 1, i8* bitcast (%rtti.TypeDescriptor2* @"\01??_R0M@8" to i8*), float* %f, i8* bitcast (i8* (i8*, i8*)* @"\01?test@@YAXXZ.catch1")
+; CHECK:   [[RECOVER:\%.+]] = call i8* (...) @llvm.eh.actions(i32 1, i8* bitcast (%rtti.TypeDescriptor2* @"\01??_R0H@8" to i8*), i32 0, i8* (i8*, i8*)* @"\01?test@@YAXXZ.catch", i32 1, i8* bitcast (%rtti.TypeDescriptor2* @"\01??_R0M@8" to i8*), i32 1, i8* (i8*, i8*)* @"\01?test@@YAXXZ.catch1")
 ; CHECK:   indirectbr i8* [[RECOVER]], [label %try.cont, label %try.cont10]
 
 lpad:                                             ; preds = %entry
@@ -135,13 +134,10 @@ eh.resume:                                        ; %catch.dispatch3
 ; CHECK: }
 }
 
-; CHECK: define internal i8* @"\01?test@@YAXXZ.catch"(i8*, i8*) {
+; CHECK: define internal i8* @"\01?test@@YAXXZ.catch"(i8*, i8*)
 ; CHECK: entry:
 ; CHECK:   [[RECOVER_I:\%.+]] = call i8* @llvm.framerecover(i8* bitcast (void ()* @"\01?test@@YAXXZ" to i8*), i8* %1, i32 0)
 ; CHECK:   [[I_PTR:\%.+]] = bitcast i8* [[RECOVER_I]] to i32*
-; ------------================= FAIL here =================------------
-; CHECK:   [[RECOVER_F:\%.+]] = call i8* @llvm.framerecover(i8* bitcast (void ()* @"\01?test@@YAXXZ" to i8*), i8* %1, i32 1)
-; CHECK:   [[F_PTR:\%.+]] = bitcast i8* [[RECOVER_F]] to float*
 ; CHECK:   [[TMP1:\%.+]] = load i32, i32* [[I_PTR]], align 4
 ; CHECK:   invoke void @"\01?handle_int@@YAXH@Z"(i32 [[TMP1]])
 ; CHECK:           to label %invoke.cont2 unwind label %[[LPAD1_LABEL:lpad[0-9]*]]
@@ -152,13 +148,12 @@ eh.resume:                                        ; %catch.dispatch3
 ; CHECK: [[LPAD1_LABEL]]:{{[ ]+}}; preds = %entry
 ; CHECK:   [[LPAD1_VAL:\%.+]] = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__CxxFrameHandler3 to i8*)
 ; CHECK:           catch i8* bitcast (%rtti.TypeDescriptor2* @"\01??_R0M@8" to i8*)
-; ------------================= FAIL here =================------------
-; CHECK:   [[RECOVER1:\%.+]] = call i8* (...)* @llvm.eh.actions({ i8*, i32 } [[LPAD1_VAL]], i32 1, i8* bitcast (%rtti.TypeDescriptor2* @"\01??_R0M@8" to i8*), float* [[F_PTR]], i8* (i8*, i8*)* @"\01?test@@YAXXZ.catch1")
+; CHECK:   [[RECOVER1:\%.+]] = call i8* (...) @llvm.eh.actions(i32 1, i8* bitcast (%rtti.TypeDescriptor2* @"\01??_R0M@8" to i8*), i32 1, i8* (i8*, i8*)* @"\01?test@@YAXXZ.catch1")
 ; CHECK:   indirectbr i8* [[RECOVER1]], []
 ;
 ; CHECK: }
 
-; CHECK: define internal i8* @"\01?test@@YAXXZ.catch1"(i8*, i8*) {
+; CHECK: define internal i8* @"\01?test@@YAXXZ.catch1"(i8*, i8*)
 ; CHECK: entry:
 ; CHECK:   [[RECOVER_F1:\%.+]] = call i8* @llvm.framerecover(i8* bitcast (void ()* @"\01?test@@YAXXZ" to i8*), i8* %1, i32 1)
 ; CHECK:   [[F_PTR1:\%.+]] = bitcast i8* [[RECOVER_F1]] to float*
diff --git a/test/CodeGen/WinEH/cppeh-nested-2.ll b/test/CodeGen/WinEH/cppeh-nested-2.ll
index 3479c41..ebcd3c9 100644
--- a/test/CodeGen/WinEH/cppeh-nested-2.ll
+++ b/test/CodeGen/WinEH/cppeh-nested-2.ll
@@ -38,13 +38,13 @@ target triple = "x86_64-pc-windows-msvc"
 @_ZTIi = external constant i8*
 
 ; The function entry should be rewritten like this.
-; CHECK: define void @_Z4testv() #0 {
+; CHECK: define void @_Z4testv()
 ; CHECK: entry:
 ; CHECK:   %outer = alloca %class.Outer, align 1
 ; CHECK:   %inner = alloca %class.Inner, align 1
 ; CHECK:   %i = alloca i32, align 4
 ; CHECK:   %f = alloca float, align 4
-; CHECK:   call void (...)* @llvm.frameescape(float* %f, i32* %i, %class.Outer* %outer, %class.Inner* %inner)
+; CHECK:   call void (...) @llvm.frameescape(float* %f, i32* %i, %class.Outer* %outer, %class.Inner* %inner)
 ; CHECK:   invoke void @_ZN5OuterC1Ev(%class.Outer* %outer)
 ; CHECK:           to label %invoke.cont unwind label %[[LPAD_LABEL:lpad[0-9]*]]
 
@@ -93,7 +93,7 @@ invoke.cont5:                                     ; preds = %invoke.cont4
 ; CHECK: [[LPAD_LABEL]]:
 ; CHECK:   landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__CxxFrameHandler3 to i8*)
 ; CHECK-NEXT:           catch i8* bitcast (i8** @_ZTIf to i8*)
-; CHECK-NEXT:   [[RECOVER:\%.+]] = call i8* (...)* @llvm.eh.actions(i32 1, i8* bitcast (i8** @_ZTIf to i8*), float* %f, i8* (i8*, i8*)* @_Z4testv.catch)
+; CHECK-NEXT:   [[RECOVER:\%.+]] = call i8* (...) @llvm.eh.actions(i32 1, i8* bitcast (i8** @_ZTIf to i8*), i32 0, i8* (i8*, i8*)* @_Z4testv.catch)
 ; CHECK-NEXT:   indirectbr i8* [[RECOVER]], [label %try.cont19]
 
 lpad:                                             ; preds = %try.cont, %entry
@@ -110,10 +110,10 @@ lpad:                                             ; preds = %try.cont, %entry
 ; CHECK-NEXT:           cleanup
 ; CHECK-NEXT:           catch i8* bitcast (i8** @_ZTIi to i8*)
 ; CHECK-NEXT:           catch i8* bitcast (i8** @_ZTIf to i8*)
-; CHECK-NEXT:   [[RECOVER1:\%.+]] = call i8* (...)* @llvm.eh.actions(
-; CHECK-SAME:       i32 1, i8* bitcast (i8** @_ZTIi to i8*), i32* %i, i8* (i8*, i8*)* @_Z4testv.catch1,
+; CHECK-NEXT:   [[RECOVER1:\%.+]] = call i8* (...) @llvm.eh.actions(
+; CHECK-SAME:       i32 1, i8* bitcast (i8** @_ZTIi to i8*), i32 1, i8* (i8*, i8*)* @_Z4testv.catch1,
 ; CHECK-SAME:       i32 0, void (i8*, i8*)* @_Z4testv.cleanup,
-; CHECK-SAME:       i32 1, i8* bitcast (i8** @_ZTIf to i8*), float* %f, i8* (i8*, i8*)* @_Z4testv.catch)
+; CHECK-SAME:       i32 1, i8* bitcast (i8** @_ZTIf to i8*), i32 0, i8* (i8*, i8*)* @_Z4testv.catch)
 ; CHECK-NEXT:   indirectbr i8* [[RECOVER1]], [label %try.cont, label %try.cont19]
 
 lpad1:                                            ; preds = %invoke.cont4, %invoke.cont
@@ -132,11 +132,11 @@ lpad1:                                            ; preds = %invoke.cont4, %invo
 ; CHECK-NEXT:           cleanup
 ; CHECK-NEXT:           catch i8* bitcast (i8** @_ZTIi to i8*)
 ; CHECK-NEXT:           catch i8* bitcast (i8** @_ZTIf to i8*)
-; CHECK-NEXT:   [[RECOVER3:\%.+]] = call i8* (...)* @llvm.eh.actions(
+; CHECK-NEXT:   [[RECOVER3:\%.+]] = call i8* (...) @llvm.eh.actions(
 ; CHECK-SAME:       i32 0, void (i8*, i8*)* @_Z4testv.cleanup2,
-; CHECK-SAME:       i32 1, i8* bitcast (i8** @_ZTIi to i8*), i32* %i, i8* (i8*, i8*)* @_Z4testv.catch1,
+; CHECK-SAME:       i32 1, i8* bitcast (i8** @_ZTIi to i8*), i32 1, i8* (i8*, i8*)* @_Z4testv.catch1,
 ; CHECK-SAME:       i32 0, void (i8*, i8*)* @_Z4testv.cleanup,
-; CHECK-SAME:       i32 1, i8* bitcast (i8** @_ZTIf to i8*), float* %f, i8* (i8*, i8*)* @_Z4testv.catch)
+; CHECK-SAME:       i32 1, i8* bitcast (i8** @_ZTIf to i8*), i32 0, i8* (i8*, i8*)* @_Z4testv.catch)
 ; CHECK-NEXT:   indirectbr i8* [[RECOVER3]], [label %try.cont, label %try.cont19]
 
 lpad3:                                            ; preds = %invoke.cont2
@@ -241,7 +241,7 @@ eh.resume:                                        ; preds = %catch.dispatch11
 }
 
 ; This catch handler should be outlined.
-; CHECK: define internal i8* @_Z4testv.catch(i8*, i8*) {
+; CHECK: define internal i8* @_Z4testv.catch(i8*, i8*)
 ; CHECK: entry:
 ; CHECK:   [[RECOVER_F:\%.+]] = call i8* @llvm.framerecover(i8* bitcast (void ()* @_Z4testv to i8*), i8* %1, i32 0)
 ; CHECK:   [[F_PTR:\%.+]] = bitcast i8* [[RECOVER_F]] to float*
@@ -251,7 +251,7 @@ eh.resume:                                        ; preds = %catch.dispatch11
 ; CHECK: }
 
 ; This catch handler should be outlined.
-; CHECK: define internal i8* @_Z4testv.catch1(i8*, i8*) {
+; CHECK: define internal i8* @_Z4testv.catch1(i8*, i8*)
 ; CHECK: entry:
 ; CHECK:   [[RECOVER_I:\%.+]] = call i8* @llvm.framerecover(i8* bitcast (void ()* @_Z4testv to i8*), i8* %1, i32 1)
 ; CHECK:   [[I_PTR:\%.+]] = bitcast i8* [[RECOVER_I]] to i32*
@@ -268,7 +268,7 @@ eh.resume:                                        ; preds = %catch.dispatch11
 ; CHECK: }
 
 ; This cleanup handler should be outlined.
-; CHECK: define internal void @_Z4testv.cleanup(i8*, i8*) {
+; CHECK: define internal void @_Z4testv.cleanup(i8*, i8*)
 ; CHECK: entry:
 ; CHECK:   [[RECOVER_OUTER:\%.+]] = call i8* @llvm.framerecover(i8* bitcast (void ()* @_Z4testv to i8*), i8* %1, i32 2)
 ; CHECK:   [[OUTER_PTR:\%.+]] = bitcast i8* [[RECOVER_OUTER]] to %class.Outer*
@@ -277,7 +277,7 @@ eh.resume:                                        ; preds = %catch.dispatch11
 ; CHECK: }
 
 ; This cleanup handler should be outlined.
-; CHECK: define internal void @_Z4testv.cleanup2(i8*, i8*) {
+; CHECK: define internal void @_Z4testv.cleanup2(i8*, i8*)
 ; CHECK: entry:
 ; CHECK:   [[RECOVER_INNER:\%.+]] = call i8* @llvm.framerecover(i8* bitcast (void ()* @_Z4testv to i8*), i8* %1, i32 3)
 ; CHECK:   [[INNER_PTR:\%.+]] = bitcast i8* [[RECOVER_INNER]] to %class.Inner*
diff --git a/test/CodeGen/WinEH/cppeh-nested-3.ll b/test/CodeGen/WinEH/cppeh-nested-3.ll
index d28fa4d..4e33c55 100644
--- a/test/CodeGen/WinEH/cppeh-nested-3.ll
+++ b/test/CodeGen/WinEH/cppeh-nested-3.ll
@@ -1,5 +1,4 @@
 ; RUN: opt -mtriple=x86_64-pc-windows-msvc -winehprepare -S -o - < %s | FileCheck %s
-; XFAIL: *
 
 ; This test is based on the following code:
 ;
@@ -37,14 +36,12 @@ $"\01??_R0H@8" = comdat any
 @"\01??_R0M@8" = linkonce_odr global %rtti.TypeDescriptor2 { i8** @"\01??_7type_info@@6B@", i8* null, [3 x i8] c".M\00" }, comdat
 @"\01??_R0H@8" = linkonce_odr global %rtti.TypeDescriptor2 { i8** @"\01??_7type_info@@6B@", i8* null, [3 x i8] c".H\00" }, comdat
 
-; CHECK: define void @"\01?test@@YAXXZ"() #0 {
+; CHECK: define void @"\01?test@@YAXXZ"()
 ; CHECK: entry:
 ; CHECK:   %i = alloca i32, align 4
-; ------------================= FAIL here =================------------
 ; CHECK:   %j = alloca i32, align 4
 ; CHECK:   %f = alloca float, align 4
-; ------------================= FAIL here =================------------
-; CHECK:   call void (...)* @llvm.frameescape(i32* %i, float* %f, int32* %j)
+; CHECK:   call void (...) @llvm.frameescape(i32* %i, float* %f, i32* %j)
 ; CHECK:   invoke void @"\01?may_throw@@YAXXZ"()
 ; CHECK:           to label %invoke.cont unwind label %[[LPAD_LABEL:lpad[0-9]*]]
 
@@ -66,7 +63,7 @@ invoke.cont:                                      ; preds = %entry
 ; CHECK:   landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__CxxFrameHandler3 to i8*)
 ; CHECK:           catch i8* bitcast (%rtti.TypeDescriptor2* @"\01??_R0H@8" to i8*)
 ; CHECK:           catch i8* bitcast (%rtti.TypeDescriptor2* @"\01??_R0M@8" to i8*)
-; CHECK:   [[RECOVER:\%.+]] = call i8* (...)* @llvm.eh.actions(i32 0, i8* bitcast (%rtti.TypeDescriptor2* @"\01??_R0H@8" to i8*), i32* %i, i8* (i8*, i8*)* @"\01?test@@YAXXZ.catch" to i8*), i32 1, i8* bitcast (%rtti.TypeDescriptor2* @"\01??_R0M@8" to i8*), float* %f, i8* bitcast (i8* (i8*, i8*)* @"\01?test@@YAXXZ.catch1")
+; CHECK:   [[RECOVER:\%.+]] = call i8* (...) @llvm.eh.actions(i32 1, i8* bitcast (%rtti.TypeDescriptor2* @"\01??_R0H@8" to i8*), i32 0, i8* (i8*, i8*)* @"\01?test@@YAXXZ.catch", i32 1, i8* bitcast (%rtti.TypeDescriptor2* @"\01??_R0M@8" to i8*), i32 1, i8* (i8*, i8*)* @"\01?test@@YAXXZ.catch1")
 ; CHECK:   indirectbr i8* [[RECOVER]], [label %try.cont10, label %try.cont19]
 
 lpad:                                             ; preds = %entry
@@ -182,20 +179,14 @@ eh.resume:                                        ; preds = %lpad16, %catch.disp
 ; CHECK: }
 }
 
-; CHECK: define internal i8* @"\01?test@@YAXXZ.catch"(i8*, i8*) {
+; CHECK: define internal i8* @"\01?test@@YAXXZ.catch"(i8*, i8*)
 ; CHECK: entry:
 ; CHECK:   [[RECOVER_I:\%.+]] = call i8* @llvm.framerecover(i8* bitcast (void ()* @"\01?test@@YAXXZ" to i8*), i8* %1, i32 0)
 ; CHECK:   [[I_PTR:\%.+]] = bitcast i8* [[RECOVER_I]] to i32*
-; ------------================= FAIL here =================------------
-; CHECK:   [[RECOVER_F:\%.+]] = call i8* @llvm.framerecover(i8* bitcast (void ()* @"\01?test@@YAXXZ" to i8*), i8* %1, i32 1)
-; CHECK:   [[F_PTR:\%.+]] = bitcast i8* [[RECOVER_F]] to float*
-; ------------================= FAIL here =================------------
-; CHECK:   [[RECOVER_J:\%.+]] = call i8* @llvm.framerecover(i8* bitcast (void ()* @"\01?test@@YAXXZ" to i8*), i8* %1, i32 2)
-; CHECK:   [[J_PTR:\%.+]] = bitcast i8* [[RECOVER_I]] to i32*
 ; CHECK:   invoke void @"\01?may_throw@@YAXXZ"()
 ; CHECK:           to label %invoke.cont2 unwind label %[[LPAD1_LABEL:lpad[0-9]*]]
 ;
-; CHECK: invoke.cont2:                                     ; preds = %entry
+; CHECK: invoke.cont2:                                     ; preds = %[[LPAD1_LABEL]], %entry
 ; CHECK:   [[TMP1:\%.+]] = load i32, i32* [[I_PTR]], align 4
 ; CHECK:   invoke void @"\01?handle_int@@YAXH@Z"(i32 [[TMP1]])
 ; CHECK:           to label %invoke.cont9 unwind label %[[LPAD8_LABEL:lpad[0-9]*]]
@@ -204,9 +195,8 @@ eh.resume:                                        ; preds = %lpad16, %catch.disp
 ; CHECK:   [[LPAD1_VAL:\%.+]] = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__CxxFrameHandler3 to i8*)
 ; CHECK:           catch i8* bitcast (%rtti.TypeDescriptor2* @"\01??_R0H@8" to i8*)
 ; CHECK:           catch i8* bitcast (%rtti.TypeDescriptor2* @"\01??_R0M@8" to i8*)
-; ------------================= FAIL here =================------------
-; CHECK:   [[RECOVER1:\%.+]] = call i8* (...)* @llvm.eh.actions({ i8*, i32 } [[LPAD1_VAL]], i32 0, i8* bitcast (%rtti.TypeDescriptor2* @"\01??_R0H@8" to i8*), i32* [[J_PTR]], i8* (i8*, i8*)* @"\01?test@@YAXXZ.catch2" to i8*), i32 1, i8* bitcast (%rtti.TypeDescriptor2* @"\01??_R0M@8" to i8*), float* [[F_PTR1]], i8* bitcast (i8* (i8*, i8*)* @"\01?test@@YAXXZ.catch1")
-; CHECK:   indirectbr i8* [[RECOVER1]], []
+; CHECK:   [[RECOVER1:\%.+]] = call i8* (...) @llvm.eh.actions(i32 1, i8* bitcast (%rtti.TypeDescriptor2* @"\01??_R0H@8" to i8*), i32 2, i8* (i8*, i8*)* @"\01?test@@YAXXZ.catch2", i32 1, i8* bitcast (%rtti.TypeDescriptor2* @"\01??_R0M@8" to i8*), i32 1, i8* (i8*, i8*)* @"\01?test@@YAXXZ.catch1")
+; CHECK:   indirectbr i8* [[RECOVER1]], [label %invoke.cont2]
 ;
 ; CHECK: invoke.cont9:
 ; CHECK:   ret i8* blockaddress(@"\01?test@@YAXXZ", %try.cont10)
@@ -214,30 +204,28 @@ eh.resume:                                        ; preds = %lpad16, %catch.disp
 ; CHECK: [[LPAD8_LABEL]]:{{[ ]+}}; preds = %invoke.cont2
 ; CHECK:   [[LPAD8_VAL:\%.+]] = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__CxxFrameHandler3 to i8*)
 ; CHECK:           catch i8* bitcast (%rtti.TypeDescriptor2* @"\01??_R0M@8" to i8*)
-; ------------================= FAIL here =================------------
-; CHECK:   [[RECOVER2:\%.+]] = call i8* (...)* @llvm.eh.actions({ i8*, i32 } [[LPAD8_VAL]], i32 1, i8* bitcast (%rtti.TypeDescriptor2* @"\01??_R0M@8" to i8*), float* [[F_PTR1]], i8* (i8*, i8*)* @"\01?test@@YAXXZ.catch1")
+; CHECK:   [[RECOVER2:\%.+]] = call i8* (...) @llvm.eh.actions(i32 1, i8* bitcast (%rtti.TypeDescriptor2* @"\01??_R0M@8" to i8*), i32 1, i8* (i8*, i8*)* @"\01?test@@YAXXZ.catch1")
 ; CHECK:   indirectbr i8* [[RECOVER2]], []
 ;
 ; CHECK: }
 
-; CHECK: define internal i8* @"\01?test@@YAXXZ.catch1"(i8*, i8*) {
+; CHECK: define internal i8* @"\01?test@@YAXXZ.catch1"(i8*, i8*)
 ; CHECK: entry:
-; CHECK:   [[RECOVER_F1:\%.+]] = call i8* @llvm.framerecover(i8* bitcast (void ()* @"\01?test@@YAXXZ" to i8*), i8* %1, i32 1)
-; CHECK:   [[F_PTR1:\%.+]] = bitcast i8* [[RECOVER_F1]] to float*
-; CHECK:   [[TMP2:\%.+]] = load float, float* [[F_PTR1]], align 4
+; CHECK:   [[RECOVER_F:\%.+]] = call i8* @llvm.framerecover(i8* bitcast (void ()* @"\01?test@@YAXXZ" to i8*), i8* %1, i32 1)
+; CHECK:   [[F_PTR:\%.+]] = bitcast i8* [[RECOVER_F]] to float*
+; CHECK:   [[TMP2:\%.+]] = load float, float* [[F_PTR]], align 4
 ; CHECK:   call void @"\01?handle_float@@YAXM@Z"(float [[TMP2]])
 ; CHECK:   ret i8* blockaddress(@"\01?test@@YAXXZ", %try.cont19)
 ; CHECK: }
 
-; CHECK: define internal i8* @"\01?test@@YAXXZ.catch2"(i8*, i8*) {
+; CHECK: define internal i8* @"\01?test@@YAXXZ.catch2"(i8*, i8*)
 ; CHECK: entry:
-; ------------================= FAIL here =================------------
-; SHOULD-CHECK:   [[J_PTR1:\%.+]] = call i8* @llvm.framerecover(i8* bitcast (void ()* @"\01?test@@YAXXZ" to i8*), i8* %1, i32 2)
+; CHECK:   [[RECOVER_J:\%.+]] = call i8* @llvm.framerecover(i8* bitcast (void ()* @"\01?test@@YAXXZ" to i8*), i8* %1, i32 2)
+; CHECK:   [[J_PTR:\%.+]] = bitcast i8* [[RECOVER_J]] to i32*
 ; CHECK:   [[RECOVER_I1:\%.+]] = call i8* @llvm.framerecover(i8* bitcast (void ()* @"\01?test@@YAXXZ" to i8*), i8* %1, i32 0)
 ; CHECK:   [[I_PTR1:\%.+]] = bitcast i8* [[RECOVER_I1]] to i32*
-; CHECK:   [[TMP3:\%.+]] = load i32, i32* [[J_PTR1]], align 4
+; CHECK:   [[TMP3:\%.+]] = load i32, i32* [[J_PTR]], align 4
 ; CHECK:   store i32 [[TMP3]], i32* [[I_PTR1]]
-; ------------================= FAIL here =================------------
 ; CHECK:   ret i8* blockaddress(@"\01?test@@YAXXZ.catch", %invoke.cont2)
 ; CHECK: }
 
diff --git a/test/CodeGen/WinEH/cppeh-nonalloca-frame-values.ll b/test/CodeGen/WinEH/cppeh-nonalloca-frame-values.ll
index 41d9006..de873d1 100644
--- a/test/CodeGen/WinEH/cppeh-nonalloca-frame-values.ll
+++ b/test/CodeGen/WinEH/cppeh-nonalloca-frame-values.ll
@@ -51,10 +51,12 @@ $"\01??_R0H@8" = comdat any
 @"\01??_R0H@8" = linkonce_odr global %rtti.TypeDescriptor2 { i8** @"\01??_7type_info@@6B@", i8* null, [3 x i8] c".H\00" }, comdat
 
 ; The function entry should be rewritten like this.
-; CHECK: define void @"\01?test@@YAXXZ"() #0 {
+; CHECK: define void @"\01?test@@YAXXZ"()
 ; CHECK: entry:
 ; CHECK:   [[NUMEXCEPTIONS_REGMEM:\%.+]] = alloca i32
 ; CHECK:   [[I_REGMEM:\%.+]] = alloca i32
+; CHECK:   [[A_REGMEM:\%.+]] = alloca i32*
+; CHECK:   [[B_REGMEM:\%.+]] = alloca i32*
 ; CHECK:   [[E_PTR:\%.+]] = alloca i32, align 4
 ; CHECK:   [[EXCEPTIONVAL:\%.+]] = alloca [10 x i32], align 16
 ; CHECK:   [[DATA_PTR:\%.+]] = alloca i64, align 8
@@ -62,15 +64,13 @@ $"\01??_R0H@8" = comdat any
 ; CHECK:   [[TMP:\%.+]] = bitcast [10 x i32]* [[EXCEPTIONVAL]] to i8*
 ; CHECK:   call void @llvm.lifetime.start(i64 40, i8* [[TMP]])
 ; CHECK:   store i64 0, i64* [[DATA_PTR]], align 8
-; CHECK:   [[A_REGMEM:\%.+]] = alloca i32*
 ; CHECK:   [[A_PTR:\%.+]] = bitcast i64* [[DATA_PTR]] to i32*
 ; CHECK:   store i32* [[A_PTR]], i32** [[A_REGMEM]]
 ; CHECK:   [[B_PTR:\%.+]] = getelementptr inbounds %struct.SomeData, %struct.SomeData* [[TMPCAST]], i64 0, i32 1
-; CHECK:   [[B_REGMEM:\%.+]] = alloca i32*
 ; CHECK:   store i32* [[B_PTR]], i32** [[B_REGMEM]]
 ; CHECK:   store i32 0, i32* [[NUMEXCEPTIONS_REGMEM]]
 ; CHECK:   store i32 0, i32* [[I_REGMEM]]
-; CHECK:   call void (...)* @llvm.frameescape(i32* %e, i32* %NumExceptions.020.reg2mem, [10 x i32]* [[EXCEPTIONVAL]], i32* [[I_REGMEM]], i32** [[A_REGMEM]], i32** [[B_REGMEM]])
+; CHECK:   call void (...) @llvm.frameescape(i32* %e, i32* %NumExceptions.020.reg2mem, [10 x i32]* [[EXCEPTIONVAL]], i32* [[I_REGMEM]], i32** [[A_REGMEM]], i32** [[B_REGMEM]])
 ; CHECK:   br label %for.body
 
 ; Function Attrs: uwtable
@@ -114,7 +114,7 @@ invoke.cont:                                      ; preds = %for.body
 ; CHECK: [[LPAD_LABEL:lpad[0-9]*]]:{{[ ]+}}; preds = %for.body
 ; CHECK:   landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__CxxFrameHandler3 to i8*)
 ; CHECK-NEXT:           catch i8* bitcast (%rtti.TypeDescriptor2* @"\01??_R0H@8" to i8*)
-; CHECK-NEXT:   [[RECOVER:\%.+]] = call i8* (...)* @llvm.eh.actions(i32 1, i8* bitcast (%rtti.TypeDescriptor2* @"\01??_R0H@8" to i8*), i32* %e, i8* (i8*, i8*)* @"\01?test@@YAXXZ.catch")
+; CHECK-NEXT:   [[RECOVER:\%.+]] = call i8* (...) @llvm.eh.actions(i32 1, i8* bitcast (%rtti.TypeDescriptor2* @"\01??_R0H@8" to i8*), i32 0, i8* (i8*, i8*)* @"\01?test@@YAXXZ.catch")
 ; CHECK-NEXT:   indirectbr i8* [[RECOVER]], [label %try.cont]
 
 lpad:                                             ; preds = %for.body
@@ -190,7 +190,7 @@ eh.resume:                                        ; preds = %lpad
 }
 
 ; The following catch handler should be outlined.
-; CHECK: define internal i8* @"\01?test@@YAXXZ.catch"(i8*, i8*) {
+; CHECK: define internal i8* @"\01?test@@YAXXZ.catch"(i8*, i8*)
 ; CHECK: entry:
 ; CHECK:   [[RECOVER_E:\%.+]] = call i8* @llvm.framerecover(i8* bitcast (void ()* @"\01?test@@YAXXZ" to i8*), i8* %1, i32 0)
 ; CHECK:   [[E_PTR:\%.+]] = bitcast i8* [[RECOVER_E]] to i32*
diff --git a/test/CodeGen/WinEH/cppeh-prepared-catch-all.ll b/test/CodeGen/WinEH/cppeh-prepared-catch-all.ll
new file mode 100644
index 0000000..f395d64
--- /dev/null
+++ b/test/CodeGen/WinEH/cppeh-prepared-catch-all.ll
@@ -0,0 +1,47 @@
+; RUN: llc < %s | FileCheck %s
+
+target datalayout = "e-m:w-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-pc-windows-msvc"
+
+; This test case is equivalent to:
+; extern "C" void may_throw();
+; extern "C" void test_catch_all() {
+;   try {
+;     may_throw();
+;   } catch (...) {
+;   }
+; }
+
+declare void @may_throw() #1
+declare i32 @__CxxFrameHandler3(...)
+declare void @llvm.eh.begincatch(i8* nocapture, i8* nocapture) #2
+declare void @llvm.eh.endcatch() #2
+
+; Function Attrs: nounwind uwtable
+define void @test_catch_all() #0 {
+entry:
+  invoke void @may_throw()
+          to label %try.cont unwind label %lpad
+
+lpad:                                             ; preds = %entry
+  %0 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__CxxFrameHandler3 to i8*)
+          catch i8* null
+  %1 = extractvalue { i8*, i32 } %0, 0
+  tail call void @llvm.eh.begincatch(i8* %1, i8* null) #2
+  tail call void @llvm.eh.endcatch() #2
+  br label %try.cont
+
+try.cont:                                         ; preds = %entry, %lpad
+  ret void
+}
+
+; CHECK-LABEL: $handlerMap$0$test_catch_all:
+; CHECK:         .long   {{[0-9]+}}
+; CHECK:         .long   0
+; CHECK:         .long   0
+; CHECK:         .long   test_catch_all.catch@IMGREL
+; CHECK:         .long   .Ltest_catch_all.catch$parent_frame_offset
+
+attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nounwind }
diff --git a/test/CodeGen/WinEH/cppeh-prepared-catch-reordered.ll b/test/CodeGen/WinEH/cppeh-prepared-catch-reordered.ll
new file mode 100644
index 0000000..4946c6a
--- /dev/null
+++ b/test/CodeGen/WinEH/cppeh-prepared-catch-reordered.ll
@@ -0,0 +1,163 @@
+; RUN: llc < %s | FileCheck %s
+
+; Verify that we get the right frame escape label when the catch comes after the
+; parent function.
+
+; This test case is equivalent to:
+; int main() {
+;   try {
+;     throw 42;
+;   } catch (int e) {
+;     printf("e: %d\n", e);
+;   }
+; }
+
+target datalayout = "e-m:w-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-pc-windows-msvc"
+
+%rtti.TypeDescriptor2 = type { i8**, i8*, [3 x i8] }
+%eh.CatchableType = type { i32, i32, i32, i32, i32, i32, i32 }
+%eh.CatchableTypeArray.1 = type { i32, [1 x i32] }
+%eh.ThrowInfo = type { i32, i32, i32, i32 }
+%eh.CatchHandlerType = type { i32, i8* }
+
+$"\01??_R0H@8" = comdat any
+
+$"_CT??_R0H@84" = comdat any
+
+$_CTA1H = comdat any
+
+$_TI1H = comdat any
+
+$"\01??_C@_06PNOAJMHG@e?3?5?$CFd?6?$AA@" = comdat any
+
+@"\01??_7type_info@@6B@" = external constant i8*
+@"\01??_R0H@8" = linkonce_odr global %rtti.TypeDescriptor2 { i8** @"\01??_7type_info@@6B@", i8* null, [3 x i8] c".H\00" }, comdat
+@__ImageBase = external constant i8
+@"_CT??_R0H@84" = linkonce_odr unnamed_addr constant %eh.CatchableType { i32 1, i32 trunc (i64 sub nuw nsw (i64 ptrtoint (%rtti.TypeDescriptor2* @"\01??_R0H@8" to i64), i64 ptrtoint (i8* @__ImageBase to i64)) to i32), i32 0, i32 -1, i32 0, i32 4, i32 0 }, section ".xdata", comdat
+@_CTA1H = linkonce_odr unnamed_addr constant %eh.CatchableTypeArray.1 { i32 1, [1 x i32] [i32 trunc (i64 sub nuw nsw (i64 ptrtoint (%eh.CatchableType* @"_CT??_R0H@84" to i64), i64 ptrtoint (i8* @__ImageBase to i64)) to i32)] }, section ".xdata", comdat
+@_TI1H = linkonce_odr unnamed_addr constant %eh.ThrowInfo { i32 0, i32 0, i32 0, i32 trunc (i64 sub nuw nsw (i64 ptrtoint (%eh.CatchableTypeArray.1* @_CTA1H to i64), i64 ptrtoint (i8* @__ImageBase to i64)) to i32) }, section ".xdata", comdat
+@llvm.eh.handlertype.H.0 = private unnamed_addr constant %eh.CatchHandlerType { i32 0, i8* bitcast (%rtti.TypeDescriptor2* @"\01??_R0H@8" to i8*) }, section "llvm.metadata"
+@"\01??_C@_06PNOAJMHG@e?3?5?$CFd?6?$AA@" = linkonce_odr unnamed_addr constant [7 x i8] c"e: %d\0A\00", comdat, align 1
+
+declare void @_CxxThrowException(i8*, %eh.ThrowInfo*)
+
+; Function Attrs: uwtable
+define i32 @main() #1 {
+entry:
+  %tmp.i = alloca i32, align 4
+  %e = alloca i32, align 4
+  %0 = bitcast i32* %tmp.i to i8*
+  store i32 42, i32* %tmp.i, align 4, !tbaa !2
+  call void (...) @llvm.frameescape(i32* %e)
+  invoke void @_CxxThrowException(i8* %0, %eh.ThrowInfo* @_TI1H) #6
+          to label %.noexc unwind label %lpad1
+
+.noexc:                                           ; preds = %entry
+  unreachable
+
+lpad1:                                            ; preds = %entry
+  %1 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__CxxFrameHandler3 to i8*)
+          catch %eh.CatchHandlerType* @llvm.eh.handlertype.H.0
+  %recover = call i8* (...) @llvm.eh.actions(i32 1, i8* bitcast (%eh.CatchHandlerType* @llvm.eh.handlertype.H.0 to i8*), i32 0, i8* (i8*, i8*)* @main.catch)
+  indirectbr i8* %recover, [label %try.cont.split]
+
+try.cont.split:                                   ; preds = %lpad1
+  ret i32 0
+}
+
+; CHECK-LABEL: main:
+; CHECK:        .seh_handlerdata
+; CHECK:        .long   ($cppxdata$main)@IMGREL
+
+declare i32 @__CxxFrameHandler3(...)
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.eh.typeid.for(i8*) #2
+
+; Function Attrs: nounwind
+declare void @llvm.eh.begincatch(i8* nocapture, i8* nocapture) #3
+
+; Function Attrs: nounwind
+declare i32 @printf(i8* nocapture readonly, ...) #4
+
+; Function Attrs: nounwind
+declare void @llvm.eh.endcatch() #3
+
+; Function Attrs: nounwind
+declare void @llvm.lifetime.start(i64, i8* nocapture) #3
+
+; Function Attrs: nounwind
+declare i8* @llvm.eh.actions(...) #3
+
+define internal i8* @main.catch(i8*, i8*) #5 {
+entry:
+  %e.i8 = call i8* @llvm.framerecover(i8* bitcast (i32 ()* @main to i8*), i8* %1, i32 0)
+  %e = bitcast i8* %e.i8 to i32*
+  %2 = bitcast i32* %e to i8*
+  %3 = load i32, i32* %e, align 4, !tbaa !2
+  %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([7 x i8], [7 x i8]* @"\01??_C@_06PNOAJMHG@e?3?5?$CFd?6?$AA@", i64 0, i64 0), i32 %3)
+  invoke void @llvm.donothing()
+          to label %entry.split unwind label %stub
+
+entry.split:                                      ; preds = %entry
+  ret i8* blockaddress(@main, %try.cont.split)
+
+stub:                                             ; preds = %entry
+  %4 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__CxxFrameHandler3 to i8*)
+          cleanup
+  unreachable
+}
+
+; CHECK-LABEL: main.catch:
+; CHECK:        .seh_handlerdata
+; CHECK:        .long   ($cppxdata$main)@IMGREL
+
+; CHECK-NEXT: $cppxdata$main:
+; CHECK-NEXT:         .long   429065506
+; CHECK-NEXT:         .long   2
+; CHECK-NEXT:         .long   ($stateUnwindMap$main)@IMGREL
+; CHECK-NEXT:         .long   1
+; CHECK-NEXT:         .long   ($tryMap$main)@IMGREL
+; CHECK-NEXT:         .long   1
+; CHECK-NEXT:         .long   ($ip2state$main)@IMGREL
+; CHECK-NEXT:         .long   40
+; CHECK-NEXT:         .long   0
+; CHECK-NEXT:         .long   1
+
+; Make sure we get the right frame escape label.
+
+; CHECK: $handlerMap$0$main:
+; CHECK-NEXT:         .long   0
+; CHECK-NEXT:         .long   "??_R0H@8"@IMGREL
+; CHECK-NEXT:         .long   .Lmain$frame_escape_0
+; CHECK-NEXT:         .long   main.catch@IMGREL
+; CHECK-NEXT:         .long   .Lmain.catch$parent_frame_offset
+
+; Function Attrs: nounwind readnone
+declare void @llvm.donothing() #2
+
+; Function Attrs: nounwind
+declare void @llvm.frameescape(...) #3
+
+; Function Attrs: nounwind readnone
+declare i8* @llvm.framerecover(i8*, i8*, i32) #2
+
+attributes #0 = { noreturn uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "unsafe-fp-math"="false" "use-soft-float"="false" "wineh-parent"="main" }
+attributes #2 = { nounwind readnone }
+attributes #3 = { nounwind }
+attributes #4 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #5 = { "wineh-parent"="main" }
+attributes #6 = { noreturn }
+
+!llvm.module.flags = !{!0}
+!llvm.ident = !{!1}
+
+!0 = !{i32 1, !"PIC Level", i32 2}
+!1 = !{!"clang version 3.7.0 "}
+!2 = !{!3, !3, i64 0}
+!3 = !{!"int", !4, i64 0}
+!4 = !{!"omnipotent char", !5, i64 0}
+!5 = !{!"Simple C/C++ TBAA"}
+
diff --git a/test/CodeGen/WinEH/cppeh-prepared-catch.ll b/test/CodeGen/WinEH/cppeh-prepared-catch.ll
new file mode 100644
index 0000000..98b4afc
--- /dev/null
+++ b/test/CodeGen/WinEH/cppeh-prepared-catch.ll
@@ -0,0 +1,203 @@
+; RUN: llc < %s | FileCheck %s
+
+target datalayout = "e-m:w-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-pc-windows-msvc"
+
+; This test case is equivalent to:
+; void f() {
+;   try {
+;     try {
+;       may_throw();
+;     } catch (int &) {
+;       may_throw();
+;     }
+;     may_throw();
+;   } catch (double) {
+;   }
+; }
+
+
+%rtti.TypeDescriptor2 = type { i8**, i8*, [3 x i8] }
+%eh.CatchHandlerType = type { i32, i8* }
+
+$"\01??_R0N@8" = comdat any
+
+$"\01??_R0H@8" = comdat any
+
+@"\01??_7type_info@@6B@" = external constant i8*
+@"\01??_R0N@8" = linkonce_odr global %rtti.TypeDescriptor2 { i8** @"\01??_7type_info@@6B@", i8* null, [3 x i8] c".N\00" }, comdat
+@llvm.eh.handlertype.N.0 = private unnamed_addr constant %eh.CatchHandlerType { i32 0, i8* bitcast (%rtti.TypeDescriptor2* @"\01??_R0N@8" to i8*) }, section "llvm.metadata"
+@"\01??_R0H@8" = linkonce_odr global %rtti.TypeDescriptor2 { i8** @"\01??_7type_info@@6B@", i8* null, [3 x i8] c".H\00" }, comdat
+@llvm.eh.handlertype.H.8 = private unnamed_addr constant %eh.CatchHandlerType { i32 8, i8* bitcast (%rtti.TypeDescriptor2* @"\01??_R0H@8" to i8*) }, section "llvm.metadata"
+
+define internal i8* @"\01?f@@YAXXZ.catch"(i8*, i8*) #4 {
+entry:
+  %.i8 = call i8* @llvm.framerecover(i8* bitcast (void ()* @"\01?f@@YAXXZ" to i8*), i8* %1, i32 0)
+  %bc2 = bitcast i8* %.i8 to i32**
+  %bc3 = bitcast i32** %bc2 to i8*
+  invoke void @"\01?may_throw@@YAXXZ"()
+          to label %invoke.cont2 unwind label %lpad1
+
+invoke.cont2:                                     ; preds = %entry
+  ret i8* blockaddress(@"\01?f@@YAXXZ", %try.cont)
+
+lpad1:                                            ; preds = %entry
+  %lp4 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__CxxFrameHandler3 to i8*)
+          cleanup
+          catch %eh.CatchHandlerType* @llvm.eh.handlertype.N.0
+  %recover = call i8* (...) @llvm.eh.actions(i32 1, i8* bitcast (%eh.CatchHandlerType* @llvm.eh.handlertype.N.0 to i8*), i32 1, i8* (i8*, i8*)* @"\01?f@@YAXXZ.catch1")
+  indirectbr i8* %recover, [label %invoke.cont2]
+}
+
+; CHECK-LABEL: "?f@@YAXXZ.catch":
+; No code should be generated for the indirectbr.
+; CHECK-NOT: jmpq *
+; CHECK:        .seh_handlerdata
+; CHECK:        .long   ("$cppxdata$?f@@YAXXZ")@IMGREL
+
+
+define internal i8* @"\01?f@@YAXXZ.catch1"(i8*, i8*) #4 {
+entry:
+  %.i8 = call i8* @llvm.framerecover(i8* bitcast (void ()* @"\01?f@@YAXXZ" to i8*), i8* %1, i32 1)
+  %2 = bitcast i8* %.i8 to double*
+  %3 = bitcast double* %2 to i8*
+  invoke void (...)* @llvm.donothing()
+          to label %done unwind label %lpad
+
+done:
+  ret i8* blockaddress(@"\01?f@@YAXXZ", %try.cont8)
+
+lpad:                                             ; preds = %entry
+  %4 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__CxxFrameHandler3 to i8*)
+          cleanup
+  unreachable
+}
+
+; CHECK-LABEL: "?f@@YAXXZ.catch1":
+; No code should be generated for the indirectbr.
+; CHECK-NOT: jmpq *
+; CHECK: ".L?f@@YAXXZ.catch1$parent_frame_offset" = 16
+; CHECK:         movq    %rdx, 16(%rsp)
+; CHECK:        .seh_handlerdata
+; CHECK:        .long   ("$cppxdata$?f@@YAXXZ")@IMGREL
+
+define void @"\01?f@@YAXXZ"() #0 {
+entry:
+  %exn.slot = alloca i8*
+  %ehselector.slot = alloca i32
+  %0 = alloca i32*, align 8
+  %1 = alloca double, align 8
+  call void (...) @llvm.frameescape(i32** %0, double* %1)
+  invoke void @"\01?may_throw@@YAXXZ"()
+          to label %invoke.cont unwind label %lpad2
+
+invoke.cont:                                      ; preds = %entry
+  br label %try.cont
+
+lpad2:                                            ; preds = %entry
+  %2 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__CxxFrameHandler3 to i8*)
+          catch %eh.CatchHandlerType* @llvm.eh.handlertype.H.8
+          catch %eh.CatchHandlerType* @llvm.eh.handlertype.N.0
+  %recover = call i8* (...) @llvm.eh.actions(i32 1, i8* bitcast (%eh.CatchHandlerType* @llvm.eh.handlertype.H.8 to i8*), i32 0, i8* (i8*, i8*)* @"\01?f@@YAXXZ.catch", i32 1, i8* bitcast (%eh.CatchHandlerType* @llvm.eh.handlertype.N.0 to i8*), i32 1, i8* (i8*, i8*)* @"\01?f@@YAXXZ.catch1")
+  indirectbr i8* %recover, [label %try.cont, label %try.cont8]
+
+try.cont:                                         ; preds = %lpad2, %invoke.cont
+  invoke void @"\01?may_throw@@YAXXZ"()
+          to label %try.cont8 unwind label %lpad1
+
+lpad1:
+  %3 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__CxxFrameHandler3 to i8*)
+          catch %eh.CatchHandlerType* @llvm.eh.handlertype.N.0
+  %recover2 = call i8* (...) @llvm.eh.actions(i32 1, i8* bitcast (%eh.CatchHandlerType* @llvm.eh.handlertype.N.0 to i8*), i32 1, i8* (i8*, i8*)* @"\01?f@@YAXXZ.catch1")
+  indirectbr i8* %recover2, [label %try.cont8]
+
+try.cont8:                                        ; preds = %lpad2, %try.cont
+  ret void
+}
+
+; CHECK-LABEL: "?f@@YAXXZ":
+; No code should be generated for the indirectbr.
+; CHECK-NOT: jmpq *
+; CHECK:             .seh_handlerdata
+; CHECK-NEXT:        .long   ("$cppxdata$?f@@YAXXZ")@IMGREL
+; CHECK-NEXT:"$cppxdata$?f@@YAXXZ":
+; CHECK-NEXT:        .long   429065506
+; CHECK-NEXT:        .long   4
+; CHECK-NEXT:        .long   ("$stateUnwindMap$?f@@YAXXZ")@IMGREL
+; CHECK-NEXT:        .long   2
+; CHECK-NEXT:        .long   ("$tryMap$?f@@YAXXZ")@IMGREL
+; CHECK-NEXT:        .long   3
+; CHECK-NEXT:        .long   ("$ip2state$?f@@YAXXZ")@IMGREL
+; CHECK-NEXT:        .long   32
+; CHECK-NEXT:        .long   0
+; CHECK-NEXT:        .long   1
+; CHECK-NEXT:"$stateUnwindMap$?f@@YAXXZ":
+; CHECK-NEXT:        .long   -1
+; CHECK-NEXT:        .long   0
+; CHECK-NEXT:        .long   0
+; CHECK-NEXT:        .long   0
+; CHECK-NEXT:        .long   0
+; CHECK-NEXT:        .long   0
+; CHECK-NEXT:        .long   -1
+; CHECK-NEXT:        .long   0
+; CHECK-NEXT:"$tryMap$?f@@YAXXZ":
+; CHECK-NEXT:        .long   1
+; CHECK-NEXT:        .long   1
+; CHECK-NEXT:        .long   2
+; CHECK-NEXT:        .long   1
+; CHECK-NEXT:        .long   ("$handlerMap$0$?f@@YAXXZ")@IMGREL
+; CHECK-NEXT:        .long   0
+; CHECK-NEXT:        .long   2
+; CHECK-NEXT:        .long   3
+; CHECK-NEXT:        .long   1
+; CHECK-NEXT:        .long   ("$handlerMap$1$?f@@YAXXZ")@IMGREL
+; CHECK-NEXT:"$handlerMap$0$?f@@YAXXZ":
+; CHECK-NEXT:        .long   8
+; CHECK-NEXT:        .long   "??_R0H@8"@IMGREL
+; CHECK-NEXT:        .long   ".L?f@@YAXXZ$frame_escape_0"
+; CHECK-NEXT:        .long   "?f@@YAXXZ.catch"@IMGREL
+; CHECK-NEXT:        .long   ".L?f@@YAXXZ.catch$parent_frame_offset"
+; CHECK-NEXT:"$handlerMap$1$?f@@YAXXZ":
+; CHECK-NEXT:        .long   0
+; CHECK-NEXT:        .long   "??_R0N@8"@IMGREL
+; CHECK-NEXT:        .long   ".L?f@@YAXXZ$frame_escape_1"
+; CHECK-NEXT:        .long   "?f@@YAXXZ.catch1"@IMGREL
+; CHECK-NEXT:        .long   ".L?f@@YAXXZ.catch1$parent_frame_offset"
+; CHECK-NEXT:"$ip2state$?f@@YAXXZ":
+; CHECK-NEXT:        .long   .Ltmp0@IMGREL
+; CHECK-NEXT:        .long   0
+; CHECK-NEXT:        .long   .Ltmp13@IMGREL
+; CHECK-NEXT:        .long   1
+; CHECK-NEXT:        .long   .Ltmp16@IMGREL
+; CHECK-NEXT:        .long   0
+
+
+declare void @"\01?may_throw@@YAXXZ"() #1
+
+declare i32 @__CxxFrameHandler3(...)
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.eh.typeid.for(i8*) #2
+
+; Function Attrs: nounwind
+declare void @llvm.eh.begincatch(i8* nocapture, i8* nocapture) #3
+
+; Function Attrs: nounwind
+declare void @llvm.eh.endcatch() #3
+
+; Function Attrs: nounwind
+declare i8* @llvm.eh.actions(...) #3
+
+; Function Attrs: nounwind
+declare void @llvm.frameescape(...) #3
+
+; Function Attrs: nounwind readnone
+declare i8* @llvm.framerecover(i8*, i8*, i32) #2
+
+declare void @llvm.donothing(...)
+
+attributes #0 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" "wineh-parent"="?f@@YAXXZ" }
+attributes #1 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nounwind readnone }
+attributes #3 = { nounwind }
+attributes #4 = { "wineh-parent"="?f@@YAXXZ" }
diff --git a/test/CodeGen/WinEH/cppeh-prepared-cleanups.ll b/test/CodeGen/WinEH/cppeh-prepared-cleanups.ll
new file mode 100644
index 0000000..b958589
--- /dev/null
+++ b/test/CodeGen/WinEH/cppeh-prepared-cleanups.ll
@@ -0,0 +1,241 @@
+; RUN: llc < %s | FileCheck %s
+
+target datalayout = "e-m:w-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-pc-windows-msvc"
+
+%rtti.TypeDescriptor2 = type { i8**, i8*, [3 x i8] }
+%eh.CatchableType = type { i32, i32, i32, i32, i32, i32, i32 }
+%eh.CatchableTypeArray.1 = type { i32, [1 x i32] }
+%eh.ThrowInfo = type { i32, i32, i32, i32 }
+%struct.S = type { i8 }
+
+$"\01??_DS@@QEAA@XZ" = comdat any
+
+$"\01??_R0H@8" = comdat any
+
+$"_CT??_R0H@84" = comdat any
+
+$_CTA1H = comdat any
+
+$_TI1H = comdat any
+
+@"\01??_7type_info@@6B@" = external constant i8*
+@"\01??_R0H@8" = linkonce_odr global %rtti.TypeDescriptor2 { i8** @"\01??_7type_info@@6B@", i8* null, [3 x i8] c".H\00" }, comdat
+@__ImageBase = external constant i8
+@"_CT??_R0H@84" = linkonce_odr unnamed_addr constant %eh.CatchableType { i32 1, i32 trunc (i64 sub nuw nsw (i64 ptrtoint (%rtti.TypeDescriptor2* @"\01??_R0H@8" to i64), i64 ptrtoint (i8* @__ImageBase to i64)) to i32), i32 0, i32 -1, i32 0, i32 4, i32 0 }, section ".xdata", comdat
+@_CTA1H = linkonce_odr unnamed_addr constant %eh.CatchableTypeArray.1 { i32 1, [1 x i32] [i32 trunc (i64 sub nuw nsw (i64 ptrtoint (%eh.CatchableType* @"_CT??_R0H@84" to i64), i64 ptrtoint (i8* @__ImageBase to i64)) to i32)] }, section ".xdata", comdat
+@_TI1H = linkonce_odr unnamed_addr constant %eh.ThrowInfo { i32 0, i32 0, i32 0, i32 trunc (i64 sub nuw nsw (i64 ptrtoint (%eh.CatchableTypeArray.1* @_CTA1H to i64), i64 ptrtoint (i8* @__ImageBase to i64)) to i32) }, section ".xdata", comdat
+
+
+; CHECK-LABEL: "?test1@@YAXXZ":
+; CHECK:             .seh_handlerdata
+; CHECK-NEXT:        .long   ("$cppxdata$?test1@@YAXXZ")@IMGREL
+; CHECK-NEXT:"$cppxdata$?test1@@YAXXZ":
+; CHECK-NEXT:        .long   429065506
+; CHECK-NEXT:        .long   1
+; CHECK-NEXT:        .long   ("$stateUnwindMap$?test1@@YAXXZ")@IMGREL
+; CHECK-NEXT:        .long   0
+; CHECK-NEXT:        .long   0
+; CHECK-NEXT:        .long   1
+; CHECK-NEXT:        .long   ("$ip2state$?test1@@YAXXZ")@IMGREL
+; CHECK-NEXT:        .long   32
+; CHECK-NEXT:        .long   0
+; CHECK-NEXT:        .long   1
+; CHECK-NEXT:"$stateUnwindMap$?test1@@YAXXZ":
+; CHECK-NEXT:        .long   -1
+; CHECK-NEXT:        .long   "?test1@@YAXXZ.cleanup"@IMGREL
+; CHECK-NEXT:"$ip2state$?test1@@YAXXZ":
+; CHECK-NEXT:        .long   .Ltmp0@IMGREL
+; CHECK-NEXT:        .long   0
+
+define void @"\01?test1@@YAXXZ"() #0 {
+entry:
+  %unwindhelp = alloca i64
+  %tmp = alloca i32, align 4
+  %exn.slot = alloca i8*
+  %ehselector.slot = alloca i32
+  store i32 0, i32* %tmp
+  %0 = bitcast i32* %tmp to i8*
+  call void (...) @llvm.frameescape()
+  store volatile i64 -2, i64* %unwindhelp
+  %1 = bitcast i64* %unwindhelp to i8*
+  call void @llvm.eh.unwindhelp(i8* %1)
+  invoke void @_CxxThrowException(i8* %0, %eh.ThrowInfo* @_TI1H) #8
+          to label %unreachable unwind label %lpad1
+
+lpad1:                                            ; preds = %entry
+  %2 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__CxxFrameHandler3 to i8*)
+          cleanup
+  %recover = call i8* (...) @llvm.eh.actions(i32 0, void (i8*, i8*)* @"\01?test1@@YAXXZ.cleanup")
+  indirectbr i8* %recover, []
+
+unreachable:                                      ; preds = %entry
+  unreachable
+}
+
+declare void @_CxxThrowException(i8*, %eh.ThrowInfo*)
+
+declare i32 @__CxxFrameHandler3(...)
+
+; Function Attrs: nounwind
+define linkonce_odr void @"\01??_DS@@QEAA@XZ"(%struct.S* %this) unnamed_addr #1 comdat align 2 {
+entry:
+  %this.addr = alloca %struct.S*, align 8
+  store %struct.S* %this, %struct.S** %this.addr, align 8
+  %this1 = load %struct.S*, %struct.S** %this.addr
+  call void @"\01??1S@@QEAA@XZ"(%struct.S* %this1) #4
+  ret void
+}
+
+; CHECK-LABEL: "?test2@@YAX_N@Z":
+; CHECK:             .seh_handlerdata
+; CHECK-NEXT:        .long   ("$cppxdata$?test2@@YAX_N@Z")@IMGREL
+; CHECK-NEXT:"$cppxdata$?test2@@YAX_N@Z":
+; CHECK-NEXT:        .long   429065506
+; CHECK-NEXT:        .long   2
+; CHECK-NEXT:        .long   ("$stateUnwindMap$?test2@@YAX_N@Z")@IMGREL
+; CHECK-NEXT:        .long   0
+; CHECK-NEXT:        .long   0
+; CHECK-NEXT:        .long   4
+; CHECK-NEXT:        .long   ("$ip2state$?test2@@YAX_N@Z")@IMGREL
+; CHECK-NEXT:        .long   40
+; CHECK-NEXT:        .long   0
+; CHECK-NEXT:        .long   1
+; CHECK-NEXT:"$stateUnwindMap$?test2@@YAX_N@Z":
+; CHECK-NEXT:        .long   -1
+; CHECK-NEXT:        .long   "?test2@@YAX_N@Z.cleanup"@IMGREL
+; CHECK-NEXT:        .long   0
+; CHECK-NEXT:        .long   "?test2@@YAX_N@Z.cleanup1"@IMGREL
+; CHECK-NEXT:"$ip2state$?test2@@YAX_N@Z":
+; CHECK-NEXT:        .long   .Lfunc_begin1@IMGREL
+; CHECK-NEXT:        .long   -1
+; CHECK-NEXT:        .long   .Ltmp7@IMGREL
+; CHECK-NEXT:        .long   0
+; CHECK-NEXT:        .long   .Ltmp9@IMGREL
+; CHECK-NEXT:        .long   1
+; CHECK-NEXT:        .long   .Ltmp12@IMGREL
+; CHECK-NEXT:        .long   0
+
+define void @"\01?test2@@YAX_N@Z"(i1 zeroext %b) #2 {
+  %b.addr = alloca i8, align 1
+  %s = alloca %struct.S, align 1
+  %exn.slot = alloca i8*
+  %ehselector.slot = alloca i32
+  %s1 = alloca %struct.S, align 1
+  %frombool = zext i1 %b to i8
+  store i8 %frombool, i8* %b.addr, align 1
+  call void (...) @llvm.frameescape(%struct.S* %s, %struct.S* %s1)
+  call void @"\01?may_throw@@YAXXZ"()
+  invoke void @"\01?may_throw@@YAXXZ"()
+          to label %invoke.cont unwind label %lpad1
+
+invoke.cont:                                      ; preds = %entry
+  %1 = load i8, i8* %b.addr, align 1
+  %tobool = trunc i8 %1 to i1
+  br i1 %tobool, label %if.then, label %if.else
+
+if.then:                                          ; preds = %invoke.cont
+  invoke void @"\01?may_throw@@YAXXZ"()
+          to label %invoke.cont3 unwind label %lpad3
+
+invoke.cont3:                                     ; preds = %if.then
+  call void @"\01??_DS@@QEAA@XZ"(%struct.S* %s1) #4
+  br label %if.end
+
+lpad1:                                            ; preds = %entry, %if.end
+  %2 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__CxxFrameHandler3 to i8*)
+          cleanup
+  %recover = call i8* (...) @llvm.eh.actions(i32 0, void (i8*, i8*)* @"\01?test2@@YAX_N@Z.cleanup")
+  indirectbr i8* %recover, []
+
+lpad3:                                            ; preds = %if.then
+  %3 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__CxxFrameHandler3 to i8*)
+          cleanup
+  %recover4 = call i8* (...) @llvm.eh.actions(i32 0, void (i8*, i8*)* @"\01?test2@@YAX_N@Z.cleanup1", i32 0, void (i8*, i8*)* @"\01?test2@@YAX_N@Z.cleanup")
+  indirectbr i8* %recover4, []
+
+if.else:                                          ; preds = %invoke.cont
+  call void @"\01?dont_throw@@YAXXZ"() #4
+  br label %if.end
+
+if.end:                                           ; preds = %if.else, %invoke.cont3
+  invoke void @"\01?may_throw@@YAXXZ"()
+          to label %invoke.cont4 unwind label %lpad1
+
+invoke.cont4:                                     ; preds = %if.end
+  call void @"\01??_DS@@QEAA@XZ"(%struct.S* %s) #4
+  ret void
+}
+
+declare void @"\01?may_throw@@YAXXZ"() #3
+
+; Function Attrs: nounwind
+declare void @"\01?dont_throw@@YAXXZ"() #1
+
+; Function Attrs: nounwind
+declare void @"\01??1S@@QEAA@XZ"(%struct.S*) #1
+
+; Function Attrs: nounwind
+declare i8* @llvm.eh.actions(...) #4
+
+define internal void @"\01?test1@@YAXXZ.cleanup"(i8*, i8*) #5 {
+entry:
+  %s = alloca %struct.S, align 1
+  call void @"\01??_DS@@QEAA@XZ"(%struct.S* %s) #4
+  ret void
+}
+
+; Function Attrs: nounwind
+declare void @llvm.frameescape(...) #4
+
+; Function Attrs: nounwind readnone
+declare i8* @llvm.framerecover(i8*, i8*, i32) #6
+
+; Function Attrs: nounwind
+declare void @llvm.eh.unwindhelp(i8*) #4
+
+define internal void @"\01?test2@@YAX_N@Z.cleanup"(i8*, i8*) #7 {
+entry:
+  %s.i8 = call i8* @llvm.framerecover(i8* bitcast (void (i1)* @"\01?test2@@YAX_N@Z" to i8*), i8* %1, i32 0)
+  %s = bitcast i8* %s.i8 to %struct.S*
+  call void @"\01??_DS@@QEAA@XZ"(%struct.S* %s) #4
+  invoke void @llvm.donothing()
+          to label %entry.split unwind label %stub
+
+entry.split:                                      ; preds = %entry
+  ret void
+
+stub:                                             ; preds = %entry
+  %2 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__CxxFrameHandler3 to i8*)
+          cleanup
+  unreachable
+}
+
+define internal void @"\01?test2@@YAX_N@Z.cleanup1"(i8*, i8*) #7 {
+entry:
+  %s1.i8 = call i8* @llvm.framerecover(i8* bitcast (void (i1)* @"\01?test2@@YAX_N@Z" to i8*), i8* %1, i32 1)
+  %s1 = bitcast i8* %s1.i8 to %struct.S*
+  call void @"\01??_DS@@QEAA@XZ"(%struct.S* %s1) #4
+  invoke void @llvm.donothing()
+          to label %entry.split unwind label %stub
+
+entry.split:                                      ; preds = %entry
+  ret void
+
+stub:                                             ; preds = %entry
+  %2 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__CxxFrameHandler3 to i8*)
+          cleanup
+  unreachable
+}
+
+declare void @llvm.donothing()
+
+attributes #0 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" "wineh-parent"="?test1@@YAXXZ" }
+attributes #1 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" "wineh-parent"="?test2@@YAX_N@Z" }
+attributes #3 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #4 = { nounwind }
+attributes #5 = { "wineh-parent"="?test1@@YAXXZ" }
+attributes #6 = { nounwind readnone }
+attributes #7 = { "wineh-parent"="?test2@@YAX_N@Z" }
+attributes #8 = { noreturn }
diff --git a/test/CodeGen/WinEH/seh-catch-all.ll b/test/CodeGen/WinEH/seh-catch-all.ll
new file mode 100644
index 0000000..fb2b9ba
--- /dev/null
+++ b/test/CodeGen/WinEH/seh-catch-all.ll
@@ -0,0 +1,59 @@
+; RUN: opt -S -winehprepare -sehprepare < %s | FileCheck %s
+
+target datalayout = "e-m:w-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-pc-windows-msvc"
+
+@str.__except = internal unnamed_addr constant [9 x i8] c"__except\00", align 1
+
+; Function Attrs: uwtable
+
+declare i32 @puts(i8*)
+
+define void @may_crash() {
+entry:
+  store volatile i32 42, i32* null, align 4
+  ret void
+}
+
+declare i32 @__C_specific_handler(...)
+
+; Function Attrs: nounwind readnone
+declare i8* @llvm.frameaddress(i32)
+
+; Function Attrs: uwtable
+define void @seh_catch_all() {
+entry:
+  %exn.slot = alloca i8*
+  %ehselector.slot = alloca i32
+  invoke void @may_crash()
+          to label %invoke.cont unwind label %lpad
+
+invoke.cont:                                      ; preds = %entry
+  br label %__try.cont
+
+lpad:                                             ; preds = %entry
+  %0 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__C_specific_handler to i8*)
+          catch i8* null
+  %1 = extractvalue { i8*, i32 } %0, 0
+  store i8* %1, i8** %exn.slot
+  %2 = extractvalue { i8*, i32 } %0, 1
+  store i32 %2, i32* %ehselector.slot
+  br label %__except
+
+__except:                                         ; preds = %lpad
+  %call = call i32 @puts(i8* getelementptr inbounds ([9 x i8], [9 x i8]* @str.__except, i32 0, i32 0))
+  br label %__try.cont
+
+__try.cont:                                       ; preds = %__except, %invoke.cont
+  ret void
+}
+
+; CHECK-LABEL: define void @seh_catch_all()
+; CHECK: landingpad
+; CHECK-NEXT: catch i8* null
+; CHECK-NEXT: call i8* (...) @llvm.eh.actions(i32 1, i8* null, i32 -1, i8* blockaddress(@seh_catch_all, %catch.all))
+; CHECK-NEXT: indirectbr
+;
+; CHECK: catch.all:
+; CHECK-NOT: extractvalue
+; CHECK: call i32 @puts
diff --git a/test/CodeGen/WinEH/seh-inlined-finally.ll b/test/CodeGen/WinEH/seh-inlined-finally.ll
new file mode 100644
index 0000000..2e6171a
--- /dev/null
+++ b/test/CodeGen/WinEH/seh-inlined-finally.ll
@@ -0,0 +1,35 @@
+; RUN: opt -S -winehprepare -sehprepare < %s | FileCheck %s
+
+; Check that things work when the mid-level optimizer inlines the finally
+; block.
+
+target datalayout = "e-m:w-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-pc-windows-msvc"
+
+declare i32 @puts(i8*)
+declare void @may_crash()
+declare i32 @__C_specific_handler(...)
+
+define void @use_finally() {
+entry:
+  invoke void @may_crash()
+          to label %invoke.cont unwind label %lpad
+
+invoke.cont:                                      ; preds = %entry
+  %call.i = tail call i32 @puts(i8* null)
+  ret void
+
+lpad:                                             ; preds = %entry
+  %0 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__C_specific_handler to i8*)
+          cleanup
+  %call.i2 = tail call i32 @puts(i8* null)
+  resume { i8*, i32 } %0
+}
+
+; CHECK-LABEL: define void @use_finally()
+; CHECK: invoke void @may_crash()
+;
+; CHECK: landingpad
+; CHECK-NEXT: cleanup
+; CHECK-NEXT: call i8* (...) @llvm.eh.actions(i32 0, void (i8*, i8*)* @use_finally.cleanup)
+; CHECK-NEXT: indirectbr i8* %recover, []
diff --git a/test/CodeGen/WinEH/seh-outlined-finally.ll b/test/CodeGen/WinEH/seh-outlined-finally.ll
new file mode 100644
index 0000000..bc9d322
--- /dev/null
+++ b/test/CodeGen/WinEH/seh-outlined-finally.ll
@@ -0,0 +1,155 @@
+; RUN: opt -S -winehprepare -sehprepare -mtriple=x86_64-windows-msvc < %s | FileCheck %s
+
+; Test case based on this code:
+;
+; extern "C" int _abnormal_termination();
+; #pragma intrinsic(_abnormal_termination)
+; extern "C" int printf(const char *, ...);
+; extern "C" void may_crash() {
+;   *(volatile int *)0 = 42;
+; }
+; int main() {
+;   int myres = 0;
+;   __try {
+;     __try {
+;       may_crash();
+;     } __finally {
+;       printf("inner finally %d\n", _abnormal_termination());
+;       may_crash();
+;     }
+;   } __finally {
+;     printf("outer finally %d\n", _abnormal_termination());
+;   }
+; }
+;
+; Note that if the inner finally crashes, the outer finally still runs. There
+; is nothing like a std::terminate call in this situation.
+
+target datalayout = "e-m:w-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-pc-windows-msvc"
+
+@str_outer_finally = linkonce_odr unnamed_addr constant [18 x i8] c"outer finally %d\0A\00", align 1
+@str_inner_finally = linkonce_odr unnamed_addr constant [18 x i8] c"inner finally %d\0A\00", align 1
+
+; Function Attrs: nounwind uwtable
+define void @may_crash() #0 {
+entry:
+  store volatile i32 42, i32* null, align 4
+  ret void
+}
+
+; Function Attrs: uwtable
+define i32 @main() #1 {
+entry:
+  %myres = alloca i32, align 4
+  %exn.slot = alloca i8*
+  %ehselector.slot = alloca i32
+  store i32 0, i32* %myres, align 4
+  invoke void @may_crash() #4
+          to label %invoke.cont unwind label %lpad
+
+invoke.cont:                                      ; preds = %entry
+  %0 = call i8* @llvm.frameaddress(i32 0)
+  invoke void @"\01?fin$1@0@main@@"(i1 zeroext false, i8* %0) #4
+          to label %invoke.cont2 unwind label %lpad1
+
+invoke.cont2:                                     ; preds = %invoke.cont
+  %1 = call i8* @llvm.frameaddress(i32 0)
+  call void @"\01?fin$0@0@main@@"(i1 zeroext false, i8* %1)
+  ret i32 0
+
+lpad:                                             ; preds = %entry
+  %2 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__C_specific_handler to i8*)
+          cleanup
+  %3 = extractvalue { i8*, i32 } %2, 0
+  store i8* %3, i8** %exn.slot
+  %4 = extractvalue { i8*, i32 } %2, 1
+  store i32 %4, i32* %ehselector.slot
+  %5 = call i8* @llvm.frameaddress(i32 0)
+  invoke void @"\01?fin$1@0@main@@"(i1 zeroext true, i8* %5) #4
+          to label %invoke.cont3 unwind label %lpad1
+
+lpad1:                                            ; preds = %lpad, %invoke.cont
+  %6 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__C_specific_handler to i8*)
+          cleanup
+  %7 = extractvalue { i8*, i32 } %6, 0
+  store i8* %7, i8** %exn.slot
+  %8 = extractvalue { i8*, i32 } %6, 1
+  store i32 %8, i32* %ehselector.slot
+  br label %ehcleanup
+
+invoke.cont3:                                     ; preds = %lpad
+  br label %ehcleanup
+
+ehcleanup:                                        ; preds = %invoke.cont3, %lpad1
+  %9 = call i8* @llvm.frameaddress(i32 0)
+  call void @"\01?fin$0@0@main@@"(i1 zeroext true, i8* %9)
+  br label %eh.resume
+
+eh.resume:                                        ; preds = %ehcleanup
+  %exn = load i8*, i8** %exn.slot
+  %sel = load i32, i32* %ehselector.slot
+  %lpad.val = insertvalue { i8*, i32 } undef, i8* %exn, 0
+  %lpad.val4 = insertvalue { i8*, i32 } %lpad.val, i32 %sel, 1
+  resume { i8*, i32 } %lpad.val4
+}
+
+; CHECK-NOT: define internal void @
+
+; CHECK-LABEL: define i32 @main()
+; CHECK: invoke void @may_crash()
+;
+; CHECK: landingpad { i8*, i32 }
+; CHECK-NEXT: cleanup
+; CHECK-NEXT: call i8* (...) @llvm.eh.actions(i32 0, void (i1, i8*)* @"\01?fin$1@0@main@@", i32 0, void (i1, i8*)* @"\01?fin$0@0@main@@")
+; CHECK-NEXT: indirectbr
+;
+; CHECK: landingpad { i8*, i32 }
+; CHECK-NEXT: cleanup
+; CHECK-NEXT: call i8* (...) @llvm.eh.actions(i32 0, void (i1, i8*)* @"\01?fin$0@0@main@@")
+; CHECK-NEXT: indirectbr
+
+; There should not be any *new* cleanup helpers, just the existing ones.
+; CHECK-NOT: define internal void @
+; CHECK: define internal void @"\01?fin$0@0@main@@"(i1 zeroext %abnormal_termination, i8* %frame_pointer)
+; CHECK-NOT: define internal void @
+; CHECK: define internal void @"\01?fin$1@0@main@@"(i1 zeroext %abnormal_termination, i8* %frame_pointer)
+; CHECK-NOT: define internal void @
+
+define internal void @"\01?fin$0@0@main@@"(i1 zeroext %abnormal_termination, i8* %frame_pointer) #2 {
+entry:
+  %frame_pointer.addr = alloca i8*, align 8
+  %abnormal_termination.addr = alloca i8, align 1
+  store i8* %frame_pointer, i8** %frame_pointer.addr, align 8
+  %frombool = zext i1 %abnormal_termination to i8
+  store i8 %frombool, i8* %abnormal_termination.addr, align 1
+  %0 = zext i1 %abnormal_termination to i32
+  %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([18 x i8], [18 x i8]* @str_outer_finally, i32 0, i32 0), i32 %0)
+  ret void
+}
+
+declare i32 @printf(i8*, ...) #2
+
+define internal void @"\01?fin$1@0@main@@"(i1 zeroext %abnormal_termination, i8* %frame_pointer) #2 {
+entry:
+  %frame_pointer.addr = alloca i8*, align 8
+  %abnormal_termination.addr = alloca i8, align 1
+  store i8* %frame_pointer, i8** %frame_pointer.addr, align 8
+  %frombool = zext i1 %abnormal_termination to i8
+  store i8 %frombool, i8* %abnormal_termination.addr, align 1
+  %0 = zext i1 %abnormal_termination to i32
+  %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([18 x i8], [18 x i8]* @str_inner_finally, i32 0, i32 0), i32 %0)
+  call void @may_crash()
+  ret void
+}
+
+declare i32 @__C_specific_handler(...)
+
+; Function Attrs: nounwind readnone
+declare i8* @llvm.frameaddress(i32) #3
+
+attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #3 = { nounwind readnone }
+attributes #4 = { noinline }
diff --git a/test/CodeGen/WinEH/seh-simple.ll b/test/CodeGen/WinEH/seh-simple.ll
index 26b3f83..344a0c8 100644
--- a/test/CodeGen/WinEH/seh-simple.ll
+++ b/test/CodeGen/WinEH/seh-simple.ll
@@ -39,7 +39,7 @@ eh.resume:
 ; CHECK-LABEL: define i32 @simple_except_store()
 ; CHECK: landingpad { i8*, i32 }
 ; CHECK-NEXT: catch i32 ()* @filt
-; CHECK-NEXT: call i8* (...)* @llvm.eh.actions(i32 1, i8* bitcast (i32 ()* @filt to i8*), i8* null, i8* blockaddress(@simple_except_store, %__except))
+; CHECK-NEXT: call i8* (...) @llvm.eh.actions(i32 1, i8* bitcast (i32 ()* @filt to i8*), i32 -1, i8* blockaddress(@simple_except_store, %__except))
 ; CHECK-NEXT: indirectbr {{.*}} [label %__except]
 
 define i32 @catch_all() {
@@ -63,7 +63,7 @@ return:
 ; CHECK-LABEL: define i32 @catch_all()
 ; CHECK: landingpad { i8*, i32 }
 ; CHECK-NEXT: catch i8* null
-; CHECK-NEXT: call i8* (...)* @llvm.eh.actions(i32 1, i8* null, i8* null, i8* blockaddress(@catch_all, %catch.all))
+; CHECK-NEXT: call i8* (...) @llvm.eh.actions(i32 1, i8* null, i32 -1, i8* blockaddress(@catch_all, %catch.all))
 ; CHECK-NEXT: indirectbr {{.*}} [label %catch.all]
 ;
 ; CHECK: catch.all:
@@ -94,7 +94,7 @@ eh.resume:
 ; CHECK-LABEL: define i32 @except_phi()
 ; CHECK: landingpad { i8*, i32 }
 ; CHECK-NEXT: catch i32 ()* @filt
-; CHECK-NEXT: call i8* (...)* @llvm.eh.actions(i32 1, i8* bitcast (i32 ()* @filt to i8*), i8* null, i8* blockaddress(@except_phi, %return))
+; CHECK-NEXT: call i8* (...) @llvm.eh.actions(i32 1, i8* bitcast (i32 ()* @filt to i8*), i32 -1, i8* blockaddress(@except_phi, %return))
 ; CHECK-NEXT: indirectbr {{.*}} [label %return]
 ;
 ; CHECK: return:
@@ -128,9 +128,9 @@ eh.resume:
 ; CHECK: landingpad { i8*, i32 }
 ; CHECK-NEXT: cleanup
 ; CHECK-NEXT: catch i32 ()* @filt
-; CHECK-NEXT: call i8* (...)* @llvm.eh.actions(
+; CHECK-NEXT: call i8* (...) @llvm.eh.actions(
 ; CHECK: i32 0, void (i8*, i8*)* @cleanup_and_except.cleanup,
-; CHECK: i32 1, i8* bitcast (i32 ()* @filt to i8*), i8* null, i8* blockaddress(@cleanup_and_except, %return))
+; CHECK: i32 1, i8* bitcast (i32 ()* @filt to i8*), i32 -1, i8* blockaddress(@cleanup_and_except, %return))
 ; CHECK-NEXT: indirectbr {{.*}} [label %return]
 ;
 ; CHECK: return:
diff --git a/test/CodeGen/X86/2006-10-13-CycleInDAG.ll b/test/CodeGen/X86/2006-10-13-CycleInDAG.ll
index 664da5e..c45469d 100644
--- a/test/CodeGen/X86/2006-10-13-CycleInDAG.ll
+++ b/test/CodeGen/X86/2006-10-13-CycleInDAG.ll
@@ -4,7 +4,7 @@
 define void @test() {
 bb.i:
 	%tmp.i660 = load <4 x float>, <4 x float>* null		; <<4 x float>> [#uses=1]
-	call void (i32, ...)* @printf( i32 0, i8* getelementptr ([18 x i8], [18 x i8]* @str, i32 0, i64 0), double 0.000000e+00, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00 )
+	call void (i32, ...) @printf( i32 0, i8* getelementptr ([18 x i8], [18 x i8]* @str, i32 0, i64 0), double 0.000000e+00, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00 )
 	%tmp152.i = load <4 x i32>, <4 x i32>* null		; <<4 x i32>> [#uses=1]
 	%tmp156.i = bitcast <4 x i32> %tmp152.i to <4 x i32>		; <<4 x i32>> [#uses=1]
 	%tmp175.i = bitcast <4 x float> %tmp.i660 to <4 x i32>		; <<4 x i32>> [#uses=1]
diff --git a/test/CodeGen/X86/2006-10-19-SwitchUnnecessaryBranching.ll b/test/CodeGen/X86/2006-10-19-SwitchUnnecessaryBranching.ll
index 6b062d5..dd67064 100644
--- a/test/CodeGen/X86/2006-10-19-SwitchUnnecessaryBranching.ll
+++ b/test/CodeGen/X86/2006-10-19-SwitchUnnecessaryBranching.ll
@@ -15,11 +15,11 @@ entry:
 	]
 
 bb:		; preds = %entry
-	%tmp1 = tail call i32 (i8*, ...)* @printf( i8* getelementptr ([14 x i8], [14 x i8]* @str, i32 0, i64 0) )		; <i32> [#uses=0]
+	%tmp1 = tail call i32 (i8*, ...) @printf( i8* getelementptr ([14 x i8], [14 x i8]* @str, i32 0, i64 0) )		; <i32> [#uses=0]
 	ret i32 0
 
 bb2:		; preds = %entry
-	%tmp4 = tail call i32 (i8*, ...)* @printf( i8* getelementptr ([13 x i8], [13 x i8]* @str.upgrd.1, i32 0, i64 0) )		; <i32> [#uses=0]
+	%tmp4 = tail call i32 (i8*, ...) @printf( i8* getelementptr ([13 x i8], [13 x i8]* @str.upgrd.1, i32 0, i64 0) )		; <i32> [#uses=0]
 	ret i32 0
 
 UnifiedReturnBlock:		; preds = %entry
diff --git a/test/CodeGen/X86/2006-11-12-CSRetCC.ll b/test/CodeGen/X86/2006-11-12-CSRetCC.ll
index 9adfff3..b6a8fc0 100644
--- a/test/CodeGen/X86/2006-11-12-CSRetCC.ll
+++ b/test/CodeGen/X86/2006-11-12-CSRetCC.ll
@@ -51,7 +51,7 @@ entry:
         %tmp20 = getelementptr { double, double }, { double, double }* %z, i64 0, i32 0             ; <double*> [#uses=1]
         %tmp21 = load double, double* %tmp20            ; <double> [#uses=1]
         %tmp.upgrd.6 = getelementptr [9 x i8], [9 x i8]* @str, i32 0, i64 0               ; <i8*> [#uses=1]
-        %tmp.upgrd.7 = call i32 (i8*, ...)* @printf( i8* %tmp.upgrd.6, double %tmp21, double %tmp19 )           ; <i32> [#uses=0]
+        %tmp.upgrd.7 = call i32 (i8*, ...) @printf( i8* %tmp.upgrd.6, double %tmp21, double %tmp19 )           ; <i32> [#uses=0]
         br label %finish
 finish:
         %retval.upgrd.8 = load i32, i32* %retval             ; <i32> [#uses=1]
diff --git a/test/CodeGen/X86/2006-12-19-IntelSyntax.ll b/test/CodeGen/X86/2006-12-19-IntelSyntax.ll
index f81b303..2c3c5c9 100644
--- a/test/CodeGen/X86/2006-12-19-IntelSyntax.ll
+++ b/test/CodeGen/X86/2006-12-19-IntelSyntax.ll
@@ -21,55 +21,55 @@ entry:
 	]
 
 bb:		; preds = %entry
-	call void (...)* @foo1( )
+	call void (...) @foo1( )
 	ret void
 
 bb1:		; preds = %entry
-	call void (...)* @foo2( )
+	call void (...) @foo2( )
 	ret void
 
 bb2:		; preds = %entry
-	call void (...)* @foo6( )
+	call void (...) @foo6( )
 	ret void
 
 bb3:		; preds = %entry
-	call void (...)* @foo3( )
+	call void (...) @foo3( )
 	ret void
 
 bb4:		; preds = %entry
-	call void (...)* @foo4( )
+	call void (...) @foo4( )
 	ret void
 
 bb5:		; preds = %entry
-	call void (...)* @foo5( )
+	call void (...) @foo5( )
 	ret void
 
 bb6:		; preds = %entry
-	call void (...)* @foo1( )
+	call void (...) @foo1( )
 	ret void
 
 bb7:		; preds = %entry
-	call void (...)* @foo2( )
+	call void (...) @foo2( )
 	ret void
 
 bb8:		; preds = %entry
-	call void (...)* @foo6( )
+	call void (...) @foo6( )
 	ret void
 
 bb9:		; preds = %entry
-	call void (...)* @foo3( )
+	call void (...) @foo3( )
 	ret void
 
 bb10:		; preds = %entry
-	call void (...)* @foo4( )
+	call void (...) @foo4( )
 	ret void
 
 bb11:		; preds = %entry
-	call void (...)* @foo5( )
+	call void (...) @foo5( )
 	ret void
 
 bb12:		; preds = %entry
-	call void (...)* @foo6( )
+	call void (...) @foo6( )
 	ret void
 }
 
diff --git a/test/CodeGen/X86/2007-02-16-BranchFold.ll b/test/CodeGen/X86/2007-02-16-BranchFold.ll
index 596021a..22e0a4e 100644
--- a/test/CodeGen/X86/2007-02-16-BranchFold.ll
+++ b/test/CodeGen/X86/2007-02-16-BranchFold.ll
@@ -60,7 +60,7 @@ bb.i9.i.i932.ce:		; preds = %newFuncRoot
 	%tmp1.i6.i = getelementptr %struct.operator, %struct.operator* %tmp66.i62.i, i32 0, i32 2		; <i32*> [#uses=1]
 	%tmp2.i7.i = load i32, i32* %tmp1.i6.i		; <i32> [#uses=1]
 	%tmp3.i8.i = load %struct.FILE*, %struct.FILE** @outfile		; <%struct.FILE*> [#uses=1]
-	%tmp5.i9.i = call i32 (%struct.FILE*, i8*, ...)* @fprintf( %struct.FILE* %tmp3.i8.i, i8* getelementptr ([11 x i8], [11 x i8]* @str1, i32 0, i32 0), i32 %tmp2.i7.i )		; <i32> [#uses=0]
+	%tmp5.i9.i = call i32 (%struct.FILE*, i8*, ...) @fprintf( %struct.FILE* %tmp3.i8.i, i8* getelementptr ([11 x i8], [11 x i8]* @str1, i32 0, i32 0), i32 %tmp2.i7.i )		; <i32> [#uses=0]
 	%tmp7.i10.i = getelementptr %struct.operator, %struct.operator* %tmp66.i62.i, i32 0, i32 5		; <i32*> [#uses=1]
 	%tmp8.i11.i = load i32, i32* %tmp7.i10.i		; <i32> [#uses=7]
 	br label %NodeBlock5
diff --git a/test/CodeGen/X86/2007-02-19-LiveIntervalAssert.ll b/test/CodeGen/X86/2007-02-19-LiveIntervalAssert.ll
index 5d2c01a..a9b85b9 100644
--- a/test/CodeGen/X86/2007-02-19-LiveIntervalAssert.ll
+++ b/test/CodeGen/X86/2007-02-19-LiveIntervalAssert.ll
@@ -7,7 +7,7 @@
 
 define void @__eprintf(i8* %string, i8* %expression, i32 %line, i8* %filename) {
 	%tmp = load %struct._IO_FILE*, %struct._IO_FILE** @stderr
-	%tmp5 = tail call i32 (%struct._IO_FILE*, i8*, ...)* @fprintf( %struct._IO_FILE* %tmp, i8* %string, i8* %expression, i32 %line, i8* %filename )
+	%tmp5 = tail call i32 (%struct._IO_FILE*, i8*, ...) @fprintf( %struct._IO_FILE* %tmp, i8* %string, i8* %expression, i32 %line, i8* %filename )
 	%tmp6 = load %struct._IO_FILE*, %struct._IO_FILE** @stderr
 	%tmp7 = tail call i32 @fflush( %struct._IO_FILE* %tmp6 )
 	tail call void @abort( )
diff --git a/test/CodeGen/X86/2007-05-05-VecCastExpand.ll b/test/CodeGen/X86/2007-05-05-VecCastExpand.ll
index e6eaa57..0edf139 100644
--- a/test/CodeGen/X86/2007-05-05-VecCastExpand.ll
+++ b/test/CodeGen/X86/2007-05-05-VecCastExpand.ll
@@ -6,7 +6,7 @@
 define void @test() {
 bb.i:
 	%tmp.i660 = load <4 x float>, <4 x float>* null		; <<4 x float>> [#uses=1]
-	call void (i32, ...)* @printf( i32 0, i8* getelementptr ([18 x i8], [18 x i8]* @str, i32 0, i64 0), double 0.000000e+00, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00 )
+	call void (i32, ...) @printf( i32 0, i8* getelementptr ([18 x i8], [18 x i8]* @str, i32 0, i64 0), double 0.000000e+00, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00 )
 	%tmp152.i = load <4 x i32>, <4 x i32>* null		; <<4 x i32>> [#uses=1]
 	%tmp156.i = bitcast <4 x i32> %tmp152.i to <4 x i32>		; <<4 x i32>> [#uses=1]
 	%tmp175.i = bitcast <4 x float> %tmp.i660 to <4 x i32>		; <<4 x i32>> [#uses=1]
diff --git a/test/CodeGen/X86/2007-05-14-LiveIntervalAssert.ll b/test/CodeGen/X86/2007-05-14-LiveIntervalAssert.ll
index ecc5835..9ce5f5a 100644
--- a/test/CodeGen/X86/2007-05-14-LiveIntervalAssert.ll
+++ b/test/CodeGen/X86/2007-05-14-LiveIntervalAssert.ll
@@ -19,7 +19,7 @@ cond_true109:		; preds = %entry
 
 cond_next164:		; preds = %cond_true109
 	%tmp176 = call signext i16 @GetParamDesc( %struct.XDesc* null, i32 1701999219, i32 1413830740, %struct.XDesc* null ) 
-	call void (i64, i8*, ...)* @r_raise( i64 0, i8* null )
+	call void (i64, i8*, ...) @r_raise( i64 0, i8* null )
 	unreachable
 
 cond_true239:		; preds = %cond_true109
diff --git a/test/CodeGen/X86/2007-07-10-StackerAssert.ll b/test/CodeGen/X86/2007-07-10-StackerAssert.ll
index b19f445..c8660f7 100644
--- a/test/CodeGen/X86/2007-07-10-StackerAssert.ll
+++ b/test/CodeGen/X86/2007-07-10-StackerAssert.ll
@@ -30,7 +30,7 @@ cond_true425:		; preds = %bb383
 	%tmp432 = fsub float %tmp430, %tmp408		; <float> [#uses=1]
 	%tmp432433 = fpext float %tmp432 to double		; <double> [#uses=1]
 	%tmp434435 = fpext float %tmp408 to double		; <double> [#uses=1]
-	call void (i8*, ...)* @PR_LogPrint( i8* getelementptr ([56 x i8], [56 x i8]* @.str97, i32 0, i32 0), double 0.000000e+00, double %tmp434435, double %tmp432433 )
+	call void (i8*, ...) @PR_LogPrint( i8* getelementptr ([56 x i8], [56 x i8]* @.str97, i32 0, i32 0), double 0.000000e+00, double %tmp434435, double %tmp432433 )
 	ret i32 0
 
 cond_next443:		; preds = %bb383
diff --git a/test/CodeGen/X86/2007-10-15-CoalescerCrash.ll b/test/CodeGen/X86/2007-10-15-CoalescerCrash.ll
index f2ae922..c6eb6f0 100644
--- a/test/CodeGen/X86/2007-10-15-CoalescerCrash.ll
+++ b/test/CodeGen/X86/2007-10-15-CoalescerCrash.ll
@@ -362,7 +362,7 @@ bb1159:		; preds = %cond_next1150
 
 cond_true1169:		; preds = %bb1159
 	%tmp11741175 = trunc i64 %lsum.11225.0 to i32		; <i32> [#uses=1]
-	%tmp1178 = tail call i32 (%struct._IO_FILE*  , i8*  , ...)* @fprintf( %struct._IO_FILE* noalias %file  , i8* getelementptr ([49 x i8], [49 x i8]* @.str32, i32 0, i64 0)  , i32 %tmp11741175, i32 0 )		; <i32> [#uses=0]
+	%tmp1178 = tail call i32 (%struct._IO_FILE*  , i8*  , ...) @fprintf( %struct._IO_FILE* noalias %file  , i8* getelementptr ([49 x i8], [49 x i8]* @.str32, i32 0, i64 0)  , i32 %tmp11741175, i32 0 )		; <i32> [#uses=0]
 	ret void
 
 UnifiedReturnBlock:		; preds = %bb1159
diff --git a/test/CodeGen/X86/2007-11-04-LiveIntervalCrash.ll b/test/CodeGen/X86/2007-11-04-LiveIntervalCrash.ll
index 019c442..a20fb47 100644
--- a/test/CodeGen/X86/2007-11-04-LiveIntervalCrash.ll
+++ b/test/CodeGen/X86/2007-11-04-LiveIntervalCrash.ll
@@ -30,7 +30,7 @@ bb37:           ; preds = %bb37.loopexit, %entry
         %hash.0.reg2mem.1 = phi i32 [ %phitmp, %bb37.loopexit ], [ 0, %entry ]          ; <i32> [#uses=1]
         store i32 %hash.0.reg2mem.1, i32* null, align 8
         %tmp75 = tail call i32 null( %struct.dentry* %dir, %struct.qstr* %name )                ; <i32> [#uses=0]
-        %tmp84 = tail call i32 (...)* @d_lookup( %struct.dentry* %dir, %struct.qstr* %name )            ; <i32> [#uses=0]
+        %tmp84 = tail call i32 (...) @d_lookup( %struct.dentry* %dir, %struct.qstr* %name )            ; <i32> [#uses=0]
         ret %struct.dentry* null
 }
 
diff --git a/test/CodeGen/X86/2008-02-18-TailMergingBug.ll b/test/CodeGen/X86/2008-02-18-TailMergingBug.ll
index efb87f2..ef69bd0 100644
--- a/test/CodeGen/X86/2008-02-18-TailMergingBug.ll
+++ b/test/CodeGen/X86/2008-02-18-TailMergingBug.ll
@@ -213,7 +213,7 @@ bb456:		; preds = %bb448, %bb425, %bb417, %bb395, %bb385, %bb371
 	%tmp460461 = fpext float %iftmp.7.0 to double		; <double> [#uses=1]
 	%tmp462463 = fpext float %iftmp.14.0 to double		; <double> [#uses=1]
 	%tmp464465 = fpext float %iftmp.0.0 to double		; <double> [#uses=1]
-	%tmp467 = tail call i32 (i8*, ...)* @printf( i8* getelementptr ([48 x i8], [48 x i8]* @.str, i32 0, i32 0), double %tmp464465, double %tmp462463, double %tmp460461, double %tmp458459 ) nounwind 		; <i32> [#uses=0]
+	%tmp467 = tail call i32 (i8*, ...) @printf( i8* getelementptr ([48 x i8], [48 x i8]* @.str, i32 0, i32 0), double %tmp464465, double %tmp462463, double %tmp460461, double %tmp458459 ) nounwind 		; <i32> [#uses=0]
 	ret void
 }
 
diff --git a/test/CodeGen/X86/2008-04-09-BranchFolding.ll b/test/CodeGen/X86/2008-04-09-BranchFolding.ll
index a758fed..f21a6f3 100644
--- a/test/CodeGen/X86/2008-04-09-BranchFolding.ll
+++ b/test/CodeGen/X86/2008-04-09-BranchFolding.ll
@@ -39,7 +39,7 @@ bb226.i:		; preds = %bb73.i
 bb273.i:		; preds = %bb226.i
 	ret %struct.tree_node* null
 bb260:		; preds = %bb226.i
-	tail call void (i8*, i32, ...)* @pedwarn_with_file_and_line( i8* %file.0, i32 %line.0, i8* null ) nounwind 
+	tail call void (i8*, i32, ...) @pedwarn_with_file_and_line( i8* %file.0, i32 %line.0, i8* null ) nounwind 
 	ret %struct.tree_node* null
 bb344:		; preds = %bb174
 	ret %struct.tree_node* null
diff --git a/test/CodeGen/X86/2008-04-15-LiveVariableBug.ll b/test/CodeGen/X86/2008-04-15-LiveVariableBug.ll
index f83c990..b526591 100644
--- a/test/CodeGen/X86/2008-04-15-LiveVariableBug.ll
+++ b/test/CodeGen/X86/2008-04-15-LiveVariableBug.ll
@@ -43,7 +43,7 @@ entry:
 	%tmp105 = load %struct.NSArray*, %struct.NSArray** null, align 8		; <%struct.NSArray*> [#uses=1]
 	%tmp107 = load %struct.NSObject*, %struct.NSObject** null, align 8		; <%struct.NSObject*> [#uses=1]
 	call void null( %struct.NSObject* %tmp107, %struct._message_ref_t* @"\01L_OBJC_MESSAGE_REF_228", %struct.NSArray* %tmp105, i8 signext  0 )
-	%tmp111 = call %struct.NSObject* (%struct.NSObject*, %struct.objc_selector*, ...)* @objc_msgSend( %struct.NSObject* null, %struct.objc_selector* null, i32 0, i8* null )		; <%struct.NSObject*> [#uses=0]
+	%tmp111 = call %struct.NSObject* (%struct.NSObject*, %struct.objc_selector*, ...) @objc_msgSend( %struct.NSObject* null, %struct.objc_selector* null, i32 0, i8* null )		; <%struct.NSObject*> [#uses=0]
 	ret void
 }
 
diff --git a/test/CodeGen/X86/2008-05-12-tailmerge-5.ll b/test/CodeGen/X86/2008-05-12-tailmerge-5.ll
index df5ceb0..0669a32 100644
--- a/test/CodeGen/X86/2008-05-12-tailmerge-5.ll
+++ b/test/CodeGen/X86/2008-05-12-tailmerge-5.ll
@@ -64,7 +64,7 @@ entry:
 	br i1 %toBool, label %bb, label %bb27
 
 bb:		; preds = %entry
-	call void (...)* @abort( ) noreturn nounwind 
+	call void (...) @abort( ) noreturn nounwind 
 	unreachable
 
 bb27:		; preds = %entry
@@ -77,7 +77,7 @@ bb27:		; preds = %entry
 	br i1 %toBool33, label %bb34, label %bb35
 
 bb34:		; preds = %bb27
-	call void (...)* @abort( ) noreturn nounwind 
+	call void (...) @abort( ) noreturn nounwind 
 	unreachable
 
 bb35:		; preds = %bb27
@@ -98,7 +98,7 @@ bb35:		; preds = %bb27
 	br i1 %toBool49, label %bb50, label %bb51
 
 bb50:		; preds = %bb35
-	call void (...)* @abort( ) noreturn nounwind 
+	call void (...) @abort( ) noreturn nounwind 
 	unreachable
 
 bb51:		; preds = %bb35
@@ -119,7 +119,7 @@ bb51:		; preds = %bb35
 	br i1 %toBool65, label %bb66, label %bb67
 
 bb66:		; preds = %bb51
-	call void (...)* @abort( ) noreturn nounwind 
+	call void (...) @abort( ) noreturn nounwind 
 	unreachable
 
 bb67:		; preds = %bb51
@@ -132,7 +132,7 @@ bb67:		; preds = %bb51
 	br i1 %toBool73, label %bb74, label %bb75
 
 bb74:		; preds = %bb67
-	call void (...)* @abort( ) noreturn nounwind 
+	call void (...) @abort( ) noreturn nounwind 
 	unreachable
 
 bb75:		; preds = %bb67
diff --git a/test/CodeGen/X86/2008-07-16-CoalescerCrash.ll b/test/CodeGen/X86/2008-07-16-CoalescerCrash.ll
index 42752eb..a1b9d9d 100644
--- a/test/CodeGen/X86/2008-07-16-CoalescerCrash.ll
+++ b/test/CodeGen/X86/2008-07-16-CoalescerCrash.ll
@@ -26,7 +26,7 @@ bb31:		; preds = %bb6
 	br label %bb33
 
 bb33:		; preds = %bb31, %bb
-	tail call void (%struct.SV*, i8*, ...)* @Perl_sv_catpvf( %struct.SV* %dsv, i8* getelementptr ([8 x i8], [8 x i8]* @"\01LC25", i32 0, i64 0), i64 %0 ) nounwind 
+	tail call void (%struct.SV*, i8*, ...) @Perl_sv_catpvf( %struct.SV* %dsv, i8* getelementptr ([8 x i8], [8 x i8]* @"\01LC25", i32 0, i64 0), i64 %0 ) nounwind 
 	unreachable
 
 bb40:		; preds = %entry
diff --git a/test/CodeGen/X86/2008-08-06-CmpStride.ll b/test/CodeGen/X86/2008-08-06-CmpStride.ll
index 3a74b48..a030fbe 100644
--- a/test/CodeGen/X86/2008-08-06-CmpStride.ll
+++ b/test/CodeGen/X86/2008-08-06-CmpStride.ll
@@ -13,7 +13,7 @@ forbody:
         %sub14 = sub i32 1027, %i.0             ; <i32> [#uses=1]
         %mul15 = mul i32 %sub14, 10             ; <i32> [#uses=1]
         %add166 = or i32 %mul15, 1              ; <i32> [#uses=1] *
-        call i32 (i8*, ...)* @printf( i8* noalias  getelementptr ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i32 %add166 ) nounwind
+        call i32 (i8*, ...) @printf( i8* noalias  getelementptr ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i32 %add166 ) nounwind
         %inc = add i32 %i.0, 1          ; <i32> [#uses=3]
         %cmp = icmp ne i32 %inc, 1027          ; <i1> [#uses=1]
         br i1 %cmp, label %forbody, label %afterfor
diff --git a/test/CodeGen/X86/2008-08-31-EH_RETURN64.ll b/test/CodeGen/X86/2008-08-31-EH_RETURN64.ll
index d939207..2910902 100644
--- a/test/CodeGen/X86/2008-08-31-EH_RETURN64.ll
+++ b/test/CodeGen/X86/2008-08-31-EH_RETURN64.ll
@@ -37,7 +37,7 @@ if.then:                                          ; preds = %entry
   ret i32 0
 
 if.end:                                           ; preds = %entry
-  %call = tail call i32 (...)* @_Unwind_ForcedUnwind_Phase2() nounwind
+  %call = tail call i32 (...) @_Unwind_ForcedUnwind_Phase2() nounwind
   store i32 %call, i32* @a, align 4
   %tobool1 = icmp eq i32 %call, 0
   br i1 %tobool1, label %cond.end, label %cond.true
diff --git a/test/CodeGen/X86/2008-09-09-LinearScanBug.ll b/test/CodeGen/X86/2008-09-09-LinearScanBug.ll
index c80fbdd..9a1a3dd 100644
--- a/test/CodeGen/X86/2008-09-09-LinearScanBug.ll
+++ b/test/CodeGen/X86/2008-09-09-LinearScanBug.ll
@@ -58,7 +58,7 @@ ifend.i:		; preds = %lor_rhs.i
 safe_mod_int16_t_s_s.exit:		; preds = %ifend.i, %lor_rhs.i, %func_106.exit27
 	%call31 = phi i16 [ %conv8.i, %ifend.i ], [ %conv, %func_106.exit27 ], [ %conv, %lor_rhs.i ]		; <i16> [#uses=1]
 	%conv4 = sext i16 %call31 to i32		; <i32> [#uses=1]
-	%call5 = tail call i32 (...)* @func_104( i32 %conv4 )		; <i32> [#uses=0]
+	%call5 = tail call i32 (...) @func_104( i32 %conv4 )		; <i32> [#uses=0]
 	ret i32 undef
 }
 
diff --git a/test/CodeGen/X86/2008-09-11-CoalescerBug.ll b/test/CodeGen/X86/2008-09-11-CoalescerBug.ll
index 635194f..8c46bb3 100644
--- a/test/CodeGen/X86/2008-09-11-CoalescerBug.ll
+++ b/test/CodeGen/X86/2008-09-11-CoalescerBug.ll
@@ -9,7 +9,7 @@ entry:
 	%1 = load i16, i16* @g_15, align 2		; <i16> [#uses=1]
 	%2 = zext i16 %1 to i32		; <i32> [#uses=1]
 	%3 = and i32 %2, 1		; <i32> [#uses=1]
-	%4 = tail call i32 (...)* @rshift_u_s( i32 1 ) nounwind		; <i32> [#uses=1]
+	%4 = tail call i32 (...) @rshift_u_s( i32 1 ) nounwind		; <i32> [#uses=1]
 	%5 = icmp slt i32 %4, 2		; <i1> [#uses=1]
 	%6 = zext i1 %5 to i32		; <i32> [#uses=1]
 	%7 = icmp sge i32 %3, %6		; <i1> [#uses=1]
@@ -17,7 +17,7 @@ entry:
 	%9 = load i16, i16* @g_15, align 2		; <i16> [#uses=1]
 	%10 = icmp eq i16 %9, 0		; <i1> [#uses=1]
 	%11 = zext i1 %10 to i32		; <i32> [#uses=1]
-	%12 = tail call i32 (...)* @func_20( i32 1 ) nounwind		; <i32> [#uses=1]
+	%12 = tail call i32 (...) @func_20( i32 1 ) nounwind		; <i32> [#uses=1]
 	%13 = icmp sge i32 %11, %12		; <i1> [#uses=1]
 	%14 = zext i1 %13 to i32		; <i32> [#uses=1]
 	%15 = sub i32 %8, %14		; <i32> [#uses=1]
@@ -27,7 +27,7 @@ entry:
 	%or.cond = or i1 false, %18		; <i1> [#uses=1]
 	%19 = select i1 %or.cond, i32 0, i32 %0		; <i32> [#uses=1]
 	%.0 = lshr i32 %17, %19		; <i32> [#uses=1]
-	%20 = tail call i32 (...)* @func_7( i32 %.0 ) nounwind		; <i32> [#uses=0]
+	%20 = tail call i32 (...) @func_7( i32 %.0 ) nounwind		; <i32> [#uses=0]
 	ret i32 undef
 }
 
diff --git a/test/CodeGen/X86/2008-09-11-CoalescerBug2.ll b/test/CodeGen/X86/2008-09-11-CoalescerBug2.ll
index 92eb1c8..757dff4 100644
--- a/test/CodeGen/X86/2008-09-11-CoalescerBug2.ll
+++ b/test/CodeGen/X86/2008-09-11-CoalescerBug2.ll
@@ -38,7 +38,7 @@ bb12:		; preds = %bb11, %entry
 	%.014.in = phi i8 [ %10, %bb11 ], [ %7, %entry ]		; <i8> [#uses=1]
 	%11 = icmp ne i8 %.014.in, 0		; <i1> [#uses=1]
 	%12 = zext i1 %11 to i32		; <i32> [#uses=1]
-	%13 = tail call i32 (...)* @func_48( i32 %12, i32 %3, i32 0 ) nounwind		; <i32> [#uses=0]
+	%13 = tail call i32 (...) @func_48( i32 %12, i32 %3, i32 0 ) nounwind		; <i32> [#uses=0]
 	ret i32 undef
 }
 
diff --git a/test/CodeGen/X86/2008-10-11-CallCrash.ll b/test/CodeGen/X86/2008-10-11-CallCrash.ll
index a859bc6..9ad7ab2 100644
--- a/test/CodeGen/X86/2008-10-11-CallCrash.ll
+++ b/test/CodeGen/X86/2008-10-11-CallCrash.ll
@@ -6,13 +6,13 @@ target triple = "i386-apple-darwin7"
 
 define i32 @func_45(i64 %p_46, i32 %p_48) nounwind {
 entry:
-	%0 = tail call i32 (...)* @lshift_s_u(i64 %p_46, i64 0) nounwind		; <i32> [#uses=0]
+	%0 = tail call i32 (...) @lshift_s_u(i64 %p_46, i64 0) nounwind		; <i32> [#uses=0]
 	%1 = load i32, i32* @g_385, align 4		; <i32> [#uses=1]
 	%2 = shl i32 %1, 1		; <i32> [#uses=1]
 	%3 = and i32 %2, 32		; <i32> [#uses=1]
-	%4 = tail call i32 (...)* @func_87(i32 undef, i32 %p_48, i32 1) nounwind		; <i32> [#uses=1]
+	%4 = tail call i32 (...) @func_87(i32 undef, i32 %p_48, i32 1) nounwind		; <i32> [#uses=1]
 	%5 = add i32 %3, %4		; <i32> [#uses=1]
-	%6 = tail call i32 (...)* @div_rhs(i32 %5) nounwind		; <i32> [#uses=0]
+	%6 = tail call i32 (...) @div_rhs(i32 %5) nounwind		; <i32> [#uses=0]
 	ret i32 undef
 }
 
diff --git a/test/CodeGen/X86/2008-10-13-CoalescerBug.ll b/test/CodeGen/X86/2008-10-13-CoalescerBug.ll
index 4d3f8c2..c285ae4 100644
--- a/test/CodeGen/X86/2008-10-13-CoalescerBug.ll
+++ b/test/CodeGen/X86/2008-10-13-CoalescerBug.ll
@@ -3,7 +3,7 @@
 
 define i32 @func_77(i8 zeroext %p_79) nounwind {
 entry:
-	%0 = tail call i32 (...)* @func_43(i32 1) nounwind		; <i32> [#uses=1]
+	%0 = tail call i32 (...) @func_43(i32 1) nounwind		; <i32> [#uses=1]
 	%1 = icmp eq i32 %0, 0		; <i1> [#uses=1]
 	br i1 %1, label %bb3, label %bb
 
@@ -14,7 +14,7 @@ bb3:		; preds = %bb, %entry
 	%p_79_addr.0 = phi i8 [ 0, %bb ], [ %p_79, %entry ]		; <i8> [#uses=1]
 	%2 = zext i8 %p_79_addr.0 to i32		; <i32> [#uses=2]
 	%3 = zext i1 false to i32		; <i32> [#uses=2]
-	%4 = tail call i32 (...)* @rshift_u_s(i32 1) nounwind		; <i32> [#uses=0]
+	%4 = tail call i32 (...) @rshift_u_s(i32 1) nounwind		; <i32> [#uses=0]
 	%5 = lshr i32 %2, %2		; <i32> [#uses=3]
 	%6 = icmp eq i32 0, 0		; <i1> [#uses=1]
 	br i1 %6, label %bb6, label %bb9
diff --git a/test/CodeGen/X86/2008-11-06-testb.ll b/test/CodeGen/X86/2008-11-06-testb.ll
index 4ee4b4a..c8fad06 100644
--- a/test/CodeGen/X86/2008-11-06-testb.ll
+++ b/test/CodeGen/X86/2008-11-06-testb.ll
@@ -18,7 +18,7 @@ entry:
 	br i1 %4, label %bb5, label %bb
 
 bb:		; preds = %entry
-	%5 = tail call i32 (...)* @xx() nounwind		; <i32> [#uses=1]
+	%5 = tail call i32 (...) @xx() nounwind		; <i32> [#uses=1]
 	ret i32 %5
 
 bb5:		; preds = %entry
diff --git a/test/CodeGen/X86/2008-11-29-ULT-Sign.ll b/test/CodeGen/X86/2008-11-29-ULT-Sign.ll
index 6dca141..03442d6 100644
--- a/test/CodeGen/X86/2008-11-29-ULT-Sign.ll
+++ b/test/CodeGen/X86/2008-11-29-ULT-Sign.ll
@@ -8,7 +8,7 @@ entry:
 	br i1 %cmp, label %if.end, label %if.then
 
 if.then:		; preds = %entry
-	%call = call i32 (...)* @b()		; <i32> [#uses=0]
+	%call = call i32 (...) @b()		; <i32> [#uses=0]
 	br label %if.end
 
 if.end:		; preds = %if.then, %entry
diff --git a/test/CodeGen/X86/2008-12-01-SpillerAssert.ll b/test/CodeGen/X86/2008-12-01-SpillerAssert.ll
index 105489e..cf292e3 100644
--- a/test/CodeGen/X86/2008-12-01-SpillerAssert.ll
+++ b/test/CodeGen/X86/2008-12-01-SpillerAssert.ll
@@ -10,6 +10,6 @@ declare i32 @printk(i8*, ...)
 define void @display_cacheinfo(%struct.cpuinfo_x86* %c) nounwind section ".cpuinit.text" {
 entry:
         %asmtmp = tail call { i32, i32, i32, i32 } asm "cpuid", "={ax},={bx},={cx},={dx},0,2,~{dirflag},~{fpsr},~{flags}"(i32 -2147483643, i32 0) nounwind          ; <{ i32, i32, i32, i32 }> [#uses=0]
-        %0 = tail call i32 (i8*, ...)* @printk(i8* getelementptr ([70 x i8], [70 x i8]* @.str10, i32 0, i64 0), i32 0, i32 0, i32 0, i32 0) nounwind           ; <i32> [#uses=0]
+        %0 = tail call i32 (i8*, ...) @printk(i8* getelementptr ([70 x i8], [70 x i8]* @.str10, i32 0, i64 0), i32 0, i32 0, i32 0, i32 0) nounwind           ; <i32> [#uses=0]
         unreachable
 }
diff --git a/test/CodeGen/X86/2008-12-19-EarlyClobberBug.ll b/test/CodeGen/X86/2008-12-19-EarlyClobberBug.ll
index 7ac2cd2..6bb29fd 100644
--- a/test/CodeGen/X86/2008-12-19-EarlyClobberBug.ll
+++ b/test/CodeGen/X86/2008-12-19-EarlyClobberBug.ll
@@ -16,7 +16,7 @@ entry:
 	%1 = trunc i64 %u to i32		; <i32> [#uses=4]
 	%2 = lshr i64 %u, 32		; <i64> [#uses=1]
 	%3 = trunc i64 %2 to i32		; <i32> [#uses=2]
-	%4 = tail call i32 (i8*, ...)* @printf(i8* getelementptr ([7 x i8], [7 x i8]* @"\01LC", i32 0, i32 0), i32 %1) nounwind		; <i32> [#uses=0]
+	%4 = tail call i32 (i8*, ...) @printf(i8* getelementptr ([7 x i8], [7 x i8]* @"\01LC", i32 0, i32 0), i32 %1) nounwind		; <i32> [#uses=0]
 	%5 = icmp ult i32 %1, %0		; <i1> [#uses=1]
 	br i1 %5, label %bb2, label %bb
 
diff --git a/test/CodeGen/X86/2009-02-26-MachineLICMBug.ll b/test/CodeGen/X86/2009-02-26-MachineLICMBug.ll
index db31333..172a00a 100644
--- a/test/CodeGen/X86/2009-02-26-MachineLICMBug.ll
+++ b/test/CodeGen/X86/2009-02-26-MachineLICMBug.ll
@@ -1,5 +1,5 @@
 ; REQUIRES: asserts
-; RUN: llc < %s -march=x86-64 -mattr=+sse3,+sse4.1 -mcpu=penryn -stats 2>&1 | grep "7 machine-licm"
+; RUN: llc < %s -march=x86-64 -mattr=+sse3,+sse4.1 -mcpu=penryn -stats 2>&1 | grep "9 machine-licm"
 ; RUN: llc < %s -march=x86-64 -mattr=+sse3,+sse4.1 -mcpu=penryn | FileCheck %s
 ; rdar://6627786
 ; rdar://7792037
@@ -21,9 +21,9 @@ bb4:		; preds = %bb.i, %bb26, %bb4, %entry
 ; CHECK: xorl
 ; CHECK: movq
 
-	%0 = call i32 (...)* @xxGetOffsetForCode(i32 undef) nounwind		; <i32> [#uses=0]
+	%0 = call i32 (...) @xxGetOffsetForCode(i32 undef) nounwind		; <i32> [#uses=0]
 	%ins = or i64 %p, 2097152		; <i64> [#uses=1]
-	%1 = call i32 (...)* @xxCalculateMidType(%struct.Key* %desc, i32 0) nounwind		; <i32> [#uses=1]
+	%1 = call i32 (...) @xxCalculateMidType(%struct.Key* %desc, i32 0) nounwind		; <i32> [#uses=1]
 	%cond = icmp eq i32 %1, 1		; <i1> [#uses=1]
 	br i1 %cond, label %bb26, label %bb4
 
diff --git a/test/CodeGen/X86/2009-03-25-TestBug.ll b/test/CodeGen/X86/2009-03-25-TestBug.ll
index 79c0863..367a6d2 100644
--- a/test/CodeGen/X86/2009-03-25-TestBug.ll
+++ b/test/CodeGen/X86/2009-03-25-TestBug.ll
@@ -15,11 +15,11 @@ bb1579.i.i:		; preds = %bb1514.i.i, %bb191.i.i
         br i1 %tmp178, label %hello, label %world
 
 hello:
-	%h = tail call i32 (i8*, ...)* @printf( i8* getelementptr ([7 x i8], [7 x i8]* @hello, i32 0, i32 0))
+	%h = tail call i32 (i8*, ...) @printf( i8* getelementptr ([7 x i8], [7 x i8]* @hello, i32 0, i32 0))
         ret void
 
 world:
-	%w = tail call i32 (i8*, ...)* @printf( i8* getelementptr ([7 x i8], [7 x i8]* @world, i32 0, i32 0))
+	%w = tail call i32 (i8*, ...) @printf( i8* getelementptr ([7 x i8], [7 x i8]* @world, i32 0, i32 0))
         ret void
 }
 
diff --git a/test/CodeGen/X86/2009-04-13-2AddrAssert.ll b/test/CodeGen/X86/2009-04-13-2AddrAssert.ll
index 4362ba4..a3607c6 100644
--- a/test/CodeGen/X86/2009-04-13-2AddrAssert.ll
+++ b/test/CodeGen/X86/2009-04-13-2AddrAssert.ll
@@ -7,7 +7,7 @@ target triple = "x86_64-undermydesk-freebsd8.0"
 
 define i32 @main(i32 %argc, i8** nocapture %argv) nounwind {
 entry:
-        %call = tail call i32 (...)* @getpid()          ; <i32> [#uses=1]
+        %call = tail call i32 (...) @getpid()          ; <i32> [#uses=1]
         %conv = trunc i32 %call to i16          ; <i16> [#uses=1]
         %0 = tail call i16 asm "xchgb ${0:h}, ${0:b}","=Q,0,~{dirflag},~{fpsr},~{flags}"(i16 %conv) nounwind           ; <i16> [#uses=0]
         ret i32 undef
diff --git a/test/CodeGen/X86/2009-04-14-IllegalRegs.ll b/test/CodeGen/X86/2009-04-14-IllegalRegs.ll
index 1e5e933..8055ea8 100644
--- a/test/CodeGen/X86/2009-04-14-IllegalRegs.ll
+++ b/test/CodeGen/X86/2009-04-14-IllegalRegs.ll
@@ -21,7 +21,7 @@ entry:
 	store i8 %5, i8* %7, align 1
 	%8 = getelementptr %struct.X, %struct.X* %xxx, i32 0, i32 0		; <i8*> [#uses=1]
 	store i8 15, i8* %8, align 1
-	%9 = call i32 (...)* bitcast (i32 (%struct.X*, %struct.X*)* @f to i32 (...)*)(%struct.X* byval align 4 %xxx, %struct.X* byval align 4 %xxx) nounwind		; <i32> [#uses=1]
+	%9 = call i32 (...) bitcast (i32 (%struct.X*, %struct.X*)* @f to i32 (...)*)(%struct.X* byval align 4 %xxx, %struct.X* byval align 4 %xxx) nounwind		; <i32> [#uses=1]
 	store i32 %9, i32* %0, align 4
 	%10 = load i32, i32* %0, align 4		; <i32> [#uses=1]
 	store i32 %10, i32* %retval, align 4
diff --git a/test/CodeGen/X86/2009-05-19-SingleElementExtractElement.ll b/test/CodeGen/X86/2009-05-19-SingleElementExtractElement.ll
index 6e062fb..89cd24d 100644
--- a/test/CodeGen/X86/2009-05-19-SingleElementExtractElement.ll
+++ b/test/CodeGen/X86/2009-05-19-SingleElementExtractElement.ll
@@ -7,7 +7,7 @@ entry:
         %tmp5.i = extractelement <1 x i64> %a, i32 0
         %tmp11 = bitcast i64 %tmp5.i to <1 x i64>
         %tmp8 = extractelement <1 x i64> %tmp11, i32 0
-        %call6 = call i32 (i64)* @foo(i64 %tmp8)
+        %call6 = call i32 (i64) @foo(i64 %tmp8)
         ret i32 undef
 }
 
diff --git a/test/CodeGen/X86/2009-08-23-SubRegReuseUndo.ll b/test/CodeGen/X86/2009-08-23-SubRegReuseUndo.ll
index fac6a66..45e770f 100644
--- a/test/CodeGen/X86/2009-08-23-SubRegReuseUndo.ll
+++ b/test/CodeGen/X86/2009-08-23-SubRegReuseUndo.ll
@@ -53,7 +53,7 @@ bb5:                                              ; preds = %bb4, %bb3
 bb6.preheader:                                    ; preds = %bb5
   %21 = sext i8 %p_52 to i32                      ; <i32> [#uses=1]
   %22 = load volatile i32, i32* @uint8, align 4        ; <i32> [#uses=0]
-  %23 = tail call i32 (...)* @safefuncts(i32 %21, i32 1) nounwind; <i32> [#uses=0]
+  %23 = tail call i32 (...) @safefuncts(i32 %21, i32 1) nounwind; <i32> [#uses=0]
   unreachable
 
 return:                                           ; preds = %bb5
diff --git a/test/CodeGen/X86/2009-10-16-Scope.ll b/test/CodeGen/X86/2009-10-16-Scope.ll
index c783ee9..374c696 100644
--- a/test/CodeGen/X86/2009-10-16-Scope.ll
+++ b/test/CodeGen/X86/2009-10-16-Scope.ll
@@ -9,7 +9,7 @@ entry:
   br label %do.body, !dbg !0
 
 do.body:                                          ; preds = %entry
-  call void @llvm.dbg.declare(metadata i32* %count_, metadata !4, metadata !MDExpression())
+  call void @llvm.dbg.declare(metadata i32* %count_, metadata !4, metadata !MDExpression()), !dbg !MDLocation(scope: !5)
   %conv = ptrtoint i32* %count_ to i32, !dbg !0   ; <i32> [#uses=1]
   %call = call i32 @foo(i32 %conv) ssp, !dbg !0   ; <i32> [#uses=0]
   br label %do.end, !dbg !0
diff --git a/test/CodeGen/X86/2010-01-18-DbgValue.ll b/test/CodeGen/X86/2010-01-18-DbgValue.ll
index e0fd9b0..b03556a 100644
--- a/test/CodeGen/X86/2010-01-18-DbgValue.ll
+++ b/test/CodeGen/X86/2010-01-18-DbgValue.ll
@@ -51,5 +51,5 @@ declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
 !17 = distinct !MDLexicalBlock(line: 11, column: 0, file: !19, scope: !1)
 !18 = !{!1}
 !19 = !MDFile(filename: "b2.c", directory: "/tmp/")
-!20 = !{i32 0}
+!20 = !{}
 !21 = !{i32 1, !"Debug Info Version", i32 3}
diff --git a/test/CodeGen/X86/2010-02-01-DbgValueCrash.ll b/test/CodeGen/X86/2010-02-01-DbgValueCrash.ll
index ced3708..7c0fd68 100644
--- a/test/CodeGen/X86/2010-02-01-DbgValueCrash.ll
+++ b/test/CodeGen/X86/2010-02-01-DbgValueCrash.ll
@@ -8,7 +8,7 @@
 
 define i32 @"main(tart.core.String[])->int32"(i32 %args) {
 entry:
-  tail call void @llvm.dbg.value(metadata %tart.reflect.ComplexType* @.type.SwitchStmtTest, i64 0, metadata !8, metadata !MDExpression())
+  tail call void @llvm.dbg.value(metadata %tart.reflect.ComplexType* @.type.SwitchStmtTest, i64 0, metadata !8, metadata !MDExpression()), !dbg !MDLocation(scope: !9)
   tail call void @"tart.reflect.ComplexType.create->tart.core.Object"(%tart.reflect.ComplexType* @.type.SwitchStmtTest) ; <%tart.core.Object*> [#uses=2]
   ret i32 3
 }
diff --git a/test/CodeGen/X86/2010-02-23-RematImplicitSubreg.ll b/test/CodeGen/X86/2010-02-23-RematImplicitSubreg.ll
index 4e4e006..6fe31b6 100644
--- a/test/CodeGen/X86/2010-02-23-RematImplicitSubreg.ll
+++ b/test/CodeGen/X86/2010-02-23-RematImplicitSubreg.ll
@@ -23,7 +23,7 @@ for.body:                                         ; preds = %if.end40, %entry
 
 if.then:                                          ; preds = %for.body
   %conv18 = sext i8 %tmp6 to i32                  ; <i32> [#uses=1]
-  %call = tail call i32 (...)* @invalid(i32 0, i32 0, i32 %conv18) nounwind ; <i32> [#uses=0]
+  %call = tail call i32 (...) @invalid(i32 0, i32 0, i32 %conv18) nounwind ; <i32> [#uses=0]
   br label %if.end
 
 if.end:                                           ; preds = %if.then, %for.body
@@ -34,7 +34,7 @@ if.end:                                           ; preds = %if.then, %for.body
 
 if.then36:                                        ; preds = %if.end
   %conv38 = sext i8 %tmp24 to i32                 ; <i32> [#uses=1]
-  %call39 = tail call i32 (...)* @invalid(i32 0, i32 0, i32 %conv38) nounwind ; <i32> [#uses=0]
+  %call39 = tail call i32 (...) @invalid(i32 0, i32 0, i32 %conv38) nounwind ; <i32> [#uses=0]
   br label %if.end40
 
 if.end40:                                         ; preds = %if.then36, %if.end
diff --git a/test/CodeGen/X86/2010-05-25-DotDebugLoc.ll b/test/CodeGen/X86/2010-05-25-DotDebugLoc.ll
index 43f05ca..29df291 100644
--- a/test/CodeGen/X86/2010-05-25-DotDebugLoc.ll
+++ b/test/CodeGen/X86/2010-05-25-DotDebugLoc.ll
@@ -10,10 +10,10 @@
 
 define hidden %0 @__divsc3(float %a, float %b, float %c, float %d) nounwind readnone {
 entry:
-  tail call void @llvm.dbg.value(metadata float %a, i64 0, metadata !0, metadata !MDExpression())
-  tail call void @llvm.dbg.value(metadata float %b, i64 0, metadata !11, metadata !MDExpression())
-  tail call void @llvm.dbg.value(metadata float %c, i64 0, metadata !12, metadata !MDExpression())
-  tail call void @llvm.dbg.value(metadata float %d, i64 0, metadata !13, metadata !MDExpression())
+  tail call void @llvm.dbg.value(metadata float %a, i64 0, metadata !0, metadata !MDExpression()), !dbg !MDLocation(scope: !1)
+  tail call void @llvm.dbg.value(metadata float %b, i64 0, metadata !11, metadata !MDExpression()), !dbg !MDLocation(scope: !1)
+  tail call void @llvm.dbg.value(metadata float %c, i64 0, metadata !12, metadata !MDExpression()), !dbg !MDLocation(scope: !1)
+  tail call void @llvm.dbg.value(metadata float %d, i64 0, metadata !13, metadata !MDExpression()), !dbg !MDLocation(scope: !1)
   %0 = tail call float @fabsf(float %c) nounwind readnone, !dbg !19 ; <float> [#uses=1]
   %1 = tail call float @fabsf(float %d) nounwind readnone, !dbg !19 ; <float> [#uses=1]
   %2 = fcmp olt float %0, %1, !dbg !19            ; <i1> [#uses=1]
@@ -247,5 +247,5 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnon
 !44 = !{!1}
 !45 = !MDFile(filename: "libgcc2.c", directory: "/Users/yash/clean/LG.D/gcc/../../llvmgcc/gcc")
 !46 = !MDFile(filename: "libgcc2.h", directory: "/Users/yash/clean/LG.D/gcc/../../llvmgcc/gcc")
-!47 = !{i32 0}
+!47 = !{}
 !48 = !{i32 1, !"Debug Info Version", i32 3}
diff --git a/test/CodeGen/X86/2010-05-26-DotDebugLoc.ll b/test/CodeGen/X86/2010-05-26-DotDebugLoc.ll
index b8f7ba2..fe68711 100644
--- a/test/CodeGen/X86/2010-05-26-DotDebugLoc.ll
+++ b/test/CodeGen/X86/2010-05-26-DotDebugLoc.ll
@@ -9,7 +9,7 @@ target triple = "x86_64-apple-darwin10"
 
 define i8* @bar(%struct.a* %myvar) nounwind optsize noinline ssp {
 entry:
-  tail call void @llvm.dbg.value(metadata %struct.a* %myvar, i64 0, metadata !8, metadata !MDExpression())
+  tail call void @llvm.dbg.value(metadata %struct.a* %myvar, i64 0, metadata !8, metadata !MDExpression()), !dbg !MDLocation(scope: !9)
   %0 = getelementptr inbounds %struct.a, %struct.a* %myvar, i64 0, i32 0, !dbg !28 ; <i32*> [#uses=1]
   %1 = load i32, i32* %0, align 8, !dbg !28            ; <i32> [#uses=1]
   tail call void @foo(i32 %1) nounwind optsize noinline ssp, !dbg !28
diff --git a/test/CodeGen/X86/2010-05-28-Crash.ll b/test/CodeGen/X86/2010-05-28-Crash.ll
index f2e8dbd..097cd24 100644
--- a/test/CodeGen/X86/2010-05-28-Crash.ll
+++ b/test/CodeGen/X86/2010-05-28-Crash.ll
@@ -4,8 +4,8 @@
 
 define i32 @foo(i32 %y) nounwind optsize ssp {
 entry:
-  tail call void @llvm.dbg.value(metadata i32 %y, i64 0, metadata !0, metadata !MDExpression())
-  %0 = tail call i32 (...)* @zoo(i32 %y) nounwind, !dbg !9 ; <i32> [#uses=1]
+  tail call void @llvm.dbg.value(metadata i32 %y, i64 0, metadata !0, metadata !MDExpression()), !dbg !MDLocation(scope: !1)
+  %0 = tail call i32 (...) @zoo(i32 %y) nounwind, !dbg !9 ; <i32> [#uses=1]
   ret i32 %0, !dbg !9
 }
 
@@ -15,9 +15,9 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnon
 
 define i32 @bar(i32 %x) nounwind optsize ssp {
 entry:
-  tail call void @llvm.dbg.value(metadata i32 %x, i64 0, metadata !7, metadata !MDExpression())
-  tail call void @llvm.dbg.value(metadata i32 1, i64 0, metadata !0, metadata !MDExpression()) nounwind
-  %0 = tail call i32 (...)* @zoo(i32 1) nounwind, !dbg !12 ; <i32> [#uses=1]
+  tail call void @llvm.dbg.value(metadata i32 %x, i64 0, metadata !7, metadata !MDExpression()), !dbg !MDLocation(scope: !8)
+  tail call void @llvm.dbg.value(metadata i32 1, i64 0, metadata !0, metadata !MDExpression()) nounwind, !dbg !MDLocation(scope: !1)
+  %0 = tail call i32 (...) @zoo(i32 1) nounwind, !dbg !12 ; <i32> [#uses=1]
   %1 = add nsw i32 %0, %x, !dbg !13               ; <i32> [#uses=1]
   ret i32 %1, !dbg !13
 }
@@ -44,7 +44,7 @@ entry:
 !16 = !{!7}
 !17 = !{!1, !8}
 !18 = !MDFile(filename: "f.c", directory: "/tmp")
-!19 = !{i32 0}
+!19 = !{}
 
 ;CHECK: DEBUG_VALUE: bar:x <- E
 ;CHECK: Ltmp
diff --git a/test/CodeGen/X86/2010-06-01-DeadArg-DbgInfo.ll b/test/CodeGen/X86/2010-06-01-DeadArg-DbgInfo.ll
index b0185ba..942faf4 100644
--- a/test/CodeGen/X86/2010-06-01-DeadArg-DbgInfo.ll
+++ b/test/CodeGen/X86/2010-06-01-DeadArg-DbgInfo.ll
@@ -10,8 +10,8 @@ target triple = "x86_64-apple-darwin10.2"
 define i32 @_ZN3foo3bazEi(%struct.foo* nocapture %this, i32 %x) nounwind readnone optsize noinline ssp align 2 {
 ;CHECK: DEBUG_VALUE: baz:this <- RDI{{$}}
 entry:
-  tail call void @llvm.dbg.value(metadata %struct.foo* %this, i64 0, metadata !15, metadata !MDExpression())
-  tail call void @llvm.dbg.value(metadata i32 %x, i64 0, metadata !16, metadata !MDExpression())
+  tail call void @llvm.dbg.value(metadata %struct.foo* %this, i64 0, metadata !15, metadata !MDExpression()), !dbg !MDLocation(scope: !8)
+  tail call void @llvm.dbg.value(metadata i32 %x, i64 0, metadata !16, metadata !MDExpression()), !dbg !MDLocation(scope: !8)
   %0 = mul nsw i32 %x, 7, !dbg !29                ; <i32> [#uses=1]
   %1 = add nsw i32 %0, 1, !dbg !29                ; <i32> [#uses=1]
   ret i32 %1, !dbg !29
@@ -55,6 +55,6 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnon
 !29 = !MDLocation(line: 16, scope: !30)
 !30 = distinct !MDLexicalBlock(line: 15, column: 0, file: !31, scope: !8)
 !31 = !MDFile(filename: "foo.cp", directory: "/tmp/")
-!32 = !{i32 0}
+!32 = !{}
 !33 = !{!1, !8, !18}
 !34 = !{i32 1, !"Debug Info Version", i32 3}
diff --git a/test/CodeGen/X86/2010-06-15-FastAllocEarlyCLobber.ll b/test/CodeGen/X86/2010-06-15-FastAllocEarlyCLobber.ll
index 198eb31..0b1c36f 100644
--- a/test/CodeGen/X86/2010-06-15-FastAllocEarlyCLobber.ll
+++ b/test/CodeGen/X86/2010-06-15-FastAllocEarlyCLobber.ll
@@ -18,7 +18,7 @@ entry:
   %0 = call i32 asm "bsr   $1, $0\0A\09cmovz $2, $0", "=&r,ro,r,~{cc},~{dirflag},~{fpsr},~{flags}"(i32 %zero, i32 -1) nounwind, !srcloc !0 ; <i32> [#uses=1]
   store i32 %0, i32* %v
   %tmp = load i32, i32* %v                             ; <i32> [#uses=1]
-  %call1 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0), i32 %tmp) ; <i32> [#uses=0]
+  %call1 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0), i32 %tmp) ; <i32> [#uses=0]
   store i32 0, i32* %retval
   %1 = load i32, i32* %retval                          ; <i32> [#uses=1]
   ret i32 %0
diff --git a/test/CodeGen/X86/2010-08-04-MaskedSignedCompare.ll b/test/CodeGen/X86/2010-08-04-MaskedSignedCompare.ll
index 1a05d0a..ab9715d 100644
--- a/test/CodeGen/X86/2010-08-04-MaskedSignedCompare.ll
+++ b/test/CodeGen/X86/2010-08-04-MaskedSignedCompare.ll
@@ -29,7 +29,7 @@ if.then:                                          ; preds = %entry
 
 if.end:                                           ; preds = %entry.if.end_crit_edge, %if.then
   %tmp4 = phi i32 [ %tmp4.pre, %entry.if.end_crit_edge ], [ 1, %if.then ] ; <i32> [#uses=1]
-  %call5 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i64 0, i64 0), i32 %tmp4) nounwind ; <i32> [#uses=0]
+  %call5 = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i64 0, i64 0), i32 %tmp4) nounwind ; <i32> [#uses=0]
   ret i32 0
 }
 
diff --git a/test/CodeGen/X86/2010-08-04-StackVariable.ll b/test/CodeGen/X86/2010-08-04-StackVariable.ll
index 6bd1217..d3ad860 100644
--- a/test/CodeGen/X86/2010-08-04-StackVariable.ll
+++ b/test/CodeGen/X86/2010-08-04-StackVariable.ll
@@ -125,5 +125,5 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnon
 !44 = !MDLocalVariable(tag: DW_TAG_auto_variable, name: "k", line: 26, scope: !39, file: !2, type: !13)
 !45 = !MDLocation(line: 27, scope: !39)
 !47 = !MDFile(filename: "small.cc", directory: "/Users/manav/R8248330")
-!48 = !{i32 0}
+!48 = !{}
 !49 = !{i32 1, !"Debug Info Version", i32 3}
diff --git a/test/CodeGen/X86/2010-09-16-EmptyFilename.ll b/test/CodeGen/X86/2010-09-16-EmptyFilename.ll
index fa4fd75..30abdc5 100644
--- a/test/CodeGen/X86/2010-09-16-EmptyFilename.ll
+++ b/test/CodeGen/X86/2010-09-16-EmptyFilename.ll
@@ -31,5 +31,5 @@ entry:
 !13 = !{!0, !6}
 !14 = !MDFile(filename: "", directory: "/private/tmp")
 !15 = !MDFile(filename: "bug.c", directory: "/private/tmp")
-!16 = !{i32 0}
+!16 = !{}
 !17 = !{i32 1, !"Debug Info Version", i32 3}
diff --git a/test/CodeGen/X86/2010-11-02-DbgParameter.ll b/test/CodeGen/X86/2010-11-02-DbgParameter.ll
index 783b34d..d920513 100644
--- a/test/CodeGen/X86/2010-11-02-DbgParameter.ll
+++ b/test/CodeGen/X86/2010-11-02-DbgParameter.ll
@@ -36,5 +36,5 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnon
 !15 = !{!0}
 !16 = !{!6}
 !17 = !MDFile(filename: "one.c", directory: "/private/tmp")
-!18 = !{i32 0}
+!18 = !{}
 !19 = !{i32 1, !"Debug Info Version", i32 3}
diff --git a/test/CodeGen/X86/2011-01-24-DbgValue-Before-Use.ll b/test/CodeGen/X86/2011-01-24-DbgValue-Before-Use.ll
index c6a3a78..c02bd2d 100644
--- a/test/CodeGen/X86/2011-01-24-DbgValue-Before-Use.ll
+++ b/test/CodeGen/X86/2011-01-24-DbgValue-Before-Use.ll
@@ -60,7 +60,7 @@ cond.end:                                         ; preds = %entry, %cond.true
 
 if.then:                                          ; preds = %cond.end
   %puts = tail call i32 @puts(i8* getelementptr inbounds ([21 x i8], [21 x i8]* @str, i64 0, i64 0))
-  %call12 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([14 x i8], [14 x i8]* @.str1, i64 0, i64 0), i32 %call, i32 %cond) nounwind optsize, !dbg !26
+  %call12 = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([14 x i8], [14 x i8]* @.str1, i64 0, i64 0), i32 %call, i32 %cond) nounwind optsize, !dbg !26
   ret i32 1, !dbg !27
 
 return:                                           ; preds = %cond.end
@@ -110,5 +110,5 @@ declare i32 @puts(i8* nocapture) nounwind
 !29 = !{!10, !11, !12}
 !30 = !{!14, !17}
 !31 = !MDFile(filename: "rem_small.c", directory: "/private/tmp")
-!32 = !{i32 0}
+!32 = !{}
 !33 = !{i32 1, !"Debug Info Version", i32 3}
diff --git a/test/CodeGen/X86/2011-02-23-UnfoldBug.ll b/test/CodeGen/X86/2011-02-23-UnfoldBug.ll
index 900106a..90b90d7 100644
--- a/test/CodeGen/X86/2011-02-23-UnfoldBug.ll
+++ b/test/CodeGen/X86/2011-02-23-UnfoldBug.ll
@@ -22,7 +22,7 @@ for.body33.lr.ph:                                 ; preds = %for.body
 for.end:                                          ; preds = %for.body
   %vecins.i94 = insertelement <2 x double> undef, double 0.000000e+00, i32 0
   %cmpsd.i = tail call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %vecins.i94, <2 x double> <double 0x3FE984B204153B34, double 0x3FE984B204153B34>, i8 2) nounwind
-  tail call void (...)* @_mm_movemask_pd(<2 x double> %cmpsd.i) nounwind
+  tail call void (...) @_mm_movemask_pd(<2 x double> %cmpsd.i) nounwind
   br i1 undef, label %if.then67, label %if.end71
 
 if.then67:                                        ; preds = %for.end
diff --git a/test/CodeGen/X86/2011-03-02-DAGCombiner.ll b/test/CodeGen/X86/2011-03-02-DAGCombiner.ll
index 86e579a..d25fbf7 100644
--- a/test/CodeGen/X86/2011-03-02-DAGCombiner.ll
+++ b/test/CodeGen/X86/2011-03-02-DAGCombiner.ll
@@ -43,7 +43,7 @@ entry:
   %14 = and i32 %13, -129
   %15 = or i32 %14, %12
   store i32 %15, i32* %10, align 4
-  %call = call i32 (...)* @iequals(i32 1841, i32 %bf.value, i32 0)
+  %call = call i32 (...) @iequals(i32 1841, i32 %bf.value, i32 0)
   %16 = load i32, i32* %retval
   ret i32 %16
 }
diff --git a/test/CodeGen/X86/2011-09-14-valcoalesce.ll b/test/CodeGen/X86/2011-09-14-valcoalesce.ll
index a086a79..b8e5100 100644
--- a/test/CodeGen/X86/2011-09-14-valcoalesce.ll
+++ b/test/CodeGen/X86/2011-09-14-valcoalesce.ll
@@ -144,7 +144,7 @@ if.end117.i:                                      ; preds = %if.then108.i, %land
   br i1 undef, label %if.then122.i, label %for.cond138.preheader.i
 
 if.then122.i:                                     ; preds = %if.end117.i
-  call void (...)* @fprintf(i32 undef, i32 %gs.0526.i, i32 %ge.1.i, i32 %aFreq.1.i, double undef) nounwind
+  call void (...) @fprintf(i32 undef, i32 %gs.0526.i, i32 %ge.1.i, i32 %aFreq.1.i, double undef) nounwind
   br label %for.cond138.preheader.i
 
 for.cond138.preheader.i:                          ; preds = %if.then122.i, %if.end117.i
diff --git a/test/CodeGen/X86/2011-10-12-MachineCSE.ll b/test/CodeGen/X86/2011-10-12-MachineCSE.ll
index 5018db7..341a14b 100644
--- a/test/CodeGen/X86/2011-10-12-MachineCSE.ll
+++ b/test/CodeGen/X86/2011-10-12-MachineCSE.ll
@@ -102,7 +102,7 @@ if.end:                                           ; preds = %lor.lhs.false23
   %arrayidx38 = getelementptr inbounds [0 x %struct.insn_data], [0 x %struct.insn_data]* @insn_data, i32 0, i64 %idxprom37
   %genfun = getelementptr inbounds %struct.insn_data, %struct.insn_data* %arrayidx38, i32 0, i32 2
   %23 = load %struct.rtx_def* (%struct.rtx_def*, ...)*, %struct.rtx_def* (%struct.rtx_def*, ...)** %genfun, align 8
-  %call39 = tail call %struct.rtx_def* (%struct.rtx_def*, ...)* %23(%struct.rtx_def* %r0, %struct.rtx_def* %r1, %struct.rtx_def* %c)
+  %call39 = tail call %struct.rtx_def* (%struct.rtx_def*, ...) %23(%struct.rtx_def* %r0, %struct.rtx_def* %r1, %struct.rtx_def* %c)
   br label %return
 
 return:                                           ; preds = %if.end, %if.then
diff --git a/test/CodeGen/X86/2011-10-19-widen_vselect.ll b/test/CodeGen/X86/2011-10-19-widen_vselect.ll
index da3c322..07dff95 100644
--- a/test/CodeGen/X86/2011-10-19-widen_vselect.ll
+++ b/test/CodeGen/X86/2011-10-19-widen_vselect.ll
@@ -26,7 +26,7 @@ entry:
 }
 
 ; CHECK-LABEL: zero_test
-; CHECK: pxor %xmm0, %xmm0
+; CHECK: xorps %xmm0, %xmm0
 ; CHECK: ret
 
 define void @zero_test() {
diff --git a/test/CodeGen/X86/2012-01-12-extract-sv.ll b/test/CodeGen/X86/2012-01-12-extract-sv.ll
index 75409f2..677c902 100644
--- a/test/CodeGen/X86/2012-01-12-extract-sv.ll
+++ b/test/CodeGen/X86/2012-01-12-extract-sv.ll
@@ -1,12 +1,25 @@
-; RUN: llc < %s -march=x86 -mcpu=corei7-avx -mattr=+avx -mtriple=i686-pc-win32 | FileCheck %s
+; RUN: llc < %s -mattr=+avx -mtriple=i686-pc-win32 | FileCheck %s
 
-; CHECK: endless_loop
 define void @endless_loop() {
+; CHECK-LABEL: endless_loop:
+; CHECK-NEXT:     # BB#0:
+; CHECK-NEXT:	vmovaps	(%eax), %ymm0
+; CHECK-NEXT:	vextractf128	$1, %ymm0, %xmm0
+; CHECK-NEXT:	vmovsldup	%xmm0, %xmm0    # xmm0 = xmm0[0,0,2,2]
+; CHECK-NEXT:	vmovddup	%xmm0, %xmm1    # xmm1 = xmm0[0,0]
+; CHECK-NEXT:	vinsertf128	$1, %xmm1, %ymm0, %ymm1
+; CHECK-NEXT:	vxorps	%xmm2, %xmm2, %xmm2
+; CHECK-NEXT:	vblendps	$128, %ymm1, %ymm2, %ymm1 # ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
+; CHECK-NEXT:	vxorps	%ymm2, %ymm2, %ymm2
+; CHECK-NEXT:	vblendps	$1, %ymm0, %ymm2, %ymm0 # ymm0 = ymm0[0],ymm2[1,2,3,4,5,6,7]
+; CHECK-NEXT:	vmovaps	%ymm0, (%eax)
+; CHECK-NEXT:	vmovaps	%ymm1, (%eax)
+; CHECK-NEXT:	vzeroupper
+; CHECK-NEXT:	retl
 entry:
   %0 = load <8 x i32>, <8 x i32> addrspace(1)* undef, align 32
   %1 = shufflevector <8 x i32> %0, <8 x i32> undef, <16 x i32> <i32 4, i32 4, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
   %2 = shufflevector <16 x i32> <i32 undef, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 undef>, <16 x i32> %1, <16 x i32> <i32 16, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 17>
   store <16 x i32> %2, <16 x i32> addrspace(1)* undef, align 64
   ret void
-; CHECK: ret
 }
diff --git a/test/CodeGen/X86/2012-07-10-extload64.ll b/test/CodeGen/X86/2012-07-10-extload64.ll
index f33fc8c..a366102 100644
--- a/test/CodeGen/X86/2012-07-10-extload64.ll
+++ b/test/CodeGen/X86/2012-07-10-extload64.ll
@@ -6,7 +6,7 @@ entry:
 ; CHECK: pmovzxwd
   %A27 = load <4 x i16>, <4 x i16>* %in, align 4
   %A28 = add <4 x i16> %A27, %A27
-; CHECK: movlpd
+; CHECK: movq
   store <4 x i16> %A28, <4 x i16>* %in, align 4
   ret void
 ; CHECK: ret
@@ -18,7 +18,7 @@ define void @store_64(<2 x i32>* %ptr) {
 BB:
   store <2 x i32> zeroinitializer, <2 x i32>* %ptr
   ret void
-;CHECK: movlpd
+;CHECK: movlps
 ;CHECK: ret
 }
 
diff --git a/test/CodeGen/X86/2012-09-28-CGPBug.ll b/test/CodeGen/X86/2012-09-28-CGPBug.ll
index 57af20e..a8e0625 100644
--- a/test/CodeGen/X86/2012-09-28-CGPBug.ll
+++ b/test/CodeGen/X86/2012-09-28-CGPBug.ll
@@ -35,7 +35,7 @@ define void @h(i8*) nounwind ssp {
   indirectbr i8* %16, [label %17, label %18]
 
 ; <label>:17                                      ; preds = %11
-  tail call void (i8*, ...)* @g(i8* getelementptr inbounds ([35 x i8], [35 x i8]* @.str40, i32 0, i32 0))
+  tail call void (i8*, ...) @g(i8* getelementptr inbounds ([35 x i8], [35 x i8]* @.str40, i32 0, i32 0))
   br label %22
 
 ; <label>:18                                      ; preds = %11
diff --git a/test/CodeGen/X86/2012-1-10-buildvector.ll b/test/CodeGen/X86/2012-1-10-buildvector.ll
index a9b8cc6..d1c0266 100644
--- a/test/CodeGen/X86/2012-1-10-buildvector.ll
+++ b/test/CodeGen/X86/2012-1-10-buildvector.ll
@@ -1,27 +1,28 @@
-; RUN: llc < %s -march=x86 -mcpu=corei7-avx -mattr=+avx -mtriple=i686-pc-win32 | FileCheck %s
+; RUN: llc < %s -mattr=+avx -mtriple=i686-unknown-unknown | FileCheck %s
 
-target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f80:128:128-v64:64:64-v128:128:128-a0:0:64-f80:32:32-n8:16:32-S32"
-target triple = "i686-pc-win32"
-
-;CHECK-LABEL: bad_cast:
 define void @bad_cast() {
-entry:
+; CHECK-LABEL: bad_cast:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; CHECK-NEXT:    vmovaps %xmm0, (%eax)
+; CHECK-NEXT:    movl $0, (%eax)
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    retl
   %vext.i = shufflevector <2 x i64> undef, <2 x i64> undef, <3 x i32> <i32 0, i32 1, i32 undef>
   %vecinit8.i = shufflevector <3 x i64> zeroinitializer, <3 x i64> %vext.i, <3 x i32> <i32 0, i32 3, i32 4>
   store <3 x i64> %vecinit8.i, <3 x i64>* undef, align 32
-;CHECK: ret
   ret void
 }
 
-
-;CHECK-LABEL: bad_insert:
 define void @bad_insert(i32 %t) {
-entry:
-;CHECK: vxorps %ymm1, %ymm1, %ymm1
-;CHECK-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7]
+; CHECK-LABEL: bad_insert:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    vmovaps %ymm0, (%eax)
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    retl
   %v2 = insertelement <8 x i32> zeroinitializer, i32 %t, i32 0
   store <8 x i32> %v2, <8 x i32> addrspace(1)* undef, align 32
-;CHECK: ret
   ret void
 }
 
diff --git a/test/CodeGen/X86/2012-11-28-merge-store-alias.ll b/test/CodeGen/X86/2012-11-28-merge-store-alias.ll
index df4f028..ed1daad 100644
--- a/test/CodeGen/X86/2012-11-28-merge-store-alias.ll
+++ b/test/CodeGen/X86/2012-11-28-merge-store-alias.ll
@@ -2,7 +2,7 @@
 
 ; CHECK: merge_stores_can
 ; CHECK: callq foo
-; CHECK-NEXT: xorps %xmm0, %xmm0
+; CHECK: xorps %xmm0, %xmm0
 ; CHECK-NEXT: movups  %xmm0
 ; CHECK: callq foo
 ; CHECK: ret
diff --git a/test/CodeGen/X86/2012-11-30-handlemove-dbg.ll b/test/CodeGen/X86/2012-11-30-handlemove-dbg.ll
index 84e77a8..d175fab 100644
--- a/test/CodeGen/X86/2012-11-30-handlemove-dbg.ll
+++ b/test/CodeGen/X86/2012-11-30-handlemove-dbg.ll
@@ -16,7 +16,7 @@ declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
 
 define signext i16 @subdivp(%struct.node.0.27* nocapture %p, double %dsq, double %tolsq, %struct.hgstruct.2.29* nocapture byval align 8 %hg) nounwind uwtable readonly ssp {
 entry:
-  call void @llvm.dbg.declare(metadata %struct.hgstruct.2.29* %hg, metadata !4, metadata !MDExpression())
+  call void @llvm.dbg.declare(metadata %struct.hgstruct.2.29* %hg, metadata !4, metadata !MDExpression()), !dbg !MDLocation(scope: !14)
   %type = getelementptr inbounds %struct.node.0.27, %struct.node.0.27* %p, i64 0, i32 0
   %0 = load i16, i16* %type, align 2
   %cmp = icmp eq i16 %0, 1
diff --git a/test/CodeGen/X86/2012-11-30-misched-dbg.ll b/test/CodeGen/X86/2012-11-30-misched-dbg.ll
index b7124c9..08ade9c 100644
--- a/test/CodeGen/X86/2012-11-30-misched-dbg.ll
+++ b/test/CodeGen/X86/2012-11-30-misched-dbg.ll
@@ -43,14 +43,14 @@ if.then3344:
   br label %if.then4073
 
 if.then4073:                                      ; preds = %if.then3344
-  call void @llvm.dbg.declare(metadata [20 x i8]* %num14075, metadata !4, metadata !MDExpression())
+  call void @llvm.dbg.declare(metadata [20 x i8]* %num14075, metadata !4, metadata !MDExpression()), !dbg !MDLocation(scope: !5)
   %arraydecay4078 = getelementptr inbounds [20 x i8], [20 x i8]* %num14075, i64 0, i64 0
   %0 = load i32, i32* undef, align 4
   %add4093 = add nsw i32 %0, 0
   %conv4094 = sitofp i32 %add4093 to float
   %div4095 = fdiv float %conv4094, 5.670000e+02
   %conv4096 = fpext float %div4095 to double
-  %call4097 = call i32 (i8*, i32, i64, i8*, ...)* @__sprintf_chk(i8* %arraydecay4078, i32 0, i64 20, i8* getelementptr inbounds ([6 x i8], [6 x i8]* @.str15, i64 0, i64 0), double %conv4096) nounwind
+  %call4097 = call i32 (i8*, i32, i64, i8*, ...) @__sprintf_chk(i8* %arraydecay4078, i32 0, i64 20, i8* getelementptr inbounds ([6 x i8], [6 x i8]* @.str15, i64 0, i64 0), double %conv4096) nounwind
   br i1 %cmp1733, label %if.then4107, label %if.else4114
 
 if.then4107:                                      ; preds = %if.then4073
@@ -108,7 +108,7 @@ cond.true:                                        ; preds = %entry
   unreachable
 
 cond.end:                                         ; preds = %entry
-  call void @llvm.dbg.declare(metadata %"class.__gnu_cxx::hash_map"* %X, metadata !31, metadata !MDExpression())
+  call void @llvm.dbg.declare(metadata %"class.__gnu_cxx::hash_map"* %X, metadata !31, metadata !MDExpression()), !dbg !MDLocation(scope: !37)
   %_M_num_elements.i.i.i.i = getelementptr inbounds %"class.__gnu_cxx::hash_map", %"class.__gnu_cxx::hash_map"* %X, i64 0, i32 0, i32 5
   invoke void @_Znwm()
           to label %exit.i unwind label %lpad2.i.i.i.i
diff --git a/test/CodeGen/X86/2012-11-30-regpres-dbg.ll b/test/CodeGen/X86/2012-11-30-regpres-dbg.ll
index 5bcff57..871c68f 100644
--- a/test/CodeGen/X86/2012-11-30-regpres-dbg.ll
+++ b/test/CodeGen/X86/2012-11-30-regpres-dbg.ll
@@ -20,7 +20,7 @@ if.then:                                          ; preds = %entry
   unreachable
 
 if.end:                                           ; preds = %entry
-  call void @llvm.dbg.declare(metadata %struct.btCompoundLeafCallback* %callback, metadata !3, metadata !MDExpression())
+  call void @llvm.dbg.declare(metadata %struct.btCompoundLeafCallback* %callback, metadata !3, metadata !MDExpression()), !dbg !MDLocation(scope: !2)
   %m = getelementptr inbounds %struct.btCompoundLeafCallback, %struct.btCompoundLeafCallback* %callback, i64 0, i32 1
   store i32 0, i32* undef, align 8
   %cmp12447 = icmp sgt i32 undef, 0
diff --git a/test/CodeGen/X86/2014-08-29-CompactUnwind.ll b/test/CodeGen/X86/2014-08-29-CompactUnwind.ll
index 3d9dc57..120eba7 100644
--- a/test/CodeGen/X86/2014-08-29-CompactUnwind.ll
+++ b/test/CodeGen/X86/2014-08-29-CompactUnwind.ll
@@ -36,7 +36,7 @@ print_shadow_bytes.exit.i: ; preds = %print_shadow_bytes.exit.i, %0
   %reg16 = getelementptr inbounds [3 x i8], [3 x i8]* %.str..str1.i, i64 0, i64 0
   %reg17 = shl i64 %iv.i, 1
   %reg19 = inttoptr i64 %reg17 to i8*
-  call void (i64*, i8*, ...)* @append(i64* %str.i, i8* getelementptr inbounds ([6 x i8], [6 x i8]* @.str2, i64 0, i64 0), i8* %reg16, i8* %reg19)
+  call void (i64*, i8*, ...) @append(i64* %str.i, i8* getelementptr inbounds ([6 x i8], [6 x i8]* @.str2, i64 0, i64 0), i8* %reg16, i8* %reg19)
   %iv.next.i = add nsw i64 %iv.i, 0
   br label %print_shadow_bytes.exit.i
 }
diff --git a/test/CodeGen/X86/GC/dynamic-frame-size.ll b/test/CodeGen/X86/GC/dynamic-frame-size.ll
new file mode 100644
index 0000000..a3583d4
--- /dev/null
+++ b/test/CodeGen/X86/GC/dynamic-frame-size.ll
@@ -0,0 +1,28 @@
+; RUN: llc < %s | FileCheck %s
+target datalayout = "e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-pc-linux-gnu"
+
+declare void @use(<4 x i8*>*)
+
+; Test that a frame which requires dynamic relocation produces a stack map
+; with a size of UINT64_MAX.
+define void @test(i8* %ptr) gc "erlang" {
+   ; 32 byte alignment (for the alloca) is larger than the default
+   ; 16 byte alignment
+   %slot = alloca <4 x i8*>
+   call void @use(<4 x i8*>* %slot);
+   ret void
+}
+
+; CHECK: .note.gc
+; CHECK-NEXT: .align 8
+; safe point count
+; CHECK .short	1
+; CHECK .long	.Ltmp0
+; stack frame size (in words)
+; CHECK .short	-1
+; stack arity (arguments on the stack)
+; CHECK .short	0
+; live root count
+; CHECK .short	0
+
diff --git a/test/CodeGen/X86/MachineSink-DbgValue.ll b/test/CodeGen/X86/MachineSink-DbgValue.ll
index 265fec4..79db445 100644
--- a/test/CodeGen/X86/MachineSink-DbgValue.ll
+++ b/test/CodeGen/X86/MachineSink-DbgValue.ll
@@ -49,5 +49,5 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnon
 !18 = !{!1}
 !19 = !{!6, !7, !10}
 !20 = !MDFile(filename: "a.c", directory: "/private/tmp")
-!21 = !{i32 0}
+!21 = !{}
 !22 = !{i32 1, !"Debug Info Version", i32 3}
diff --git a/test/CodeGen/X86/StackColoring-dbg.ll b/test/CodeGen/X86/StackColoring-dbg.ll
index da4d58a..7ac08d1 100644
--- a/test/CodeGen/X86/StackColoring-dbg.ll
+++ b/test/CodeGen/X86/StackColoring-dbg.ll
@@ -17,7 +17,7 @@ entry:
 for.body:
   call void @llvm.lifetime.end(i64 -1, i8* %0) nounwind
   call void @llvm.lifetime.start(i64 -1, i8* %x.i) nounwind
-  call void @llvm.dbg.declare(metadata i8* %x.i, metadata !22, metadata !MDExpression()) nounwind
+  call void @llvm.dbg.declare(metadata i8* %x.i, metadata !22, metadata !MDExpression()) nounwind, !dbg !MDLocation(scope: !2)
   br label %for.body
 }
 
@@ -27,7 +27,7 @@ declare void @llvm.lifetime.end(i64, i8* nocapture) nounwind
 
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!23}
-!0 = !MDCompileUnit(language: DW_LANG_C89, producer: "clang", isOptimized: true, emissionKind: 0, file: !1, enums: !2, retainedTypes: !2)
+!0 = !MDCompileUnit(language: DW_LANG_C89, producer: "clang", isOptimized: true, emissionKind: 0, file: !1, enums: !{}, retainedTypes: !{})
 !1 = !MDFile(filename: "t.c", directory: "")
 !16 = !MDBasicType(tag: DW_TAG_base_type, name: "char", size: 8, align: 8, encoding: DW_ATE_signed_char)
 !2 = !MDSubprogram()
diff --git a/test/CodeGen/X86/add-of-carry.ll b/test/CodeGen/X86/add-of-carry.ll
index 9c24be4..44b587a 100644
--- a/test/CodeGen/X86/add-of-carry.ll
+++ b/test/CodeGen/X86/add-of-carry.ll
@@ -4,43 +4,26 @@
 define i32 @test1(i32 %sum, i32 %x) nounwind readnone ssp {
 entry:
 ; CHECK-LABEL: test1:
-; CHECK: cmpl %ecx, %eax 
-; CHECK-NOT: addl
-; CHECK: adcl $0, %eax
-  %add4 = add i32 %x, %sum
-  %cmp = icmp ult i32 %add4, %x
-  %inc = zext i1 %cmp to i32
-  %z.0 = add i32 %add4, %inc
-  ret i32 %z.0
-}
-
-; Instcombine transforms test1 into test2:
-; CHECK-LABEL: test2:
 ; CHECK: movl
 ; CHECK-NEXT: addl
 ; CHECK-NEXT: adcl $0
 ; CHECK-NEXT: ret
-define i32 @test2(i32 %sum, i32 %x) nounwind readnone ssp {
-entry:
-  %uadd = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %x, i32 %sum)
-  %0 = extractvalue { i32, i1 } %uadd, 0
-  %cmp = extractvalue { i32, i1 } %uadd, 1
+  %add4 = add i32 %x, %sum
+  %cmp = icmp ult i32 %add4, %x
   %inc = zext i1 %cmp to i32
-  %z.0 = add i32 %0, %inc
+  %z.0 = add i32 %add4, %inc
   ret i32 %z.0
 }
 
 ; <rdar://problem/12579915>
-define i32 @test3(i32 %x, i32 %y, i32 %res) nounwind uwtable readnone ssp {
+define i32 @test2(i32 %x, i32 %y, i32 %res) nounwind uwtable readnone ssp {
 entry:
   %cmp = icmp ugt i32 %x, %y
   %dec = sext i1 %cmp to i32
   %dec.res = add nsw i32 %dec, %res
   ret i32 %dec.res
-; CHECK-LABEL: test3:
+; CHECK-LABEL: test2:
 ; CHECK: cmpl
 ; CHECK: sbbl
 ; CHECK: ret
 }
-
-declare { i32, i1 } @llvm.uadd.with.overflow.i32(i32, i32) nounwind readnone
diff --git a/test/CodeGen/X86/aliases.ll b/test/CodeGen/X86/aliases.ll
index 360f141..3f19a06 100644
--- a/test/CodeGen/X86/aliases.ll
+++ b/test/CodeGen/X86/aliases.ll
@@ -69,7 +69,7 @@ entry:
    %tmp0 = load i32, i32* @bar_i
    %tmp2 = call i32 @foo_f()
    %tmp3 = add i32 %tmp, %tmp2
-   %tmp4 = call %FunTy* @bar_f()
+   %tmp4 = call i32 @bar_f()
    %tmp5 = add i32 %tmp3, %tmp4
    %tmp6 = add i32 %tmp1, %tmp5
    %tmp7 = add i32 %tmp6, %tmp0
diff --git a/test/CodeGen/X86/and-or-fold.ll b/test/CodeGen/X86/and-or-fold.ll
index 836b5f1..ec39522 100644
--- a/test/CodeGen/X86/and-or-fold.ll
+++ b/test/CodeGen/X86/and-or-fold.ll
@@ -21,6 +21,6 @@ entry:
   %tmp1 = and i64 %x, 123127
   %tmp2 = or i64 %tmp1, 3
   ret i64 %tmp2
-; DARWIN-OPT:       andq $123124
+; DARWIN-OPT:       andl $123124
 ; DARWIN-OPT-NEXT:  leaq 3
 }
diff --git a/test/CodeGen/X86/andimm8.ll b/test/CodeGen/X86/andimm8.ll
index 640237d..d9e676a 100644
--- a/test/CodeGen/X86/andimm8.ll
+++ b/test/CodeGen/X86/andimm8.ll
@@ -17,3 +17,15 @@ define void @foo(i64 %zed, i64* %x) nounwind {
   store i64 %t2, i64* %x, align 8
   ret void
 }
+
+define i64 @bar(i64 %zed) nounwind {
+; CHECK:  andl     $42, %edi               # encoding: [0x83,0xe7,0x2a]
+  %t1 = and i64 %zed, 42
+  ret i64 %t1
+}
+
+define i64 @baz(i64 %zed) nounwind {
+; CHECK:  andl $2147483647, %edi      # encoding: [0x81,0xe7,0xff,0xff,0xff,0x7f]
+  %t1 = and i64 %zed, 2147483647
+  ret i64 %t1
+}
diff --git a/test/CodeGen/X86/anyregcc-crash.ll b/test/CodeGen/X86/anyregcc-crash.ll
index 3abe3d1..a7c104e 100644
--- a/test/CodeGen/X86/anyregcc-crash.ll
+++ b/test/CodeGen/X86/anyregcc-crash.ll
@@ -7,7 +7,7 @@ define i64 @anyreglimit(i64 %v1, i64 %v2, i64 %v3, i64 %v4, i64 %v5, i64 %v6,
                         i64 %v7, i64 %v8, i64 %v9, i64 %v10, i64 %v11, i64 %v12,
                         i64 %v13, i64 %v14, i64 %v15, i64 %v16) {
 entry:
-  %result = tail call anyregcc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 12, i32 15, i8* inttoptr (i64 0 to i8*), i32 16,
+  %result = tail call anyregcc i64 (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.i64(i64 12, i32 15, i8* inttoptr (i64 0 to i8*), i32 16,
                 i64 %v1, i64 %v2, i64 %v3, i64 %v4, i64 %v5, i64 %v6,
                 i64 %v7, i64 %v8, i64 %v9, i64 %v10, i64 %v11, i64 %v12,
                 i64 %v13, i64 %v14, i64 %v15, i64 %v16)
diff --git a/test/CodeGen/X86/anyregcc.ll b/test/CodeGen/X86/anyregcc.ll
index 98ba17c..129aadf 100644
--- a/test/CodeGen/X86/anyregcc.ll
+++ b/test/CodeGen/X86/anyregcc.ll
@@ -60,7 +60,7 @@
 ; CHECK-NEXT:   .long 3
 define i64 @test() nounwind ssp uwtable {
 entry:
-  call anyregcc void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 0, i32 15, i8* null, i32 2, i32 1, i32 2, i64 3)
+  call anyregcc void (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.void(i64 0, i32 15, i8* null, i32 2, i32 1, i32 2, i64 3)
   ret i64 0
 }
 
@@ -82,7 +82,7 @@ entry:
 define i64 @property_access1(i8* %obj) nounwind ssp uwtable {
 entry:
   %f = inttoptr i64 12297829382473034410 to i8*
-  %ret = call anyregcc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 1, i32 15, i8* %f, i32 1, i8* %obj)
+  %ret = call anyregcc i64 (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.i64(i64 1, i32 15, i8* %f, i32 1, i8* %obj)
   ret i64 %ret
 }
 
@@ -105,7 +105,7 @@ define i64 @property_access2() nounwind ssp uwtable {
 entry:
   %obj = alloca i64, align 8
   %f = inttoptr i64 12297829382473034410 to i8*
-  %ret = call anyregcc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 2, i32 15, i8* %f, i32 1, i64* %obj)
+  %ret = call anyregcc i64 (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.i64(i64 2, i32 15, i8* %f, i32 1, i64* %obj)
   ret i64 %ret
 }
 
@@ -128,7 +128,7 @@ define i64 @property_access3() nounwind ssp uwtable {
 entry:
   %obj = alloca i64, align 8
   %f = inttoptr i64 12297829382473034410 to i8*
-  %ret = call anyregcc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 3, i32 15, i8* %f, i32 0, i64* %obj)
+  %ret = call anyregcc i64 (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.i64(i64 3, i32 15, i8* %f, i32 0, i64* %obj)
   ret i64 %ret
 }
 
@@ -210,7 +210,7 @@ entry:
 define i64 @anyreg_test1(i8* %a1, i8* %a2, i8* %a3, i8* %a4, i8* %a5, i8* %a6, i8* %a7, i8* %a8, i8* %a9, i8* %a10, i8* %a11, i8* %a12, i8* %a13) nounwind ssp uwtable {
 entry:
   %f = inttoptr i64 12297829382473034410 to i8*
-  %ret = call anyregcc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 4, i32 15, i8* %f, i32 13, i8* %a1, i8* %a2, i8* %a3, i8* %a4, i8* %a5, i8* %a6, i8* %a7, i8* %a8, i8* %a9, i8* %a10, i8* %a11, i8* %a12, i8* %a13)
+  %ret = call anyregcc i64 (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.i64(i64 4, i32 15, i8* %f, i32 13, i8* %a1, i8* %a2, i8* %a3, i8* %a4, i8* %a5, i8* %a6, i8* %a7, i8* %a8, i8* %a9, i8* %a10, i8* %a11, i8* %a12, i8* %a13)
   ret i64 %ret
 }
 
@@ -292,7 +292,7 @@ entry:
 define i64 @anyreg_test2(i8* %a1, i8* %a2, i8* %a3, i8* %a4, i8* %a5, i8* %a6, i8* %a7, i8* %a8, i8* %a9, i8* %a10, i8* %a11, i8* %a12, i8* %a13) nounwind ssp uwtable {
 entry:
   %f = inttoptr i64 12297829382473034410 to i8*
-  %ret = call anyregcc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 5, i32 15, i8* %f, i32 8, i8* %a1, i8* %a2, i8* %a3, i8* %a4, i8* %a5, i8* %a6, i8* %a7, i8* %a8, i8* %a9, i8* %a10, i8* %a11, i8* %a12, i8* %a13)
+  %ret = call anyregcc i64 (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.i64(i64 5, i32 15, i8* %f, i32 8, i8* %a1, i8* %a2, i8* %a3, i8* %a4, i8* %a5, i8* %a6, i8* %a7, i8* %a8, i8* %a9, i8* %a10, i8* %a11, i8* %a12, i8* %a13)
   ret i64 %ret
 }
 
@@ -320,7 +320,7 @@ entry:
 ; CHECK-NEXT: .long  0
 define i64 @patchpoint_spilldef(i64 %p1, i64 %p2, i64 %p3, i64 %p4) {
 entry:
-  %result = tail call anyregcc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 12, i32 15, i8* inttoptr (i64 0 to i8*), i32 2, i64 %p1, i64 %p2)
+  %result = tail call anyregcc i64 (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.i64(i64 12, i32 15, i8* inttoptr (i64 0 to i8*), i32 2, i64 %p1, i64 %p2)
   tail call void asm sideeffect "nop", "~{ax},~{bx},~{cx},~{dx},~{bp},~{si},~{di},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() nounwind
   ret i64 %result
 }
@@ -360,7 +360,7 @@ entry:
 define i64 @patchpoint_spillargs(i64 %p1, i64 %p2, i64 %p3, i64 %p4) {
 entry:
   tail call void asm sideeffect "nop", "~{ax},~{bx},~{cx},~{dx},~{bp},~{si},~{di},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() nounwind
-  %result = tail call anyregcc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 13, i32 15, i8* inttoptr (i64 0 to i8*), i32 2, i64 %p1, i64 %p2, i64 %p3, i64 %p4)
+  %result = tail call anyregcc i64 (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.i64(i64 13, i32 15, i8* inttoptr (i64 0 to i8*), i32 2, i64 %p1, i64 %p2, i64 %p3, i64 %p4)
   ret i64 %result
 }
 
diff --git a/test/CodeGen/X86/atomic64.ll b/test/CodeGen/X86/atomic64.ll
index 11b4e68..c6b1c39 100644
--- a/test/CodeGen/X86/atomic64.ll
+++ b/test/CodeGen/X86/atomic64.ll
@@ -48,7 +48,7 @@ define void @atomic_fetch_and64() nounwind {
 ; X64:       lock
 ; X64:       andq $3
   %t2 = atomicrmw and  i64* @sc64, i64 5 acquire
-; X64:       andq
+; X64:       andl
 ; X64:       lock
 ; X64:       cmpxchgq
   %t3 = atomicrmw and  i64* @sc64, i64 %t2 acquire
diff --git a/test/CodeGen/X86/avoid-loop-align.ll b/test/CodeGen/X86/avoid-loop-align.ll
index 5d00ed0..d82cf94 100644
--- a/test/CodeGen/X86/avoid-loop-align.ll
+++ b/test/CodeGen/X86/avoid-loop-align.ll
@@ -11,7 +11,7 @@
 
 define i8* @test(i8* %Q, i32* %L) nounwind {
 entry:
-	%tmp = tail call i32 (...)* @foo() nounwind		; <i32> [#uses=2]
+	%tmp = tail call i32 (...) @foo() nounwind		; <i32> [#uses=2]
 	%tmp1 = inttoptr i32 %tmp to i8*		; <i8*> [#uses=1]
 	br label %bb1
 
diff --git a/test/CodeGen/X86/avx-bitcast.ll b/test/CodeGen/X86/avx-bitcast.ll
index bb3e5a5..e34c20f 100644
--- a/test/CodeGen/X86/avx-bitcast.ll
+++ b/test/CodeGen/X86/avx-bitcast.ll
@@ -1,8 +1,11 @@
-; RUN: llc < %s -O0 -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s
+; RUN: llc < %s -O0 -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s
 
-; CHECK: vmovsd (%
-; CHECK-NEXT: vmovq %xmm
 define i64 @bitcasti64tof64() {
+; CHECK-LABEL: bitcasti64tof64:
+; CHECK:       # BB#0:
+; CHECK:         vmovsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    vmovq %xmm0, %rax
+; CHECK-NEXT:    retq
   %a = load double, double* undef
   %b = bitcast double %a to i64
   ret i64 %b
diff --git a/test/CodeGen/X86/avx-cvt-2.ll b/test/CodeGen/X86/avx-cvt-2.ll
index 8cc7190..583c7d5 100644
--- a/test/CodeGen/X86/avx-cvt-2.ll
+++ b/test/CodeGen/X86/avx-cvt-2.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx | FileCheck %s
 
 ; Check that we generate vector conversion from float to narrower int types
 
@@ -8,8 +8,16 @@
 
 define void @fptoui16(%f32vec_t %a, %i16vec_t *%p) {
 ; CHECK-LABEL: fptoui16:
-; CHECK: vcvttps2dq %ymm
-; CHECK-NOT: vcvttss2si
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vcvttps2dq %ymm0, %ymm0
+; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; CHECK-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; CHECK-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; CHECK-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; CHECK-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; CHECK-NEXT:    vmovdqa %xmm0, (%rdi)
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    retq
   %b = fptoui %f32vec_t %a to %i16vec_t
   store %i16vec_t %b, %i16vec_t * %p
   ret void
@@ -17,8 +25,16 @@ define void @fptoui16(%f32vec_t %a, %i16vec_t *%p) {
 
 define void @fptosi16(%f32vec_t %a, %i16vec_t *%p) {
 ; CHECK-LABEL: fptosi16:
-; CHECK: vcvttps2dq %ymm
-; CHECK-NOT: vcvttss2si
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vcvttps2dq %ymm0, %ymm0
+; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; CHECK-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; CHECK-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; CHECK-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; CHECK-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; CHECK-NEXT:    vmovdqa %xmm0, (%rdi)
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    retq
   %b = fptosi %f32vec_t %a to %i16vec_t
   store %i16vec_t %b, %i16vec_t * %p
   ret void
@@ -26,8 +42,17 @@ define void @fptosi16(%f32vec_t %a, %i16vec_t *%p) {
 
 define void @fptoui8(%f32vec_t %a, %i8vec_t *%p) {
 ; CHECK-LABEL: fptoui8:
-; CHECK: vcvttps2dq %ymm
-; CHECK-NOT: vcvttss2si
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vcvttps2dq %ymm0, %ymm0
+; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; CHECK-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; CHECK-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; CHECK-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; CHECK-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; CHECK-NEXT:    vmovq %xmm0, (%rdi)
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    retq
   %b = fptoui %f32vec_t %a to %i8vec_t
   store %i8vec_t %b, %i8vec_t * %p
   ret void
@@ -35,8 +60,17 @@ define void @fptoui8(%f32vec_t %a, %i8vec_t *%p) {
 
 define void @fptosi8(%f32vec_t %a, %i8vec_t *%p) {
 ; CHECK-LABEL: fptosi8:
-; CHECK: vcvttps2dq %ymm
-; CHECK-NOT: vcvttss2si
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vcvttps2dq %ymm0, %ymm0
+; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; CHECK-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; CHECK-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; CHECK-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; CHECK-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; CHECK-NEXT:    vmovq %xmm0, (%rdi)
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    retq
   %b = fptosi %f32vec_t %a to %i8vec_t
   store %i8vec_t %b, %i8vec_t * %p
   ret void
diff --git a/test/CodeGen/X86/avx-cvt.ll b/test/CodeGen/X86/avx-cvt.ll
index 9f154ab..6df3e53 100644
--- a/test/CodeGen/X86/avx-cvt.ll
+++ b/test/CodeGen/X86/avx-cvt.ll
@@ -1,84 +1,122 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s
 
-; CHECK: vcvtdq2ps %ymm
 define <8 x float> @sitofp00(<8 x i32> %a) nounwind {
+; CHECK-LABEL: sitofp00:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vcvtdq2ps %ymm0, %ymm0
+; CHECK-NEXT:    retq
   %b = sitofp <8 x i32> %a to <8 x float>
   ret <8 x float> %b
 }
 
-; CHECK: vcvttps2dq %ymm
 define <8 x i32> @fptosi00(<8 x float> %a) nounwind {
+; CHECK-LABEL: fptosi00:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vcvttps2dq %ymm0, %ymm0
+; CHECK-NEXT:    retq
   %b = fptosi <8 x float> %a to <8 x i32>
   ret <8 x i32> %b
 }
 
-; CHECK: vcvtdq2pd %xmm
 define <4 x double> @sitofp01(<4 x i32> %a) {
+; CHECK-LABEL: sitofp01:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vcvtdq2pd %xmm0, %ymm0
+; CHECK-NEXT:    retq
   %b = sitofp <4 x i32> %a to <4 x double>
   ret <4 x double> %b
 }
 
-; CHECK: vcvtdq2ps %ymm
 define <8 x float> @sitofp02(<8 x i16> %a) {
+; CHECK-LABEL: sitofp02:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpmovsxwd %xmm0, %xmm1
+; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; CHECK-NEXT:    vpmovsxwd %xmm0, %xmm0
+; CHECK-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; CHECK-NEXT:    vcvtdq2ps %ymm0, %ymm0
+; CHECK-NEXT:    retq
   %b = sitofp <8 x i16> %a to <8 x float>
   ret <8 x float> %b
 }
 
-; CHECK: vcvttpd2dqy %ymm
 define <4 x i32> @fptosi01(<4 x double> %a) {
+; CHECK-LABEL: fptosi01:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vcvttpd2dqy %ymm0, %xmm0
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    retq
   %b = fptosi <4 x double> %a to <4 x i32>
   ret <4 x i32> %b
 }
 
-; CHECK: vcvtpd2psy %ymm
-; CHECK-NEXT: vcvtpd2psy %ymm
-; CHECK-NEXT: vinsertf128 $1
 define <8 x float> @fptrunc00(<8 x double> %b) nounwind {
+; CHECK-LABEL: fptrunc00:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vcvtpd2psy %ymm0, %xmm0
+; CHECK-NEXT:    vcvtpd2psy %ymm1, %xmm1
+; CHECK-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; CHECK-NEXT:    retq
   %a = fptrunc <8 x double> %b to <8 x float>
   ret <8 x float> %a
 }
 
-; CHECK: vcvtps2pd %xmm
 define <4 x double> @fpext00(<4 x float> %b) nounwind {
+; CHECK-LABEL: fpext00:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vcvtps2pd %xmm0, %ymm0
+; CHECK-NEXT:    retq
   %a = fpext <4 x float> %b to <4 x double>
   ret <4 x double> %a
 }
 
-; CHECK: vcvtsi2sdq (%
 define double @funcA(i64* nocapture %e) nounwind uwtable readonly ssp {
-entry:
+; CHECK-LABEL: funcA:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vcvtsi2sdq (%rdi), %xmm0, %xmm0
+; CHECK-NEXT:    retq
   %tmp1 = load i64, i64* %e, align 8
   %conv = sitofp i64 %tmp1 to double
   ret double %conv
 }
 
-; CHECK: vcvtsi2sdl (%
 define double @funcB(i32* nocapture %e) nounwind uwtable readonly ssp {
-entry:
+; CHECK-LABEL: funcB:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vcvtsi2sdl (%rdi), %xmm0, %xmm0
+; CHECK-NEXT:    retq
   %tmp1 = load i32, i32* %e, align 4
   %conv = sitofp i32 %tmp1 to double
   ret double %conv
 }
 
-; CHECK: vcvtsi2ssl (%
 define float @funcC(i32* nocapture %e) nounwind uwtable readonly ssp {
-entry:
+; CHECK-LABEL: funcC:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vcvtsi2ssl (%rdi), %xmm0, %xmm0
+; CHECK-NEXT:    retq
   %tmp1 = load i32, i32* %e, align 4
   %conv = sitofp i32 %tmp1 to float
   ret float %conv
 }
 
-; CHECK: vcvtsi2ssq  (%
 define float @funcD(i64* nocapture %e) nounwind uwtable readonly ssp {
-entry:
+; CHECK-LABEL: funcD:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vcvtsi2ssq (%rdi), %xmm0, %xmm0
+; CHECK-NEXT:    retq
   %tmp1 = load i64, i64* %e, align 8
   %conv = sitofp i64 %tmp1 to float
   ret float %conv
 }
 
-; CHECK: vcvtss2sd
 define void @fpext() nounwind uwtable {
-entry:
+; CHECK-LABEL: fpext:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
+; CHECK-NEXT:    vmovsd %xmm0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    retq
   %f = alloca float, align 4
   %d = alloca double, align 8
   %tmp = load float, float* %f, align 4
@@ -88,16 +126,20 @@ entry:
 }
 
 define double @nearbyint_f64(double %a) {
-; CHECK-LABEL: nearbyint_f64
-; CHECK: vroundsd $12
+; CHECK-LABEL: nearbyint_f64:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vroundsd $12, %xmm0, %xmm0, %xmm0
+; CHECK-NEXT:    retq
   %res = call double @llvm.nearbyint.f64(double %a)
   ret double %res
 }
 declare double @llvm.nearbyint.f64(double %p)
 
 define float @floor_f32(float %a) {
-; CHECK-LABEL: floor_f32
-; CHECK: vroundss $1
+; CHECK-LABEL: floor_f32:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vroundss $1, %xmm0, %xmm0, %xmm0
+; CHECK-NEXT:    retq
   %res = call float @llvm.floor.f32(float %a)
   ret float %res
 }
diff --git a/test/CodeGen/X86/avx-shift.ll b/test/CodeGen/X86/avx-shift.ll
index a70d45a..83585b5 100644
--- a/test/CodeGen/X86/avx-shift.ll
+++ b/test/CodeGen/X86/avx-shift.ll
@@ -1,147 +1,224 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s
 
 ;;; Shift left
-; CHECK: vpslld
-; CHECK: vpslld
-define <8 x i32> @vshift00(<8 x i32> %a) nounwind readnone {
+define <8 x i32> @vshift00(<8 x i32> %a) {
+; CHECK-LABEL: vshift00:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpslld $2, %xmm0, %xmm1
+; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; CHECK-NEXT:    vpslld $2, %xmm0, %xmm0
+; CHECK-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; CHECK-NEXT:    retq
   %s = shl <8 x i32> %a, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32
 2>
   ret <8 x i32> %s
 }
 
-; CHECK: vpsllw
-; CHECK: vpsllw
-define <16 x i16> @vshift01(<16 x i16> %a) nounwind readnone {
+define <16 x i16> @vshift01(<16 x i16> %a) {
+; CHECK-LABEL: vshift01:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpsllw $2, %xmm0, %xmm1
+; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; CHECK-NEXT:    vpsllw $2, %xmm0, %xmm0
+; CHECK-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; CHECK-NEXT:    retq
   %s = shl <16 x i16> %a, <i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2>
   ret <16 x i16> %s
 }
 
-; CHECK: vpsllq
-; CHECK: vpsllq
-define <4 x i64> @vshift02(<4 x i64> %a) nounwind readnone {
+define <4 x i64> @vshift02(<4 x i64> %a) {
+; CHECK-LABEL: vshift02:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpsllq $2, %xmm0, %xmm1
+; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; CHECK-NEXT:    vpsllq $2, %xmm0, %xmm0
+; CHECK-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; CHECK-NEXT:    retq
   %s = shl <4 x i64> %a, <i64 2, i64 2, i64 2, i64 2>
   ret <4 x i64> %s
 }
 
 ;;; Logical Shift right
-; CHECK: vpsrld
-; CHECK: vpsrld
-define <8 x i32> @vshift03(<8 x i32> %a) nounwind readnone {
+define <8 x i32> @vshift03(<8 x i32> %a) {
+; CHECK-LABEL: vshift03:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpsrld $2, %xmm0, %xmm1
+; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; CHECK-NEXT:    vpsrld $2, %xmm0, %xmm0
+; CHECK-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; CHECK-NEXT:    retq
   %s = lshr <8 x i32> %a, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32
 2>
   ret <8 x i32> %s
 }
 
-; CHECK: vpsrlw
-; CHECK: vpsrlw
-define <16 x i16> @vshift04(<16 x i16> %a) nounwind readnone {
+define <16 x i16> @vshift04(<16 x i16> %a) {
+; CHECK-LABEL: vshift04:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpsrlw $2, %xmm0, %xmm1
+; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; CHECK-NEXT:    vpsrlw $2, %xmm0, %xmm0
+; CHECK-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; CHECK-NEXT:    retq
   %s = lshr <16 x i16> %a, <i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2>
   ret <16 x i16> %s
 }
 
-; CHECK: vpsrlq
-; CHECK: vpsrlq
-define <4 x i64> @vshift05(<4 x i64> %a) nounwind readnone {
+define <4 x i64> @vshift05(<4 x i64> %a) {
+; CHECK-LABEL: vshift05:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpsrlq $2, %xmm0, %xmm1
+; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; CHECK-NEXT:    vpsrlq $2, %xmm0, %xmm0
+; CHECK-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; CHECK-NEXT:    retq
   %s = lshr <4 x i64> %a, <i64 2, i64 2, i64 2, i64 2>
   ret <4 x i64> %s
 }
 
 ;;; Arithmetic Shift right
-; CHECK: vpsrad
-; CHECK: vpsrad
-define <8 x i32> @vshift06(<8 x i32> %a) nounwind readnone {
+define <8 x i32> @vshift06(<8 x i32> %a) {
+; CHECK-LABEL: vshift06:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpsrad $2, %xmm0, %xmm1
+; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; CHECK-NEXT:    vpsrad $2, %xmm0, %xmm0
+; CHECK-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; CHECK-NEXT:    retq
   %s = ashr <8 x i32> %a, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32
 2>
   ret <8 x i32> %s
 }
 
-; CHECK: vpsraw
-; CHECK: vpsraw
-define <16 x i16> @vshift07(<16 x i16> %a) nounwind readnone {
+define <16 x i16> @vshift07(<16 x i16> %a) {
+; CHECK-LABEL: vshift07:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpsraw $2, %xmm0, %xmm1
+; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; CHECK-NEXT:    vpsraw $2, %xmm0, %xmm0
+; CHECK-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; CHECK-NEXT:    retq
   %s = ashr <16 x i16> %a, <i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2>
   ret <16 x i16> %s
 }
 
-; CHECK: vpsrlw
-; CHECK: pand
-; CHECK: pxor
-; CHECK: psubb
-; CHECK: vpsrlw
-; CHECK: pand
-; CHECK: pxor
-; CHECK: psubb
-define <32 x i8> @vshift09(<32 x i8> %a) nounwind readnone {
+define <32 x i8> @vshift09(<32 x i8> %a) {
+; CHECK-LABEL: vshift09:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; CHECK-NEXT:    vpsrlw $2, %xmm1, %xmm1
+; CHECK-NEXT:    vmovdqa {{.*#+}} xmm2 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
+; CHECK-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; CHECK-NEXT:    vmovdqa {{.*#+}} xmm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; CHECK-NEXT:    vpxor %xmm3, %xmm1, %xmm1
+; CHECK-NEXT:    vpsubb %xmm3, %xmm1, %xmm1
+; CHECK-NEXT:    vpsrlw $2, %xmm0, %xmm0
+; CHECK-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; CHECK-NEXT:    vpxor %xmm3, %xmm0, %xmm0
+; CHECK-NEXT:    vpsubb %xmm3, %xmm0, %xmm0
+; CHECK-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; CHECK-NEXT:    retq
   %s = ashr <32 x i8> %a, <i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2>
   ret <32 x i8> %s
 }
 
-; CHECK: pxor
-; CHECK: pcmpgtb
-; CHECK: pcmpgtb
-define <32 x i8> @vshift10(<32 x i8> %a) nounwind readnone {
+define <32 x i8> @vshift10(<32 x i8> %a) {
+; CHECK-LABEL: vshift10:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vpcmpgtb %xmm1, %xmm2, %xmm1
+; CHECK-NEXT:    vpcmpgtb %xmm0, %xmm2, %xmm0
+; CHECK-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; CHECK-NEXT:    retq
   %s = ashr <32 x i8> %a, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
   ret <32 x i8> %s
 }
 
-; CHECK: vpsrlw
-; CHECK: pand
-; CHECK: vpsrlw
-; CHECK: pand
-define <32 x i8> @vshift11(<32 x i8> %a) nounwind readnone {
+define <32 x i8> @vshift11(<32 x i8> %a) {
+; CHECK-LABEL: vshift11:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; CHECK-NEXT:    vpsrlw $2, %xmm1, %xmm1
+; CHECK-NEXT:    vmovdqa {{.*#+}} xmm2 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
+; CHECK-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; CHECK-NEXT:    vpsrlw $2, %xmm0, %xmm0
+; CHECK-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; CHECK-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; CHECK-NEXT:    retq
   %s = lshr <32 x i8> %a, <i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2>
   ret <32 x i8> %s
 }
 
-; CHECK: vpsllw
-; CHECK: pand
-; CHECK: vpsllw
-; CHECK: pand
-define <32 x i8> @vshift12(<32 x i8> %a) nounwind readnone {
+define <32 x i8> @vshift12(<32 x i8> %a) {
+; CHECK-LABEL: vshift12:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; CHECK-NEXT:    vpsllw $2, %xmm1, %xmm1
+; CHECK-NEXT:    vmovdqa {{.*#+}} xmm2 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
+; CHECK-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; CHECK-NEXT:    vpsllw $2, %xmm0, %xmm0
+; CHECK-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; CHECK-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; CHECK-NEXT:    retq
   %s = shl <32 x i8> %a, <i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2>
   ret <32 x i8> %s
 }
 
 ;;; Support variable shifts
-; CHECK: _vshift08
-; CHECK: vpslld $23
-; CHECK: vextractf128 $1
-; CHECK: vpslld $23
-; CHECK: ret
-define <8 x i32> @vshift08(<8 x i32> %a) nounwind {
+define <8 x i32> @vshift08(<8 x i32> %a)  {
+; CHECK-LABEL: vshift08:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpslld $23, %xmm0, %xmm1
+; CHECK-NEXT:    vmovdqa {{.*#+}} xmm2 = [1065353216,1065353216,1065353216,1065353216]
+; CHECK-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
+; CHECK-NEXT:    vcvttps2dq %xmm1, %xmm1
+; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; CHECK-NEXT:    vpslld $23, %xmm0, %xmm0
+; CHECK-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
+; CHECK-NEXT:    vcvttps2dq %xmm0, %xmm0
+; CHECK-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; CHECK-NEXT:    retq
   %bitop = shl <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, %a
   ret <8 x i32> %bitop
 }
 
 ; PR15141
-; CHECK: _vshift13:
-; CHECK-NOT: vpsll
-; CHECK-NOT: vcvttps2dq
-; CHECK: vpmulld
 define <4 x i32> @vshift13(<4 x i32> %in) {
+; CHECK-LABEL: vshift13:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-NEXT:    retq
   %T = shl <4 x i32> %in, <i32 0, i32 1, i32 2, i32 4>
   ret <4 x i32> %T
 }
 
 ;;; Uses shifts for sign extension
-; CHECK: _sext_v16i16
-; CHECK: vpsllw
-; CHECK: vpsraw
-; CHECK: vpsllw
-; CHECK: vpsraw
-; CHECK: vinsertf128
-define <16 x i16> @sext_v16i16(<16 x i16> %a) nounwind {
+define <16 x i16> @sext_v16i16(<16 x i16> %a)  {
+; CHECK-LABEL: sext_v16i16:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpsllw $8, %xmm0, %xmm1
+; CHECK-NEXT:    vpsraw $8, %xmm1, %xmm1
+; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; CHECK-NEXT:    vpsllw $8, %xmm0, %xmm0
+; CHECK-NEXT:    vpsraw $8, %xmm0, %xmm0
+; CHECK-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; CHECK-NEXT:    retq
   %b = trunc <16 x i16> %a to <16 x i8>
   %c = sext <16 x i8> %b to <16 x i16>
   ret <16 x i16> %c
 }
 
-; CHECK: _sext_v8i32
-; CHECK: vpslld
-; CHECK: vpsrad
-; CHECK: vpslld
-; CHECK: vpsrad
-; CHECK: vinsertf128
-define <8 x i32> @sext_v8i32(<8 x i32> %a) nounwind {
+define <8 x i32> @sext_v8i32(<8 x i32> %a)  {
+; CHECK-LABEL: sext_v8i32:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpslld $16, %xmm0, %xmm1
+; CHECK-NEXT:    vpsrad $16, %xmm1, %xmm1
+; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; CHECK-NEXT:    vpslld $16, %xmm0, %xmm0
+; CHECK-NEXT:    vpsrad $16, %xmm0, %xmm0
+; CHECK-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; CHECK-NEXT:    retq
   %b = trunc <8 x i32> %a to <8 x i16>
   %c = sext <8 x i16> %b to <8 x i32>
   ret <8 x i32> %c
diff --git a/test/CodeGen/X86/avx-varargs-x86_64.ll b/test/CodeGen/X86/avx-varargs-x86_64.ll
index f550733..7ce5e19 100644
--- a/test/CodeGen/X86/avx-varargs-x86_64.ll
+++ b/test/CodeGen/X86/avx-varargs-x86_64.ll
@@ -10,6 +10,6 @@ declare i32 @f(i32, ...)
 define void @test1() nounwind uwtable ssp {
 entry:
   %0 = load <8 x float>, <8 x float>* @x, align 32
-  %call = call i32 (i32, ...)* @f(i32 1, <8 x float> %0)
+  %call = call i32 (i32, ...) @f(i32 1, <8 x float> %0)
   ret void
 }
diff --git a/test/CodeGen/X86/avx512-fma-intrinsics.ll b/test/CodeGen/X86/avx512-fma-intrinsics.ll
index 9b82c88..9814a61 100644
--- a/test/CodeGen/X86/avx512-fma-intrinsics.ll
+++ b/test/CodeGen/X86/avx512-fma-intrinsics.ll
@@ -1,50 +1,8 @@
 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl --show-mc-encoding | FileCheck %s
 
-define <16 x float> @test_x86_vfmadd_ps_z(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
-  ; CHECK-LABEL: test_x86_vfmadd_ps_z
-  ; CHECK: vfmadd213ps %zmm
-  %res = call <16 x float> @llvm.x86.fma.mask.vfmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 4) nounwind
-  ret <16 x float> %res
-}
-declare <16 x float> @llvm.x86.fma.mask.vfmadd.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32) nounwind readnone
-
-define <16 x float> @test_mask_vfmadd_ps(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask) {
-  ; CHECK-LABEL: test_mask_vfmadd_ps
-  ; CHECK: vfmadd213ps %zmm
-  %res = call <16 x float> @llvm.x86.fma.mask.vfmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask, i32 4) nounwind
-  ret <16 x float> %res
-}
-
-define <8 x double> @test_x86_vfmadd_pd_z(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
-  ; CHECK-LABEL: test_x86_vfmadd_pd_z
-  ; CHECK: vfmadd213pd %zmm
-  %res = call <8 x double> @llvm.x86.fma.mask.vfmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 4) nounwind
-  ret <8 x double> %res
-}
-
-define <8 x double> @test_mask_fmadd_pd(<8 x double> %a, <8 x double> %b, <8 x double> %c, i8 %mask) {
-; CHECK-LABEL: test_mask_fmadd_pd:
-; CHECK: vfmadd213pd %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x49,0xa8,0xc2]
-  %res = call <8 x double> @llvm.x86.fma.mask.vfmadd.pd.512(<8 x double> %a, <8 x double> %b, <8 x double> %c, i8 %mask, i32 4)
-  ret <8 x double> %res
-}
-
+declare <16 x float> @llvm.x86.fma.mask.vfmadd.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)
 declare <8 x double> @llvm.x86.fma.mask.vfmadd.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32)
-
-define <16 x float> @test_x86_vfmsubps_z(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
-  ; CHECK-LABEL: test_x86_vfmsubps_z
-  ; CHECK: vfmsub213ps %zmm
-  %res = call <16 x float> @llvm.x86.fma.mask.vfmsub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 4) nounwind
-  ret <16 x float> %res
-}
-declare <16 x float> @llvm.x86.fma.mask.vfmsub.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32) nounwind readnone
-
-define <16 x float> @test_mask_vfmsub_ps(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask) {
-  ; CHECK-LABEL: test_mask_vfmsub_ps
-  ; CHECK: vfmsub213ps %zmm
-  %res = call <16 x float> @llvm.x86.fma.mask.vfmsub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask, i32 4) nounwind
-  ret <16 x float> %res
-}
+declare <16 x float> @llvm.x86.fma.mask.vfmsub.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)
 
 define <8 x double> @test_x86_vfmsubpd_z(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
   ; CHECK-LABEL: test_x86_vfmsubpd_z
diff --git a/test/CodeGen/X86/avx512-intrinsics.ll b/test/CodeGen/X86/avx512-intrinsics.ll
index 46581f7..07d984a 100644
--- a/test/CodeGen/X86/avx512-intrinsics.ll
+++ b/test/CodeGen/X86/avx512-intrinsics.ll
@@ -515,14 +515,6 @@ define <16 x i32> @test_vpmaxsd(<16 x i32> %a0, <16 x i32> %a1) {
 }
 declare <16 x i32> @llvm.x86.avx512.mask.pmaxs.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
 
-define <8 x i64> @test_vpmuludq(<16 x i32> %a0, <16 x i32> %a1) {
-  ; CHECK: vpmuludq {{.*}}encoding: [0x62,0xf1,0xfd,0x48,0xf4,0xc1]
-  %res = call <8 x i64> @llvm.x86.avx512.mask.pmulu.dq.512(<16 x i32> %a0, <16 x i32> %a1,
-                    <8 x i64>zeroinitializer, i8 -1)
-  ret <8 x i64> %res
-}
-declare <8 x i64> @llvm.x86.avx512.mask.pmulu.dq.512(<16 x i32>, <16 x i32>, <8 x i64>, i8)
-
 define i8 @test_vptestmq(<8 x i64> %a0, <8 x i64> %a1) {
   ; CHECK: vptestmq {{.*}}encoding: [0x62,0xf2,0xfd,0x48,0x27,0xc1]
   %res = call i8 @llvm.x86.avx512.mask.ptestm.q.512(<8 x i64> %a0, <8 x i64> %a1, i8 -1)
@@ -1606,3 +1598,568 @@ define <8 x double> @test_vmulpd_mask_rz(<8 x double> %a0, <8 x double> %a1, i8
                     <8 x double> zeroinitializer, i8 %mask, i32 3)
   ret <8 x double> %res
 }
+
+define <16 x i32> @test_xor_epi32(<16 x i32> %a, <16 x i32> %b) {
+  ;CHECK-LABEL: test_xor_epi32
+  ;CHECK: vpxord {{.*}}encoding: [0x62,0xf1,0x7d,0x48,0xef,0xc1]
+  %res = call <16 x i32> @llvm.x86.avx512.mask.pxor.d.512(<16 x i32> %a,<16 x i32> %b, <16 x i32>zeroinitializer, i16 -1)
+  ret < 16 x i32> %res
+}
+
+define <16 x i32> @test_mask_xor_epi32(<16 x i32> %a,<16 x i32> %b, <16 x i32> %passThru, i16 %mask) {
+  ;CHECK-LABEL: test_mask_xor_epi32
+  ;CHECK: vpxord %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0xef,0xd1]
+  %res = call <16 x i32> @llvm.x86.avx512.mask.pxor.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask)
+  ret < 16 x i32> %res
+}
+
+declare <16 x i32> @llvm.x86.avx512.mask.pxor.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
+
+define <16 x i32> @test_or_epi32(<16 x i32> %a, <16 x i32> %b) {
+  ;CHECK-LABEL: test_or_epi32
+  ;CHECK: vpord {{.*}}encoding: [0x62,0xf1,0x7d,0x48,0xeb,0xc1]
+  %res = call <16 x i32> @llvm.x86.avx512.mask.por.d.512(<16 x i32> %a,<16 x i32> %b, <16 x i32>zeroinitializer, i16 -1)
+  ret < 16 x i32> %res
+}
+
+define <16 x i32> @test_mask_or_epi32(<16 x i32> %a,<16 x i32> %b, <16 x i32> %passThru, i16 %mask) {
+  ;CHECK-LABEL: test_mask_or_epi32
+  ;CHECK: vpord %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0xeb,0xd1]
+  %res = call <16 x i32> @llvm.x86.avx512.mask.por.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask)
+  ret < 16 x i32> %res
+}
+
+declare <16 x i32> @llvm.x86.avx512.mask.por.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
+
+define <16 x i32> @test_and_epi32(<16 x i32> %a, <16 x i32> %b) {
+  ;CHECK-LABEL: test_and_epi32
+  ;CHECK: vpandd {{.*}}encoding: [0x62,0xf1,0x7d,0x48,0xdb,0xc1]
+  %res = call <16 x i32> @llvm.x86.avx512.mask.pand.d.512(<16 x i32> %a,<16 x i32> %b, <16 x i32>zeroinitializer, i16 -1)
+  ret < 16 x i32> %res
+}
+
+define <16 x i32> @test_mask_and_epi32(<16 x i32> %a,<16 x i32> %b, <16 x i32> %passThru, i16 %mask) {
+  ;CHECK-LABEL: test_mask_and_epi32
+  ;CHECK: vpandd %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0xdb,0xd1]
+  %res = call <16 x i32> @llvm.x86.avx512.mask.pand.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask)
+  ret < 16 x i32> %res
+}
+
+declare <16 x i32> @llvm.x86.avx512.mask.pand.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
+
+define <8 x i64> @test_xor_epi64(<8 x i64> %a, <8 x i64> %b) {
+  ;CHECK-LABEL: test_xor_epi64
+  ;CHECK: vpxorq {{.*}}encoding: [0x62,0xf1,0xfd,0x48,0xef,0xc1]
+  %res = call <8 x i64> @llvm.x86.avx512.mask.pxor.q.512(<8 x i64> %a,<8 x i64> %b, <8 x i64>zeroinitializer, i8 -1)
+  ret < 8 x i64> %res
+}
+
+define <8 x i64> @test_mask_xor_epi64(<8 x i64> %a,<8 x i64> %b, <8 x i64> %passThru, i8 %mask) {
+  ;CHECK-LABEL: test_mask_xor_epi64
+  ;CHECK: vpxorq %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0xef,0xd1]
+  %res = call <8 x i64> @llvm.x86.avx512.mask.pxor.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask)
+  ret < 8 x i64> %res
+}
+
+declare <8 x i64> @llvm.x86.avx512.mask.pxor.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
+
+define <8 x i64> @test_or_epi64(<8 x i64> %a, <8 x i64> %b) {
+  ;CHECK-LABEL: test_or_epi64
+  ;CHECK: vporq {{.*}}encoding: [0x62,0xf1,0xfd,0x48,0xeb,0xc1]
+  %res = call <8 x i64> @llvm.x86.avx512.mask.por.q.512(<8 x i64> %a,<8 x i64> %b, <8 x i64>zeroinitializer, i8 -1)
+  ret < 8 x i64> %res
+}
+
+define <8 x i64> @test_mask_or_epi64(<8 x i64> %a,<8 x i64> %b, <8 x i64> %passThru, i8 %mask) {
+  ;CHECK-LABEL: test_mask_or_epi64
+  ;CHECK: vporq %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0xeb,0xd1]
+  %res = call <8 x i64> @llvm.x86.avx512.mask.por.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask)
+  ret < 8 x i64> %res
+}
+
+declare <8 x i64> @llvm.x86.avx512.mask.por.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
+
+define <8 x i64> @test_and_epi64(<8 x i64> %a, <8 x i64> %b) {
+  ;CHECK-LABEL: test_and_epi64
+  ;CHECK: vpandq {{.*}}encoding: [0x62,0xf1,0xfd,0x48,0xdb,0xc1]
+  %res = call <8 x i64> @llvm.x86.avx512.mask.pand.q.512(<8 x i64> %a,<8 x i64> %b, <8 x i64>zeroinitializer, i8 -1)
+  ret < 8 x i64> %res
+}
+
+define <8 x i64> @test_mask_and_epi64(<8 x i64> %a,<8 x i64> %b, <8 x i64> %passThru, i8 %mask) {
+  ;CHECK-LABEL: test_mask_and_epi64
+  ;CHECK: vpandq %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0xdb,0xd1]
+  %res = call <8 x i64> @llvm.x86.avx512.mask.pand.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask)
+  ret < 8 x i64> %res
+}
+
+declare <8 x i64> @llvm.x86.avx512.mask.pand.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
+
+
+define <16 x i32> @test_mask_add_epi32_rr(<16 x i32> %a, <16 x i32> %b) {
+  ;CHECK-LABEL: test_mask_add_epi32_rr
+  ;CHECK: vpaddd %zmm1, %zmm0, %zmm0     ## encoding: [0x62,0xf1,0x7d,0x48,0xfe,0xc1]
+  %res = call <16 x i32> @llvm.x86.avx512.mask.padd.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 -1)
+  ret < 16 x i32> %res
+}
+
+define <16 x i32> @test_mask_add_epi32_rrk(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask) {
+  ;CHECK-LABEL: test_mask_add_epi32_rrk
+  ;CHECK: vpaddd %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0xfe,0xd1]
+  %res = call <16 x i32> @llvm.x86.avx512.mask.padd.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask)
+  ret < 16 x i32> %res
+}
+
+define <16 x i32> @test_mask_add_epi32_rrkz(<16 x i32> %a, <16 x i32> %b, i16 %mask) {
+  ;CHECK-LABEL: test_mask_add_epi32_rrkz
+  ;CHECK: vpaddd %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xc9,0xfe,0xc1]
+  %res = call <16 x i32> @llvm.x86.avx512.mask.padd.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 %mask)
+  ret < 16 x i32> %res
+}
+
+define <16 x i32> @test_mask_add_epi32_rm(<16 x i32> %a, <16 x i32>* %ptr_b) {
+  ;CHECK-LABEL: test_mask_add_epi32_rm
+  ;CHECK: vpaddd (%rdi), %zmm0, %zmm0    ## encoding: [0x62,0xf1,0x7d,0x48,0xfe,0x07]
+  %b = load <16 x i32>, <16 x i32>* %ptr_b
+  %res = call <16 x i32> @llvm.x86.avx512.mask.padd.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 -1)
+  ret < 16 x i32> %res
+}
+
+define <16 x i32> @test_mask_add_epi32_rmk(<16 x i32> %a, <16 x i32>* %ptr_b, <16 x i32> %passThru, i16 %mask) {
+  ;CHECK-LABEL: test_mask_add_epi32_rmk
+  ;CHECK: vpaddd (%rdi), %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0xfe,0x0f]
+  %b = load <16 x i32>, <16 x i32>* %ptr_b
+  %res = call <16 x i32> @llvm.x86.avx512.mask.padd.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask)
+  ret < 16 x i32> %res
+}
+
+define <16 x i32> @test_mask_add_epi32_rmkz(<16 x i32> %a, <16 x i32>* %ptr_b, i16 %mask) {
+  ;CHECK-LABEL: test_mask_add_epi32_rmkz
+  ;CHECK: vpaddd (%rdi), %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xc9,0xfe,0x07]
+  %b = load <16 x i32>, <16 x i32>* %ptr_b
+  %res = call <16 x i32> @llvm.x86.avx512.mask.padd.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 %mask)
+  ret < 16 x i32> %res
+}
+
+define <16 x i32> @test_mask_add_epi32_rmb(<16 x i32> %a, i32* %ptr_b) {
+  ;CHECK-LABEL: test_mask_add_epi32_rmb
+  ;CHECK: vpaddd (%rdi){1to16}, %zmm0, %zmm0  ## encoding: [0x62,0xf1,0x7d,0x58,0xfe,0x07]
+  %q = load i32, i32* %ptr_b
+  %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0
+  %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
+  %res = call <16 x i32> @llvm.x86.avx512.mask.padd.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 -1)
+  ret < 16 x i32> %res
+}
+
+define <16 x i32> @test_mask_add_epi32_rmbk(<16 x i32> %a, i32* %ptr_b, <16 x i32> %passThru, i16 %mask) {
+  ;CHECK-LABEL: test_mask_add_epi32_rmbk
+  ;CHECK: vpaddd (%rdi){1to16}, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x59,0xfe,0x0f]
+  %q = load i32, i32* %ptr_b
+  %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0
+  %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
+  %res = call <16 x i32> @llvm.x86.avx512.mask.padd.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask)
+  ret < 16 x i32> %res
+}
+
+define <16 x i32> @test_mask_add_epi32_rmbkz(<16 x i32> %a, i32* %ptr_b, i16 %mask) {
+  ;CHECK-LABEL: test_mask_add_epi32_rmbkz
+  ;CHECK: vpaddd (%rdi){1to16}, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xd9,0xfe,0x07]
+  %q = load i32, i32* %ptr_b
+  %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0
+  %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
+  %res = call <16 x i32> @llvm.x86.avx512.mask.padd.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 %mask)
+  ret < 16 x i32> %res
+}
+
+declare <16 x i32> @llvm.x86.avx512.mask.padd.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
+
+define <16 x i32> @test_mask_sub_epi32_rr(<16 x i32> %a, <16 x i32> %b) {
+  ;CHECK-LABEL: test_mask_sub_epi32_rr
+  ;CHECK: vpsubd %zmm1, %zmm0, %zmm0     ## encoding: [0x62,0xf1,0x7d,0x48,0xfa,0xc1]
+  %res = call <16 x i32> @llvm.x86.avx512.mask.psub.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 -1)
+  ret < 16 x i32> %res
+}
+
+define <16 x i32> @test_mask_sub_epi32_rrk(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask) {
+  ;CHECK-LABEL: test_mask_sub_epi32_rrk
+  ;CHECK: vpsubd %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0xfa,0xd1]
+  %res = call <16 x i32> @llvm.x86.avx512.mask.psub.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask)
+  ret < 16 x i32> %res
+}
+
+define <16 x i32> @test_mask_sub_epi32_rrkz(<16 x i32> %a, <16 x i32> %b, i16 %mask) {
+  ;CHECK-LABEL: test_mask_sub_epi32_rrkz
+  ;CHECK: vpsubd %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xc9,0xfa,0xc1]
+  %res = call <16 x i32> @llvm.x86.avx512.mask.psub.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 %mask)
+  ret < 16 x i32> %res
+}
+
+define <16 x i32> @test_mask_sub_epi32_rm(<16 x i32> %a, <16 x i32>* %ptr_b) {
+  ;CHECK-LABEL: test_mask_sub_epi32_rm
+  ;CHECK: vpsubd (%rdi), %zmm0, %zmm0    ## encoding: [0x62,0xf1,0x7d,0x48,0xfa,0x07]
+  %b = load <16 x i32>, <16 x i32>* %ptr_b
+  %res = call <16 x i32> @llvm.x86.avx512.mask.psub.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 -1)
+  ret < 16 x i32> %res
+}
+
+define <16 x i32> @test_mask_sub_epi32_rmk(<16 x i32> %a, <16 x i32>* %ptr_b, <16 x i32> %passThru, i16 %mask) {
+  ;CHECK-LABEL: test_mask_sub_epi32_rmk
+  ;CHECK: vpsubd (%rdi), %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0xfa,0x0f]
+  %b = load <16 x i32>, <16 x i32>* %ptr_b
+  %res = call <16 x i32> @llvm.x86.avx512.mask.psub.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask)
+  ret < 16 x i32> %res
+}
+
+define <16 x i32> @test_mask_sub_epi32_rmkz(<16 x i32> %a, <16 x i32>* %ptr_b, i16 %mask) {
+  ;CHECK-LABEL: test_mask_sub_epi32_rmkz
+  ;CHECK: vpsubd (%rdi), %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xc9,0xfa,0x07]
+  %b = load <16 x i32>, <16 x i32>* %ptr_b
+  %res = call <16 x i32> @llvm.x86.avx512.mask.psub.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 %mask)
+  ret < 16 x i32> %res
+}
+
+define <16 x i32> @test_mask_sub_epi32_rmb(<16 x i32> %a, i32* %ptr_b) {
+  ;CHECK-LABEL: test_mask_sub_epi32_rmb
+  ;CHECK: vpsubd (%rdi){1to16}, %zmm0, %zmm0  ## encoding: [0x62,0xf1,0x7d,0x58,0xfa,0x07]
+  %q = load i32, i32* %ptr_b
+  %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0
+  %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
+  %res = call <16 x i32> @llvm.x86.avx512.mask.psub.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 -1)
+  ret < 16 x i32> %res
+}
+
+define <16 x i32> @test_mask_sub_epi32_rmbk(<16 x i32> %a, i32* %ptr_b, <16 x i32> %passThru, i16 %mask) {
+  ;CHECK-LABEL: test_mask_sub_epi32_rmbk
+  ;CHECK: vpsubd (%rdi){1to16}, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x59,0xfa,0x0f]
+  %q = load i32, i32* %ptr_b
+  %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0
+  %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
+  %res = call <16 x i32> @llvm.x86.avx512.mask.psub.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask)
+  ret < 16 x i32> %res
+}
+
+define <16 x i32> @test_mask_sub_epi32_rmbkz(<16 x i32> %a, i32* %ptr_b, i16 %mask) {
+  ;CHECK-LABEL: test_mask_sub_epi32_rmbkz
+  ;CHECK: vpsubd (%rdi){1to16}, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xd9,0xfa,0x07]
+  %q = load i32, i32* %ptr_b
+  %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0
+  %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
+  %res = call <16 x i32> @llvm.x86.avx512.mask.psub.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 %mask)
+  ret < 16 x i32> %res
+}
+
+declare <16 x i32> @llvm.x86.avx512.mask.psub.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
+
+define <8 x i64> @test_mask_add_epi64_rr(<8 x i64> %a, <8 x i64> %b) {
+  ;CHECK-LABEL: test_mask_add_epi64_rr
+  ;CHECK: vpaddq %zmm1, %zmm0, %zmm0     ## encoding: [0x62,0xf1,0xfd,0x48,0xd4,0xc1]
+  %res = call <8 x i64> @llvm.x86.avx512.mask.padd.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 -1)
+  ret < 8 x i64> %res
+}
+
+define <8 x i64> @test_mask_add_epi64_rrk(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask) {
+  ;CHECK-LABEL: test_mask_add_epi64_rrk
+  ;CHECK: vpaddq %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0xd4,0xd1]
+  %res = call <8 x i64> @llvm.x86.avx512.mask.padd.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask)
+  ret < 8 x i64> %res
+}
+
+define <8 x i64> @test_mask_add_epi64_rrkz(<8 x i64> %a, <8 x i64> %b, i8 %mask) {
+  ;CHECK-LABEL: test_mask_add_epi64_rrkz
+  ;CHECK: vpaddq %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xc9,0xd4,0xc1]
+  %res = call <8 x i64> @llvm.x86.avx512.mask.padd.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 %mask)
+  ret < 8 x i64> %res
+}
+
+define <8 x i64> @test_mask_add_epi64_rm(<8 x i64> %a, <8 x i64>* %ptr_b) {
+  ;CHECK-LABEL: test_mask_add_epi64_rm
+  ;CHECK: vpaddq (%rdi), %zmm0, %zmm0    ## encoding: [0x62,0xf1,0xfd,0x48,0xd4,0x07]
+  %b = load <8 x i64>, <8 x i64>* %ptr_b
+  %res = call <8 x i64> @llvm.x86.avx512.mask.padd.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 -1)
+  ret < 8 x i64> %res
+}
+
+define <8 x i64> @test_mask_add_epi64_rmk(<8 x i64> %a, <8 x i64>* %ptr_b, <8 x i64> %passThru, i8 %mask) {
+  ;CHECK-LABEL: test_mask_add_epi64_rmk
+  ;CHECK: vpaddq (%rdi), %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0xd4,0x0f]
+  %b = load <8 x i64>, <8 x i64>* %ptr_b
+  %res = call <8 x i64> @llvm.x86.avx512.mask.padd.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask)
+  ret < 8 x i64> %res
+}
+
+define <8 x i64> @test_mask_add_epi64_rmkz(<8 x i64> %a, <8 x i64>* %ptr_b, i8 %mask) {
+  ;CHECK-LABEL: test_mask_add_epi64_rmkz
+  ;CHECK: vpaddq (%rdi), %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xc9,0xd4,0x07]
+  %b = load <8 x i64>, <8 x i64>* %ptr_b
+  %res = call <8 x i64> @llvm.x86.avx512.mask.padd.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 %mask)
+  ret < 8 x i64> %res
+}
+
+define <8 x i64> @test_mask_add_epi64_rmb(<8 x i64> %a, i64* %ptr_b) {
+  ;CHECK-LABEL: test_mask_add_epi64_rmb
+  ;CHECK: vpaddq (%rdi){1to8}, %zmm0, %zmm0  ## encoding: [0x62,0xf1,0xfd,0x58,0xd4,0x07]
+  %q = load i64, i64* %ptr_b
+  %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0
+  %b = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer
+  %res = call <8 x i64> @llvm.x86.avx512.mask.padd.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 -1)
+  ret < 8 x i64> %res
+}
+
+define <8 x i64> @test_mask_add_epi64_rmbk(<8 x i64> %a, i64* %ptr_b, <8 x i64> %passThru, i8 %mask) {
+  ;CHECK-LABEL: test_mask_add_epi64_rmbk
+  ;CHECK: vpaddq (%rdi){1to8}, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x59,0xd4,0x0f]
+  %q = load i64, i64* %ptr_b
+  %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0
+  %b = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer
+  %res = call <8 x i64> @llvm.x86.avx512.mask.padd.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask)
+  ret < 8 x i64> %res
+}
+
+define <8 x i64> @test_mask_add_epi64_rmbkz(<8 x i64> %a, i64* %ptr_b, i8 %mask) {
+  ;CHECK-LABEL: test_mask_add_epi64_rmbkz
+  ;CHECK: vpaddq (%rdi){1to8}, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xd9,0xd4,0x07]
+  %q = load i64, i64* %ptr_b
+  %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0
+  %b = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer
+  %res = call <8 x i64> @llvm.x86.avx512.mask.padd.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 %mask)
+  ret < 8 x i64> %res
+}
+
+declare <8 x i64> @llvm.x86.avx512.mask.padd.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
+
+define <8 x i64> @test_mask_sub_epi64_rr(<8 x i64> %a, <8 x i64> %b) {
+  ;CHECK-LABEL: test_mask_sub_epi64_rr
+  ;CHECK: vpsubq %zmm1, %zmm0, %zmm0     ## encoding: [0x62,0xf1,0xfd,0x48,0xfb,0xc1]
+  %res = call <8 x i64> @llvm.x86.avx512.mask.psub.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 -1)
+  ret < 8 x i64> %res
+}
+
+define <8 x i64> @test_mask_sub_epi64_rrk(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask) {
+  ;CHECK-LABEL: test_mask_sub_epi64_rrk
+  ;CHECK: vpsubq %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0xfb,0xd1]
+  %res = call <8 x i64> @llvm.x86.avx512.mask.psub.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask)
+  ret < 8 x i64> %res
+}
+
+define <8 x i64> @test_mask_sub_epi64_rrkz(<8 x i64> %a, <8 x i64> %b, i8 %mask) {
+  ;CHECK-LABEL: test_mask_sub_epi64_rrkz
+  ;CHECK: vpsubq %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xc9,0xfb,0xc1]
+  %res = call <8 x i64> @llvm.x86.avx512.mask.psub.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 %mask)
+  ret < 8 x i64> %res
+}
+
+define <8 x i64> @test_mask_sub_epi64_rm(<8 x i64> %a, <8 x i64>* %ptr_b) {
+  ;CHECK-LABEL: test_mask_sub_epi64_rm
+  ;CHECK: vpsubq (%rdi), %zmm0, %zmm0    ## encoding: [0x62,0xf1,0xfd,0x48,0xfb,0x07]
+  %b = load <8 x i64>, <8 x i64>* %ptr_b
+  %res = call <8 x i64> @llvm.x86.avx512.mask.psub.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 -1)
+  ret < 8 x i64> %res
+}
+
+define <8 x i64> @test_mask_sub_epi64_rmk(<8 x i64> %a, <8 x i64>* %ptr_b, <8 x i64> %passThru, i8 %mask) {
+  ;CHECK-LABEL: test_mask_sub_epi64_rmk
+  ;CHECK: vpsubq (%rdi), %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0xfb,0x0f]
+  %b = load <8 x i64>, <8 x i64>* %ptr_b
+  %res = call <8 x i64> @llvm.x86.avx512.mask.psub.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask)
+  ret < 8 x i64> %res
+}
+
+define <8 x i64> @test_mask_sub_epi64_rmkz(<8 x i64> %a, <8 x i64>* %ptr_b, i8 %mask) {
+  ;CHECK-LABEL: test_mask_sub_epi64_rmkz
+  ;CHECK: vpsubq (%rdi), %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xc9,0xfb,0x07]
+  %b = load <8 x i64>, <8 x i64>* %ptr_b
+  %res = call <8 x i64> @llvm.x86.avx512.mask.psub.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 %mask)
+  ret < 8 x i64> %res
+}
+
+define <8 x i64> @test_mask_sub_epi64_rmb(<8 x i64> %a, i64* %ptr_b) {
+  ;CHECK-LABEL: test_mask_sub_epi64_rmb
+  ;CHECK: vpsubq (%rdi){1to8}, %zmm0, %zmm0  ## encoding: [0x62,0xf1,0xfd,0x58,0xfb,0x07]
+  %q = load i64, i64* %ptr_b
+  %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0
+  %b = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer
+  %res = call <8 x i64> @llvm.x86.avx512.mask.psub.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 -1)
+  ret < 8 x i64> %res
+}
+
+define <8 x i64> @test_mask_sub_epi64_rmbk(<8 x i64> %a, i64* %ptr_b, <8 x i64> %passThru, i8 %mask) {
+  ;CHECK-LABEL: test_mask_sub_epi64_rmbk
+  ;CHECK: vpsubq (%rdi){1to8}, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x59,0xfb,0x0f]
+  %q = load i64, i64* %ptr_b
+  %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0
+  %b = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer
+  %res = call <8 x i64> @llvm.x86.avx512.mask.psub.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask)
+  ret < 8 x i64> %res
+}
+
+define <8 x i64> @test_mask_sub_epi64_rmbkz(<8 x i64> %a, i64* %ptr_b, i8 %mask) {
+  ;CHECK-LABEL: test_mask_sub_epi64_rmbkz
+  ;CHECK: vpsubq (%rdi){1to8}, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xd9,0xfb,0x07]
+  %q = load i64, i64* %ptr_b
+  %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0
+  %b = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer
+  %res = call <8 x i64> @llvm.x86.avx512.mask.psub.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 %mask)
+  ret < 8 x i64> %res
+}
+
+declare <8 x i64> @llvm.x86.avx512.mask.psub.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
+
+define <8 x i64> @test_mask_mul_epi32_rr(<16 x i32> %a, <16 x i32> %b) {
+  ;CHECK-LABEL: test_mask_mul_epi32_rr
+  ;CHECK: vpmuldq %zmm1, %zmm0, %zmm0     ## encoding: [0x62,0xf2,0xfd,0x48,0x28,0xc1]
+  %res = call <8 x i64> @llvm.x86.avx512.mask.pmul.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> zeroinitializer, i8 -1)
+  ret < 8 x i64> %res
+}
+
+define <8 x i64> @test_mask_mul_epi32_rrk(<16 x i32> %a, <16 x i32> %b, <8 x i64> %passThru, i8 %mask) {
+  ;CHECK-LABEL: test_mask_mul_epi32_rrk
+  ;CHECK: vpmuldq %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x28,0xd1]
+  %res = call <8 x i64> @llvm.x86.avx512.mask.pmul.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> %passThru, i8 %mask)
+  ret < 8 x i64> %res
+}
+
+define <8 x i64> @test_mask_mul_epi32_rrkz(<16 x i32> %a, <16 x i32> %b, i8 %mask) {
+  ;CHECK-LABEL: test_mask_mul_epi32_rrkz
+  ;CHECK: vpmuldq %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xc9,0x28,0xc1]
+  %res = call <8 x i64> @llvm.x86.avx512.mask.pmul.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> zeroinitializer, i8 %mask)
+  ret < 8 x i64> %res
+}
+
+define <8 x i64> @test_mask_mul_epi32_rm(<16 x i32> %a, <16 x i32>* %ptr_b) {
+  ;CHECK-LABEL: test_mask_mul_epi32_rm
+  ;CHECK: vpmuldq (%rdi), %zmm0, %zmm0    ## encoding: [0x62,0xf2,0xfd,0x48,0x28,0x07]
+  %b = load <16 x i32>, <16 x i32>* %ptr_b
+  %res = call <8 x i64> @llvm.x86.avx512.mask.pmul.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> zeroinitializer, i8 -1)
+  ret < 8 x i64> %res
+}
+
+define <8 x i64> @test_mask_mul_epi32_rmk(<16 x i32> %a, <16 x i32>* %ptr_b, <8 x i64> %passThru, i8 %mask) {
+  ;CHECK-LABEL: test_mask_mul_epi32_rmk
+  ;CHECK: vpmuldq (%rdi), %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x28,0x0f]
+  %b = load <16 x i32>, <16 x i32>* %ptr_b
+  %res = call <8 x i64> @llvm.x86.avx512.mask.pmul.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> %passThru, i8 %mask)
+  ret < 8 x i64> %res
+}
+
+define <8 x i64> @test_mask_mul_epi32_rmkz(<16 x i32> %a, <16 x i32>* %ptr_b, i8 %mask) {
+  ;CHECK-LABEL: test_mask_mul_epi32_rmkz
+  ;CHECK: vpmuldq (%rdi), %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xc9,0x28,0x07]
+  %b = load <16 x i32>, <16 x i32>* %ptr_b
+  %res = call <8 x i64> @llvm.x86.avx512.mask.pmul.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> zeroinitializer, i8 %mask)
+  ret < 8 x i64> %res
+}
+
+define <8 x i64> @test_mask_mul_epi32_rmb(<16 x i32> %a, i64* %ptr_b) {
+  ;CHECK-LABEL: test_mask_mul_epi32_rmb
+  ;CHECK: vpmuldq (%rdi){1to8}, %zmm0, %zmm0  ## encoding: [0x62,0xf2,0xfd,0x58,0x28,0x07]
+  %q = load i64, i64* %ptr_b
+  %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0
+  %b64 = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer
+  %b = bitcast <8 x i64> %b64 to <16 x i32>
+  %res = call <8 x i64> @llvm.x86.avx512.mask.pmul.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> zeroinitializer, i8 -1)
+  ret < 8 x i64> %res
+}
+
+define <8 x i64> @test_mask_mul_epi32_rmbk(<16 x i32> %a, i64* %ptr_b, <8 x i64> %passThru, i8 %mask) {
+  ;CHECK-LABEL: test_mask_mul_epi32_rmbk
+  ;CHECK: vpmuldq (%rdi){1to8}, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x59,0x28,0x0f]
+  %q = load i64, i64* %ptr_b
+  %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0
+  %b64 = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer
+  %b = bitcast <8 x i64> %b64 to <16 x i32>
+  %res = call <8 x i64> @llvm.x86.avx512.mask.pmul.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> %passThru, i8 %mask)
+  ret < 8 x i64> %res
+}
+
+define <8 x i64> @test_mask_mul_epi32_rmbkz(<16 x i32> %a, i64* %ptr_b, i8 %mask) {
+  ;CHECK-LABEL: test_mask_mul_epi32_rmbkz
+  ;CHECK: vpmuldq (%rdi){1to8}, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xd9,0x28,0x07]
+  %q = load i64, i64* %ptr_b
+  %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0
+  %b64 = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer
+  %b = bitcast <8 x i64> %b64 to <16 x i32>
+  %res = call <8 x i64> @llvm.x86.avx512.mask.pmul.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> zeroinitializer, i8 %mask)
+  ret < 8 x i64> %res
+}
+
+declare <8 x i64> @llvm.x86.avx512.mask.pmul.dq.512(<16 x i32>, <16 x i32>, <8 x i64>, i8)
+
+define <8 x i64> @test_mask_mul_epu32_rr(<16 x i32> %a, <16 x i32> %b) {
+  ;CHECK-LABEL: test_mask_mul_epu32_rr
+  ;CHECK: vpmuludq %zmm1, %zmm0, %zmm0  ## encoding: [0x62,0xf1,0xfd,0x48,0xf4,0xc1]
+  %res = call <8 x i64> @llvm.x86.avx512.mask.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> zeroinitializer, i8 -1)
+  ret < 8 x i64> %res
+}
+
+define <8 x i64> @test_mask_mul_epu32_rrk(<16 x i32> %a, <16 x i32> %b, <8 x i64> %passThru, i8 %mask) {
+  ;CHECK-LABEL: test_mask_mul_epu32_rrk
+  ;CHECK: vpmuludq %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0xf4,0xd1]
+  %res = call <8 x i64> @llvm.x86.avx512.mask.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> %passThru, i8 %mask)
+  ret < 8 x i64> %res
+}
+
+define <8 x i64> @test_mask_mul_epu32_rrkz(<16 x i32> %a, <16 x i32> %b, i8 %mask) {
+  ;CHECK-LABEL: test_mask_mul_epu32_rrkz
+  ;CHECK: vpmuludq %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xc9,0xf4,0xc1]
+  %res = call <8 x i64> @llvm.x86.avx512.mask.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> zeroinitializer, i8 %mask)
+  ret < 8 x i64> %res
+}
+
+define <8 x i64> @test_mask_mul_epu32_rm(<16 x i32> %a, <16 x i32>* %ptr_b) {
+  ;CHECK-LABEL: test_mask_mul_epu32_rm
+  ;CHECK: vpmuludq (%rdi), %zmm0, %zmm0  ## encoding: [0x62,0xf1,0xfd,0x48,0xf4,0x07]
+  %b = load <16 x i32>, <16 x i32>* %ptr_b
+  %res = call <8 x i64> @llvm.x86.avx512.mask.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> zeroinitializer, i8 -1)
+  ret < 8 x i64> %res
+}
+
+define <8 x i64> @test_mask_mul_epu32_rmk(<16 x i32> %a, <16 x i32>* %ptr_b, <8 x i64> %passThru, i8 %mask) {
+  ;CHECK-LABEL: test_mask_mul_epu32_rmk
+  ;CHECK: vpmuludq (%rdi), %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0xf4,0x0f]
+  %b = load <16 x i32>, <16 x i32>* %ptr_b
+  %res = call <8 x i64> @llvm.x86.avx512.mask.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> %passThru, i8 %mask)
+  ret < 8 x i64> %res
+}
+
+define <8 x i64> @test_mask_mul_epu32_rmkz(<16 x i32> %a, <16 x i32>* %ptr_b, i8 %mask) {
+  ;CHECK-LABEL: test_mask_mul_epu32_rmkz
+  ;CHECK: vpmuludq (%rdi), %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xc9,0xf4,0x07]
+  %b = load <16 x i32>, <16 x i32>* %ptr_b
+  %res = call <8 x i64> @llvm.x86.avx512.mask.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> zeroinitializer, i8 %mask)
+  ret < 8 x i64> %res
+}
+
+define <8 x i64> @test_mask_mul_epu32_rmb(<16 x i32> %a, i64* %ptr_b) {
+  ;CHECK-LABEL: test_mask_mul_epu32_rmb
+  ;CHECK: vpmuludq (%rdi){1to8}, %zmm0, %zmm0  ## encoding: [0x62,0xf1,0xfd,0x58,0xf4,0x07]
+  %q = load i64, i64* %ptr_b
+  %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0
+  %b64 = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer
+  %b = bitcast <8 x i64> %b64 to <16 x i32>
+  %res = call <8 x i64> @llvm.x86.avx512.mask.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> zeroinitializer, i8 -1)
+  ret < 8 x i64> %res
+}
+
+define <8 x i64> @test_mask_mul_epu32_rmbk(<16 x i32> %a, i64* %ptr_b, <8 x i64> %passThru, i8 %mask) {
+  ;CHECK-LABEL: test_mask_mul_epu32_rmbk
+  ;CHECK: vpmuludq (%rdi){1to8}, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x59,0xf4,0x0f]
+  %q = load i64, i64* %ptr_b
+  %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0
+  %b64 = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer
+  %b = bitcast <8 x i64> %b64 to <16 x i32>
+  %res = call <8 x i64> @llvm.x86.avx512.mask.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> %passThru, i8 %mask)
+  ret < 8 x i64> %res
+}
+
+define <8 x i64> @test_mask_mul_epu32_rmbkz(<16 x i32> %a, i64* %ptr_b, i8 %mask) {
+  ;CHECK-LABEL: test_mask_mul_epu32_rmbkz
+  ;CHECK: vpmuludq (%rdi){1to8}, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xd9,0xf4,0x07]
+  %q = load i64, i64* %ptr_b
+  %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0
+  %b64 = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer
+  %b = bitcast <8 x i64> %b64 to <16 x i32>
+  %res = call <8 x i64> @llvm.x86.avx512.mask.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> zeroinitializer, i8 %mask)
+  ret < 8 x i64> %res
+}
+
+declare <8 x i64> @llvm.x86.avx512.mask.pmulu.dq.512(<16 x i32>, <16 x i32>, <8 x i64>, i8)
diff --git a/test/CodeGen/X86/bmi.ll b/test/CodeGen/X86/bmi.ll
index f1ef9ef..8b13e96 100644
--- a/test/CodeGen/X86/bmi.ll
+++ b/test/CodeGen/X86/bmi.ll
@@ -260,7 +260,7 @@ entry:
   %and = and i64 %x, 2147483647
   ret i64 %and
 ; CHECK-LABEL: bzhi64_small_constant_mask:
-; CHECK: andq  $2147483647, %r[[ARG1]]
+; CHECK: andl  $2147483647, %e[[ARG1]]
 }
 
 define i32 @blsi32(i32 %x) nounwind readnone {
diff --git a/test/CodeGen/X86/bool-zext.ll b/test/CodeGen/X86/bool-zext.ll
index 3558376..c98ad9e 100644
--- a/test/CodeGen/X86/bool-zext.ll
+++ b/test/CodeGen/X86/bool-zext.ll
@@ -10,7 +10,7 @@
 define void @bar1(i1 zeroext %v1) nounwind ssp {
 entry:
   %conv = zext i1 %v1 to i32
-  %call = tail call i32 (...)* @foo1(i32 %conv) nounwind
+  %call = tail call i32 (...) @foo1(i32 %conv) nounwind
   ret void
 }
 
@@ -23,7 +23,7 @@ entry:
 define void @bar2(i8 zeroext %v1) nounwind ssp {
 entry:
   %conv = zext i8 %v1 to i32
-  %call = tail call i32 (...)* @foo1(i32 %conv) nounwind
+  %call = tail call i32 (...) @foo1(i32 %conv) nounwind
   ret void
 }
 
diff --git a/test/CodeGen/X86/brcond.ll b/test/CodeGen/X86/brcond.ll
index 3ebe1a1..f4db3ba 100644
--- a/test/CodeGen/X86/brcond.ll
+++ b/test/CodeGen/X86/brcond.ll
@@ -17,11 +17,11 @@ entry:
   br i1 %4, label %bb1, label %bb
 
 bb:                                               ; preds = %entry
-  %5 = tail call i32 (...)* @foo() nounwind       ; <i32> [#uses=1]
+  %5 = tail call i32 (...) @foo() nounwind       ; <i32> [#uses=1]
   ret i32 %5
 
 bb1:                                              ; preds = %entry
-  %6 = tail call i32 (...)* @bar() nounwind       ; <i32> [#uses=1]
+  %6 = tail call i32 (...) @bar() nounwind       ; <i32> [#uses=1]
   ret i32 %6
 }
 
diff --git a/test/CodeGen/X86/byval-align.ll b/test/CodeGen/X86/byval-align.ll
index ac0ab75..8366ae3 100644
--- a/test/CodeGen/X86/byval-align.ll
+++ b/test/CodeGen/X86/byval-align.ll
@@ -18,7 +18,7 @@ entry:
   %1 = ptrtoint i8* %0 to i64                     ; <i64> [#uses=1]
   store i64 %1, i64* %p, align 8
   %2 = load i8*, i8** %ptr, align 8                    ; <i8*> [#uses=1]
-  %3 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([10 x i8], [10 x i8]* @.str, i64 0, i64 0), i8* %2) nounwind ; <i32> [#uses=0]
+  %3 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([10 x i8], [10 x i8]* @.str, i64 0, i64 0), i8* %2) nounwind ; <i32> [#uses=0]
   %4 = load i64, i64* %p, align 8                      ; <i64> [#uses=1]
   %5 = and i64 %4, 140737488355264                ; <i64> [#uses=1]
   %6 = load i64, i64* %p, align 8                      ; <i64> [#uses=1]
diff --git a/test/CodeGen/X86/byval6.ll b/test/CodeGen/X86/byval6.ll
index 2d39901..c3e7b7e 100644
--- a/test/CodeGen/X86/byval6.ll
+++ b/test/CodeGen/X86/byval6.ll
@@ -6,8 +6,8 @@
 
 define i32 @main() nounwind  {
 entry:
-	tail call void (i32, ...)* @bar( i32 3, %struct.W* byval  @.cpx ) nounwind 
-	tail call void (i32, ...)* @baz( i32 3, %struct.W* byval  @B ) nounwind 
+	tail call void (i32, ...) @bar( i32 3, %struct.W* byval  @.cpx ) nounwind 
+	tail call void (i32, ...) @baz( i32 3, %struct.W* byval  @B ) nounwind 
 	ret i32 undef
 }
 
diff --git a/test/CodeGen/X86/cache-intrinsic.ll b/test/CodeGen/X86/cache-intrinsic.ll
index c023047..0b9d77a 100644
--- a/test/CodeGen/X86/cache-intrinsic.ll
+++ b/test/CodeGen/X86/cache-intrinsic.ll
@@ -10,10 +10,10 @@ define i32 @main() {
 entry:
   %retval = alloca i32, align 4
   store i32 0, i32* %retval
-  %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i8* getelementptr inbounds ([32 x i8], [32 x i8]* @buffer, i32 0, i32 0))
+  %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i8* getelementptr inbounds ([32 x i8], [32 x i8]* @buffer, i32 0, i32 0))
   %call1 = call i8* @strcpy(i8* getelementptr inbounds ([32 x i8], [32 x i8]* @buffer, i32 0, i32 0), i8* getelementptr inbounds ([25 x i8], [25 x i8]* @.str1, i32 0, i32 0)) #3
   call void @llvm.clear_cache(i8* getelementptr inbounds ([32 x i8], [32 x i8]* @buffer, i32 0, i32 0), i8* getelementptr inbounds (i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @buffer, i32 0, i32 0), i32 32)) #3
-  %call3 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i8* getelementptr inbounds ([32 x i8], [32 x i8]* @buffer, i32 0, i32 0))
+  %call3 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i8* getelementptr inbounds ([32 x i8], [32 x i8]* @buffer, i32 0, i32 0))
   ret i32 0
 }
 
diff --git a/test/CodeGen/X86/cmov.ll b/test/CodeGen/X86/cmov.ll
index a885183..f2f36b1 100644
--- a/test/CodeGen/X86/cmov.ll
+++ b/test/CodeGen/X86/cmov.ll
@@ -108,7 +108,7 @@ func_1.exit:                                      ; preds = %bb.i.i, %func_4.exi
   %g_96.tmp.0.i = phi i8 [ %g_96.promoted.i, %bb.i.i ], [ %.mux.i, %func_4.exit.i ] ; <i8> [#uses=2]
   store i8 %g_96.tmp.0.i, i8* @g_96
   %6 = zext i8 %g_96.tmp.0.i to i32               ; <i32> [#uses=1]
-  %7 = tail call i32 (i8*, ...)* @printf(i8* noalias getelementptr ([15 x i8], [15 x i8]* @_2E_str, i64 0, i64 0), i32 %6) nounwind ; <i32> [#uses=0]
+  %7 = tail call i32 (i8*, ...) @printf(i8* noalias getelementptr ([15 x i8], [15 x i8]* @_2E_str, i64 0, i64 0), i32 %6) nounwind ; <i32> [#uses=0]
   ret i32 0
 }
 
diff --git a/test/CodeGen/X86/cmp.ll b/test/CodeGen/X86/cmp.ll
index 818138a..584179a 100644
--- a/test/CodeGen/X86/cmp.ll
+++ b/test/CodeGen/X86/cmp.ll
@@ -75,7 +75,7 @@ define i32 @test5(double %A) nounwind  {
  br i1 %bothcond, label %bb8, label %bb12
 
  bb8:; preds = %entry
- %tmp9 = tail call i32 (...)* @foo( ) nounwind ; <i32> [#uses=1]
+ %tmp9 = tail call i32 (...) @foo( ) nounwind ; <i32> [#uses=1]
  ret i32 %tmp9
 
  bb12:; preds = %entry
diff --git a/test/CodeGen/X86/coalescer-remat.ll b/test/CodeGen/X86/coalescer-remat.ll
index 13fb46b..62e0562 100644
--- a/test/CodeGen/X86/coalescer-remat.ll
+++ b/test/CodeGen/X86/coalescer-remat.ll
@@ -7,7 +7,7 @@ define i32 @main() nounwind {
 entry:
   %t0 = cmpxchg i64* @val, i64 0, i64 1 monotonic monotonic
   %0 = extractvalue { i64, i1 } %t0, 0
-  %1 = tail call i32 (i8*, ...)* @printf(i8* getelementptr ([7 x i8], [7 x i8]* @"\01LC", i32 0, i64 0), i64 %0) nounwind
+  %1 = tail call i32 (i8*, ...) @printf(i8* getelementptr ([7 x i8], [7 x i8]* @"\01LC", i32 0, i64 0), i64 %0) nounwind
   ret i32 0
 }
 
diff --git a/test/CodeGen/X86/crash.ll b/test/CodeGen/X86/crash.ll
index 5fe5dc5..a95b84d 100644
--- a/test/CodeGen/X86/crash.ll
+++ b/test/CodeGen/X86/crash.ll
@@ -439,7 +439,7 @@ entry:
   %conv = uitofp i64 %sub to float
   %div = fmul float %conv, 5.000000e-01
   %conv2 = fpext float %div to double
-  tail call void (...)* @_Z6PrintFz(i8* getelementptr inbounds ({ [1 x i8], [63 x i8] }, { [1 x i8], [63 x i8] }* @.str, i64 0, i32 0, i64 0), double %conv2)
+  tail call void (...) @_Z6PrintFz(i8* getelementptr inbounds ({ [1 x i8], [63 x i8] }, { [1 x i8], [63 x i8] }* @.str, i64 0, i32 0, i64 0), double %conv2)
   ret void
 }
 declare void @_Z6PrintFz(...)
@@ -462,7 +462,7 @@ for.cond:                                         ; preds = %for.inc, %entry
   %cmp = icmp eq i32* undef, %3
   %conv2 = zext i1 %cmp to i32
   %and = and i32 %conv2, %0
-  tail call void (...)* @fn3(i32 %and) nounwind
+  tail call void (...) @fn3(i32 %and) nounwind
   %tobool = icmp eq i32 undef, 0
   br i1 %tobool, label %for.inc, label %if.then
 
diff --git a/test/CodeGen/X86/dag-optnone.ll b/test/CodeGen/X86/dag-optnone.ll
new file mode 100644
index 0000000..f7774e6
--- /dev/null
+++ b/test/CodeGen/X86/dag-optnone.ll
@@ -0,0 +1,73 @@
+; RUN: llc < %s -mtriple=x86_64-pc-win32 -O0 -mattr=+avx | FileCheck %s
+
+; Background:
+; If fast-isel bails out to normal selection, then the DAG combiner will run,
+; even at -O0. In principle this should not happen (those are optimizations,
+; and we said -O0) but as a practical matter there are some instruction
+; selection patterns that depend on the legalizations and transforms that the
+; DAG combiner does.
+;
+; The 'optnone' attribute implicitly sets -O0 and fast-isel for the function.
+; The DAG combiner was disabled for 'optnone' (but not -O0) by r221168, then
+; re-enabled in r233153 because of problems with instruction selection patterns
+; mentioned above. (Note: because 'optnone' is supposed to match -O0, r221168
+; really should have disabled the combiner for both.)
+;
+; If instruction selection eventually becomes smart enough to run without DAG
+; combiner, then the combiner can be turned off for -O0 (not just 'optnone')
+; and this test can go away. (To be replaced by a different test that verifies
+; the DAG combiner does *not* run at -O0 or for 'optnone' functions.)
+;
+; In the meantime, this test wants to make sure the combiner stays enabled for
+; 'optnone' functions, just as it is for -O0.
+
+
+; The test cases @foo[WithOptnone] prove that the same DAG combine happens
+; with -O0 and with 'optnone' set.  To prove this, we use a Windows triple to
+; cause fast-isel to bail out (because something about the calling convention
+; is not handled in fast-isel).  Then we have a repeated fadd that can be
+; combined into an fmul.  We show that this happens in both the non-optnone
+; function and the optnone function.
+
+define float @foo(float %x) #0 {
+entry:
+  %add = fadd fast float %x, %x
+  %add1 = fadd fast float %add, %x
+  ret float %add1
+}
+
+; CHECK-LABEL: @foo
+; CHECK-NOT:   add
+; CHECK:       mul
+; CHECK-NEXT:  ret
+
+define float @fooWithOptnone(float %x) #1 {
+entry:
+  %add = fadd fast float %x, %x
+  %add1 = fadd fast float %add, %x
+  ret float %add1
+}
+
+; CHECK-LABEL: @fooWithOptnone
+; CHECK-NOT:   add
+; CHECK:       mul
+; CHECK-NEXT:  ret
+
+
+; The test case @bar is derived from an instruction selection failure case
+; that was solved by r233153. It depends on -mattr=+avx.
+; Really all we're trying to prove is that it doesn't crash any more.
+
+@id84 = common global <16 x i32> zeroinitializer, align 64
+
+define void @bar() #1 {
+entry:
+  %id83 = alloca <16 x i8>, align 16
+  %0 = load <16 x i32>, <16 x i32>* @id84, align 64
+  %conv = trunc <16 x i32> %0 to <16 x i8>
+  store <16 x i8> %conv, <16 x i8>* %id83, align 16
+  ret void
+}
+
+attributes #0 = { "unsafe-fp-math"="true" }
+attributes #1 = { noinline optnone "unsafe-fp-math"="true" }
diff --git a/test/CodeGen/X86/dagcombine-and-setcc.ll b/test/CodeGen/X86/dagcombine-and-setcc.ll
index bb2bfbe..57adc8b 100644
--- a/test/CodeGen/X86/dagcombine-and-setcc.ll
+++ b/test/CodeGen/X86/dagcombine-and-setcc.ll
@@ -39,7 +39,7 @@ ret2:
 define i32 @main(i32 %argc, i8** nocapture readnone %argv) {
   %res = alloca i32, align 4
   %t = call i32 @foo(i32 1, i32 2, i32* %res) #3
-  %v = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i64 0, i64 0), i32 %t)
+  %v = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i64 0, i64 0), i32 %t)
   ret i32 0
 }
 
diff --git a/test/CodeGen/X86/dbg-changes-codegen-branch-folding.ll b/test/CodeGen/X86/dbg-changes-codegen-branch-folding.ll
index fe502bb..c5085a2 100644
--- a/test/CodeGen/X86/dbg-changes-codegen-branch-folding.ll
+++ b/test/CodeGen/X86/dbg-changes-codegen-branch-folding.ll
@@ -169,36 +169,36 @@ attributes #2 = { nounwind readnone }
 !53 = distinct !MDLexicalBlock(line: 14, column: 0, file: !1, scope: !51)
 !54 = !MDLocation(line: 16, scope: !53)
 !55 = !MDLocation(line: 17, scope: !24)
-!56 = !MDLocalVariable(tag: DW_TAG_arg_variable, name: "this", arg: 1, flags: DIFlagArtificial | DIFlagObjectPointer, scope: !40, type: !38, inlinedAt: !55)
+!56 = !MDLocalVariable(tag: DW_TAG_arg_variable, name: "this", arg: 1, flags: DIFlagArtificial | DIFlagObjectPointer, scope: !40, type: !38)
 !57 = !MDLocation(line: 0, scope: !40, inlinedAt: !55)
 !58 = !{i8* getelementptr inbounds ([1 x i8], [1 x i8]* @.str, i64 0, i64 0)}
-!59 = !MDLocalVariable(tag: DW_TAG_arg_variable, name: "value", line: 5, arg: 2, scope: !40, file: !25, type: !15, inlinedAt: !55)
+!59 = !MDLocalVariable(tag: DW_TAG_arg_variable, name: "value", line: 5, arg: 2, scope: !40, file: !25, type: !15)
 !60 = !MDLocation(line: 5, scope: !40, inlinedAt: !55)
 !61 = !MDLocation(line: 5, scope: !62, inlinedAt: !55)
 !62 = distinct !MDLexicalBlock(line: 5, column: 0, file: !1, scope: !40)
 !63 = !MDLocation(line: 18, scope: !24)
-!64 = !MDLocalVariable(tag: DW_TAG_arg_variable, name: "this", arg: 1, flags: DIFlagArtificial | DIFlagObjectPointer, scope: !40, type: !38, inlinedAt: !63)
+!64 = !MDLocalVariable(tag: DW_TAG_arg_variable, name: "this", arg: 1, flags: DIFlagArtificial | DIFlagObjectPointer, scope: !40, type: !38)
 !65 = !MDLocation(line: 0, scope: !40, inlinedAt: !63)
-!66 = !MDLocalVariable(tag: DW_TAG_arg_variable, name: "value", line: 5, arg: 2, scope: !40, file: !25, type: !15, inlinedAt: !63)
+!66 = !MDLocalVariable(tag: DW_TAG_arg_variable, name: "value", line: 5, arg: 2, scope: !40, file: !25, type: !15)
 !67 = !MDLocation(line: 5, scope: !40, inlinedAt: !63)
 !68 = !MDLocation(line: 5, scope: !62, inlinedAt: !63)
 !69 = !MDLocation(line: 20, scope: !70)
 !70 = distinct !MDLexicalBlock(line: 20, column: 0, file: !1, scope: !24)
-!71 = !MDLocalVariable(tag: DW_TAG_arg_variable, name: "this", arg: 1, flags: DIFlagArtificial | DIFlagObjectPointer, scope: !35, type: !38, inlinedAt: !72)
+!71 = !MDLocalVariable(tag: DW_TAG_arg_variable, name: "this", arg: 1, flags: DIFlagArtificial | DIFlagObjectPointer, scope: !35, type: !38)
 !72 = !MDLocation(line: 21, scope: !70)
 !73 = !MDLocation(line: 0, scope: !35, inlinedAt: !72)
 !74 = !{i8* getelementptr inbounds ([2 x i8], [2 x i8]* @.str1, i64 0, i64 0)}
-!75 = !MDLocalVariable(tag: DW_TAG_arg_variable, name: "value", line: 6, arg: 2, scope: !35, file: !25, type: !15, inlinedAt: !72)
+!75 = !MDLocalVariable(tag: DW_TAG_arg_variable, name: "value", line: 6, arg: 2, scope: !35, file: !25, type: !15)
 !76 = !MDLocation(line: 6, scope: !35, inlinedAt: !72)
-!77 = !MDLocalVariable(tag: DW_TAG_arg_variable, name: "this", arg: 1, flags: DIFlagArtificial | DIFlagObjectPointer, scope: !35, type: !38, inlinedAt: !78)
+!77 = !MDLocalVariable(tag: DW_TAG_arg_variable, name: "this", arg: 1, flags: DIFlagArtificial | DIFlagObjectPointer, scope: !35, type: !38)
 !78 = !MDLocation(line: 23, scope: !70)
 !79 = !MDLocation(line: 0, scope: !35, inlinedAt: !78)
 !80 = !{i8* getelementptr inbounds ([2 x i8], [2 x i8]* @.str2, i64 0, i64 0)}
-!81 = !MDLocalVariable(tag: DW_TAG_arg_variable, name: "value", line: 6, arg: 2, scope: !35, file: !25, type: !15, inlinedAt: !78)
+!81 = !MDLocalVariable(tag: DW_TAG_arg_variable, name: "value", line: 6, arg: 2, scope: !35, file: !25, type: !15)
 !82 = !MDLocation(line: 6, scope: !35, inlinedAt: !78)
-!83 = !MDLocalVariable(tag: DW_TAG_arg_variable, name: "this", arg: 1, flags: DIFlagArtificial | DIFlagObjectPointer, scope: !35, type: !38, inlinedAt: !84)
+!83 = !MDLocalVariable(tag: DW_TAG_arg_variable, name: "this", arg: 1, flags: DIFlagArtificial | DIFlagObjectPointer, scope: !35, type: !38)
 !84 = !MDLocation(line: 24, scope: !24)
 !85 = !MDLocation(line: 0, scope: !35, inlinedAt: !84)
-!86 = !MDLocalVariable(tag: DW_TAG_arg_variable, name: "value", line: 6, arg: 2, scope: !35, file: !25, type: !15, inlinedAt: !84)
+!86 = !MDLocalVariable(tag: DW_TAG_arg_variable, name: "value", line: 6, arg: 2, scope: !35, file: !25, type: !15)
 !87 = !MDLocation(line: 6, scope: !35, inlinedAt: !84)
 !88 = !MDLocation(line: 25, scope: !24)
diff --git a/test/CodeGen/X86/dbg-changes-codegen.ll b/test/CodeGen/X86/dbg-changes-codegen.ll
index 6cdfdc2..8f95338 100644
--- a/test/CodeGen/X86/dbg-changes-codegen.ll
+++ b/test/CodeGen/X86/dbg-changes-codegen.ll
@@ -44,7 +44,7 @@
 define zeroext i1 @_ZN3Foo3batEv(%struct.Foo* %this) #0 align 2 {
 entry:
   %0 = load %struct.Foo*, %struct.Foo** @pfoo, align 8
-  tail call void @llvm.dbg.value(metadata %struct.Foo* %0, i64 0, metadata !62, metadata !MDExpression())
+  tail call void @llvm.dbg.value(metadata %struct.Foo* %0, i64 0, metadata !62, metadata !MDExpression()), !dbg !MDLocation(scope: !MDSubprogram())
   %cmp.i = icmp eq %struct.Foo* %0, %this
   ret i1 %cmp.i
 }
@@ -53,7 +53,7 @@ entry:
 define void @_Z3bazv() #1 {
 entry:
   %0 = load %struct.Wibble*, %struct.Wibble** @wibble1, align 8
-  tail call void @llvm.dbg.value(metadata %struct.Flibble* undef, i64 0, metadata !65, metadata !MDExpression())
+  tail call void @llvm.dbg.value(metadata %struct.Flibble* undef, i64 0, metadata !65, metadata !MDExpression()), !dbg !MDLocation(scope: !MDSubprogram())
   %1 = load %struct.Wibble*, %struct.Wibble** @wibble2, align 8
   %cmp.i = icmp ugt %struct.Wibble* %1, %0
   br i1 %cmp.i, label %if.then.i, label %_ZN7Flibble3barEP6Wibble.exit
@@ -78,6 +78,6 @@ attributes #2 = { nounwind readnone }
 
 !17 = !MDDerivedType(tag: DW_TAG_reference_type, baseType: null)
 !45 = !MDDerivedType(tag: DW_TAG_pointer_type, size: 64, align: 64, baseType: null)
-!62 = !MDLocalVariable(tag: DW_TAG_arg_variable, name: "arg", line: 4, arg: 2, scope: null, type: !17)
+!62 = !MDLocalVariable(tag: DW_TAG_arg_variable, name: "arg", line: 4, arg: 2, scope: !MDSubprogram(), type: !17)
 !64 = !{%struct.Flibble* undef}
-!65 = !MDLocalVariable(tag: DW_TAG_arg_variable, name: "this", line: 13, arg: 1, flags: DIFlagArtificial | DIFlagObjectPointer, scope: null, type: !45)
+!65 = !MDLocalVariable(tag: DW_TAG_arg_variable, name: "this", line: 13, arg: 1, flags: DIFlagArtificial | DIFlagObjectPointer, scope: !MDSubprogram(), type: !45)
diff --git a/test/CodeGen/X86/discontiguous-loops.ll b/test/CodeGen/X86/discontiguous-loops.ll
index fa7692b..20db750 100644
--- a/test/CodeGen/X86/discontiguous-loops.ll
+++ b/test/CodeGen/X86/discontiguous-loops.ll
@@ -40,7 +40,7 @@ ybb8:                                              ; preds = %ybb1
 
 bb10:                                             ; preds = %ybb8
   %tmp11 = load i8*, i8** undef, align 8               ; <i8*> [#uses=1]
-  call void (i8*, ...)* @fatal(i8* getelementptr inbounds ([37 x i8], [37 x i8]* @.str96, i64 0, i64 0), i8* %tmp11) nounwind
+  call void (i8*, ...) @fatal(i8* getelementptr inbounds ([37 x i8], [37 x i8]* @.str96, i64 0, i64 0), i8* %tmp11) nounwind
   unreachable
 
 ybb12:                                             ; preds = %ybb8
@@ -51,7 +51,7 @@ ybb13:                                             ; preds = %ybb12
   br i1 %tmp14, label %bb16, label %ybb1
 
 bb15:                                             ; preds = %ybb12
-  call void (i8*, ...)* @fatal(i8* getelementptr inbounds ([37 x i8], [37 x i8]* @.str96, i64 0, i64 0), i8* undef) nounwind
+  call void (i8*, ...) @fatal(i8* getelementptr inbounds ([37 x i8], [37 x i8]* @.str96, i64 0, i64 0), i8* undef) nounwind
   unreachable
 
 bb16:                                             ; preds = %ybb13
diff --git a/test/CodeGen/X86/dllimport-x86_64.ll b/test/CodeGen/X86/dllimport-x86_64.ll
index af15a86..7ee6b43 100644
--- a/test/CodeGen/X86/dllimport-x86_64.ll
+++ b/test/CodeGen/X86/dllimport-x86_64.ll
@@ -36,13 +36,13 @@ define void @use() nounwind {
 ; OPT-NOT: call void @inline1()
 ; OPT-NOT: call void @inline2()
 ; OPT-NOT: load i32, i32* @Var2
-; OPT: call void (...)* @dummy(i32 %1, i32 1)
+; OPT: call void (...) @dummy(i32 %1, i32 1)
 
 ; CHECK-DAG: movq __imp_Var1(%rip), [[R1:%[a-z]{3}]]
 ; CHECK-DAG: movq __imp_Var2(%rip), [[R2:%[a-z]{3}]]
   %1 = load i32, i32* @Var1
   %2 = load i32, i32* @Var2
-  call void(...)* @dummy(i32 %1, i32 %2)
+  call void(...) @dummy(i32 %1, i32 %2)
 
   ret void
 }
diff --git a/test/CodeGen/X86/dllimport.ll b/test/CodeGen/X86/dllimport.ll
index eb9484c..9db654f 100644
--- a/test/CodeGen/X86/dllimport.ll
+++ b/test/CodeGen/X86/dllimport.ll
@@ -47,13 +47,13 @@ define void @use() nounwind {
 ; OPT-NOT: call void @inline1()
 ; OPT-NOT: call void @inline2()
 ; OPT-NOT: load i32, i32* @Var2
-; OPT: call void (...)* @dummy(i32 %1, i32 1)
+; OPT: call void (...) @dummy(i32 %1, i32 1)
 
 ; CHECK-DAG: movl __imp__Var1, [[R1:%[a-z]{3}]]
 ; CHECK-DAG: movl __imp__Var2, [[R2:%[a-z]{3}]]
   %1 = load i32, i32* @Var1
   %2 = load i32, i32* @Var2
-  call void(...)* @dummy(i32 %1, i32 %2)
+  call void(...) @dummy(i32 %1, i32 %2)
 
   ret void
 }
diff --git a/test/CodeGen/X86/early-ifcvt.ll b/test/CodeGen/X86/early-ifcvt.ll
index 6215519..7fcd530 100644
--- a/test/CodeGen/X86/early-ifcvt.ll
+++ b/test/CodeGen/X86/early-ifcvt.ll
@@ -62,7 +62,7 @@ if.then37:
 
 if.end41:
   %exit_status.0 = phi i32 [ 2, %if.then29 ], [ 0, %if.then37 ], [ 66, %entry ]
-  call void (...)* @fprintf(i32 %exit_status.0) nounwind
+  call void (...) @fprintf(i32 %exit_status.0) nounwind
   unreachable
 }
 
diff --git a/test/CodeGen/X86/exedeps-movq.ll b/test/CodeGen/X86/exedeps-movq.ll
new file mode 100644
index 0000000..b702c87
--- /dev/null
+++ b/test/CodeGen/X86/exedeps-movq.ll
@@ -0,0 +1,73 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=sse2 | FileCheck %s --check-prefix=SSE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx | FileCheck %s --check-prefix=AVX
+
+; Verify that we select the correct version of the instruction that stores the low 64-bits
+; of a 128-bit vector. We want to avoid int/fp domain crossing penalties, so ignore the
+; bitcast ops and choose:
+;
+; movlps for floats
+; movlpd for doubles
+; movq for integers
+
+define void @store_floats(<4 x float> %x, i64* %p) {
+; SSE-LABEL: store_floats:
+; SSE:       # BB#0:
+; SSE-NEXT:    addps %xmm0, %xmm0
+; SSE-NEXT:    movlps %xmm0, (%rdi)
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: store_floats:
+; AVX:       # BB#0:
+; AVX-NEXT:    vaddps %xmm0, %xmm0, %xmm0
+
+
+; !!! FIXME - the AVX version is not handled correctly.
+; AVX-NEXT:    vmovq %xmm0, (%rdi)
+
+
+; AVX-NEXT:    retq
+  %a = fadd <4 x float> %x, %x
+  %b = shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> <i32 0, i32 1>
+  %c = bitcast <2 x float> %b to i64
+  store i64 %c, i64* %p
+  ret void
+}
+
+define void @store_double(<2 x double> %x, i64* %p) {
+; SSE-LABEL: store_double:
+; SSE:       # BB#0:
+; SSE-NEXT:    addpd %xmm0, %xmm0
+; SSE-NEXT:    movlpd %xmm0, (%rdi)
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: store_double:
+; AVX:       # BB#0:
+; AVX-NEXT:    vaddpd %xmm0, %xmm0, %xmm0
+; AVX-NEXT:    vmovlpd %xmm0, (%rdi)
+; AVX-NEXT:    retq
+  %a = fadd <2 x double> %x, %x
+  %b = extractelement <2 x double> %a, i32 0
+  %c = bitcast double %b to i64
+  store i64 %c, i64* %p
+  ret void
+}
+
+define void @store_int(<4 x i32> %x, <2 x float>* %p) {
+; SSE-LABEL: store_int:
+; SSE:       # BB#0:
+; SSE-NEXT:    paddd %xmm0, %xmm0
+; SSE-NEXT:    movq %xmm0, (%rdi)
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: store_int:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpaddd %xmm0, %xmm0, %xmm0
+; AVX-NEXT:    vmovq %xmm0, (%rdi)
+; AVX-NEXT:    retq
+  %a = add <4 x i32> %x, %x
+  %b = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
+  %c = bitcast <2 x i32> %b to <2 x float>
+  store <2 x float> %c, <2 x float>* %p
+  ret void
+}
+
diff --git a/test/CodeGen/X86/extern_weak.ll b/test/CodeGen/X86/extern_weak.ll
index 01e32aa..c2ff09f 100644
--- a/test/CodeGen/X86/extern_weak.ll
+++ b/test/CodeGen/X86/extern_weak.ll
@@ -5,7 +5,7 @@
 declare extern_weak i32 @X(i8*)
 
 define void @bar() {
-        tail call void (...)* @foo( )
+        tail call void (...) @foo( )
         ret void
 }
 
diff --git a/test/CodeGen/X86/fast-isel-i1.ll b/test/CodeGen/X86/fast-isel-i1.ll
index d72a31c..589de76 100644
--- a/test/CodeGen/X86/fast-isel-i1.ll
+++ b/test/CodeGen/X86/fast-isel-i1.ll
@@ -23,14 +23,15 @@ exit:		; preds = %next
 
 define void @test2(i8* %a) nounwind {
 entry:
+; clang uses i8 constants for booleans, so we test with an i8 1.
 ; CHECK-LABEL: test2:
 ; CHECK: movb {{.*}} %al
 ; CHECK-NEXT: xorb $1, %al
 ; CHECK-NEXT: testb $1
   %tmp = load i8, i8* %a, align 1
-  %tobool = trunc i8 %tmp to i1
-  %tobool2 = xor i1 %tobool, true
-  br i1 %tobool2, label %if.then, label %if.end
+  %xor = xor i8 %tmp, 1
+  %tobool = trunc i8 %xor to i1
+  br i1 %tobool, label %if.then, label %if.end
 
 if.then:
   call void @test2(i8* null)
diff --git a/test/CodeGen/X86/fast-isel-sext.ll b/test/CodeGen/X86/fast-isel-sext.ll
new file mode 100644
index 0000000..ca1558e
--- /dev/null
+++ b/test/CodeGen/X86/fast-isel-sext.ll
@@ -0,0 +1,9 @@
+; RUN: llc -mtriple=x86_64-linux -fast-isel -show-mc-encoding < %s | FileCheck %s
+
+; CHECK-LABEL: f:
+; CHECK:       addl $-2, %eax         # encoding: [0x83,0xc0,0xfe]
+define i32 @f(i32* %y) {
+  %x = load i32, i32* %y
+  %dec = add i32 %x, -2
+  ret i32 %dec
+}
diff --git a/test/CodeGen/X86/fast-isel-x86-64.ll b/test/CodeGen/X86/fast-isel-x86-64.ll
index d4bbb63..d748cba 100644
--- a/test/CodeGen/X86/fast-isel-x86-64.ll
+++ b/test/CodeGen/X86/fast-isel-x86-64.ll
@@ -190,7 +190,7 @@ define void @test16() nounwind {
 ; CHECK: movl $1, %edi
 ; CHECK: movb $0, %al
 ; CHECK: callq _test16callee
-  call void (...)* @test16callee(i32 1)
+  call void (...) @test16callee(i32 1)
   br label %block2
 
 block2:
@@ -201,7 +201,7 @@ block2:
 ; AVX: vmovsd LCP{{.*}}_{{.*}}(%rip), %xmm0
 ; AVX: movb $1, %al
 ; AVX: callq _test16callee
-  call void (...)* @test16callee(double 1.000000e+00)
+  call void (...) @test16callee(double 1.000000e+00)
   ret void
 }
 
diff --git a/test/CodeGen/X86/fdiv-combine.ll b/test/CodeGen/X86/fdiv-combine.ll
new file mode 100644
index 0000000..279bb06
--- /dev/null
+++ b/test/CodeGen/X86/fdiv-combine.ll
@@ -0,0 +1,31 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s
+
+; Anything more than one division using a single divisor operand
+; should be converted into a reciprocal and multiplication.
+
+define float @div1_arcp(float %x, float %y, float %z) #0 {
+; CHECK-LABEL: div1_arcp:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    divss %xmm1, %xmm0
+; CHECK-NEXT:    retq
+  %div1 = fdiv arcp float %x, %y
+  ret float %div1
+}
+
+define float @div2_arcp(float %x, float %y, float %z) #0 {
+; CHECK-LABEL: div2_arcp:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; CHECK-NEXT:    divss %xmm2, %xmm3
+; CHECK-NEXT:    mulss %xmm3, %xmm0
+; CHECK-NEXT:    mulss %xmm1, %xmm0
+; CHECK-NEXT:    mulss %xmm3, %xmm0
+; CHECK-NEXT:    retq
+  %div1 = fdiv arcp float %x, %z
+  %mul = fmul arcp float %div1, %y
+  %div2 = fdiv arcp float %mul, %z
+  ret float %div2
+}
+
+; FIXME: If the backend understands 'arcp', then this attribute is unnecessary.
+attributes #0 = { "unsafe-fp-math"="true" }
diff --git a/test/CodeGen/X86/fltused.ll b/test/CodeGen/X86/fltused.ll
index dcc1382..6c5d8ce 100644
--- a/test/CodeGen/X86/fltused.ll
+++ b/test/CodeGen/X86/fltused.ll
@@ -11,7 +11,7 @@
 
 define i32 @main() nounwind {
 entry:
-  %call = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), double 1.000000e+000) nounwind
+  %call = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), double 1.000000e+000) nounwind
   ret i32 0
 }
 
diff --git a/test/CodeGen/X86/fltused_function_pointer.ll b/test/CodeGen/X86/fltused_function_pointer.ll
index ba5879a..a41ae48 100644
--- a/test/CodeGen/X86/fltused_function_pointer.ll
+++ b/test/CodeGen/X86/fltused_function_pointer.ll
@@ -11,7 +11,7 @@
 
 define i32 @foo(i32 (i8*, ...)* %f) nounwind {
 entry:
-  %call = tail call i32 (i8*, ...)* %f(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), double 1.000000e+000) nounwind
+  %call = tail call i32 (i8*, ...) %f(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), double 1.000000e+000) nounwind
   ret i32 0
 }
 
diff --git a/test/CodeGen/X86/fp-stack-O0.ll b/test/CodeGen/X86/fp-stack-O0.ll
index df90254..79ef28b 100644
--- a/test/CodeGen/X86/fp-stack-O0.ll
+++ b/test/CodeGen/X86/fp-stack-O0.ll
@@ -17,7 +17,7 @@ declare i32 @x2(x86_fp80, x86_fp80) nounwind
 ; CHECK-NEXT: x2
 define i32 @test1() nounwind uwtable ssp {
 entry:
-  %call = call x86_fp80 (...)* bitcast (x86_fp80 (i32)* @x1 to x86_fp80 (...)*)(i32 -1)
+  %call = call x86_fp80 (...) bitcast (x86_fp80 (i32)* @x1 to x86_fp80 (...)*)(i32 -1)
   %call1 = call i32 @x2(x86_fp80 %call, x86_fp80 0xK401EFFFFFFFF00000000)
   ret i32 %call1
 }
diff --git a/test/CodeGen/X86/fp-stack-ret-store.ll b/test/CodeGen/X86/fp-stack-ret-store.ll
index 05dfc54..c7cbb2a 100644
--- a/test/CodeGen/X86/fp-stack-ret-store.ll
+++ b/test/CodeGen/X86/fp-stack-ret-store.ll
@@ -7,7 +7,7 @@ target triple = "i686-apple-darwin8"
 
 define void @bar(double* %P) {
 entry:
-	%tmp = tail call double (...)* @foo( )		; <double> [#uses=1]
+	%tmp = tail call double (...) @foo( )		; <double> [#uses=1]
 	store double %tmp, double* %P, align 8
 	ret void
 }
@@ -16,7 +16,7 @@ declare double @foo(...)
 
 define void @bar2(float* %P) {
 entry:
-	%tmp = tail call double (...)* @foo2( )		; <double> [#uses=1]
+	%tmp = tail call double (...) @foo2( )		; <double> [#uses=1]
 	%tmp1 = fptrunc double %tmp to float		; <float> [#uses=1]
 	store float %tmp1, float* %P, align 4
 	ret void
diff --git a/test/CodeGen/X86/fpstack-debuginstr-kill.ll b/test/CodeGen/X86/fpstack-debuginstr-kill.ll
index 56c8c27..8eb452a 100644
--- a/test/CodeGen/X86/fpstack-debuginstr-kill.ll
+++ b/test/CodeGen/X86/fpstack-debuginstr-kill.ll
@@ -32,7 +32,7 @@ sw.bb735:                                         ; preds = %if.end511
   unreachable
 
 if.end41.i2210:                                   ; preds = %if.end511
-  call void @llvm.dbg.value(metadata x86_fp80 %src.sroa.0.0.src.sroa.0.0.2280, i64 0, metadata !20, metadata !MDExpression())
+  call void @llvm.dbg.value(metadata x86_fp80 %src.sroa.0.0.src.sroa.0.0.2280, i64 0, metadata !20, metadata !MDExpression()), !dbg !MDLocation(scope: !4)
   unreachable
 
 sw.bb992:                                         ; preds = %if.end511
diff --git a/test/CodeGen/X86/frameescape.ll b/test/CodeGen/X86/frameescape.ll
index 40eeb0e..3a624ae 100644
--- a/test/CodeGen/X86/frameescape.ll
+++ b/test/CodeGen/X86/frameescape.ll
@@ -12,11 +12,11 @@ define void @print_framealloc_from_fp(i8* %fp) {
   %a.i8 = call i8* @llvm.framerecover(i8* bitcast (void()* @alloc_func to i8*), i8* %fp, i32 0)
   %a = bitcast i8* %a.i8 to i32*
   %a.val = load i32, i32* %a
-  call i32 (i8*, ...)* @printf(i8* getelementptr ([10 x i8], [10 x i8]* @str, i32 0, i32 0), i32 %a.val)
+  call i32 (i8*, ...) @printf(i8* getelementptr ([10 x i8], [10 x i8]* @str, i32 0, i32 0), i32 %a.val)
   %b.i8 = call i8* @llvm.framerecover(i8* bitcast (void()* @alloc_func to i8*), i8* %fp, i32 1)
   %b = bitcast i8* %b.i8 to i32*
   %b.val = load i32, i32* %b
-  call i32 (i8*, ...)* @printf(i8* getelementptr ([10 x i8], [10 x i8]* @str, i32 0, i32 0), i32 %b.val)
+  call i32 (i8*, ...) @printf(i8* getelementptr ([10 x i8], [10 x i8]* @str, i32 0, i32 0), i32 %b.val)
   store i32 42, i32* %b
   ret void
 }
@@ -53,7 +53,7 @@ define void @print_framealloc_from_fp(i8* %fp) {
 define void @alloc_func() {
   %a = alloca i32
   %b = alloca i32
-  call void (...)* @llvm.frameescape(i32* %a, i32* %b)
+  call void (...) @llvm.frameescape(i32* %a, i32* %b)
   store i32 42, i32* %a
   store i32 13, i32* %b
   %fp = call i8* @llvm.frameaddress(i32 0)
@@ -97,7 +97,7 @@ define i32 @main() {
 define void @alloc_func_no_frameaddr() {
   %a = alloca i32
   %b = alloca i32
-  call void (...)* @llvm.frameescape(i32* %a, i32* %b)
+  call void (...) @llvm.frameescape(i32* %a, i32* %b)
   store i32 42, i32* %a
   store i32 13, i32* %b
   call void @print_framealloc_from_fp(i8* null)
diff --git a/test/CodeGen/X86/h-registers-3.ll b/test/CodeGen/X86/h-registers-3.ll
index 8a0b07b..29d0c28 100644
--- a/test/CodeGen/X86/h-registers-3.ll
+++ b/test/CodeGen/X86/h-registers-3.ll
@@ -3,7 +3,7 @@
 
 define zeroext i8 @foo() nounwind ssp {
 entry:
-  %0 = tail call zeroext i16 (...)* @bar() nounwind
+  %0 = tail call zeroext i16 (...) @bar() nounwind
   %1 = lshr i16 %0, 8
   %2 = trunc i16 %1 to i8
   ret i8 %2
diff --git a/test/CodeGen/X86/hoist-common.ll b/test/CodeGen/X86/hoist-common.ll
index 01d1b8c..65f8340 100644
--- a/test/CodeGen/X86/hoist-common.ll
+++ b/test/CodeGen/X86/hoist-common.ll
@@ -26,7 +26,7 @@ entry:
 
 if.then:
 ; CHECK: callq
-  %call = tail call zeroext i1 (...)* @foo() nounwind
+  %call = tail call zeroext i1 (...) @foo() nounwind
   br label %return
 
 return:
diff --git a/test/CodeGen/X86/inline-asm-duplicated-constraint.ll b/test/CodeGen/X86/inline-asm-duplicated-constraint.ll
new file mode 100644
index 0000000..2ef5474
--- /dev/null
+++ b/test/CodeGen/X86/inline-asm-duplicated-constraint.ll
@@ -0,0 +1,12 @@
+; RUN: llc < %s -march=x86-64 -no-integrated-as -mtriple=x86_64-linux-gnu | FileCheck %s
+
+; CHECK-LABEL: test1:
+; CHECK: movl	(%rdi), %eax
+; CHECK: nop
+; CHECK: movl	%eax, (%rdi)
+; CHECK: ret
+define void @test1(i32* %l) {
+  %load = load i32, i32* %l
+  call void asm "nop", "=*rmrm,0m0m,~{dirflag},~{fpsr},~{flags}"(i32* %l, i32 %load)
+  ret void
+}
diff --git a/test/CodeGen/X86/invalid-shift-immediate.ll b/test/CodeGen/X86/invalid-shift-immediate.ll
index 21ad6e8..1fb80c7 100644
--- a/test/CodeGen/X86/invalid-shift-immediate.ll
+++ b/test/CodeGen/X86/invalid-shift-immediate.ll
@@ -17,7 +17,7 @@ entry:
 	br i1 %toBool, label %bb, label %bb5
 
 bb:		; preds = %entry
-	%tmp4 = call i32 (...)* @bar( ) nounwind 		; <i32> [#uses=0]
+	%tmp4 = call i32 (...) @bar( ) nounwind 		; <i32> [#uses=0]
 	br label %bb5
 
 bb5:		; preds = %bb, %entry
diff --git a/test/CodeGen/X86/jump_sign.ll b/test/CodeGen/X86/jump_sign.ll
index 31a7af3..ca3e8bf 100644
--- a/test/CodeGen/X86/jump_sign.ll
+++ b/test/CodeGen/X86/jump_sign.ll
@@ -9,11 +9,11 @@ entry:
 	br i1 %tmp, label %cond_true, label %cond_next
 
 cond_true:		; preds = %entry
-	%tmp2 = tail call i32 (...)* @bar( )		; <i32> [#uses=0]
+	%tmp2 = tail call i32 (...) @bar( )		; <i32> [#uses=0]
 	br label %cond_next
 
 cond_next:		; preds = %cond_true, %entry
-	%tmp3 = tail call i32 (...)* @baz( )		; <i32> [#uses=0]
+	%tmp3 = tail call i32 (...) @baz( )		; <i32> [#uses=0]
 	ret i32 undef
 }
 
diff --git a/test/CodeGen/X86/licm-nested.ll b/test/CodeGen/X86/licm-nested.ll
index 4ec2b52..42e6d12 100644
--- a/test/CodeGen/X86/licm-nested.ll
+++ b/test/CodeGen/X86/licm-nested.ll
@@ -81,7 +81,7 @@ for.inc35:                                        ; preds = %for.body15, %for.en
 
 while.end:                                        ; preds = %while.cond.loopexit, %while.cond.preheader
   %count.0.lcssa = phi i32 [ 0, %while.cond.preheader ], [ %count.1, %while.cond.loopexit ] ; <i32> [#uses=1]
-  %call40 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([11 x i8], [11 x i8]* @.str, i64 0, i64 0), i32 %count.0.lcssa) nounwind ; <i32> [#uses=0]
+  %call40 = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([11 x i8], [11 x i8]* @.str, i64 0, i64 0), i32 %count.0.lcssa) nounwind ; <i32> [#uses=0]
   ret i32 0
 }
 
diff --git a/test/CodeGen/X86/licm-regpressure.ll b/test/CodeGen/X86/licm-regpressure.ll
new file mode 100644
index 0000000..0ab6554
--- /dev/null
+++ b/test/CodeGen/X86/licm-regpressure.ll
@@ -0,0 +1,39 @@
+; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s
+; This tests currently fails as MachineLICM does not compute register pressure
+; correctly. More details: llvm.org/PR23143
+; XFAIL: *
+
+; MachineLICM should take register pressure into account.
+; CHECK-NOT: Spill
+
+%struct.A = type { i32, i32, i32, i32, i32, i32, i32 }
+
+define void @test(i1 %b, %struct.A* %a) nounwind {
+entry:
+  br label %loop-header
+
+loop-header:
+  br label %loop-body
+
+loop-body:
+  %0 = getelementptr inbounds %struct.A, %struct.A* %a, i64 0, i32 0
+  %1 = getelementptr inbounds %struct.A, %struct.A* %a, i64 0, i32 1
+  %2 = getelementptr inbounds %struct.A, %struct.A* %a, i64 0, i32 2
+  %3 = getelementptr inbounds %struct.A, %struct.A* %a, i64 0, i32 3
+  %4 = getelementptr inbounds %struct.A, %struct.A* %a, i64 0, i32 4
+  %5 = getelementptr inbounds %struct.A, %struct.A* %a, i64 0, i32 5
+  %6 = getelementptr inbounds %struct.A, %struct.A* %a, i64 0, i32 6
+  call void @assign(i32* %0)
+  call void @assign(i32* %1)
+  call void @assign(i32* %2)
+  call void @assign(i32* %3)
+  call void @assign(i32* %4)
+  call void @assign(i32* %5)
+  call void @assign(i32* %6)
+  br i1 %b, label %loop-body, label %loop-exit
+
+loop-exit:
+  ret void
+}
+
+declare void @assign(i32*)
diff --git a/test/CodeGen/X86/licm-symbol.ll b/test/CodeGen/X86/licm-symbol.ll
index 854ea0b..0f115dd 100644
--- a/test/CodeGen/X86/licm-symbol.ll
+++ b/test/CodeGen/X86/licm-symbol.ll
@@ -29,11 +29,11 @@ bb151:                                            ; preds = %bb59, %bb56, %bb14
   br i1 undef, label %bb56, label %bb59
 
 bb56:                                             ; preds = %bb151
-  %t0 = call i32 (%struct.FILE*)* @fprintf(%struct.FILE* getelementptr inbounds ([0 x %struct.FILE], [0 x %struct.FILE]* @__sF, i32 0, i32 2)) nounwind
+  %t0 = call i32 (%struct.FILE*) @fprintf(%struct.FILE* getelementptr inbounds ([0 x %struct.FILE], [0 x %struct.FILE]* @__sF, i32 0, i32 2)) nounwind
   br label %bb151
 
 bb59:                                             ; preds = %bb151
-  %t1 = call i32 (%struct.FILE*)* @fprintf(%struct.FILE* getelementptr inbounds ([0 x %struct.FILE], [0 x %struct.FILE]* @__sF, i32 0, i32 2)) nounwind
+  %t1 = call i32 (%struct.FILE*) @fprintf(%struct.FILE* getelementptr inbounds ([0 x %struct.FILE], [0 x %struct.FILE]* @__sF, i32 0, i32 2)) nounwind
   br label %bb151
 }
 
diff --git a/test/CodeGen/X86/lsr-normalization.ll b/test/CodeGen/X86/lsr-normalization.ll
index e75f5b2..09c892c 100644
--- a/test/CodeGen/X86/lsr-normalization.ll
+++ b/test/CodeGen/X86/lsr-normalization.ll
@@ -71,7 +71,7 @@ bb25:                                             ; preds = %bb25, %bb23
 
 bb32:                                             ; preds = %bb25
   %tmp33 = mul i64 %tmp31, %tmp24                 ; <i64> [#uses=1]
-  %tmp34 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([13 x i8], [13 x i8]* @0, i64 0, i64 0), i64 %tmp33) nounwind
+  %tmp34 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([13 x i8], [13 x i8]* @0, i64 0, i64 0), i64 %tmp33) nounwind
   br label %bb35
 
 bb35:                                             ; preds = %bb32, %bb14
diff --git a/test/CodeGen/X86/machine-cse.ll b/test/CodeGen/X86/machine-cse.ll
index ce3ab4c..c6876d2 100644
--- a/test/CodeGen/X86/machine-cse.ll
+++ b/test/CodeGen/X86/machine-cse.ll
@@ -62,7 +62,7 @@ if.end34:                                         ; preds = %sw.bb
 ; CHECK: %if.end34
 ; CHECK: leal
 ; CHECK-NOT: imull
-  tail call void (...)* @printf(i32 %test_case, i32 %mul20) nounwind
+  tail call void (...) @printf(i32 %test_case, i32 %mul20) nounwind
   %tmp = mul i32 %scale, %test_case
   %tmp752 = mul i32 %tmp, 3
   %tmp753 = zext i32 %tmp752 to i64
diff --git a/test/CodeGen/X86/memcmp.ll b/test/CodeGen/X86/memcmp.ll
index d5a3d8e..e5f1f52 100644
--- a/test/CodeGen/X86/memcmp.ll
+++ b/test/CodeGen/X86/memcmp.ll
@@ -11,7 +11,7 @@ declare i32 @memcmp(...)
 
 define void @memcmp2(i8* %X, i8* %Y, i32* nocapture %P) nounwind {
 entry:
-  %0 = tail call i32 (...)* @memcmp(i8* %X, i8* %Y, i32 2) nounwind ; <i32> [#uses=1]
+  %0 = tail call i32 (...) @memcmp(i8* %X, i8* %Y, i32 2) nounwind ; <i32> [#uses=1]
   %1 = icmp eq i32 %0, 0                          ; <i1> [#uses=1]
   br i1 %1, label %return, label %bb
 
@@ -31,7 +31,7 @@ return:                                           ; preds = %entry
 
 define void @memcmp2a(i8* %X, i32* nocapture %P) nounwind {
 entry:
-  %0 = tail call i32 (...)* @memcmp(i8* %X, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 1), i32 2) nounwind ; <i32> [#uses=1]
+  %0 = tail call i32 (...) @memcmp(i8* %X, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 1), i32 2) nounwind ; <i32> [#uses=1]
   %1 = icmp eq i32 %0, 0                          ; <i1> [#uses=1]
   br i1 %1, label %return, label %bb
 
@@ -49,7 +49,7 @@ return:                                           ; preds = %entry
 
 define void @memcmp4(i8* %X, i8* %Y, i32* nocapture %P) nounwind {
 entry:
-  %0 = tail call i32 (...)* @memcmp(i8* %X, i8* %Y, i32 4) nounwind ; <i32> [#uses=1]
+  %0 = tail call i32 (...) @memcmp(i8* %X, i8* %Y, i32 4) nounwind ; <i32> [#uses=1]
   %1 = icmp eq i32 %0, 0                          ; <i1> [#uses=1]
   br i1 %1, label %return, label %bb
 
@@ -66,7 +66,7 @@ return:                                           ; preds = %entry
 
 define void @memcmp4a(i8* %X, i32* nocapture %P) nounwind {
 entry:
-  %0 = tail call i32 (...)* @memcmp(i8* %X, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 1), i32 4) nounwind ; <i32> [#uses=1]
+  %0 = tail call i32 (...) @memcmp(i8* %X, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 1), i32 4) nounwind ; <i32> [#uses=1]
   %1 = icmp eq i32 %0, 0                          ; <i1> [#uses=1]
   br i1 %1, label %return, label %bb
 
@@ -82,7 +82,7 @@ return:                                           ; preds = %entry
 
 define void @memcmp8(i8* %X, i8* %Y, i32* nocapture %P) nounwind {
 entry:
-  %0 = tail call i32 (...)* @memcmp(i8* %X, i8* %Y, i32 8) nounwind ; <i32> [#uses=1]
+  %0 = tail call i32 (...) @memcmp(i8* %X, i8* %Y, i32 8) nounwind ; <i32> [#uses=1]
   %1 = icmp eq i32 %0, 0                          ; <i1> [#uses=1]
   br i1 %1, label %return, label %bb
 
@@ -99,7 +99,7 @@ return:                                           ; preds = %entry
 
 define void @memcmp8a(i8* %X, i32* nocapture %P) nounwind {
 entry:
-  %0 = tail call i32 (...)* @memcmp(i8* %X, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0), i32 8) nounwind ; <i32> [#uses=1]
+  %0 = tail call i32 (...) @memcmp(i8* %X, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0), i32 8) nounwind ; <i32> [#uses=1]
   %1 = icmp eq i32 %0, 0                          ; <i1> [#uses=1]
   br i1 %1, label %return, label %bb
 
diff --git a/test/CodeGen/X86/misched-code-difference-with-debug.ll b/test/CodeGen/X86/misched-code-difference-with-debug.ll
index 256db8b..2cc70e1 100644
--- a/test/CodeGen/X86/misched-code-difference-with-debug.ll
+++ b/test/CodeGen/X86/misched-code-difference-with-debug.ll
@@ -34,9 +34,9 @@ entry:
   %c = alloca %class.C, align 1
   %0 = load i8, i8* @argc, align 1
   %conv = sext i8 %0 to i32
-  %call = call i32 (%class.C*, i8, i8, i8, ...)* @test_function(%class.C* %c, i8 signext 0, i8 signext %0, i8 signext 0, i32 %conv)
+  %call = call i32 (%class.C*, i8, i8, i8, ...) @test_function(%class.C* %c, i8 signext 0, i8 signext %0, i8 signext 0, i32 %conv)
   %1 = load i8, i8* @argc, align 1
-  %call2 = call i32 (%class.C*, i8, i8, i8, ...)* @test_function(%class.C* %c, i8 signext 0, i8 signext %1, i8 signext 0, i32 %conv)
+  %call2 = call i32 (%class.C*, i8, i8, i8, ...) @test_function(%class.C* %c, i8 signext 0, i8 signext %1, i8 signext 0, i32 %conv)
   ret void
 }
 
@@ -47,13 +47,13 @@ define void @test_with_debug() {
 entry:
   %c = alloca %class.C, align 1
   %0 = load i8, i8* @argc, align 1
-  tail call void @llvm.dbg.value(metadata i8 %0, i64 0, metadata !19, metadata !29)
+  tail call void @llvm.dbg.value(metadata i8 %0, i64 0, metadata !19, metadata !29), !dbg !MDLocation(scope: !13)
   %conv = sext i8 %0 to i32
-  tail call void @llvm.dbg.value(metadata %class.C* %c, i64 0, metadata !18, metadata !29)
-  %call = call i32 (%class.C*, i8, i8, i8, ...)* @test_function(%class.C* %c, i8 signext 0, i8 signext %0, i8 signext 0, i32 %conv)
+  tail call void @llvm.dbg.value(metadata %class.C* %c, i64 0, metadata !18, metadata !29), !dbg !MDLocation(scope: !13)
+  %call = call i32 (%class.C*, i8, i8, i8, ...) @test_function(%class.C* %c, i8 signext 0, i8 signext %0, i8 signext 0, i32 %conv)
   %1 = load i8, i8* @argc, align 1
-  call void @llvm.dbg.value(metadata %class.C* %c, i64 0, metadata !18, metadata !29)
-  %call2 = call i32 (%class.C*, i8, i8, i8, ...)* @test_function(%class.C* %c, i8 signext 0, i8 signext %1, i8 signext 0, i32 %conv)
+  call void @llvm.dbg.value(metadata %class.C* %c, i64 0, metadata !18, metadata !29), !dbg !MDLocation(scope: !13)
+  %call2 = call i32 (%class.C*, i8, i8, i8, ...) @test_function(%class.C* %c, i8 signext 0, i8 signext %1, i8 signext 0, i32 %conv)
   ret void
 }
 
diff --git a/test/CodeGen/X86/mmx-arg-passing-x86-64.ll b/test/CodeGen/X86/mmx-arg-passing-x86-64.ll
index 36ccfe9..2727e3e 100644
--- a/test/CodeGen/X86/mmx-arg-passing-x86-64.ll
+++ b/test/CodeGen/X86/mmx-arg-passing-x86-64.ll
@@ -14,7 +14,7 @@ define void @t3() nounwind  {
 ; X86-64-NEXT:    jmp _pass_v8qi ## TAILCALL
   %tmp3 = load <8 x i8>, <8 x i8>* @g_v8qi, align 8
   %tmp3a = bitcast <8 x i8> %tmp3 to x86_mmx
-  %tmp4 = tail call i32 (...)* @pass_v8qi( x86_mmx %tmp3a ) nounwind
+  %tmp4 = tail call i32 (...) @pass_v8qi( x86_mmx %tmp3a ) nounwind
   ret void
 }
 
@@ -34,7 +34,7 @@ define void @t4(x86_mmx %v1, x86_mmx %v2) nounwind  {
   %v2b = bitcast x86_mmx %v2 to <8 x i8>
   %tmp3 = add <8 x i8> %v1a, %v2b
   %tmp3a = bitcast <8 x i8> %tmp3 to x86_mmx
-  %tmp4 = tail call i32 (...)* @pass_v8qi( x86_mmx %tmp3a ) nounwind
+  %tmp4 = tail call i32 (...) @pass_v8qi( x86_mmx %tmp3a ) nounwind
   ret void
 }
 
diff --git a/test/CodeGen/X86/mmx-bitcast.ll b/test/CodeGen/X86/mmx-bitcast.ll
index 4aa10a9..00c8039 100644
--- a/test/CodeGen/X86/mmx-bitcast.ll
+++ b/test/CodeGen/X86/mmx-bitcast.ll
@@ -75,8 +75,7 @@ define i64 @t5(i32 %a, i32 %b) nounwind readnone {
 ; CHECK-NEXT:    movd
 ; CHECK-NEXT:    movd
 ; CHECK-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,1,1,3]
-; CHECK-NEXT:    movd %xmm0, %rax
+; CHECK-NEXT:    movd %xmm1, %rax
 ; CHECK-NEXT:    retq
   %v0 = insertelement <2 x i32> undef, i32 %a, i32 0
   %v1 = insertelement <2 x i32> %v0, i32 %b, i32 1
diff --git a/test/CodeGen/X86/movtopush.ll b/test/CodeGen/X86/movtopush.ll
index 4278910..f89e524 100644
--- a/test/CodeGen/X86/movtopush.ll
+++ b/test/CodeGen/X86/movtopush.ll
@@ -265,7 +265,7 @@ define void @test10() optsize {
   store void (i32, i32, i32, i32)* @good, void (i32, i32, i32, i32)** %stack_fptr
   %good_ptr = load volatile void (i32, i32, i32, i32)*, void (i32, i32, i32, i32)** %stack_fptr
   call void asm sideeffect "nop", "~{ax},~{bx},~{cx},~{dx},~{bp},~{si},~{di}"()
-  call void (i32, i32, i32, i32)* %good_ptr(i32 1, i32 2, i32 3, i32 4)
+  call void (i32, i32, i32, i32) %good_ptr(i32 1, i32 2, i32 3, i32 4)
   ret void
 }
 
diff --git a/test/CodeGen/X86/musttail-fastcall.ll b/test/CodeGen/X86/musttail-fastcall.ll
index ed3668d..a95e0ff 100644
--- a/test/CodeGen/X86/musttail-fastcall.ll
+++ b/test/CodeGen/X86/musttail-fastcall.ll
@@ -9,13 +9,13 @@
 declare void @puts(i8*)
 
 define i32 @call_fast_thunk() {
-  %r = call x86_fastcallcc i32 (...)* @fast_thunk(i32 inreg 1, i32 inreg 2, i32 3)
+  %r = call x86_fastcallcc i32 (...) @fast_thunk(i32 inreg 1, i32 inreg 2, i32 3)
   ret i32 %r
 }
 
 define x86_fastcallcc i32 @fast_thunk(...) {
   call void @puts(i8* getelementptr ([4 x i8], [4 x i8]* @asdf, i32 0, i32 0))
-  %r = musttail call x86_fastcallcc i32 (...)* bitcast (i32 (i32, i32, i32)* @fast_target to i32 (...)*) (...)
+  %r = musttail call x86_fastcallcc i32 (...) bitcast (i32 (i32, i32, i32)* @fast_target to i32 (...)*) (...)
   ret i32 %r
 }
 
@@ -38,13 +38,13 @@ define x86_fastcallcc i32 @fast_target(i32 inreg %a, i32 inreg %b, i32 %c) {
 ; Repeat the test for vectorcall, which has XMM registers.
 
 define i32 @call_vector_thunk() {
-  %r = call x86_vectorcallcc i32 (...)* @vector_thunk(i32 inreg 1, i32 inreg 2, i32 3)
+  %r = call x86_vectorcallcc i32 (...) @vector_thunk(i32 inreg 1, i32 inreg 2, i32 3)
   ret i32 %r
 }
 
 define x86_vectorcallcc i32 @vector_thunk(...) {
   call void @puts(i8* getelementptr ([4 x i8], [4 x i8]* @asdf, i32 0, i32 0))
-  %r = musttail call x86_vectorcallcc i32 (...)* bitcast (i32 (i32, i32, i32)* @vector_target to i32 (...)*) (...)
+  %r = musttail call x86_vectorcallcc i32 (...) bitcast (i32 (i32, i32, i32)* @vector_target to i32 (...)*) (...)
   ret i32 %r
 }
 
diff --git a/test/CodeGen/X86/musttail-varargs.ll b/test/CodeGen/X86/musttail-varargs.ll
index 52115b2..3613f4c 100644
--- a/test/CodeGen/X86/musttail-varargs.ll
+++ b/test/CodeGen/X86/musttail-varargs.ll
@@ -16,8 +16,8 @@ define void @f_thunk(i8* %this, ...) {
   %ap_i8 = bitcast [4 x i8*]* %ap to i8*
   call void @llvm.va_start(i8* %ap_i8)
 
-  %fptr = call void(i8*, ...)*(i8*)* @get_f(i8* %this)
-  musttail call void (i8*, ...)* %fptr(i8* %this, ...)
+  %fptr = call void(i8*, ...)*(i8*) @get_f(i8* %this)
+  musttail call void (i8*, ...) %fptr(i8* %this, ...)
   ret void
 }
 
@@ -84,7 +84,7 @@ define void @f_thunk(i8* %this, ...) {
 
 define void @g_thunk(i8* %fptr_i8, ...) {
   %fptr = bitcast i8* %fptr_i8 to void (i8*, ...)*
-  musttail call void (i8*, ...)* %fptr(i8* %fptr_i8, ...)
+  musttail call void (i8*, ...) %fptr(i8* %fptr_i8, ...)
   ret void
 }
 
@@ -114,7 +114,7 @@ then:
   %a_p = getelementptr %struct.Foo, %struct.Foo* %this, i32 0, i32 1
   %a_i8 = load i8*, i8** %a_p
   %a = bitcast i8* %a_i8 to void (%struct.Foo*, ...)*
-  musttail call void (%struct.Foo*, ...)* %a(%struct.Foo* %this, ...)
+  musttail call void (%struct.Foo*, ...) %a(%struct.Foo* %this, ...)
   ret void
 
 else:
@@ -122,7 +122,7 @@ else:
   %b_i8 = load i8*, i8** %b_p
   %b = bitcast i8* %b_i8 to void (%struct.Foo*, ...)*
   store i32 42, i32* @g
-  musttail call void (%struct.Foo*, ...)* %b(%struct.Foo* %this, ...)
+  musttail call void (%struct.Foo*, ...) %b(%struct.Foo* %this, ...)
   ret void
 }
 
diff --git a/test/CodeGen/X86/narrow-shl-cst.ll b/test/CodeGen/X86/narrow-shl-cst.ll
index 40b9760..c9e9a3d 100644
--- a/test/CodeGen/X86/narrow-shl-cst.ll
+++ b/test/CodeGen/X86/narrow-shl-cst.ll
@@ -99,3 +99,26 @@ define i64 @test11(i64 %x) nounwind {
 ; CHECK: xorq $-65536
 ; CHECK: shlq $33
 }
+
+; PR23098
+define i32 @test12(i32 %x, i32* %y) nounwind {
+  %and = shl i32 %x, 1
+  %shl = and i32 %and, 255
+  store i32 %shl, i32* %y
+  ret i32 %shl
+; CHECK-LABEL: test12:
+; CHECK: andl $127
+; CHECK-NEXT: addl
+; CHECK-NOT: shl
+}
+
+define i64 @test13(i64 %x, i64* %y) nounwind {
+  %and = shl i64 %x, 1
+  %shl = and i64 %and, 255
+  store i64 %shl, i64* %y
+  ret i64 %shl
+; CHECK-LABEL: test13:
+; CHECK: andq $127
+; CHECK-NEXT: addq
+; CHECK-NOT: shl
+}
diff --git a/test/CodeGen/X86/nontemporal-2.ll b/test/CodeGen/X86/nontemporal-2.ll
index f62f372..8c08b3c 100644
--- a/test/CodeGen/X86/nontemporal-2.ll
+++ b/test/CodeGen/X86/nontemporal-2.ll
@@ -1,31 +1,303 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=corei7 | FileCheck %s -check-prefix=CHECK -check-prefix=SSE
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=corei7-avx | FileCheck %s -check-prefix=CHECK -check-prefix=AVX
-
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=core-avx2 | FileCheck %s -check-prefix=CHECK -check-prefix=AVX2
 
 ; Make sure that we generate non-temporal stores for the test cases below.
+; We use xorps for zeroing, so domain information isn't available anymore.
 
-define void @test1(<4 x float>* %dst) {
-; CHECK-LABEL: test1:
+define void @test_zero_v4f32(<4 x float>* %dst) {
+; CHECK-LABEL: test_zero_v4f32:
 ; SSE: movntps
 ; AVX: vmovntps
   store <4 x float> zeroinitializer, <4 x float>* %dst, align 16, !nontemporal !1
   ret void
 }
 
-define void @test2(<4 x i32>* %dst) {
-; CHECK-LABEL: test2:
+define void @test_zero_v4i32(<4 x i32>* %dst) {
+; CHECK-LABEL: test_zero_v4i32:
 ; SSE: movntps
 ; AVX: vmovntps
   store <4 x i32> zeroinitializer, <4 x i32>* %dst, align 16, !nontemporal !1
   ret void
 }
 
-define void @test3(<2 x double>* %dst) {
-; CHECK-LABEL: test3:
+define void @test_zero_v2f64(<2 x double>* %dst) {
+; CHECK-LABEL: test_zero_v2f64:
 ; SSE: movntps
 ; AVX: vmovntps
   store <2 x double> zeroinitializer, <2 x double>* %dst, align 16, !nontemporal !1
   ret void
 }
 
+define void @test_zero_v2i64(<2 x i64>* %dst) {
+; CHECK-LABEL: test_zero_v2i64:
+; SSE: movntps
+; AVX: vmovntps
+  store <2 x i64> zeroinitializer, <2 x i64>* %dst, align 16, !nontemporal !1
+  ret void
+}
+
+define void @test_zero_v8i16(<8 x i16>* %dst) {
+; CHECK-LABEL: test_zero_v8i16:
+; SSE: movntps
+; AVX: vmovntps
+  store <8 x i16> zeroinitializer, <8 x i16>* %dst, align 16, !nontemporal !1
+  ret void
+}
+
+define void @test_zero_v16i8(<16 x i8>* %dst) {
+; CHECK-LABEL: test_zero_v16i8:
+; SSE: movntps
+; AVX: vmovntps
+  store <16 x i8> zeroinitializer, <16 x i8>* %dst, align 16, !nontemporal !1
+  ret void
+}
+
+; And now YMM versions.
+
+define void @test_zero_v8f32(<8 x float>* %dst) {
+; CHECK-LABEL: test_zero_v8f32:
+; AVX: vmovntps %ymm
+  store <8 x float> zeroinitializer, <8 x float>* %dst, align 32, !nontemporal !1
+  ret void
+}
+
+define void @test_zero_v8i32(<8 x i32>* %dst) {
+; CHECK-LABEL: test_zero_v8i32:
+; AVX2: vmovntps %ymm
+  store <8 x i32> zeroinitializer, <8 x i32>* %dst, align 32, !nontemporal !1
+  ret void
+}
+
+define void @test_zero_v4f64(<4 x double>* %dst) {
+; CHECK-LABEL: test_zero_v4f64:
+; AVX: vmovntps %ymm
+  store <4 x double> zeroinitializer, <4 x double>* %dst, align 32, !nontemporal !1
+  ret void
+}
+
+define void @test_zero_v4i64(<4 x i64>* %dst) {
+; CHECK-LABEL: test_zero_v4i64:
+; AVX2: vmovntps %ymm
+  store <4 x i64> zeroinitializer, <4 x i64>* %dst, align 32, !nontemporal !1
+  ret void
+}
+
+define void @test_zero_v16i16(<16 x i16>* %dst) {
+; CHECK-LABEL: test_zero_v16i16:
+; AVX2: vmovntps %ymm
+  store <16 x i16> zeroinitializer, <16 x i16>* %dst, align 32, !nontemporal !1
+  ret void
+}
+
+define void @test_zero_v32i8(<32 x i8>* %dst) {
+; CHECK-LABEL: test_zero_v32i8:
+; AVX2: vmovntps %ymm
+  store <32 x i8> zeroinitializer, <32 x i8>* %dst, align 32, !nontemporal !1
+  ret void
+}
+
+
+; Check that we also handle arguments.  Here the type survives longer.
+
+define void @test_arg_v4f32(<4 x float> %arg, <4 x float>* %dst) {
+; CHECK-LABEL: test_arg_v4f32:
+; SSE: movntps
+; AVX: vmovntps
+  store <4 x float> %arg, <4 x float>* %dst, align 16, !nontemporal !1
+  ret void
+}
+
+define void @test_arg_v4i32(<4 x i32> %arg, <4 x i32>* %dst) {
+; CHECK-LABEL: test_arg_v4i32:
+; SSE: movntps
+; AVX: vmovntps
+  store <4 x i32> %arg, <4 x i32>* %dst, align 16, !nontemporal !1
+  ret void
+}
+
+define void @test_arg_v2f64(<2 x double> %arg, <2 x double>* %dst) {
+; CHECK-LABEL: test_arg_v2f64:
+; SSE: movntps
+; AVX: vmovntps
+  store <2 x double> %arg, <2 x double>* %dst, align 16, !nontemporal !1
+  ret void
+}
+
+define void @test_arg_v2i64(<2 x i64> %arg, <2 x i64>* %dst) {
+; CHECK-LABEL: test_arg_v2i64:
+; SSE: movntps
+; AVX: vmovntps
+  store <2 x i64> %arg, <2 x i64>* %dst, align 16, !nontemporal !1
+  ret void
+}
+
+define void @test_arg_v8i16(<8 x i16> %arg, <8 x i16>* %dst) {
+; CHECK-LABEL: test_arg_v8i16:
+; SSE: movntps
+; AVX: vmovntps
+  store <8 x i16> %arg, <8 x i16>* %dst, align 16, !nontemporal !1
+  ret void
+}
+
+define void @test_arg_v16i8(<16 x i8> %arg, <16 x i8>* %dst) {
+; CHECK-LABEL: test_arg_v16i8:
+; SSE: movntps
+; AVX: vmovntps
+  store <16 x i8> %arg, <16 x i8>* %dst, align 16, !nontemporal !1
+  ret void
+}
+
+; And now YMM versions.
+
+define void @test_arg_v8f32(<8 x float> %arg, <8 x float>* %dst) {
+; CHECK-LABEL: test_arg_v8f32:
+; AVX: vmovntps %ymm
+  store <8 x float> %arg, <8 x float>* %dst, align 32, !nontemporal !1
+  ret void
+}
+
+define void @test_arg_v8i32(<8 x i32> %arg, <8 x i32>* %dst) {
+; CHECK-LABEL: test_arg_v8i32:
+; AVX2: vmovntps %ymm
+  store <8 x i32> %arg, <8 x i32>* %dst, align 32, !nontemporal !1
+  ret void
+}
+
+define void @test_arg_v4f64(<4 x double> %arg, <4 x double>* %dst) {
+; CHECK-LABEL: test_arg_v4f64:
+; AVX: vmovntps %ymm
+  store <4 x double> %arg, <4 x double>* %dst, align 32, !nontemporal !1
+  ret void
+}
+
+define void @test_arg_v4i64(<4 x i64> %arg, <4 x i64>* %dst) {
+; CHECK-LABEL: test_arg_v4i64:
+; AVX2: vmovntps %ymm
+  store <4 x i64> %arg, <4 x i64>* %dst, align 32, !nontemporal !1
+  ret void
+}
+
+define void @test_arg_v16i16(<16 x i16> %arg, <16 x i16>* %dst) {
+; CHECK-LABEL: test_arg_v16i16:
+; AVX2: vmovntps %ymm
+  store <16 x i16> %arg, <16 x i16>* %dst, align 32, !nontemporal !1
+  ret void
+}
+
+define void @test_arg_v32i8(<32 x i8> %arg, <32 x i8>* %dst) {
+; CHECK-LABEL: test_arg_v32i8:
+; AVX2: vmovntps %ymm
+  store <32 x i8> %arg, <32 x i8>* %dst, align 32, !nontemporal !1
+  ret void
+}
+
+
+; Now check that if the execution domain is trivially visible, we use it.
+; We use an add to make the type survive all the way to the MOVNT.
+
+define void @test_op_v4f32(<4 x float> %a, <4 x float> %b, <4 x float>* %dst) {
+; CHECK-LABEL: test_op_v4f32:
+; SSE: movntps
+; AVX: vmovntps
+  %r = fadd <4 x float> %a, %b
+  store <4 x float> %r, <4 x float>* %dst, align 16, !nontemporal !1
+  ret void
+}
+
+define void @test_op_v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32>* %dst) {
+; CHECK-LABEL: test_op_v4i32:
+; SSE: movntdq
+; AVX: vmovntdq
+  %r = add <4 x i32> %a, %b
+  store <4 x i32> %r, <4 x i32>* %dst, align 16, !nontemporal !1
+  ret void
+}
+
+define void @test_op_v2f64(<2 x double> %a, <2 x double> %b, <2 x double>* %dst) {
+; CHECK-LABEL: test_op_v2f64:
+; SSE: movntpd
+; AVX: vmovntpd
+  %r = fadd <2 x double> %a, %b
+  store <2 x double> %r, <2 x double>* %dst, align 16, !nontemporal !1
+  ret void
+}
+
+define void @test_op_v2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64>* %dst) {
+; CHECK-LABEL: test_op_v2i64:
+; SSE: movntdq
+; AVX: vmovntdq
+  %r = add <2 x i64> %a, %b
+  store <2 x i64> %r, <2 x i64>* %dst, align 16, !nontemporal !1
+  ret void
+}
+
+define void @test_op_v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16>* %dst) {
+; CHECK-LABEL: test_op_v8i16:
+; SSE: movntdq
+; AVX: vmovntdq
+  %r = add <8 x i16> %a, %b
+  store <8 x i16> %r, <8 x i16>* %dst, align 16, !nontemporal !1
+  ret void
+}
+
+define void @test_op_v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8>* %dst) {
+; CHECK-LABEL: test_op_v16i8:
+; SSE: movntdq
+; AVX: vmovntdq
+  %r = add <16 x i8> %a, %b
+  store <16 x i8> %r, <16 x i8>* %dst, align 16, !nontemporal !1
+  ret void
+}
+
+; And now YMM versions.
+
+define void @test_op_v8f32(<8 x float> %a, <8 x float> %b, <8 x float>* %dst) {
+; CHECK-LABEL: test_op_v8f32:
+; AVX: vmovntps %ymm
+  %r = fadd <8 x float> %a, %b
+  store <8 x float> %r, <8 x float>* %dst, align 32, !nontemporal !1
+  ret void
+}
+
+define void @test_op_v8i32(<8 x i32> %a, <8 x i32> %b, <8 x i32>* %dst) {
+; CHECK-LABEL: test_op_v8i32:
+; AVX2: vmovntdq %ymm
+  %r = add <8 x i32> %a, %b
+  store <8 x i32> %r, <8 x i32>* %dst, align 32, !nontemporal !1
+  ret void
+}
+
+define void @test_op_v4f64(<4 x double> %a, <4 x double> %b, <4 x double>* %dst) {
+; CHECK-LABEL: test_op_v4f64:
+; AVX: vmovntpd %ymm
+  %r = fadd <4 x double> %a, %b
+  store <4 x double> %r, <4 x double>* %dst, align 32, !nontemporal !1
+  ret void
+}
+
+define void @test_op_v4i64(<4 x i64> %a, <4 x i64> %b, <4 x i64>* %dst) {
+; CHECK-LABEL: test_op_v4i64:
+; AVX2: vmovntdq %ymm
+  %r = add <4 x i64> %a, %b
+  store <4 x i64> %r, <4 x i64>* %dst, align 32, !nontemporal !1
+  ret void
+}
+
+define void @test_op_v16i16(<16 x i16> %a, <16 x i16> %b, <16 x i16>* %dst) {
+; CHECK-LABEL: test_op_v16i16:
+; AVX2: vmovntdq %ymm
+  %r = add <16 x i16> %a, %b
+  store <16 x i16> %r, <16 x i16>* %dst, align 32, !nontemporal !1
+  ret void
+}
+
+define void @test_op_v32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8>* %dst) {
+; CHECK-LABEL: test_op_v32i8:
+; AVX2: vmovntdq %ymm
+  %r = add <32 x i8> %a, %b
+  store <32 x i8> %r, <32 x i8>* %dst, align 32, !nontemporal !1
+  ret void
+}
+
 !1 = !{i32 1}
diff --git a/test/CodeGen/X86/or-branch.ll b/test/CodeGen/X86/or-branch.ll
index 9ebf890..ae3ed3f 100644
--- a/test/CodeGen/X86/or-branch.ll
+++ b/test/CodeGen/X86/or-branch.ll
@@ -2,14 +2,14 @@
 
 define void @foo(i32 %X, i32 %Y, i32 %Z) nounwind {
 entry:
-	%tmp = tail call i32 (...)* @bar( )		; <i32> [#uses=0]
+	%tmp = tail call i32 (...) @bar( )		; <i32> [#uses=0]
 	%tmp.upgrd.1 = icmp eq i32 %X, 0		; <i1> [#uses=1]
 	%tmp3 = icmp slt i32 %Y, 5		; <i1> [#uses=1]
 	%tmp4 = or i1 %tmp3, %tmp.upgrd.1		; <i1> [#uses=1]
 	br i1 %tmp4, label %cond_true, label %UnifiedReturnBlock
 
 cond_true:		; preds = %entry
-	%tmp5 = tail call i32 (...)* @bar( )		; <i32> [#uses=0]
+	%tmp5 = tail call i32 (...) @bar( )		; <i32> [#uses=0]
 	ret void
 
 UnifiedReturnBlock:		; preds = %entry
diff --git a/test/CodeGen/X86/patchpoint-webkit_jscc.ll b/test/CodeGen/X86/patchpoint-webkit_jscc.ll
index 37bdd7d..5c39438 100644
--- a/test/CodeGen/X86/patchpoint-webkit_jscc.ll
+++ b/test/CodeGen/X86/patchpoint-webkit_jscc.ll
@@ -25,9 +25,9 @@ entry:
 ; FAST:       movq %rax, (%rsp)
 ; FAST:       callq
   %resolveCall2 = inttoptr i64 -559038736 to i8*
-  %result = tail call webkit_jscc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 5, i32 15, i8* %resolveCall2, i32 2, i64 %p4, i64 %p2)
+  %result = tail call webkit_jscc i64 (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.i64(i64 5, i32 15, i8* %resolveCall2, i32 2, i64 %p4, i64 %p2)
   %resolveCall3 = inttoptr i64 -559038737 to i8*
-  tail call webkit_jscc void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 6, i32 15, i8* %resolveCall3, i32 2, i64 %p4, i64 %result)
+  tail call webkit_jscc void (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.void(i64 6, i32 15, i8* %resolveCall3, i32 2, i64 %p4, i64 %result)
   ret void
 }
 
@@ -51,7 +51,7 @@ entry:
 ; FAST-NEXT:  movabsq $-559038736, %r11
 ; FAST-NEXT:  callq *%r11
   %call = inttoptr i64 -559038736 to i8*
-  %result = call webkit_jscc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 7, i32 15, i8* %call, i32 6, i64 %callee, i64 2, i64 undef, i32 4, i32 undef, i64 6)
+  %result = call webkit_jscc i64 (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.i64(i64 7, i32 15, i8* %call, i32 6, i64 %callee, i64 2, i64 undef, i32 4, i32 undef, i64 6)
   ret i64 %result
 }
 
@@ -79,7 +79,7 @@ entry:
 ; FAST-NEXT:  movabsq $-559038736, %r11
 ; FAST-NEXT:  callq *%r11
   %call = inttoptr i64 -559038736 to i8*
-  %result = call webkit_jscc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 7, i32 15, i8* %call, i32 10, i64 %callee, i64 2, i64 undef, i32 4, i32 undef, i64 6, i32 undef, i32 8, i32 undef, i64 10)
+  %result = call webkit_jscc i64 (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.i64(i64 7, i32 15, i8* %call, i32 10, i64 %callee, i64 2, i64 undef, i32 4, i32 undef, i64 6, i32 undef, i32 8, i32 undef, i64 10)
   ret i64 %result
 }
 
diff --git a/test/CodeGen/X86/patchpoint.ll b/test/CodeGen/X86/patchpoint.ll
index 24e324f..eda13fd 100644
--- a/test/CodeGen/X86/patchpoint.ll
+++ b/test/CodeGen/X86/patchpoint.ll
@@ -15,9 +15,9 @@ entry:
 ; CHECK:      movq %[[REG]], %rax
 ; CHECK:      ret
   %resolveCall2 = inttoptr i64 -559038736 to i8*
-  %result = tail call i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 2, i32 15, i8* %resolveCall2, i32 4, i64 %p1, i64 %p2, i64 %p3, i64 %p4)
+  %result = tail call i64 (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.i64(i64 2, i32 15, i8* %resolveCall2, i32 4, i64 %p1, i64 %p2, i64 %p3, i64 %p4)
   %resolveCall3 = inttoptr i64 -559038737 to i8*
-  tail call void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 3, i32 15, i8* %resolveCall3, i32 2, i64 %p1, i64 %result)
+  tail call void (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.void(i64 3, i32 15, i8* %resolveCall3, i32 2, i64 %p1, i64 %result)
   ret i64 %result
 }
 
@@ -35,7 +35,7 @@ entry:
   store i64 11, i64* %metadata
   store i64 12, i64* %metadata
   store i64 13, i64* %metadata
-  call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 4, i32 0, i64* %metadata)
+  call void (i64, i32, ...) @llvm.experimental.stackmap(i64 4, i32 0, i64* %metadata)
   ret void
 }
 
@@ -48,14 +48,14 @@ entry:
   %tmp80 = add i64 %tmp79, -16
   %tmp81 = inttoptr i64 %tmp80 to i64*
   %tmp82 = load i64, i64* %tmp81, align 8
-  tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 14, i32 5, i64 %arg, i64 %tmp2, i64 %tmp10, i64 %tmp82)
-  tail call void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 15, i32 30, i8* null, i32 3, i64 %arg, i64 %tmp10, i64 %tmp82)
+  tail call void (i64, i32, ...) @llvm.experimental.stackmap(i64 14, i32 5, i64 %arg, i64 %tmp2, i64 %tmp10, i64 %tmp82)
+  tail call void (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.void(i64 15, i32 30, i8* null, i32 3, i64 %arg, i64 %tmp10, i64 %tmp82)
   %tmp83 = load i64, i64* %tmp33, align 8
   %tmp84 = add i64 %tmp83, -24
   %tmp85 = inttoptr i64 %tmp84 to i64*
   %tmp86 = load i64, i64* %tmp85, align 8
-  tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 17, i32 5, i64 %arg, i64 %tmp10, i64 %tmp86)
-  tail call void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 18, i32 30, i8* null, i32 3, i64 %arg, i64 %tmp10, i64 %tmp86)
+  tail call void (i64, i32, ...) @llvm.experimental.stackmap(i64 17, i32 5, i64 %arg, i64 %tmp10, i64 %tmp86)
+  tail call void (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.void(i64 18, i32 30, i8* null, i32 3, i64 %arg, i64 %tmp10, i64 %tmp86)
   ret i64 10
 }
 
@@ -67,7 +67,7 @@ entry:
 ; CHECK:      nopl 8(%rax,%rax)
 ; CHECK-NEXT: popq
 ; CHECK-NEXT: ret
-  %result = tail call i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 5, i32 5, i8* null, i32 2, i64 %p1, i64 %p2)
+  %result = tail call i64 (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.i64(i64 5, i32 5, i8* null, i32 2, i64 %p1, i64 %p2)
   ret void
 }
 
@@ -78,7 +78,7 @@ entry:
 ; CHECK:      movabsq $6153737369414576827, %r11
 ; CHECK-NEXT: callq *%r11
   %resolveCall2 = inttoptr i64 6153737369414576827 to i8*
-  %result = tail call i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 2, i32 15, i8* %resolveCall2, i32 0)
+  %result = tail call i64 (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.i64(i64 2, i32 15, i8* %resolveCall2, i32 0)
   ret i64 %result
 }
 
diff --git a/test/CodeGen/X86/phys-reg-local-regalloc.ll b/test/CodeGen/X86/phys-reg-local-regalloc.ll
index ca364f2..a0adba0 100644
--- a/test/CodeGen/X86/phys-reg-local-regalloc.ll
+++ b/test/CodeGen/X86/phys-reg-local-regalloc.ll
@@ -51,7 +51,7 @@ entry:
   %3 = call i32 asm "", "={ax}"() nounwind        ; <i32> [#uses=1]
   call void asm sideeffect alignstack "movl $0, $1", "{eax},*m,~{dirflag},~{fpsr},~{flags},~{memory}"(i32 %3, i32* %result) nounwind
   %4 = load i32, i32* %result, align 4                 ; <i32> [#uses=1]
-  %5 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([12 x i8], [12 x i8]* @.str, i32 0, i32 0), i32 %4) nounwind ; <i32> [#uses=0]
+  %5 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([12 x i8], [12 x i8]* @.str, i32 0, i32 0), i32 %4) nounwind ; <i32> [#uses=0]
   store i32 0, i32* %0, align 4
   %6 = load i32, i32* %0, align 4                      ; <i32> [#uses=1]
   store i32 %6, i32* %retval, align 4
diff --git a/test/CodeGen/X86/pic.ll b/test/CodeGen/X86/pic.ll
index faaf73b..d543deb 100644
--- a/test/CodeGen/X86/pic.ll
+++ b/test/CodeGen/X86/pic.ll
@@ -69,10 +69,10 @@ entry:
 
 define void @test3() nounwind {
 entry:
-    %tmp = call void(...)*(...)* @afoo()
+    %tmp = call void(...)*(...) @afoo()
     store void(...)* %tmp, void(...)** @pfoo
     %tmp1 = load void(...)*, void(...)** @pfoo
-    call void(...)* %tmp1()
+    call void(...) %tmp1()
     ret void
 ; LINUX-LABEL: test3:
 ; LINUX: 	calll	.L3$pb
@@ -88,7 +88,7 @@ declare void(...)* @afoo(...)
 
 define void @test4() nounwind {
 entry:
-    call void(...)* @foo()
+    call void(...) @foo()
     ret void
 ; LINUX-LABEL: test4:
 ; LINUX: calll	.L4$pb
@@ -146,43 +146,43 @@ define void @test7(i32 %n.u) nounwind {
 entry:
     switch i32 %n.u, label %bb12 [i32 1, label %bb i32 2, label %bb6 i32 4, label %bb7 i32 5, label %bb8 i32 6, label %bb10 i32 7, label %bb1 i32 8, label %bb3 i32 9, label %bb4 i32 10, label %bb9 i32 11, label %bb2 i32 12, label %bb5 i32 13, label %bb11 ]
 bb:
-    tail call void(...)* @foo1()
+    tail call void(...) @foo1()
     ret void
 bb1:
-    tail call void(...)* @foo2()
+    tail call void(...) @foo2()
     ret void
 bb2:
-    tail call void(...)* @foo6()
+    tail call void(...) @foo6()
     ret void
 bb3:
-    tail call void(...)* @foo3()
+    tail call void(...) @foo3()
     ret void
 bb4:
-    tail call void(...)* @foo4()
+    tail call void(...) @foo4()
     ret void
 bb5:
-    tail call void(...)* @foo5()
+    tail call void(...) @foo5()
     ret void
 bb6:
-    tail call void(...)* @foo1()
+    tail call void(...) @foo1()
     ret void
 bb7:
-    tail call void(...)* @foo2()
+    tail call void(...) @foo2()
     ret void
 bb8:
-    tail call void(...)* @foo6()
+    tail call void(...) @foo6()
     ret void
 bb9:
-    tail call void(...)* @foo3()
+    tail call void(...) @foo3()
     ret void
 bb10:
-    tail call void(...)* @foo4()
+    tail call void(...) @foo4()
     ret void
 bb11:
-    tail call void(...)* @foo5()
+    tail call void(...) @foo5()
     ret void
 bb12:
-    tail call void(...)* @foo6()
+    tail call void(...) @foo6()
     ret void
     
 ; LINUX-LABEL: test7:
diff --git a/test/CodeGen/X86/pr1489.ll b/test/CodeGen/X86/pr1489.ll
index d37b9a2..13ced2a 100644
--- a/test/CodeGen/X86/pr1489.ll
+++ b/test/CodeGen/X86/pr1489.ll
@@ -48,7 +48,7 @@ entry:
 	%tmp1 = tail call i32 @bar( )		; <i32> [#uses=1]
 	%tmp2 = tail call i32 @foo( )		; <i32> [#uses=1]
 	%tmp3 = tail call i32 @quux( )		; <i32> [#uses=1]
-	%tmp5 = tail call i32 (i8*, ...)* @printf( i8* getelementptr ([13 x i8], [13 x i8]* @.str, i32 0, i32 0), i32 %tmp3, i32 %tmp2, i32 %tmp1, i32 %tmp )		; <i32> [#uses=0]
+	%tmp5 = tail call i32 (i8*, ...) @printf( i8* getelementptr ([13 x i8], [13 x i8]* @.str, i32 0, i32 0), i32 %tmp3, i32 %tmp2, i32 %tmp1, i32 %tmp )		; <i32> [#uses=0]
 	ret i32 undef
 }
 
diff --git a/test/CodeGen/X86/pr18023.ll b/test/CodeGen/X86/pr18023.ll
index ed3d6a0..c7ea20c 100644
--- a/test/CodeGen/X86/pr18023.ll
+++ b/test/CodeGen/X86/pr18023.ll
@@ -24,7 +24,7 @@ define void @func() {
   %3 = load volatile i32, i32* @b, align 4
   store i32 3, i32* @c, align 4
   %4 = load i32, i32* getelementptr inbounds ([3 x i32], [3 x i32]* @a, i64 0, i64 1), align 4
-  %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i64 0, i64 0), i32 %4)
+  %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i64 0, i64 0), i32 %4)
   ret void
 }
 
diff --git a/test/CodeGen/X86/pr23246.ll b/test/CodeGen/X86/pr23246.ll
new file mode 100644
index 0000000..6eb24a6
--- /dev/null
+++ b/test/CodeGen/X86/pr23246.ll
@@ -0,0 +1,19 @@
+; RUN: llc < %s -mtriple x86_64-unknown-unknown | FileCheck %s
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+
+; PR23246
+; We're really only interested in doing something sane with the shuffle.
+
+; CHECK-LABEL: test:
+; CHECK:      movq2dq %mm0, %xmm0
+; CHECK-NEXT: pshufd {{.*}} xmm0 = xmm0[0,1,0,1]
+; CHECK-NEXT: retq
+define <2 x i64> @test(x86_mmx %a) #0 {
+entry:
+  %b = bitcast x86_mmx %a to <1 x i64>
+  %s = shufflevector <1 x i64> %b, <1 x i64> undef, <2 x i32> <i32 undef, i32 0>
+  ret <2 x i64> %s
+}
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/X86/pr2326.ll b/test/CodeGen/X86/pr2326.ll
index 9cf83bb..88c7bb5 100644
--- a/test/CodeGen/X86/pr2326.ll
+++ b/test/CodeGen/X86/pr2326.ll
@@ -17,7 +17,7 @@ entry:
 	%tmp25 = and i1 %toBool23, %toBool24		; <i1> [#uses=1]
 	%tmp2526 = zext i1 %tmp25 to i8		; <i8> [#uses=1]
 	%tmp252627 = zext i8 %tmp2526 to i32		; <i32> [#uses=1]
-	%tmp29 = call i32 (...)* @func_15( i32 %tmp252627, i32 0 ) nounwind 		; <i32> [#uses=0]
+	%tmp29 = call i32 (...) @func_15( i32 %tmp252627, i32 0 ) nounwind 		; <i32> [#uses=0]
 	unreachable
 }
 
diff --git a/test/CodeGen/X86/pr2656.ll b/test/CodeGen/X86/pr2656.ll
index 6f31c5f..9a162d7 100644
--- a/test/CodeGen/X86/pr2656.ll
+++ b/test/CodeGen/X86/pr2656.ll
@@ -19,7 +19,7 @@ entry:
 	%conv = fpext float %neg to double		; <double> [#uses=1]
 	%neg4 = fsub float -0.000000e+00, %tmp3		; <float> [#uses=1]
 	%conv5 = fpext float %neg4 to double		; <double> [#uses=1]
-	%call = call i32 (...)* @printf( i8* getelementptr ([17 x i8], [17 x i8]* @.str, i32 0, i32 0), double %conv, double %conv5 )		; <i32> [#uses=0]
+	%call = call i32 (...) @printf( i8* getelementptr ([17 x i8], [17 x i8]* @.str, i32 0, i32 0), double %conv, double %conv5 )		; <i32> [#uses=0]
 	ret void
 }
 
diff --git a/test/CodeGen/X86/pr2982.ll b/test/CodeGen/X86/pr2982.ll
index ab46005..b7902b8 100644
--- a/test/CodeGen/X86/pr2982.ll
+++ b/test/CodeGen/X86/pr2982.ll
@@ -20,7 +20,7 @@ entry:
         %5 = sext i8 %4 to i32          ; <i32> [#uses=1]
         %6 = add i32 %2, %3             ; <i32> [#uses=1]
         %7 = add i32 %6, %5             ; <i32> [#uses=1]
-        %8 = tail call i32 (...)* @rshift_u_u(i32 %7, i32 0) nounwind          
+        %8 = tail call i32 (...) @rshift_u_u(i32 %7, i32 0) nounwind          
 ; <i32> [#uses=0]
         ret void
 }
diff --git a/test/CodeGen/X86/pr3244.ll b/test/CodeGen/X86/pr3244.ll
index b08a223..c6419d8 100644
--- a/test/CodeGen/X86/pr3244.ll
+++ b/test/CodeGen/X86/pr3244.ll
@@ -10,7 +10,7 @@ entry:
         %1 = load i32, i32* @g_487, align 4          ; <i32> [#uses=1]
         %2 = trunc i16 %0 to i8         ; <i8> [#uses=1]
         %3 = trunc i32 %1 to i8         ; <i8> [#uses=1]
-        %4 = tail call i32 (...)* @func_7(i64 -4455561449541442965, i32 1)
+        %4 = tail call i32 (...) @func_7(i64 -4455561449541442965, i32 1)
 nounwind             ; <i32> [#uses=1]
         %5 = trunc i32 %4 to i8         ; <i8> [#uses=1]
         %6 = mul i8 %3, %2              ; <i8> [#uses=1]
diff --git a/test/CodeGen/X86/pr3250.ll b/test/CodeGen/X86/pr3250.ll
index cccbf54..4ab989e 100644
--- a/test/CodeGen/X86/pr3250.ll
+++ b/test/CodeGen/X86/pr3250.ll
@@ -5,7 +5,7 @@ declare i32 @safe_sub_func_short_u_u(i16 signext, i16 signext) nounwind
 
 define i32 @func_106(i32 %p_107) nounwind {
 entry:
-        %0 = tail call i32 (...)* @safe_div_(i32 %p_107, i32 1) nounwind       
+        %0 = tail call i32 (...) @safe_div_(i32 %p_107, i32 1) nounwind       
         ; <i32> [#uses=1]
         %1 = lshr i32 %0, -9            ; <i32> [#uses=1]
         %2 = trunc i32 %1 to i16                ; <i16> [#uses=1]
diff --git a/test/CodeGen/X86/pr3457.ll b/test/CodeGen/X86/pr3457.ll
index 7264bcd..d4c0020 100644
--- a/test/CodeGen/X86/pr3457.ll
+++ b/test/CodeGen/X86/pr3457.ll
@@ -4,8 +4,8 @@
 
 define void @foo(double* nocapture %P) nounwind {
 entry:
-	%0 = tail call double (...)* @test() nounwind		; <double> [#uses=2]
-	%1 = tail call double (...)* @test() nounwind		; <double> [#uses=2]
+	%0 = tail call double (...) @test() nounwind		; <double> [#uses=2]
+	%1 = tail call double (...) @test() nounwind		; <double> [#uses=2]
 	%2 = fmul double %0, %0		; <double> [#uses=1]
 	%3 = fmul double %1, %1		; <double> [#uses=1]
 	%4 = fadd double %2, %3		; <double> [#uses=1]
diff --git a/test/CodeGen/X86/rd-mod-wr-eflags.ll b/test/CodeGen/X86/rd-mod-wr-eflags.ll
index afa1962..9723721 100644
--- a/test/CodeGen/X86/rd-mod-wr-eflags.ll
+++ b/test/CodeGen/X86/rd-mod-wr-eflags.ll
@@ -39,7 +39,7 @@ store i64 %dec.i, i64* @c, align 8
 %tobool.i = icmp ne i64 %dec.i, 0
 %lor.ext.i = zext i1 %tobool.i to i32
 store i32 %lor.ext.i, i32* @a, align 4
-%call = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([5 x i8], [5 x i8]* @.str, i64 0, i64 0), i64 %dec.i) nounwind
+%call = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([5 x i8], [5 x i8]* @.str, i64 0, i64 0), i64 %dec.i) nounwind
 ret i32 0
 }
 
@@ -53,7 +53,7 @@ store i64 %dec.i, i64* @c, align 8
 %tobool.i = icmp ne i64 %0, 0
 %lor.ext.i = zext i1 %tobool.i to i32
 store i32 %lor.ext.i, i32* @a, align 4
-%call = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([5 x i8], [5 x i8]* @.str, i64 0, i64 0), i64 %dec.i) nounwind
+%call = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([5 x i8], [5 x i8]* @.str, i64 0, i64 0), i64 %dec.i) nounwind
 ret i32 0
 }
 
diff --git a/test/CodeGen/X86/recip-fastmath.ll b/test/CodeGen/X86/recip-fastmath.ll
index 83b86ac..fcd0770 100644
--- a/test/CodeGen/X86/recip-fastmath.ll
+++ b/test/CodeGen/X86/recip-fastmath.ll
@@ -1,6 +1,6 @@
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=core2 | FileCheck %s
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=btver2 | FileCheck %s --check-prefix=BTVER2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+use-recip-est,+avx -x86-recip-refinement-steps=2 | FileCheck %s --check-prefix=REFINE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=sse2 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx,use-recip-est | FileCheck %s --check-prefix=RECIP
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx,use-recip-est -x86-recip-refinement-steps=2 | FileCheck %s --check-prefix=REFINE
 
 ; If the target's divss/divps instructions are substantially
 ; slower than rcpss/rcpps with a Newton-Raphson refinement,
@@ -20,13 +20,13 @@ define float @reciprocal_estimate(float %x) #0 {
 ; CHECK-NEXT: movaps
 ; CHECK-NEXT: retq
 
-; BTVER2-LABEL: reciprocal_estimate:
-; BTVER2: vrcpss
-; BTVER2: vmulss
-; BTVER2: vsubss
-; BTVER2: vmulss
-; BTVER2: vaddss
-; BTVER2-NEXT: retq
+; RECIP-LABEL: reciprocal_estimate:
+; RECIP: vrcpss
+; RECIP: vmulss
+; RECIP: vsubss
+; RECIP: vmulss
+; RECIP: vaddss
+; RECIP-NEXT: retq
 
 ; REFINE-LABEL: reciprocal_estimate:
 ; REFINE: vrcpss
@@ -51,13 +51,13 @@ define <4 x float> @reciprocal_estimate_v4f32(<4 x float> %x) #0 {
 ; CHECK-NEXT: movaps
 ; CHECK-NEXT: retq
 
-; BTVER2-LABEL: reciprocal_estimate_v4f32:
-; BTVER2: vrcpps
-; BTVER2: vmulps
-; BTVER2: vsubps
-; BTVER2: vmulps
-; BTVER2: vaddps
-; BTVER2-NEXT: retq
+; RECIP-LABEL: reciprocal_estimate_v4f32:
+; RECIP: vrcpps
+; RECIP: vmulps
+; RECIP: vsubps
+; RECIP: vmulps
+; RECIP: vaddps
+; RECIP-NEXT: retq
 
 ; REFINE-LABEL: reciprocal_estimate_v4f32:
 ; REFINE: vrcpps
@@ -85,13 +85,13 @@ define <8 x float> @reciprocal_estimate_v8f32(<8 x float> %x) #0 {
 ; CHECK-NEXT: movaps
 ; CHECK-NEXT: retq
 
-; BTVER2-LABEL: reciprocal_estimate_v8f32:
-; BTVER2: vrcpps
-; BTVER2: vmulps
-; BTVER2: vsubps
-; BTVER2: vmulps
-; BTVER2: vaddps
-; BTVER2-NEXT: retq
+; RECIP-LABEL: reciprocal_estimate_v8f32:
+; RECIP: vrcpps
+; RECIP: vmulps
+; RECIP: vsubps
+; RECIP: vmulps
+; RECIP: vaddps
+; RECIP-NEXT: retq
 
 ; REFINE-LABEL: reciprocal_estimate_v8f32:
 ; REFINE: vrcpps
diff --git a/test/CodeGen/X86/scalarize-bitcast.ll b/test/CodeGen/X86/scalarize-bitcast.ll
index 6de511f..60650f4 100644
--- a/test/CodeGen/X86/scalarize-bitcast.ll
+++ b/test/CodeGen/X86/scalarize-bitcast.ll
@@ -21,7 +21,7 @@ entry:
 	%tmp24.i = extractelement <1 x i64> %tmp10.i, i32 0		; <i64> [#uses=1]
 	%tmp10 = bitcast i64 %tmp24.i to <1 x i64>		; <<1 x i64>> [#uses=1]
 	%tmp7 = extractelement <1 x i64> %tmp10, i32 0		; <i64> [#uses=1]
-	%call6 = tail call i32 (...)* @store8888(i64 %tmp7)		; <i32> [#uses=1]
+	%call6 = tail call i32 (...) @store8888(i64 %tmp7)		; <i32> [#uses=1]
 	store i32 %call6, i32* %src
 	ret void
 }
diff --git a/test/CodeGen/X86/scheduler-backtracking.ll b/test/CodeGen/X86/scheduler-backtracking.ll
new file mode 100644
index 0000000..98471ee
--- /dev/null
+++ b/test/CodeGen/X86/scheduler-backtracking.ll
@@ -0,0 +1,51 @@
+; RUN: llc -march=x86-64 < %s -pre-RA-sched=list-ilp    | FileCheck %s
+; RUN: llc -march=x86-64 < %s -pre-RA-sched=list-hybrid | FileCheck %s
+; RUN: llc -march=x86-64 < %s -pre-RA-sched=source      | FileCheck %s
+; RUN: llc -march=x86-64 < %s -pre-RA-sched=list-burr   | FileCheck %s
+; RUN: llc -march=x86-64 < %s -pre-RA-sched=linearize   | FileCheck %s
+
+; PR22304 https://llvm.org/bugs/show_bug.cgi?id=22304
+; Tests checking backtracking in source scheduler. llc used to crash on them.
+
+; CHECK-LABEL: test1
+define i256 @test1(i256 %a) {
+  %b = add i256 %a, 1 
+  %m = shl i256 %b, 1
+  %p = add i256 %m, 1
+  %v = lshr i256 %b, %p
+  %t = trunc i256 %v to i1
+  %c = shl i256 1, %p
+  %f = select i1 %t, i256 undef, i256 %c
+  ret i256 %f
+}
+
+; CHECK-LABEL: test2
+define i256 @test2(i256 %a) {
+  %b = sub i256 0, %a
+  %c = and i256 %b, %a
+  %d = call i256 @llvm.ctlz.i256(i256 %c, i1 false)
+  ret i256 %d
+}
+
+; CHECK-LABEL: test3
+define i256 @test3(i256 %n) {
+  %m = sub i256 -1, %n
+  %x = sub i256 0, %n
+  %y = and i256 %x, %m
+  %z = call i256 @llvm.ctlz.i256(i256 %y, i1 false)
+  ret i256 %z
+}
+
+declare i256 @llvm.ctlz.i256(i256, i1) nounwind readnone
+
+; CHECK-LABEL: test4
+define i64 @test4(i64 %a, i64 %b) {
+  %r = zext i64 %b to i256
+  %u = add i256 %r, 1
+  %w = and i256 %u, 1461501637330902918203684832716283019655932542975
+  %x = zext i64 %a to i256
+  %c = icmp uge i256 %w, %x
+  %y = select i1 %c, i64 0, i64 1
+  %z = add i64 %y, 1
+  ret i64 %z
+}
diff --git a/test/CodeGen/X86/segmented-stacks.ll b/test/CodeGen/X86/segmented-stacks.ll
index 4127288..55eaab9 100644
--- a/test/CodeGen/X86/segmented-stacks.ll
+++ b/test/CodeGen/X86/segmented-stacks.ll
@@ -11,16 +11,16 @@
 ; RUN: llc < %s -mcpu=generic -mtriple=x86_64-mingw32 -verify-machineinstrs | FileCheck %s -check-prefix=X64-MinGW
 
 ; We used to crash with filetype=obj
-; RUN: llc < %s -mcpu=generic -mtriple=i686-linux -filetype=obj
-; RUN: llc < %s -mcpu=generic -mtriple=x86_64-linux -filetype=obj
-; RUN: llc < %s -mcpu=generic -mtriple=x86_64-linux-gnux32 -filetype=obj
-; RUN: llc < %s -mcpu=generic -mtriple=i686-darwin -filetype=obj
-; RUN: llc < %s -mcpu=generic -mtriple=x86_64-darwin -filetype=obj
-; RUN: llc < %s -mcpu=generic -mtriple=i686-mingw32 -filetype=obj
-; RUN: llc < %s -mcpu=generic -mtriple=x86_64-freebsd -filetype=obj
-; RUN: llc < %s -mcpu=generic -mtriple=i686-dragonfly -filetype=obj
-; RUN: llc < %s -mcpu=generic -mtriple=x86_64-dragonfly -filetype=obj
-; RUN: llc < %s -mcpu=generic -mtriple=x86_64-mingw32 -filetype=obj
+; RUN: llc < %s -mcpu=generic -mtriple=i686-linux -filetype=obj -o /dev/null
+; RUN: llc < %s -mcpu=generic -mtriple=x86_64-linux -filetype=obj -o /dev/null
+; RUN: llc < %s -mcpu=generic -mtriple=x86_64-linux-gnux32 -filetype=obj -o /dev/null
+; RUN: llc < %s -mcpu=generic -mtriple=i686-darwin -filetype=obj -o /dev/null
+; RUN: llc < %s -mcpu=generic -mtriple=x86_64-darwin -filetype=obj -o /dev/null
+; RUN: llc < %s -mcpu=generic -mtriple=i686-mingw32 -filetype=obj -o /dev/null
+; RUN: llc < %s -mcpu=generic -mtriple=x86_64-freebsd -filetype=obj -o /dev/null
+; RUN: llc < %s -mcpu=generic -mtriple=i686-dragonfly -filetype=obj -o /dev/null
+; RUN: llc < %s -mcpu=generic -mtriple=x86_64-dragonfly -filetype=obj -o /dev/null
+; RUN: llc < %s -mcpu=generic -mtriple=x86_64-mingw32 -filetype=obj -o /dev/null
 
 ; RUN: not llc < %s -mcpu=generic -mtriple=x86_64-solaris 2> %t.log
 ; RUN: FileCheck %s -input-file=%t.log -check-prefix=X64-Solaris
diff --git a/test/CodeGen/X86/seh-safe-div.ll b/test/CodeGen/X86/seh-safe-div.ll
index 477ad36..ba54f1c 100644
--- a/test/CodeGen/X86/seh-safe-div.ll
+++ b/test/CodeGen/X86/seh-safe-div.ll
@@ -173,15 +173,15 @@ define i32 @main() {
   store i32 10, i32* %n.addr, align 4
   store i32 2, i32* %d.addr, align 4
   %r1 = call i32 @safe_div(i32* %n.addr, i32* %d.addr)
-  call void (i8*, ...)* @printf(i8* getelementptr ([21 x i8], [21 x i8]* @str_result, i32 0, i32 0), i32 %r1)
+  call void (i8*, ...) @printf(i8* getelementptr ([21 x i8], [21 x i8]* @str_result, i32 0, i32 0), i32 %r1)
 
   store i32 10, i32* %n.addr, align 4
   store i32 0, i32* %d.addr, align 4
   %r2 = call i32 @safe_div(i32* %n.addr, i32* %d.addr)
-  call void (i8*, ...)* @printf(i8* getelementptr ([21 x i8], [21 x i8]* @str_result, i32 0, i32 0), i32 %r2)
+  call void (i8*, ...) @printf(i8* getelementptr ([21 x i8], [21 x i8]* @str_result, i32 0, i32 0), i32 %r2)
 
   %r3 = call i32 @safe_div(i32* %n.addr, i32* null)
-  call void (i8*, ...)* @printf(i8* getelementptr ([21 x i8], [21 x i8]* @str_result, i32 0, i32 0), i32 %r3)
+  call void (i8*, ...) @printf(i8* getelementptr ([21 x i8], [21 x i8]* @str_result, i32 0, i32 0), i32 %r3)
   ret i32 0
 }
 
diff --git a/test/CodeGen/X86/setcc.ll b/test/CodeGen/X86/setcc.ll
index 2454af9..6f1ddbd 100644
--- a/test/CodeGen/X86/setcc.ll
+++ b/test/CodeGen/X86/setcc.ll
@@ -29,7 +29,7 @@ define i64 @t3(i64 %x) nounwind readnone ssp {
 entry:
 ; CHECK-LABEL: t3:
 ; CHECK: sbbq %rax, %rax
-; CHECK: andq $64, %rax
+; CHECK: andl $64, %eax
   %0 = icmp ult i64 %x, 18                        ; <i1> [#uses=1]
   %iftmp.2.0 = select i1 %0, i64 64, i64 0        ; <i64> [#uses=1]
   ret i64 %iftmp.2.0
diff --git a/test/CodeGen/X86/shift-pair.ll b/test/CodeGen/X86/shift-pair.ll
index 24ba1fc..62e51f0 100644
--- a/test/CodeGen/X86/shift-pair.ll
+++ b/test/CodeGen/X86/shift-pair.ll
@@ -3,7 +3,7 @@
 define i64 @test(i64 %A) {
 ; CHECK: @test
 ; CHECK: shrq $54
-; CHECK: andq $1020
+; CHECK: andl $1020
 ; CHECK: ret
     %B = lshr i64 %A, 56
     %C = shl i64 %B, 2
diff --git a/test/CodeGen/X86/sibcall.ll b/test/CodeGen/X86/sibcall.ll
index d32e567..b94960a 100644
--- a/test/CodeGen/X86/sibcall.ll
+++ b/test/CodeGen/X86/sibcall.ll
@@ -349,7 +349,7 @@ entry:
 ; X32ABI-LABEL: t17:
 ; X32ABI: xorl %eax, %eax
 ; X32ABI: jmp {{_?}}bar5
-  tail call void (...)* @bar5() nounwind
+  tail call void (...) @bar5() nounwind
   ret void
 }
 
@@ -369,7 +369,7 @@ entry:
 ; X32ABI-LABEL: t18:
 ; X32ABI: xorl %eax, %eax
 ; X32ABI: jmp {{_?}}bar6
-  %0 = tail call double (...)* @bar6() nounwind
+  %0 = tail call double (...) @bar6() nounwind
   ret void
 }
 
diff --git a/test/CodeGen/X86/smul-with-overflow.ll b/test/CodeGen/X86/smul-with-overflow.ll
index 55aa6aa..2b21f4f 100644
--- a/test/CodeGen/X86/smul-with-overflow.ll
+++ b/test/CodeGen/X86/smul-with-overflow.ll
@@ -11,11 +11,11 @@ entry:
   br i1 %obit, label %overflow, label %normal
 
 normal:
-  %t1 = tail call i32 (i8*, ...)* @printf( i8* getelementptr ([4 x i8], [4 x i8]* @ok, i32 0, i32 0), i32 %sum ) nounwind
+  %t1 = tail call i32 (i8*, ...) @printf( i8* getelementptr ([4 x i8], [4 x i8]* @ok, i32 0, i32 0), i32 %sum ) nounwind
   ret i1 true
 
 overflow:
-  %t2 = tail call i32 (i8*, ...)* @printf( i8* getelementptr ([4 x i8], [4 x i8]* @no, i32 0, i32 0) ) nounwind
+  %t2 = tail call i32 (i8*, ...) @printf( i8* getelementptr ([4 x i8], [4 x i8]* @no, i32 0, i32 0) ) nounwind
   ret i1 false
 ; CHECK-LABEL: test1:
 ; CHECK: imull
@@ -30,11 +30,11 @@ entry:
   br i1 %obit, label %overflow, label %normal
 
 overflow:
-  %t2 = tail call i32 (i8*, ...)* @printf( i8* getelementptr ([4 x i8], [4 x i8]* @no, i32 0, i32 0) ) nounwind
+  %t2 = tail call i32 (i8*, ...) @printf( i8* getelementptr ([4 x i8], [4 x i8]* @no, i32 0, i32 0) ) nounwind
   ret i1 false
 
 normal:
-  %t1 = tail call i32 (i8*, ...)* @printf( i8* getelementptr ([4 x i8], [4 x i8]* @ok, i32 0, i32 0), i32 %sum ) nounwind
+  %t1 = tail call i32 (i8*, ...) @printf( i8* getelementptr ([4 x i8], [4 x i8]* @ok, i32 0, i32 0), i32 %sum ) nounwind
   ret i1 true
 ; CHECK-LABEL: test2:
 ; CHECK: imull
diff --git a/test/CodeGen/X86/sqrt-fastmath.ll b/test/CodeGen/X86/sqrt-fastmath.ll
index 24b175e..4c6b521 100644
--- a/test/CodeGen/X86/sqrt-fastmath.ll
+++ b/test/CodeGen/X86/sqrt-fastmath.ll
@@ -1,132 +1,141 @@
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=core2 | FileCheck %s
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=btver2 | FileCheck %s --check-prefix=BTVER2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=sse2 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx,use-sqrt-est | FileCheck %s --check-prefix=ESTIMATE
 
-; generated using "clang -S -O2 -ffast-math -emit-llvm sqrt.c" from
-; #include <math.h>
-; 
-; double fd(double d){
-;   return sqrt(d);
-; }
-; 
-; float ff(float f){
-;   return sqrtf(f);
-; }
-; 
-; long double fld(long double ld){
-;   return sqrtl(ld);
-; }
-;
-; Tests conversion of sqrt function calls into sqrt instructions when
-; -ffast-math is in effect.
+declare double @__sqrt_finite(double) #0
+declare float @__sqrtf_finite(float) #0
+declare x86_fp80 @__sqrtl_finite(x86_fp80) #0
+declare float @llvm.sqrt.f32(float) #0
+declare <4 x float> @llvm.sqrt.v4f32(<4 x float>) #0
+declare <8 x float> @llvm.sqrt.v8f32(<8 x float>) #0
 
-; ModuleID = 'sqrt.c'
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
 
-; Function Attrs: nounwind readnone uwtable
 define double @fd(double %d) #0 {
-entry:
-; CHECK: sqrtsd
-  %call = tail call double @__sqrt_finite(double %d) #2
+; CHECK-LABEL: fd:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    sqrtsd %xmm0, %xmm0
+; CHECK-NEXT:    retq
+;
+; ESTIMATE-LABEL: fd:
+; ESTIMATE:       # BB#0:
+; ESTIMATE-NEXT:    vsqrtsd %xmm0, %xmm0, %xmm0
+; ESTIMATE-NEXT:    retq
+  %call = tail call double @__sqrt_finite(double %d) #1
   ret double %call
 }
 
-; Function Attrs: nounwind readnone
-declare double @__sqrt_finite(double) #1
 
-; Function Attrs: nounwind readnone uwtable
 define float @ff(float %f) #0 {
-entry:
-; CHECK: sqrtss
-  %call = tail call float @__sqrtf_finite(float %f) #2
+; CHECK-LABEL: ff:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    sqrtss %xmm0, %xmm0
+; CHECK-NEXT:    retq
+;
+; ESTIMATE-LABEL: ff:
+; ESTIMATE:       # BB#0:
+; ESTIMATE-NEXT:    vrsqrtss %xmm0, %xmm0, %xmm1
+; ESTIMATE-NEXT:    vmulss {{.*}}(%rip), %xmm1, %xmm2
+; ESTIMATE-NEXT:    vmulss %xmm1, %xmm1, %xmm1
+; ESTIMATE-NEXT:    vmulss %xmm0, %xmm1, %xmm1
+; ESTIMATE-NEXT:    vaddss {{.*}}(%rip), %xmm1, %xmm1
+; ESTIMATE-NEXT:    vmulss %xmm2, %xmm1, %xmm1
+; ESTIMATE-NEXT:    vmulss %xmm1, %xmm0, %xmm1
+; ESTIMATE-NEXT:    vxorps %xmm2, %xmm2, %xmm2
+; ESTIMATE-NEXT:    vcmpeqss %xmm2, %xmm0, %xmm0
+; ESTIMATE-NEXT:    vandnps %xmm1, %xmm0, %xmm0
+; ESTIMATE-NEXT:    retq
+  %call = tail call float @__sqrtf_finite(float %f) #1
   ret float %call
 }
 
-; Function Attrs: nounwind readnone
-declare float @__sqrtf_finite(float) #1
 
-; Function Attrs: nounwind readnone uwtable
 define x86_fp80 @fld(x86_fp80 %ld) #0 {
-entry:
-; CHECK: fsqrt
-  %call = tail call x86_fp80 @__sqrtl_finite(x86_fp80 %ld) #2
+; CHECK-LABEL: fld:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    fldt {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    fsqrt
+; CHECK-NEXT:    retq
+;
+; ESTIMATE-LABEL: fld:
+; ESTIMATE:       # BB#0:
+; ESTIMATE-NEXT:    fldt {{[0-9]+}}(%rsp)
+; ESTIMATE-NEXT:    fsqrt
+; ESTIMATE-NEXT:    retq
+  %call = tail call x86_fp80 @__sqrtl_finite(x86_fp80 %ld) #1
   ret x86_fp80 %call
 }
 
-declare x86_fp80 @__sqrtl_finite(x86_fp80) #1
-
-declare float @llvm.sqrt.f32(float) #1
-declare <4 x float> @llvm.sqrt.v4f32(<4 x float>) #1
-declare <8 x float> @llvm.sqrt.v8f32(<8 x float>) #1
 
-; If the target's sqrtss and divss instructions are substantially
-; slower than rsqrtss with a Newton-Raphson refinement, we should
-; generate the estimate sequence.
 
 define float @reciprocal_square_root(float %x) #0 {
+; CHECK-LABEL: reciprocal_square_root:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    sqrtss %xmm0, %xmm1
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    divss %xmm1, %xmm0
+; CHECK-NEXT:    retq
+;
+; ESTIMATE-LABEL: reciprocal_square_root:
+; ESTIMATE:       # BB#0:
+; ESTIMATE-NEXT:    vrsqrtss %xmm0, %xmm0, %xmm1
+; ESTIMATE-NEXT:    vmulss {{.*}}(%rip), %xmm1, %xmm2
+; ESTIMATE-NEXT:    vmulss %xmm1, %xmm1, %xmm1
+; ESTIMATE-NEXT:    vmulss %xmm0, %xmm1, %xmm0
+; ESTIMATE-NEXT:    vaddss {{.*}}(%rip), %xmm0, %xmm0
+; ESTIMATE-NEXT:    vmulss %xmm2, %xmm0, %xmm0
+; ESTIMATE-NEXT:    retq
   %sqrt = tail call float @llvm.sqrt.f32(float %x)
   %div = fdiv fast float 1.0, %sqrt
   ret float %div
-
-; CHECK-LABEL: reciprocal_square_root:
-; CHECK: sqrtss
-; CHECK-NEXT: movss
-; CHECK-NEXT: divss
-; CHECK-NEXT: retq
-; BTVER2-LABEL: reciprocal_square_root:
-; BTVER2: vrsqrtss
-; BTVER2-NEXT: vmulss
-; BTVER2-NEXT: vmulss
-; BTVER2-NEXT: vmulss
-; BTVER2-NEXT: vaddss
-; BTVER2-NEXT: vmulss
-; BTVER2-NEXT: retq
 }
 
 define <4 x float> @reciprocal_square_root_v4f32(<4 x float> %x) #0 {
+; CHECK-LABEL: reciprocal_square_root_v4f32:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    sqrtps %xmm0, %xmm1
+; CHECK-NEXT:    movaps {{.*#+}} xmm0 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
+; CHECK-NEXT:    divps %xmm1, %xmm0
+; CHECK-NEXT:    retq
+;
+; ESTIMATE-LABEL: reciprocal_square_root_v4f32:
+; ESTIMATE:       # BB#0:
+; ESTIMATE-NEXT:    vrsqrtps %xmm0, %xmm1
+; ESTIMATE-NEXT:    vmulps %xmm1, %xmm1, %xmm2
+; ESTIMATE-NEXT:    vmulps %xmm0, %xmm2, %xmm0
+; ESTIMATE-NEXT:    vaddps {{.*}}(%rip), %xmm0, %xmm0
+; ESTIMATE-NEXT:    vmulps {{.*}}(%rip), %xmm1, %xmm1
+; ESTIMATE-NEXT:    vmulps %xmm1, %xmm0, %xmm0
+; ESTIMATE-NEXT:    retq
   %sqrt = tail call <4 x float> @llvm.sqrt.v4f32(<4 x float> %x)
   %div = fdiv fast <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, %sqrt
   ret <4 x float> %div
-
-; CHECK-LABEL: reciprocal_square_root_v4f32:
-; CHECK: sqrtps
-; CHECK-NEXT: movaps
-; CHECK-NEXT: divps
-; CHECK-NEXT: retq
-; BTVER2-LABEL: reciprocal_square_root_v4f32:
-; BTVER2: vrsqrtps
-; BTVER2-NEXT: vmulps
-; BTVER2-NEXT: vmulps
-; BTVER2-NEXT: vmulps
-; BTVER2-NEXT: vaddps
-; BTVER2-NEXT: vmulps
-; BTVER2-NEXT: retq
 }
 
 define <8 x float> @reciprocal_square_root_v8f32(<8 x float> %x) #0 {
+; CHECK-LABEL: reciprocal_square_root_v8f32:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    sqrtps %xmm1, %xmm2
+; CHECK-NEXT:    sqrtps %xmm0, %xmm3
+; CHECK-NEXT:    movaps {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
+; CHECK-NEXT:    movaps %xmm1, %xmm0
+; CHECK-NEXT:    divps %xmm3, %xmm0
+; CHECK-NEXT:    divps %xmm2, %xmm1
+; CHECK-NEXT:    retq
+;
+; ESTIMATE-LABEL: reciprocal_square_root_v8f32:
+; ESTIMATE:       # BB#0:
+; ESTIMATE-NEXT:    vrsqrtps %ymm0, %ymm1
+; ESTIMATE-NEXT:    vmulps %ymm1, %ymm1, %ymm2
+; ESTIMATE-NEXT:    vmulps %ymm0, %ymm2, %ymm0
+; ESTIMATE-NEXT:    vaddps {{.*}}(%rip), %ymm0, %ymm0
+; ESTIMATE-NEXT:    vmulps {{.*}}(%rip), %ymm1, %ymm1
+; ESTIMATE-NEXT:    vmulps %ymm1, %ymm0, %ymm0
+; ESTIMATE-NEXT:    retq
   %sqrt = tail call <8 x float> @llvm.sqrt.v8f32(<8 x float> %x)
   %div = fdiv fast <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %sqrt
   ret <8 x float> %div
-
-; CHECK-LABEL: reciprocal_square_root_v8f32:
-; CHECK: sqrtps
-; CHECK-NEXT: sqrtps
-; CHECK-NEXT: movaps
-; CHECK-NEXT: movaps
-; CHECK-NEXT: divps
-; CHECK-NEXT: divps
-; CHECK-NEXT: retq
-; BTVER2-LABEL: reciprocal_square_root_v8f32:
-; BTVER2: vrsqrtps
-; BTVER2-NEXT: vmulps
-; BTVER2-NEXT: vmulps
-; BTVER2-NEXT: vmulps
-; BTVER2-NEXT: vaddps
-; BTVER2-NEXT: vmulps
-; BTVER2-NEXT: retq
 }
 
 
-attributes #0 = { nounwind readnone uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "unsafe-fp-math"="true" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "unsafe-fp-math"="true" "use-soft-float"="false" }
-attributes #2 = { nounwind readnone }
+attributes #0 = { "unsafe-fp-math"="true" }
+attributes #1 = { nounwind readnone }
+
diff --git a/test/CodeGen/X86/sret-implicit.ll b/test/CodeGen/X86/sret-implicit.ll
index 3fade1d..5680952 100644
--- a/test/CodeGen/X86/sret-implicit.ll
+++ b/test/CodeGen/X86/sret-implicit.ll
@@ -1,5 +1,7 @@
 ; RUN: llc -mtriple=x86_64-apple-darwin8 < %s | FileCheck %s
 ; RUN: llc -mtriple=x86_64-pc-linux < %s | FileCheck %s
+; RUN: llc -mtriple=x86_64-apple-darwin8 -terminal-rule < %s | FileCheck %s
+; RUN: llc -mtriple=x86_64-pc-linux -terminal-rule < %s | FileCheck %s
 
 ; CHECK-LABEL: return32
 ; CHECK-DAG: movq	$0, (%rdi)
diff --git a/test/CodeGen/X86/sse-varargs.ll b/test/CodeGen/X86/sse-varargs.ll
index da38f0e..7c3c781 100644
--- a/test/CodeGen/X86/sse-varargs.ll
+++ b/test/CodeGen/X86/sse-varargs.ll
@@ -2,7 +2,7 @@
 
 define i32 @t() nounwind  {
 entry:
-	tail call void (i32, ...)* @foo( i32 1, <4 x i32> < i32 10, i32 11, i32 12, i32 13 > ) nounwind 
+	tail call void (i32, ...) @foo( i32 1, <4 x i32> < i32 10, i32 11, i32 12, i32 13 > ) nounwind 
 	ret i32 0
 }
 
diff --git a/test/CodeGen/X86/sse2-intrinsics-x86.ll b/test/CodeGen/X86/sse2-intrinsics-x86.ll
index cab62a3..5afebd2 100644
--- a/test/CodeGen/X86/sse2-intrinsics-x86.ll
+++ b/test/CodeGen/X86/sse2-intrinsics-x86.ll
@@ -581,7 +581,7 @@ declare <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double>) nounwind readnone
 define void @test_x86_sse2_storel_dq(i8* %a0, <4 x i32> %a1) {
   ; CHECK: test_x86_sse2_storel_dq
   ; CHECK: movl
-  ; CHECK: movq
+  ; CHECK: movlps
   call void @llvm.x86.sse2.storel.dq(i8* %a0, <4 x i32> %a1)
   ret void
 }
diff --git a/test/CodeGen/X86/sse41.ll b/test/CodeGen/X86/sse41.ll
index ca13392..3bde991 100644
--- a/test/CodeGen/X86/sse41.ll
+++ b/test/CodeGen/X86/sse41.ll
@@ -1026,29 +1026,24 @@ define <4 x float> @pr20087(<4 x float> %a, <4 x float> *%ptr) {
 }
 
 ; Edge case for insertps where we end up with a shuffle with mask=<0, 7, -1, -1>
-define void @insertps_pr20411(i32* noalias nocapture %RET) #1 {
+define void @insertps_pr20411(<4 x i32> %shuffle109, <4 x i32> %shuffle116, i32* noalias nocapture %RET) #1 {
 ; X32-LABEL: insertps_pr20411:
 ; X32:       ## BB#0:
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    pshufd {{.*#+}} xmm0 = mem[2,3,0,1]
-; X32-NEXT:    pshufd {{.*#+}} xmm1 = mem[3,1,2,3]
-; X32-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7]
+; X32-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; X32-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
 ; X32-NEXT:    movdqu %xmm1, (%eax)
 ; X32-NEXT:    retl
 ;
-; X64-LABEL: insertps_pr20411:
-; X64:       ## BB#0:
-; X64-NEXT:    pshufd {{.*#+}} xmm0 = mem[2,3,0,1]
-; X64-NEXT:    pshufd {{.*#+}} xmm1 = mem[3,1,2,3]
-; X64-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7]
-; X64-NEXT:    movdqu %xmm1, (%rdi)
+; X64-LABEL: insertps_pr20411:
+; X64:       ## BB#0:
+; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; X64-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
+; X64-NEXT:    movdqu %xmm1, (%rdi)
 ; X64-NEXT:    retq
-  %gather_load = shufflevector <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, <8 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-  %shuffle109 = shufflevector <4 x i32> <i32 4, i32 5, i32 6, i32 7>, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>  ; 4 5 6 7
-  %shuffle116 = shufflevector <8 x i32> %gather_load, <8 x i32> undef, <4 x i32> <i32 3, i32 undef, i32 undef, i32 undef> ; 3 x x x
-  %shuffle117 = shufflevector <4 x i32> %shuffle109, <4 x i32> %shuffle116, <4 x i32> <i32 4, i32 3, i32 undef, i32 undef> ; 3 7 x x
-  %ptrcast = bitcast i32* %RET to <4 x i32>*
-  store <4 x i32> %shuffle117, <4 x i32>* %ptrcast, align 4
+  %shuffle117 = shufflevector <4 x i32> %shuffle109, <4 x i32> %shuffle116, <4 x i32> <i32 0, i32 7, i32 undef, i32 undef>
+  %ptrcast = bitcast i32* %RET to <4 x i32>*
+  store <4 x i32> %shuffle117, <4 x i32>* %ptrcast, align 4
   ret void
 }
 
diff --git a/test/CodeGen/X86/stack-folding-3dnow.ll b/test/CodeGen/X86/stack-folding-3dnow.ll
new file mode 100644
index 0000000..955bf44
--- /dev/null
+++ b/test/CodeGen/X86/stack-folding-3dnow.ll
@@ -0,0 +1,217 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+3dnow | FileCheck %s
+
+define x86_mmx @stack_fold_pavgusb(x86_mmx %a, x86_mmx %b) {
+  ;CHECK-LABEL: stack_fold_pavgusb
+  ;CHECK:       pavgusb {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call x86_mmx @llvm.x86.3dnow.pavgusb(x86_mmx %a, x86_mmx %b) nounwind readnone
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.3dnow.pavgusb(x86_mmx, x86_mmx) nounwind readnone
+
+define x86_mmx @stack_fold_pf2id(x86_mmx %a) {
+  ;CHECK-LABEL: stack_fold_pf2id
+  ;CHECK:       pf2id {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm1},~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call x86_mmx @llvm.x86.3dnow.pf2id(x86_mmx %a) nounwind readnone
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.3dnow.pf2id(x86_mmx) nounwind readnone
+
+define x86_mmx @stack_fold_pf2iw(x86_mmx %a) {
+  ;CHECK-LABEL: stack_fold_pf2iw
+  ;CHECK:       pf2iw {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm1},~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call x86_mmx @llvm.x86.3dnowa.pf2iw(x86_mmx %a) nounwind readnone
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.3dnowa.pf2iw(x86_mmx) nounwind readnone
+
+define x86_mmx @stack_fold_pfacc(x86_mmx %a, x86_mmx %b) {
+  ;CHECK-LABEL: stack_fold_pfacc
+  ;CHECK:       pfacc {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call x86_mmx @llvm.x86.3dnow.pfacc(x86_mmx %a, x86_mmx %b) nounwind readnone
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.3dnow.pfacc(x86_mmx, x86_mmx) nounwind readnone
+
+define x86_mmx @stack_fold_pfadd(x86_mmx %a, x86_mmx %b) {
+  ;CHECK-LABEL: stack_fold_pfadd
+  ;CHECK:       pfadd {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call x86_mmx @llvm.x86.3dnow.pfadd(x86_mmx %a, x86_mmx %b) nounwind readnone
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.3dnow.pfadd(x86_mmx, x86_mmx) nounwind readnone
+
+define x86_mmx @stack_fold_pfcmpeq(x86_mmx %a, x86_mmx %b) {
+  ;CHECK-LABEL: stack_fold_pfcmpeq
+  ;CHECK:       pfcmpeq {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call x86_mmx @llvm.x86.3dnow.pfcmpeq(x86_mmx %a, x86_mmx %b) nounwind readnone
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.3dnow.pfcmpeq(x86_mmx, x86_mmx) nounwind readnone
+
+define x86_mmx @stack_fold_pfcmpge(x86_mmx %a, x86_mmx %b) {
+  ;CHECK-LABEL: stack_fold_pfcmpge
+  ;CHECK:       pfcmpge {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call x86_mmx @llvm.x86.3dnow.pfcmpge(x86_mmx %a, x86_mmx %b) nounwind readnone
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.3dnow.pfcmpge(x86_mmx, x86_mmx) nounwind readnone
+
+define x86_mmx @stack_fold_pfcmpgt(x86_mmx %a, x86_mmx %b) {
+  ;CHECK-LABEL: stack_fold_pfcmpgt
+  ;CHECK:       pfcmpgt {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call x86_mmx @llvm.x86.3dnow.pfcmpgt(x86_mmx %a, x86_mmx %b) nounwind readnone
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.3dnow.pfcmpgt(x86_mmx, x86_mmx) nounwind readnone
+
+define x86_mmx @stack_fold_pfmax(x86_mmx %a, x86_mmx %b) {
+  ;CHECK-LABEL: stack_fold_pfmax
+  ;CHECK:       pfmax {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call x86_mmx @llvm.x86.3dnow.pfmax(x86_mmx %a, x86_mmx %b) nounwind readnone
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.3dnow.pfmax(x86_mmx, x86_mmx) nounwind readnone
+
+define x86_mmx @stack_fold_pfmin(x86_mmx %a, x86_mmx %b) {
+  ;CHECK-LABEL: stack_fold_pfmin
+  ;CHECK:       pfmin {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call x86_mmx @llvm.x86.3dnow.pfmin(x86_mmx %a, x86_mmx %b) nounwind readnone
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.3dnow.pfmin(x86_mmx, x86_mmx) nounwind readnone
+
+define x86_mmx @stack_fold_pfmul(x86_mmx %a, x86_mmx %b) {
+  ;CHECK-LABEL: stack_fold_pfmul
+  ;CHECK:       pfmul {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call x86_mmx @llvm.x86.3dnow.pfmul(x86_mmx %a, x86_mmx %b) nounwind readnone
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.3dnow.pfmul(x86_mmx, x86_mmx) nounwind readnone
+
+define x86_mmx @stack_fold_pfnacc(x86_mmx %a, x86_mmx %b) {
+  ;CHECK-LABEL: stack_fold_pfnacc
+  ;CHECK:       pfnacc {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call x86_mmx @llvm.x86.3dnowa.pfnacc(x86_mmx %a, x86_mmx %b) nounwind readnone
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.3dnowa.pfnacc(x86_mmx, x86_mmx) nounwind readnone
+
+define x86_mmx @stack_fold_pfpnacc(x86_mmx %a, x86_mmx %b) {
+  ;CHECK-LABEL: stack_fold_pfpnacc
+  ;CHECK:       pfpnacc {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call x86_mmx @llvm.x86.3dnowa.pfpnacc(x86_mmx %a, x86_mmx %b) nounwind readnone
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.3dnowa.pfpnacc(x86_mmx, x86_mmx) nounwind readnone
+
+define x86_mmx @stack_fold_pfrcp(x86_mmx %a) {
+  ;CHECK-LABEL: stack_fold_pfrcp
+  ;CHECK:       pfrcp {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm1},~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call x86_mmx @llvm.x86.3dnow.pfrcp(x86_mmx %a) nounwind readnone
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.3dnow.pfrcp(x86_mmx) nounwind readnone
+
+define x86_mmx @stack_fold_pfrcpit1(x86_mmx %a, x86_mmx %b) {
+  ;CHECK-LABEL: stack_fold_pfrcpit1
+  ;CHECK:       pfrcpit1 {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call x86_mmx @llvm.x86.3dnow.pfrcpit1(x86_mmx %a, x86_mmx %b) nounwind readnone
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.3dnow.pfrcpit1(x86_mmx, x86_mmx) nounwind readnone
+
+define x86_mmx @stack_fold_pfrcpit2(x86_mmx %a, x86_mmx %b) {
+  ;CHECK-LABEL: stack_fold_pfrcpit2
+  ;CHECK:       pfrcpit2 {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call x86_mmx @llvm.x86.3dnow.pfrcpit2(x86_mmx %a, x86_mmx %b) nounwind readnone
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.3dnow.pfrcpit2(x86_mmx, x86_mmx) nounwind readnone
+
+define x86_mmx @stack_fold_pfrsqit1(x86_mmx %a, x86_mmx %b) {
+  ;CHECK-LABEL: stack_fold_pfrsqit1
+  ;CHECK:       pfrsqit1 {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call x86_mmx @llvm.x86.3dnow.pfrsqit1(x86_mmx %a, x86_mmx %b) nounwind readnone
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.3dnow.pfrsqit1(x86_mmx, x86_mmx) nounwind readnone
+
+define x86_mmx @stack_fold_pfrsqrt(x86_mmx %a) {
+  ;CHECK-LABEL: stack_fold_pfrsqrt
+  ;CHECK:       pfrsqrt {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm1},~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call x86_mmx @llvm.x86.3dnow.pfrsqrt(x86_mmx %a) nounwind readnone
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.3dnow.pfrsqrt(x86_mmx) nounwind readnone
+
+define x86_mmx @stack_fold_pfsub(x86_mmx %a, x86_mmx %b) {
+  ;CHECK-LABEL: stack_fold_pfsub
+  ;CHECK:       pfsub {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call x86_mmx @llvm.x86.3dnow.pfsub(x86_mmx %a, x86_mmx %b) nounwind readnone
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.3dnow.pfsub(x86_mmx, x86_mmx) nounwind readnone
+
+define x86_mmx @stack_fold_pfsubr(x86_mmx %a, x86_mmx %b) {
+  ;CHECK-LABEL: stack_fold_pfsubr
+  ;CHECK:       pfsubr {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call x86_mmx @llvm.x86.3dnow.pfsubr(x86_mmx %a, x86_mmx %b) nounwind readnone
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.3dnow.pfsubr(x86_mmx, x86_mmx) nounwind readnone
+
+define x86_mmx @stack_fold_pi2fd(x86_mmx %a) {
+  ;CHECK-LABEL: stack_fold_pi2fd
+  ;CHECK:       pi2fd {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm1},~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call x86_mmx @llvm.x86.3dnow.pi2fd(x86_mmx %a) nounwind readnone
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.3dnow.pi2fd(x86_mmx) nounwind readnone
+
+define x86_mmx @stack_fold_pi2fw(x86_mmx %a) {
+  ;CHECK-LABEL: stack_fold_pi2fw
+  ;CHECK:       pi2fw {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm1},~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call x86_mmx @llvm.x86.3dnowa.pi2fw(x86_mmx %a) nounwind readnone
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.3dnowa.pi2fw(x86_mmx) nounwind readnone
+
+define x86_mmx @stack_fold_pmulhrw(x86_mmx %a, x86_mmx %b) {
+  ;CHECK-LABEL: stack_fold_pmulhrw
+  ;CHECK:       pmulhrw {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call x86_mmx @llvm.x86.3dnow.pmulhrw(x86_mmx %a, x86_mmx %b) nounwind readnone
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.3dnow.pmulhrw(x86_mmx, x86_mmx) nounwind readnone
+
+define x86_mmx @stack_fold_pswapd(x86_mmx %a) {
+  ;CHECK-LABEL: stack_fold_pswapd
+  ;CHECK:       pswapd {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm1},~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call x86_mmx @llvm.x86.3dnowa.pswapd(x86_mmx %a) nounwind readnone
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.3dnowa.pswapd(x86_mmx) nounwind readnone
diff --git a/test/CodeGen/X86/stack-folding-int-avx1.ll b/test/CodeGen/X86/stack-folding-int-avx1.ll
index a9a21c2..fec297d 100644
--- a/test/CodeGen/X86/stack-folding-int-avx1.ll
+++ b/test/CodeGen/X86/stack-folding-int-avx1.ll
@@ -87,15 +87,19 @@ define <2 x i64> @stack_fold_movq_load(<2 x i64> %a0) {
   ;CHECK:       movq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
   %2 = shufflevector <2 x i64> %a0, <2 x i64> zeroinitializer, <2 x i32> <i32 0, i32 2>
-  ret <2 x i64> %2
+  ; add forces execution domain
+  %3 = add <2 x i64> %2, <i64 1, i64 1>
+  ret <2 x i64> %3
 }
 
 define i64 @stack_fold_movq_store(<2 x i64> %a0) {
   ;CHECK-LABEL: stack_fold_movq_store
   ;CHECK:       movq {{%xmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 8-byte Folded Spill
-  %1 = extractelement <2 x i64> %a0, i32 0
-  %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
-  ret i64 %1
+  ; add forces execution domain
+  %1 = add <2 x i64> %a0, <i64 1, i64 1>
+  %2 = extractelement <2 x i64> %1, i32 0
+  %3 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
+  ret i64 %2
 }
 
 define <8 x i16> @stack_fold_mpsadbw(<16 x i8> %a0, <16 x i8> %a1) {
diff --git a/test/CodeGen/X86/stack-folding-int-sse42.ll b/test/CodeGen/X86/stack-folding-int-sse42.ll
index 6aa2601..e814ae6 100644
--- a/test/CodeGen/X86/stack-folding-int-sse42.ll
+++ b/test/CodeGen/X86/stack-folding-int-sse42.ll
@@ -62,6 +62,33 @@ define <2 x i64> @stack_fold_aeskeygenassist(<2 x i64> %a0) {
 }
 declare <2 x i64> @llvm.x86.aesni.aeskeygenassist(<2 x i64>, i8) nounwind readnone
 
+;TODO stack_fold_crc32_32_8
+declare i32 @llvm.x86.sse42.crc32.32.8(i32, i8) nounwind
+
+;TODO stack_fold_crc32_32_16
+declare i32 @llvm.x86.sse42.crc32.32.16(i32, i16) nounwind
+
+define i32 @stack_fold_crc32_32_32(i32 %a0, i32 %a1) {
+  ;CHECK-LABEL: stack_fold_crc32_32_32
+  ;CHECK:       crc32l {{-?[0-9]*}}(%rsp), %eax {{.*#+}} 4-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
+  %2 = call i32 @llvm.x86.sse42.crc32.32.32(i32 %a0, i32 %a1)
+  ret i32 %2
+}
+declare i32 @llvm.x86.sse42.crc32.32.32(i32, i32) nounwind
+
+;TODO stack_fold_crc32_64_8
+declare i64 @llvm.x86.sse42.crc32.64.8(i64, i8) nounwind
+
+define i64 @stack_fold_crc32_64_64(i64 %a0, i64 %a1) {
+  ;CHECK-LABEL: stack_fold_crc32_64_64
+  ;CHECK:       crc32q {{-?[0-9]*}}(%rsp), %rax {{.*#+}} 8-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
+  %2 = call i64 @llvm.x86.sse42.crc32.64.64(i64 %a0, i64 %a1)
+  ret i64 %2
+}
+declare i64 @llvm.x86.sse42.crc32.64.64(i64, i64) nounwind
+
 define <4 x i32> @stack_fold_movd_load(i32 %a0) {
   ;CHECK-LABEL: stack_fold_movd_load
   ;CHECK:       movd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
@@ -87,15 +114,19 @@ define <2 x i64> @stack_fold_movq_load(<2 x i64> %a0) {
   ;CHECK:       movq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
   %2 = shufflevector <2 x i64> %a0, <2 x i64> zeroinitializer, <2 x i32> <i32 0, i32 2>
-  ret <2 x i64> %2
+  ; add forces execution domain
+  %3 = add <2 x i64> %2, <i64 1, i64 1>
+  ret <2 x i64> %3
 }
 
 define i64 @stack_fold_movq_store(<2 x i64> %a0) {
   ;CHECK-LABEL: stack_fold_movq_store
   ;CHECK:       movq {{%xmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 8-byte Folded Spill
-  %1 = extractelement <2 x i64> %a0, i32 0
-  %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
-  ret i64 %1
+  ; add forces execution domain
+  %1 = add <2 x i64> %a0, <i64 1, i64 1>
+  %2 = extractelement <2 x i64> %1, i32 0
+  %3 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
+  ret i64 %2
 }
 
 define <8 x i16> @stack_fold_mpsadbw(<16 x i8> %a0, <16 x i8> %a1) {
diff --git a/test/CodeGen/X86/stack-folding-mmx.ll b/test/CodeGen/X86/stack-folding-mmx.ll
new file mode 100644
index 0000000..8a5d4e2
--- /dev/null
+++ b/test/CodeGen/X86/stack-folding-mmx.ll
@@ -0,0 +1,566 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+mmx,+sse2 | FileCheck %s
+
+define x86_mmx @stack_fold_cvtpd2pi(<2 x double> %a0) {
+  ;CHECK-LABEL: stack_fold_cvtpd2pi
+  ;CHECK:       cvtpd2pi {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call x86_mmx @llvm.x86.sse.cvtpd2pi(<2 x double> %a0) nounwind readnone
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.sse.cvtpd2pi(<2 x double>) nounwind readnone
+
+define <2 x double> @stack_fold_cvtpi2pd(x86_mmx %a0) {
+  ;CHECK-LABEL: stack_fold_cvtpi2pd
+  ;CHECK:       cvtpi2pd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm1},~{mm1},~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call <2 x double> @llvm.x86.sse.cvtpi2pd(x86_mmx %a0) nounwind readnone
+  ret <2 x double> %2
+}
+declare <2 x double> @llvm.x86.sse.cvtpi2pd(x86_mmx) nounwind readnone
+
+define <4 x float> @stack_fold_cvtpi2ps(<4 x float> %a0, x86_mmx %a1) {
+  ;CHECK-LABEL: stack_fold_cvtpi2ps
+  ;CHECK:       cvtpi2ps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm1},~{mm1},~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call <4 x float> @llvm.x86.sse.cvtpi2ps(<4 x float> %a0, x86_mmx %a1) nounwind readnone
+  ret <4 x float> %2
+}
+declare <4 x float> @llvm.x86.sse.cvtpi2ps(<4 x float>, x86_mmx) nounwind readnone
+
+define x86_mmx @stack_fold_cvtps2pi(<4 x float> %a0) {
+  ;CHECK-LABEL: stack_fold_cvtps2pi
+  ;CHECK:       cvtps2pi {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call x86_mmx @llvm.x86.sse.cvtps2pi(<4 x float> %a0) nounwind readnone
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.sse.cvtps2pi(<4 x float>) nounwind readnone
+
+define x86_mmx @stack_fold_cvttpd2pi(<2 x double> %a0) {
+  ;CHECK-LABEL: stack_fold_cvttpd2pi
+  ;CHECK:       cvttpd2pi {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call x86_mmx @llvm.x86.sse.cvttpd2pi(<2 x double> %a0) nounwind readnone
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.sse.cvttpd2pi(<2 x double>) nounwind readnone
+
+define x86_mmx @stack_fold_cvttps2pi(<4 x float> %a0) {
+  ;CHECK-LABEL: stack_fold_cvttps2pi
+  ;CHECK:       cvttps2pi {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call x86_mmx @llvm.x86.sse.cvttps2pi(<4 x float> %a0) nounwind readnone
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.sse.cvttps2pi(<4 x float>) nounwind readnone
+
+; TODO stack_fold_movd_load
+; TODO stack_fold_movd_store
+; TODO stack_fold_movq_load
+; TODO stack_fold_movq_store
+
+define x86_mmx @stack_fold_packssdw(x86_mmx %a, x86_mmx %b) {
+  ;CHECK-LABEL: stack_fold_packssdw
+  ;CHECK:       packssdw {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call x86_mmx @llvm.x86.mmx.packssdw(x86_mmx %a, x86_mmx %b) nounwind readnone
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.mmx.packssdw(x86_mmx, x86_mmx) nounwind readnone
+
+define x86_mmx @stack_fold_packsswb(x86_mmx %a, x86_mmx %b) {
+  ;CHECK-LABEL: stack_fold_packsswb
+  ;CHECK:       packsswb {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call x86_mmx @llvm.x86.mmx.packsswb(x86_mmx %a, x86_mmx %b) nounwind readnone
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.mmx.packsswb(x86_mmx, x86_mmx) nounwind readnone
+
+define x86_mmx @stack_fold_packuswb(x86_mmx %a, x86_mmx %b) {
+  ;CHECK-LABEL: stack_fold_packuswb
+  ;CHECK:       packuswb {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call x86_mmx @llvm.x86.mmx.packuswb(x86_mmx %a, x86_mmx %b) nounwind readnone
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.mmx.packuswb(x86_mmx, x86_mmx) nounwind readnone
+
+define x86_mmx @stack_fold_paddb(x86_mmx %a, x86_mmx %b) {
+  ;CHECK-LABEL: stack_fold_paddb
+  ;CHECK:       paddb {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call x86_mmx @llvm.x86.mmx.padd.b(x86_mmx %a, x86_mmx %b) nounwind readnone
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.mmx.padd.b(x86_mmx, x86_mmx) nounwind readnone
+
+define x86_mmx @stack_fold_paddd(x86_mmx %a, x86_mmx %b) {
+  ;CHECK-LABEL: stack_fold_paddd
+  ;CHECK:       paddd {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %a, x86_mmx %b) nounwind readnone
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.mmx.padd.d(x86_mmx, x86_mmx) nounwind readnone
+
+define x86_mmx @stack_fold_paddq(x86_mmx %a, x86_mmx %b) {
+  ;CHECK-LABEL: stack_fold_paddq
+  ;CHECK:       paddq {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call x86_mmx @llvm.x86.mmx.padd.q(x86_mmx %a, x86_mmx %b) nounwind readnone
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.mmx.padd.q(x86_mmx, x86_mmx) nounwind readnone
+
+define x86_mmx @stack_fold_paddsb(x86_mmx %a, x86_mmx %b) {
+  ;CHECK-LABEL: stack_fold_paddsb
+  ;CHECK:       paddsb {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call x86_mmx @llvm.x86.mmx.padds.b(x86_mmx %a, x86_mmx %b) nounwind readnone
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.mmx.padds.b(x86_mmx, x86_mmx) nounwind readnone
+
+define x86_mmx @stack_fold_paddsw(x86_mmx %a, x86_mmx %b) {
+  ;CHECK-LABEL: stack_fold_paddsw
+  ;CHECK:       paddsw {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call x86_mmx @llvm.x86.mmx.padds.w(x86_mmx %a, x86_mmx %b) nounwind readnone
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.mmx.padds.w(x86_mmx, x86_mmx) nounwind readnone
+
+define x86_mmx @stack_fold_paddusb(x86_mmx %a, x86_mmx %b) {
+  ;CHECK-LABEL: stack_fold_paddusb
+  ;CHECK:       paddusb {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call x86_mmx @llvm.x86.mmx.paddus.b(x86_mmx %a, x86_mmx %b) nounwind readnone
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.mmx.paddus.b(x86_mmx, x86_mmx) nounwind readnone
+
+define x86_mmx @stack_fold_paddusw(x86_mmx %a, x86_mmx %b) {
+  ;CHECK-LABEL: stack_fold_paddusw
+  ;CHECK:       paddusw {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call x86_mmx @llvm.x86.mmx.paddus.w(x86_mmx %a, x86_mmx %b) nounwind readnone
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.mmx.paddus.w(x86_mmx, x86_mmx) nounwind readnone
+
+define x86_mmx @stack_fold_paddw(x86_mmx %a, x86_mmx %b) {
+  ;CHECK-LABEL: stack_fold_paddw
+  ;CHECK:       paddw {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call x86_mmx @llvm.x86.mmx.padd.w(x86_mmx %a, x86_mmx %b) nounwind readnone
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.mmx.padd.w(x86_mmx, x86_mmx) nounwind readnone
+
+define x86_mmx @stack_fold_pand(x86_mmx %a, x86_mmx %b) {
+  ;CHECK-LABEL: stack_fold_pand
+  ;CHECK:       pand {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call x86_mmx @llvm.x86.mmx.pand(x86_mmx %a, x86_mmx %b) nounwind readnone
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.mmx.pand(x86_mmx, x86_mmx) nounwind readnone
+
+define x86_mmx @stack_fold_pandn(x86_mmx %a, x86_mmx %b) {
+  ;CHECK-LABEL: stack_fold_pandn
+  ;CHECK:       pandn {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call x86_mmx @llvm.x86.mmx.pandn(x86_mmx %a, x86_mmx %b) nounwind readnone
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.mmx.pandn(x86_mmx, x86_mmx) nounwind readnone
+
+define x86_mmx @stack_fold_pavgb(x86_mmx %a, x86_mmx %b) {
+  ;CHECK-LABEL: stack_fold_pavgb
+  ;CHECK:       pavgb {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call x86_mmx @llvm.x86.mmx.pavg.b(x86_mmx %a, x86_mmx %b) nounwind readnone
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.mmx.pavg.b(x86_mmx, x86_mmx) nounwind readnone
+
+define x86_mmx @stack_fold_pavgw(x86_mmx %a, x86_mmx %b) {
+  ;CHECK-LABEL: stack_fold_pavgw
+  ;CHECK:       pavgw {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call x86_mmx @llvm.x86.mmx.pavg.w(x86_mmx %a, x86_mmx %b) nounwind readnone
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.mmx.pavg.w(x86_mmx, x86_mmx) nounwind readnone
+
+define x86_mmx @stack_fold_pcmpeqb(x86_mmx %a, x86_mmx %b) {
+  ;CHECK-LABEL: stack_fold_pcmpeqb
+  ;CHECK:       pcmpeqb {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call x86_mmx @llvm.x86.mmx.pcmpeq.b(x86_mmx %a, x86_mmx %b) nounwind readnone
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.mmx.pcmpeq.b(x86_mmx, x86_mmx) nounwind readnone
+
+define x86_mmx @stack_fold_pcmpeqd(x86_mmx %a, x86_mmx %b) {
+  ;CHECK-LABEL: stack_fold_pcmpeqd
+  ;CHECK:       pcmpeqd {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call x86_mmx @llvm.x86.mmx.pcmpeq.d(x86_mmx %a, x86_mmx %b) nounwind readnone
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.mmx.pcmpeq.d(x86_mmx, x86_mmx) nounwind readnone
+
+define x86_mmx @stack_fold_pcmpeqw(x86_mmx %a, x86_mmx %b) {
+  ;CHECK-LABEL: stack_fold_pcmpeqw
+  ;CHECK:       pcmpeqw {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call x86_mmx @llvm.x86.mmx.pcmpeq.w(x86_mmx %a, x86_mmx %b) nounwind readnone
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.mmx.pcmpeq.w(x86_mmx, x86_mmx) nounwind readnone
+
+define x86_mmx @stack_fold_pcmpgtb(x86_mmx %a, x86_mmx %b) {
+  ;CHECK-LABEL: stack_fold_pcmpgtb
+  ;CHECK:       pcmpgtb {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call x86_mmx @llvm.x86.mmx.pcmpgt.b(x86_mmx %a, x86_mmx %b) nounwind readnone
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.mmx.pcmpgt.b(x86_mmx, x86_mmx) nounwind readnone
+
+define x86_mmx @stack_fold_pcmpgtd(x86_mmx %a, x86_mmx %b) {
+  ;CHECK-LABEL: stack_fold_pcmpgtd
+  ;CHECK:       pcmpgtd {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call x86_mmx @llvm.x86.mmx.pcmpgt.d(x86_mmx %a, x86_mmx %b) nounwind readnone
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.mmx.pcmpgt.d(x86_mmx, x86_mmx) nounwind readnone
+
+define x86_mmx @stack_fold_pcmpgtw(x86_mmx %a, x86_mmx %b) {
+  ;CHECK-LABEL: stack_fold_pcmpgtw
+  ;CHECK:       pcmpgtw {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call x86_mmx @llvm.x86.mmx.pcmpgt.w(x86_mmx %a, x86_mmx %b) nounwind readnone
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.mmx.pcmpgt.w(x86_mmx, x86_mmx) nounwind readnone
+
+; TODO stack_fold_pinsrw
+
+define x86_mmx @stack_fold_pmaddwd(x86_mmx %a, x86_mmx %b) {
+  ;CHECK-LABEL: stack_fold_pmaddwd
+  ;CHECK:       pmaddwd {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call x86_mmx @llvm.x86.mmx.pmadd.wd(x86_mmx %a, x86_mmx %b) nounwind readnone
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.mmx.pmadd.wd(x86_mmx, x86_mmx) nounwind readnone
+
+define x86_mmx @stack_fold_pmaxsw(x86_mmx %a, x86_mmx %b) {
+  ;CHECK-LABEL: stack_fold_pmaxsw
+  ;CHECK:       pmaxsw {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call x86_mmx @llvm.x86.mmx.pmaxs.w(x86_mmx %a, x86_mmx %b) nounwind readnone
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.mmx.pmaxs.w(x86_mmx, x86_mmx) nounwind readnone
+
+define x86_mmx @stack_fold_pmaxub(x86_mmx %a, x86_mmx %b) {
+  ;CHECK-LABEL: stack_fold_pmaxub
+  ;CHECK:       pmaxub {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call x86_mmx @llvm.x86.mmx.pmaxu.b(x86_mmx %a, x86_mmx %b) nounwind readnone
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.mmx.pmaxu.b(x86_mmx, x86_mmx) nounwind readnone
+
+define x86_mmx @stack_fold_pminsw(x86_mmx %a, x86_mmx %b) {
+  ;CHECK-LABEL: stack_fold_pminsw
+  ;CHECK:       pminsw {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call x86_mmx @llvm.x86.mmx.pmins.w(x86_mmx %a, x86_mmx %b) nounwind readnone
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.mmx.pmins.w(x86_mmx, x86_mmx) nounwind readnone
+
+define x86_mmx @stack_fold_pminub(x86_mmx %a, x86_mmx %b) {
+  ;CHECK-LABEL: stack_fold_pminub
+  ;CHECK:       pminub {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call x86_mmx @llvm.x86.mmx.pminu.b(x86_mmx %a, x86_mmx %b) nounwind readnone
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.mmx.pminu.b(x86_mmx, x86_mmx) nounwind readnone
+
+define x86_mmx @stack_fold_pmulhuw(x86_mmx %a, x86_mmx %b) {
+  ;CHECK-LABEL: stack_fold_pmulhuw
+  ;CHECK:       pmulhuw {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call x86_mmx @llvm.x86.mmx.pmulhu.w(x86_mmx %a, x86_mmx %b) nounwind readnone
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.mmx.pmulhu.w(x86_mmx, x86_mmx) nounwind readnone
+
+define x86_mmx @stack_fold_pmulhw(x86_mmx %a, x86_mmx %b) {
+  ;CHECK-LABEL: stack_fold_pmulhw
+  ;CHECK:       pmulhw {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call x86_mmx @llvm.x86.mmx.pmulh.w(x86_mmx %a, x86_mmx %b) nounwind readnone
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.mmx.pmulh.w(x86_mmx, x86_mmx) nounwind readnone
+
+define x86_mmx @stack_fold_pmullw(x86_mmx %a, x86_mmx %b) {
+  ;CHECK-LABEL: stack_fold_pmullw
+  ;CHECK:       pmullw {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call x86_mmx @llvm.x86.mmx.pmull.w(x86_mmx %a, x86_mmx %b) nounwind readnone
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.mmx.pmull.w(x86_mmx, x86_mmx) nounwind readnone
+
+define x86_mmx @stack_fold_pmuludq(x86_mmx %a, x86_mmx %b) {
+  ;CHECK-LABEL: stack_fold_pmuludq
+  ;CHECK:       pmuludq {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call x86_mmx @llvm.x86.mmx.pmulu.dq(x86_mmx %a, x86_mmx %b) nounwind readnone
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.mmx.pmulu.dq(x86_mmx, x86_mmx) nounwind readnone
+
+define x86_mmx @stack_fold_por(x86_mmx %a, x86_mmx %b) {
+  ;CHECK-LABEL: stack_fold_por
+  ;CHECK:       por {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call x86_mmx @llvm.x86.mmx.por(x86_mmx %a, x86_mmx %b) nounwind readnone
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.mmx.por(x86_mmx, x86_mmx) nounwind readnone
+
+define x86_mmx @stack_fold_psadbw(x86_mmx %a, x86_mmx %b) {
+  ;CHECK-LABEL: stack_fold_psadbw
+  ;CHECK:       psadbw {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call x86_mmx @llvm.x86.mmx.psad.bw(x86_mmx %a, x86_mmx %b) nounwind readnone
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.mmx.psad.bw(x86_mmx, x86_mmx) nounwind readnone
+
+define x86_mmx @stack_fold_pshufw(x86_mmx %a, x86_mmx %b) {
+  ;CHECK-LABEL: stack_fold_pshufw
+  ;CHECK:       pshufw $1, {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm1},~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx %a, i8 1) nounwind readnone
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx, i8) nounwind readnone
+
+define x86_mmx @stack_fold_pslld(x86_mmx %a, x86_mmx %b) {
+  ;CHECK-LABEL: stack_fold_pslld
+  ;CHECK:       pslld {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call x86_mmx @llvm.x86.mmx.psll.d(x86_mmx %a, x86_mmx %b) nounwind readnone
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.mmx.psll.d(x86_mmx, x86_mmx) nounwind readnone
+
+define x86_mmx @stack_fold_psllq(x86_mmx %a, x86_mmx %b) {
+  ;CHECK-LABEL: stack_fold_psllq
+  ;CHECK:       psllq {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call x86_mmx @llvm.x86.mmx.psll.q(x86_mmx %a, x86_mmx %b) nounwind readnone
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.mmx.psll.q(x86_mmx, x86_mmx) nounwind readnone
+
+define x86_mmx @stack_fold_psllw(x86_mmx %a, x86_mmx %b) {
+  ;CHECK-LABEL: stack_fold_psllw
+  ;CHECK:       psllw {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call x86_mmx @llvm.x86.mmx.psll.w(x86_mmx %a, x86_mmx %b) nounwind readnone
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.mmx.psll.w(x86_mmx, x86_mmx) nounwind readnone
+
+define x86_mmx @stack_fold_psrad(x86_mmx %a, x86_mmx %b) {
+  ;CHECK-LABEL: stack_fold_psrad
+  ;CHECK:       psrad {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call x86_mmx @llvm.x86.mmx.psra.d(x86_mmx %a, x86_mmx %b) nounwind readnone
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.mmx.psra.d(x86_mmx, x86_mmx) nounwind readnone
+
+define x86_mmx @stack_fold_psraw(x86_mmx %a, x86_mmx %b) {
+  ;CHECK-LABEL: stack_fold_psraw
+  ;CHECK:       psraw {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call x86_mmx @llvm.x86.mmx.psra.w(x86_mmx %a, x86_mmx %b) nounwind readnone
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.mmx.psra.w(x86_mmx, x86_mmx) nounwind readnone
+
+define x86_mmx @stack_fold_psrld(x86_mmx %a, x86_mmx %b) {
+  ;CHECK-LABEL: stack_fold_psrld
+  ;CHECK:       psrld {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call x86_mmx @llvm.x86.mmx.psrl.d(x86_mmx %a, x86_mmx %b) nounwind readnone
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.mmx.psrl.d(x86_mmx, x86_mmx) nounwind readnone
+
+define x86_mmx @stack_fold_psrlq(x86_mmx %a, x86_mmx %b) {
+  ;CHECK-LABEL: stack_fold_psrlq
+  ;CHECK:       psrlq {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call x86_mmx @llvm.x86.mmx.psrl.q(x86_mmx %a, x86_mmx %b) nounwind readnone
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.mmx.psrl.q(x86_mmx, x86_mmx) nounwind readnone
+
+define x86_mmx @stack_fold_psrlw(x86_mmx %a, x86_mmx %b) {
+  ;CHECK-LABEL: stack_fold_psrlw
+  ;CHECK:       psrlw {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call x86_mmx @llvm.x86.mmx.psrl.w(x86_mmx %a, x86_mmx %b) nounwind readnone
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.mmx.psrl.w(x86_mmx, x86_mmx) nounwind readnone
+
+define x86_mmx @stack_fold_psubb(x86_mmx %a, x86_mmx %b) {
+  ;CHECK-LABEL: stack_fold_psubb
+  ;CHECK:       psubb {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call x86_mmx @llvm.x86.mmx.psub.b(x86_mmx %a, x86_mmx %b) nounwind readnone
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.mmx.psub.b(x86_mmx, x86_mmx) nounwind readnone
+
+define x86_mmx @stack_fold_psubd(x86_mmx %a, x86_mmx %b) {
+  ;CHECK-LABEL: stack_fold_psubd
+  ;CHECK:       psubd {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call x86_mmx @llvm.x86.mmx.psub.d(x86_mmx %a, x86_mmx %b) nounwind readnone
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.mmx.psub.d(x86_mmx, x86_mmx) nounwind readnone
+
+define x86_mmx @stack_fold_psubq(x86_mmx %a, x86_mmx %b) {
+  ;CHECK-LABEL: stack_fold_psubq
+  ;CHECK:       psubq {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call x86_mmx @llvm.x86.mmx.psub.q(x86_mmx %a, x86_mmx %b) nounwind readnone
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.mmx.psub.q(x86_mmx, x86_mmx) nounwind readnone
+
+define x86_mmx @stack_fold_psubsb(x86_mmx %a, x86_mmx %b) {
+  ;CHECK-LABEL: stack_fold_psubsb
+  ;CHECK:       psubsb {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call x86_mmx @llvm.x86.mmx.psubs.b(x86_mmx %a, x86_mmx %b) nounwind readnone
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.mmx.psubs.b(x86_mmx, x86_mmx) nounwind readnone
+
+define x86_mmx @stack_fold_psubsw(x86_mmx %a, x86_mmx %b) {
+  ;CHECK-LABEL: stack_fold_psubsw
+  ;CHECK:       psubsw {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call x86_mmx @llvm.x86.mmx.psubs.w(x86_mmx %a, x86_mmx %b) nounwind readnone
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.mmx.psubs.w(x86_mmx, x86_mmx) nounwind readnone
+
+define x86_mmx @stack_fold_psubusb(x86_mmx %a, x86_mmx %b) {
+  ;CHECK-LABEL: stack_fold_psubusb
+  ;CHECK:       psubusb {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call x86_mmx @llvm.x86.mmx.psubus.b(x86_mmx %a, x86_mmx %b) nounwind readnone
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.mmx.psubus.b(x86_mmx, x86_mmx) nounwind readnone
+
+define x86_mmx @stack_fold_psubusw(x86_mmx %a, x86_mmx %b) {
+  ;CHECK-LABEL: stack_fold_psubusw
+  ;CHECK:       psubusw {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call x86_mmx @llvm.x86.mmx.psubus.w(x86_mmx %a, x86_mmx %b) nounwind readnone
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.mmx.psubus.w(x86_mmx, x86_mmx) nounwind readnone
+
+define x86_mmx @stack_fold_psubw(x86_mmx %a, x86_mmx %b) {
+  ;CHECK-LABEL: stack_fold_psubw
+  ;CHECK:       psubw {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call x86_mmx @llvm.x86.mmx.psub.w(x86_mmx %a, x86_mmx %b) nounwind readnone
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.mmx.psub.w(x86_mmx, x86_mmx) nounwind readnone
+
+define x86_mmx @stack_fold_punpckhbw(x86_mmx %a, x86_mmx %b) {
+  ;CHECK-LABEL: stack_fold_punpckhbw
+  ;CHECK:       punpckhbw {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call x86_mmx @llvm.x86.mmx.punpckhbw(x86_mmx %a, x86_mmx %b) nounwind readnone
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.mmx.punpckhbw(x86_mmx, x86_mmx) nounwind readnone
+
+define x86_mmx @stack_fold_punpckhdq(x86_mmx %a, x86_mmx %b) {
+  ;CHECK-LABEL: stack_fold_punpckhdq
+  ;CHECK:       punpckhdq {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call x86_mmx @llvm.x86.mmx.punpckhdq(x86_mmx %a, x86_mmx %b) nounwind readnone
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.mmx.punpckhdq(x86_mmx, x86_mmx) nounwind readnone
+
+define x86_mmx @stack_fold_punpckhwd(x86_mmx %a, x86_mmx %b) {
+  ;CHECK-LABEL: stack_fold_punpckhwd
+  ;CHECK:       punpckhwd {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call x86_mmx @llvm.x86.mmx.punpckhwd(x86_mmx %a, x86_mmx %b) nounwind readnone
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.mmx.punpckhwd(x86_mmx, x86_mmx) nounwind readnone
+
+define x86_mmx @stack_fold_punpcklbw(x86_mmx %a, x86_mmx %b) {
+  ;CHECK-LABEL: stack_fold_punpcklbw
+  ;CHECK:       punpcklbw {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call x86_mmx @llvm.x86.mmx.punpcklbw(x86_mmx %a, x86_mmx %b) nounwind readnone
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.mmx.punpcklbw(x86_mmx, x86_mmx) nounwind readnone
+
+define x86_mmx @stack_fold_punpckldq(x86_mmx %a, x86_mmx %b) {
+  ;CHECK-LABEL: stack_fold_punpckldq
+  ;CHECK:       punpckldq {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call x86_mmx @llvm.x86.mmx.punpckldq(x86_mmx %a, x86_mmx %b) nounwind readnone
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.mmx.punpckldq(x86_mmx, x86_mmx) nounwind readnone
+
+define x86_mmx @stack_fold_punpcklwd(x86_mmx %a, x86_mmx %b) {
+  ;CHECK-LABEL: stack_fold_punpcklwd
+  ;CHECK:       punpcklwd {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call x86_mmx @llvm.x86.mmx.punpcklwd(x86_mmx %a, x86_mmx %b) nounwind readnone
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.mmx.punpcklwd(x86_mmx, x86_mmx) nounwind readnone
+
+define x86_mmx @stack_fold_pxor(x86_mmx %a, x86_mmx %b) {
+  ;CHECK-LABEL: stack_fold_pxor
+  ;CHECK:       pxor {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call x86_mmx @llvm.x86.mmx.pxor(x86_mmx %a, x86_mmx %b) nounwind readnone
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.mmx.pxor(x86_mmx, x86_mmx) nounwind readnone
diff --git a/test/CodeGen/X86/stack-protector-dbginfo.ll b/test/CodeGen/X86/stack-protector-dbginfo.ll
index 0a4a4f2..6275c8d 100644
--- a/test/CodeGen/X86/stack-protector-dbginfo.ll
+++ b/test/CodeGen/X86/stack-protector-dbginfo.ll
@@ -48,7 +48,7 @@ attributes #0 = { sspreq }
 !20 = !{}
 !21 = !{i32 2, !"Dwarf Version", i32 2}
 !22 = !{i64* getelementptr inbounds ({ i64, [56 x i8] }, { i64, [56 x i8] }* @a, i32 0, i32 0)}
-!23 = !MDLocalVariable(tag: DW_TAG_arg_variable, name: "p2", line: 12, arg: 2, scope: !24, file: !10, type: !32, inlinedAt: !38)
+!23 = !MDLocalVariable(tag: DW_TAG_arg_variable, name: "p2", line: 12, arg: 2, scope: !24, file: !10, type: !32)
 !24 = !MDSubprogram(name: "min<unsigned long long>", linkageName: "_ZN3__13minIyEERKT_S3_RS1_", line: 12, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 12, file: !1, scope: !25, type: !27, templateParams: !33, variables: !35)
 !25 = !MDNamespace(name: "__1", line: 1, file: !26, scope: null)
 !26 = !MDFile(filename: "main.cpp", directory: "/Users/matt/ryan_bug")
@@ -71,13 +71,12 @@ attributes #0 = { sspreq }
 !43 = !{!29, !29, !32, !44}
 !44 = !MDCompositeType(tag: DW_TAG_structure_type, name: "A", size: 8, align: 8, file: !1, scope: !25, elements: !45)
 !45 = !{!46}
-!46 = !MDSubprogram(name: "operator()", linkageName: "_ZN3__11AclERKiS2_", line: 1, isLocal: false, isDefinition: false, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 1, file: !1, scope: !44, type: !47, variables: !52)
+!46 = !MDSubprogram(name: "operator()", linkageName: "_ZN3__11AclERKiS2_", line: 1, isLocal: false, isDefinition: false, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 1, file: !1, scope: !44, type: !47)
 !47 = !MDSubroutineType(types: !48)
 !48 = !{!13, !49, !50, !50}
 !49 = !MDDerivedType(tag: DW_TAG_pointer_type, size: 64, align: 64, flags: DIFlagArtificial | DIFlagObjectPointer, baseType: !44)
 !50 = !MDDerivedType(tag: DW_TAG_reference_type, baseType: !51)
 !51 = !MDDerivedType(tag: DW_TAG_const_type, baseType: !13)
-!52 = !{i32 786468}
 !53 = !{!34, !54}
 !54 = !MDTemplateTypeParameter(name: "_Compare", type: !44)
 !55 = !{!56, !57, !58}
@@ -86,7 +85,7 @@ attributes #0 = { sspreq }
 !58 = !MDLocalVariable(tag: DW_TAG_arg_variable, name: "p3", line: 8, arg: 3, scope: !41, file: !10, type: !44)
 !59 = !MDLocation(line: 13, scope: !24, inlinedAt: !38)
 !63 = !{i32 undef}
-!64 = !MDLocalVariable(tag: DW_TAG_arg_variable, name: "p1", line: 1, arg: 2, scope: !65, file: !10, type: !50, inlinedAt: !40)
+!64 = !MDLocalVariable(tag: DW_TAG_arg_variable, name: "p1", line: 1, arg: 2, scope: !65, file: !10, type: !50)
 !65 = !MDSubprogram(name: "operator()", linkageName: "_ZN3__11AclERKiS2_", line: 1, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 2, file: !1, scope: !25, type: !47, declaration: !46, variables: !66)
 !66 = !{!67, !69, !70}
 !67 = !MDLocalVariable(tag: DW_TAG_arg_variable, name: "this", arg: 1, flags: DIFlagArtificial | DIFlagObjectPointer, scope: !65, type: !68)
diff --git a/test/CodeGen/X86/stack-protector.ll b/test/CodeGen/X86/stack-protector.ll
index a88acf0..acaba6d 100644
--- a/test/CodeGen/X86/stack-protector.ll
+++ b/test/CodeGen/X86/stack-protector.ll
@@ -47,7 +47,7 @@ entry:
   %0 = load i8*, i8** %a.addr, align 8
   %call = call i8* @strcpy(i8* %arraydecay, i8* %0)
   %arraydecay1 = getelementptr inbounds [16 x i8], [16 x i8]* %buf, i32 0, i32 0
-  %call2 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i8* %arraydecay1)
+  %call2 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i8* %arraydecay1)
   ret void
 }
 
@@ -83,7 +83,7 @@ entry:
   %0 = load i8*, i8** %a.addr, align 8
   %call = call i8* @strcpy(i8* %arraydecay, i8* %0)
   %arraydecay1 = getelementptr inbounds [16 x i8], [16 x i8]* %buf, i32 0, i32 0
-  %call2 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i8* %arraydecay1)
+  %call2 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i8* %arraydecay1)
   ret void
 }
 
@@ -115,7 +115,7 @@ entry:
   %0 = load i8*, i8** %a.addr, align 8
   %call = call i8* @strcpy(i8* %arraydecay, i8* %0)
   %arraydecay1 = getelementptr inbounds [16 x i8], [16 x i8]* %buf, i32 0, i32 0
-  %call2 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i8* %arraydecay1)
+  %call2 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i8* %arraydecay1)
   ret void
 }
 
@@ -147,7 +147,7 @@ entry:
   %0 = load i8*, i8** %a.addr, align 8
   %call = call i8* @strcpy(i8* %arraydecay, i8* %0)
   %arraydecay1 = getelementptr inbounds [16 x i8], [16 x i8]* %buf, i32 0, i32 0
-  %call2 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i8* %arraydecay1)
+  %call2 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i8* %arraydecay1)
   ret void
 }
 
@@ -180,7 +180,7 @@ entry:
   %call = call i8* @strcpy(i8* %arraydecay, i8* %0)
   %buf1 = getelementptr inbounds %struct.foo, %struct.foo* %b, i32 0, i32 0
   %arraydecay2 = getelementptr inbounds [16 x i8], [16 x i8]* %buf1, i32 0, i32 0
-  %call3 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i8* %arraydecay2)
+  %call3 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i8* %arraydecay2)
   ret void
 }
 
@@ -214,7 +214,7 @@ entry:
   %call = call i8* @strcpy(i8* %arraydecay, i8* %0)
   %buf1 = getelementptr inbounds %struct.foo, %struct.foo* %b, i32 0, i32 0
   %arraydecay2 = getelementptr inbounds [16 x i8], [16 x i8]* %buf1, i32 0, i32 0
-  %call3 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i8* %arraydecay2)
+  %call3 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i8* %arraydecay2)
   ret void
 }
 
@@ -248,7 +248,7 @@ entry:
   %call = call i8* @strcpy(i8* %arraydecay, i8* %0)
   %buf1 = getelementptr inbounds %struct.foo, %struct.foo* %b, i32 0, i32 0
   %arraydecay2 = getelementptr inbounds [16 x i8], [16 x i8]* %buf1, i32 0, i32 0
-  %call3 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i8* %arraydecay2)
+  %call3 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i8* %arraydecay2)
   ret void
 }
 
@@ -282,7 +282,7 @@ entry:
   %call = call i8* @strcpy(i8* %arraydecay, i8* %0)
   %buf1 = getelementptr inbounds %struct.foo, %struct.foo* %b, i32 0, i32 0
   %arraydecay2 = getelementptr inbounds [16 x i8], [16 x i8]* %buf1, i32 0, i32 0
-  %call3 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i8* %arraydecay2)
+  %call3 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i8* %arraydecay2)
   ret void
 }
 
@@ -313,7 +313,7 @@ entry:
   %0 = load i8*, i8** %a.addr, align 8
   %call = call i8* @strcpy(i8* %arraydecay, i8* %0)
   %arraydecay1 = getelementptr inbounds [4 x i8], [4 x i8]* %buf, i32 0, i32 0
-  %call2 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i8* %arraydecay1)
+  %call2 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i8* %arraydecay1)
   ret void
 }
 
@@ -345,7 +345,7 @@ entry:
   %0 = load i8*, i8** %a.addr, align 8
   %call = call i8* @strcpy(i8* %arraydecay, i8* %0)
   %arraydecay1 = getelementptr inbounds [4 x i8], [4 x i8]* %buf, i32 0, i32 0
-  %call2 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i8* %arraydecay1)
+  %call2 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i8* %arraydecay1)
   ret void
 }
 
@@ -377,7 +377,7 @@ entry:
   %0 = load i8*, i8** %a.addr, align 8
   %call = call i8* @strcpy(i8* %arraydecay, i8* %0)
   %arraydecay1 = getelementptr inbounds [4 x i8], [4 x i8]* %buf, i32 0, i32 0
-  %call2 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i8* %arraydecay1)
+  %call2 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i8* %arraydecay1)
   ret void
 }
 
@@ -409,7 +409,7 @@ entry:
   %0 = load i8*, i8** %a.addr, align 8
   %call = call i8* @strcpy(i8* %arraydecay, i8* %0)
   %arraydecay1 = getelementptr inbounds [4 x i8], [4 x i8]* %buf, i32 0, i32 0
-  %call2 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i8* %arraydecay1)
+  %call2 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i8* %arraydecay1)
   ret void
 }
 
@@ -442,7 +442,7 @@ entry:
   %call = call i8* @strcpy(i8* %arraydecay, i8* %0)
   %buf1 = getelementptr inbounds %struct.foo.0, %struct.foo.0* %b, i32 0, i32 0
   %arraydecay2 = getelementptr inbounds [4 x i8], [4 x i8]* %buf1, i32 0, i32 0
-  %call3 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i8* %arraydecay2)
+  %call3 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i8* %arraydecay2)
   ret void
 }
 
@@ -476,7 +476,7 @@ entry:
   %call = call i8* @strcpy(i8* %arraydecay, i8* %0)
   %buf1 = getelementptr inbounds %struct.foo.0, %struct.foo.0* %b, i32 0, i32 0
   %arraydecay2 = getelementptr inbounds [4 x i8], [4 x i8]* %buf1, i32 0, i32 0
-  %call3 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i8* %arraydecay2)
+  %call3 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i8* %arraydecay2)
   ret void
 }
 
@@ -510,7 +510,7 @@ entry:
   %call = call i8* @strcpy(i8* %arraydecay, i8* %0)
   %buf1 = getelementptr inbounds %struct.foo.0, %struct.foo.0* %b, i32 0, i32 0
   %arraydecay2 = getelementptr inbounds [4 x i8], [4 x i8]* %buf1, i32 0, i32 0
-  %call3 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i8* %arraydecay2)
+  %call3 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i8* %arraydecay2)
   ret void
 }
 
@@ -544,7 +544,7 @@ entry:
   %call = call i8* @strcpy(i8* %arraydecay, i8* %0)
   %buf1 = getelementptr inbounds %struct.foo.0, %struct.foo.0* %b, i32 0, i32 0
   %arraydecay2 = getelementptr inbounds [4 x i8], [4 x i8]* %buf1, i32 0, i32 0
-  %call3 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i8* %arraydecay2)
+  %call3 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i8* %arraydecay2)
   ret void
 }
 
@@ -571,7 +571,7 @@ entry:
   %a.addr = alloca i8*, align 8
   store i8* %a, i8** %a.addr, align 8
   %0 = load i8*, i8** %a.addr, align 8
-  %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i8* %0)
+  %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i8* %0)
   ret void
 }
 
@@ -599,7 +599,7 @@ entry:
   %a.addr = alloca i8*, align 8
   store i8* %a, i8** %a.addr, align 8
   %0 = load i8*, i8** %a.addr, align 8
-  %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i8* %0)
+  %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i8* %0)
   ret void
 }
 
@@ -627,7 +627,7 @@ entry:
   %a.addr = alloca i8*, align 8
   store i8* %a, i8** %a.addr, align 8
   %0 = load i8*, i8** %a.addr, align 8
-  %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i8* %0)
+  %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i8* %0)
   ret void
 }
 
@@ -655,7 +655,7 @@ entry:
   %a.addr = alloca i8*, align 8
   store i8* %a, i8** %a.addr, align 8
   %0 = load i8*, i8** %a.addr, align 8
-  %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i8* %0)
+  %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i8* %0)
   ret void
 }
 
@@ -808,7 +808,7 @@ entry:
 ; DARWIN-X64: .cfi_endproc
   %a = alloca i32, align 4
   %0 = ptrtoint i32* %a to i64
-  %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i64 %0)
+  %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i64 %0)
   ret void
 }
 
@@ -835,7 +835,7 @@ entry:
 ; DARWIN-X64: .cfi_endproc
   %a = alloca i32, align 4
   %0 = ptrtoint i32* %a to i64
-  %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i64 %0)
+  %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i64 %0)
   ret void
 }
 
@@ -862,7 +862,7 @@ entry:
 ; DARWIN-X64: callq ___stack_chk_fail
   %a = alloca i32, align 4
   %0 = ptrtoint i32* %a to i64
-  %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i64 %0)
+  %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i64 %0)
   ret void
 }
 
@@ -889,7 +889,7 @@ entry:
 ; DARWIN-X64: callq ___stack_chk_fail
   %a = alloca i32, align 4
   %0 = ptrtoint i32* %a to i64
-  %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i64 %0)
+  %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i64 %0)
   ret void
 }
 
@@ -1021,7 +1021,7 @@ entry:
   store double %call, double* %x, align 8
   %cmp2 = fcmp ogt double %call, 0.000000e+00
   %y.1 = select i1 %cmp2, double* %x, double* null
-  %call2 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), double* %y.1)
+  %call2 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), double* %y.1)
   ret void
 }
 
@@ -1051,7 +1051,7 @@ entry:
   store double %call, double* %x, align 8
   %cmp2 = fcmp ogt double %call, 0.000000e+00
   %y.1 = select i1 %cmp2, double* %x, double* null
-  %call2 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), double* %y.1)
+  %call2 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), double* %y.1)
   ret void
 }
 
@@ -1081,7 +1081,7 @@ entry:
   store double %call, double* %x, align 8
   %cmp2 = fcmp ogt double %call, 0.000000e+00
   %y.1 = select i1 %cmp2, double* %x, double* null
-  %call2 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), double* %y.1)
+  %call2 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), double* %y.1)
   ret void
 }
 
@@ -1111,7 +1111,7 @@ entry:
   store double %call, double* %x, align 8
   %cmp2 = fcmp ogt double %call, 0.000000e+00
   %y.1 = select i1 %cmp2, double* %x, double* null
-  %call2 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), double* %y.1)
+  %call2 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), double* %y.1)
   ret void
 }
 
@@ -1155,7 +1155,7 @@ if.then3:                                         ; preds = %if.else
 
 if.end4:                                          ; preds = %if.else, %if.then3, %if.then
   %y.0 = phi double* [ null, %if.then ], [ %x, %if.then3 ], [ null, %if.else ]
-  %call5 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i64 0, i64 0), double* %y.0)
+  %call5 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i64 0, i64 0), double* %y.0)
   ret void
 }
 
@@ -1200,7 +1200,7 @@ if.then3:                                         ; preds = %if.else
 
 if.end4:                                          ; preds = %if.else, %if.then3, %if.then
   %y.0 = phi double* [ null, %if.then ], [ %x, %if.then3 ], [ null, %if.else ]
-  %call5 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i64 0, i64 0), double* %y.0)
+  %call5 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i64 0, i64 0), double* %y.0)
   ret void
 }
 
@@ -1245,7 +1245,7 @@ if.then3:                                         ; preds = %if.else
 
 if.end4:                                          ; preds = %if.else, %if.then3, %if.then
   %y.0 = phi double* [ null, %if.then ], [ %x, %if.then3 ], [ null, %if.else ]
-  %call5 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i64 0, i64 0), double* %y.0)
+  %call5 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i64 0, i64 0), double* %y.0)
   ret void
 }
 
@@ -1290,7 +1290,7 @@ if.then3:                                         ; preds = %if.else
 
 if.end4:                                          ; preds = %if.else, %if.then3, %if.then
   %y.0 = phi double* [ null, %if.then ], [ %x, %if.then3 ], [ null, %if.else ]
-  %call5 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i64 0, i64 0), double* %y.0)
+  %call5 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i64 0, i64 0), double* %y.0)
   ret void
 }
 
@@ -1319,7 +1319,7 @@ entry:
   %y = getelementptr inbounds %struct.pair, %struct.pair* %c, i32 0, i32 1
   store i32* %y, i32** %b, align 8
   %0 = load i32*, i32** %b, align 8
-  %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i32* %0)
+  %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i32* %0)
   ret void
 }
 
@@ -1349,7 +1349,7 @@ entry:
   %y = getelementptr inbounds %struct.pair, %struct.pair* %c, i32 0, i32 1
   store i32* %y, i32** %b, align 8
   %0 = load i32*, i32** %b, align 8
-  %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i32* %0)
+  %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i32* %0)
   ret void
 }
 
@@ -1379,7 +1379,7 @@ entry:
   %y = getelementptr inbounds %struct.pair, %struct.pair* %c, i32 0, i32 1
   store i32* %y, i32** %b, align 8
   %0 = load i32*, i32** %b, align 8
-  %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i32* %0)
+  %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i32* %0)
   ret void
 }
 
@@ -1409,7 +1409,7 @@ entry:
   %y = getelementptr inbounds %struct.pair, %struct.pair* %c, i32 0, i32 1
   store i32* %y, i32** %b, align 8
   %0 = load i32*, i32** %b, align 8
-  %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i32* %0)
+  %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i32* %0)
   ret void
 }
 
@@ -1437,7 +1437,7 @@ entry:
   %b = alloca i32*, align 8
   %y = getelementptr inbounds %struct.pair, %struct.pair* %c, i32 0, i32 1
   %0 = ptrtoint i32* %y to i64
-  %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i64 %0)
+  %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i64 %0)
   ret void
 }
 
@@ -1466,7 +1466,7 @@ entry:
   %b = alloca i32*, align 8
   %y = getelementptr inbounds %struct.pair, %struct.pair* %c, i32 0, i32 1
   %0 = ptrtoint i32* %y to i64
-  %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i64 %0)
+  %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i64 %0)
   ret void
 }
 
@@ -1494,7 +1494,7 @@ entry:
   %b = alloca i32*, align 8
   %y = getelementptr inbounds %struct.pair, %struct.pair* %c, i32 0, i32 1
   %0 = ptrtoint i32* %y to i64
-  %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i64 %0)
+  %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i64 %0)
   ret void
 }
 
@@ -1523,7 +1523,7 @@ entry:
   %b = alloca i32*, align 8
   %y = getelementptr inbounds %struct.pair, %struct.pair* %c, i32 0, i32 1
   %0 = ptrtoint i32* %y to i64
-  %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i64 %0)
+  %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i64 %0)
   ret void
 }
 
@@ -1549,7 +1549,7 @@ entry:
 ; DARWIN-X64: .cfi_endproc
   %c = alloca %struct.pair, align 4
   %y = getelementptr inbounds %struct.pair, %struct.pair* %c, i64 0, i32 1
-  %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i64 0, i64 0), i32* %y)
+  %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i64 0, i64 0), i32* %y)
   ret void
 }
 
@@ -1576,7 +1576,7 @@ entry:
 ; DARWIN-X64: .cfi_endproc
   %c = alloca %struct.pair, align 4
   %y = getelementptr inbounds %struct.pair, %struct.pair* %c, i64 0, i32 1
-  %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i64 0, i64 0), i32* %y)
+  %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i64 0, i64 0), i32* %y)
   ret void
 }
 
@@ -1603,7 +1603,7 @@ entry:
 ; DARWIN-X64: callq ___stack_chk_fail
   %c = alloca %struct.pair, align 4
   %y = getelementptr inbounds %struct.pair, %struct.pair* %c, i64 0, i32 1
-  %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i64 0, i64 0), i32* %y)
+  %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i64 0, i64 0), i32* %y)
   ret void
 }
 
@@ -1630,7 +1630,7 @@ entry:
 ; DARWIN-X64: callq ___stack_chk_fail
   %c = alloca %struct.pair, align 4
   %y = getelementptr inbounds %struct.pair, %struct.pair* %c, i64 0, i32 1
-  %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i64 0, i64 0), i32* %y)
+  %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i64 0, i64 0), i32* %y)
   ret void
 }
 
@@ -1656,7 +1656,7 @@ entry:
 ; DARWIN-X64: .cfi_endproc
   %a = alloca i32, align 4
   %add.ptr5 = getelementptr inbounds i32, i32* %a, i64 -12
-  %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i64 0, i64 0), i32* %add.ptr5)
+  %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i64 0, i64 0), i32* %add.ptr5)
   ret void
 }
 
@@ -1683,7 +1683,7 @@ entry:
 ; DARWIN-X64: .cfi_endproc
   %a = alloca i32, align 4
   %add.ptr5 = getelementptr inbounds i32, i32* %a, i64 -12
-  %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i64 0, i64 0), i32* %add.ptr5)
+  %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i64 0, i64 0), i32* %add.ptr5)
   ret void
 }
 
@@ -1710,7 +1710,7 @@ entry:
 ; DARWIN-X64: callq ___stack_chk_fail
   %a = alloca i32, align 4
   %add.ptr5 = getelementptr inbounds i32, i32* %a, i64 -12
-  %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i64 0, i64 0), i32* %add.ptr5)
+  %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i64 0, i64 0), i32* %add.ptr5)
   ret void
 }
 
@@ -1737,7 +1737,7 @@ entry:
 ; DARWIN-X64: callq ___stack_chk_fail
   %a = alloca i32, align 4
   %add.ptr5 = getelementptr inbounds i32, i32* %a, i64 -12
-  %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i64 0, i64 0), i32* %add.ptr5)
+  %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i64 0, i64 0), i32* %add.ptr5)
   ret void
 }
 
@@ -1768,7 +1768,7 @@ entry:
   %0 = bitcast i32* %a to float*
   store float* %0, float** %b, align 8
   %1 = load float*, float** %b, align 8
-  %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), float* %1)
+  %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), float* %1)
   ret void
 }
 
@@ -1800,7 +1800,7 @@ entry:
   %0 = bitcast i32* %a to float*
   store float* %0, float** %b, align 8
   %1 = load float*, float** %b, align 8
-  %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), float* %1)
+  %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), float* %1)
   ret void
 }
 
@@ -1832,7 +1832,7 @@ entry:
   %0 = bitcast i32* %a to float*
   store float* %0, float** %b, align 8
   %1 = load float*, float** %b, align 8
-  %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), float* %1)
+  %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), float* %1)
   ret void
 }
 
@@ -1864,7 +1864,7 @@ entry:
   %0 = bitcast i32* %a to float*
   store float* %0, float** %b, align 8
   %1 = load float*, float** %b, align 8
-  %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), float* %1)
+  %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), float* %1)
   ret void
 }
 
@@ -2006,7 +2006,7 @@ entry:
   %c = alloca %struct.vec, align 16
   %y = getelementptr inbounds %struct.vec, %struct.vec* %c, i64 0, i32 0
   %add.ptr = getelementptr inbounds <4 x i32>, <4 x i32>* %y, i64 -12
-  %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i64 0, i64 0), <4 x i32>* %add.ptr)
+  %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i64 0, i64 0), <4 x i32>* %add.ptr)
   ret void
 }
 
@@ -2034,7 +2034,7 @@ entry:
   %c = alloca %struct.vec, align 16
   %y = getelementptr inbounds %struct.vec, %struct.vec* %c, i64 0, i32 0
   %add.ptr = getelementptr inbounds <4 x i32>, <4 x i32>* %y, i64 -12
-  %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i64 0, i64 0), <4 x i32>* %add.ptr)
+  %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i64 0, i64 0), <4 x i32>* %add.ptr)
   ret void
 }
 
@@ -2062,7 +2062,7 @@ entry:
   %c = alloca %struct.vec, align 16
   %y = getelementptr inbounds %struct.vec, %struct.vec* %c, i64 0, i32 0
   %add.ptr = getelementptr inbounds <4 x i32>, <4 x i32>* %y, i64 -12
-  %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i64 0, i64 0), <4 x i32>* %add.ptr)
+  %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i64 0, i64 0), <4 x i32>* %add.ptr)
   ret void
 }
 
@@ -2090,7 +2090,7 @@ entry:
   %c = alloca %struct.vec, align 16
   %y = getelementptr inbounds %struct.vec, %struct.vec* %c, i64 0, i32 0
   %add.ptr = getelementptr inbounds <4 x i32>, <4 x i32>* %y, i64 -12
-  %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i64 0, i64 0), <4 x i32>* %add.ptr)
+  %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i64 0, i64 0), <4 x i32>* %add.ptr)
   ret void
 }
 
@@ -3152,7 +3152,7 @@ entry:
   %b = getelementptr inbounds %struct.nest, %struct.nest* %c, i32 0, i32 1
   %_a = getelementptr inbounds %struct.pair, %struct.pair* %b, i32 0, i32 0
   %0 = load i32, i32* %_a, align 4
-  %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i32 %0)
+  %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i32 %0)
   ret void
 }
 
@@ -3181,7 +3181,7 @@ bb:
 ; DARWIN-X64: mov{{l|q}} ___stack_chk_guard
 ; DARWIN-X64: callq ___stack_chk_fail
   %tmp = alloca %struct.small*, align 8
-  %tmp1 = call i32 (...)* @dummy(%struct.small** %tmp)
+  %tmp1 = call i32 (...) @dummy(%struct.small** %tmp)
   %tmp2 = load %struct.small*, %struct.small** %tmp, align 8
   %tmp3 = ptrtoint %struct.small* %tmp2 to i64
   %tmp4 = trunc i64 %tmp3 to i32
@@ -3209,7 +3209,7 @@ bb17:                                             ; preds = %bb6
 
 bb21:                                             ; preds = %bb6, %bb
   %tmp22 = phi i32 [ %tmp1, %bb ], [ %tmp14, %bb6 ]
-  %tmp23 = call i32 (...)* @dummy(i32 %tmp22)
+  %tmp23 = call i32 (...) @dummy(i32 %tmp22)
   ret i32 undef
 }
 
@@ -3235,7 +3235,7 @@ entry:
 ; DARWIN-X64: .cfi_endproc
   %test = alloca [32 x i8], align 16
   %arraydecay = getelementptr inbounds [32 x i8], [32 x i8]* %test, i32 0, i32 0
-  %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i8* %arraydecay)
+  %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i8* %arraydecay)
   ret i32 %call
 }
 
@@ -3261,7 +3261,7 @@ entry:
 ; DARWIN-X64: callq ___stack_chk_fail
   %test = alloca [33 x i8], align 16
   %arraydecay = getelementptr inbounds [33 x i8], [33 x i8]* %test, i32 0, i32 0
-  %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i8* %arraydecay)
+  %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i8* %arraydecay)
   ret i32 %call
 }
 
@@ -3287,7 +3287,7 @@ entry:
 ; DARWIN-X64: .cfi_endproc
   %test = alloca [4 x i8], align 1
   %arraydecay = getelementptr inbounds [4 x i8], [4 x i8]* %test, i32 0, i32 0
-  %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i8* %arraydecay)
+  %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i8* %arraydecay)
   ret i32 %call
 }
 
@@ -3313,7 +3313,7 @@ entry:
 ; DARWIN-X64: callq ___stack_chk_fail
   %test = alloca [5 x i8], align 1
   %arraydecay = getelementptr inbounds [5 x i8], [5 x i8]* %test, i32 0, i32 0
-  %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i8* %arraydecay)
+  %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i8* %arraydecay)
   ret i32 %call
 }
 
@@ -3347,7 +3347,7 @@ entry:
   %3 = load i64, i64* %2, align 1
   %4 = getelementptr { i64, i8 }, { i64, i8 }* %test.coerce, i32 0, i32 1
   %5 = load i8, i8* %4, align 1
-  %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i64 %3, i8 %5)
+  %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i64 %3, i8 %5)
   ret i32 %call
 }
 
@@ -3381,7 +3381,7 @@ entry:
   %3 = load i64, i64* %2, align 1
   %4 = getelementptr { i64, i8 }, { i64, i8 }* %test.coerce, i32 0, i32 1
   %5 = load i8, i8* %4, align 1
-  %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i64 %3, i8 %5)
+  %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i64 %3, i8 %5)
   ret i32 %call
 }
 
@@ -3410,7 +3410,7 @@ entry:
   %0 = alloca i8, i64 4
   store i8* %0, i8** %test, align 8
   %1 = load i8*, i8** %test, align 8
-  %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i8* %1)
+  %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i8* %1)
   ret i32 %call
 }
 
@@ -3438,7 +3438,7 @@ entry:
   %0 = alloca i8, i64 5
   store i8* %0, i8** %test, align 8
   %1 = load i8*, i8** %test, align 8
-  %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i8* %1)
+  %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i8* %1)
   ret i32 %call
 }
 
diff --git a/test/CodeGen/X86/stackmap-fast-isel.ll b/test/CodeGen/X86/stackmap-fast-isel.ll
index d2155bd..1392e5b 100644
--- a/test/CodeGen/X86/stackmap-fast-isel.ll
+++ b/test/CodeGen/X86/stackmap-fast-isel.ll
@@ -99,7 +99,7 @@
 
 define void @constantargs() {
 entry:
-  tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 1, i32 15, i16 65535, i16 -1, i32 65536, i32 2000000000, i32 2147483647, i32 -1, i32 4294967295, i32 4294967296, i64 2147483648, i64 4294967295, i64 4294967296, i64 -1)
+  tail call void (i64, i32, ...) @llvm.experimental.stackmap(i64 1, i32 15, i16 65535, i16 -1, i32 65536, i32 2000000000, i32 2147483647, i32 -1, i32 4294967295, i32 4294967296, i64 2147483648, i64 4294967295, i64 4294967296, i64 -1)
   ret void
 }
 
@@ -116,7 +116,7 @@ entry:
 ; CHECK-NEXT:   .long   33
 
 define void @liveConstant() {
-  tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 15, i32 5, i32 33)
+  tail call void (i64, i32, ...) @llvm.experimental.stackmap(i64 15, i32 5, i32 33)
   ret void
 }
 
@@ -139,7 +139,7 @@ entry:
   store i64 11, i64* %metadata1
   store i64 12, i64* %metadata1
   store i64 13, i64* %metadata1
-  call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 16, i32 0, i64* %metadata1)
+  call void (i64, i32, ...) @llvm.experimental.stackmap(i64 16, i32 0, i64* %metadata1)
   ret void
 }
 
@@ -155,10 +155,10 @@ entry:
 ; CHECK-LABEL:  .long L{{.*}}-_longid
 define void @longid() {
 entry:
-  tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 4294967295, i32 0)
-  tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 4294967296, i32 0)
-  tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 9223372036854775807, i32 0)
-  tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 -1, i32 0)
+  tail call void (i64, i32, ...) @llvm.experimental.stackmap(i64 4294967295, i32 0)
+  tail call void (i64, i32, ...) @llvm.experimental.stackmap(i64 4294967296, i32 0)
+  tail call void (i64, i32, ...) @llvm.experimental.stackmap(i64 9223372036854775807, i32 0)
+  tail call void (i64, i32, ...) @llvm.experimental.stackmap(i64 -1, i32 0)
   ret void
 }
 
diff --git a/test/CodeGen/X86/stackmap-large-constants.ll b/test/CodeGen/X86/stackmap-large-constants.ll
index 73ee4f3..a38b920 100644
--- a/test/CodeGen/X86/stackmap-large-constants.ll
+++ b/test/CodeGen/X86/stackmap-large-constants.ll
@@ -51,7 +51,7 @@
 declare void @llvm.experimental.stackmap(i64, i32, ...)
 
 define void @foo() {
-  tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 0, i32 0, i64 9223372036854775807)
+  tail call void (i64, i32, ...) @llvm.experimental.stackmap(i64 0, i32 0, i64 9223372036854775807)
   ret void
 }
 
@@ -78,6 +78,6 @@ define void @foo() {
 
 
 define void @bar() {
-  tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 0, i32 0, i64 -9223372036854775808)
+  tail call void (i64, i32, ...) @llvm.experimental.stackmap(i64 0, i32 0, i64 -9223372036854775808)
   ret void
 }
diff --git a/test/CodeGen/X86/stackmap-liveness.ll b/test/CodeGen/X86/stackmap-liveness.ll
index 31553c0..599b626 100644
--- a/test/CodeGen/X86/stackmap-liveness.ll
+++ b/test/CodeGen/X86/stackmap-liveness.ll
@@ -50,7 +50,7 @@ entry:
 ; PATCH-NEXT:   .byte 16
 ; Align
 ; PATCH-NEXT:   .align  3
-  call anyregcc void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 1, i32 12, i8* null, i32 0)
+  call anyregcc void (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.void(i64 1, i32 12, i8* null, i32 0)
   %a2 = call i64 asm sideeffect "", "={r8}"() nounwind
   %a3 = call i8 asm sideeffect "", "={ah}"() nounwind
   %a4 = call <4 x double> asm sideeffect "", "={ymm0}"() nounwind
@@ -97,7 +97,7 @@ entry:
 ; PATCH-NEXT:   .byte 16
 ; Align
 ; PATCH-NEXT:   .align  3
-  call anyregcc void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 2, i32 12, i8* null, i32 0)
+  call anyregcc void (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.void(i64 2, i32 12, i8* null, i32 0)
   call void asm sideeffect "", "{r8},{ah},{ymm0},{ymm1}"(i64 %a2, i8 %a3, <4 x double> %a4, <4 x double> %a5) nounwind
 
 ; StackMap 3 (no liveness information available)
@@ -129,7 +129,7 @@ entry:
 ; PATCH-NEXT:   .byte 16
 ; Align
 ; PATCH-NEXT:   .align  3
-  call anyregcc void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 3, i32 12, i8* null, i32 0)
+  call anyregcc void (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.void(i64 3, i32 12, i8* null, i32 0)
   call void asm sideeffect "", "{xmm2}"(<2 x double> %a1) nounwind
   ret void
 }
@@ -166,8 +166,8 @@ entry:
 ; PATCH-NEXT:   .byte 16
 ; Align
 ; PATCH-NEXT:   .align  3
-  call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 4, i32 5)
-  call anyregcc void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 5, i32 0, i8* null, i32 0)
+  call void (i64, i32, ...) @llvm.experimental.stackmap(i64 4, i32 5)
+  call anyregcc void (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.void(i64 5, i32 0, i8* null, i32 0)
   call void asm sideeffect "", "{xmm2}"(<2 x double> %a1) nounwind
   ret void
 }
diff --git a/test/CodeGen/X86/stackmap-nops.ll b/test/CodeGen/X86/stackmap-nops.ll
index 7932c0d..08fee2e 100644
--- a/test/CodeGen/X86/stackmap-nops.ll
+++ b/test/CodeGen/X86/stackmap-nops.ll
@@ -193,41 +193,41 @@ entry:
 ; CHECK-NEXT: .byte 102
 ; CHECK-NEXT: .byte 102
 ; CHECK-NEXT: nopw %cs:512(%rax,%rax)
-  tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64  0, i32  0)
-  tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64  1, i32  1)
-  tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64  2, i32  2)
-  tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64  3, i32  3)
-  tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64  4, i32  4)
-  tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64  5, i32  5)
-  tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64  6, i32  6)
-  tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64  7, i32  7)
-  tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64  8, i32  8)
-  tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64  9, i32  9)
-  tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 10, i32 10)
-  tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 11, i32 11)
-  tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 12, i32 12)
-  tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 13, i32 13)
-  tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 14, i32 14)
-  tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 15, i32 15)
-  tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 16, i32 16)
-  tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 17, i32 17)
-  tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 18, i32 18)
-  tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 19, i32 19)
-  tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 20, i32 20)
-  tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 21, i32 21)
-  tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 22, i32 22)
-  tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 23, i32 23)
-  tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 24, i32 24)
-  tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 25, i32 25)
-  tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 26, i32 26)
-  tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 27, i32 27)
-  tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 28, i32 28)
-  tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 29, i32 29)
-  tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 30, i32 30)
+  tail call void (i64, i32, ...) @llvm.experimental.stackmap(i64  0, i32  0)
+  tail call void (i64, i32, ...) @llvm.experimental.stackmap(i64  1, i32  1)
+  tail call void (i64, i32, ...) @llvm.experimental.stackmap(i64  2, i32  2)
+  tail call void (i64, i32, ...) @llvm.experimental.stackmap(i64  3, i32  3)
+  tail call void (i64, i32, ...) @llvm.experimental.stackmap(i64  4, i32  4)
+  tail call void (i64, i32, ...) @llvm.experimental.stackmap(i64  5, i32  5)
+  tail call void (i64, i32, ...) @llvm.experimental.stackmap(i64  6, i32  6)
+  tail call void (i64, i32, ...) @llvm.experimental.stackmap(i64  7, i32  7)
+  tail call void (i64, i32, ...) @llvm.experimental.stackmap(i64  8, i32  8)
+  tail call void (i64, i32, ...) @llvm.experimental.stackmap(i64  9, i32  9)
+  tail call void (i64, i32, ...) @llvm.experimental.stackmap(i64 10, i32 10)
+  tail call void (i64, i32, ...) @llvm.experimental.stackmap(i64 11, i32 11)
+  tail call void (i64, i32, ...) @llvm.experimental.stackmap(i64 12, i32 12)
+  tail call void (i64, i32, ...) @llvm.experimental.stackmap(i64 13, i32 13)
+  tail call void (i64, i32, ...) @llvm.experimental.stackmap(i64 14, i32 14)
+  tail call void (i64, i32, ...) @llvm.experimental.stackmap(i64 15, i32 15)
+  tail call void (i64, i32, ...) @llvm.experimental.stackmap(i64 16, i32 16)
+  tail call void (i64, i32, ...) @llvm.experimental.stackmap(i64 17, i32 17)
+  tail call void (i64, i32, ...) @llvm.experimental.stackmap(i64 18, i32 18)
+  tail call void (i64, i32, ...) @llvm.experimental.stackmap(i64 19, i32 19)
+  tail call void (i64, i32, ...) @llvm.experimental.stackmap(i64 20, i32 20)
+  tail call void (i64, i32, ...) @llvm.experimental.stackmap(i64 21, i32 21)
+  tail call void (i64, i32, ...) @llvm.experimental.stackmap(i64 22, i32 22)
+  tail call void (i64, i32, ...) @llvm.experimental.stackmap(i64 23, i32 23)
+  tail call void (i64, i32, ...) @llvm.experimental.stackmap(i64 24, i32 24)
+  tail call void (i64, i32, ...) @llvm.experimental.stackmap(i64 25, i32 25)
+  tail call void (i64, i32, ...) @llvm.experimental.stackmap(i64 26, i32 26)
+  tail call void (i64, i32, ...) @llvm.experimental.stackmap(i64 27, i32 27)
+  tail call void (i64, i32, ...) @llvm.experimental.stackmap(i64 28, i32 28)
+  tail call void (i64, i32, ...) @llvm.experimental.stackmap(i64 29, i32 29)
+  tail call void (i64, i32, ...) @llvm.experimental.stackmap(i64 30, i32 30)
 ; Add an extra stackmap with a zero-length shadow to thwart the shadow
 ; optimization. This will force all 15 bytes of the previous shadow to be
 ; padded with nops.
-  tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 31, i32 0)
+  tail call void (i64, i32, ...) @llvm.experimental.stackmap(i64 31, i32 0)
   ret void
 }
 
diff --git a/test/CodeGen/X86/stackmap-shadow-optimization.ll b/test/CodeGen/X86/stackmap-shadow-optimization.ll
index a3725f2..001d8d9 100644
--- a/test/CodeGen/X86/stackmap-shadow-optimization.ll
+++ b/test/CodeGen/X86/stackmap-shadow-optimization.ll
@@ -18,7 +18,7 @@ entry:
 ; CHECK:        callq   _bar
 ; CHECK-NOT:    nop
   call void @bar()
-  tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 0, i32 8)
+  tail call void (i64, i32, ...) @llvm.experimental.stackmap(i64 0, i32 8)
   call void @bar()
   call void @bar()
   ret void
diff --git a/test/CodeGen/X86/stackmap.ll b/test/CodeGen/X86/stackmap.ll
index fc958ec..0805e81 100644
--- a/test/CodeGen/X86/stackmap.ll
+++ b/test/CodeGen/X86/stackmap.ll
@@ -125,7 +125,7 @@
 define void @constantargs() {
 entry:
   %0 = inttoptr i64 12345 to i8*
-  tail call void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 1, i32 15, i8* %0, i32 0, i16 65535, i16 -1, i32 65536, i32 2000000000, i32 2147483647, i32 -1, i32 4294967295, i32 4294967296, i64 2147483648, i64 4294967295, i64 4294967296, i64 -1)
+  tail call void (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.void(i64 1, i32 15, i8* %0, i32 0, i16 65535, i16 -1, i32 65536, i32 2000000000, i32 2147483647, i32 -1, i32 4294967295, i32 4294967296, i64 2147483648, i64 4294967295, i64 4294967296, i64 -1)
   ret void
 }
 
@@ -147,7 +147,7 @@ entry:
   ; Runtime void->void call.
   call void inttoptr (i64 -559038737 to void ()*)()
   ; Followed by inline OSR patchpoint with 12-byte shadow and 2 live vars.
-  call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 3, i32 12, i64 %a, i64 %b)
+  call void (i64, i32, ...) @llvm.experimental.stackmap(i64 3, i32 12, i64 %a, i64 %b)
   ret void
 }
 
@@ -173,7 +173,7 @@ entry:
 cold:
   ; OSR patchpoint with 12-byte nop-slide and 2 live vars.
   %thunk = inttoptr i64 -559038737 to i8*
-  call void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 4, i32 15, i8* %thunk, i32 0, i64 %a, i64 %b)
+  call void (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.void(i64 4, i32 15, i8* %thunk, i32 0, i64 %a, i64 %b)
   unreachable
 ret:
   ret void
@@ -194,7 +194,7 @@ ret:
 define i64 @propertyRead(i64* %obj) {
 entry:
   %resolveRead = inttoptr i64 -559038737 to i8*
-  %result = call anyregcc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 5, i32 15, i8* %resolveRead, i32 1, i64* %obj)
+  %result = call anyregcc i64 (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.i64(i64 5, i32 15, i8* %resolveRead, i32 1, i64* %obj)
   %add = add i64 %result, 3
   ret i64 %add
 }
@@ -214,7 +214,7 @@ entry:
 define void @propertyWrite(i64 %dummy1, i64* %obj, i64 %dummy2, i64 %a) {
 entry:
   %resolveWrite = inttoptr i64 -559038737 to i8*
-  call anyregcc void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 6, i32 15, i8* %resolveWrite, i32 2, i64* %obj, i64 %a)
+  call anyregcc void (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.void(i64 6, i32 15, i8* %resolveWrite, i32 2, i64* %obj, i64 %a)
   ret void
 }
 
@@ -236,7 +236,7 @@ entry:
 define void @jsVoidCall(i64 %dummy1, i64* %obj, i64 %arg, i64 %l1, i64 %l2) {
 entry:
   %resolveCall = inttoptr i64 -559038737 to i8*
-  call void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 7, i32 15, i8* %resolveCall, i32 2, i64* %obj, i64 %arg, i64 %l1, i64 %l2)
+  call void (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.void(i64 7, i32 15, i8* %resolveCall, i32 2, i64* %obj, i64 %arg, i64 %l1, i64 %l2)
   ret void
 }
 
@@ -258,7 +258,7 @@ entry:
 define i64 @jsIntCall(i64 %dummy1, i64* %obj, i64 %arg, i64 %l1, i64 %l2) {
 entry:
   %resolveCall = inttoptr i64 -559038737 to i8*
-  %result = call i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 8, i32 15, i8* %resolveCall, i32 2, i64* %obj, i64 %arg, i64 %l1, i64 %l2)
+  %result = call i64 (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.i64(i64 8, i32 15, i8* %resolveCall, i32 2, i64* %obj, i64 %arg, i64 %l1, i64 %l2)
   %add = add i64 %result, 3
   ret i64 %add
 }
@@ -278,7 +278,7 @@ entry:
 ; CHECK-NEXT:   .short 6
 define void @spilledValue(i64 %arg0, i64 %arg1, i64 %arg2, i64 %arg3, i64 %arg4, i64 %l0, i64 %l1, i64 %l2, i64 %l3, i64 %l4, i64 %l5, i64 %l6, i64 %l7, i64 %l8, i64 %l9, i64 %l10, i64 %l11, i64 %l12, i64 %l13, i64 %l14, i64 %l15, i64 %l16) {
 entry:
-  call void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 11, i32 15, i8* null, i32 5, i64 %arg0, i64 %arg1, i64 %arg2, i64 %arg3, i64 %arg4, i64 %l0, i64 %l1, i64 %l2, i64 %l3, i64 %l4, i64 %l5, i64 %l6, i64 %l7, i64 %l8, i64 %l9, i64 %l10, i64 %l11, i64 %l12, i64 %l13, i64 %l14, i64 %l15, i64 %l16)
+  call void (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.void(i64 11, i32 15, i8* null, i32 5, i64 %arg0, i64 %arg1, i64 %arg2, i64 %arg3, i64 %arg4, i64 %l0, i64 %l1, i64 %l2, i64 %l3, i64 %l4, i64 %l5, i64 %l6, i64 %l7, i64 %l8, i64 %l9, i64 %l10, i64 %l11, i64 %l12, i64 %l13, i64 %l14, i64 %l15, i64 %l16)
   ret void
 }
 
@@ -297,7 +297,7 @@ entry:
 ; CHECK-NEXT:   .short 6
 define webkit_jscc void @spilledStackMapValue(i64 %l0, i64 %l1, i64 %l2, i64 %l3, i64 %l4, i64 %l5, i64 %l6, i64 %l7, i64 %l8, i64 %l9, i64 %l10, i64 %l11, i64 %l12, i64 %l13, i64 %l14, i64 %l15, i64 %l16) {
 entry:
-  call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 12, i32 15, i64 %l0, i64 %l1, i64 %l2, i64 %l3, i64 %l4, i64 %l5, i64 %l6, i64 %l7, i64 %l8, i64 %l9, i64 %l10, i64 %l11, i64 %l12, i64 %l13, i64 %l14, i64 %l15, i64 %l16)
+  call void (i64, i32, ...) @llvm.experimental.stackmap(i64 12, i32 15, i64 %l0, i64 %l1, i64 %l2, i64 %l3, i64 %l4, i64 %l5, i64 %l6, i64 %l7, i64 %l8, i64 %l9, i64 %l10, i64 %l11, i64 %l12, i64 %l13, i64 %l14, i64 %l15, i64 %l16)
   ret void
 }
 
@@ -333,7 +333,7 @@ bb17:
 
 bb60:
   tail call void asm sideeffect "nop", "~{ax},~{bx},~{cx},~{dx},~{bp},~{si},~{di},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() nounwind
-  tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 13, i32 5, i32 %tmp32)
+  tail call void (i64, i32, ...) @llvm.experimental.stackmap(i64 13, i32 5, i32 %tmp32)
   unreachable
 
 bb61:
@@ -367,7 +367,7 @@ define void @subRegOffset(i16 %arg) {
   %arghi = lshr i16 %v, 8
   %a1 = trunc i16 %arghi to i8
   tail call void asm sideeffect "nop", "~{cx},~{dx},~{bp},~{si},~{di},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() nounwind
-  tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 14, i32 5, i8 %a0, i8 %a1)
+  tail call void (i64, i32, ...) @llvm.experimental.stackmap(i64 14, i32 5, i8 %a0, i8 %a1)
   ret void
 }
 
@@ -384,7 +384,7 @@ define void @subRegOffset(i16 %arg) {
 ; CHECK-NEXT:   .long   33
 
 define void @liveConstant() {
-  tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 15, i32 5, i32 33)
+  tail call void (i64, i32, ...) @llvm.experimental.stackmap(i64 15, i32 5, i32 33)
   ret void
 }
 
@@ -422,10 +422,10 @@ entry:
   store i64 11, i64* %metadata1
   store i64 12, i64* %metadata1
   store i64 13, i64* %metadata1
-  call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 16, i32 0, i64* %metadata1)
+  call void (i64, i32, ...) @llvm.experimental.stackmap(i64 16, i32 0, i64* %metadata1)
   %metadata2 = alloca i8, i32 4, align 8
   %metadata3 = alloca i16, i32 4, align 8
-  call void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 17, i32 5, i8* null, i32 0, i8* %metadata2, i16* %metadata3)
+  call void (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.void(i64 17, i32 5, i8* null, i32 0, i8* %metadata2, i16* %metadata3)
   ret void
 }
 
@@ -441,10 +441,10 @@ entry:
 ; CHECK-LABEL:  .long L{{.*}}-_longid
 define void @longid() {
 entry:
-  tail call void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 4294967295, i32 0, i8* null, i32 0)
-  tail call void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 4294967296, i32 0, i8* null, i32 0)
-  tail call void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 9223372036854775807, i32 0, i8* null, i32 0)
-  tail call void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 -1, i32 0, i8* null, i32 0)
+  tail call void (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.void(i64 4294967295, i32 0, i8* null, i32 0)
+  tail call void (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.void(i64 4294967296, i32 0, i8* null, i32 0)
+  tail call void (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.void(i64 9223372036854775807, i32 0, i8* null, i32 0)
+  tail call void (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.void(i64 -1, i32 0, i8* null, i32 0)
   ret void
 }
 
@@ -462,7 +462,7 @@ entry:
 ; CHECK-NEXT:   .long   -{{[0-9]+}}
 define void @clobberScratch(i32 %a) {
   tail call void asm sideeffect "nop", "~{ax},~{bx},~{cx},~{dx},~{bp},~{si},~{di},~{r8},~{r9},~{r10},~{r12},~{r13},~{r14},~{r15}"() nounwind
-  tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 16, i32 8, i32 %a)
+  tail call void (i64, i32, ...) @llvm.experimental.stackmap(i64 16, i32 8, i32 %a)
   ret void
 }
 
@@ -474,11 +474,11 @@ define void @clobberScratch(i32 %a) {
 ; CHECK-NEXT:   .short 0
 define void @needsStackRealignment() {
   %val = alloca i64, i32 3, align 128
-  tail call void (...)* @escape_values(i64* %val)
+  tail call void (...) @escape_values(i64* %val)
 ; Note: Adding any non-constant to the stackmap would fail because we
 ; expected to be able to address off the frame pointer.  In a realigned
 ; frame, we must use the stack pointer instead.  This is a separate bug.
-  tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 0, i32 0)
+  tail call void (i64, i32, ...) @llvm.experimental.stackmap(i64 0, i32 0)
   ret void
 }
 declare void @escape_values(...)
diff --git a/test/CodeGen/X86/statepoint-allocas.ll b/test/CodeGen/X86/statepoint-allocas.ll
new file mode 100644
index 0000000..eb34a42
--- /dev/null
+++ b/test/CodeGen/X86/statepoint-allocas.ll
@@ -0,0 +1,121 @@
+; RUN: llc < %s | FileCheck %s
+; Check that we can lower a use of an alloca both as a deopt value (where the
+; exact meaning is up to the consumer of the stackmap) and as an explicit spill
+; slot used for GC.  
+
+target datalayout = "e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-pc-linux-gnu"
+
+declare zeroext i1 @return_i1()
+
+; Can we handle an explicit relocation slot (in the form of an alloca) given 
+; to the statepoint?
+define i32 addrspace(1)* @test(i32 addrspace(1)* %ptr) gc "statepoint-example" {
+; CHECK-LABEL: test
+; CHECK: pushq  %rax
+; CHECK: movq   %rdi, (%rsp)
+; CHECK: callq return_i1
+; CHECK: movq   (%rsp), %rax
+; CHECK: popq   %rdx
+; CHECK: retq
+entry:
+  %alloca = alloca i32 addrspace(1)*, align 8
+  store i32 addrspace(1)* %ptr, i32 addrspace(1)** %alloca
+  call i32 (i1 ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_i1f(i1 ()* @return_i1, i32 0, i32 0, i32 0, i32 addrspace(1)** %alloca)
+  %rel = load i32 addrspace(1)*, i32 addrspace(1)** %alloca
+  ret i32 addrspace(1)* %rel
+}
+
+; Can we handle an alloca as a deopt value?  
+define i32 addrspace(1)* @test2(i32 addrspace(1)* %ptr) gc "statepoint-example" {
+; CHECK-LABEL: test2
+; CHECK: pushq  %rax
+; CHECK: movq   %rdi, (%rsp)
+; CHECK: callq return_i1
+; CHECK: xorl   %eax, %eax
+; CHECK: popq   %rdx
+; CHECK: retq
+entry:
+  %alloca = alloca i32 addrspace(1)*, align 8
+  store i32 addrspace(1)* %ptr, i32 addrspace(1)** %alloca
+  call i32 (i1 ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_i1f(i1 ()* @return_i1, i32 0, i32 0, i32 1, i32 addrspace(1)** %alloca)
+  ret i32 addrspace(1)* null
+}
+
+declare i32 @llvm.experimental.gc.statepoint.p0f_i1f(i1 ()*, i32, i32, ...)
+
+
+; CHECK-LABEL: .section .llvm_stackmaps
+; CHECK-NEXT:  __LLVM_StackMaps:
+; Header
+; CHECK-NEXT:   .byte 1
+; CHECK-NEXT:   .byte 0
+; CHECK-NEXT:   .short 0
+; Num Functions
+; CHECK-NEXT:   .long 2
+; Num LargeConstants
+; CHECK-NEXT:   .long 0
+; Num Callsites
+; CHECK-NEXT:   .long 2
+
+; Functions and stack size
+; CHECK-NEXT:   .quad test
+; CHECK-NEXT:   .quad 8
+; CHECK-NEXT:   .quad test2
+; CHECK-NEXT:   .quad 8
+
+; Large Constants
+; Statepoint ID only
+; CHECK: .quad	2882400000
+
+; Callsites
+; The GC one
+; CHECK: .long	.Ltmp1-test
+; CHECK: .short	0
+; CHECK: .short	3
+; SmallConstant (0)
+; CHECK: .byte	4
+; CHECK: .byte	8
+; CHECK: .short	0
+; CHECK: .long	0
+; SmallConstant (0)
+; CHECK: .byte	4
+; CHECK: .byte	8
+; CHECK: .short	0
+; CHECK: .long	0
+; Direct Spill Slot [RSP+0]
+; CHECK: .byte	2
+; CHECK: .byte	8
+; CHECK: .short	7
+; CHECK: .long	0
+; No Padding or LiveOuts
+; CHECK: .short	0
+; CHECK: .short	0
+; CHECK: .align	8
+
+; The Deopt one
+; CHECK: .long	.Ltmp3-test2
+; CHECK: .short	0
+; CHECK: .short	3
+; SmallConstant (0)
+; CHECK: .byte	4
+; CHECK: .byte	8
+; CHECK: .short	0
+; CHECK: .long	0
+; SmallConstant (1)
+; CHECK: .byte	4
+; CHECK: .byte	8
+; CHECK: .short	0
+; CHECK: .long	1
+; Direct Spill Slot [RSP+0]
+; CHECK: .byte	2
+; CHECK: .byte	8
+; CHECK: .short	7
+; CHECK: .long	0
+
+; No Padding or LiveOuts
+; CHECK: .short	0
+; CHECK: .short	0
+; CHECK: .align	8
+
+
diff --git a/test/CodeGen/X86/statepoint-call-lowering.ll b/test/CodeGen/X86/statepoint-call-lowering.ll
index 22049cf..9f6e4f9 100644
--- a/test/CodeGen/X86/statepoint-call-lowering.ll
+++ b/test/CodeGen/X86/statepoint-call-lowering.ll
@@ -20,7 +20,7 @@ define i1 @test_i1_return() gc "statepoint-example" {
 ; CHECK: popq %rdx
 ; CHECK: retq
 entry:
-  %safepoint_token = tail call i32 (i1 ()*, i32, i32, ...)* @llvm.experimental.gc.statepoint.p0f_i1f(i1 ()* @return_i1, i32 0, i32 0, i32 0)
+  %safepoint_token = tail call i32 (i1 ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_i1f(i1 ()* @return_i1, i32 0, i32 0, i32 0)
   %call1 = call zeroext i1 @llvm.experimental.gc.result.i1(i32 %safepoint_token)
   ret i1 %call1
 }
@@ -32,7 +32,7 @@ define i32 @test_i32_return() gc "statepoint-example" {
 ; CHECK: popq %rdx
 ; CHECK: retq
 entry:
-  %safepoint_token = tail call i32 (i32 ()*, i32, i32, ...)* @llvm.experimental.gc.statepoint.p0f_i32f(i32 ()* @return_i32, i32 0, i32 0, i32 0)
+  %safepoint_token = tail call i32 (i32 ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_i32f(i32 ()* @return_i32, i32 0, i32 0, i32 0)
   %call1 = call zeroext i32 @llvm.experimental.gc.result.i32(i32 %safepoint_token)
   ret i32 %call1
 }
@@ -44,7 +44,7 @@ define i32* @test_i32ptr_return() gc "statepoint-example" {
 ; CHECK: popq %rdx
 ; CHECK: retq
 entry:
-  %safepoint_token = tail call i32 (i32* ()*, i32, i32, ...)* @llvm.experimental.gc.statepoint.p0f_p0i32f(i32* ()* @return_i32ptr, i32 0, i32 0, i32 0)
+  %safepoint_token = tail call i32 (i32* ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_p0i32f(i32* ()* @return_i32ptr, i32 0, i32 0, i32 0)
   %call1 = call i32* @llvm.experimental.gc.result.p0i32(i32 %safepoint_token)
   ret i32* %call1
 }
@@ -56,7 +56,7 @@ define float @test_float_return() gc "statepoint-example" {
 ; CHECK: popq %rax
 ; CHECK: retq
 entry:
-  %safepoint_token = tail call i32 (float ()*, i32, i32, ...)* @llvm.experimental.gc.statepoint.p0f_f32f(float ()* @return_float, i32 0, i32 0, i32 0)
+  %safepoint_token = tail call i32 (float ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_f32f(float ()* @return_float, i32 0, i32 0, i32 0)
   %call1 = call float @llvm.experimental.gc.result.f32(i32 %safepoint_token)
   ret float %call1
 }
@@ -70,7 +70,7 @@ define i1 @test_relocate(i32 addrspace(1)* %a) gc "statepoint-example" {
 ; CHECK-NEXT: popq %rdx
 ; CHECK-NEXT: retq
 entry:
-  %safepoint_token = tail call i32 (i1 ()*, i32, i32, ...)* @llvm.experimental.gc.statepoint.p0f_i1f(i1 ()* @return_i1, i32 0, i32 0, i32 0, i32 addrspace(1)* %a)
+  %safepoint_token = tail call i32 (i1 ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_i1f(i1 ()* @return_i1, i32 0, i32 0, i32 0, i32 addrspace(1)* %a)
   %call1 = call i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(i32 %safepoint_token, i32 4, i32 4)
   %call2 = call zeroext i1 @llvm.experimental.gc.result.i1(i32 %safepoint_token)
   ret i1 %call2
@@ -81,7 +81,7 @@ define void @test_void_vararg() gc "statepoint-example" {
 ; Check a statepoint wrapping a *void* returning vararg function works
 ; CHECK: callq varargf
 entry:
-  %safepoint_token = tail call i32 (void (i32, ...)*, i32, i32, ...)* @llvm.experimental.gc.statepoint.p0f_isVoidi32varargf(void (i32, ...)* @varargf, i32 2, i32 0, i32 42, i32 43, i32 0)
+  %safepoint_token = tail call i32 (void (i32, ...)*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidi32varargf(void (i32, ...)* @varargf, i32 2, i32 0, i32 42, i32 43, i32 0)
   ;; if we try to use the result from a statepoint wrapping a
   ;; non-void-returning varargf, we will experience a crash.
   ret void
diff --git a/test/CodeGen/X86/statepoint-forward.ll b/test/CodeGen/X86/statepoint-forward.ll
index 5a1b18a..0b296cf 100644
--- a/test/CodeGen/X86/statepoint-forward.ll
+++ b/test/CodeGen/X86/statepoint-forward.ll
@@ -25,7 +25,7 @@ entry:
   %before = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(1)* %p
   %cmp1 = call i1 @f(i32 addrspace(1)* %before)
   call void @llvm.assume(i1 %cmp1)
-  %safepoint_token = tail call i32 (void ()*, i32, i32, ...)* @llvm.experimental.gc.statepoint.p0f_isVoidf(void ()* @func, i32 0, i32 0, i32 0, i32 addrspace(1)* addrspace(1)* %p)
+  %safepoint_token = tail call i32 (void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(void ()* @func, i32 0, i32 0, i32 0, i32 addrspace(1)* addrspace(1)* %p)
   %pnew = call i32 addrspace(1)* addrspace(1)* @llvm.experimental.gc.relocate.p1p1i32(i32 %safepoint_token, i32 4, i32 4)
   %after = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(1)* %pnew
   %cmp2 = call i1 @f(i32 addrspace(1)* %after)
@@ -44,7 +44,7 @@ entry:
   %cmp1 = call i1 @f(i32 addrspace(1)* %v)
   call void @llvm.assume(i1 %cmp1)
   store i32 addrspace(1)* %v, i32 addrspace(1)* addrspace(1)* %p
-  %safepoint_token = tail call i32 (void ()*, i32, i32, ...)* @llvm.experimental.gc.statepoint.p0f_isVoidf(void ()* @func, i32 0, i32 0, i32 0, i32 addrspace(1)* addrspace(1)* %p)
+  %safepoint_token = tail call i32 (void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(void ()* @func, i32 0, i32 0, i32 0, i32 addrspace(1)* addrspace(1)* %p)
   %pnew = call i32 addrspace(1)* addrspace(1)* @llvm.experimental.gc.relocate.p1p1i32(i32 %safepoint_token, i32 4, i32 4)
   %after = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(1)* %pnew
   %cmp2 = call i1 @f(i32 addrspace(1)* %after)
@@ -72,7 +72,7 @@ entry:
   %before = load i32 addrspace(1)*, i32 addrspace(1)** %p
   %cmp1 = call i1 @f(i32 addrspace(1)* %before)
   call void @llvm.assume(i1 %cmp1)
-  call i32 (void ()*, i32, i32, ...)* @llvm.experimental.gc.statepoint.p0f_isVoidf(void ()* @func, i32 0, i32 0, i32 0)
+  call i32 (void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(void ()* @func, i32 0, i32 0, i32 0)
   %after = load i32 addrspace(1)*, i32 addrspace(1)** %p
   %cmp2 = call i1 @f(i32 addrspace(1)* %after)
   ret i1 %cmp2
@@ -90,7 +90,7 @@ entry:
   %cmp1 = call i1 @f(i32 addrspace(1)* %v)
   call void @llvm.assume(i1 %cmp1)
   store i32 addrspace(1)* %v, i32 addrspace(1)** %p
-  call i32 (void ()*, i32, i32, ...)* @llvm.experimental.gc.statepoint.p0f_isVoidf(void ()* @func, i32 0, i32 0, i32 0)
+  call i32 (void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(void ()* @func, i32 0, i32 0, i32 0)
   %after = load i32 addrspace(1)*, i32 addrspace(1)** %p
   %cmp2 = call i1 @f(i32 addrspace(1)* %after)
   ret i1 %cmp2
diff --git a/test/CodeGen/X86/statepoint-invoke.ll b/test/CodeGen/X86/statepoint-invoke.ll
index 91bf46a..177eb96 100644
--- a/test/CodeGen/X86/statepoint-invoke.ll
+++ b/test/CodeGen/X86/statepoint-invoke.ll
@@ -6,7 +6,9 @@ declare i64 addrspace(1)* @"some_other_call"(i64 addrspace(1)*)
 
 declare i32 @"personality_function"()
 
-define i64 addrspace(1)* @test_result(i64 addrspace(1)* %obj, i64 addrspace(1)* %obj1) {
+define i64 addrspace(1)* @test_result(i64 addrspace(1)* %obj, 
+                                      i64 addrspace(1)* %obj1)
+  gc "statepoint-example" {
 entry:
   ; CHECK: .Ltmp{{[0-9]+}}:
   ; CHECK: callq some_other_call
diff --git a/test/CodeGen/X86/statepoint-stack-usage.ll b/test/CodeGen/X86/statepoint-stack-usage.ll
index 3ecef33..a968c03 100644
--- a/test/CodeGen/X86/statepoint-stack-usage.ll
+++ b/test/CodeGen/X86/statepoint-stack-usage.ll
@@ -8,20 +8,20 @@ target triple = "x86_64-pc-linux-gnu"
 ; of GC arguments differ, niave lowering code would insert loads and 
 ; stores to rearrange items on the stack.  We need to make sure (for
 ; performance) that this doesn't happen.
-define i32 @back_to_back_calls(i32 addrspace(1)* %a, i32 addrspace(1)* %b, i32 addrspace(1)* %c) #1 {
+define i32 @back_to_back_calls(i32 addrspace(1)* %a, i32 addrspace(1)* %b, i32 addrspace(1)* %c) #1 gc "statepoint-example" {
 ; CHECK-LABEL: back_to_back_calls
 ; The exact stores don't matter, but there need to be three stack slots created
 ; CHECK: movq	%rdx, 16(%rsp)
 ; CHECK: movq	%rdi, 8(%rsp)
 ; CHECK: movq	%rsi, (%rsp)
-  %safepoint_token = tail call i32 (void ()*, i32, i32, ...)* @llvm.experimental.gc.statepoint.p0f_isVoidf(void ()* undef, i32 0, i32 0, i32 5, i32 0, i32 -1, i32 0, i32 0, i32 0, i32 addrspace(1)* %a, i32 addrspace(1)* %b, i32 addrspace(1)* %c)
+  %safepoint_token = tail call i32 (void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(void ()* undef, i32 0, i32 0, i32 5, i32 0, i32 -1, i32 0, i32 0, i32 0, i32 addrspace(1)* %a, i32 addrspace(1)* %b, i32 addrspace(1)* %c)
   %a1 = tail call coldcc i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(i32 %safepoint_token, i32 9, i32 9)
   %b1 = tail call coldcc i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(i32 %safepoint_token, i32 9, i32 10)
   %c1 = tail call coldcc i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(i32 %safepoint_token, i32 9, i32 11)
 ; CHECK: callq
 ; This is the key check.  There should NOT be any memory moves here
 ; CHECK-NOT: movq
-  %safepoint_token2 = tail call i32 (void ()*, i32, i32, ...)* @llvm.experimental.gc.statepoint.p0f_isVoidf(void ()* undef, i32 0, i32 0, i32 5, i32 0, i32 -1, i32 0, i32 0, i32 0, i32 addrspace(1)* %c1, i32 addrspace(1)* %b1, i32 addrspace(1)* %a1)
+  %safepoint_token2 = tail call i32 (void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(void ()* undef, i32 0, i32 0, i32 5, i32 0, i32 -1, i32 0, i32 0, i32 0, i32 addrspace(1)* %c1, i32 addrspace(1)* %b1, i32 addrspace(1)* %a1)
   %a2 = tail call coldcc i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(i32 %safepoint_token2, i32 9, i32 11)
   %b2 = tail call coldcc i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(i32 %safepoint_token2, i32 9, i32 10)
   %c2 = tail call coldcc i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(i32 %safepoint_token2, i32 9, i32 9)
@@ -31,20 +31,20 @@ define i32 @back_to_back_calls(i32 addrspace(1)* %a, i32 addrspace(1)* %b, i32 a
 
 ; This test simply checks that minor changes in vm state don't prevent slots
 ; being reused for gc values.  
-define i32 @reserve_first(i32 addrspace(1)* %a, i32 addrspace(1)* %b, i32 addrspace(1)* %c) #1 {
+define i32 @reserve_first(i32 addrspace(1)* %a, i32 addrspace(1)* %b, i32 addrspace(1)* %c) #1 gc "statepoint-example" {
 ; CHECK-LABEL: reserve_first
 ; The exact stores don't matter, but there need to be three stack slots created
 ; CHECK: movq	%rdx, 16(%rsp)
 ; CHECK: movq	%rdi, 8(%rsp)
 ; CHECK: movq	%rsi, (%rsp)
-  %safepoint_token = tail call i32 (void ()*, i32, i32, ...)* @llvm.experimental.gc.statepoint.p0f_isVoidf(void ()* undef, i32 0, i32 0, i32 5, i32 0, i32 -1, i32 0, i32 0, i32 0, i32 addrspace(1)* %a, i32 addrspace(1)* %b, i32 addrspace(1)* %c)
+  %safepoint_token = tail call i32 (void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(void ()* undef, i32 0, i32 0, i32 5, i32 0, i32 -1, i32 0, i32 0, i32 0, i32 addrspace(1)* %a, i32 addrspace(1)* %b, i32 addrspace(1)* %c)
   %a1 = tail call coldcc i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(i32 %safepoint_token, i32 9, i32 9)
   %b1 = tail call coldcc i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(i32 %safepoint_token, i32 9, i32 10)
   %c1 = tail call coldcc i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(i32 %safepoint_token, i32 9, i32 11)
 ; CHECK: callq
 ; This is the key check.  There should NOT be any memory moves here
 ; CHECK-NOT: movq
-  %safepoint_token2 = tail call i32 (void ()*, i32, i32, ...)* @llvm.experimental.gc.statepoint.p0f_isVoidf(void ()* undef, i32 0, i32 0, i32 5, i32 addrspace(1)* %a1, i32 0, i32 addrspace(1)* %c1, i32 0, i32 0, i32 addrspace(1)* %c1, i32 addrspace(1)* %b1, i32 addrspace(1)* %a1)
+  %safepoint_token2 = tail call i32 (void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(void ()* undef, i32 0, i32 0, i32 5, i32 addrspace(1)* %a1, i32 0, i32 addrspace(1)* %c1, i32 0, i32 0, i32 addrspace(1)* %c1, i32 addrspace(1)* %b1, i32 addrspace(1)* %a1)
   %a2 = tail call coldcc i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(i32 %safepoint_token2, i32 9, i32 11)
   %b2 = tail call coldcc i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(i32 %safepoint_token2, i32 9, i32 10)
   %c2 = tail call coldcc i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(i32 %safepoint_token2, i32 9, i32 9)
diff --git a/test/CodeGen/X86/statepoint-stackmap-format.ll b/test/CodeGen/X86/statepoint-stackmap-format.ll
index e452a63..9593c40 100644
--- a/test/CodeGen/X86/statepoint-stackmap-format.ll
+++ b/test/CodeGen/X86/statepoint-stackmap-format.ll
@@ -21,7 +21,7 @@ define i1 @test(i32 addrspace(1)* %ptr) gc "statepoint-example" {
 entry:
   %metadata1 = alloca i32 addrspace(1)*, i32 2, align 8
   store i32 addrspace(1)* null, i32 addrspace(1)** %metadata1
-  %safepoint_token = tail call i32 (i1 ()*, i32, i32, ...)* @llvm.experimental.gc.statepoint.p0f_i1f(i1 ()* @return_i1, i32 0, i32 0, i32 2, i32 addrspace(1)* %ptr, i32 addrspace(1)* null, i32 addrspace(1)* %ptr, i32 addrspace(1)* null)
+  %safepoint_token = tail call i32 (i1 ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_i1f(i1 ()* @return_i1, i32 0, i32 0, i32 2, i32 addrspace(1)* %ptr, i32 addrspace(1)* null, i32 addrspace(1)* %ptr, i32 addrspace(1)* null)
   %call1 = call zeroext i1 @llvm.experimental.gc.result.i1(i32 %safepoint_token)
   %a = call i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(i32 %safepoint_token, i32 6, i32 6)
   %b = call i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(i32 %safepoint_token, i32 7, i32 7)
diff --git a/test/CodeGen/X86/sub-with-overflow.ll b/test/CodeGen/X86/sub-with-overflow.ll
index 34f4066..fa00d6f 100644
--- a/test/CodeGen/X86/sub-with-overflow.ll
+++ b/test/CodeGen/X86/sub-with-overflow.ll
@@ -11,11 +11,11 @@ entry:
   br i1 %obit, label %overflow, label %normal
 
 normal:
-  %t1 = tail call i32 (i8*, ...)* @printf( i8* getelementptr ([4 x i8], [4 x i8]* @ok, i32 0, i32 0), i32 %sum ) nounwind
+  %t1 = tail call i32 (i8*, ...) @printf( i8* getelementptr ([4 x i8], [4 x i8]* @ok, i32 0, i32 0), i32 %sum ) nounwind
   ret i1 true
 
 overflow:
-  %t2 = tail call i32 (i8*, ...)* @printf( i8* getelementptr ([4 x i8], [4 x i8]* @no, i32 0, i32 0) ) nounwind
+  %t2 = tail call i32 (i8*, ...) @printf( i8* getelementptr ([4 x i8], [4 x i8]* @no, i32 0, i32 0) ) nounwind
   ret i1 false
 
 ; CHECK-LABEL: func1:
@@ -31,11 +31,11 @@ entry:
   br i1 %obit, label %carry, label %normal
 
 normal:
-  %t1 = tail call i32 (i8*, ...)* @printf( i8* getelementptr ([4 x i8], [4 x i8]* @ok, i32 0, i32 0), i32 %sum ) nounwind
+  %t1 = tail call i32 (i8*, ...) @printf( i8* getelementptr ([4 x i8], [4 x i8]* @ok, i32 0, i32 0), i32 %sum ) nounwind
   ret i1 true
 
 carry:
-  %t2 = tail call i32 (i8*, ...)* @printf( i8* getelementptr ([4 x i8], [4 x i8]* @no, i32 0, i32 0) ) nounwind
+  %t2 = tail call i32 (i8*, ...) @printf( i8* getelementptr ([4 x i8], [4 x i8]* @no, i32 0, i32 0) ) nounwind
   ret i1 false
 
 ; CHECK-LABEL: func2:
diff --git a/test/CodeGen/X86/switch-crit-edge-constant.ll b/test/CodeGen/X86/switch-crit-edge-constant.ll
index b44385c..e9a208d 100644
--- a/test/CodeGen/X86/switch-crit-edge-constant.ll
+++ b/test/CodeGen/X86/switch-crit-edge-constant.ll
@@ -35,19 +35,19 @@ cond_true:		; preds = %bb2
 
 blahaha:		; preds = %cond_true, %bb2, %entry, %entry, %entry, %entry, %entry, %entry, %entry, %entry, %entry, %entry
 	%s.0 = phi i8* [ getelementptr ([8 x i8], [8 x i8]* @str, i32 0, i64 0), %cond_true ], [ getelementptr ([5 x i8], [5 x i8]* @str1, i32 0, i64 0), %entry ], [ getelementptr ([5 x i8], [5 x i8]* @str1, i32 0, i64 0), %entry ], [ getelementptr ([5 x i8], [5 x i8]* @str1, i32 0, i64 0), %entry ], [ getelementptr ([5 x i8], [5 x i8]* @str1, i32 0, i64 0), %entry ], [ getelementptr ([5 x i8], [5 x i8]* @str1, i32 0, i64 0), %entry ], [ getelementptr ([5 x i8], [5 x i8]* @str1, i32 0, i64 0), %entry ], [ getelementptr ([5 x i8], [5 x i8]* @str1, i32 0, i64 0), %entry ], [ getelementptr ([5 x i8], [5 x i8]* @str1, i32 0, i64 0), %entry ], [ getelementptr ([5 x i8], [5 x i8]* @str1, i32 0, i64 0), %entry ], [ getelementptr ([5 x i8], [5 x i8]* @str1, i32 0, i64 0), %entry ], [ getelementptr ([5 x i8], [5 x i8]* @str2, i32 0, i64 0), %bb2 ]		; <i8*> [#uses=13]
-	%tmp8 = tail call i32 (i8*, ...)* @printf( i8* %s.0 )		; <i32> [#uses=0]
-	%tmp10 = tail call i32 (i8*, ...)* @printf( i8* %s.0 )		; <i32> [#uses=0]
-	%tmp12 = tail call i32 (i8*, ...)* @printf( i8* %s.0 )		; <i32> [#uses=0]
-	%tmp14 = tail call i32 (i8*, ...)* @printf( i8* %s.0 )		; <i32> [#uses=0]
-	%tmp16 = tail call i32 (i8*, ...)* @printf( i8* %s.0 )		; <i32> [#uses=0]
-	%tmp18 = tail call i32 (i8*, ...)* @printf( i8* %s.0 )		; <i32> [#uses=0]
-	%tmp20 = tail call i32 (i8*, ...)* @printf( i8* %s.0 )		; <i32> [#uses=0]
-	%tmp22 = tail call i32 (i8*, ...)* @printf( i8* %s.0 )		; <i32> [#uses=0]
-	%tmp24 = tail call i32 (i8*, ...)* @printf( i8* %s.0 )		; <i32> [#uses=0]
-	%tmp26 = tail call i32 (i8*, ...)* @printf( i8* %s.0 )		; <i32> [#uses=0]
-	%tmp28 = tail call i32 (i8*, ...)* @printf( i8* %s.0 )		; <i32> [#uses=0]
-	%tmp30 = tail call i32 (i8*, ...)* @printf( i8* %s.0 )		; <i32> [#uses=0]
-	%tmp32 = tail call i32 (i8*, ...)* @printf( i8* %s.0 )		; <i32> [#uses=0]
+	%tmp8 = tail call i32 (i8*, ...) @printf( i8* %s.0 )		; <i32> [#uses=0]
+	%tmp10 = tail call i32 (i8*, ...) @printf( i8* %s.0 )		; <i32> [#uses=0]
+	%tmp12 = tail call i32 (i8*, ...) @printf( i8* %s.0 )		; <i32> [#uses=0]
+	%tmp14 = tail call i32 (i8*, ...) @printf( i8* %s.0 )		; <i32> [#uses=0]
+	%tmp16 = tail call i32 (i8*, ...) @printf( i8* %s.0 )		; <i32> [#uses=0]
+	%tmp18 = tail call i32 (i8*, ...) @printf( i8* %s.0 )		; <i32> [#uses=0]
+	%tmp20 = tail call i32 (i8*, ...) @printf( i8* %s.0 )		; <i32> [#uses=0]
+	%tmp22 = tail call i32 (i8*, ...) @printf( i8* %s.0 )		; <i32> [#uses=0]
+	%tmp24 = tail call i32 (i8*, ...) @printf( i8* %s.0 )		; <i32> [#uses=0]
+	%tmp26 = tail call i32 (i8*, ...) @printf( i8* %s.0 )		; <i32> [#uses=0]
+	%tmp28 = tail call i32 (i8*, ...) @printf( i8* %s.0 )		; <i32> [#uses=0]
+	%tmp30 = tail call i32 (i8*, ...) @printf( i8* %s.0 )		; <i32> [#uses=0]
+	%tmp32 = tail call i32 (i8*, ...) @printf( i8* %s.0 )		; <i32> [#uses=0]
 	ret void
 }
 
diff --git a/test/CodeGen/X86/switch-or.ll b/test/CodeGen/X86/switch-or.ll
index 75832c7..6e6b013 100644
--- a/test/CodeGen/X86/switch-or.ll
+++ b/test/CodeGen/X86/switch-or.ll
@@ -12,7 +12,7 @@ entry:
   ]
 
 if.then:
-  %call = tail call i32 (...)* @bar() nounwind
+  %call = tail call i32 (...) @bar() nounwind
   ret void
 
 if.end:
diff --git a/test/CodeGen/X86/tail-call-win64.ll b/test/CodeGen/X86/tail-call-win64.ll
index 8811b75..fb10d5d 100644
--- a/test/CodeGen/X86/tail-call-win64.ll
+++ b/test/CodeGen/X86/tail-call-win64.ll
@@ -4,7 +4,7 @@
 ; in-function jumps from function exiting jumps.
 
 define void @tail_jmp_reg(i32, i32, void ()* %fptr) {
-  tail call void ()* %fptr()
+  tail call void () %fptr()
   ret void
 }
 
@@ -28,7 +28,7 @@ define void @tail_jmp_imm() {
 
 define void @tail_jmp_mem() {
   %fptr = load void ()*, void ()** @g_fptr
-  tail call void ()* %fptr()
+  tail call void () %fptr()
   ret void
 }
 
diff --git a/test/CodeGen/X86/tailcall-64.ll b/test/CodeGen/X86/tailcall-64.ll
index f4d51c2..9e054fe 100644
--- a/test/CodeGen/X86/tailcall-64.ll
+++ b/test/CodeGen/X86/tailcall-64.ll
@@ -215,7 +215,7 @@ entry:
   %idxprom = sext i32 %n to i64
   %arrayidx = getelementptr inbounds [0 x i32 (i8*, ...)*], [0 x i32 (i8*, ...)*]* @funcs, i64 0, i64 %idxprom
   %0 = load i32 (i8*, ...)*, i32 (i8*, ...)** %arrayidx, align 8
-  %call = tail call i32 (i8*, ...)* %0(i8* null, i32 0, i32 0, i32 0, i32 0, i32 0) nounwind
+  %call = tail call i32 (i8*, ...) %0(i8* null, i32 0, i32 0, i32 0, i32 0, i32 0) nounwind
   ret i32 %call
 }
 
diff --git a/test/CodeGen/X86/tailcall-fastisel.ll b/test/CodeGen/X86/tailcall-fastisel.ll
index 4e1fc43..f69e75c 100644
--- a/test/CodeGen/X86/tailcall-fastisel.ll
+++ b/test/CodeGen/X86/tailcall-fastisel.ll
@@ -11,7 +11,7 @@ fail:                                             ; preds = %entry
 
 define i32 @foo() nounwind {
 entry:
- %0 = tail call i32 (...)* @bar() nounwind       ; <i32> [#uses=1]
+ %0 = tail call i32 (...) @bar() nounwind       ; <i32> [#uses=1]
  ret i32 %0
 }
 
diff --git a/test/CodeGen/X86/tailcall-mem-intrinsics.ll b/test/CodeGen/X86/tailcall-mem-intrinsics.ll
new file mode 100644
index 0000000..0e0ab5c
--- /dev/null
+++ b/test/CodeGen/X86/tailcall-mem-intrinsics.ll
@@ -0,0 +1,31 @@
+; RUN: llc -mtriple=x86_64-unknown-unknown < %s | FileCheck %s
+
+; CHECK-LABEL: tail_memcpy
+; CHECK: jmp memcpy
+define void @tail_memcpy(i8* nocapture %p, i8* nocapture readonly %q, i32 %n) #0 {
+entry:
+  tail call void @llvm.memcpy.p0i8.p0i8.i32(i8* %p, i8* %q, i32 %n, i32 1, i1 false)
+  ret void
+}
+
+; CHECK-LABEL: tail_memset
+; CHECK; jmp memmove
+define void @tail_memmove(i8* nocapture %p, i8* nocapture readonly %q, i32 %n) #0 {
+entry:
+  tail call void @llvm.memmove.p0i8.p0i8.i32(i8* %p, i8* %q, i32 %n, i32 1, i1 false)
+  ret void
+}
+
+; CHECK-LABEL: tail_memset
+; CHECK: jmp memset
+define void @tail_memset(i8* nocapture %p, i8 %c, i32 %n) #0 {
+entry:
+  tail call void @llvm.memset.p0i8.i32(i8* %p, i8 %c, i32 %n, i32 1, i1 false)
+  ret void
+}
+
+declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture readonly, i32, i32, i1) #0
+declare void @llvm.memmove.p0i8.p0i8.i32(i8* nocapture, i8* nocapture readonly, i32, i32, i1) #0
+declare void @llvm.memset.p0i8.i32(i8* nocapture, i8, i32, i32, i1) #0
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/X86/twoaddr-coalesce.ll b/test/CodeGen/X86/twoaddr-coalesce.ll
index d3498a4..38685ec 100644
--- a/test/CodeGen/X86/twoaddr-coalesce.ll
+++ b/test/CodeGen/X86/twoaddr-coalesce.ll
@@ -12,7 +12,7 @@ bb1:		; preds = %bb1, %bb1.thread
 	%0 = trunc i32 %i.0.reg2mem.0 to i8		; <i8> [#uses=1]
 	%1 = sdiv i8 %0, 2		; <i8> [#uses=1]
 	%2 = sext i8 %1 to i32		; <i32> [#uses=1]
-	%3 = tail call i32 (i8*, ...)* @printf(i8* getelementptr ([4 x i8], [4 x i8]* @"\01LC", i32 0, i32 0), i32 %2) nounwind		; <i32> [#uses=0]
+	%3 = tail call i32 (i8*, ...) @printf(i8* getelementptr ([4 x i8], [4 x i8]* @"\01LC", i32 0, i32 0), i32 %2) nounwind		; <i32> [#uses=0]
 	%indvar.next = add i32 %i.0.reg2mem.0, 1		; <i32> [#uses=2]
 	%exitcond = icmp eq i32 %indvar.next, 258		; <i1> [#uses=1]
 	br i1 %exitcond, label %bb2, label %bb1
diff --git a/test/CodeGen/X86/uint64-to-float.ll b/test/CodeGen/X86/uint64-to-float.ll
index ca764e7..a1074a6 100644
--- a/test/CodeGen/X86/uint64-to-float.ll
+++ b/test/CodeGen/X86/uint64-to-float.ll
@@ -6,13 +6,13 @@
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
 target triple = "x86_64-apple-darwin10.0.0"
 
-; CHECK: testq %rdi, %rdi
+; CHECK: andl
+; CHECK-NEXT: testq %rdi, %rdi
 ; CHECK-NEXT: js LBB0_1
 ; CHECK: cvtsi2ss
 ; CHECK-NEXT: ret
 ; CHECK: LBB0_1
 ; CHECK: shrq
-; CHECK-NEXT: andq
 ; CHECK-NEXT: orq
 ; CHECK-NEXT: cvtsi2ss
 define float @test(i64 %a) {
diff --git a/test/CodeGen/X86/umul-with-carry.ll b/test/CodeGen/X86/umul-with-carry.ll
index c930c16..6435760 100644
--- a/test/CodeGen/X86/umul-with-carry.ll
+++ b/test/CodeGen/X86/umul-with-carry.ll
@@ -14,11 +14,11 @@ entry:
   br i1 %obit, label %carry, label %normal
 
 normal:
-  %t1 = tail call i32 (i8*, ...)* @printf( i8* getelementptr ([4 x i8], [4 x i8]* @ok, i32 0, i32 0), i32 %sum ) nounwind
+  %t1 = tail call i32 (i8*, ...) @printf( i8* getelementptr ([4 x i8], [4 x i8]* @ok, i32 0, i32 0), i32 %sum ) nounwind
   ret i1 true
 
 carry:
-  %t2 = tail call i32 (i8*, ...)* @printf( i8* getelementptr ([4 x i8], [4 x i8]* @no, i32 0, i32 0) ) nounwind
+  %t2 = tail call i32 (i8*, ...) @printf( i8* getelementptr ([4 x i8], [4 x i8]* @no, i32 0, i32 0) ) nounwind
   ret i1 false
 }
 
diff --git a/test/CodeGen/X86/unknown-location.ll b/test/CodeGen/X86/unknown-location.ll
index 47d0811..2ed1acf 100644
--- a/test/CodeGen/X86/unknown-location.ll
+++ b/test/CodeGen/X86/unknown-location.ll
@@ -32,5 +32,5 @@ entry:
 !8 = !MDLocation(line: 4, column: 3, scope: !7)
 !9 = !{!1}
 !10 = !MDFile(filename: "test.c", directory: "/dir")
-!11 = !{i32 0}
+!11 = !{}
 !12 = !{i32 1, !"Debug Info Version", i32 3}
diff --git a/test/CodeGen/X86/utf16-cfstrings.ll b/test/CodeGen/X86/utf16-cfstrings.ll
index b49eecf..5f0e78f 100644
--- a/test/CodeGen/X86/utf16-cfstrings.ll
+++ b/test/CodeGen/X86/utf16-cfstrings.ll
@@ -21,7 +21,7 @@ define i32 @main() uwtable ssp {
 entry:
   %retval = alloca i32, align 4
   store i32 0, i32* %retval
-  call void (%0*, ...)* @NSLog(%0* bitcast (%struct.NSConstantString* @_unnamed_cfstring_ to %0*))
+  call void (%0*, ...) @NSLog(%0* bitcast (%struct.NSConstantString* @_unnamed_cfstring_ to %0*))
   ret i32 0
 }
 
diff --git a/test/CodeGen/X86/vararg-callee-cleanup.ll b/test/CodeGen/X86/vararg-callee-cleanup.ll
index 2dcf319..bb1104d 100644
--- a/test/CodeGen/X86/vararg-callee-cleanup.ll
+++ b/test/CodeGen/X86/vararg-callee-cleanup.ll
@@ -4,8 +4,8 @@ target datalayout = "e-m:w-p:32:32-i64:64-f80:32-n8:16:32-S32"
 
 declare x86_thiscallcc void @thiscall_thunk(i8* %this, ...)
 define i32 @call_varargs_thiscall_thunk(i8* %a, i32 %b, i32 %c, i32 %d) {
-  call x86_thiscallcc void (i8*, ...)* @thiscall_thunk(i8* %a, i32 1, i32 2)
-  call x86_thiscallcc void (i8*, ...)* @thiscall_thunk(i8* %a, i32 1, i32 2)
+  call x86_thiscallcc void (i8*, ...) @thiscall_thunk(i8* %a, i32 1, i32 2)
+  call x86_thiscallcc void (i8*, ...) @thiscall_thunk(i8* %a, i32 1, i32 2)
   %t1 = add i32 %b, %c
   %r = add i32 %t1, %d
   ret i32 %r
@@ -19,8 +19,8 @@ define i32 @call_varargs_thiscall_thunk(i8* %a, i32 %b, i32 %c, i32 %d) {
 
 declare x86_stdcallcc void @stdcall_thunk(i8* %this, ...)
 define i32 @call_varargs_stdcall_thunk(i8* %a, i32 %b, i32 %c, i32 %d) {
-  call x86_stdcallcc void (i8*, ...)* @stdcall_thunk(i8* %a, i32 1, i32 2)
-  call x86_stdcallcc void (i8*, ...)* @stdcall_thunk(i8* %a, i32 1, i32 2)
+  call x86_stdcallcc void (i8*, ...) @stdcall_thunk(i8* %a, i32 1, i32 2)
+  call x86_stdcallcc void (i8*, ...) @stdcall_thunk(i8* %a, i32 1, i32 2)
   %t1 = add i32 %b, %c
   %r = add i32 %t1, %d
   ret i32 %r
@@ -32,8 +32,8 @@ define i32 @call_varargs_stdcall_thunk(i8* %a, i32 %b, i32 %c, i32 %d) {
 
 declare x86_fastcallcc void @fastcall_thunk(i8* %this, ...)
 define i32 @call_varargs_fastcall_thunk(i8* %a, i32 %b, i32 %c, i32 %d) {
-  call x86_fastcallcc void (i8*, ...)* @fastcall_thunk(i8* inreg %a, i32 inreg 1, i32 2)
-  call x86_fastcallcc void (i8*, ...)* @fastcall_thunk(i8* inreg %a, i32 inreg 1, i32 2)
+  call x86_fastcallcc void (i8*, ...) @fastcall_thunk(i8* inreg %a, i32 inreg 1, i32 2)
+  call x86_fastcallcc void (i8*, ...) @fastcall_thunk(i8* inreg %a, i32 inreg 1, i32 2)
   %t1 = add i32 %b, %c
   %r = add i32 %t1, %d
   ret i32 %r
diff --git a/test/CodeGen/X86/vararg_tailcall.ll b/test/CodeGen/X86/vararg_tailcall.ll
index 9b76bdd..98aa4a8 100644
--- a/test/CodeGen/X86/vararg_tailcall.ll
+++ b/test/CodeGen/X86/vararg_tailcall.ll
@@ -15,7 +15,7 @@
 ; WIN64: callq
 define void @foo(i64 %arg) nounwind optsize ssp noredzone {
 entry:
-  %call = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([5 x i8], [5 x i8]* @.str, i64 0, i64 0), i64 %arg) nounwind optsize noredzone
+  %call = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([5 x i8], [5 x i8]* @.str, i64 0, i64 0), i64 %arg) nounwind optsize noredzone
   ret void
 }
 
@@ -40,7 +40,7 @@ declare void @bar2(i8*, i64) optsize noredzone
 define i8* @foo2(i8* %arg) nounwind optsize ssp noredzone {
 entry:
   %tmp1 = load i8*, i8** @sel, align 8
-  %call = tail call i8* (i8*, i8*, ...)* @x2(i8* %arg, i8* %tmp1) nounwind optsize noredzone
+  %call = tail call i8* (i8*, i8*, ...) @x2(i8* %arg, i8* %tmp1) nounwind optsize noredzone
   ret i8* %call
 }
 
@@ -56,7 +56,7 @@ entry:
   %tmp3 = load i8*, i8** @sel4, align 8
   %tmp4 = load i8*, i8** @sel5, align 8
   %tmp5 = load i8*, i8** @sel6, align 8
-  %call = tail call i8* (i8*, i8*, i8*, ...)* @x3(i8* %arg1, i8* %arg2, i8* %tmp2, i8* %tmp3, i8* %tmp4, i8* %tmp5) nounwind optsize noredzone
+  %call = tail call i8* (i8*, i8*, i8*, ...) @x3(i8* %arg1, i8* %arg2, i8* %tmp2, i8* %tmp3, i8* %tmp4, i8* %tmp5) nounwind optsize noredzone
   ret i8* %call
 }
 
@@ -73,7 +73,7 @@ entry:
   %tmp4 = load i8*, i8** @sel5, align 8
   %tmp5 = load i8*, i8** @sel6, align 8
   %tmp6 = load i8*, i8** @sel7, align 8
-  %call = tail call i8* (i8*, i8*, i8*, i8*, i8*, i8*, i8*, ...)* @x7(i8* %arg1, i8* %arg2, i8* %tmp2, i8* %tmp3, i8* %tmp4, i8* %tmp5, i8* %tmp6) nounwind optsize noredzone
+  %call = tail call i8* (i8*, i8*, i8*, i8*, i8*, i8*, i8*, ...) @x7(i8* %arg1, i8* %arg2, i8* %tmp2, i8* %tmp3, i8* %tmp4, i8* %tmp5, i8* %tmp6) nounwind optsize noredzone
   ret i8* %call
 }
 
@@ -89,6 +89,6 @@ entry:
   %tmp3 = load i8*, i8** @sel4, align 8
   %tmp4 = load i8*, i8** @sel5, align 8
   %tmp5 = load i8*, i8** @sel6, align 8
-  %call = tail call i8* (i8*, i8*, i8*, ...)* @x3(i8* %arg1, i8* %arg2, i8* %tmp2, i8* %tmp3, i8* %tmp4, i8* %tmp5, i32 48879, i32 48879) nounwind optsize noredzone
+  %call = tail call i8* (i8*, i8*, i8*, ...) @x3(i8* %arg1, i8* %arg2, i8* %tmp2, i8* %tmp3, i8* %tmp4, i8* %tmp5, i32 48879, i32 48879) nounwind optsize noredzone
   ret i8* %call
 }
diff --git a/test/CodeGen/X86/variadic-node-pic.ll b/test/CodeGen/X86/variadic-node-pic.ll
index 1182a30..704459e 100644
--- a/test/CodeGen/X86/variadic-node-pic.ll
+++ b/test/CodeGen/X86/variadic-node-pic.ll
@@ -6,6 +6,6 @@ target triple = "x86_64-apple-darwin8"
 declare void @xscanf(i64) nounwind 
 
 define void @foo() nounwind  {
-	call void (i64)* @xscanf( i64 0 ) nounwind
+	call void (i64) @xscanf( i64 0 ) nounwind
 	unreachable
 }
diff --git a/test/CodeGen/X86/vec_cast2.ll b/test/CodeGen/X86/vec_cast2.ll
index 07cd195..e507895 100644
--- a/test/CodeGen/X86/vec_cast2.ll
+++ b/test/CodeGen/X86/vec_cast2.ll
@@ -100,37 +100,29 @@ define <8 x i8> @foo3_8(<8 x float> %src) {
 ;
 ; CHECK-WIDE-LABEL: foo3_8:
 ; CHECK-WIDE:       ## BB#0:
-; CHECK-WIDE-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
-; CHECK-WIDE-NEXT:    vcvttss2si %xmm1, %eax
-; CHECK-WIDE-NEXT:    shll $8, %eax
-; CHECK-WIDE-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; CHECK-WIDE-NEXT:    vcvttss2si %xmm1, %ecx
-; CHECK-WIDE-NEXT:    movzbl %cl, %ecx
-; CHECK-WIDE-NEXT:    orl %eax, %ecx
-; CHECK-WIDE-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-WIDE-NEXT:    vcvttss2si %xmm1, %eax
-; CHECK-WIDE-NEXT:    shll $8, %eax
-; CHECK-WIDE-NEXT:    vcvttss2si %xmm0, %edx
-; CHECK-WIDE-NEXT:    movzbl %dl, %edx
-; CHECK-WIDE-NEXT:    orl %eax, %edx
-; CHECK-WIDE-NEXT:    vpinsrw $0, %edx, %xmm0, %xmm1
-; CHECK-WIDE-NEXT:    vpinsrw $1, %ecx, %xmm1, %xmm1
+; CHECK-WIDE-NEXT:    vcvttss2si %xmm0, %eax
+; CHECK-WIDE-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm1
+; CHECK-WIDE-NEXT:    vmovshdup %xmm0, %xmm2    ## xmm2 = xmm0[1,1,3,3]
+; CHECK-WIDE-NEXT:    vcvttss2si %xmm2, %eax
+; CHECK-WIDE-NEXT:    vpinsrb $1, %eax, %xmm1, %xmm1
+; CHECK-WIDE-NEXT:    vpermilpd $1, %xmm0, %xmm2 ## xmm2 = xmm0[1,0]
+; CHECK-WIDE-NEXT:    vcvttss2si %xmm2, %eax
+; CHECK-WIDE-NEXT:    vpinsrb $2, %eax, %xmm1, %xmm1
+; CHECK-WIDE-NEXT:    vpermilps $231, %xmm0, %xmm2 ## xmm2 = xmm0[3,1,2,3]
+; CHECK-WIDE-NEXT:    vcvttss2si %xmm2, %eax
+; CHECK-WIDE-NEXT:    vpinsrb $3, %eax, %xmm1, %xmm1
 ; CHECK-WIDE-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; CHECK-WIDE-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-WIDE-NEXT:    vcvttss2si %xmm0, %eax
+; CHECK-WIDE-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
+; CHECK-WIDE-NEXT:    vmovshdup %xmm0, %xmm2    ## xmm2 = xmm0[1,1,3,3]
 ; CHECK-WIDE-NEXT:    vcvttss2si %xmm2, %eax
-; CHECK-WIDE-NEXT:    shll $8, %eax
-; CHECK-WIDE-NEXT:    vcvttss2si %xmm0, %ecx
-; CHECK-WIDE-NEXT:    movzbl %cl, %ecx
-; CHECK-WIDE-NEXT:    orl %eax, %ecx
-; CHECK-WIDE-NEXT:    vpinsrw $2, %ecx, %xmm1, %xmm1
-; CHECK-WIDE-NEXT:    vpermilps {{.*#+}} xmm2 = xmm0[3,1,2,3]
+; CHECK-WIDE-NEXT:    vpinsrb $5, %eax, %xmm1, %xmm1
+; CHECK-WIDE-NEXT:    vpermilpd $1, %xmm0, %xmm2 ## xmm2 = xmm0[1,0]
 ; CHECK-WIDE-NEXT:    vcvttss2si %xmm2, %eax
-; CHECK-WIDE-NEXT:    shll $8, %eax
-; CHECK-WIDE-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; CHECK-WIDE-NEXT:    vcvttss2si %xmm0, %ecx
-; CHECK-WIDE-NEXT:    movzbl %cl, %ecx
-; CHECK-WIDE-NEXT:    orl %eax, %ecx
-; CHECK-WIDE-NEXT:    vpinsrw $3, %ecx, %xmm1, %xmm0
+; CHECK-WIDE-NEXT:    vpinsrb $6, %eax, %xmm1, %xmm1
+; CHECK-WIDE-NEXT:    vpermilps $231, %xmm0, %xmm0 ## xmm0 = xmm0[3,1,2,3]
+; CHECK-WIDE-NEXT:    vcvttss2si %xmm0, %eax
+; CHECK-WIDE-NEXT:    vpinsrb $7, %eax, %xmm1, %xmm0
 ; CHECK-WIDE-NEXT:    vzeroupper
 ; CHECK-WIDE-NEXT:    retl
   %res = fptosi <8 x float> %src to <8 x i8>
@@ -145,21 +137,17 @@ define <4 x i8> @foo3_4(<4 x float> %src) {
 ;
 ; CHECK-WIDE-LABEL: foo3_4:
 ; CHECK-WIDE:       ## BB#0:
-; CHECK-WIDE-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
-; CHECK-WIDE-NEXT:    vcvttss2si %xmm1, %eax
-; CHECK-WIDE-NEXT:    shll $8, %eax
-; CHECK-WIDE-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; CHECK-WIDE-NEXT:    vcvttss2si %xmm1, %ecx
-; CHECK-WIDE-NEXT:    movzbl %cl, %ecx
-; CHECK-WIDE-NEXT:    orl %eax, %ecx
-; CHECK-WIDE-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-WIDE-NEXT:    vcvttss2si %xmm1, %eax
-; CHECK-WIDE-NEXT:    shll $8, %eax
-; CHECK-WIDE-NEXT:    vcvttss2si %xmm0, %edx
-; CHECK-WIDE-NEXT:    movzbl %dl, %edx
-; CHECK-WIDE-NEXT:    orl %eax, %edx
-; CHECK-WIDE-NEXT:    vpinsrw $0, %edx, %xmm0, %xmm0
-; CHECK-WIDE-NEXT:    vpinsrw $1, %ecx, %xmm0, %xmm0
+; CHECK-WIDE-NEXT:    vcvttss2si %xmm0, %eax
+; CHECK-WIDE-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm1
+; CHECK-WIDE-NEXT:    vmovshdup %xmm0, %xmm2    ## xmm2 = xmm0[1,1,3,3]
+; CHECK-WIDE-NEXT:    vcvttss2si %xmm2, %eax
+; CHECK-WIDE-NEXT:    vpinsrb $1, %eax, %xmm1, %xmm1
+; CHECK-WIDE-NEXT:    vpermilpd $1, %xmm0, %xmm2 ## xmm2 = xmm0[1,0]
+; CHECK-WIDE-NEXT:    vcvttss2si %xmm2, %eax
+; CHECK-WIDE-NEXT:    vpinsrb $2, %eax, %xmm1, %xmm1
+; CHECK-WIDE-NEXT:    vpermilps $231, %xmm0, %xmm0 ## xmm0 = xmm0[3,1,2,3]
+; CHECK-WIDE-NEXT:    vcvttss2si %xmm0, %eax
+; CHECK-WIDE-NEXT:    vpinsrb $3, %eax, %xmm1, %xmm0
 ; CHECK-WIDE-NEXT:    retl
   %res = fptosi <4 x float> %src to <4 x i8>
   ret <4 x i8> %res
diff --git a/test/CodeGen/X86/vec_floor.ll b/test/CodeGen/X86/vec_floor.ll
index 4db68bd..f35c4ab 100644
--- a/test/CodeGen/X86/vec_floor.ll
+++ b/test/CodeGen/X86/vec_floor.ll
@@ -180,3 +180,49 @@ define <8 x float> @nearbyint_v8f32(<8 x float> %p)
   ret <8 x float> %t
 }
 declare <8 x float> @llvm.nearbyint.v8f32(<8 x float> %p)
+
+;
+; Constant Folding
+;
+
+define <2 x double> @const_floor_v2f64() {
+  ; CHECK: const_floor_v2f64
+  ; CHECK: movaps {{.*#+}} xmm0 = [-2.000000e+00,2.000000e+00]
+  %t = call <2 x double> @llvm.floor.v2f64(<2 x double> <double -1.5, double 2.5>)
+  ret <2 x double> %t
+}
+
+define <4 x float> @const_floor_v4f32() {
+  ; CHECK: const_floor_v4f32
+  ; CHECK: movaps {{.*#+}} xmm0 = [-4.000000e+00,6.000000e+00,-9.000000e+00,2.000000e+00]
+  %t = call <4 x float> @llvm.floor.v4f32(<4 x float> <float -3.5, float 6.0, float -9.0, float 2.5>)
+  ret <4 x float> %t
+}
+
+define <2 x double> @const_ceil_v2f64() {
+  ; CHECK: const_ceil_v2f64
+  ; CHECK: movaps {{.*#+}} xmm0 = [-1.000000e+00,3.000000e+00]
+  %t = call <2 x double> @llvm.ceil.v2f64(<2 x double> <double -1.5, double 2.5>)
+  ret <2 x double> %t
+}
+
+define <4 x float> @const_ceil_v4f32() {
+  ; CHECK: const_ceil_v4f32
+  ; CHECK: movaps {{.*#+}} xmm0 = [-3.000000e+00,6.000000e+00,-9.000000e+00,3.000000e+00]
+  %t = call <4 x float> @llvm.ceil.v4f32(<4 x float> <float -3.5, float 6.0, float -9.0, float 2.5>)
+  ret <4 x float> %t
+}
+
+define <2 x double> @const_trunc_v2f64() {
+  ; CHECK: const_trunc_v2f64
+  ; CHECK: movaps {{.*#+}} xmm0 = [-1.000000e+00,2.000000e+00]
+  %t = call <2 x double> @llvm.trunc.v2f64(<2 x double> <double -1.5, double 2.5>)
+  ret <2 x double> %t
+}
+
+define <4 x float> @const_trunc_v4f32() {
+  ; CHECK: const_trunc_v4f32
+  ; CHECK: movaps {{.*#+}} xmm0 = [-3.000000e+00,6.000000e+00,-9.000000e+00,2.000000e+00]
+  %t = call <4 x float> @llvm.trunc.v4f32(<4 x float> <float -3.5, float 6.0, float -9.0, float 2.5>)
+  ret <4 x float> %t
+}
diff --git a/test/CodeGen/X86/vec_insert-5.ll b/test/CodeGen/X86/vec_insert-5.ll
index 0f89515..4018a21 100644
--- a/test/CodeGen/X86/vec_insert-5.ll
+++ b/test/CodeGen/X86/vec_insert-5.ll
@@ -8,8 +8,8 @@ define void  @t1(i32 %a, x86_mmx* %P) nounwind {
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; CHECK-NEXT:    shll $12, %ecx
 ; CHECK-NEXT:    movd %ecx, %xmm0
-; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,0,0,1]
-; CHECK-NEXT:    movlpd %xmm0, (%eax)
+; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,0,1,1]
+; CHECK-NEXT:    movq %xmm0, (%eax)
 ; CHECK-NEXT:    retl
  %tmp12 = shl i32 %a, 12
  %tmp21 = insertelement <2 x i32> undef, i32 %tmp12, i32 1
diff --git a/test/CodeGen/X86/vec_insert-mmx.ll b/test/CodeGen/X86/vec_insert-mmx.ll
index 447f97a..cbd4208 100644
--- a/test/CodeGen/X86/vec_insert-mmx.ll
+++ b/test/CodeGen/X86/vec_insert-mmx.ll
@@ -6,8 +6,8 @@ define x86_mmx @t0(i32 %A) nounwind {
 ; X86-32-LABEL: t0:
 ; X86-32:       ## BB#0:
 ; X86-32:    movd {{[0-9]+}}(%esp), %xmm0
-; X86-32-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,0,0,1]
-; X86-32-NEXT:    movlpd %xmm0, (%esp)
+; X86-32-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,0,1,1]
+; X86-32-NEXT:    movq %xmm0, (%esp)
 ; X86-32-NEXT:    movq (%esp), %mm0
 ; X86-32-NEXT:    addl $12, %esp
 ; X86-32-NEXT:    retl
diff --git a/test/CodeGen/X86/vec_reassociate.ll b/test/CodeGen/X86/vec_reassociate.ll
new file mode 100644
index 0000000..bf2053f
--- /dev/null
+++ b/test/CodeGen/X86/vec_reassociate.ll
@@ -0,0 +1,119 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse4.1 | FileCheck %s
+
+define <4 x i32> @add_4i32(<4 x i32> %a0, <4 x i32> %a1) {
+  ;CHECK-LABEL:  @add_4i32
+  ;CHECK:        # BB#0:
+  ;CHECK-NEXT:   paddd %xmm1, %xmm0
+  ;CHECK-NEXT:   retq
+  %1 = add <4 x i32> %a0, <i32  1, i32 -2, i32  3, i32 -4>
+  %2 = add <4 x i32> %a1, <i32 -1, i32  2, i32 -3, i32  4>
+  %3 = add <4 x i32> %1, %2
+  ret <4 x i32> %3
+}
+
+define <4 x i32> @add_4i32_commute(<4 x i32> %a0, <4 x i32> %a1) {
+  ;CHECK-LABEL:  @add_4i32_commute
+  ;CHECK:        # BB#0:
+  ;CHECK-NEXT:   paddd %xmm1, %xmm0
+  ;CHECK-NEXT:   retq
+  %1 = add <4 x i32> <i32  1, i32 -2, i32  3, i32 -4>, %a0
+  %2 = add <4 x i32> <i32 -1, i32  2, i32 -3, i32  4>, %a1
+  %3 = add <4 x i32> %1, %2
+  ret <4 x i32> %3
+}
+
+define <4 x i32> @mul_4i32(<4 x i32> %a0, <4 x i32> %a1) {
+  ;CHECK-LABEL:  @mul_4i32
+  ;CHECK:        # BB#0:
+  ;CHECK-NEXT:   pmulld %xmm1, %xmm0
+  ;CHECK-NEXT:   pmulld .LCPI2_0(%rip), %xmm0
+  ;CHECK-NEXT:   retq
+  %1 = mul <4 x i32> %a0, <i32 1, i32 2, i32 3, i32 4>
+  %2 = mul <4 x i32> %a1, <i32 4, i32 3, i32 2, i32 1>
+  %3 = mul <4 x i32> %1, %2
+  ret <4 x i32> %3
+}
+
+define <4 x i32> @mul_4i32_commute(<4 x i32> %a0, <4 x i32> %a1) {
+  ;CHECK-LABEL:  @mul_4i32_commute
+  ;CHECK:        # BB#0:
+  ;CHECK-NEXT:   pmulld %xmm1, %xmm0
+  ;CHECK-NEXT:   pmulld .LCPI3_0(%rip), %xmm0
+  ;CHECK-NEXT:   retq
+  %1 = mul <4 x i32> <i32 1, i32 2, i32 3, i32 4>, %a0
+  %2 = mul <4 x i32> <i32 4, i32 3, i32 2, i32 1>, %a1
+  %3 = mul <4 x i32> %1, %2
+  ret <4 x i32> %3
+}
+
+define <4 x i32> @and_4i32(<4 x i32> %a0, <4 x i32> %a1) {
+  ;CHECK-LABEL:  @and_4i32
+  ;CHECK:        # BB#0:
+  ;CHECK-NEXT:   andps %xmm1, %xmm0
+  ;CHECK-NEXT:   andps .LCPI4_0(%rip), %xmm0
+  ;CHECK-NEXT:   retq
+  %1 = and <4 x i32> %a0, <i32 -2, i32 -2, i32  3, i32  3>
+  %2 = and <4 x i32> %a1, <i32 -1, i32 -1, i32  1, i32  1>
+  %3 = and <4 x i32> %1, %2
+  ret <4 x i32> %3
+}
+
+define <4 x i32> @and_4i32_commute(<4 x i32> %a0, <4 x i32> %a1) {
+  ;CHECK-LABEL:  @and_4i32_commute
+  ;CHECK:        # BB#0:
+  ;CHECK-NEXT:   andps %xmm1, %xmm0
+  ;CHECK-NEXT:   andps .LCPI5_0(%rip), %xmm0
+  ;CHECK-NEXT:   retq
+  %1 = and <4 x i32> <i32 -2, i32 -2, i32  3, i32  3>, %a0
+  %2 = and <4 x i32> <i32 -1, i32 -1, i32  1, i32  1>, %a1
+  %3 = and <4 x i32> %1, %2
+  ret <4 x i32> %3
+}
+
+define <4 x i32> @or_4i32(<4 x i32> %a0, <4 x i32> %a1) {
+  ;CHECK-LABEL:  @or_4i32
+  ;CHECK:        # BB#0:
+  ;CHECK-NEXT:   orps %xmm1, %xmm0
+  ;CHECK-NEXT:   orps .LCPI6_0(%rip), %xmm0
+  ;CHECK-NEXT:   retq
+  %1 = or <4 x i32> %a0, <i32 -2, i32 -2, i32  3, i32  3>
+  %2 = or <4 x i32> %a1, <i32 -1, i32 -1, i32  1, i32  1>
+  %3 = or <4 x i32> %1, %2
+  ret <4 x i32> %3
+}
+
+define <4 x i32> @or_4i32_commute(<4 x i32> %a0, <4 x i32> %a1) {
+  ;CHECK-LABEL:  @or_4i32_commute
+  ;CHECK:        # BB#0:
+  ;CHECK-NEXT:   orps %xmm1, %xmm0
+  ;CHECK-NEXT:   orps .LCPI7_0(%rip), %xmm0
+  ;CHECK-NEXT:   retq
+  %1 = or <4 x i32> <i32 -2, i32 -2, i32  3, i32  3>, %a0 
+  %2 = or <4 x i32> <i32 -1, i32 -1, i32  1, i32  1>, %a1
+  %3 = or <4 x i32> %1, %2
+  ret <4 x i32> %3
+}
+
+define <4 x i32> @xor_4i32(<4 x i32> %a0, <4 x i32> %a1) {
+  ;CHECK-LABEL:  @xor_4i32
+  ;CHECK:        # BB#0:
+  ;CHECK-NEXT:   xorps %xmm1, %xmm0
+  ;CHECK-NEXT:   xorps .LCPI8_0(%rip), %xmm0
+  ;CHECK-NEXT:   retq
+  %1 = xor <4 x i32> %a0, <i32 -2, i32 -2, i32  3, i32  3>
+  %2 = xor <4 x i32> %a1, <i32 -1, i32 -1, i32  1, i32  1>
+  %3 = xor <4 x i32> %1, %2
+  ret <4 x i32> %3
+}
+
+define <4 x i32> @xor_4i32_commute(<4 x i32> %a0, <4 x i32> %a1) {
+  ;CHECK-LABEL:  @xor_4i32_commute
+  ;CHECK:        # BB#0:
+  ;CHECK-NEXT:   xorps %xmm1, %xmm0
+  ;CHECK-NEXT:   xorps .LCPI9_0(%rip), %xmm0
+  ;CHECK-NEXT:   retq
+  %1 = xor <4 x i32> <i32 -2, i32 -2, i32  3, i32  3>, %a0
+  %2 = xor <4 x i32> <i32 -1, i32 -1, i32  1, i32  1>, %a1
+  %3 = xor <4 x i32> %1, %2
+  ret <4 x i32> %3
+}
diff --git a/test/CodeGen/X86/vec_zero_cse.ll b/test/CodeGen/X86/vec_zero_cse.ll
index afde0ed..8ed8083 100644
--- a/test/CodeGen/X86/vec_zero_cse.ll
+++ b/test/CodeGen/X86/vec_zero_cse.ll
@@ -9,7 +9,7 @@
 
 define void @test1() {
 ;CHECK-LABEL: @test1
-;CHECK: xorpd
+;CHECK: xorps
   store <1 x i64> zeroinitializer, <1 x i64>* @M1
   store <2 x i32> zeroinitializer, <2 x i32>* @M2
   ret void
@@ -17,7 +17,7 @@ define void @test1() {
 
 define void @test2() {
 ;CHECK-LABEL: @test2
-;CHECK: pshufd
+;CHECK: pcmpeqd
   store <1 x i64> < i64 -1 >, <1 x i64>* @M1
   store <2 x i32> < i32 -1, i32 -1 >, <2 x i32>* @M2
   ret void
diff --git a/test/CodeGen/X86/vector-shuffle-128-v16.ll b/test/CodeGen/X86/vector-shuffle-128-v16.ll
index 01b8972..53d13c8 100644
--- a/test/CodeGen/X86/vector-shuffle-128-v16.ll
+++ b/test/CodeGen/X86/vector-shuffle-128-v16.ll
@@ -634,28 +634,16 @@ define <16 x i8> @PR20540(<8 x i8> %a) {
 }
 
 define <16 x i8> @shuffle_v16i8_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz(i8 %i) {
-; SSE2-LABEL: shuffle_v16i8_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
-; SSE2:       # BB#0:
-; SSE2-NEXT:    movzbl %dil, %eax
-; SSE2-NEXT:    movd %eax, %xmm0
-; SSE2-NEXT:    retq
-;
-; SSSE3-LABEL: shuffle_v16i8_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
-; SSSE3:       # BB#0:
-; SSSE3-NEXT:    movd %edi, %xmm0
-; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSSE3-NEXT:    retq
-;
-; SSE41-LABEL: shuffle_v16i8_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
-; SSE41:       # BB#0:
-; SSE41-NEXT:    movd %edi, %xmm0
-; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSE41-NEXT:    retq
+; SSE-LABEL: shuffle_v16i8_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
+; SSE:       # BB#0:
+; SSE-NEXT:    movzbl %dil, %eax
+; SSE-NEXT:    movd %eax, %xmm0
+; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: shuffle_v16i8_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
 ; AVX:       # BB#0:
-; AVX-NEXT:    vmovd %edi, %xmm0
-; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX-NEXT:    movzbl %dil, %eax
+; AVX-NEXT:    vmovd %eax, %xmm0
 ; AVX-NEXT:    retq
   %a = insertelement <16 x i8> undef, i8 %i, i32 0
   %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> <i32 16, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -665,27 +653,28 @@ define <16 x i8> @shuffle_v16i8_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz(
 define <16 x i8> @shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz(i8 %i) {
 ; SSE2-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
 ; SSE2:       # BB#0:
-; SSE2-NEXT:    movzbl %dil, %eax
-; SSE2-NEXT:    movd %eax, %xmm0
-; SSE2-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10]
+; SSE2-NEXT:    shll   $8, %edi
+; SSE2-NEXT:    pxor   %xmm0, %xmm0
+; SSE2-NEXT:    pinsrw $2, %edi, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
 ; SSSE3:       # BB#0:
-; SSSE3-NEXT:    movd %edi, %xmm0
-; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSSE3-NEXT:    shll   $8, %edi
+; SSSE3-NEXT:    pxor   %xmm0, %xmm0
+; SSSE3-NEXT:    pinsrw $2, %edi, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
 ; SSE41:       # BB#0:
-; SSE41-NEXT:    movd %edi, %xmm0
-; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT:    pxor   %xmm0, %xmm0
+; SSE41-NEXT:    pinsrb $5, %edi, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
 ; AVX:       # BB#0:
-; AVX-NEXT:    vmovd %edi, %xmm0
-; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX-NEXT:    vpxor   %xmm0, %xmm0
+; AVX-NEXT:    vpinsrb $5, %edi, %xmm0
 ; AVX-NEXT:    retq
   %a = insertelement <16 x i8> undef, i8 %i, i32 0
   %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 16, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
@@ -693,16 +682,30 @@ define <16 x i8> @shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz(
 }
 
 define <16 x i8> @shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16(i8 %i) {
-; SSE-LABEL: shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16:
-; SSE:       # BB#0:
-; SSE-NEXT:    movd %edi, %xmm0
-; SSE-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0]
-; SSE-NEXT:    retq
+; SSE2-LABEL: shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    shll   $8, %edi
+; SSE2-NEXT:    pxor   %xmm0, %xmm0
+; SSE2-NEXT:    pinsrw $7, %edi, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    shll   $8, %edi
+; SSSE3-NEXT:    pxor   %xmm0, %xmm0
+; SSSE3-NEXT:    pinsrw $7, %edi, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pxor   %xmm0, %xmm0
+; SSE41-NEXT:    pinsrb $15, %edi, %xmm0
+; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16:
 ; AVX:       # BB#0:
-; AVX-NEXT:    vmovd %edi, %xmm0
-; AVX-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0]
+; AVX-NEXT:    vpxor   %xmm0, %xmm0
+; AVX-NEXT:    vpinsrb $15, %edi, %xmm0
 ; AVX-NEXT:    retq
   %a = insertelement <16 x i8> undef, i8 %i, i32 0
   %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> <i32 0, i32 undef, i32 undef, i32 3, i32 undef, i32 undef, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 16>
@@ -713,29 +716,27 @@ define <16 x i8> @shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz(
 ; SSE2-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
 ; SSE2:       # BB#0:
 ; SSE2-NEXT:    movzbl %dil, %eax
-; SSE2-NEXT:    movd %eax, %xmm0
-; SSE2-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
+; SSE2-NEXT:    pxor   %xmm0, %xmm0
+; SSE2-NEXT:    pinsrw $1, %eax, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
 ; SSSE3:       # BB#0:
-; SSSE3-NEXT:    movd %edi, %xmm0
-; SSSE3-NEXT:    pslld $24, %xmm0
-; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSSE3-NEXT:    movzbl %dil, %eax
+; SSSE3-NEXT:    pxor   %xmm0, %xmm0
+; SSSE3-NEXT:    pinsrw $1, %eax, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
 ; SSE41:       # BB#0:
-; SSE41-NEXT:    movd %edi, %xmm0
-; SSE41-NEXT:    pslld $24, %xmm0
-; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT:    pxor   %xmm0, %xmm0
+; SSE41-NEXT:    pinsrb $2, %edi, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
 ; AVX:       # BB#0:
-; AVX-NEXT:    vmovd %edi, %xmm0
-; AVX-NEXT:    vpslld $24, %xmm0, %xmm0
-; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX-NEXT:    vpxor   %xmm0, %xmm0
+; AVX-NEXT:    vpinsrb $2, %edi, %xmm0
 ; AVX-NEXT:    retq
   %a = insertelement <16 x i8> undef, i8 %i, i32 3
   %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> <i32 0, i32 1, i32 19, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
diff --git a/test/CodeGen/X86/vector-shuffle-128-v8.ll b/test/CodeGen/X86/vector-shuffle-128-v8.ll
index eb77c38..4007f0b 100644
--- a/test/CodeGen/X86/vector-shuffle-128-v8.ll
+++ b/test/CodeGen/X86/vector-shuffle-128-v8.ll
@@ -1384,16 +1384,14 @@ define <8 x i16> @shuffle_v8i16_8zzzzzzz(i16 %i) {
 define <8 x i16> @shuffle_v8i16_z8zzzzzz(i16 %i) {
 ; SSE-LABEL: shuffle_v8i16_z8zzzzzz:
 ; SSE:       # BB#0:
-; SSE-NEXT:    movzwl %di, %eax
-; SSE-NEXT:    movd %eax, %xmm0
-; SSE-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
+; SSE-NEXT:    pxor   %xmm0, %xmm0
+; SSE-NEXT:    pinsrw $1, %edi, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: shuffle_v8i16_z8zzzzzz:
 ; AVX:       # BB#0:
-; AVX-NEXT:    movzwl %di, %eax
-; AVX-NEXT:    vmovd %eax, %xmm0
-; AVX-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
+; AVX-NEXT:    vpxor   %xmm0, %xmm0
+; AVX-NEXT:    vpinsrw $1, %edi, %xmm0
 ; AVX-NEXT:    retq
   %a = insertelement <8 x i16> undef, i16 %i, i32 0
   %shuffle = shufflevector <8 x i16> zeroinitializer, <8 x i16> %a, <8 x i32> <i32 2, i32 8, i32 3, i32 7, i32 6, i32 5, i32 4, i32 3>
@@ -1403,16 +1401,14 @@ define <8 x i16> @shuffle_v8i16_z8zzzzzz(i16 %i) {
 define <8 x i16> @shuffle_v8i16_zzzzz8zz(i16 %i) {
 ; SSE-LABEL: shuffle_v8i16_zzzzz8zz:
 ; SSE:       # BB#0:
-; SSE-NEXT:    movzwl %di, %eax
-; SSE-NEXT:    movd %eax, %xmm0
-; SSE-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
+; SSE-NEXT:    pxor   %xmm0, %xmm0
+; SSE-NEXT:    pinsrw $5, %edi, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: shuffle_v8i16_zzzzz8zz:
 ; AVX:       # BB#0:
-; AVX-NEXT:    movzwl %di, %eax
-; AVX-NEXT:    vmovd %eax, %xmm0
-; AVX-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
+; AVX-NEXT:    vpxor   %xmm0, %xmm0
+; AVX-NEXT:    vpinsrw $5, %edi, %xmm0
 ; AVX-NEXT:    retq
   %a = insertelement <8 x i16> undef, i16 %i, i32 0
   %shuffle = shufflevector <8 x i16> zeroinitializer, <8 x i16> %a, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 0, i32 0>
@@ -1422,14 +1418,14 @@ define <8 x i16> @shuffle_v8i16_zzzzz8zz(i16 %i) {
 define <8 x i16> @shuffle_v8i16_zuuzuuz8(i16 %i) {
 ; SSE-LABEL: shuffle_v8i16_zuuzuuz8:
 ; SSE:       # BB#0:
-; SSE-NEXT:    movd %edi, %xmm0
-; SSE-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1]
+; SSE-NEXT:    pxor   %xmm0, %xmm0
+; SSE-NEXT:    pinsrw $7, %edi, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: shuffle_v8i16_zuuzuuz8:
 ; AVX:       # BB#0:
-; AVX-NEXT:    vmovd %edi, %xmm0
-; AVX-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1]
+; AVX-NEXT:    vpxor   %xmm0, %xmm0
+; AVX-NEXT:    vpinsrw $7, %edi, %xmm0
 ; AVX-NEXT:    retq
   %a = insertelement <8 x i16> undef, i16 %i, i32 0
   %shuffle = shufflevector <8 x i16> zeroinitializer, <8 x i16> %a, <8 x i32> <i32 0, i32 undef, i32 undef, i32 3, i32 undef, i32 undef, i32 6, i32 8>
@@ -1439,16 +1435,14 @@ define <8 x i16> @shuffle_v8i16_zuuzuuz8(i16 %i) {
 define <8 x i16> @shuffle_v8i16_zzBzzzzz(i16 %i) {
 ; SSE-LABEL: shuffle_v8i16_zzBzzzzz:
 ; SSE:       # BB#0:
-; SSE-NEXT:    movzwl %di, %eax
-; SSE-NEXT:    movd %eax, %xmm0
-; SSE-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11]
+; SSE-NEXT:    pxor   %xmm0, %xmm0
+; SSE-NEXT:    pinsrw $2, %edi, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: shuffle_v8i16_zzBzzzzz:
 ; AVX:       # BB#0:
-; AVX-NEXT:    movzwl %di, %eax
-; AVX-NEXT:    vmovd %eax, %xmm0
-; AVX-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11]
+; AVX-NEXT:    vpxor   %xmm0, %xmm0
+; AVX-NEXT:    vpinsrw $2, %edi, %xmm0
 ; AVX-NEXT:    retq
   %a = insertelement <8 x i16> undef, i16 %i, i32 3
   %shuffle = shufflevector <8 x i16> zeroinitializer, <8 x i16> %a, <8 x i32> <i32 0, i32 1, i32 11, i32 3, i32 4, i32 5, i32 6, i32 7>
diff --git a/test/CodeGen/X86/vector-shuffle-256-v16.ll b/test/CodeGen/X86/vector-shuffle-256-v16.ll
index aad3702..df4994d 100644
--- a/test/CodeGen/X86/vector-shuffle-256-v16.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v16.ll
@@ -3249,3 +3249,15 @@ define <16 x i16> @shuffle_v16i16_23_uu_03_uu_20_20_05_uu_31_uu_11_uu_28_28_13_u
   %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 23, i32 undef, i32 3, i32 undef, i32 20, i32 20, i32 5, i32 undef, i32 31, i32 undef, i32 11, i32 undef, i32 28, i32 28, i32 13, i32 undef>
   ret <16 x i16> %shuffle
 }
+
+define <16 x i16> @insert_v16i16_0elt_into_zero_vector(i16* %ptr) {
+; ALL-LABEL: insert_v16i16_0elt_into_zero_vector:
+; ALL:       # BB#0:
+; ALL-NEXT:    movzwl (%rdi), %eax
+; ALL-NEXT:    vmovd %eax, %xmm0
+; ALL-NEXT:    retq
+  %val = load i16, i16* %ptr
+  %i0 = insertelement <16 x i16> zeroinitializer, i16 %val, i32 0
+  ret <16 x i16> %i0
+}
+
diff --git a/test/CodeGen/X86/vector-shuffle-256-v32.ll b/test/CodeGen/X86/vector-shuffle-256-v32.ll
index f9f4b96..a0f43de 100644
--- a/test/CodeGen/X86/vector-shuffle-256-v32.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v32.ll
@@ -656,8 +656,6 @@ define <32 x i8> @shuffle_v32i8_31_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
 ; AVX2-NEXT:    movl $15, %eax
 ; AVX2-NEXT:    vmovd %eax, %xmm1
-; AVX2-NEXT:    vpxor %ymm2, %ymm2, %ymm2
-; AVX2-NEXT:    vpblendd $15, %ymm1, %ymm2, %ymm1
 ; AVX2-NEXT:    vpshufb %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    retq
   %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 31, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
diff --git a/test/CodeGen/X86/vector-shuffle-256-v4.ll b/test/CodeGen/X86/vector-shuffle-256-v4.ll
index 8aca67c..1b42a63 100644
--- a/test/CodeGen/X86/vector-shuffle-256-v4.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v4.ll
@@ -813,15 +813,11 @@ define <4 x i64> @insert_reg_and_zero_v4i64(i64 %a) {
 ; AVX1-LABEL: insert_reg_and_zero_v4i64:
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vmovq %rdi, %xmm0
-; AVX1-NEXT:    vxorpd %ymm1, %ymm1, %ymm1
-; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: insert_reg_and_zero_v4i64:
 ; AVX2:       # BB#0:
 ; AVX2-NEXT:    vmovq %rdi, %xmm0
-; AVX2-NEXT:    vpxor %ymm1, %ymm1, %ymm1
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
 ; AVX2-NEXT:    retq
   %v = insertelement <4 x i64> undef, i64 %a, i64 0
   %shuffle = shufflevector <4 x i64> %v, <4 x i64> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
@@ -832,15 +828,11 @@ define <4 x i64> @insert_mem_and_zero_v4i64(i64* %ptr) {
 ; AVX1-LABEL: insert_mem_and_zero_v4i64:
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX1-NEXT:    vxorpd %ymm1, %ymm1, %ymm1
-; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: insert_mem_and_zero_v4i64:
 ; AVX2:       # BB#0:
 ; AVX2-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX2-NEXT:    vpxor %ymm1, %ymm1, %ymm1
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
 ; AVX2-NEXT:    retq
   %a = load i64, i64* %ptr
   %v = insertelement <4 x i64> undef, i64 %a, i64 0
@@ -851,8 +843,9 @@ define <4 x i64> @insert_mem_and_zero_v4i64(i64* %ptr) {
 define <4 x double> @insert_reg_and_zero_v4f64(double %a) {
 ; ALL-LABEL: insert_reg_and_zero_v4f64:
 ; ALL:       # BB#0:
-; ALL-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
-; ALL-NEXT:    vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
+; ALL-NEXT:    # kill: XMM0<def> XMM0<kill> YMM0<def>
+; ALL-NEXT:    vxorpd %ymm1, %ymm1, %ymm1
+; ALL-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
 ; ALL-NEXT:    retq
   %v = insertelement <4 x double> undef, double %a, i32 0
   %shuffle = shufflevector <4 x double> %v, <4 x double> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
diff --git a/test/CodeGen/X86/vector-shuffle-256-v8.ll b/test/CodeGen/X86/vector-shuffle-256-v8.ll
index 417423a..bb07077 100644
--- a/test/CodeGen/X86/vector-shuffle-256-v8.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v8.ll
@@ -133,8 +133,6 @@ define <8 x float> @shuffle_v8f32_70000000(<8 x float> %a, <8 x float> %b) {
 ; AVX2:       # BB#0:
 ; AVX2-NEXT:    movl $7, %eax
 ; AVX2-NEXT:    vmovd %eax, %xmm1
-; AVX2-NEXT:    vpxor %ymm2, %ymm2, %ymm2
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3,4,5,6,7]
 ; AVX2-NEXT:    vpermps %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT:    retq
   %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
@@ -962,8 +960,6 @@ define <8 x i32> @shuffle_v8i32_70000000(<8 x i32> %a, <8 x i32> %b) {
 ; AVX2:       # BB#0:
 ; AVX2-NEXT:    movl $7, %eax
 ; AVX2-NEXT:    vmovd %eax, %xmm1
-; AVX2-NEXT:    vpxor %ymm2, %ymm2, %ymm2
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3,4,5,6,7]
 ; AVX2-NEXT:    vpermd %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT:    retq
   %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
@@ -2090,3 +2086,20 @@ entry:
   %res = shufflevector <4 x float> %tmp76, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
   ret <8 x float> %res
 }
+
+define <8 x i32> @insert_mem_and_zero_v8i32(i32* %ptr) {
+; AVX1-LABEL: insert_mem_and_zero_v8i32:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: insert_mem_and_zero_v8i32:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX2-NEXT:    retq
+  %a = load i32, i32* %ptr
+  %v = insertelement <8 x i32> undef, i32 %a, i32 0
+  %shuffle = shufflevector <8 x i32> %v, <8 x i32> zeroinitializer, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ret <8 x i32> %shuffle
+}
+
diff --git a/test/CodeGen/X86/vector-shuffle-mmx.ll b/test/CodeGen/X86/vector-shuffle-mmx.ll
index 094722d..dbccd26 100644
--- a/test/CodeGen/X86/vector-shuffle-mmx.ll
+++ b/test/CodeGen/X86/vector-shuffle-mmx.ll
@@ -9,7 +9,7 @@ define void @test0(<1 x i64>* %x) {
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
 ; X32-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; X32-NEXT:    movlpd %xmm0, (%eax)
+; X32-NEXT:    movq %xmm0, (%eax)
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: test0:
@@ -38,13 +38,13 @@ define void @test1() {
 ; X32-NEXT:    .cfi_def_cfa_offset 24
 ; X32-NEXT:  Ltmp2:
 ; X32-NEXT:    .cfi_offset %edi, -8
-; X32-NEXT:    xorpd %xmm0, %xmm0
-; X32-NEXT:    movlpd %xmm0, (%esp)
+; X32-NEXT:    xorps %xmm0, %xmm0
+; X32-NEXT:    movlps %xmm0, (%esp)
 ; X32-NEXT:    movq (%esp), %mm0
 ; X32-NEXT:    pshuflw {{.*#+}} xmm0 = mem[0,2,2,3,4,5,6,7]
 ; X32-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
 ; X32-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X32-NEXT:    movlpd %xmm0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movq %xmm0, {{[0-9]+}}(%esp)
 ; X32-NEXT:    movq {{[0-9]+}}(%esp), %mm1
 ; X32-NEXT:    xorl %edi, %edi
 ; X32-NEXT:    maskmovq %mm1, %mm0
@@ -54,8 +54,8 @@ define void @test1() {
 ;
 ; X64-LABEL: test1:
 ; X64:       ## BB#0: ## %entry
-; X64-NEXT:    pxor %xmm0, %xmm0
-; X64-NEXT:    movq %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    xorps %xmm0, %xmm0
+; X64-NEXT:    movlps %xmm0, -{{[0-9]+}}(%rsp)
 ; X64-NEXT:    movq -{{[0-9]+}}(%rsp), %mm0
 ; X64-NEXT:    pshuflw {{.*#+}} xmm0 = mem[0,2,2,3,4,5,6,7]
 ; X64-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
diff --git a/test/CodeGen/X86/widen_cast-1.ll b/test/CodeGen/X86/widen_cast-1.ll
index 6b7f489..b0240dd 100644
--- a/test/CodeGen/X86/widen_cast-1.ll
+++ b/test/CodeGen/X86/widen_cast-1.ll
@@ -3,12 +3,14 @@
 
 ; CHECK: movl
 ; CHECK: paddw
-; CHECK: movlpd
+; CHECK: movq
+
+; FIXME - if this test cares about scheduling, why isn't it being checked?
 
 ; Scheduler causes produce a different instruction order
 ; ATOM: movl
 ; ATOM: paddw
-; ATOM: movlpd
+; ATOM: movq
 
 ; bitcast a v4i16 to v2i32
 
diff --git a/test/CodeGen/X86/widen_cast-4.ll b/test/CodeGen/X86/widen_cast-4.ll
index 060dfb1..8ed2785 100644
--- a/test/CodeGen/X86/widen_cast-4.ll
+++ b/test/CodeGen/X86/widen_cast-4.ll
@@ -52,7 +52,7 @@ forbody:		; preds = %forcond
 ; CHECK-NEXT: psraw $8
 ; CHECK-NEXT: psraw $2
 ; CHECK-NEXT: pshufb
-; CHECK-NEXT: movlpd
+; CHECK-NEXT: movq
 ;
 ; FIXME: We shouldn't require both a movd and an insert.
 ; CHECK-WIDE: %forbody
diff --git a/test/CodeGen/X86/widen_cast-5.ll b/test/CodeGen/X86/widen_cast-5.ll
index ccf0bd1..4e9d2df 100644
--- a/test/CodeGen/X86/widen_cast-5.ll
+++ b/test/CodeGen/X86/widen_cast-5.ll
@@ -1,6 +1,6 @@
 ; RUN: llc < %s -march=x86 -mattr=+sse4.2 | FileCheck %s
 ; CHECK: movl
-; CHECK: movlpd
+; CHECK: movq
 
 ; bitcast a i64 to v2i32
 define void @convert(<2 x i32>* %dst.addr, i64 %src) nounwind {
diff --git a/test/CodeGen/X86/widen_shuffle-1.ll b/test/CodeGen/X86/widen_shuffle-1.ll
index 2aa870f..3028052 100644
--- a/test/CodeGen/X86/widen_shuffle-1.ll
+++ b/test/CodeGen/X86/widen_shuffle-1.ll
@@ -84,7 +84,7 @@ define void @shuf5(<8 x i8>* %p) nounwind {
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; CHECK-NEXT:    movdqa {{.*#+}} xmm0 = [33,33,33,33,33,33,33,33]
 ; CHECK-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
-; CHECK-NEXT:    movlpd %xmm0, (%eax)
+; CHECK-NEXT:    movq %xmm0, (%eax)
 ; CHECK-NEXT:    retl
   %v = shufflevector <2 x i8> <i8 4, i8 33>, <2 x i8> undef, <8 x i32> <i32 1, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
   store <8 x i8> %v, <8 x i8>* %p, align 8
diff --git a/test/CodeGen/X86/x86-64-asm.ll b/test/CodeGen/X86/x86-64-asm.ll
index 2640e59..f103ab7 100644
--- a/test/CodeGen/X86/x86-64-asm.ll
+++ b/test/CodeGen/X86/x86-64-asm.ll
@@ -6,7 +6,7 @@ target triple = "x86_64-unknown-linux-gnu"
 
 define void @frame_dummy() {
 entry:
-        %tmp1 = tail call void (i8*)* (void (i8*)*)* asm "", "=r,0,~{dirflag},~{fpsr},~{flags}"( void (i8*)* null )           ; <void (i8*)*> [#uses=0]
+        %tmp1 = tail call void (i8*)* (void (i8*)*) asm "", "=r,0,~{dirflag},~{fpsr},~{flags}"( void (i8*)* null )           ; <void (i8*)*> [#uses=0]
         ret void
 }
 
diff --git a/test/CodeGen/X86/x86-64-tls-1.ll b/test/CodeGen/X86/x86-64-tls-1.ll
index 2879fb4..2c954db 100644
--- a/test/CodeGen/X86/x86-64-tls-1.ll
+++ b/test/CodeGen/X86/x86-64-tls-1.ll
@@ -3,7 +3,7 @@
 define i64 @z() nounwind {
 ; CHECK:      movq    $tm_nest_level@TPOFF, %r[[R0:[abcd]]]x
 ; CHECK-NEXT: addl    %fs:0, %e[[R0]]x
-; CHECK-NEXT: andq    $100, %r[[R0]]x
+; CHECK-NEXT: andl    $100, %e[[R0]]x
 
   ret i64 and (i64 ptrtoint (i32* @tm_nest_level to i64), i64 100)
 }
diff --git a/test/CodeGen/X86/x86-64-varargs.ll b/test/CodeGen/X86/x86-64-varargs.ll
index f40e02f..ed07bde 100644
--- a/test/CodeGen/X86/x86-64-varargs.ll
+++ b/test/CodeGen/X86/x86-64-varargs.ll
@@ -6,6 +6,6 @@ declare i32 @printf(i8*, ...) nounwind
 
 define i32 @main() nounwind  {
 entry:
-	%tmp10.i = tail call i32 (i8*, ...)* @printf( i8* getelementptr ([26 x i8], [26 x i8]* @.str, i32 0, i64 0), i32 12, double 0x3FF3EB8520000000, i32 120, i64 123456677890, i32 -10, double 4.500000e+15 ) nounwind 		; <i32> [#uses=0]
+	%tmp10.i = tail call i32 (i8*, ...) @printf( i8* getelementptr ([26 x i8], [26 x i8]* @.str, i32 0, i64 0), i32 12, double 0x3FF3EB8520000000, i32 120, i64 123456677890, i32 -10, double 4.500000e+15 ) nounwind 		; <i32> [#uses=0]
 	ret i32 0
 }
diff --git a/test/CodeGen/X86/xmulo.ll b/test/CodeGen/X86/xmulo.ll
index ebc1907..825efa6 100644
--- a/test/CodeGen/X86/xmulo.ll
+++ b/test/CodeGen/X86/xmulo.ll
@@ -17,7 +17,7 @@ define i32 @t1() nounwind {
     %2 = extractvalue {i64, i1} %1, 0
     %3 = extractvalue {i64, i1} %1, 1
     %4 = zext i1 %3 to i32
-    %5 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([10 x i8], [10 x i8]* @.str, i32 0, i32 0), i64 %2, i32 %4)
+    %5 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([10 x i8], [10 x i8]* @.str, i32 0, i32 0), i64 %2, i32 %4)
     ret i32 0
 }
 
@@ -31,7 +31,7 @@ define i32 @t2() nounwind {
     %2 = extractvalue {i64, i1} %1, 0
     %3 = extractvalue {i64, i1} %1, 1
     %4 = zext i1 %3 to i32
-    %5 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([10 x i8], [10 x i8]* @.str, i32 0, i32 0), i64 %2, i32 %4)
+    %5 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([10 x i8], [10 x i8]* @.str, i32 0, i32 0), i64 %2, i32 %4)
     ret i32 0
 }
 
@@ -45,6 +45,6 @@ define i32 @t3() nounwind {
     %2 = extractvalue {i64, i1} %1, 0
     %3 = extractvalue {i64, i1} %1, 1
     %4 = zext i1 %3 to i32
-    %5 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([10 x i8], [10 x i8]* @.str, i32 0, i32 0), i64 %2, i32 %4)
+    %5 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([10 x i8], [10 x i8]* @.str, i32 0, i32 0), i64 %2, i32 %4)
     ret i32 0
 }
diff --git a/test/CodeGen/X86/xor-icmp.ll b/test/CodeGen/X86/xor-icmp.ll
index dd1fcca..397e5bc 100644
--- a/test/CodeGen/X86/xor-icmp.ll
+++ b/test/CodeGen/X86/xor-icmp.ll
@@ -24,11 +24,11 @@ entry:
   br i1 %4, label %bb1, label %bb
 
 bb:                                               ; preds = %entry
-  %5 = tail call i32 (...)* @foo() nounwind       ; <i32> [#uses=1]
+  %5 = tail call i32 (...) @foo() nounwind       ; <i32> [#uses=1]
   ret i32 %5
 
 bb1:                                              ; preds = %entry
-  %6 = tail call i32 (...)* @bar() nounwind       ; <i32> [#uses=1]
+  %6 = tail call i32 (...) @bar() nounwind       ; <i32> [#uses=1]
   ret i32 %6
 }
 
@@ -59,7 +59,7 @@ entry:
   br i1 %2, label %bb, label %return
 
 bb:                                               ; preds = %entry
-  %3 = tail call i32 (...)* @foo() nounwind       ; <i32> [#uses=0]
+  %3 = tail call i32 (...) @foo() nounwind       ; <i32> [#uses=0]
   ret i32 undef
 
 return:                                           ; preds = %entry
diff --git a/test/CodeGen/XCore/llvm-intrinsics.ll b/test/CodeGen/XCore/llvm-intrinsics.ll
index 539bf19..b7868d3 100644
--- a/test/CodeGen/XCore/llvm-intrinsics.ll
+++ b/test/CodeGen/XCore/llvm-intrinsics.ll
@@ -122,7 +122,7 @@ entry:
 ; CHECK-NEXT: ldw r0, sp[2]
 ; CHECK-NEXT: set sp, r2
 ; CHECK-NEXT: bau r3
-  call void (...)* @foo()
+  call void (...) @foo()
   call void @llvm.eh.return.i32(i32 %offset, i8* %handler)
   unreachable
 }
@@ -144,7 +144,7 @@ entry:
 ; CHECK-NEXT: ldw r0, sp[2]
 ; CHECK-NEXT: set sp, r2
 ; CHECK-NEXT: bau r3
-  call void (...)* @foo()
+  call void (...) @foo()
   %0 = load i32, i32* @offset
   call void @llvm.eh.return.i32(i32 %0, i8* @handler)
   unreachable
@@ -244,7 +244,7 @@ define void @Unwind0() {
 ; CHECK: ldw r4, sp[9]
 ; CHECK: retsp 10
 define void @Unwind1() {
-  call void (...)* @foo()
+  call void (...) @foo()
   call void @llvm.eh.unwind.init()
   ret void
 }
author	Pirama Arumuga Nainar <pirama@google.com>	2015-05-06 11:46:36 -0700
committer	Pirama Arumuga Nainar <pirama@google.com>	2015-05-18 10:52:30 -0700
commit	2c3e0051c31c3f5b2328b447eadf1cf9c4427442 (patch)
tree	c0104029af14e9f47c2ef58ca60e6137691f3c9b /test/CodeGen
parent	e1bc145815f4334641be19f1c45ecf85d25b6e5a (diff)
download	external_llvm-2c3e0051c31c3f5b2328b447eadf1cf9c4427442.zip external_llvm-2c3e0051c31c3f5b2328b447eadf1cf9c4427442.tar.gz external_llvm-2c3e0051c31c3f5b2328b447eadf1cf9c4427442.tar.bz2