author    Stephen Hines <srhines@google.com>  2014-07-21 00:45:20 -0700
committer Stephen Hines <srhines@google.com>  2014-07-21 00:45:20 -0700
commit    c6a4f5e819217e1e12c458aed8e7b122e23a3a58 (patch)
tree      81b7dd2bb4370a392f31d332a566c903b5744764 /test/CodeGen/AArch64
parent    19c6fbb3e8aaf74093afa08013134b61fa08f245 (diff)
Update LLVM for rebase to r212749.
Includes a cherry-pick of: r212948 - fixes a small issue with atomic calls
Change-Id: Ib97bd980b59f18142a69506400911a6009d9df18
Diffstat (limited to 'test/CodeGen/AArch64')
-rw-r--r--  test/CodeGen/AArch64/aarch64-address-type-promotion-assertion.ll | 55
-rw-r--r--  test/CodeGen/AArch64/aarch64-address-type-promotion.ll | 28
-rw-r--r--  test/CodeGen/AArch64/addsub_ext.ll | 2
-rw-r--r--  test/CodeGen/AArch64/arm64-2012-05-09-LOADgot-bug.ll | 6
-rw-r--r--  test/CodeGen/AArch64/arm64-AnInfiniteLoopInDAGCombine.ll (renamed from test/CodeGen/AArch64/arm64-2014-04-16-AnInfiniteLoopInDAGCombine.ll) | 0
-rw-r--r--  test/CodeGen/AArch64/arm64-EXT-undef-mask.ll (renamed from test/CodeGen/AArch64/arm64-2014-04-29-EXT-undef-mask.ll) | 0
-rw-r--r--  test/CodeGen/AArch64/arm64-aapcs.ll | 8
-rw-r--r--  test/CodeGen/AArch64/arm64-abi.ll | 27
-rw-r--r--  test/CodeGen/AArch64/arm64-ands-bad-peephole.ll | 10
-rw-r--r--  test/CodeGen/AArch64/arm64-arith.ll | 8
-rw-r--r--  test/CodeGen/AArch64/arm64-atomic-128.ll | 9
-rw-r--r--  test/CodeGen/AArch64/arm64-atomic.ll | 14
-rw-r--r--  test/CodeGen/AArch64/arm64-build-vector.ll | 24
-rw-r--r--  test/CodeGen/AArch64/arm64-convert-v2f64-v2i32.ll | 24
-rw-r--r--  test/CodeGen/AArch64/arm64-convert-v2i32-v2f64.ll | 29
-rw-r--r--  test/CodeGen/AArch64/arm64-convert-v4f64.ll | 33
-rw-r--r--  test/CodeGen/AArch64/arm64-cse.ll | 2
-rw-r--r--  test/CodeGen/AArch64/arm64-dagcombiner-dead-indexed-load.ll | 3
-rw-r--r--  test/CodeGen/AArch64/arm64-dagcombiner-indexed-load.ll | 16
-rw-r--r--  test/CodeGen/AArch64/arm64-early-ifcvt.ll | 2
-rw-r--r--  test/CodeGen/AArch64/arm64-fast-isel-intrinsic.ll | 13
-rw-r--r--  test/CodeGen/AArch64/arm64-fp128.ll | 2
-rw-r--r--  test/CodeGen/AArch64/arm64-frame-index.ll | 2
-rw-r--r--  test/CodeGen/AArch64/arm64-misched-basic-A53.ll | 79
-rw-r--r--  test/CodeGen/AArch64/arm64-misched-basic-A57.ll | 112
-rw-r--r--  test/CodeGen/AArch64/arm64-misched-forwarding-A53.ll | 7
-rw-r--r--  test/CodeGen/AArch64/arm64-neon-copy.ll | 16
-rw-r--r--  test/CodeGen/AArch64/arm64-neon-select_cc.ll | 4
-rw-r--r--  test/CodeGen/AArch64/arm64-shrink-v1i64.ll | 14
-rw-r--r--  test/CodeGen/AArch64/arm64-sqshl-uqshl-i64Contant.ll (renamed from test/CodeGen/AArch64/arm64-2014-04-28-sqshl-uqshl-i64Contant.ll) | 0
-rw-r--r--  test/CodeGen/AArch64/arm64-vcvt.ll | 12
-rw-r--r--  test/CodeGen/AArch64/arm64-vshift.ll | 9
-rw-r--r--  test/CodeGen/AArch64/arm64-xaluo.ll | 2
-rw-r--r--  test/CodeGen/AArch64/atomic-ops.ll | 25
-rw-r--r--  test/CodeGen/AArch64/blockaddress.ll | 4
-rw-r--r--  test/CodeGen/AArch64/branch-relax-asm.ll | 35
-rw-r--r--  test/CodeGen/AArch64/breg.ll | 2
-rw-r--r--  test/CodeGen/AArch64/cmpxchg-idioms.ll | 93
-rw-r--r--  test/CodeGen/AArch64/compiler-ident.ll | 12
-rw-r--r--  test/CodeGen/AArch64/complex-fp-to-int.ll | 141
-rw-r--r--  test/CodeGen/AArch64/complex-int-to-fp.ll | 164
-rw-r--r--  test/CodeGen/AArch64/directcond.ll | 4
-rw-r--r--  test/CodeGen/AArch64/f16-convert.ll | 254
-rw-r--r--  test/CodeGen/AArch64/fast-isel-mul.ll | 40
-rw-r--r--  test/CodeGen/AArch64/flags-multiuse.ll | 2
-rw-r--r--  test/CodeGen/AArch64/funcptr_cast.ll | 13
-rw-r--r--  test/CodeGen/AArch64/global-merge-1.ll | 26
-rw-r--r--  test/CodeGen/AArch64/global-merge-2.ll | 51
-rw-r--r--  test/CodeGen/AArch64/global-merge-3.ll | 51
-rw-r--r--  test/CodeGen/AArch64/global-merge-4.ll | 73
-rw-r--r--  test/CodeGen/AArch64/global-merge.ll | 30
-rw-r--r--  test/CodeGen/AArch64/i128-fast-isel-fallback.ll | 18
-rw-r--r--  test/CodeGen/AArch64/inlineasm-ldr-pseudo.ll | 26
-rw-r--r--  test/CodeGen/AArch64/jump-table.ll | 6
-rw-r--r--  test/CodeGen/AArch64/ldst-opt.ll | 468
-rw-r--r--  test/CodeGen/AArch64/lit.local.cfg | 3
-rw-r--r--  test/CodeGen/AArch64/memcpy-f128.ll | 19
-rw-r--r--  test/CodeGen/AArch64/mul_pow2.ll | 123
-rw-r--r--  test/CodeGen/AArch64/regress-tail-livereg.ll | 14
-rw-r--r--  test/CodeGen/AArch64/trunc-v1i64.ll | 63
-rw-r--r--  test/CodeGen/AArch64/tst-br.ll | 2
61 files changed, 2196 insertions, 138 deletions
diff --git a/test/CodeGen/AArch64/aarch64-address-type-promotion-assertion.ll b/test/CodeGen/AArch64/aarch64-address-type-promotion-assertion.ll
new file mode 100644
index 0000000..2df9c37
--- /dev/null
+++ b/test/CodeGen/AArch64/aarch64-address-type-promotion-assertion.ll
@@ -0,0 +1,55 @@
+; RUN: llc -O3 -mcpu=cortex-a53 -mtriple=aarch64--linux-gnu %s -o - | FileCheck %s
+; PR20188: don't crash when merging sexts.
+
+; CHECK: foo:
+define void @foo() unnamed_addr align 2 {
+entry:
+ br label %invoke.cont145
+
+invoke.cont145:
+ %or.cond = and i1 undef, false
+ br i1 %or.cond, label %if.then274, label %invoke.cont145
+
+if.then274:
+ %0 = load i32* null, align 4
+ br i1 undef, label %invoke.cont291, label %if.else313
+
+invoke.cont291:
+ %idxprom.i.i.i605 = sext i32 %0 to i64
+ %arrayidx.i.i.i607 = getelementptr inbounds double* undef, i64 %idxprom.i.i.i605
+ %idxprom.i.i.i596 = sext i32 %0 to i64
+ %arrayidx.i.i.i598 = getelementptr inbounds double* undef, i64 %idxprom.i.i.i596
+ br label %if.end356
+
+if.else313:
+ %cmp314 = fcmp olt double undef, 0.000000e+00
+ br i1 %cmp314, label %invoke.cont317, label %invoke.cont353
+
+invoke.cont317:
+ br i1 undef, label %invoke.cont326, label %invoke.cont334
+
+invoke.cont326:
+ %idxprom.i.i.i587 = sext i32 %0 to i64
+ %arrayidx.i.i.i589 = getelementptr inbounds double* undef, i64 %idxprom.i.i.i587
+ %sub329 = fsub fast double undef, undef
+ br label %invoke.cont334
+
+invoke.cont334:
+ %lo.1 = phi double [ %sub329, %invoke.cont326 ], [ undef, %invoke.cont317 ]
+ br i1 undef, label %invoke.cont342, label %if.end356
+
+invoke.cont342:
+ %idxprom.i.i.i578 = sext i32 %0 to i64
+ %arrayidx.i.i.i580 = getelementptr inbounds double* undef, i64 %idxprom.i.i.i578
+ br label %if.end356
+
+invoke.cont353:
+ %idxprom.i.i.i572 = sext i32 %0 to i64
+ %arrayidx.i.i.i574 = getelementptr inbounds double* undef, i64 %idxprom.i.i.i572
+ br label %if.end356
+
+if.end356:
+ %lo.2 = phi double [ 0.000000e+00, %invoke.cont291 ], [ %lo.1, %invoke.cont342 ], [ undef, %invoke.cont353 ], [ %lo.1, %invoke.cont334 ]
+ call void null(i32 %0, double %lo.2)
+ unreachable
+}
diff --git a/test/CodeGen/AArch64/aarch64-address-type-promotion.ll b/test/CodeGen/AArch64/aarch64-address-type-promotion.ll
new file mode 100644
index 0000000..ee90d19
--- /dev/null
+++ b/test/CodeGen/AArch64/aarch64-address-type-promotion.ll
@@ -0,0 +1,28 @@
+; RUN: llc < %s -o - | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n32:64"
+target triple = "arm64-apple-macosx10.9"
+
+; Check that sexts get promoted above adds.
+define void @foo(i32* nocapture %a, i32 %i) {
+entry:
+; CHECK-LABEL: _foo:
+; CHECK: add
+; CHECK-NEXT: ldp
+; CHECK-NEXT: add
+; CHECK-NEXT: str
+; CHECK-NEXT: ret
+ %add = add nsw i32 %i, 1
+ %idxprom = sext i32 %add to i64
+ %arrayidx = getelementptr inbounds i32* %a, i64 %idxprom
+ %0 = load i32* %arrayidx, align 4
+ %add1 = add nsw i32 %i, 2
+ %idxprom2 = sext i32 %add1 to i64
+ %arrayidx3 = getelementptr inbounds i32* %a, i64 %idxprom2
+ %1 = load i32* %arrayidx3, align 4
+ %add4 = add nsw i32 %1, %0
+ %idxprom5 = sext i32 %i to i64
+ %arrayidx6 = getelementptr inbounds i32* %a, i64 %idxprom5
+ store i32 %add4, i32* %arrayidx6, align 4
+ ret void
+}
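
The promotion being checked can be sketched in plain IR (a hypothetical before/after, not part of the patch): because the adds are marked nsw, the sign extension may legally be hoisted above them, so

  %add = add nsw i32 %i, 1
  %idxprom = sext i32 %add to i64

becomes

  %promoted = sext i32 %i to i64
  %idxprom = add nsw i64 %promoted, 1

letting both loads index off a single sign-extended base, which is what allows the ldp pairing checked above.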
diff --git a/test/CodeGen/AArch64/addsub_ext.ll b/test/CodeGen/AArch64/addsub_ext.ll
index a2266b1..ceea8a0 100644
--- a/test/CodeGen/AArch64/addsub_ext.ll
+++ b/test/CodeGen/AArch64/addsub_ext.ll
@@ -1,4 +1,4 @@
-; RUN: llc -verify-machineinstrs %s -o - -mtriple=aarch64-linux-gnu | FileCheck %s
+; RUN: llc -verify-machineinstrs %s -o - -mtriple=aarch64-linux-gnu -aarch64-atomic-cfg-tidy=0 | FileCheck %s
@var8 = global i8 0
@var16 = global i16 0
diff --git a/test/CodeGen/AArch64/arm64-2012-05-09-LOADgot-bug.ll b/test/CodeGen/AArch64/arm64-2012-05-09-LOADgot-bug.ll
index d1840d3..7da2d2c 100644
--- a/test/CodeGen/AArch64/arm64-2012-05-09-LOADgot-bug.ll
+++ b/test/CodeGen/AArch64/arm64-2012-05-09-LOADgot-bug.ll
@@ -2,14 +2,14 @@
; RUN: llc -mtriple=arm64-linux-gnu -relocation-model=pic < %s | FileCheck %s --check-prefix=CHECK-LINUX
; <rdar://problem/11392109>
-define hidden void @t() optsize ssp {
+define hidden void @t(i64* %addr) optsize ssp {
entry:
- store i64 zext (i32 ptrtoint (i64 (i32)* @x to i32) to i64), i64* undef, align 8
+ store i64 zext (i32 ptrtoint (i64 (i32)* @x to i32) to i64), i64* %addr, align 8
; CHECK: adrp x{{[0-9]+}}, _x@GOTPAGE
; CHECK: ldr x{{[0-9]+}}, [x{{[0-9]+}}, _x@GOTPAGEOFF]
; CHECK-NEXT: and x{{[0-9]+}}, x{{[0-9]+}}, #0xffffffff
; CHECK-NEXT: str x{{[0-9]+}}, [x{{[0-9]+}}]
- unreachable
+ ret void
}
declare i64 @x(i32) optsize
diff --git a/test/CodeGen/AArch64/arm64-2014-04-16-AnInfiniteLoopInDAGCombine.ll b/test/CodeGen/AArch64/arm64-AnInfiniteLoopInDAGCombine.ll
index a73b707..a73b707 100644
--- a/test/CodeGen/AArch64/arm64-2014-04-16-AnInfiniteLoopInDAGCombine.ll
+++ b/test/CodeGen/AArch64/arm64-AnInfiniteLoopInDAGCombine.ll
diff --git a/test/CodeGen/AArch64/arm64-2014-04-29-EXT-undef-mask.ll b/test/CodeGen/AArch64/arm64-EXT-undef-mask.ll
index 1b2d543..1b2d543 100644
--- a/test/CodeGen/AArch64/arm64-2014-04-29-EXT-undef-mask.ll
+++ b/test/CodeGen/AArch64/arm64-EXT-undef-mask.ll
diff --git a/test/CodeGen/AArch64/arm64-aapcs.ll b/test/CodeGen/AArch64/arm64-aapcs.ll
index b713f0d..ccf1371 100644
--- a/test/CodeGen/AArch64/arm64-aapcs.ll
+++ b/test/CodeGen/AArch64/arm64-aapcs.ll
@@ -101,3 +101,11 @@ define fp128 @test_fp128([8 x float] %arg0, fp128 %arg1) {
; CHECK: ldr {{q[0-9]+}}, [sp]
ret fp128 %arg1
}
+
+; Check that VPR arguments can be correctly passed on the stack.
+define <2 x double> @test_vreg_stack([8 x <2 x double>], <2 x double> %varg_stack) {
+entry:
+; CHECK-LABEL: test_vreg_stack:
+; CHECK: ldr {{q[0-9]+}}, [sp]
+ ret <2 x double> %varg_stack;
+}
diff --git a/test/CodeGen/AArch64/arm64-abi.ll b/test/CodeGen/AArch64/arm64-abi.ll
index e2de434..a955029 100644
--- a/test/CodeGen/AArch64/arm64-abi.ll
+++ b/test/CodeGen/AArch64/arm64-abi.ll
@@ -1,5 +1,6 @@
-; RUN: llc < %s -march=arm64 -mcpu=cyclone -enable-misched=false | FileCheck %s
+; RUN: llc < %s -debug -march=arm64 -mcpu=cyclone -enable-misched=false | FileCheck %s
; RUN: llc < %s -O0 | FileCheck -check-prefix=FAST %s
+; REQUIRES: asserts
target triple = "arm64-apple-darwin"
; rdar://9932559
@@ -8,15 +9,15 @@ entry:
; CHECK-LABEL: i8i16callee:
; The 8th, 9th, 10th and 11th arguments are passed at sp, sp+2, sp+4, sp+5.
; They are i8, i16, i8 and i8.
-; CHECK: ldrsb {{w[0-9]+}}, [sp, #5]
-; CHECK: ldrsh {{w[0-9]+}}, [sp, #2]
-; CHECK: ldrsb {{w[0-9]+}}, [sp]
-; CHECK: ldrsb {{w[0-9]+}}, [sp, #4]
+; CHECK-DAG: ldrsb {{w[0-9]+}}, [sp, #5]
+; CHECK-DAG: ldrsb {{w[0-9]+}}, [sp, #4]
+; CHECK-DAG: ldrsh {{w[0-9]+}}, [sp, #2]
+; CHECK-DAG: ldrsb {{w[0-9]+}}, [sp]
; FAST-LABEL: i8i16callee:
-; FAST: ldrb {{w[0-9]+}}, [sp, #5]
-; FAST: ldrb {{w[0-9]+}}, [sp, #4]
-; FAST: ldrh {{w[0-9]+}}, [sp, #2]
-; FAST: ldrb {{w[0-9]+}}, [sp]
+; FAST-DAG: ldrsb {{w[0-9]+}}, [sp, #5]
+; FAST-DAG: ldrsb {{w[0-9]+}}, [sp, #4]
+; FAST-DAG: ldrsh {{w[0-9]+}}, [sp, #2]
+; FAST-DAG: ldrsb {{w[0-9]+}}, [sp]
%conv = sext i8 %a4 to i64
%conv3 = sext i16 %a5 to i64
%conv8 = sext i8 %b1 to i64
@@ -44,10 +45,10 @@ entry:
; CHECK: i8i16caller
; The 8th, 9th, 10th and 11th arguments are passed at sp, sp+2, sp+4, sp+5.
; They are i8, i16, i8 and i8.
-; CHECK: strb {{w[0-9]+}}, [sp, #5]
-; CHECK: strb {{w[0-9]+}}, [sp, #4]
-; CHECK: strh {{w[0-9]+}}, [sp, #2]
-; CHECK: strb {{w[0-9]+}}, [sp]
+; CHECK-DAG: strb {{w[0-9]+}}, [sp, #5]
+; CHECK-DAG: strb {{w[0-9]+}}, [sp, #4]
+; CHECK-DAG: strh {{w[0-9]+}}, [sp, #2]
+; CHECK-DAG: strb {{w[0-9]+}}, [sp]
; CHECK: bl
; FAST: i8i16caller
; FAST: strb {{w[0-9]+}}, [sp]
diff --git a/test/CodeGen/AArch64/arm64-ands-bad-peephole.ll b/test/CodeGen/AArch64/arm64-ands-bad-peephole.ll
index 34d6287..38661a5 100644
--- a/test/CodeGen/AArch64/arm64-ands-bad-peephole.ll
+++ b/test/CodeGen/AArch64/arm64-ands-bad-peephole.ll
@@ -1,4 +1,4 @@
-; RUN: llc %s -o - | FileCheck %s
+; RUN: llc %s -o - -aarch64-atomic-cfg-tidy=0 | FileCheck %s
; Check that ANDS (tst) is not merged with ADD when the immediate
; is not 0.
; <rdar://problem/16693089>
@@ -8,18 +8,18 @@ target triple = "arm64-apple-ios"
; CHECK-LABEL: tst1:
; CHECK: add [[REG:w[0-9]+]], w{{[0-9]+}}, #1
; CHECK: tst [[REG]], #0x1
-define void @tst1() {
+define void @tst1(i1 %tst, i32 %true) {
entry:
- br i1 undef, label %for.end, label %for.body
+ br i1 %tst, label %for.end, label %for.body
for.body: ; preds = %for.body, %entry
%result.09 = phi i32 [ %add2.result.0, %for.body ], [ 1, %entry ]
%i.08 = phi i32 [ %inc, %for.body ], [ 2, %entry ]
%and = and i32 %i.08, 1
%cmp1 = icmp eq i32 %and, 0
- %add2.result.0 = select i1 %cmp1, i32 undef, i32 %result.09
+ %add2.result.0 = select i1 %cmp1, i32 %true, i32 %result.09
%inc = add nsw i32 %i.08, 1
- %cmp = icmp slt i32 %i.08, undef
+ %cmp = icmp slt i32 %i.08, %true
br i1 %cmp, label %for.body, label %for.cond.for.end_crit_edge
for.cond.for.end_crit_edge: ; preds = %for.body
diff --git a/test/CodeGen/AArch64/arm64-arith.ll b/test/CodeGen/AArch64/arm64-arith.ll
index ed9b569..f36e706 100644
--- a/test/CodeGen/AArch64/arm64-arith.ll
+++ b/test/CodeGen/AArch64/arm64-arith.ll
@@ -260,3 +260,11 @@ define i64 @f3(i64 %a) nounwind readnone ssp {
%res = mul nsw i64 %a, 17
ret i64 %res
}
+
+define i32 @f4(i32 %a) nounwind readnone ssp {
+; CHECK-LABEL: f4:
+; CHECK-NEXT: add w0, w0, w0, lsl #1
+; CHECK-NEXT: ret
+ %res = mul i32 %a, 3
+ ret i32 %res
+}
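
The new f4 test relies on strength-reducing a multiply by 3 into x + (x << 1), which fits AArch64's shifted-register add form. A scalar sketch of the equivalence (illustrative IR, not from the patch):

  %twice = shl i32 %a, 1        ; 2*x
  %res = add i32 %a, %twice     ; x + 2*x == 3*x

This is exactly what "add w0, w0, w0, lsl #1" computes in a single instruction.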
diff --git a/test/CodeGen/AArch64/arm64-atomic-128.ll b/test/CodeGen/AArch64/arm64-atomic-128.ll
index 3b43aa1..3377849 100644
--- a/test/CodeGen/AArch64/arm64-atomic-128.ll
+++ b/test/CodeGen/AArch64/arm64-atomic-128.ll
@@ -13,7 +13,8 @@ define i128 @val_compare_and_swap(i128* %p, i128 %oldval, i128 %newval) {
; CHECK: stxp [[SCRATCH_RES:w[0-9]+]], x4, x5, [x[[ADDR]]]
; CHECK: cbnz [[SCRATCH_RES]], [[LABEL]]
; CHECK: [[DONE]]:
- %val = cmpxchg i128* %p, i128 %oldval, i128 %newval acquire acquire
+ %pair = cmpxchg i128* %p, i128 %oldval, i128 %newval acquire acquire
+ %val = extractvalue { i128, i1 } %pair, 0
ret i128 %val
}
@@ -21,8 +22,10 @@ define void @fetch_and_nand(i128* %p, i128 %bits) {
; CHECK-LABEL: fetch_and_nand:
; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]:
; CHECK: ldxp [[DEST_REGLO:x[0-9]+]], [[DEST_REGHI:x[0-9]+]], [x0]
-; CHECK-DAG: bic [[SCRATCH_REGLO:x[0-9]+]], [[DEST_REGLO]], x2
-; CHECK-DAG: bic [[SCRATCH_REGHI:x[0-9]+]], [[DEST_REGHI]], x3
+; CHECK-DAG: and [[TMP_REGLO:x[0-9]+]], [[DEST_REGLO]], x2
+; CHECK-DAG: and [[TMP_REGHI:x[0-9]+]], [[DEST_REGHI]], x3
+; CHECK-DAG: mvn [[SCRATCH_REGLO:x[0-9]+]], [[TMP_REGLO]]
+; CHECK-DAG: mvn [[SCRATCH_REGHI:x[0-9]+]], [[TMP_REGHI]]
; CHECK: stlxp [[SCRATCH_RES:w[0-9]+]], [[SCRATCH_REGLO]], [[SCRATCH_REGHI]], [x0]
; CHECK: cbnz [[SCRATCH_RES]], [[LABEL]]
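
The CHECK changes above track the corrected expansion of atomicrmw nand: nand(x, y) is ~(x & y), so the exclusive-load/store loop must AND and then invert (and + mvn), rather than the old bic, which computed x & ~y. A sketch of the value computed inside the loop (illustrative IR, not literal patch content):

  %and = and i128 %old, %bits
  %new = xor i128 %and, -1      ; ~(%old & %bits), the value stored by stlxp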
diff --git a/test/CodeGen/AArch64/arm64-atomic.ll b/test/CodeGen/AArch64/arm64-atomic.ll
index aa9b284..b56f91d 100644
--- a/test/CodeGen/AArch64/arm64-atomic.ll
+++ b/test/CodeGen/AArch64/arm64-atomic.ll
@@ -10,7 +10,8 @@ define i32 @val_compare_and_swap(i32* %p) {
; CHECK: stxr [[SCRATCH_REG:w[0-9]+]], [[NEWVAL_REG]], [x0]
; CHECK: cbnz [[SCRATCH_REG]], [[LABEL]]
; CHECK: [[LABEL2]]:
- %val = cmpxchg i32* %p, i32 7, i32 4 acquire acquire
+ %pair = cmpxchg i32* %p, i32 7, i32 4 acquire acquire
+ %val = extractvalue { i32, i1 } %pair, 0
ret i32 %val
}
@@ -25,7 +26,8 @@ define i64 @val_compare_and_swap_64(i64* %p) {
; CHECK: stxr [[SCRATCH_REG:w[0-9]+]], x[[NEWVAL_REG]], [x0]
; CHECK: cbnz [[SCRATCH_REG]], [[LABEL]]
; CHECK: [[LABEL2]]:
- %val = cmpxchg i64* %p, i64 7, i64 4 monotonic monotonic
+ %pair = cmpxchg i64* %p, i64 7, i64 4 monotonic monotonic
+ %val = extractvalue { i64, i1 } %pair, 0
ret i64 %val
}
@@ -33,7 +35,8 @@ define i32 @fetch_and_nand(i32* %p) {
; CHECK-LABEL: fetch_and_nand:
; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]:
; CHECK: ldxr w[[DEST_REG:[0-9]+]], [x0]
-; CHECK: and [[SCRATCH2_REG:w[0-9]+]], w[[DEST_REG]], #0xfffffff8
+; CHECK: mvn [[TMP_REG:w[0-9]+]], w[[DEST_REG]]
+; CHECK: orr [[SCRATCH2_REG:w[0-9]+]], [[TMP_REG]], #0xfffffff8
; CHECK-NOT: stlxr [[SCRATCH2_REG]], [[SCRATCH2_REG]]
; CHECK: stlxr [[SCRATCH_REG:w[0-9]+]], [[SCRATCH2_REG]], [x0]
; CHECK: cbnz [[SCRATCH_REG]], [[LABEL]]
@@ -46,8 +49,9 @@ define i64 @fetch_and_nand_64(i64* %p) {
; CHECK-LABEL: fetch_and_nand_64:
; CHECK: mov x[[ADDR:[0-9]+]], x0
; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]:
-; CHECK: ldaxr [[DEST_REG:x[0-9]+]], [x[[ADDR]]]
-; CHECK: and [[SCRATCH2_REG:x[0-9]+]], [[DEST_REG]], #0xfffffffffffffff8
+; CHECK: ldaxr x[[DEST_REG:[0-9]+]], [x[[ADDR]]]
+; CHECK: mvn w[[TMP_REG:[0-9]+]], w[[DEST_REG]]
+; CHECK: orr [[SCRATCH2_REG:x[0-9]+]], x[[TMP_REG]], #0xfffffffffffffff8
; CHECK: stlxr [[SCRATCH_REG:w[0-9]+]], [[SCRATCH2_REG]], [x[[ADDR]]]
; CHECK: cbnz [[SCRATCH_REG]], [[LABEL]]
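
Each cmpxchg update in this file follows the same IR migration: cmpxchg now returns a { value, success } pair, so tests that only want the loaded value add an extractvalue. A minimal sketch of the new form (hypothetical function, not from the patch):

define i32 @cas_old_value(i32* %p, i32 %want, i32 %new) {
  %pair = cmpxchg i32* %p, i32 %want, i32 %new acquire acquire
  %old = extractvalue { i32, i1 } %pair, 0   ; previously the direct result
  ret i32 %old
}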
diff --git a/test/CodeGen/AArch64/arm64-build-vector.ll b/test/CodeGen/AArch64/arm64-build-vector.ll
index c109263..d0f6db0 100644
--- a/test/CodeGen/AArch64/arm64-build-vector.ll
+++ b/test/CodeGen/AArch64/arm64-build-vector.ll
@@ -33,3 +33,27 @@ define <4 x float> @foo(float %a, float %b, float %c, float %d) nounwind {
%4 = insertelement <4 x float> %3, float %d, i32 3
ret <4 x float> %4
}
+
+define <8 x i16> @build_all_zero(<8 x i16> %a) #1 {
+; CHECK-LABEL: build_all_zero:
+; CHECK: movz w[[GREG:[0-9]+]], #0xae80
+; CHECK-NEXT: fmov s[[FREG:[0-9]+]], w[[GREG]]
+; CHECK-NEXT: mul.8h v0, v0, v[[FREG]]
+ %b = add <8 x i16> %a, <i16 -32768, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef>
+ %c = mul <8 x i16> %b, <i16 -20864, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef>
+ ret <8 x i16> %c
+}
+
+; There is an optimization in the DAG combiner like the following:
+; fold (concat_vectors (BUILD_VECTOR A, B, ...), (BUILD_VECTOR C, D, ...))
+; -> (BUILD_VECTOR A, B, ..., C, D, ...)
+; This case checks that when A,B and C,D have different types, there is no
+; assertion failure.
+define <8 x i16> @concat_2_build_vector(<4 x i16> %in0) {
+; CHECK-LABEL: concat_2_build_vector:
+; CHECK: movi
+ %vshl_n = shl <4 x i16> %in0, <i16 8, i16 8, i16 8, i16 8>
+ %vshl_n2 = shl <4 x i16> %vshl_n, <i16 9, i16 9, i16 9, i16 9>
+ %shuffle.i = shufflevector <4 x i16> %vshl_n2, <4 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x i16> %shuffle.i
+}
\ No newline at end of file
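
A schematic of the fold being guarded, in illustrative SelectionDAG notation (hypothetical nodes, not part of the patch):

  t1 = BUILD_VECTOR A, B
  t2 = BUILD_VECTOR C, D
  t3 = concat_vectors t1, t2
  ; folds to: t3 = BUILD_VECTOR A, B, C, D -- only when the operand types agree

When the BUILD_VECTOR operands have different types, the combine must skip the fold rather than assert, which is what concat_2_build_vector exercises.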
diff --git a/test/CodeGen/AArch64/arm64-convert-v2f64-v2i32.ll b/test/CodeGen/AArch64/arm64-convert-v2f64-v2i32.ll
deleted file mode 100644
index d862b1e..0000000
--- a/test/CodeGen/AArch64/arm64-convert-v2f64-v2i32.ll
+++ /dev/null
@@ -1,24 +0,0 @@
-; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
-
-; CHECK: fptosi_1
-; CHECK: fcvtzs.2d
-; CHECK: xtn.2s
-; CHECK: ret
-define void @fptosi_1() nounwind noinline ssp {
-entry:
- %0 = fptosi <2 x double> undef to <2 x i32>
- store <2 x i32> %0, <2 x i32>* undef, align 8
- ret void
-}
-
-; CHECK: fptoui_1
-; CHECK: fcvtzu.2d
-; CHECK: xtn.2s
-; CHECK: ret
-define void @fptoui_1() nounwind noinline ssp {
-entry:
- %0 = fptoui <2 x double> undef to <2 x i32>
- store <2 x i32> %0, <2 x i32>* undef, align 8
- ret void
-}
-
diff --git a/test/CodeGen/AArch64/arm64-convert-v2i32-v2f64.ll b/test/CodeGen/AArch64/arm64-convert-v2i32-v2f64.ll
deleted file mode 100644
index daaf1e0..0000000
--- a/test/CodeGen/AArch64/arm64-convert-v2i32-v2f64.ll
+++ /dev/null
@@ -1,29 +0,0 @@
-; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
-
-define <2 x double> @f1(<2 x i32> %v) nounwind readnone {
-; CHECK-LABEL: f1:
-; CHECK: sshll.2d v0, v0, #0
-; CHECK-NEXT: scvtf.2d v0, v0
-; CHECK-NEXT: ret
- %conv = sitofp <2 x i32> %v to <2 x double>
- ret <2 x double> %conv
-}
-define <2 x double> @f2(<2 x i32> %v) nounwind readnone {
-; CHECK-LABEL: f2:
-; CHECK: ushll.2d v0, v0, #0
-; CHECK-NEXT: ucvtf.2d v0, v0
-; CHECK-NEXT: ret
- %conv = uitofp <2 x i32> %v to <2 x double>
- ret <2 x double> %conv
-}
-
-; CHECK: autogen_SD19655
-; CHECK: scvtf
-; CHECK: ret
-define void @autogen_SD19655() {
- %T = load <2 x i64>* undef
- %F = sitofp <2 x i64> undef to <2 x float>
- store <2 x float> %F, <2 x float>* undef
- ret void
-}
-
diff --git a/test/CodeGen/AArch64/arm64-convert-v4f64.ll b/test/CodeGen/AArch64/arm64-convert-v4f64.ll
new file mode 100644
index 0000000..7123e5e
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-convert-v4f64.ll
@@ -0,0 +1,33 @@
+; RUN: llc < %s -mtriple=aarch64-none-linux-gnu -march=arm64 | FileCheck %s
+
+
+define <4 x i16> @fptosi_v4f64_to_v4i16(<4 x double>* %ptr) {
+; CHECK: fptosi_v4f64_to_v4i16
+; CHECK-DAG: fcvtzs v[[LHS:[0-9]+]].2d, v1.2d
+; CHECK-DAG: fcvtzs v[[RHS:[0-9]+]].2d, v0.2d
+; CHECK-DAG: xtn v[[LHS_NA:[0-9]+]].2s, v[[LHS]].2d
+; CHECK-DAG: xtn v[[RHS_NA:[0-9]+]].2s, v[[RHS]].2d
+; CHECK: uzp1 v0.4h, v[[RHS_NA]].4h, v[[LHS_NA]].4h
+ %tmp1 = load <4 x double>* %ptr
+ %tmp2 = fptosi <4 x double> %tmp1 to <4 x i16>
+ ret <4 x i16> %tmp2
+}
+
+define <8 x i8> @fptosi_v4f64_to_v4i8(<8 x double>* %ptr) {
+; CHECK: fptosi_v4f64_to_v4i8
+; CHECK-DAG: fcvtzs v[[CONV3:[0-9]+]].2d, v3.2d
+; CHECK-DAG: fcvtzs v[[CONV2:[0-9]+]].2d, v2.2d
+; CHECK-DAG: fcvtzs v[[CONV1:[0-9]+]].2d, v1.2d
+; CHECK-DAG: fcvtzs v[[CONV0:[0-9]+]].2d, v0.2d
+; CHECK-DAG: xtn v[[NA3:[0-9]+]].2s, v[[CONV3]].2d
+; CHECK-DAG: xtn v[[NA2:[0-9]+]].2s, v[[CONV2]].2d
+; CHECK-DAG: xtn v[[NA1:[0-9]+]].2s, v[[CONV1]].2d
+; CHECK-DAG: xtn v[[NA0:[0-9]+]].2s, v[[CONV0]].2d
+; CHECK-DAG: uzp1 v[[TMP1:[0-9]+]].4h, v[[CONV2]].4h, v[[CONV3]].4h
+; CHECK-DAG: uzp1 v[[TMP2:[0-9]+]].4h, v[[CONV0]].4h, v[[CONV1]].4h
+; CHECK: uzp1 v0.8b, v[[TMP2]].8b, v[[TMP1]].8b
+ %tmp1 = load <8 x double>* %ptr
+ %tmp2 = fptosi <8 x double> %tmp1 to <8 x i8>
+ ret <8 x i8> %tmp2
+}
+
diff --git a/test/CodeGen/AArch64/arm64-cse.ll b/test/CodeGen/AArch64/arm64-cse.ll
index bb14c89..5d62cfe 100644
--- a/test/CodeGen/AArch64/arm64-cse.ll
+++ b/test/CodeGen/AArch64/arm64-cse.ll
@@ -1,4 +1,4 @@
-; RUN: llc -O3 < %s | FileCheck %s
+; RUN: llc -O3 < %s -aarch64-atomic-cfg-tidy=0 | FileCheck %s
target triple = "arm64-apple-ios"
; rdar://12462006
diff --git a/test/CodeGen/AArch64/arm64-dagcombiner-dead-indexed-load.ll b/test/CodeGen/AArch64/arm64-dagcombiner-dead-indexed-load.ll
index 2cf0135..6eed48b 100644
--- a/test/CodeGen/AArch64/arm64-dagcombiner-dead-indexed-load.ll
+++ b/test/CodeGen/AArch64/arm64-dagcombiner-dead-indexed-load.ll
@@ -1,5 +1,8 @@
; RUN: llc -mcpu=cyclone < %s | FileCheck %s
+; r208640 broke ppc64/Linux self-hosting; xfailing while this is worked on.
+; XFAIL: *
+
target datalayout = "e-i64:64-n32:64-S128"
target triple = "arm64-apple-ios"
diff --git a/test/CodeGen/AArch64/arm64-dagcombiner-indexed-load.ll b/test/CodeGen/AArch64/arm64-dagcombiner-indexed-load.ll
index 2e4b658..ce132c6 100644
--- a/test/CodeGen/AArch64/arm64-dagcombiner-indexed-load.ll
+++ b/test/CodeGen/AArch64/arm64-dagcombiner-indexed-load.ll
@@ -13,12 +13,12 @@ target triple = "arm64-apple-ios"
; CHECK-LABEL: XX:
; CHECK: ldr
-define void @XX(%class.A* %K) {
+define i32 @XX(%class.A* %K, i1 %tst, i32* %addr, %class.C** %ppC, %class.C* %pC) {
entry:
- br i1 undef, label %if.then, label %lor.rhs.i
+ br i1 %tst, label %if.then, label %lor.rhs.i
lor.rhs.i: ; preds = %entry
- %tmp = load i32* undef, align 4
+ %tmp = load i32* %addr, align 4
%y.i.i.i = getelementptr inbounds %class.A* %K, i64 0, i32 1
%tmp1 = load i64* %y.i.i.i, align 8
%U.sroa.3.8.extract.trunc.i = trunc i64 %tmp1 to i32
@@ -30,17 +30,17 @@ lor.rhs.i: ; preds = %entry
%add16.i = add nsw i32 %add12.i, %div15.i
%rem.i.i = srem i32 %add16.i, %tmp
%idxprom = sext i32 %rem.i.i to i64
- %arrayidx = getelementptr inbounds %class.C** undef, i64 %idxprom
- %tobool533 = icmp eq %class.C* undef, null
+ %arrayidx = getelementptr inbounds %class.C** %ppC, i64 %idxprom
+ %tobool533 = icmp eq %class.C* %pC, null
br i1 %tobool533, label %while.end, label %while.body
if.then: ; preds = %entry
- unreachable
+ ret i32 42
while.body: ; preds = %lor.rhs.i
- unreachable
+ ret i32 5
while.end: ; preds = %lor.rhs.i
%tmp3 = load %class.C** %arrayidx, align 8
- unreachable
+ ret i32 50
}
diff --git a/test/CodeGen/AArch64/arm64-early-ifcvt.ll b/test/CodeGen/AArch64/arm64-early-ifcvt.ll
index 17d783a..44150c2 100644
--- a/test/CodeGen/AArch64/arm64-early-ifcvt.ll
+++ b/test/CodeGen/AArch64/arm64-early-ifcvt.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -stress-early-ifcvt | FileCheck %s
+; RUN: llc < %s -stress-early-ifcvt -aarch64-atomic-cfg-tidy=0 | FileCheck %s
target triple = "arm64-apple-macosx"
; CHECK: mm2
diff --git a/test/CodeGen/AArch64/arm64-fast-isel-intrinsic.ll b/test/CodeGen/AArch64/arm64-fast-isel-intrinsic.ll
index a3d5f6c..1152988 100644
--- a/test/CodeGen/AArch64/arm64-fast-isel-intrinsic.ll
+++ b/test/CodeGen/AArch64/arm64-fast-isel-intrinsic.ll
@@ -133,3 +133,16 @@ define void @t8() {
call void @llvm.memcpy.p0i8.p0i8.i64(i8* getelementptr inbounds ([80 x i8]* @temp, i32 0, i32 0), i8* getelementptr inbounds ([80 x i8]* @message, i32 0, i32 0), i64 4, i32 1, i1 false)
ret void
}
+
+define void @test_distant_memcpy(i8* %dst) {
+; ARM64-LABEL: test_distant_memcpy:
+; ARM64: mov [[ARRAY:x[0-9]+]], sp
+; ARM64: movz [[OFFSET:x[0-9]+]], #0x1f40
+; ARM64: add x[[ADDR:[0-9]+]], [[ARRAY]], [[OFFSET]]
+; ARM64: ldrb [[BYTE:w[0-9]+]], [x[[ADDR]]]
+; ARM64: strb [[BYTE]], [x0]
+ %array = alloca i8, i32 8192
+ %elem = getelementptr i8* %array, i32 8000
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %elem, i64 1, i32 1, i1 false)
+ ret void
+}
diff --git a/test/CodeGen/AArch64/arm64-fp128.ll b/test/CodeGen/AArch64/arm64-fp128.ll
index 57bbb93..b1d5010 100644
--- a/test/CodeGen/AArch64/arm64-fp128.ll
+++ b/test/CodeGen/AArch64/arm64-fp128.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=arm64-linux-gnu -verify-machineinstrs -mcpu=cyclone < %s | FileCheck %s
+; RUN: llc -mtriple=arm64-linux-gnu -verify-machineinstrs -mcpu=cyclone -aarch64-atomic-cfg-tidy=0 < %s | FileCheck %s
@lhs = global fp128 zeroinitializer, align 16
@rhs = global fp128 zeroinitializer, align 16
diff --git a/test/CodeGen/AArch64/arm64-frame-index.ll b/test/CodeGen/AArch64/arm64-frame-index.ll
index 4a91ff3..321f335 100644
--- a/test/CodeGen/AArch64/arm64-frame-index.ll
+++ b/test/CodeGen/AArch64/arm64-frame-index.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=arm64 -mtriple=arm64-apple-ios < %s | FileCheck %s
+; RUN: llc -march=arm64 -mtriple=arm64-apple-ios -aarch64-atomic-cfg-tidy=0 < %s | FileCheck %s
; rdar://11935841
define void @t1() nounwind ssp {
diff --git a/test/CodeGen/AArch64/arm64-misched-basic-A53.ll b/test/CodeGen/AArch64/arm64-misched-basic-A53.ll
index f88bd6a..bc7ed7f 100644
--- a/test/CodeGen/AArch64/arm64-misched-basic-A53.ll
+++ b/test/CodeGen/AArch64/arm64-misched-basic-A53.ll
@@ -122,3 +122,82 @@ define { <16 x i8>, <16 x i8> } @test_v16i8_post_imm_ld2(i8* %A, i8** %ptr) {
}
declare { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2.v16i8.p0i8(i8*)
+
+; Regression Test for PR20057.
+;
+; Cortex-A53 machine model stalls on A53UnitFPMDS contention. Instructions that
+; are otherwise ready are jammed in the pending queue.
+; CHECK: ********** MI Scheduling **********
+; CHECK: testResourceConflict
+; CHECK: *** Final schedule for BB#0 ***
+; CHECK: BRK
+; CHECK: ********** INTERVALS **********
+define void @testResourceConflict(float* %ptr) {
+entry:
+ %add1 = fadd float undef, undef
+ %mul2 = fmul float undef, undef
+ %add3 = fadd float %mul2, undef
+ %mul4 = fmul float undef, %add3
+ %add5 = fadd float %mul4, undef
+ %sub6 = fsub float 0.000000e+00, undef
+ %sub7 = fsub float %add5, undef
+ %div8 = fdiv float 1.000000e+00, undef
+ %mul9 = fmul float %div8, %sub7
+ %mul14 = fmul float %sub6, %div8
+ %mul10 = fsub float -0.000000e+00, %mul14
+ %mul15 = fmul float undef, %div8
+ %mul11 = fsub float -0.000000e+00, %mul15
+ %mul12 = fmul float 0.000000e+00, %div8
+ %mul13 = fmul float %add1, %mul9
+ %mul21 = fmul float %add5, %mul11
+ %add22 = fadd float %mul13, %mul21
+ store float %add22, float* %ptr, align 4
+ %mul28 = fmul float %add1, %mul10
+ %mul33 = fmul float %add5, %mul12
+ %add34 = fadd float %mul33, %mul28
+ store float %add34, float* %ptr, align 4
+ %mul240 = fmul float undef, %mul9
+ %add246 = fadd float %mul240, undef
+ store float %add246, float* %ptr, align 4
+ %mul52 = fmul float undef, %mul10
+ %mul57 = fmul float undef, %mul12
+ %add58 = fadd float %mul57, %mul52
+ store float %add58, float* %ptr, align 4
+ %mul27 = fmul float 0.000000e+00, %mul9
+ %mul81 = fmul float undef, %mul10
+ %add82 = fadd float %mul27, %mul81
+ store float %add82, float* %ptr, align 4
+ call void @llvm.trap()
+ unreachable
+}
+
+declare void @llvm.trap()
+
+; Regression test for PR20057: "permanent hazard".
+; Resource contention on LDST.
+; CHECK: ********** MI Scheduling **********
+; CHECK: testLdStConflict
+; CHECK: *** Final schedule for BB#1 ***
+; CHECK: LD4Fourv2d
+; CHECK: STRQui
+; CHECK: ********** INTERVALS **********
+define void @testLdStConflict() {
+entry:
+ br label %loop
+
+loop:
+ %0 = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4.v2i64.p0i8(i8* null)
+ %ptr = bitcast i8* undef to <2 x i64>*
+ store <2 x i64> zeroinitializer, <2 x i64>* %ptr, align 4
+ %ptr1 = bitcast i8* undef to <2 x i64>*
+ store <2 x i64> zeroinitializer, <2 x i64>* %ptr1, align 4
+ %ptr2 = bitcast i8* undef to <2 x i64>*
+ store <2 x i64> zeroinitializer, <2 x i64>* %ptr2, align 4
+ %ptr3 = bitcast i8* undef to <2 x i64>*
+ store <2 x i64> zeroinitializer, <2 x i64>* %ptr3, align 4
+ %ptr4 = bitcast i8* undef to <2 x i64>*
+ store <2 x i64> zeroinitializer, <2 x i64>* %ptr4, align 4
+ br label %loop
+}
+
+declare { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4.v2i64.p0i8(i8*)
diff --git a/test/CodeGen/AArch64/arm64-misched-basic-A57.ll b/test/CodeGen/AArch64/arm64-misched-basic-A57.ll
new file mode 100644
index 0000000..238474a
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-misched-basic-A57.ll
@@ -0,0 +1,112 @@
+; REQUIRES: asserts
+;
+; The Cortex-A57 machine model will avoid scheduling load instructions in
+; succession because loads on the A57 have a latency of 4 cycles and they all
+; issue to the same pipeline. Instead, it will move other instructions between
+; the loads to avoid unnecessary stalls. The generic machine model schedules the
+; 4 loads consecutively for this case, which causes stalls.
+;
+; RUN: llc < %s -mtriple=arm64-linux-gnu -mcpu=cortex-a57 -enable-misched -verify-misched -debug-only=misched -o - 2>&1 > /dev/null | FileCheck %s
+; CHECK: ********** MI Scheduling **********
+; CHECK: main:BB#2
+; CHECK: LDR
+; CHECK: Latency : 4
+; CHECK: *** Final schedule for BB#2 ***
+; CHECK: LDR
+; CHECK: LDR
+; CHECK-NOT: LDR
+; CHECK: {{.*}}
+; CHECK: ********** MI Scheduling **********
+
+@main.x = private unnamed_addr constant [8 x i32] [i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1], align 4
+@main.y = private unnamed_addr constant [8 x i32] [i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2], align 4
+
+; Function Attrs: nounwind
+define i32 @main() #0 {
+entry:
+ %retval = alloca i32, align 4
+ %x = alloca [8 x i32], align 4
+ %y = alloca [8 x i32], align 4
+ %i = alloca i32, align 4
+ %xx = alloca i32, align 4
+ %yy = alloca i32, align 4
+ store i32 0, i32* %retval
+ %0 = bitcast [8 x i32]* %x to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* bitcast ([8 x i32]* @main.x to i8*), i64 32, i32 4, i1 false)
+ %1 = bitcast [8 x i32]* %y to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* %1, i8* bitcast ([8 x i32]* @main.y to i8*), i64 32, i32 4, i1 false)
+ store i32 0, i32* %xx, align 4
+ store i32 0, i32* %yy, align 4
+ store i32 0, i32* %i, align 4
+ br label %for.cond
+
+for.cond: ; preds = %for.inc, %entry
+ %2 = load i32* %i, align 4
+ %cmp = icmp slt i32 %2, 8
+ br i1 %cmp, label %for.body, label %for.end
+
+for.body: ; preds = %for.cond
+ %3 = load i32* %yy, align 4
+ %4 = load i32* %i, align 4
+ %idxprom = sext i32 %4 to i64
+ %arrayidx = getelementptr inbounds [8 x i32]* %x, i32 0, i64 %idxprom
+ %5 = load i32* %arrayidx, align 4
+ %add = add nsw i32 %5, 1
+ store i32 %add, i32* %xx, align 4
+ %6 = load i32* %xx, align 4
+ %add1 = add nsw i32 %6, 12
+ store i32 %add1, i32* %xx, align 4
+ %7 = load i32* %xx, align 4
+ %add2 = add nsw i32 %7, 23
+ store i32 %add2, i32* %xx, align 4
+ %8 = load i32* %xx, align 4
+ %add3 = add nsw i32 %8, 34
+ store i32 %add3, i32* %xx, align 4
+ %9 = load i32* %i, align 4
+ %idxprom4 = sext i32 %9 to i64
+ %arrayidx5 = getelementptr inbounds [8 x i32]* %y, i32 0, i64 %idxprom4
+ %10 = load i32* %arrayidx5, align 4
+
+ %add4 = add nsw i32 %9, %add
+ %add5 = add nsw i32 %10, %add1
+ %add6 = add nsw i32 %add4, %add5
+
+ %add7 = add nsw i32 %9, %add3
+ %add8 = add nsw i32 %10, %add4
+ %add9 = add nsw i32 %add7, %add8
+
+ %add10 = add nsw i32 %9, %add6
+ %add11 = add nsw i32 %10, %add7
+ %add12 = add nsw i32 %add10, %add11
+
+ %add13 = add nsw i32 %9, %add9
+ %add14 = add nsw i32 %10, %add10
+ %add15 = add nsw i32 %add13, %add14
+
+ store i32 %add15, i32* %xx, align 4
+
+ %div = sdiv i32 %4, %5
+
+ store i32 %div, i32* %yy, align 4
+
+ br label %for.inc
+
+for.inc: ; preds = %for.body
+ %11 = load i32* %i, align 4
+ %inc = add nsw i32 %11, 1
+ store i32 %inc, i32* %i, align 4
+ br label %for.cond
+
+for.end: ; preds = %for.cond
+ %12 = load i32* %xx, align 4
+ %13 = load i32* %yy, align 4
+ %add67 = add nsw i32 %12, %13
+ ret i32 %add67
+}
+
+
+; Function Attrs: nounwind
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i32, i1) #1
+
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind }
diff --git a/test/CodeGen/AArch64/arm64-misched-forwarding-A53.ll b/test/CodeGen/AArch64/arm64-misched-forwarding-A53.ll
index 97bfb5c..07373cc 100644
--- a/test/CodeGen/AArch64/arm64-misched-forwarding-A53.ll
+++ b/test/CodeGen/AArch64/arm64-misched-forwarding-A53.ll
@@ -6,9 +6,10 @@
;
; CHECK: ********** MI Scheduling **********
; CHECK: shiftable
-; CHECK: *** Final schedule for BB#0 ***
-; CHECK: ADDXrr %vreg0, %vreg2
-; CHECK: ADDXrs %vreg0, %vreg2, 5
+; CHECK: SU(2): %vreg2<def> = SUBXri %vreg1, 20, 0
+; CHECK: Successors:
+; CHECK-NEXT: val SU(4): Latency=1 Reg=%vreg2
+; CHECK-NEXT: val SU(3): Latency=2 Reg=%vreg2
; CHECK: ********** INTERVALS **********
define i64 @shiftable(i64 %A, i64 %B) {
%tmp0 = sub i64 %B, 20
diff --git a/test/CodeGen/AArch64/arm64-neon-copy.ll b/test/CodeGen/AArch64/arm64-neon-copy.ll
index cfc2ebf..1cfba82 100644
--- a/test/CodeGen/AArch64/arm64-neon-copy.ll
+++ b/test/CodeGen/AArch64/arm64-neon-copy.ll
@@ -842,7 +842,7 @@ define <2 x i64> @scalar_to_vector.v2i64(i64 %a) {
define <8 x i8> @testDUP.v1i8(<1 x i8> %a) {
; CHECK-LABEL: testDUP.v1i8:
-; CHECK: dup {{v[0-9]+}}.8b, {{w[0-9]+}}
+; CHECK: dup v0.8b, v0.b[0]
%b = extractelement <1 x i8> %a, i32 0
%c = insertelement <8 x i8> undef, i8 %b, i32 0
%d = insertelement <8 x i8> %c, i8 %b, i32 1
@@ -857,7 +857,7 @@ define <8 x i8> @testDUP.v1i8(<1 x i8> %a) {
define <8 x i16> @testDUP.v1i16(<1 x i16> %a) {
; CHECK-LABEL: testDUP.v1i16:
-; CHECK: dup {{v[0-9]+}}.8h, {{w[0-9]+}}
+; CHECK: dup v0.8h, v0.h[0]
%b = extractelement <1 x i16> %a, i32 0
%c = insertelement <8 x i16> undef, i16 %b, i32 0
%d = insertelement <8 x i16> %c, i16 %b, i32 1
@@ -872,7 +872,7 @@ define <8 x i16> @testDUP.v1i16(<1 x i16> %a) {
define <4 x i32> @testDUP.v1i32(<1 x i32> %a) {
; CHECK-LABEL: testDUP.v1i32:
-; CHECK: dup {{v[0-9]+}}.4s, {{w[0-9]+}}
+; CHECK: dup v0.4s, v0.s[0]
%b = extractelement <1 x i32> %a, i32 0
%c = insertelement <4 x i32> undef, i32 %b, i32 0
%d = insertelement <4 x i32> %c, i32 %b, i32 1
@@ -1411,35 +1411,35 @@ define <16 x i8> @concat_vector_v16i8_const() {
define <4 x i16> @concat_vector_v4i16(<1 x i16> %a) {
; CHECK-LABEL: concat_vector_v4i16:
-; CHECK: dup {{v[0-9]+}}.4h, {{w[0-9]+}}
+; CHECK: dup v0.4h, v0.h[0]
%r = shufflevector <1 x i16> %a, <1 x i16> undef, <4 x i32> zeroinitializer
ret <4 x i16> %r
}
define <4 x i32> @concat_vector_v4i32(<1 x i32> %a) {
; CHECK-LABEL: concat_vector_v4i32:
-; CHECK: dup {{v[0-9]+}}.4s, {{w[0-9]+}}
+; CHECK: dup v0.4s, v0.s[0]
%r = shufflevector <1 x i32> %a, <1 x i32> undef, <4 x i32> zeroinitializer
ret <4 x i32> %r
}
define <8 x i8> @concat_vector_v8i8(<1 x i8> %a) {
; CHECK-LABEL: concat_vector_v8i8:
-; CHECK: dup {{v[0-9]+}}.8b, {{w[0-9]+}}
+; CHECK: dup v0.8b, v0.b[0]
%r = shufflevector <1 x i8> %a, <1 x i8> undef, <8 x i32> zeroinitializer
ret <8 x i8> %r
}
define <8 x i16> @concat_vector_v8i16(<1 x i16> %a) {
; CHECK-LABEL: concat_vector_v8i16:
-; CHECK: dup {{v[0-9]+}}.8h, {{w[0-9]+}}
+; CHECK: dup v0.8h, v0.h[0]
%r = shufflevector <1 x i16> %a, <1 x i16> undef, <8 x i32> zeroinitializer
ret <8 x i16> %r
}
define <16 x i8> @concat_vector_v16i8(<1 x i8> %a) {
; CHECK-LABEL: concat_vector_v16i8:
-; CHECK: dup {{v[0-9]+}}.16b, {{w[0-9]+}}
+; CHECK: dup v0.16b, v0.b[0]
%r = shufflevector <1 x i8> %a, <1 x i8> undef, <16 x i32> zeroinitializer
ret <16 x i8> %r
}
diff --git a/test/CodeGen/AArch64/arm64-neon-select_cc.ll b/test/CodeGen/AArch64/arm64-neon-select_cc.ll
index 255b90d..95c582a 100644
--- a/test/CodeGen/AArch64/arm64-neon-select_cc.ll
+++ b/test/CodeGen/AArch64/arm64-neon-select_cc.ll
@@ -136,8 +136,8 @@ define <2x i64> @test_select_cc_v2i64(i64 %a, i64 %b, <2x i64> %c, <2x i64> %d )
define <1 x float> @test_select_cc_v1f32(float %a, float %b, <1 x float> %c, <1 x float> %d ) {
; CHECK-LABEL: test_select_cc_v1f32:
-; CHECK: fcmp s0, s1
-; CHECK-NEXT: fcsel s0, s2, s3, eq
+; CHECK: fcmeq [[MASK:v[0-9]+]].2s, v0.2s, v1.2s
+; CHECK-NEXT: bsl [[MASK]].8b, v2.8b, v3.8b
%cmp31 = fcmp oeq float %a, %b
%e = select i1 %cmp31, <1 x float> %c, <1 x float> %d
ret <1 x float> %e
diff --git a/test/CodeGen/AArch64/arm64-shrink-v1i64.ll b/test/CodeGen/AArch64/arm64-shrink-v1i64.ll
new file mode 100644
index 0000000..f31a570
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-shrink-v1i64.ll
@@ -0,0 +1,14 @@
+; RUN: llc -march=arm64 < %s
+
+; The DAGCombiner tries to do the following shrink:
+; Convert x+y to (VT)((SmallVT)x+(SmallVT)y)
+; But currently it can't handle vector types and will trigger an assertion failure
+; when it tries to generate an add mixing a vector type and a scalar type.
+; This test checks that no such assertion failure happens.
+define <1 x i64> @dotest(<1 x i64> %in0) {
+entry:
+ %0 = add <1 x i64> %in0, %in0
+ %vshl_n = shl <1 x i64> %0, <i64 32>
+ %vsra_n = ashr <1 x i64> %vshl_n, <i64 32>
+ ret <1 x i64> %vsra_n
+}
diff --git a/test/CodeGen/AArch64/arm64-2014-04-28-sqshl-uqshl-i64Contant.ll b/test/CodeGen/AArch64/arm64-sqshl-uqshl-i64Contant.ll
index 3949b85..3949b85 100644
--- a/test/CodeGen/AArch64/arm64-2014-04-28-sqshl-uqshl-i64Contant.ll
+++ b/test/CodeGen/AArch64/arm64-sqshl-uqshl-i64Contant.ll
diff --git a/test/CodeGen/AArch64/arm64-vcvt.ll b/test/CodeGen/AArch64/arm64-vcvt.ll
index 8c9e4e9..6570f0e 100644
--- a/test/CodeGen/AArch64/arm64-vcvt.ll
+++ b/test/CodeGen/AArch64/arm64-vcvt.ll
@@ -665,19 +665,19 @@ define <2 x double> @ucvtf_2dc(<2 x i64> %A) nounwind {
;CHECK-LABEL: autogen_SD28458:
;CHECK: fcvt
;CHECK: ret
-define void @autogen_SD28458() {
- %Tr53 = fptrunc <8 x double> undef to <8 x float>
- store <8 x float> %Tr53, <8 x float>* undef
+define void @autogen_SD28458(<8 x double> %val.f64, <8 x float>* %addr.f32) {
+ %Tr53 = fptrunc <8 x double> %val.f64 to <8 x float>
+ store <8 x float> %Tr53, <8 x float>* %addr.f32
ret void
}
;CHECK-LABEL: autogen_SD19225:
;CHECK: fcvt
;CHECK: ret
-define void @autogen_SD19225() {
- %A = load <8 x float>* undef
+define void @autogen_SD19225(<8 x double>* %addr.f64, <8 x float>* %addr.f32) {
+ %A = load <8 x float>* %addr.f32
%Tr53 = fpext <8 x float> %A to <8 x double>
- store <8 x double> %Tr53, <8 x double>* undef
+ store <8 x double> %Tr53, <8 x double>* %addr.f64
ret void
}
diff --git a/test/CodeGen/AArch64/arm64-vshift.ll b/test/CodeGen/AArch64/arm64-vshift.ll
index 82ae486..65bd50c 100644
--- a/test/CodeGen/AArch64/arm64-vshift.ll
+++ b/test/CodeGen/AArch64/arm64-vshift.ll
@@ -1313,6 +1313,15 @@ define <8 x i8> @uqshli8b(<8 x i8>* %A) nounwind {
ret <8 x i8> %tmp3
}
+define <8 x i8> @uqshli8b_1(<8 x i8>* %A) nounwind {
+;CHECK-LABEL: uqshli8b_1:
+;CHECK: movi.8b [[REG:v[0-9]+]], #0x8
+;CHECK: uqshl.8b v0, v0, [[REG]]
+ %tmp1 = load <8 x i8>* %A
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.uqshl.v8i8(<8 x i8> %tmp1, <8 x i8> <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>)
+ ret <8 x i8> %tmp3
+}
+
define <4 x i16> @uqshli4h(<4 x i16>* %A) nounwind {
;CHECK-LABEL: uqshli4h:
;CHECK: uqshl.4h v0, {{v[0-9]+}}, #1
diff --git a/test/CodeGen/AArch64/arm64-xaluo.ll b/test/CodeGen/AArch64/arm64-xaluo.ll
index 6cffbde..0c300de 100644
--- a/test/CodeGen/AArch64/arm64-xaluo.ll
+++ b/test/CodeGen/AArch64/arm64-xaluo.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm64 | FileCheck %s
+; RUN: llc < %s -march=arm64 -aarch64-atomic-cfg-tidy=0 | FileCheck %s
;
; Get the actual value of the overflow bit.
diff --git a/test/CodeGen/AArch64/atomic-ops.ll b/test/CodeGen/AArch64/atomic-ops.ll
index 58b5d1d..26301b9 100644
--- a/test/CodeGen/AArch64/atomic-ops.ll
+++ b/test/CodeGen/AArch64/atomic-ops.ll
@@ -878,7 +878,9 @@ define i64 @test_atomic_load_umax_i64(i64 %offset) nounwind {
define i8 @test_atomic_cmpxchg_i8(i8 %wanted, i8 %new) nounwind {
; CHECK-LABEL: test_atomic_cmpxchg_i8:
- %old = cmpxchg i8* @var8, i8 %wanted, i8 %new acquire acquire
+ %pair = cmpxchg i8* @var8, i8 %wanted, i8 %new acquire acquire
+ %old = extractvalue { i8, i1 } %pair, 0
+
; CHECK-NOT: dmb
; CHECK: adrp [[TMPADDR:x[0-9]+]], var8
; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var8
@@ -889,8 +891,7 @@ define i8 @test_atomic_cmpxchg_i8(i8 %wanted, i8 %new) nounwind {
; function there.
; CHECK-NEXT: cmp w[[OLD]], w0
; CHECK-NEXT: b.ne [[GET_OUT:.LBB[0-9]+_[0-9]+]]
- ; As above, w1 is a reasonable guess.
-; CHECK: stxrb [[STATUS:w[0-9]+]], w1, [x[[ADDR]]]
+; CHECK: stxrb [[STATUS:w[0-9]+]], {{w[0-9]+}}, [x[[ADDR]]]
; CHECK-NEXT: cbnz [[STATUS]], [[STARTAGAIN]]
; CHECK-NOT: dmb
@@ -900,7 +901,9 @@ define i8 @test_atomic_cmpxchg_i8(i8 %wanted, i8 %new) nounwind {
define i16 @test_atomic_cmpxchg_i16(i16 %wanted, i16 %new) nounwind {
; CHECK-LABEL: test_atomic_cmpxchg_i16:
- %old = cmpxchg i16* @var16, i16 %wanted, i16 %new seq_cst seq_cst
+ %pair = cmpxchg i16* @var16, i16 %wanted, i16 %new seq_cst seq_cst
+ %old = extractvalue { i16, i1 } %pair, 0
+
; CHECK-NOT: dmb
; CHECK: adrp [[TMPADDR:x[0-9]+]], var16
; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var16
@@ -911,8 +914,7 @@ define i16 @test_atomic_cmpxchg_i16(i16 %wanted, i16 %new) nounwind {
; function there.
; CHECK-NEXT: cmp w[[OLD]], w0
; CHECK-NEXT: b.ne [[GET_OUT:.LBB[0-9]+_[0-9]+]]
- ; As above, w1 is a reasonable guess.
-; CHECK: stlxrh [[STATUS:w[0-9]+]], w1, [x[[ADDR]]]
+; CHECK: stlxrh [[STATUS:w[0-9]+]], {{w[0-9]+}}, [x[[ADDR]]]
; CHECK-NEXT: cbnz [[STATUS]], [[STARTAGAIN]]
; CHECK-NOT: dmb
@@ -922,7 +924,9 @@ define i16 @test_atomic_cmpxchg_i16(i16 %wanted, i16 %new) nounwind {
define i32 @test_atomic_cmpxchg_i32(i32 %wanted, i32 %new) nounwind {
; CHECK-LABEL: test_atomic_cmpxchg_i32:
- %old = cmpxchg i32* @var32, i32 %wanted, i32 %new release monotonic
+ %pair = cmpxchg i32* @var32, i32 %wanted, i32 %new release monotonic
+ %old = extractvalue { i32, i1 } %pair, 0
+
; CHECK-NOT: dmb
; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32
@@ -933,8 +937,7 @@ define i32 @test_atomic_cmpxchg_i32(i32 %wanted, i32 %new) nounwind {
; function there.
; CHECK-NEXT: cmp w[[OLD]], w0
; CHECK-NEXT: b.ne [[GET_OUT:.LBB[0-9]+_[0-9]+]]
- ; As above, w1 is a reasonable guess.
-; CHECK: stlxr [[STATUS:w[0-9]+]], w1, [x[[ADDR]]]
+; CHECK: stlxr [[STATUS:w[0-9]+]], {{w[0-9]+}}, [x[[ADDR]]]
; CHECK-NEXT: cbnz [[STATUS]], [[STARTAGAIN]]
; CHECK-NOT: dmb
@@ -944,7 +947,9 @@ define i32 @test_atomic_cmpxchg_i32(i32 %wanted, i32 %new) nounwind {
define void @test_atomic_cmpxchg_i64(i64 %wanted, i64 %new) nounwind {
; CHECK-LABEL: test_atomic_cmpxchg_i64:
- %old = cmpxchg i64* @var64, i64 %wanted, i64 %new monotonic monotonic
+ %pair = cmpxchg i64* @var64, i64 %wanted, i64 %new monotonic monotonic
+ %old = extractvalue { i64, i1 } %pair, 0
+
; CHECK-NOT: dmb
; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64
diff --git a/test/CodeGen/AArch64/blockaddress.ll b/test/CodeGen/AArch64/blockaddress.ll
index 1eec4cc..3a5dbdc 100644
--- a/test/CodeGen/AArch64/blockaddress.ll
+++ b/test/CodeGen/AArch64/blockaddress.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -code-model=large -mtriple=aarch64-none-linux-gnu -verify-machineinstrs < %s | FileCheck --check-prefix=CHECK-LARGE %s
+; RUN: llc -mtriple=aarch64-none-linux-gnu -aarch64-atomic-cfg-tidy=0 -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -code-model=large -mtriple=aarch64-none-linux-gnu -aarch64-atomic-cfg-tidy=0 -verify-machineinstrs < %s | FileCheck --check-prefix=CHECK-LARGE %s
@addr = global i8* null
diff --git a/test/CodeGen/AArch64/branch-relax-asm.ll b/test/CodeGen/AArch64/branch-relax-asm.ll
new file mode 100644
index 0000000..7409c84
--- /dev/null
+++ b/test/CodeGen/AArch64/branch-relax-asm.ll
@@ -0,0 +1,35 @@
+; RUN: llc -mtriple=aarch64-apple-ios7.0 -disable-block-placement -aarch64-tbz-offset-bits=4 -o - %s | FileCheck %s
+define i32 @test_asm_length(i32 %in) {
+; CHECK-LABEL: test_asm_length:
+
+ ; It would be more natural to use just one "tbnz %false" here, but if the
+ ; number of instructions in the asm is counted reasonably, that block is out
+ ; of the limited range we gave tbz. So branch relaxation has to invert the
+ ; condition.
+; CHECK: tbz w0, #0, [[TRUE:LBB[0-9]+_[0-9]+]]
+; CHECK: b [[FALSE:LBB[0-9]+_[0-9]+]]
+
+; CHECK: [[TRUE]]:
+; CHECK: orr w0, wzr, #0x4
+; CHECK: nop
+; CHECK: nop
+; CHECK: nop
+; CHECK: nop
+; CHECK: nop
+; CHECK: nop
+; CHECK: ret
+
+; CHECK: [[FALSE]]:
+; CHECK: ret
+
+ %val = and i32 %in, 1
+ %tst = icmp eq i32 %val, 0
+ br i1 %tst, label %true, label %false
+
+true:
+ call void asm sideeffect "nop\0A\09nop\0A\09nop\0A\09nop\0A\09nop\0A\09nop", ""()
+ ret i32 4
+
+false:
+ ret i32 0
+}
diff --git a/test/CodeGen/AArch64/breg.ll b/test/CodeGen/AArch64/breg.ll
index 591f483..9524044 100644
--- a/test/CodeGen/AArch64/breg.ll
+++ b/test/CodeGen/AArch64/breg.ll
@@ -1,4 +1,4 @@
-; RUN: llc -verify-machineinstrs -o - %s -mtriple=aarch64-linux-gnu | FileCheck %s
+; RUN: llc -verify-machineinstrs -o - %s -mtriple=aarch64-linux-gnu -aarch64-atomic-cfg-tidy=0 | FileCheck %s
@stored_label = global i8* null
diff --git a/test/CodeGen/AArch64/cmpxchg-idioms.ll b/test/CodeGen/AArch64/cmpxchg-idioms.ll
new file mode 100644
index 0000000..0c008c2
--- /dev/null
+++ b/test/CodeGen/AArch64/cmpxchg-idioms.ll
@@ -0,0 +1,93 @@
+; RUN: llc -mtriple=aarch64-apple-ios7.0 -o - %s | FileCheck %s
+
+define i32 @test_return(i32* %p, i32 %oldval, i32 %newval) {
+; CHECK-LABEL: test_return:
+
+; CHECK: [[LOOP:LBB[0-9]+_[0-9]+]]:
+; CHECK: ldaxr [[LOADED:w[0-9]+]], [x0]
+; CHECK: cmp [[LOADED]], w1
+; CHECK: b.ne [[FAILED:LBB[0-9]+_[0-9]+]]
+
+; CHECK: stlxr [[STATUS:w[0-9]+]], {{w[0-9]+}}, [x0]
+; CHECK: cbnz [[STATUS]], [[LOOP]]
+
+; CHECK-NOT: cmp {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: orr w0, wzr, #0x1
+; CHECK: ret
+
+; CHECK: [[FAILED]]:
+; CHECK-NOT: cmp {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: mov w0, wzr
+; CHECK: ret
+
+ %pair = cmpxchg i32* %p, i32 %oldval, i32 %newval seq_cst seq_cst
+ %success = extractvalue { i32, i1 } %pair, 1
+ %conv = zext i1 %success to i32
+ ret i32 %conv
+}
+
+define i1 @test_return_bool(i8* %value, i8 %oldValue, i8 %newValue) {
+; CHECK-LABEL: test_return_bool:
+
+; CHECK: [[LOOP:LBB[0-9]+_[0-9]+]]:
+; CHECK: ldaxrb [[LOADED:w[0-9]+]], [x0]
+; CHECK: cmp [[LOADED]], w1, uxtb
+; CHECK: b.ne [[FAILED:LBB[0-9]+_[0-9]+]]
+
+; CHECK: stlxrb [[STATUS:w[0-9]+]], {{w[0-9]+}}, [x0]
+; CHECK: cbnz [[STATUS]], [[LOOP]]
+
+; CHECK-NOT: cmp {{w[0-9]+}}, {{w[0-9]+}}
+ ; FIXME: DAG combine should be able to deal with this.
+; CHECK: orr [[TMP:w[0-9]+]], wzr, #0x1
+; CHECK: eor w0, [[TMP]], #0x1
+; CHECK: ret
+
+; CHECK: [[FAILED]]:
+; CHECK-NOT: cmp {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: mov [[TMP:w[0-9]+]], wzr
+; CHECK: eor w0, [[TMP]], #0x1
+; CHECK: ret
+
+ %pair = cmpxchg i8* %value, i8 %oldValue, i8 %newValue acq_rel monotonic
+ %success = extractvalue { i8, i1 } %pair, 1
+ %failure = xor i1 %success, 1
+ ret i1 %failure
+}
+
+define void @test_conditional(i32* %p, i32 %oldval, i32 %newval) {
+; CHECK-LABEL: test_conditional:
+
+; CHECK: [[LOOP:LBB[0-9]+_[0-9]+]]:
+; CHECK: ldaxr [[LOADED:w[0-9]+]], [x0]
+; CHECK: cmp [[LOADED]], w1
+; CHECK: b.ne [[FAILED:LBB[0-9]+_[0-9]+]]
+
+; CHECK: stlxr [[STATUS:w[0-9]+]], w2, [x0]
+; CHECK: cbnz [[STATUS]], [[LOOP]]
+
+; CHECK-NOT: cmp {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: b _bar
+
+; CHECK: [[FAILED]]:
+; CHECK-NOT: cmp {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: b _baz
+
+ %pair = cmpxchg i32* %p, i32 %oldval, i32 %newval seq_cst seq_cst
+ %success = extractvalue { i32, i1 } %pair, 1
+ br i1 %success, label %true, label %false
+
+true:
+ tail call void @bar() #2
+ br label %end
+
+false:
+ tail call void @baz() #2
+ br label %end
+
+end:
+ ret void
+}
+
+declare void @bar()
+declare void @baz()
diff --git a/test/CodeGen/AArch64/compiler-ident.ll b/test/CodeGen/AArch64/compiler-ident.ll
new file mode 100644
index 0000000..0350571
--- /dev/null
+++ b/test/CodeGen/AArch64/compiler-ident.ll
@@ -0,0 +1,12 @@
+; RUN: llc -o - %s -mtriple=aarch64-linux-gnu | FileCheck %s
+
+; ModuleID = 'compiler-ident.c'
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64--linux-gnu"
+
+; CHECK: .ident "some LLVM version"
+
+!llvm.ident = !{!0}
+
+!0 = metadata !{metadata !"some LLVM version"}
+
diff --git a/test/CodeGen/AArch64/complex-fp-to-int.ll b/test/CodeGen/AArch64/complex-fp-to-int.ll
new file mode 100644
index 0000000..13cf762
--- /dev/null
+++ b/test/CodeGen/AArch64/complex-fp-to-int.ll
@@ -0,0 +1,141 @@
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
+
+define <2 x i64> @test_v2f32_to_signed_v2i64(<2 x float> %in) {
+; CHECK-LABEL: test_v2f32_to_signed_v2i64:
+; CHECK: fcvtl [[VAL64:v[0-9]+]].2d, v0.2s
+; CHECK: fcvtzs.2d v0, [[VAL64]]
+
+ %val = fptosi <2 x float> %in to <2 x i64>
+ ret <2 x i64> %val
+}
+
+define <2 x i64> @test_v2f32_to_unsigned_v2i64(<2 x float> %in) {
+; CHECK-LABEL: test_v2f32_to_unsigned_v2i64:
+; CHECK: fcvtl [[VAL64:v[0-9]+]].2d, v0.2s
+; CHECK: fcvtzu.2d v0, [[VAL64]]
+
+ %val = fptoui <2 x float> %in to <2 x i64>
+ ret <2 x i64> %val
+}
+
+define <2 x i16> @test_v2f32_to_signed_v2i16(<2 x float> %in) {
+; CHECK-LABEL: test_v2f32_to_signed_v2i16:
+; CHECK: fcvtzs.2s v0, v0
+
+ %val = fptosi <2 x float> %in to <2 x i16>
+ ret <2 x i16> %val
+}
+
+define <2 x i16> @test_v2f32_to_unsigned_v2i16(<2 x float> %in) {
+; CHECK-LABEL: test_v2f32_to_unsigned_v2i16:
+; CHECK: fcvtzs.2s v0, v0
+
+ %val = fptoui <2 x float> %in to <2 x i16>
+ ret <2 x i16> %val
+}
+
+define <2 x i8> @test_v2f32_to_signed_v2i8(<2 x float> %in) {
+; CHECK-LABEL: test_v2f32_to_signed_v2i8:
+; CHECK: fcvtzs.2s v0, v0
+
+ %val = fptosi <2 x float> %in to <2 x i8>
+ ret <2 x i8> %val
+}
+
+define <2 x i8> @test_v2f32_to_unsigned_v2i8(<2 x float> %in) {
+; CHECK-LABEL: test_v2f32_to_unsigned_v2i8:
+; CHECK: fcvtzs.2s v0, v0
+
+ %val = fptoui <2 x float> %in to <2 x i8>
+ ret <2 x i8> %val
+}
+
+define <4 x i16> @test_v4f32_to_signed_v4i16(<4 x float> %in) {
+; CHECK-LABEL: test_v4f32_to_signed_v4i16:
+; CHECK: fcvtzs.4s [[VAL64:v[0-9]+]], v0
+; CHECK: xtn.4h v0, [[VAL64]]
+
+ %val = fptosi <4 x float> %in to <4 x i16>
+ ret <4 x i16> %val
+}
+
+define <4 x i16> @test_v4f32_to_unsigned_v4i16(<4 x float> %in) {
+; CHECK-LABEL: test_v4f32_to_unsigned_v4i16:
+; CHECK: fcvtzu.4s [[VAL64:v[0-9]+]], v0
+; CHECK: xtn.4h v0, [[VAL64]]
+
+ %val = fptoui <4 x float> %in to <4 x i16>
+ ret <4 x i16> %val
+}
+
+define <4 x i8> @test_v4f32_to_signed_v4i8(<4 x float> %in) {
+; CHECK-LABEL: test_v4f32_to_signed_v4i8:
+; CHECK: fcvtzs.4s [[VAL64:v[0-9]+]], v0
+; CHECK: xtn.4h v0, [[VAL64]]
+
+ %val = fptosi <4 x float> %in to <4 x i8>
+ ret <4 x i8> %val
+}
+
+define <4 x i8> @test_v4f32_to_unsigned_v4i8(<4 x float> %in) {
+; CHECK-LABEL: test_v4f32_to_unsigned_v4i8:
+; CHECK: fcvtzs.4s [[VAL64:v[0-9]+]], v0
+; CHECK: xtn.4h v0, [[VAL64]]
+
+ %val = fptoui <4 x float> %in to <4 x i8>
+ ret <4 x i8> %val
+}
+
+define <2 x i32> @test_v2f64_to_signed_v2i32(<2 x double> %in) {
+; CHECK-LABEL: test_v2f64_to_signed_v2i32:
+; CHECK: fcvtzs.2d [[VAL64:v[0-9]+]], v0
+; CHECK: xtn.2s v0, [[VAL64]]
+
+ %val = fptosi <2 x double> %in to <2 x i32>
+ ret <2 x i32> %val
+}
+
+define <2 x i32> @test_v2f64_to_unsigned_v2i32(<2 x double> %in) {
+; CHECK-LABEL: test_v2f64_to_unsigned_v2i32:
+; CHECK: fcvtzu.2d [[VAL64:v[0-9]+]], v0
+; CHECK: xtn.2s v0, [[VAL64]]
+
+ %val = fptoui <2 x double> %in to <2 x i32>
+ ret <2 x i32> %val
+}
+
+define <2 x i16> @test_v2f64_to_signed_v2i16(<2 x double> %in) {
+; CHECK-LABEL: test_v2f64_to_signed_v2i16:
+; CHECK: fcvtzs.2d [[VAL64:v[0-9]+]], v0
+; CHECK: xtn.2s v0, [[VAL64]]
+
+ %val = fptosi <2 x double> %in to <2 x i16>
+ ret <2 x i16> %val
+}
+
+define <2 x i16> @test_v2f64_to_unsigned_v2i16(<2 x double> %in) {
+; CHECK-LABEL: test_v2f64_to_unsigned_v2i16:
+; CHECK: fcvtzs.2d [[VAL64:v[0-9]+]], v0
+; CHECK: xtn.2s v0, [[VAL64]]
+
+ %val = fptoui <2 x double> %in to <2 x i16>
+ ret <2 x i16> %val
+}
+
+define <2 x i8> @test_v2f64_to_signed_v2i8(<2 x double> %in) {
+; CHECK-LABEL: test_v2f64_to_signed_v2i8:
+; CHECK: fcvtzs.2d [[VAL64:v[0-9]+]], v0
+; CHECK: xtn.2s v0, [[VAL64]]
+
+ %val = fptosi <2 x double> %in to <2 x i8>
+ ret <2 x i8> %val
+}
+
+define <2 x i8> @test_v2f64_to_unsigned_v2i8(<2 x double> %in) {
+; CHECK-LABEL: test_v2f64_to_unsigned_v2i8:
+; CHECK: fcvtzs.2d [[VAL64:v[0-9]+]], v0
+; CHECK: xtn.2s v0, [[VAL64]]
+
+ %val = fptoui <2 x double> %in to <2 x i8>
+ ret <2 x i8> %val
+}
diff --git a/test/CodeGen/AArch64/complex-int-to-fp.ll b/test/CodeGen/AArch64/complex-int-to-fp.ll
new file mode 100644
index 0000000..5c943f9
--- /dev/null
+++ b/test/CodeGen/AArch64/complex-int-to-fp.ll
@@ -0,0 +1,164 @@
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
+
+; CHECK: autogen_SD19655
+; CHECK: scvtf
+; CHECK: ret
+define void @autogen_SD19655(<2 x i64>* %addr, <2 x float>* %addrfloat) {
+ %T = load <2 x i64>* %addr
+ %F = sitofp <2 x i64> %T to <2 x float>
+ store <2 x float> %F, <2 x float>* %addrfloat
+ ret void
+}
+
+define <2 x double> @test_signed_v2i32_to_v2f64(<2 x i32> %v) nounwind readnone {
+; CHECK-LABEL: test_signed_v2i32_to_v2f64:
+; CHECK: sshll.2d [[VAL64:v[0-9]+]], v0, #0
+; CHECK-NEXT: scvtf.2d v0, [[VAL64]]
+; CHECK-NEXT: ret
+ %conv = sitofp <2 x i32> %v to <2 x double>
+ ret <2 x double> %conv
+}
+
+define <2 x double> @test_unsigned_v2i32_to_v2f64(<2 x i32> %v) nounwind readnone {
+; CHECK-LABEL: test_unsigned_v2i32_to_v2f64
+; CHECK: ushll.2d [[VAL64:v[0-9]+]], v0, #0
+; CHECK-NEXT: ucvtf.2d v0, [[VAL64]]
+; CHECK-NEXT: ret
+ %conv = uitofp <2 x i32> %v to <2 x double>
+ ret <2 x double> %conv
+}
+
+define <2 x double> @test_signed_v2i16_to_v2f64(<2 x i16> %v) nounwind readnone {
+; CHECK-LABEL: test_signed_v2i16_to_v2f64:
+; CHECK: shl.2s [[TMP:v[0-9]+]], v0, #16
+; CHECK: sshr.2s [[VAL32:v[0-9]+]], [[TMP]], #16
+; CHECK: sshll.2d [[VAL64:v[0-9]+]], [[VAL32]], #0
+; CHECK: scvtf.2d v0, [[VAL64]]
+
+ %conv = sitofp <2 x i16> %v to <2 x double>
+ ret <2 x double> %conv
+}
+define <2 x double> @test_unsigned_v2i16_to_v2f64(<2 x i16> %v) nounwind readnone {
+; CHECK-LABEL: test_unsigned_v2i16_to_v2f64
+; CHECK: movi d[[MASK:[0-9]+]], #0x00ffff0000ffff
+; CHECK: and.8b [[VAL32:v[0-9]+]], v0, v[[MASK]]
+; CHECK: ushll.2d [[VAL64:v[0-9]+]], [[VAL32]], #0
+; CHECK: ucvtf.2d v0, [[VAL64]]
+
+ %conv = uitofp <2 x i16> %v to <2 x double>
+ ret <2 x double> %conv
+}
+
+define <2 x double> @test_signed_v2i8_to_v2f64(<2 x i8> %v) nounwind readnone {
+; CHECK-LABEL: test_signed_v2i8_to_v2f64:
+; CHECK: shl.2s [[TMP:v[0-9]+]], v0, #24
+; CHECK: sshr.2s [[VAL32:v[0-9]+]], [[TMP]], #24
+; CHECK: sshll.2d [[VAL64:v[0-9]+]], [[VAL32]], #0
+; CHECK: scvtf.2d v0, [[VAL64]]
+
+ %conv = sitofp <2 x i8> %v to <2 x double>
+ ret <2 x double> %conv
+}
+define <2 x double> @test_unsigned_v2i8_to_v2f64(<2 x i8> %v) nounwind readnone {
+; CHECK-LABEL: test_unsigned_v2i8_to_v2f64:
+; CHECK: movi d[[MASK:[0-9]+]], #0x0000ff000000ff
+; CHECK: and.8b [[VAL32:v[0-9]+]], v0, v[[MASK]]
+; CHECK: ushll.2d [[VAL64:v[0-9]+]], [[VAL32]], #0
+; CHECK: ucvtf.2d v0, [[VAL64]]
+
+ %conv = uitofp <2 x i8> %v to <2 x double>
+ ret <2 x double> %conv
+}
+
+define <2 x float> @test_signed_v2i64_to_v2f32(<2 x i64> %v) nounwind readnone {
+; CHECK-LABEL: test_signed_v2i64_to_v2f32:
+; CHECK: scvtf.2d [[VAL64:v[0-9]+]], v0
+; CHECK: fcvtn v0.2s, [[VAL64]].2d
+
+ %conv = sitofp <2 x i64> %v to <2 x float>
+ ret <2 x float> %conv
+}
+define <2 x float> @test_unsigned_v2i64_to_v2f32(<2 x i64> %v) nounwind readnone {
+; CHECK-LABEL: test_unsigned_v2i64_to_v2f32:
+; CHECK: ucvtf.2d [[VAL64:v[0-9]+]], v0
+; CHECK: fcvtn v0.2s, [[VAL64]].2d
+
+ %conv = uitofp <2 x i64> %v to <2 x float>
+ ret <2 x float> %conv
+}
+
+define <2 x float> @test_signed_v2i16_to_v2f32(<2 x i16> %v) nounwind readnone {
+; CHECK-LABEL: test_signed_v2i16_to_v2f32:
+; CHECK: shl.2s [[TMP:v[0-9]+]], v0, #16
+; CHECK: sshr.2s [[VAL32:v[0-9]+]], [[TMP]], #16
+; CHECK: scvtf.2s v0, [[VAL32]]
+
+ %conv = sitofp <2 x i16> %v to <2 x float>
+ ret <2 x float> %conv
+}
+define <2 x float> @test_unsigned_v2i16_to_v2f32(<2 x i16> %v) nounwind readnone {
+; CHECK-LABEL: test_unsigned_v2i16_to_v2f32:
+; CHECK: movi d[[MASK:[0-9]+]], #0x00ffff0000ffff
+; CHECK: and.8b [[VAL32:v[0-9]+]], v0, v[[MASK]]
+; CHECK: ucvtf.2s v0, [[VAL32]]
+
+ %conv = uitofp <2 x i16> %v to <2 x float>
+ ret <2 x float> %conv
+}
+
+define <2 x float> @test_signed_v2i8_to_v2f32(<2 x i8> %v) nounwind readnone {
+; CHECK-LABEL: test_signed_v2i8_to_v2f32:
+; CHECK: shl.2s [[TMP:v[0-9]+]], v0, #24
+; CHECK: sshr.2s [[VAL32:v[0-9]+]], [[TMP]], #24
+; CHECK: scvtf.2s v0, [[VAL32]]
+
+ %conv = sitofp <2 x i8> %v to <2 x float>
+ ret <2 x float> %conv
+}
+define <2 x float> @test_unsigned_v2i8_to_v2f32(<2 x i8> %v) nounwind readnone {
+; CHECK-LABEL: test_unsigned_v2i8_to_v2f32:
+; CHECK: movi d[[MASK:[0-9]+]], #0x0000ff000000ff
+; CHECK: and.8b [[VAL32:v[0-9]+]], v0, v[[MASK]]
+; CHECK: ucvtf.2s v0, [[VAL32]]
+
+ %conv = uitofp <2 x i8> %v to <2 x float>
+ ret <2 x float> %conv
+}
+
+define <4 x float> @test_signed_v4i16_to_v4f32(<4 x i16> %v) nounwind readnone {
+; CHECK-LABEL: test_signed_v4i16_to_v4f32:
+; CHECK: sshll.4s [[VAL32:v[0-9]+]], v0, #0
+; CHECK: scvtf.4s v0, [[VAL32]]
+
+ %conv = sitofp <4 x i16> %v to <4 x float>
+ ret <4 x float> %conv
+}
+
+define <4 x float> @test_unsigned_v4i16_to_v4f32(<4 x i16> %v) nounwind readnone {
+; CHECK-LABEL: test_unsigned_v4i16_to_v4f32:
+; CHECK: ushll.4s [[VAL32:v[0-9]+]], v0, #0
+; CHECK: ucvtf.4s v0, [[VAL32]]
+
+ %conv = uitofp <4 x i16> %v to <4 x float>
+ ret <4 x float> %conv
+}
+
+define <4 x float> @test_signed_v4i8_to_v4f32(<4 x i8> %v) nounwind readnone {
+; CHECK-LABEL: test_signed_v4i8_to_v4f32:
+; CHECK: shl.4h [[TMP:v[0-9]+]], v0, #8
+; CHECK: sshr.4h [[VAL16:v[0-9]+]], [[TMP]], #8
+; CHECK: sshll.4s [[VAL32:v[0-9]+]], [[VAL16]], #0
+; CHECK: scvtf.4s v0, [[VAL32]]
+
+ %conv = sitofp <4 x i8> %v to <4 x float>
+ ret <4 x float> %conv
+}
+define <4 x float> @test_unsigned_v4i8_to_v4f32(<4 x i8> %v) nounwind readnone {
+; CHECK-LABEL: test_unsigned_v4i8_to_v4f32:
+; CHECK: bic.4h v0, #0xff, lsl #8
+; CHECK: ushll.4s [[VAL32:v[0-9]+]], v0, #0
+; CHECK: ucvtf.4s v0, [[VAL32]]
+
+ %conv = uitofp <4 x i8> %v to <4 x float>
+ ret <4 x float> %conv
+}
diff --git a/test/CodeGen/AArch64/directcond.ll b/test/CodeGen/AArch64/directcond.ll
index 1b51928..fbea4a6 100644
--- a/test/CodeGen/AArch64/directcond.ll
+++ b/test/CodeGen/AArch64/directcond.ll
@@ -1,5 +1,5 @@
-; RUN: llc -verify-machineinstrs -o - %s -mtriple=arm64-apple-ios7.0 | FileCheck %s --check-prefix=CHECK
-; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -mattr=-fp-armv8 | FileCheck --check-prefix=CHECK-NOFP %s
+; RUN: llc -verify-machineinstrs -o - %s -mtriple=arm64-apple-ios7.0 -aarch64-atomic-cfg-tidy=0 | FileCheck %s --check-prefix=CHECK
+; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -mattr=-fp-armv8 -aarch64-atomic-cfg-tidy=0 | FileCheck --check-prefix=CHECK-NOFP %s
define i32 @test_select_i32(i1 %bit, i32 %a, i32 %b) {
; CHECK-LABEL: test_select_i32:
diff --git a/test/CodeGen/AArch64/f16-convert.ll b/test/CodeGen/AArch64/f16-convert.ll
new file mode 100644
index 0000000..6fabdc5
--- /dev/null
+++ b/test/CodeGen/AArch64/f16-convert.ll
@@ -0,0 +1,254 @@
+; RUN: llc < %s -mtriple=arm64-apple-ios -asm-verbose=false | FileCheck %s
+
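+; These tests check that the half-to-float/double loads (and the stores of
+; converted halves) fold each AArch64 addressing mode: plain base register,
+; register offset with sxtw or lsl #1 scaling, scaled immediate offsets
+; (ldr/str), and unscaled negative offsets (ldur/stur).
+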
+define float @load0(i16* nocapture readonly %a) nounwind {
+; CHECK-LABEL: load0:
+; CHECK-NEXT: ldr [[HREG:h[0-9]+]], [x0]
+; CHECK-NEXT: fcvt s0, [[HREG]]
+; CHECK-NEXT: ret
+
+ %tmp = load i16* %a, align 2
+ %tmp1 = tail call float @llvm.convert.from.fp16(i16 %tmp)
+ ret float %tmp1
+}
+
+define double @load1(i16* nocapture readonly %a) nounwind {
+; CHECK-LABEL: load1:
+; CHECK-NEXT: ldr [[HREG:h[0-9]+]], [x0]
+; CHECK-NEXT: fcvt d0, [[HREG]]
+; CHECK-NEXT: ret
+
+ %tmp = load i16* %a, align 2
+ %tmp1 = tail call float @llvm.convert.from.fp16(i16 %tmp)
+ %conv = fpext float %tmp1 to double
+ ret double %conv
+}
+
+define float @load2(i16* nocapture readonly %a, i32 %i) nounwind {
+; CHECK-LABEL: load2:
+; CHECK-NEXT: ldr [[HREG:h[0-9]+]], [x0, w1, sxtw #1]
+; CHECK-NEXT: fcvt s0, [[HREG]]
+; CHECK-NEXT: ret
+
+ %idxprom = sext i32 %i to i64
+ %arrayidx = getelementptr inbounds i16* %a, i64 %idxprom
+ %tmp = load i16* %arrayidx, align 2
+ %tmp1 = tail call float @llvm.convert.from.fp16(i16 %tmp)
+ ret float %tmp1
+}
+
+define double @load3(i16* nocapture readonly %a, i32 %i) nounwind {
+; CHECK-LABEL: load3:
+; CHECK-NEXT: ldr [[HREG:h[0-9]+]], [x0, w1, sxtw #1]
+; CHECK-NEXT: fcvt d0, [[HREG]]
+; CHECK-NEXT: ret
+
+ %idxprom = sext i32 %i to i64
+ %arrayidx = getelementptr inbounds i16* %a, i64 %idxprom
+ %tmp = load i16* %arrayidx, align 2
+ %tmp1 = tail call float @llvm.convert.from.fp16(i16 %tmp)
+ %conv = fpext float %tmp1 to double
+ ret double %conv
+}
+
+define float @load4(i16* nocapture readonly %a, i64 %i) nounwind {
+; CHECK-LABEL: load4:
+; CHECK-NEXT: ldr [[HREG:h[0-9]+]], [x0, x1, lsl #1]
+; CHECK-NEXT: fcvt s0, [[HREG]]
+; CHECK-NEXT: ret
+
+ %arrayidx = getelementptr inbounds i16* %a, i64 %i
+ %tmp = load i16* %arrayidx, align 2
+ %tmp1 = tail call float @llvm.convert.from.fp16(i16 %tmp)
+ ret float %tmp1
+}
+
+define double @load5(i16* nocapture readonly %a, i64 %i) nounwind {
+; CHECK-LABEL: load5:
+; CHECK-NEXT: ldr [[HREG:h[0-9]+]], [x0, x1, lsl #1]
+; CHECK-NEXT: fcvt d0, [[HREG]]
+; CHECK-NEXT: ret
+
+ %arrayidx = getelementptr inbounds i16* %a, i64 %i
+ %tmp = load i16* %arrayidx, align 2
+ %tmp1 = tail call float @llvm.convert.from.fp16(i16 %tmp)
+ %conv = fpext float %tmp1 to double
+ ret double %conv
+}
+
+define float @load6(i16* nocapture readonly %a) nounwind {
+; CHECK-LABEL: load6:
+; CHECK-NEXT: ldr [[HREG:h[0-9]+]], [x0, #20]
+; CHECK-NEXT: fcvt s0, [[HREG]]
+; CHECK-NEXT: ret
+
+ %arrayidx = getelementptr inbounds i16* %a, i64 10
+ %tmp = load i16* %arrayidx, align 2
+ %tmp1 = tail call float @llvm.convert.from.fp16(i16 %tmp)
+ ret float %tmp1
+}
+
+define double @load7(i16* nocapture readonly %a) nounwind {
+; CHECK-LABEL: load7:
+; CHECK-NEXT: ldr [[HREG:h[0-9]+]], [x0, #20]
+; CHECK-NEXT: fcvt d0, [[HREG]]
+; CHECK-NEXT: ret
+
+ %arrayidx = getelementptr inbounds i16* %a, i64 10
+ %tmp = load i16* %arrayidx, align 2
+ %tmp1 = tail call float @llvm.convert.from.fp16(i16 %tmp)
+ %conv = fpext float %tmp1 to double
+ ret double %conv
+}
+
+define float @load8(i16* nocapture readonly %a) nounwind {
+; CHECK-LABEL: load8:
+; CHECK-NEXT: ldur [[HREG:h[0-9]+]], [x0, #-20]
+; CHECK-NEXT: fcvt s0, [[HREG]]
+; CHECK-NEXT: ret
+
+ %arrayidx = getelementptr inbounds i16* %a, i64 -10
+ %tmp = load i16* %arrayidx, align 2
+ %tmp1 = tail call float @llvm.convert.from.fp16(i16 %tmp)
+ ret float %tmp1
+}
+
+define double @load9(i16* nocapture readonly %a) nounwind {
+; CHECK-LABEL: load9:
+; CHECK-NEXT: ldur [[HREG:h[0-9]+]], [x0, #-20]
+; CHECK-NEXT: fcvt d0, [[HREG]]
+; CHECK-NEXT: ret
+
+ %arrayidx = getelementptr inbounds i16* %a, i64 -10
+ %tmp = load i16* %arrayidx, align 2
+ %tmp1 = tail call float @llvm.convert.from.fp16(i16 %tmp)
+ %conv = fpext float %tmp1 to double
+ ret double %conv
+}
+
+define void @store0(i16* nocapture %a, float %val) nounwind {
+; CHECK-LABEL: store0:
+; CHECK-NEXT: fcvt h0, s0
+; CHECK-NEXT: str h0, [x0]
+; CHECK-NEXT: ret
+
+ %tmp = tail call i16 @llvm.convert.to.fp16(float %val)
+ store i16 %tmp, i16* %a, align 2
+ ret void
+}
+
+define void @store1(i16* nocapture %a, double %val) nounwind {
+; CHECK-LABEL: store1:
+; CHECK-NEXT: fcvt h0, d0
+; CHECK-NEXT: str h0, [x0]
+; CHECK-NEXT: ret
+
+ %conv = fptrunc double %val to float
+ %tmp = tail call i16 @llvm.convert.to.fp16(float %conv)
+ store i16 %tmp, i16* %a, align 2
+ ret void
+}
+
+define void @store2(i16* nocapture %a, i32 %i, float %val) nounwind {
+; CHECK-LABEL: store2:
+; CHECK-NEXT: fcvt h0, s0
+; CHECK-NEXT: str h0, [x0, w1, sxtw #1]
+; CHECK-NEXT: ret
+
+ %tmp = tail call i16 @llvm.convert.to.fp16(float %val)
+ %idxprom = sext i32 %i to i64
+ %arrayidx = getelementptr inbounds i16* %a, i64 %idxprom
+ store i16 %tmp, i16* %arrayidx, align 2
+ ret void
+}
+
+define void @store3(i16* nocapture %a, i32 %i, double %val) nounwind {
+; CHECK-LABEL: store3:
+; CHECK-NEXT: fcvt h0, d0
+; CHECK-NEXT: str h0, [x0, w1, sxtw #1]
+; CHECK-NEXT: ret
+
+ %conv = fptrunc double %val to float
+ %tmp = tail call i16 @llvm.convert.to.fp16(float %conv)
+ %idxprom = sext i32 %i to i64
+ %arrayidx = getelementptr inbounds i16* %a, i64 %idxprom
+ store i16 %tmp, i16* %arrayidx, align 2
+ ret void
+}
+
+define void @store4(i16* nocapture %a, i64 %i, float %val) nounwind {
+; CHECK-LABEL: store4:
+; CHECK-NEXT: fcvt h0, s0
+; CHECK-NEXT: str h0, [x0, x1, lsl #1]
+; CHECK-NEXT: ret
+
+ %tmp = tail call i16 @llvm.convert.to.fp16(float %val)
+ %arrayidx = getelementptr inbounds i16* %a, i64 %i
+ store i16 %tmp, i16* %arrayidx, align 2
+ ret void
+}
+
+define void @store5(i16* nocapture %a, i64 %i, double %val) nounwind {
+; CHECK-LABEL: store5:
+; CHECK-NEXT: fcvt h0, d0
+; CHECK-NEXT: str h0, [x0, x1, lsl #1]
+; CHECK-NEXT: ret
+
+ %conv = fptrunc double %val to float
+ %tmp = tail call i16 @llvm.convert.to.fp16(float %conv)
+ %arrayidx = getelementptr inbounds i16* %a, i64 %i
+ store i16 %tmp, i16* %arrayidx, align 2
+ ret void
+}
+
+define void @store6(i16* nocapture %a, float %val) nounwind {
+; CHECK-LABEL: store6:
+; CHECK-NEXT: fcvt h0, s0
+; CHECK-NEXT: str h0, [x0, #20]
+; CHECK-NEXT: ret
+
+ %tmp = tail call i16 @llvm.convert.to.fp16(float %val)
+ %arrayidx = getelementptr inbounds i16* %a, i64 10
+ store i16 %tmp, i16* %arrayidx, align 2
+ ret void
+}
+
+define void @store7(i16* nocapture %a, double %val) nounwind {
+; CHECK-LABEL: store7:
+; CHECK-NEXT: fcvt h0, d0
+; CHECK-NEXT: str h0, [x0, #20]
+; CHECK-NEXT: ret
+
+ %conv = fptrunc double %val to float
+ %tmp = tail call i16 @llvm.convert.to.fp16(float %conv)
+ %arrayidx = getelementptr inbounds i16* %a, i64 10
+ store i16 %tmp, i16* %arrayidx, align 2
+ ret void
+}
+
+define void @store8(i16* nocapture %a, float %val) nounwind {
+; CHECK-LABEL: store8:
+; CHECK-NEXT: fcvt h0, s0
+; CHECK-NEXT: stur h0, [x0, #-20]
+; CHECK-NEXT: ret
+
+ %tmp = tail call i16 @llvm.convert.to.fp16(float %val)
+ %arrayidx = getelementptr inbounds i16* %a, i64 -10
+ store i16 %tmp, i16* %arrayidx, align 2
+ ret void
+}
+
+define void @store9(i16* nocapture %a, double %val) nounwind {
+; CHECK-LABEL: store9:
+; CHECK-NEXT: fcvt h0, d0
+; CHECK-NEXT: stur h0, [x0, #-20]
+; CHECK-NEXT: ret
+
+ %conv = fptrunc double %val to float
+ %tmp = tail call i16 @llvm.convert.to.fp16(float %conv)
+ %arrayidx = getelementptr inbounds i16* %a, i64 -10
+ store i16 %tmp, i16* %arrayidx, align 2
+ ret void
+}
+
+declare i16 @llvm.convert.to.fp16(float) nounwind readnone
+declare float @llvm.convert.from.fp16(i16) nounwind readnone
diff --git a/test/CodeGen/AArch64/fast-isel-mul.ll b/test/CodeGen/AArch64/fast-isel-mul.ll
new file mode 100644
index 0000000..d02c67f
--- /dev/null
+++ b/test/CodeGen/AArch64/fast-isel-mul.ll
@@ -0,0 +1,40 @@
+; RUN: llc -fast-isel -fast-isel-abort -mtriple=aarch64 -o - %s | FileCheck %s
+
+@var8 = global i8 0
+@var16 = global i16 0
+@var32 = global i32 0
+@var64 = global i64 0
+
+define void @test_mul8(i8 %lhs, i8 %rhs) {
+; CHECK-LABEL: test_mul8:
+; CHECK: mul w0, w0, w1
+ %prod = mul i8 %lhs, %rhs
+ store i8 %prod, i8* @var8
+ ret void
+}
+
+define void @test_mul16(i16 %lhs, i16 %rhs) {
+; CHECK-LABEL: test_mul16:
+; CHECK: mul w0, w0, w1
+ %prod = mul i16 %lhs, %rhs
+ store i16 %prod, i16* @var16
+ ret void
+}
+
+define void @test_mul32(i32 %lhs, i32 %rhs) {
+; CHECK-LABEL: test_mul32:
+; CHECK: mul w0, w0, w1
+ %prod = mul i32 %lhs, %rhs
+ store i32 %prod, i32* @var32
+ ret void
+}
+
+define void @test_mul64(i64 %lhs, i64 %rhs) {
+; CHECK-LABEL: test_mul64:
+; CHECK: mul x0, x0, x1
+ %prod = mul i64 %lhs, %rhs
+ store i64 %prod, i64* @var64
+ ret void
+}
diff --git a/test/CodeGen/AArch64/flags-multiuse.ll b/test/CodeGen/AArch64/flags-multiuse.ll
index c9b0b9f..77bbcdd 100644
--- a/test/CodeGen/AArch64/flags-multiuse.ll
+++ b/test/CodeGen/AArch64/flags-multiuse.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs -o - %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-none-linux-gnu -aarch64-atomic-cfg-tidy=0 -verify-machineinstrs -o - %s | FileCheck %s
; LLVM should be able to cope with multiple uses of the same flag-setting
; instruction at different points of a routine. Either by rematerializing the
diff --git a/test/CodeGen/AArch64/funcptr_cast.ll b/test/CodeGen/AArch64/funcptr_cast.ll
new file mode 100644
index 0000000..a00b7bc
--- /dev/null
+++ b/test/CodeGen/AArch64/funcptr_cast.ll
@@ -0,0 +1,13 @@
+; RUN: llc < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
+
+define i8 @test() {
+; CHECK-LABEL: @test
+; CHECK: adrp {{x[0-9]+}}, foo
+; CHECK: add {{x[0-9]+}}, {{x[0-9]+}}, :lo12:foo
+; CHECK: ldrb w0, [{{x[0-9]+}}]
+entry:
+ %0 = load i8* bitcast (void (...)* @foo to i8*), align 1
+ ret i8 %0
+}
+
+declare void @foo(...)
diff --git a/test/CodeGen/AArch64/global-merge-1.ll b/test/CodeGen/AArch64/global-merge-1.ll
new file mode 100644
index 0000000..68aba5e
--- /dev/null
+++ b/test/CodeGen/AArch64/global-merge-1.ll
@@ -0,0 +1,26 @@
+; RUN: llc %s -mtriple=aarch64-none-linux-gnu -enable-global-merge -o - | FileCheck %s
+; RUN: llc %s -mtriple=aarch64-none-linux-gnu -enable-global-merge -global-merge-on-external -o - | FileCheck %s
+
+; RUN: llc %s -mtriple=aarch64-linux-gnuabi -enable-global-merge -o - | FileCheck %s
+; RUN: llc %s -mtriple=aarch64-linux-gnuabi -enable-global-merge -global-merge-on-external -o - | FileCheck %s
+
+; RUN: llc %s -mtriple=aarch64-apple-ios -enable-global-merge -o - | FileCheck %s --check-prefix=CHECK-APPLE-IOS
+; RUN: llc %s -mtriple=aarch64-apple-ios -enable-global-merge -global-merge-on-external -o - | FileCheck %s --check-prefix=CHECK-APPLE-IOS
+
+@m = internal global i32 0, align 4
+@n = internal global i32 0, align 4
+
+define void @f1(i32 %a1, i32 %a2) {
+;CHECK-APPLE-IOS: adrp x8, __MergedGlobals@PAGE
+;CHECK-APPLE-IOS-NOT: adrp
+;CHECK-APPLE-IOS: add x8, x8, __MergedGlobals@PAGEOFF
+ store i32 %a1, i32* @m, align 4
+ store i32 %a2, i32* @n, align 4
+ ret void
+}
+
+;CHECK: .type _MergedGlobals,@object // @_MergedGlobals
+;CHECK: .local _MergedGlobals
+;CHECK: .comm _MergedGlobals,8,8
+
+;CHECK-APPLE-IOS: .zerofill __DATA,__bss,__MergedGlobals,8,3 ; @_MergedGlobals
diff --git a/test/CodeGen/AArch64/global-merge-2.ll b/test/CodeGen/AArch64/global-merge-2.ll
new file mode 100644
index 0000000..a773566
--- /dev/null
+++ b/test/CodeGen/AArch64/global-merge-2.ll
@@ -0,0 +1,51 @@
+; RUN: llc %s -mtriple=aarch64-none-linux-gnu -enable-global-merge -global-merge-on-external -o - | FileCheck %s
+; RUN: llc %s -mtriple=aarch64-linux-gnuabi -enable-global-merge -global-merge-on-external -o - | FileCheck %s
+; RUN: llc %s -mtriple=aarch64-apple-ios -enable-global-merge -global-merge-on-external -o - | FileCheck %s --check-prefix=CHECK-APPLE-IOS
+
+@x = global i32 0, align 4
+@y = global i32 0, align 4
+@z = global i32 0, align 4
+
+define void @f1(i32 %a1, i32 %a2) {
+;CHECK-APPLE-IOS-LABEL: _f1:
+;CHECK-APPLE-IOS: adrp x8, __MergedGlobals_x@PAGE
+;CHECK-APPLE-IOS: add x8, x8, __MergedGlobals_x@PAGEOFF
+;CHECK-APPLE-IOS-NOT: adrp
+ store i32 %a1, i32* @x, align 4
+ store i32 %a2, i32* @y, align 4
+ ret void
+}
+
+define void @g1(i32 %a1, i32 %a2) {
+;CHECK-APPLE-IOS-LABEL: _g1:
+;CHECK-APPLE-IOS: adrp x8, __MergedGlobals_x@PAGE
+;CHECK-APPLE-IOS: add x8, x8, __MergedGlobals_x@PAGEOFF
+;CHECK-APPLE-IOS-NOT: adrp
+ store i32 %a1, i32* @y, align 4
+ store i32 %a2, i32* @z, align 4
+ ret void
+}
+
+;CHECK: .type _MergedGlobals_x,@object // @_MergedGlobals_x
+;CHECK: .globl _MergedGlobals_x
+;CHECK: .align 3
+;CHECK: _MergedGlobals_x:
+;CHECK: .size _MergedGlobals_x, 12
+
+;CHECK: .globl x
+;CHECK: x = _MergedGlobals_x
+;CHECK: .globl y
+;CHECK: y = _MergedGlobals_x+4
+;CHECK: .globl z
+;CHECK: z = _MergedGlobals_x+8
+
+;CHECK-APPLE-IOS: .globl __MergedGlobals_x ; @_MergedGlobals_x
+;CHECK-APPLE-IOS: .zerofill __DATA,__common,__MergedGlobals_x,12,3
+
+;CHECK-APPLE-IOS: .globl _x
+;CHECK-APPLE-IOS: _x = __MergedGlobals_x
+;CHECK-APPLE-IOS: .globl _y
+;CHECK-APPLE-IOS: _y = __MergedGlobals_x+4
+;CHECK-APPLE-IOS: .globl _z
+;CHECK-APPLE-IOS: _z = __MergedGlobals_x+8
+;CHECK-APPLE-IOS: .subsections_via_symbols
diff --git a/test/CodeGen/AArch64/global-merge-3.ll b/test/CodeGen/AArch64/global-merge-3.ll
new file mode 100644
index 0000000..d455d40
--- /dev/null
+++ b/test/CodeGen/AArch64/global-merge-3.ll
@@ -0,0 +1,51 @@
+; RUN: llc %s -mtriple=aarch64-none-linux-gnu -enable-global-merge -global-merge-on-external -o - | FileCheck %s
+; RUN: llc %s -mtriple=aarch64-linux-gnuabi -enable-global-merge -global-merge-on-external -o - | FileCheck %s
+; RUN: llc %s -mtriple=aarch64-apple-ios -enable-global-merge -global-merge-on-external -o - | FileCheck %s --check-prefix=CHECK-APPLE-IOS
+
+@x = global [1000 x i32] zeroinitializer, align 1
+@y = global [1000 x i32] zeroinitializer, align 1
+@z = internal global i32 1, align 4
+
+define void @f1(i32 %a1, i32 %a2, i32 %a3) {
+;CHECK-APPLE-IOS: adrp x8, __MergedGlobals_x@PAGE
+;CHECK-APPLE-IOS-NOT: adrp
+;CHECK-APPLE-IOS: add x8, x8, __MergedGlobals_x@PAGEOFF
+;CHECK-APPLE-IOS: adrp x9, __MergedGlobals_y@PAGE
+;CHECK-APPLE-IOS: add x9, x9, __MergedGlobals_y@PAGEOFF
+ %x3 = getelementptr inbounds [1000 x i32]* @x, i32 0, i64 3
+ %y3 = getelementptr inbounds [1000 x i32]* @y, i32 0, i64 3
+ store i32 %a1, i32* %x3, align 4
+ store i32 %a2, i32* %y3, align 4
+ store i32 %a3, i32* @z, align 4
+ ret void
+}
+
+;CHECK: .type _MergedGlobals_x,@object // @_MergedGlobals_x
+;CHECK: .globl _MergedGlobals_x
+;CHECK: .align 4
+;CHECK: _MergedGlobals_x:
+;CHECK: .size _MergedGlobals_x, 4004
+
+;CHECK: .type _MergedGlobals_y,@object // @_MergedGlobals_y
+;CHECK: .globl _MergedGlobals_y
+;CHECK: _MergedGlobals_y:
+;CHECK: .size _MergedGlobals_y, 4000
+
+;CHECK-APPLE-IOS: .globl __MergedGlobals_x ; @_MergedGlobals_x
+;CHECK-APPLE-IOS: .align 4
+;CHECK-APPLE-IOS: __MergedGlobals_x:
+;CHECK-APPLE-IOS: .long 1
+;CHECK-APPLE-IOS: .space 4000
+
+;CHECK-APPLE-IOS: .globl __MergedGlobals_y ; @_MergedGlobals_y
+;CHECK-APPLE-IOS: .zerofill __DATA,__common,__MergedGlobals_y,4000,4
+
+;CHECK: .globl x
+;CHECK: x = _MergedGlobals_x+4
+;CHECK: .globl y
+;CHECK: y = _MergedGlobals_y
+
+;CHECK-APPLE-IOS:.globl _x
+;CHECK-APPLE-IOS: _x = __MergedGlobals_x+4
+;CHECK-APPLE-IOS:.globl _y
+;CHECK-APPLE-IOS: _y = __MergedGlobals_y
diff --git a/test/CodeGen/AArch64/global-merge-4.ll b/test/CodeGen/AArch64/global-merge-4.ll
new file mode 100644
index 0000000..a525ccd
--- /dev/null
+++ b/test/CodeGen/AArch64/global-merge-4.ll
@@ -0,0 +1,73 @@
+; RUN: llc %s -mtriple=aarch64-linux-gnuabi -enable-global-merge -o - | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n32:64-S128"
+target triple = "arm64-apple-ios7.0.0"
+
+@bar = internal global [5 x i32] zeroinitializer, align 4
+@baz = internal global [5 x i32] zeroinitializer, align 4
+@foo = internal global [5 x i32] zeroinitializer, align 4
+
+; Function Attrs: nounwind ssp
+define internal void @initialize() #0 {
+ %1 = tail call i32 bitcast (i32 (...)* @calc to i32 ()*)() #2
+ store i32 %1, i32* getelementptr inbounds ([5 x i32]* @bar, i64 0, i64 0), align 4
+ %2 = tail call i32 bitcast (i32 (...)* @calc to i32 ()*)() #2
+ store i32 %2, i32* getelementptr inbounds ([5 x i32]* @baz, i64 0, i64 0), align 4
+ %3 = tail call i32 bitcast (i32 (...)* @calc to i32 ()*)() #2
+ store i32 %3, i32* getelementptr inbounds ([5 x i32]* @bar, i64 0, i64 1), align 4
+ %4 = tail call i32 bitcast (i32 (...)* @calc to i32 ()*)() #2
+ store i32 %4, i32* getelementptr inbounds ([5 x i32]* @baz, i64 0, i64 1), align 4
+ %5 = tail call i32 bitcast (i32 (...)* @calc to i32 ()*)() #2
+ store i32 %5, i32* getelementptr inbounds ([5 x i32]* @bar, i64 0, i64 2), align 4
+ %6 = tail call i32 bitcast (i32 (...)* @calc to i32 ()*)() #2
+ store i32 %6, i32* getelementptr inbounds ([5 x i32]* @baz, i64 0, i64 2), align 4
+ %7 = tail call i32 bitcast (i32 (...)* @calc to i32 ()*)() #2
+ store i32 %7, i32* getelementptr inbounds ([5 x i32]* @bar, i64 0, i64 3), align 4
+ %8 = tail call i32 bitcast (i32 (...)* @calc to i32 ()*)() #2
+ store i32 %8, i32* getelementptr inbounds ([5 x i32]* @baz, i64 0, i64 3), align 4
+ %9 = tail call i32 bitcast (i32 (...)* @calc to i32 ()*)() #2
+ store i32 %9, i32* getelementptr inbounds ([5 x i32]* @bar, i64 0, i64 4), align 4
+ %10 = tail call i32 bitcast (i32 (...)* @calc to i32 ()*)() #2
+ store i32 %10, i32* getelementptr inbounds ([5 x i32]* @baz, i64 0, i64 4), align 4
+ ret void
+}
+
+declare i32 @calc(...)
+
+; Function Attrs: nounwind ssp
+define internal void @calculate() #0 {
+ %1 = load i32* getelementptr inbounds ([5 x i32]* @bar, i64 0, i64 0), align 4
+ %2 = load i32* getelementptr inbounds ([5 x i32]* @baz, i64 0, i64 0), align 4
+ %3 = mul nsw i32 %2, %1
+ store i32 %3, i32* getelementptr inbounds ([5 x i32]* @foo, i64 0, i64 0), align 4
+ %4 = load i32* getelementptr inbounds ([5 x i32]* @bar, i64 0, i64 1), align 4
+ %5 = load i32* getelementptr inbounds ([5 x i32]* @baz, i64 0, i64 1), align 4
+ %6 = mul nsw i32 %5, %4
+ store i32 %6, i32* getelementptr inbounds ([5 x i32]* @foo, i64 0, i64 1), align 4
+ %7 = load i32* getelementptr inbounds ([5 x i32]* @bar, i64 0, i64 2), align 4
+ %8 = load i32* getelementptr inbounds ([5 x i32]* @baz, i64 0, i64 2), align 4
+ %9 = mul nsw i32 %8, %7
+ store i32 %9, i32* getelementptr inbounds ([5 x i32]* @foo, i64 0, i64 2), align 4
+ %10 = load i32* getelementptr inbounds ([5 x i32]* @bar, i64 0, i64 3), align 4
+ %11 = load i32* getelementptr inbounds ([5 x i32]* @baz, i64 0, i64 3), align 4
+ %12 = mul nsw i32 %11, %10
+ store i32 %12, i32* getelementptr inbounds ([5 x i32]* @foo, i64 0, i64 3), align 4
+ %13 = load i32* getelementptr inbounds ([5 x i32]* @bar, i64 0, i64 4), align 4
+ %14 = load i32* getelementptr inbounds ([5 x i32]* @baz, i64 0, i64 4), align 4
+ %15 = mul nsw i32 %14, %13
+ store i32 %15, i32* getelementptr inbounds ([5 x i32]* @foo, i64 0, i64 4), align 4
+ ret void
+}
+
+; Function Attrs: nounwind readnone ssp
+define internal i32* @returnFoo() #1 {
+ ret i32* getelementptr inbounds ([5 x i32]* @foo, i64 0, i64 0)
+}
+
+;CHECK: .type _MergedGlobals,@object // @_MergedGlobals
+;CHECK: .local _MergedGlobals
+;CHECK: .comm _MergedGlobals,60,16
+
+attributes #0 = { nounwind ssp }
+attributes #1 = { nounwind readnone ssp }
+attributes #2 = { nounwind }
diff --git a/test/CodeGen/AArch64/global-merge.ll b/test/CodeGen/AArch64/global-merge.ll
new file mode 100644
index 0000000..aed1dc4
--- /dev/null
+++ b/test/CodeGen/AArch64/global-merge.ll
@@ -0,0 +1,30 @@
+; RUN: llc < %s -mtriple=aarch64-none-linux-gnu -O0 | FileCheck --check-prefix=NO-MERGE %s
+; RUN: llc < %s -mtriple=aarch64-none-linux-gnu -O0 -global-merge-on-external=true | FileCheck --check-prefix=NO-MERGE %s
+
+; RUN: llc < %s -mtriple=aarch64-apple-ios -O0 | FileCheck %s --check-prefix=CHECK-APPLE-IOS-NO-MERGE
+; RUN: llc < %s -mtriple=aarch64-apple-ios -O0 -global-merge-on-external=true | FileCheck %s --check-prefix=CHECK-APPLE-IOS-NO-MERGE
+
+; FIXME: add O1/O2 test for aarch64-none-linux-gnu and aarch64-apple-ios
+
+@m = internal global i32 0, align 4
+@n = internal global i32 0, align 4
+
+define void @f1(i32 %a1, i32 %a2) {
+; CHECK-LABEL: f1:
+; CHECK: adrp x{{[0-9]+}}, _MergedGlobals
+; CHECK-NOT: adrp
+
+; CHECK-APPLE-IOS-LABEL: f1:
+; CHECK-APPLE-IOS: adrp x{{[0-9]+}}, __MergedGlobals
+; CHECK-APPLE-IOS-NOT: adrp
+ store i32 %a1, i32* @m, align 4
+ store i32 %a2, i32* @n, align 4
+ ret void
+}
+
+; CHECK: .local _MergedGlobals
+; CHECK: .comm _MergedGlobals,8,8
+; NO-MERGE-NOT: .local _MergedGlobals
+
+; CHECK-APPLE-IOS: .zerofill __DATA,__bss,__MergedGlobals,8,3
+; CHECK-APPLE-IOS-NO-MERGE-NOT: .zerofill __DATA,__bss,__MergedGlobals,8,3
diff --git a/test/CodeGen/AArch64/i128-fast-isel-fallback.ll b/test/CodeGen/AArch64/i128-fast-isel-fallback.ll
new file mode 100644
index 0000000..1cffbf3
--- /dev/null
+++ b/test/CodeGen/AArch64/i128-fast-isel-fallback.ll
@@ -0,0 +1,18 @@
+; RUN: llc -O0 -mtriple=arm64-apple-ios7.0 -mcpu=generic < %s | FileCheck %s
+
+; Function Attrs: nounwind ssp
+define void @test1() {
+ %1 = sext i32 0 to i128
+ call void @test2(i128 %1)
+ ret void
+
+; The i128 is 0, so we can test that it is propagated into the pair of x
+; registers that make up the i128.
+
+; CHECK: mov x0, xzr
+; CHECK: mov x1, x0
+; CHECK: bl _test2
+
+}
+
+declare void @test2(i128)
diff --git a/test/CodeGen/AArch64/inlineasm-ldr-pseudo.ll b/test/CodeGen/AArch64/inlineasm-ldr-pseudo.ll
new file mode 100644
index 0000000..645214a
--- /dev/null
+++ b/test/CodeGen/AArch64/inlineasm-ldr-pseudo.ll
@@ -0,0 +1,26 @@
+; This test must use -filetype=obj because when we output assembly, the
+; current code path bypasses the parser and just writes the raw text out to
+; the Streamer. The inline asm needs to actually be parsed to demonstrate
+; the bug; going the asm->obj route does not show the issue.
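+; (Background on the pseudo being parsed: "ldr Xd, =imm" either expands to a
+; move-immediate, as with movz in @foo where 0xabcd fits in one instruction,
+; or places the constant in a literal pool after the function and loads it
+; pc-relatively, as in @bar.)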
+; RUN: llc -mtriple=aarch64 < %s -filetype=obj | llvm-objdump -arch=aarch64 -d - | FileCheck %s
+
+; CHECK-LABEL: foo:
+; CHECK: a0 79 95 d2 movz x0, #0xabcd
+; CHECK: c0 03 5f d6 ret
+define i32 @foo() nounwind {
+entry:
+ %0 = tail call i32 asm sideeffect "ldr $0,=0xabcd", "=r"() nounwind
+ ret i32 %0
+}
+; CHECK-LABEL: bar:
+; CHECK: 40 00 00 58 ldr x0, #8
+; CHECK: c0 03 5f d6 ret
+; Make sure the constant pool entry comes after the return
+; CHECK-LABEL: $d.1:
+define i32 @bar() nounwind {
+entry:
+ %0 = tail call i32 asm sideeffect "ldr $0,=0x10001", "=r"() nounwind
+ ret i32 %0
+}
+
index 1dfb789..69fbd99 100644
--- a/test/CodeGen/AArch64/jump-table.ll
+++ b/test/CodeGen/AArch64/jump-table.ll
@@ -1,6 +1,6 @@
-; RUN: llc -verify-machineinstrs -o - %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
-; RUN: llc -code-model=large -verify-machineinstrs -o - %s -mtriple=aarch64-none-linux-gnu | FileCheck --check-prefix=CHECK-LARGE %s
-; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs -relocation-model=pic -o - %s | FileCheck --check-prefix=CHECK-PIC %s
+; RUN: llc -verify-machineinstrs -o - %s -mtriple=aarch64-none-linux-gnu -aarch64-atomic-cfg-tidy=0 | FileCheck %s
+; RUN: llc -code-model=large -verify-machineinstrs -o - %s -mtriple=aarch64-none-linux-gnu -aarch64-atomic-cfg-tidy=0 | FileCheck --check-prefix=CHECK-LARGE %s
+; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs -relocation-model=pic -aarch64-atomic-cfg-tidy=0 -o - %s | FileCheck --check-prefix=CHECK-PIC %s
define i32 @test_jumptable(i32 %in) {
; CHECK: test_jumptable
diff --git a/test/CodeGen/AArch64/ldst-opt.ll b/test/CodeGen/AArch64/ldst-opt.ll
index 1ce5c95..e4f4295 100644
--- a/test/CodeGen/AArch64/ldst-opt.ll
+++ b/test/CodeGen/AArch64/ldst-opt.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=aarch64-linux-gnu -verify-machineinstrs -o - %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu -aarch64-atomic-cfg-tidy=0 -verify-machineinstrs -o - %s | FileCheck %s
; This file contains tests for the AArch64 load/store optimizer.
@@ -166,6 +166,217 @@ bar:
; Check the following transform:
;
+; add x8, x8, #16
+; ...
+; ldr X, [x8]
+; ->
+; ldr X, [x8, #16]!
+;
+; with X being either w0, x0, s0, d0 or q0.
+
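+; Background on the addressing modes (not extra checks): "[x8, #16]!" is
+; pre-indexed writeback, so the base register is updated before the access,
+; while "[x8], #16" in the post-indexed tests further down updates it after
+; the access; in both forms the separate add/sub of the base is folded away.
+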
+%pre.struct.i32 = type { i32, i32, i32}
+%pre.struct.i64 = type { i32, i64, i64}
+%pre.struct.i128 = type { i32, <2 x i64>, <2 x i64>}
+%pre.struct.float = type { i32, float, float}
+%pre.struct.double = type { i32, double, double}
+
+define i32 @load-pre-indexed-word2(%pre.struct.i32** %this, i1 %cond,
+ %pre.struct.i32* %load2) nounwind {
+; CHECK-LABEL: load-pre-indexed-word2
+; CHECK: ldr w{{[0-9]+}}, [x{{[0-9]+}}, #4]!
+ br i1 %cond, label %if.then, label %if.end
+if.then:
+ %load1 = load %pre.struct.i32** %this
+ %gep1 = getelementptr inbounds %pre.struct.i32* %load1, i64 0, i32 1
+ br label %return
+if.end:
+ %gep2 = getelementptr inbounds %pre.struct.i32* %load2, i64 0, i32 2
+ br label %return
+return:
+ %retptr = phi i32* [ %gep1, %if.then ], [ %gep2, %if.end ]
+ %ret = load i32* %retptr
+ ret i32 %ret
+}
+
+define i64 @load-pre-indexed-doubleword2(%pre.struct.i64** %this, i1 %cond,
+ %pre.struct.i64* %load2) nounwind {
+; CHECK-LABEL: load-pre-indexed-doubleword2
+; CHECK: ldr x{{[0-9]+}}, [x{{[0-9]+}}, #8]!
+ br i1 %cond, label %if.then, label %if.end
+if.then:
+ %load1 = load %pre.struct.i64** %this
+ %gep1 = getelementptr inbounds %pre.struct.i64* %load1, i64 0, i32 1
+ br label %return
+if.end:
+ %gep2 = getelementptr inbounds %pre.struct.i64* %load2, i64 0, i32 2
+ br label %return
+return:
+ %retptr = phi i64* [ %gep1, %if.then ], [ %gep2, %if.end ]
+ %ret = load i64* %retptr
+ ret i64 %ret
+}
+
+define <2 x i64> @load-pre-indexed-quadword2(%pre.struct.i128** %this, i1 %cond,
+ %pre.struct.i128* %load2) nounwind {
+; CHECK-LABEL: load-pre-indexed-quadword2
+; CHECK: ldr q{{[0-9]+}}, [x{{[0-9]+}}, #16]!
+ br i1 %cond, label %if.then, label %if.end
+if.then:
+ %load1 = load %pre.struct.i128** %this
+ %gep1 = getelementptr inbounds %pre.struct.i128* %load1, i64 0, i32 1
+ br label %return
+if.end:
+ %gep2 = getelementptr inbounds %pre.struct.i128* %load2, i64 0, i32 2
+ br label %return
+return:
+ %retptr = phi <2 x i64>* [ %gep1, %if.then ], [ %gep2, %if.end ]
+ %ret = load <2 x i64>* %retptr
+ ret <2 x i64> %ret
+}
+
+define float @load-pre-indexed-float2(%pre.struct.float** %this, i1 %cond,
+ %pre.struct.float* %load2) nounwind {
+; CHECK-LABEL: load-pre-indexed-float2
+; CHECK: ldr s{{[0-9]+}}, [x{{[0-9]+}}, #4]!
+ br i1 %cond, label %if.then, label %if.end
+if.then:
+ %load1 = load %pre.struct.float** %this
+ %gep1 = getelementptr inbounds %pre.struct.float* %load1, i64 0, i32 1
+ br label %return
+if.end:
+ %gep2 = getelementptr inbounds %pre.struct.float* %load2, i64 0, i32 2
+ br label %return
+return:
+ %retptr = phi float* [ %gep1, %if.then ], [ %gep2, %if.end ]
+ %ret = load float* %retptr
+ ret float %ret
+}
+
+define double @load-pre-indexed-double2(%pre.struct.double** %this, i1 %cond,
+ %pre.struct.double* %load2) nounwind {
+; CHECK-LABEL: load-pre-indexed-double2
+; CHECK: ldr d{{[0-9]+}}, [x{{[0-9]+}}, #8]!
+ br i1 %cond, label %if.then, label %if.end
+if.then:
+ %load1 = load %pre.struct.double** %this
+ %gep1 = getelementptr inbounds %pre.struct.double* %load1, i64 0, i32 1
+ br label %return
+if.end:
+ %gep2 = getelementptr inbounds %pre.struct.double* %load2, i64 0, i32 2
+ br label %return
+return:
+ %retptr = phi double* [ %gep1, %if.then ], [ %gep2, %if.end ]
+ %ret = load double* %retptr
+ ret double %ret
+}
+
+; Check the following transform:
+;
+; add x8, x8, #16
+; ...
+; str X, [x8]
+; ->
+; str X, [x8, #16]!
+;
+; with X being either w0, x0, s0, d0 or q0.
+
+define void @store-pre-indexed-word2(%pre.struct.i32** %this, i1 %cond,
+ %pre.struct.i32* %load2,
+ i32 %val) nounwind {
+; CHECK-LABEL: store-pre-indexed-word2
+; CHECK: str w{{[0-9]+}}, [x{{[0-9]+}}, #4]!
+ br i1 %cond, label %if.then, label %if.end
+if.then:
+ %load1 = load %pre.struct.i32** %this
+ %gep1 = getelementptr inbounds %pre.struct.i32* %load1, i64 0, i32 1
+ br label %return
+if.end:
+ %gep2 = getelementptr inbounds %pre.struct.i32* %load2, i64 0, i32 2
+ br label %return
+return:
+ %retptr = phi i32* [ %gep1, %if.then ], [ %gep2, %if.end ]
+ store i32 %val, i32* %retptr
+ ret void
+}
+
+define void @store-pre-indexed-doubleword2(%pre.struct.i64** %this, i1 %cond,
+ %pre.struct.i64* %load2,
+ i64 %val) nounwind {
+; CHECK-LABEL: store-pre-indexed-doubleword2
+; CHECK: str x{{[0-9]+}}, [x{{[0-9]+}}, #8]!
+ br i1 %cond, label %if.then, label %if.end
+if.then:
+ %load1 = load %pre.struct.i64** %this
+ %gep1 = getelementptr inbounds %pre.struct.i64* %load1, i64 0, i32 1
+ br label %return
+if.end:
+ %gep2 = getelementptr inbounds %pre.struct.i64* %load2, i64 0, i32 2
+ br label %return
+return:
+ %retptr = phi i64* [ %gep1, %if.then ], [ %gep2, %if.end ]
+ store i64 %val, i64* %retptr
+ ret void
+}
+
+define void @store-pre-indexed-quadword2(%pre.struct.i128** %this, i1 %cond,
+ %pre.struct.i128* %load2,
+ <2 x i64> %val) nounwind {
+; CHECK-LABEL: store-pre-indexed-quadword2
+; CHECK: str q{{[0-9]+}}, [x{{[0-9]+}}, #16]!
+ br i1 %cond, label %if.then, label %if.end
+if.then:
+ %load1 = load %pre.struct.i128** %this
+ %gep1 = getelementptr inbounds %pre.struct.i128* %load1, i64 0, i32 1
+ br label %return
+if.end:
+ %gep2 = getelementptr inbounds %pre.struct.i128* %load2, i64 0, i32 2
+ br label %return
+return:
+ %retptr = phi <2 x i64>* [ %gep1, %if.then ], [ %gep2, %if.end ]
+ store <2 x i64> %val, <2 x i64>* %retptr
+ ret void
+}
+
+define void @store-pre-indexed-float2(%pre.struct.float** %this, i1 %cond,
+ %pre.struct.float* %load2,
+ float %val) nounwind {
+; CHECK-LABEL: store-pre-indexed-float2
+; CHECK: str s{{[0-9]+}}, [x{{[0-9]+}}, #4]!
+ br i1 %cond, label %if.then, label %if.end
+if.then:
+ %load1 = load %pre.struct.float** %this
+ %gep1 = getelementptr inbounds %pre.struct.float* %load1, i64 0, i32 1
+ br label %return
+if.end:
+ %gep2 = getelementptr inbounds %pre.struct.float* %load2, i64 0, i32 2
+ br label %return
+return:
+ %retptr = phi float* [ %gep1, %if.then ], [ %gep2, %if.end ]
+ store float %val, float* %retptr
+ ret void
+}
+
+define void @store-pre-indexed-double2(%pre.struct.double** %this, i1 %cond,
+ %pre.struct.double* %load2,
+ double %val) nounwind {
+; CHECK-LABEL: store-pre-indexed-double2
+; CHECK: str d{{[0-9]+}}, [x{{[0-9]+}}, #8]!
+ br i1 %cond, label %if.then, label %if.end
+if.then:
+ %load1 = load %pre.struct.double** %this
+ %gep1 = getelementptr inbounds %pre.struct.double* %load1, i64 0, i32 1
+ br label %return
+if.end:
+ %gep2 = getelementptr inbounds %pre.struct.double* %load2, i64 0, i32 2
+ br label %return
+return:
+ %retptr = phi double* [ %gep1, %if.then ], [ %gep2, %if.end ]
+ store double %val, double* %retptr
+ ret void
+}
+
+; Check the following transform:
+;
; ldr X, [x20]
; ...
; add x20, x20, #32
@@ -294,8 +505,263 @@ exit:
ret void
}
+; Check the following transform:
+;
+; str X, [x20]
+; ...
+; add x20, x20, #32
+; ->
+; str X, [x20], #32
+;
+; with X being either w0, x0, s0, d0 or q0.
+
+define void @store-post-indexed-word(i32* %array, i64 %count, i32 %val) nounwind {
+; CHECK-LABEL: store-post-indexed-word
+; CHECK: str w{{[0-9]+}}, [x{{[0-9]+}}], #16
+entry:
+ %gep1 = getelementptr i32* %array, i64 2
+ br label %body
+
+body:
+ %iv2 = phi i32* [ %gep3, %body ], [ %gep1, %entry ]
+ %iv = phi i64 [ %iv.next, %body ], [ %count, %entry ]
+ %gep2 = getelementptr i32* %iv2, i64 -1
+ %load = load i32* %gep2
+ call void @use-word(i32 %load)
+ store i32 %val, i32* %iv2
+ %iv.next = add i64 %iv, -4
+ %gep3 = getelementptr i32* %iv2, i64 4
+ %cond = icmp eq i64 %iv.next, 0
+ br i1 %cond, label %exit, label %body
+
+exit:
+ ret void
+}
+
+define void @store-post-indexed-doubleword(i64* %array, i64 %count, i64 %val) nounwind {
+; CHECK-LABEL: store-post-indexed-doubleword
+; CHECK: str x{{[0-9]+}}, [x{{[0-9]+}}], #32
+entry:
+ %gep1 = getelementptr i64* %array, i64 2
+ br label %body
+
+body:
+ %iv2 = phi i64* [ %gep3, %body ], [ %gep1, %entry ]
+ %iv = phi i64 [ %iv.next, %body ], [ %count, %entry ]
+ %gep2 = getelementptr i64* %iv2, i64 -1
+ %load = load i64* %gep2
+ call void @use-doubleword(i64 %load)
+ store i64 %val, i64* %iv2
+ %iv.next = add i64 %iv, -4
+ %gep3 = getelementptr i64* %iv2, i64 4
+ %cond = icmp eq i64 %iv.next, 0
+ br i1 %cond, label %exit, label %body
+
+exit:
+ ret void
+}
+
+define void @store-post-indexed-quadword(<2 x i64>* %array, i64 %count, <2 x i64> %val) nounwind {
+; CHECK-LABEL: store-post-indexed-quadword
+; CHECK: str q{{[0-9]+}}, [x{{[0-9]+}}], #64
+entry:
+ %gep1 = getelementptr <2 x i64>* %array, i64 2
+ br label %body
+
+body:
+ %iv2 = phi <2 x i64>* [ %gep3, %body ], [ %gep1, %entry ]
+ %iv = phi i64 [ %iv.next, %body ], [ %count, %entry ]
+ %gep2 = getelementptr <2 x i64>* %iv2, i64 -1
+ %load = load <2 x i64>* %gep2
+ call void @use-quadword(<2 x i64> %load)
+ store <2 x i64> %val, <2 x i64>* %iv2
+ %iv.next = add i64 %iv, -4
+ %gep3 = getelementptr <2 x i64>* %iv2, i64 4
+ %cond = icmp eq i64 %iv.next, 0
+ br i1 %cond, label %exit, label %body
+
+exit:
+ ret void
+}
+
+define void @store-post-indexed-float(float* %array, i64 %count, float %val) nounwind {
+; CHECK-LABEL: store-post-indexed-float
+; CHECK: str s{{[0-9]+}}, [x{{[0-9]+}}], #16
+entry:
+ %gep1 = getelementptr float* %array, i64 2
+ br label %body
+
+body:
+ %iv2 = phi float* [ %gep3, %body ], [ %gep1, %entry ]
+ %iv = phi i64 [ %iv.next, %body ], [ %count, %entry ]
+ %gep2 = getelementptr float* %iv2, i64 -1
+ %load = load float* %gep2
+ call void @use-float(float %load)
+ store float %val, float* %iv2
+ %iv.next = add i64 %iv, -4
+ %gep3 = getelementptr float* %iv2, i64 4
+ %cond = icmp eq i64 %iv.next, 0
+ br i1 %cond, label %exit, label %body
+
+exit:
+ ret void
+}
+
+define void @store-post-indexed-double(double* %array, i64 %count, double %val) nounwind {
+; CHECK-LABEL: store-post-indexed-double
+; CHECK: str d{{[0-9]+}}, [x{{[0-9]+}}], #32
+entry:
+ %gep1 = getelementptr double* %array, i64 2
+ br label %body
+
+body:
+ %iv2 = phi double* [ %gep3, %body ], [ %gep1, %entry ]
+ %iv = phi i64 [ %iv.next, %body ], [ %count, %entry ]
+ %gep2 = getelementptr double* %iv2, i64 -1
+ %load = load double* %gep2
+ call void @use-double(double %load)
+ store double %val, double* %iv2
+ %iv.next = add i64 %iv, -4
+ %gep3 = getelementptr double* %iv2, i64 4
+ %cond = icmp eq i64 %iv.next, 0
+ br i1 %cond, label %exit, label %body
+
+exit:
+ ret void
+}
+
declare void @use-word(i32)
declare void @use-doubleword(i64)
declare void @use-quadword(<2 x i64>)
declare void @use-float(float)
declare void @use-double(double)
+
+; Check the following transform:
+;
+; (ldr|str) X, [x20]
+; ...
+; sub x20, x20, #16
+; ->
+; (ldr|str) X, [x20], #-16
+;
+; with X being either w0, x0, s0, d0 or q0.
+
+define void @post-indexed-sub-word(i32* %a, i32* %b, i64 %count) nounwind {
+; CHECK-LABEL: post-indexed-sub-word
+; CHECK: ldr w{{[0-9]+}}, [x{{[0-9]+}}], #-8
+; CHECK: str w{{[0-9]+}}, [x{{[0-9]+}}], #-8
+ br label %for.body
+for.body:
+ %phi1 = phi i32* [ %gep4, %for.body ], [ %b, %0 ]
+ %phi2 = phi i32* [ %gep3, %for.body ], [ %a, %0 ]
+ %i = phi i64 [ %dec.i, %for.body], [ %count, %0 ]
+ %gep1 = getelementptr i32* %phi1, i64 -1
+ %load1 = load i32* %gep1
+ %gep2 = getelementptr i32* %phi2, i64 -1
+ store i32 %load1, i32* %gep2
+ %load2 = load i32* %phi1
+ store i32 %load2, i32* %phi2
+ %dec.i = add nsw i64 %i, -1
+ %gep3 = getelementptr i32* %phi2, i64 -2
+ %gep4 = getelementptr i32* %phi1, i64 -2
+ %cond = icmp sgt i64 %dec.i, 0
+ br i1 %cond, label %for.body, label %end
+end:
+ ret void
+}
+
+define void @post-indexed-sub-doubleword(i64* %a, i64* %b, i64 %count) nounwind {
+; CHECK-LABEL: post-indexed-sub-doubleword
+; CHECK: ldr x{{[0-9]+}}, [x{{[0-9]+}}], #-16
+; CHECK: str x{{[0-9]+}}, [x{{[0-9]+}}], #-16
+ br label %for.body
+for.body:
+ %phi1 = phi i64* [ %gep4, %for.body ], [ %b, %0 ]
+ %phi2 = phi i64* [ %gep3, %for.body ], [ %a, %0 ]
+ %i = phi i64 [ %dec.i, %for.body], [ %count, %0 ]
+ %gep1 = getelementptr i64* %phi1, i64 -1
+ %load1 = load i64* %gep1
+ %gep2 = getelementptr i64* %phi2, i64 -1
+ store i64 %load1, i64* %gep2
+ %load2 = load i64* %phi1
+ store i64 %load2, i64* %phi2
+ %dec.i = add nsw i64 %i, -1
+ %gep3 = getelementptr i64* %phi2, i64 -2
+ %gep4 = getelementptr i64* %phi1, i64 -2
+ %cond = icmp sgt i64 %dec.i, 0
+ br i1 %cond, label %for.body, label %end
+end:
+ ret void
+}
+
+define void @post-indexed-sub-quadword(<2 x i64>* %a, <2 x i64>* %b, i64 %count) nounwind {
+; CHECK-LABEL: post-indexed-sub-quadword
+; CHECK: ldr q{{[0-9]+}}, [x{{[0-9]+}}], #-32
+; CHECK: str q{{[0-9]+}}, [x{{[0-9]+}}], #-32
+ br label %for.body
+for.body:
+ %phi1 = phi <2 x i64>* [ %gep4, %for.body ], [ %b, %0 ]
+ %phi2 = phi <2 x i64>* [ %gep3, %for.body ], [ %a, %0 ]
+ %i = phi i64 [ %dec.i, %for.body], [ %count, %0 ]
+ %gep1 = getelementptr <2 x i64>* %phi1, i64 -1
+ %load1 = load <2 x i64>* %gep1
+ %gep2 = getelementptr <2 x i64>* %phi2, i64 -1
+ store <2 x i64> %load1, <2 x i64>* %gep2
+ %load2 = load <2 x i64>* %phi1
+ store <2 x i64> %load2, <2 x i64>* %phi2
+ %dec.i = add nsw i64 %i, -1
+ %gep3 = getelementptr <2 x i64>* %phi2, i64 -2
+ %gep4 = getelementptr <2 x i64>* %phi1, i64 -2
+ %cond = icmp sgt i64 %dec.i, 0
+ br i1 %cond, label %for.body, label %end
+end:
+ ret void
+}
+
+define void @post-indexed-sub-float(float* %a, float* %b, i64 %count) nounwind {
+; CHECK-LABEL: post-indexed-sub-float
+; CHECK: ldr s{{[0-9]+}}, [x{{[0-9]+}}], #-8
+; CHECK: str s{{[0-9]+}}, [x{{[0-9]+}}], #-8
+ br label %for.body
+for.body:
+ %phi1 = phi float* [ %gep4, %for.body ], [ %b, %0 ]
+ %phi2 = phi float* [ %gep3, %for.body ], [ %a, %0 ]
+ %i = phi i64 [ %dec.i, %for.body], [ %count, %0 ]
+ %gep1 = getelementptr float* %phi1, i64 -1
+ %load1 = load float* %gep1
+ %gep2 = getelementptr float* %phi2, i64 -1
+ store float %load1, float* %gep2
+ %load2 = load float* %phi1
+ store float %load2, float* %phi2
+ %dec.i = add nsw i64 %i, -1
+ %gep3 = getelementptr float* %phi2, i64 -2
+ %gep4 = getelementptr float* %phi1, i64 -2
+ %cond = icmp sgt i64 %dec.i, 0
+ br i1 %cond, label %for.body, label %end
+end:
+ ret void
+}
+
+define void @post-indexed-sub-double(double* %a, double* %b, i64 %count) nounwind {
+; CHECK-LABEL: post-indexed-sub-double
+; CHECK: ldr d{{[0-9]+}}, [x{{[0-9]+}}], #-16
+; CHECK: str d{{[0-9]+}}, [x{{[0-9]+}}], #-16
+ br label %for.body
+for.body:
+ %phi1 = phi double* [ %gep4, %for.body ], [ %b, %0 ]
+ %phi2 = phi double* [ %gep3, %for.body ], [ %a, %0 ]
+ %i = phi i64 [ %dec.i, %for.body], [ %count, %0 ]
+ %gep1 = getelementptr double* %phi1, i64 -1
+ %load1 = load double* %gep1
+ %gep2 = getelementptr double* %phi2, i64 -1
+ store double %load1, double* %gep2
+ %load2 = load double* %phi1
+ store double %load2, double* %phi2
+ %dec.i = add nsw i64 %i, -1
+ %gep3 = getelementptr double* %phi2, i64 -2
+ %gep4 = getelementptr double* %phi1, i64 -2
+ %cond = icmp sgt i64 %dec.i, 0
+ br i1 %cond, label %for.body, label %end
+end:
+ ret void
+}
diff --git a/test/CodeGen/AArch64/lit.local.cfg b/test/CodeGen/AArch64/lit.local.cfg
index 77493d8..125995c 100644
--- a/test/CodeGen/AArch64/lit.local.cfg
+++ b/test/CodeGen/AArch64/lit.local.cfg
@@ -2,8 +2,7 @@ import re
config.suffixes = ['.ll']
-targets = set(config.root.targets_to_build.split())
-if not 'AArch64' in targets:
+if not 'AArch64' in config.root.targets:
config.unsupported = True
# For now we don't test arm64-win32.
diff --git a/test/CodeGen/AArch64/memcpy-f128.ll b/test/CodeGen/AArch64/memcpy-f128.ll
new file mode 100644
index 0000000..76db297
--- /dev/null
+++ b/test/CodeGen/AArch64/memcpy-f128.ll
@@ -0,0 +1,19 @@
+; RUN: llc < %s -march=aarch64 -mtriple=aarch64-linux-gnu | FileCheck %s
+
+%structA = type { i128 }
+@stubA = internal unnamed_addr constant %structA zeroinitializer, align 8
+
+; Make sure we don't hit llvm_unreachable.
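+; (The 48-byte copy from the i128-typed constant is lowered with q-register
+; loads and stores, as the CHECK lines show; the point here is only that llc
+; finishes without asserting.)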
+
+define void @test1() {
+; CHECK-LABEL: @test1
+; CHECK: adrp
+; CHECK: ldr q0
+; CHECK: str q0
+; CHECK: ret
+entry:
+ tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* undef, i8* bitcast (%structA* @stubA to i8*), i64 48, i32 8, i1 false)
+ ret void
+}
+
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i32, i1)
diff --git a/test/CodeGen/AArch64/mul_pow2.ll b/test/CodeGen/AArch64/mul_pow2.ll
new file mode 100644
index 0000000..efc0ec8
--- /dev/null
+++ b/test/CodeGen/AArch64/mul_pow2.ll
@@ -0,0 +1,123 @@
+; RUN: llc < %s -march=aarch64 | FileCheck %s
+
+; Convert mul x, pow2 to shift.
+; Convert mul x, pow2 +/- 1 to shift + add/sub.
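+;
+; For example, the identities behind the expected code (matching the CHECK
+; lines below rather than prescribing the lowering):
+;   x * 3  == (x << 1) + x  ->  add w0, w0, w0, lsl #1
+;   x * 7  == (x << 3) - x  ->  lsl #3 then sub
+;   x * -8 == -(x << 3)     ->  neg w0, w0, lsl #3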
+
+define i32 @test2(i32 %x) {
+; CHECK-LABEL: test2
+; CHECK: lsl w0, w0, #1
+
+ %mul = shl nsw i32 %x, 1
+ ret i32 %mul
+}
+
+define i32 @test3(i32 %x) {
+; CHECK-LABEL: test3
+; CHECK: add w0, w0, w0, lsl #1
+
+ %mul = mul nsw i32 %x, 3
+ ret i32 %mul
+}
+
+define i32 @test4(i32 %x) {
+; CHECK-LABEL: test4
+; CHECK: lsl w0, w0, #2
+
+ %mul = shl nsw i32 %x, 2
+ ret i32 %mul
+}
+
+define i32 @test5(i32 %x) {
+; CHECK-LABEL: test5
+; CHECK: add w0, w0, w0, lsl #2
+
+ %mul = mul nsw i32 %x, 5
+ ret i32 %mul
+}
+
+define i32 @test7(i32 %x) {
+; CHECK-LABEL: test7
+; CHECK: lsl {{w[0-9]+}}, w0, #3
+; CHECK: sub w0, {{w[0-9]+}}, w0
+
+ %mul = mul nsw i32 %x, 7
+ ret i32 %mul
+}
+
+define i32 @test8(i32 %x) {
+; CHECK-LABEL: test8
+; CHECK: lsl w0, w0, #3
+
+ %mul = shl nsw i32 %x, 3
+ ret i32 %mul
+}
+
+define i32 @test9(i32 %x) {
+; CHECK-LABEL: test9
+; CHECK: add w0, w0, w0, lsl #3
+
+ %mul = mul nsw i32 %x, 9
+ ret i32 %mul
+}
+
+; Convert mul x, -pow2 to shift.
+; Convert mul x, -(pow2 +/- 1) to shift + add/sub.
+
+define i32 @ntest2(i32 %x) {
+; CHECK-LABEL: ntest2
+; CHECK: neg w0, w0, lsl #1
+
+ %mul = mul nsw i32 %x, -2
+ ret i32 %mul
+}
+
+define i32 @ntest3(i32 %x) {
+; CHECK-LABEL: ntest3
+; CHECK: add {{w[0-9]+}}, w0, w0, lsl #1
+; CHECK: neg w0, {{w[0-9]+}}
+
+ %mul = mul nsw i32 %x, -3
+ ret i32 %mul
+}
+
+define i32 @ntest4(i32 %x) {
+; CHECK-LABEL: ntest4
+; CHECK: neg w0, w0, lsl #2
+
+ %mul = mul nsw i32 %x, -4
+ ret i32 %mul
+}
+
+define i32 @ntest5(i32 %x) {
+; CHECK-LABEL: ntest5
+; CHECK: add {{w[0-9]+}}, w0, w0, lsl #2
+; CHECK: neg w0, {{w[0-9]+}}
+ %mul = mul nsw i32 %x, -5
+ ret i32 %mul
+}
+
+define i32 @ntest7(i32 %x) {
+; CHECK-LABEL: ntest7
+; CHECK: sub w0, w0, w0, lsl #3
+
+ %mul = mul nsw i32 %x, -7
+ ret i32 %mul
+}
+
+define i32 @ntest8(i32 %x) {
+; CHECK-LABEL: ntest8
+; CHECK: neg w0, w0, lsl #3
+
+ %mul = mul nsw i32 %x, -8
+ ret i32 %mul
+}
+
+define i32 @ntest9(i32 %x) {
+; CHECK-LABEL: ntest9
+; CHECK: add {{w[0-9]+}}, w0, w0, lsl #3
+; CHECK: neg w0, {{w[0-9]+}}
+
+ %mul = mul nsw i32 %x, -9
+ ret i32 %mul
+}
diff --git a/test/CodeGen/AArch64/regress-tail-livereg.ll b/test/CodeGen/AArch64/regress-tail-livereg.ll
index e32ac84..03c3f33 100644
--- a/test/CodeGen/AArch64/regress-tail-livereg.ll
+++ b/test/CodeGen/AArch64/regress-tail-livereg.ll
@@ -17,3 +17,17 @@ define void @foo() {
; CHECK: br {{x([0-79]|1[0-8])}}
ret void
}
+
+; No matter how tempting it is, LLVM should not use x30 since that'll be
+; restored to its incoming value before the "br".
+define void @test_x30_tail() {
+; CHECK-LABEL: test_x30_tail:
+; CHECK: mov [[DEST:x[0-9]+]], x30
+; CHECK: br [[DEST]]
+ %addr = call i8* @llvm.returnaddress(i32 0)
+ %faddr = bitcast i8* %addr to void()*
+ tail call void %faddr()
+ ret void
+}
+
+declare i8* @llvm.returnaddress(i32)
diff --git a/test/CodeGen/AArch64/trunc-v1i64.ll b/test/CodeGen/AArch64/trunc-v1i64.ll
new file mode 100644
index 0000000..159b8e0
--- /dev/null
+++ b/test/CodeGen/AArch64/trunc-v1i64.ll
@@ -0,0 +1,63 @@
+; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon -verify-machineinstrs < %s | FileCheck %s
+
+; An optimization in DAG Combiner to fold
+; (trunc (concat ... x ...)) -> (concat ..., (trunc x), ...))
+; will generate nodes like:
+; v1i32 trunc v1i64, v1i16 trunc v1i64, v1i8 trunc v1i64.
+; And such nodes will be defaultly scalarized in type legalization. But such
+; scalarization will cause an assertion failure, as v1i64 is a legal type in
+; AArch64. We change the default behaviour from be scalarized to be widen.
+
+; FIXME: Currently an XTN is generated for v1i32, but it could be optimized
+; away so that, just as for v1i16 and v1i8, no XTN is generated.
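+; (One reading of the asymmetry: after widening, the v1i32 trunc becomes a
+; v2i64 -> v2i32 truncate, which lowers to the xtn.2s checked below, whereas
+; the widened i16/i8 cases are handled without a separate narrowing step.)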
+
+define <2 x i32> @test_v1i32_0(<1 x i64> %in0) {
+; CHECK-LABEL: test_v1i32_0:
+; CHECK: xtn v0.2s, v0.2d
+ %1 = shufflevector <1 x i64> %in0, <1 x i64> undef, <2 x i32> <i32 0, i32 undef>
+ %2 = trunc <2 x i64> %1 to <2 x i32>
+ ret <2 x i32> %2
+}
+
+define <2 x i32> @test_v1i32_1(<1 x i64> %in0) {
+; CHECK-LABEL: test_v1i32_1:
+; CHECK: xtn v0.2s, v0.2d
+; CHECK-NEXT: dup v0.2s, v0.s[0]
+ %1 = shufflevector <1 x i64> %in0, <1 x i64> undef, <2 x i32> <i32 undef, i32 0>
+ %2 = trunc <2 x i64> %1 to <2 x i32>
+ ret <2 x i32> %2
+}
+
+define <4 x i16> @test_v1i16_0(<1 x i64> %in0) {
+; CHECK-LABEL: test_v1i16_0:
+; CHECK-NOT: xtn
+ %1 = shufflevector <1 x i64> %in0, <1 x i64> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
+ %2 = trunc <4 x i64> %1 to <4 x i16>
+ ret <4 x i16> %2
+}
+
+define <4 x i16> @test_v1i16_1(<1 x i64> %in0) {
+; CHECK-LABEL: test_v1i16_1:
+; CHECK-NOT: xtn
+; CHECK: dup v0.4h, v0.h[0]
+ %1 = shufflevector <1 x i64> %in0, <1 x i64> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 undef>
+ %2 = trunc <4 x i64> %1 to <4 x i16>
+ ret <4 x i16> %2
+}
+
+define <8 x i8> @test_v1i8_0(<1 x i64> %in0) {
+; CHECK-LABEL: test_v1i8_0:
+; CHECK-NOT: xtn
+ %1 = shufflevector <1 x i64> %in0, <1 x i64> undef, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = trunc <8 x i64> %1 to <8 x i8>
+ ret <8 x i8> %2
+}
+
+define <8 x i8> @test_v1i8_1(<1 x i64> %in0) {
+; CHECK-LABEL: test_v1i8_1:
+; CHECK-NOT: xtn
+; CHECK: dup v0.8b, v0.b[0]
+ %1 = shufflevector <1 x i64> %in0, <1 x i64> undef, <8 x i32> <i32 undef, i32 undef, i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = trunc <8 x i64> %1 to <8 x i8>
+ ret <8 x i8> %2
+}
\ No newline at end of file
diff --git a/test/CodeGen/AArch64/tst-br.ll b/test/CodeGen/AArch64/tst-br.ll
index 8a2fe26..5dc7b5d 100644
--- a/test/CodeGen/AArch64/tst-br.ll
+++ b/test/CodeGen/AArch64/tst-br.ll
@@ -1,4 +1,4 @@
-; RUN: llc -verify-machineinstrs -o - %s -mtriple=arm64-apple-ios7.0 | FileCheck %s
+; RUN: llc -verify-machineinstrs -o - %s -mtriple=arm64-apple-ios7.0 -aarch64-atomic-cfg-tidy=0 | FileCheck %s
; We've got the usual issues with LLVM reordering blocks here. The
; tests are correct for the current order, but who knows when that