Diffstat (limited to 'test/CodeGen/AArch64')
-rw-r--r--  test/CodeGen/AArch64/PBQP-chain.ll | 104
-rw-r--r--  test/CodeGen/AArch64/PBQP-coalesce-benefit.ll | 14
-rw-r--r--  test/CodeGen/AArch64/PBQP-csr.ll | 91
-rw-r--r--  test/CodeGen/AArch64/PBQP.ll | 14
-rw-r--r--  test/CodeGen/AArch64/Redundantstore.ll | 25
-rw-r--r--  test/CodeGen/AArch64/aarch64-2014-08-11-MachineCombinerCrash.ll | 106
-rw-r--r--  test/CodeGen/AArch64/aarch64-a57-fp-load-balancing.ll | 14
-rw-r--r--  test/CodeGen/AArch64/aarch64-be-bv.ll | 831
-rw-r--r--  test/CodeGen/AArch64/aarch64-gep-opt.ll | 163
-rw-r--r--  test/CodeGen/AArch64/aarch64-smull.ll | 332
-rw-r--r--  test/CodeGen/AArch64/aarch64-wide-shuffle.ll | 22
-rw-r--r--  test/CodeGen/AArch64/aarch64_f16_be.ll | 67
-rw-r--r--  test/CodeGen/AArch64/aarch64_tree_tests.ll | 42
-rw-r--r--  test/CodeGen/AArch64/adc.ll | 2
-rw-r--r--  test/CodeGen/AArch64/analyzecmp.ll | 32
-rw-r--r--  test/CodeGen/AArch64/and-mask-removal.ll | 269
-rw-r--r--  test/CodeGen/AArch64/andandshift.ll | 28
-rw-r--r--  test/CodeGen/AArch64/arm64-2011-03-17-AsmPrinterCrash.ll | 34
-rw-r--r--  test/CodeGen/AArch64/arm64-2011-10-18-LdStOptBug.ll | 2
-rw-r--r--  test/CodeGen/AArch64/arm64-2012-06-06-FPToUI.ll | 4
-rw-r--r--  test/CodeGen/AArch64/arm64-AdvSIMD-Scalar.ll | 66
-rw-r--r--  test/CodeGen/AArch64/arm64-EXT-undef-mask.ll | 2
-rw-r--r--  test/CodeGen/AArch64/arm64-aapcs-be.ll | 24
-rw-r--r--  test/CodeGen/AArch64/arm64-aapcs.ll | 42
-rw-r--r--  test/CodeGen/AArch64/arm64-abi.ll | 38
-rw-r--r--  test/CodeGen/AArch64/arm64-abi_align.ll | 60
-rw-r--r--  test/CodeGen/AArch64/arm64-addr-mode-folding.ll | 2
-rw-r--r--  test/CodeGen/AArch64/arm64-addrmode.ll | 121
-rw-r--r--  test/CodeGen/AArch64/arm64-bcc.ll | 60
-rw-r--r--  test/CodeGen/AArch64/arm64-big-endian-bitconverts.ll | 4
-rw-r--r--  test/CodeGen/AArch64/arm64-big-endian-eh.ll | 2
-rw-r--r--  test/CodeGen/AArch64/arm64-big-endian-varargs.ll | 2
-rw-r--r--  test/CodeGen/AArch64/arm64-big-endian-vector-callee.ll | 4
-rw-r--r--  test/CodeGen/AArch64/arm64-big-endian-vector-caller.ll | 4
-rw-r--r--  test/CodeGen/AArch64/arm64-cse.ll | 2
-rw-r--r--  test/CodeGen/AArch64/arm64-dagcombiner-dead-indexed-load.ll | 4
-rw-r--r--  test/CodeGen/AArch64/arm64-dagcombiner-indexed-load.ll | 46
-rw-r--r--  test/CodeGen/AArch64/arm64-extern-weak.ll | 19
-rw-r--r--  test/CodeGen/AArch64/arm64-fast-isel-addr-offset.ll | 2
-rw-r--r--  test/CodeGen/AArch64/arm64-fast-isel-alloca.ll | 7
-rw-r--r--  test/CodeGen/AArch64/arm64-fast-isel-br.ll | 17
-rw-r--r--  test/CodeGen/AArch64/arm64-fast-isel-call.ll | 231
-rw-r--r--  test/CodeGen/AArch64/arm64-fast-isel-conversion.ll | 126
-rw-r--r--  test/CodeGen/AArch64/arm64-fast-isel-fcmp.ll | 286
-rw-r--r--  test/CodeGen/AArch64/arm64-fast-isel-gv.ll | 29
-rw-r--r--  test/CodeGen/AArch64/arm64-fast-isel-icmp.ll | 179
-rw-r--r--  test/CodeGen/AArch64/arm64-fast-isel-indirectbr.ll | 8
-rw-r--r--  test/CodeGen/AArch64/arm64-fast-isel-intrinsic.ll | 4
-rw-r--r--  test/CodeGen/AArch64/arm64-fast-isel-materialize.ll | 44
-rw-r--r--  test/CodeGen/AArch64/arm64-fast-isel-noconvert.ll | 2
-rw-r--r--  test/CodeGen/AArch64/arm64-fast-isel-rem.ll | 3
-rw-r--r--  test/CodeGen/AArch64/arm64-fast-isel-ret.ll | 2
-rw-r--r--  test/CodeGen/AArch64/arm64-fast-isel-select.ll | 63
-rw-r--r--  test/CodeGen/AArch64/arm64-fast-isel-store.ll | 30
-rw-r--r--  test/CodeGen/AArch64/arm64-fast-isel.ll | 8
-rw-r--r--  test/CodeGen/AArch64/arm64-frameaddr.ll | 15
-rw-r--r--  test/CodeGen/AArch64/arm64-indexed-memory.ll | 12
-rw-r--r--  test/CodeGen/AArch64/arm64-inline-asm.ll | 10
-rw-r--r--  test/CodeGen/AArch64/arm64-patchpoint-scratch-regs.ll | 18
-rw-r--r--  test/CodeGen/AArch64/arm64-patchpoint-webkit_jscc.ll | 118
-rw-r--r--  test/CodeGen/AArch64/arm64-patchpoint.ll | 91
-rw-r--r--  test/CodeGen/AArch64/arm64-popcnt.ll | 14
-rw-r--r--  test/CodeGen/AArch64/arm64-prefetch.ll | 55
-rw-r--r--  test/CodeGen/AArch64/arm64-scvt.ll | 27
-rw-r--r--  test/CodeGen/AArch64/arm64-setcc-int-to-fp-combine.ll | 47
-rw-r--r--  test/CodeGen/AArch64/arm64-shifted-sext.ll | 4
-rw-r--r--  test/CodeGen/AArch64/arm64-stackmap.ll | 3
-rw-r--r--  test/CodeGen/AArch64/arm64-triv-disjoint-mem-access.ll | 31
-rw-r--r--  test/CodeGen/AArch64/arm64-vabs.ll | 70
-rw-r--r--  test/CodeGen/AArch64/arm64-vcvt_f.ll | 12
-rw-r--r--  test/CodeGen/AArch64/arm64-vector-ext.ll | 11
-rw-r--r--  test/CodeGen/AArch64/arm64-xaluo.ll | 305
-rw-r--r--  test/CodeGen/AArch64/atomic-ops.ll | 8
-rw-r--r--  test/CodeGen/AArch64/br-undef-cond.ll | 26
-rw-r--r--  test/CodeGen/AArch64/cmp-const-max.ll | 36
-rw-r--r--  test/CodeGen/AArch64/cmpwithshort.ll | 46
-rw-r--r--  test/CodeGen/AArch64/combine-comparisons-by-cse.ll | 413
-rw-r--r--  test/CodeGen/AArch64/cond-sel.ll | 17
-rw-r--r--  test/CodeGen/AArch64/dag-combine-invaraints.ll | 36
-rw-r--r--  test/CodeGen/AArch64/dont-take-over-the-world.ll | 7
-rw-r--r--  test/CodeGen/AArch64/extern-weak.ll | 19
-rw-r--r--  test/CodeGen/AArch64/f16-convert.ll | 51
-rw-r--r--  test/CodeGen/AArch64/fast-isel-addressing-modes.ll | 627
-rw-r--r--  test/CodeGen/AArch64/fast-isel-branch_weights.ll | 19
-rw-r--r--  test/CodeGen/AArch64/fast-isel-call-return.ll | 12
-rw-r--r--  test/CodeGen/AArch64/fast-isel-cbz.ll | 70
-rw-r--r--  test/CodeGen/AArch64/fast-isel-cmp-branch.ll | 293
-rw-r--r--  test/CodeGen/AArch64/fast-isel-folding.ll | 54
-rw-r--r--  test/CodeGen/AArch64/fast-isel-gep.ll | 49
-rw-r--r--  test/CodeGen/AArch64/fast-isel-int-ext.ll | 491
-rw-r--r--  test/CodeGen/AArch64/fast-isel-int-ext2.ll | 439
-rw-r--r--  test/CodeGen/AArch64/fast-isel-int-ext3.ll | 117
-rw-r--r--  test/CodeGen/AArch64/fast-isel-int-ext4.ll | 20
-rw-r--r--  test/CodeGen/AArch64/fast-isel-intrinsic.ll | 19
-rw-r--r--  test/CodeGen/AArch64/fast-isel-logic-op.ll | 362
-rw-r--r--  test/CodeGen/AArch64/fast-isel-mul.ll | 60
-rw-r--r--  test/CodeGen/AArch64/fast-isel-runtime-libcall.ll | 96
-rw-r--r--  test/CodeGen/AArch64/fast-isel-sdiv.ll | 56
-rw-r--r--  test/CodeGen/AArch64/fast-isel-select.ll | 316
-rw-r--r--  test/CodeGen/AArch64/fast-isel-shift.ll | 545
-rw-r--r--  test/CodeGen/AArch64/fast-isel-sqrt.ll | 20
-rw-r--r--  test/CodeGen/AArch64/fast-isel-switch-phi.ll | 25
-rw-r--r--  test/CodeGen/AArch64/fast-isel-tbz.ll | 141
-rw-r--r--  test/CodeGen/AArch64/fast-isel-trunc.ll | 12
-rw-r--r--  test/CodeGen/AArch64/fast-isel-vector-arithmetic.ll | 74
-rw-r--r--  test/CodeGen/AArch64/fast-isel-vret.ll | 9
-rw-r--r--  test/CodeGen/AArch64/fp16-instructions.ll | 109
-rw-r--r--  test/CodeGen/AArch64/fp16-v4-instructions.ll | 122
-rw-r--r--  test/CodeGen/AArch64/fp16-v8-instructions.ll | 255
-rw-r--r--  test/CodeGen/AArch64/fp16-vector-bitcast.ll | 203
-rw-r--r--  test/CodeGen/AArch64/fp16-vector-load-store.ll | 528
-rw-r--r--  test/CodeGen/AArch64/fp16-vector-shuffle.ll | 301
-rw-r--r--  test/CodeGen/AArch64/fpconv-vector-op-scalarize.ll | 44
-rw-r--r--  test/CodeGen/AArch64/frameaddr.ll | 29
-rw-r--r--  test/CodeGen/AArch64/func-calls.ll | 2
-rw-r--r--  test/CodeGen/AArch64/half.ll | 83
-rw-r--r--  test/CodeGen/AArch64/hints.ll | 67
-rw-r--r--  test/CodeGen/AArch64/init-array.ll | 4
-rw-r--r--  test/CodeGen/AArch64/intrinsics-memory-barrier.ll | 57
-rw-r--r--  test/CodeGen/AArch64/jump-table.ll | 11
-rw-r--r--  test/CodeGen/AArch64/legalize-bug-bogus-cpu.ll | 8
-rw-r--r--  test/CodeGen/AArch64/machine_cse.ll | 45
-rw-r--r--  test/CodeGen/AArch64/madd-combiner.ll | 37
-rw-r--r--  test/CodeGen/AArch64/madd-lohi.ll | 19
-rw-r--r--  test/CodeGen/AArch64/mul-lohi.ll | 13
-rw-r--r--  test/CodeGen/AArch64/neon-perm.ll | 7
-rw-r--r--  test/CodeGen/AArch64/neon-scalar-copy.ll | 17
-rw-r--r--  test/CodeGen/AArch64/paired-load.ll | 16
-rw-r--r--  test/CodeGen/AArch64/pic-eh-stubs.ll | 2
-rw-r--r--  test/CodeGen/AArch64/postra-mi-sched.ll | 31
-rw-r--r--  test/CodeGen/AArch64/rbit.ll | 20
-rw-r--r--  test/CodeGen/AArch64/rm_redundant_cmp.ll | 254
-rw-r--r--  test/CodeGen/AArch64/sdivpow2.ll | 74
-rw-r--r--  test/CodeGen/AArch64/stack-guard-remat-bitcast.ll | 26
-rw-r--r--  test/CodeGen/AArch64/stack_guard_remat.ll | 48
-rw-r--r--  test/CodeGen/AArch64/tail-call.ll | 11
-rw-r--r--  test/CodeGen/AArch64/tailcall-fastisel.ll | 11
-rw-r--r--  test/CodeGen/AArch64/tbz-tbnz.ll | 258
-rw-r--r--  test/CodeGen/AArch64/trunc-v1i64.ll | 21
139 files changed, 11039 insertions, 861 deletions
diff --git a/test/CodeGen/AArch64/PBQP-chain.ll b/test/CodeGen/AArch64/PBQP-chain.ll
new file mode 100644
index 0000000..c4ba026
--- /dev/null
+++ b/test/CodeGen/AArch64/PBQP-chain.ll
@@ -0,0 +1,104 @@
+; RUN: llc < %s -mcpu=cortex-a57 -mattr=+neon -fp-contract=fast -regalloc=pbqp -pbqp-coalescing | FileCheck %s --check-prefix CHECK --check-prefix CHECK-EVEN
+; RUN: llc < %s -mcpu=cortex-a57 -mattr=+neon -fp-contract=fast -regalloc=pbqp -pbqp-coalescing | FileCheck %s --check-prefix CHECK --check-prefix CHECK-ODD
+;
+; Test that PBQP is able to fulfill the accumulator chaining constraint.
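+;
+; A C-level sketch of what the fully unrolled IR below computes
+; (hypothetical reconstruction; parameter names taken from the IR):
+;   void fir(double *rx, double *ry, double *c, double *x, double *y) {
+;     double ax = 0.0, ay = 0.0;
+;     for (int i = 0; i < 8; i++) {
+;       ax += c[i] * x[i];   // one fmadd accumulator chain
+;       ay += c[i] * y[i];   // a second, independent chain
+;     }
+;     *rx = ax; *ry = ay;
+;   }
+; The CHECK lines verify that the accumulators of each chain stay within
+; one register bank: all even-numbered or all odd-numbered d registers.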
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64"
+
+; CHECK-LABEL: fir
+; CHECK-EVEN: fmadd {{d[0-9]*[02468]}}, {{d[0-9]*}}, {{d[0-9]*}}, {{d[0-9]*[02468]}}
+; CHECK-EVEN: fmadd {{d[0-9]*[02468]}}, {{d[0-9]*}}, {{d[0-9]*}}, {{d[0-9]*[02468]}}
+; CHECK-EVEN: fmadd {{d[0-9]*[02468]}}, {{d[0-9]*}}, {{d[0-9]*}}, {{d[0-9]*[02468]}}
+; CHECK-EVEN: fmadd {{d[0-9]*[02468]}}, {{d[0-9]*}}, {{d[0-9]*}}, {{d[0-9]*[02468]}}
+; CHECK-EVEN: fmadd {{d[0-9]*[02468]}}, {{d[0-9]*}}, {{d[0-9]*}}, {{d[0-9]*[02468]}}
+; CHECK-EVEN: fmadd {{d[0-9]*[02468]}}, {{d[0-9]*}}, {{d[0-9]*}}, {{d[0-9]*[02468]}}
+; CHECK-EVEN: fmadd {{d[0-9]*[02468]}}, {{d[0-9]*}}, {{d[0-9]*}}, {{d[0-9]*[02468]}}
+; CHECK-ODD: fmadd {{d[0-9]*[13579]}}, {{d[0-9]*}}, {{d[0-9]*}}, {{d[0-9]*[13579]}}
+; CHECK-ODD: fmadd {{d[0-9]*[13579]}}, {{d[0-9]*}}, {{d[0-9]*}}, {{d[0-9]*[13579]}}
+; CHECK-ODD: fmadd {{d[0-9]*[13579]}}, {{d[0-9]*}}, {{d[0-9]*}}, {{d[0-9]*[13579]}}
+; CHECK-ODD: fmadd {{d[0-9]*[13579]}}, {{d[0-9]*}}, {{d[0-9]*}}, {{d[0-9]*[13579]}}
+; CHECK-ODD: fmadd {{d[0-9]*[13579]}}, {{d[0-9]*}}, {{d[0-9]*}}, {{d[0-9]*[13579]}}
+; CHECK-ODD: fmadd {{d[0-9]*[13579]}}, {{d[0-9]*}}, {{d[0-9]*}}, {{d[0-9]*[13579]}}
+; CHECK-ODD: fmadd {{d[0-9]*[13579]}}, {{d[0-9]*}}, {{d[0-9]*}}, {{d[0-9]*[13579]}}
+define void @fir(double* nocapture %rx, double* nocapture %ry, double* nocapture %c, double* nocapture %x, double* nocapture %y) {
+entry:
+ %0 = load double* %c, align 8
+ %1 = load double* %x, align 8
+ %mul = fmul fast double %1, %0
+ %2 = load double* %y, align 8
+ %mul7 = fmul fast double %2, %0
+ %arrayidx.1 = getelementptr inbounds double* %c, i64 1
+ %3 = load double* %arrayidx.1, align 8
+ %arrayidx2.1 = getelementptr inbounds double* %x, i64 1
+ %4 = load double* %arrayidx2.1, align 8
+ %mul.1 = fmul fast double %4, %3
+ %add.1 = fadd fast double %mul.1, %mul
+ %arrayidx6.1 = getelementptr inbounds double* %y, i64 1
+ %5 = load double* %arrayidx6.1, align 8
+ %mul7.1 = fmul fast double %5, %3
+ %add8.1 = fadd fast double %mul7.1, %mul7
+ %arrayidx.2 = getelementptr inbounds double* %c, i64 2
+ %6 = load double* %arrayidx.2, align 8
+ %arrayidx2.2 = getelementptr inbounds double* %x, i64 2
+ %7 = load double* %arrayidx2.2, align 8
+ %mul.2 = fmul fast double %7, %6
+ %add.2 = fadd fast double %mul.2, %add.1
+ %arrayidx6.2 = getelementptr inbounds double* %y, i64 2
+ %8 = load double* %arrayidx6.2, align 8
+ %mul7.2 = fmul fast double %8, %6
+ %add8.2 = fadd fast double %mul7.2, %add8.1
+ %arrayidx.3 = getelementptr inbounds double* %c, i64 3
+ %9 = load double* %arrayidx.3, align 8
+ %arrayidx2.3 = getelementptr inbounds double* %x, i64 3
+ %10 = load double* %arrayidx2.3, align 8
+ %mul.3 = fmul fast double %10, %9
+ %add.3 = fadd fast double %mul.3, %add.2
+ %arrayidx6.3 = getelementptr inbounds double* %y, i64 3
+ %11 = load double* %arrayidx6.3, align 8
+ %mul7.3 = fmul fast double %11, %9
+ %add8.3 = fadd fast double %mul7.3, %add8.2
+ %arrayidx.4 = getelementptr inbounds double* %c, i64 4
+ %12 = load double* %arrayidx.4, align 8
+ %arrayidx2.4 = getelementptr inbounds double* %x, i64 4
+ %13 = load double* %arrayidx2.4, align 8
+ %mul.4 = fmul fast double %13, %12
+ %add.4 = fadd fast double %mul.4, %add.3
+ %arrayidx6.4 = getelementptr inbounds double* %y, i64 4
+ %14 = load double* %arrayidx6.4, align 8
+ %mul7.4 = fmul fast double %14, %12
+ %add8.4 = fadd fast double %mul7.4, %add8.3
+ %arrayidx.5 = getelementptr inbounds double* %c, i64 5
+ %15 = load double* %arrayidx.5, align 8
+ %arrayidx2.5 = getelementptr inbounds double* %x, i64 5
+ %16 = load double* %arrayidx2.5, align 8
+ %mul.5 = fmul fast double %16, %15
+ %add.5 = fadd fast double %mul.5, %add.4
+ %arrayidx6.5 = getelementptr inbounds double* %y, i64 5
+ %17 = load double* %arrayidx6.5, align 8
+ %mul7.5 = fmul fast double %17, %15
+ %add8.5 = fadd fast double %mul7.5, %add8.4
+ %arrayidx.6 = getelementptr inbounds double* %c, i64 6
+ %18 = load double* %arrayidx.6, align 8
+ %arrayidx2.6 = getelementptr inbounds double* %x, i64 6
+ %19 = load double* %arrayidx2.6, align 8
+ %mul.6 = fmul fast double %19, %18
+ %add.6 = fadd fast double %mul.6, %add.5
+ %arrayidx6.6 = getelementptr inbounds double* %y, i64 6
+ %20 = load double* %arrayidx6.6, align 8
+ %mul7.6 = fmul fast double %20, %18
+ %add8.6 = fadd fast double %mul7.6, %add8.5
+ %arrayidx.7 = getelementptr inbounds double* %c, i64 7
+ %21 = load double* %arrayidx.7, align 8
+ %arrayidx2.7 = getelementptr inbounds double* %x, i64 7
+ %22 = load double* %arrayidx2.7, align 8
+ %mul.7 = fmul fast double %22, %21
+ %add.7 = fadd fast double %mul.7, %add.6
+ %arrayidx6.7 = getelementptr inbounds double* %y, i64 7
+ %23 = load double* %arrayidx6.7, align 8
+ %mul7.7 = fmul fast double %23, %21
+ %add8.7 = fadd fast double %mul7.7, %add8.6
+ store double %add.7, double* %rx, align 8
+ store double %add8.7, double* %ry, align 8
+ ret void
+}
+
diff --git a/test/CodeGen/AArch64/PBQP-coalesce-benefit.ll b/test/CodeGen/AArch64/PBQP-coalesce-benefit.ll
new file mode 100644
index 0000000..45ac5e6
--- /dev/null
+++ b/test/CodeGen/AArch64/PBQP-coalesce-benefit.ll
@@ -0,0 +1,14 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mcpu=cortex-a57 -mattr=+neon -fp-contract=fast -regalloc=pbqp -pbqp-coalescing | FileCheck %s
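+;
+; With -pbqp-coalescing, the incoming %acc should stay in w0 all the way
+; to the first add, so no "mov wN, w0" copy may appear (CHECK-NOT below).
+; A C-level sketch of the computation (hypothetical reconstruction):
+;   int test(int acc, const int *c) { return acc + c[0] + c[1]; }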
+
+; CHECK-LABEL: test:
+define i32 @test(i32 %acc, i32* nocapture readonly %c) {
+entry:
+ %0 = load i32* %c, align 4
+; CHECK-NOT: mov w{{[0-9]*}}, w0
+ %add = add nsw i32 %0, %acc
+ %arrayidx1 = getelementptr inbounds i32* %c, i64 1
+ %1 = load i32* %arrayidx1, align 4
+ %add2 = add nsw i32 %add, %1
+ ret i32 %add2
+}
+
diff --git a/test/CodeGen/AArch64/PBQP-csr.ll b/test/CodeGen/AArch64/PBQP-csr.ll
new file mode 100644
index 0000000..64335ae
--- /dev/null
+++ b/test/CodeGen/AArch64/PBQP-csr.ll
@@ -0,0 +1,91 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mcpu=cortex-a57 -mattr=+neon -fp-contract=fast -regalloc=pbqp -pbqp-coalescing | FileCheck %s
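+;
+; "csr" = callee-saved registers. The vector temporaries in the loop
+; should fit in caller-saved FP registers, so the prologue must not
+; contain any "stp dN, dM" callee-save spill (CHECK-NOT below).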
+
+%pl = type { i32, i32, i32, i32, %p*, %l*, double* }
+%p = type { i32, %ca*, [27 x %ca*], %v*, %v*, %v*, i32 }
+%ca = type { %v, float, i32 }
+%v = type { double, double, double }
+%l = type opaque
+%rs = type { i32, i32, i32, i32, %v*, %v*, [21 x double], %v, %v, %v, double, double, double }
+
+;CHECK-LABEL: test_csr
+define void @test_csr(%pl* nocapture readnone %this, %rs* nocapture %r) align 2 {
+;CHECK-NOT: stp {{d[0-9]+}}, {{d[0-9]+}}
+entry:
+ %x.i = getelementptr inbounds %rs* %r, i64 0, i32 7, i32 0
+ %y.i = getelementptr inbounds %rs* %r, i64 0, i32 7, i32 1
+ %z.i = getelementptr inbounds %rs* %r, i64 0, i32 7, i32 2
+ %x.i61 = getelementptr inbounds %rs* %r, i64 0, i32 8, i32 0
+ %y.i62 = getelementptr inbounds %rs* %r, i64 0, i32 8, i32 1
+ %z.i63 = getelementptr inbounds %rs* %r, i64 0, i32 8, i32 2
+ %x.i58 = getelementptr inbounds %rs* %r, i64 0, i32 9, i32 0
+ %y.i59 = getelementptr inbounds %rs* %r, i64 0, i32 9, i32 1
+ %z.i60 = getelementptr inbounds %rs* %r, i64 0, i32 9, i32 2
+ %na = getelementptr inbounds %rs* %r, i64 0, i32 0
+ %0 = bitcast double* %x.i to i8*
+ call void @llvm.memset.p0i8.i64(i8* %0, i8 0, i64 72, i32 8, i1 false)
+ %1 = load i32* %na, align 4
+ %cmp70 = icmp sgt i32 %1, 0
+ br i1 %cmp70, label %for.body.lr.ph, label %for.end
+
+for.body.lr.ph: ; preds = %entry
+ %fn = getelementptr inbounds %rs* %r, i64 0, i32 4
+ %2 = load %v** %fn, align 8
+ %fs = getelementptr inbounds %rs* %r, i64 0, i32 5
+ %3 = load %v** %fs, align 8
+ %4 = sext i32 %1 to i64
+ br label %for.body
+
+for.body: ; preds = %for.body.lr.ph, %for.body
+ %5 = phi double [ 0.000000e+00, %for.body.lr.ph ], [ %add6.i, %for.body ]
+ %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ]
+ %6 = phi <2 x double> [ zeroinitializer, %for.body.lr.ph ], [ %17, %for.body ]
+ %7 = phi <2 x double> [ zeroinitializer, %for.body.lr.ph ], [ %22, %for.body ]
+ %8 = phi <2 x double> [ zeroinitializer, %for.body.lr.ph ], [ %26, %for.body ]
+ %9 = phi <2 x double> [ zeroinitializer, %for.body.lr.ph ], [ %28, %for.body ]
+ %x.i54 = getelementptr inbounds %v* %2, i64 %indvars.iv, i32 0
+ %x1.i = getelementptr inbounds %v* %3, i64 %indvars.iv, i32 0
+ %y.i56 = getelementptr inbounds %v* %2, i64 %indvars.iv, i32 1
+ %10 = bitcast double* %x.i54 to <2 x double>*
+ %11 = load <2 x double>* %10, align 8
+ %y2.i = getelementptr inbounds %v* %3, i64 %indvars.iv, i32 1
+ %12 = bitcast double* %x1.i to <2 x double>*
+ %13 = load <2 x double>* %12, align 8
+ %14 = fadd fast <2 x double> %13, %11
+ %z.i57 = getelementptr inbounds %v* %2, i64 %indvars.iv, i32 2
+ %15 = load double* %z.i57, align 8
+ %z4.i = getelementptr inbounds %v* %3, i64 %indvars.iv, i32 2
+ %16 = load double* %z4.i, align 8
+ %add5.i = fadd fast double %16, %15
+ %17 = fadd fast <2 x double> %6, %11
+ %18 = bitcast double* %x.i to <2 x double>*
+ store <2 x double> %17, <2 x double>* %18, align 8
+ %19 = load double* %x1.i, align 8
+ %20 = insertelement <2 x double> undef, double %15, i32 0
+ %21 = insertelement <2 x double> %20, double %19, i32 1
+ %22 = fadd fast <2 x double> %7, %21
+ %23 = bitcast double* %z.i to <2 x double>*
+ store <2 x double> %22, <2 x double>* %23, align 8
+ %24 = bitcast double* %y2.i to <2 x double>*
+ %25 = load <2 x double>* %24, align 8
+ %26 = fadd fast <2 x double> %8, %25
+ %27 = bitcast double* %y.i62 to <2 x double>*
+ store <2 x double> %26, <2 x double>* %27, align 8
+ %28 = fadd fast <2 x double> %14, %9
+ %29 = bitcast double* %x.i58 to <2 x double>*
+ store <2 x double> %28, <2 x double>* %29, align 8
+ %add6.i = fadd fast double %add5.i, %5
+ store double %add6.i, double* %z.i60, align 8
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %cmp = icmp slt i64 %indvars.iv.next, %4
+ br i1 %cmp, label %for.body, label %for.end.loopexit
+
+for.end.loopexit: ; preds = %for.body
+ br label %for.end
+
+for.end: ; preds = %for.end.loopexit, %entry
+ ret void
+}
+
+; Function Attrs: nounwind
+declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1)
+
diff --git a/test/CodeGen/AArch64/PBQP.ll b/test/CodeGen/AArch64/PBQP.ll
new file mode 100644
index 0000000..675a2ca
--- /dev/null
+++ b/test/CodeGen/AArch64/PBQP.ll
@@ -0,0 +1,14 @@
+; RUN: llc -mtriple=aarch64-linux-gnu -mcpu=cortex-a57 -regalloc=pbqp -pbqp-coalescing -o - %s | FileCheck %s
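+
+; Smoke test: PBQP register allocation must succeed across the two calls,
+; and both calls must remain in the output (checked by the two bl lines).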
+
+define i32 @foo(i32 %a) {
+; CHECK-LABEL: foo:
+; CHECK: bl bar
+; CHECK: bl baz
+ %call = call i32 @bar(i32 %a)
+ %call1 = call i32 @baz(i32 %call)
+ ret i32 %call1
+}
+
+declare i32 @bar(i32)
+declare i32 @baz(i32)
+
diff --git a/test/CodeGen/AArch64/Redundantstore.ll b/test/CodeGen/AArch64/Redundantstore.ll
new file mode 100644
index 0000000..72f7f46
--- /dev/null
+++ b/test/CodeGen/AArch64/Redundantstore.ll
@@ -0,0 +1,25 @@
+; RUN: llc -O3 -march=aarch64 < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+@end_of_array = common global i8* null, align 8
+
+; CHECK-LABEL: @test
+; CHECK: stur
+; CHECK-NOT: stur
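+;
+; %add.ptr.sum and %add.ptr.sum9 both evaluate to %and - 4, so the two
+; stores below write the same value to the same address and only one
+; stur may remain. A C-level sketch (hypothetical reconstruction):
+;   char *p = end_of_array;
+;   long n = (long)size & ~7L;
+;   *(int *)(p + n - 4) = (int)n;   // kept
+;   *(int *)(p + n - 4) = (int)n;   // redundant, must be eliminated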
+define i8* @test(i32 %size) {
+entry:
+ %0 = load i8** @end_of_array, align 8
+ %conv = sext i32 %size to i64
+ %and = and i64 %conv, -8
+ %conv2 = trunc i64 %and to i32
+ %add.ptr.sum = add nsw i64 %and, -4
+ %add.ptr3 = getelementptr inbounds i8* %0, i64 %add.ptr.sum
+ %size4 = bitcast i8* %add.ptr3 to i32*
+ store i32 %conv2, i32* %size4, align 4
+ %add.ptr.sum9 = add nsw i64 %and, -4
+ %add.ptr5 = getelementptr inbounds i8* %0, i64 %add.ptr.sum9
+ %size6 = bitcast i8* %add.ptr5 to i32*
+ store i32 %conv2, i32* %size6, align 4
+ ret i8* %0
+}
+
diff --git a/test/CodeGen/AArch64/aarch64-2014-08-11-MachineCombinerCrash.ll b/test/CodeGen/AArch64/aarch64-2014-08-11-MachineCombinerCrash.ll
new file mode 100644
index 0000000..4da33a0
--- /dev/null
+++ b/test/CodeGen/AArch64/aarch64-2014-08-11-MachineCombinerCrash.ll
@@ -0,0 +1,106 @@
+; RUN: llc < %s -O2 -mtriple=aarch64-none-linux-gnu
+
+; Bug 20598
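+;
+; Reduced reproducer: there is no FileCheck invocation, so the test
+; passes as long as llc completes without crashing while combining the
+; multiply-add chains below.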
+
+
+define void @test() #0 {
+entry:
+ br label %for.body, !dbg !39
+
+for.body: ; preds = %for.body, %entry
+ %arrayidx5 = getelementptr inbounds i32* null, i64 1, !dbg !43
+ %0 = load i32* null, align 4, !dbg !45, !tbaa !46
+ %s1 = sub nsw i32 0, %0, !dbg !50
+ %n1 = sext i32 %s1 to i64, !dbg !50
+ %arrayidx21 = getelementptr inbounds i32* null, i64 3, !dbg !51
+ %add53 = add nsw i64 %n1, 0, !dbg !52
+ %add55 = add nsw i64 %n1, 0, !dbg !53
+ %mul63 = mul nsw i64 %add53, -20995, !dbg !54
+ tail call void @llvm.dbg.value(metadata !{i64 %mul63}, i64 0, metadata !30, metadata !{metadata !"0x102"}), !dbg !55
+ %mul65 = mul nsw i64 %add55, -3196, !dbg !56
+ %add67 = add nsw i64 0, %mul65, !dbg !57
+ %add80 = add i64 0, 1024, !dbg !58
+ %add81 = add i64 %add80, %mul63, !dbg !58
+ %add82 = add i64 %add81, 0, !dbg !58
+ %shr83351 = lshr i64 %add82, 11, !dbg !58
+ %conv84 = trunc i64 %shr83351 to i32, !dbg !58
+ store i32 %conv84, i32* %arrayidx21, align 4, !dbg !58, !tbaa !46
+ %add86 = add i64 0, 1024, !dbg !59
+ %add87 = add i64 %add86, 0, !dbg !59
+ %add88 = add i64 %add87, %add67, !dbg !59
+ %shr89352 = lshr i64 %add88, 11, !dbg !59
+ %n2 = trunc i64 %shr89352 to i32, !dbg !59
+ store i32 %n2, i32* %arrayidx5, align 4, !dbg !59, !tbaa !46
+ br label %for.body, !dbg !39
+}
+
+; Function Attrs: nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #1
+
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind readnone }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!36, !37}
+!llvm.ident = !{!38}
+
+!0 = metadata !{metadata !"0x11\0012\00clang version 3.6.0 \001\00\000\00\001", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [] [] []
+!1 = metadata !{metadata !"test.c", metadata !""}
+!2 = metadata !{}
+!3 = metadata !{metadata !4}
+!4 = metadata !{metadata !"0x2e\00\00\00\00140\000\001\000\006\00256\001\00141", metadata !1, metadata !5, metadata !6, null, void ()* @test, null, null, metadata !12} ; [ DW_TAG_subprogram ] [] [] [def] [scope 141] []
+!5 = metadata !{metadata !"0x29", metadata !1} ; [ DW_TAG_file_type ] [] []
+!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [] [] [from ]
+!7 = metadata !{null, metadata !8}
+!8 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !9} ; [ DW_TAG_pointer_type ] [] [] []
+!9 = metadata !{metadata !"0x16\00\0030\000\000\000\000", metadata !10, null, metadata !11} ; [ DW_TAG_typedef ] [] [] [] [from int]
+!10 = metadata !{metadata !"", metadata !""}
+!11 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [] [int] []
+!12 = metadata !{metadata !13, metadata !14, metadata !18, metadata !19, metadata !20, metadata !21, metadata !22, metadata !23, metadata !24, metadata !25, metadata !26, metadata !27, metadata !28, metadata !29, metadata !30, metadata !31, metadata !32, metadata !33, metadata !34, metadata !35}
+!13 = metadata !{metadata !"0x101\00\0016777356\000", metadata !4, metadata !5, metadata !8} ; [ DW_TAG_arg_variable ] [] [data] []
+!14 = metadata !{metadata !"0x100\00\00142\000", metadata !4, metadata !5, metadata !15} ; [ DW_TAG_auto_variable ] [] [] []
+!15 = metadata !{metadata !"0x16\00\00183\000\000\000\000", metadata !16, null, metadata !17} ; [ DW_TAG_typedef ] [] [INT32] [] [from long int]
+!16 = metadata !{metadata !"", metadata !""}
+!17 = metadata !{metadata !"0x24\00\000\0064\0064\000\000\005", null, null} ; [ DW_TAG_base_type ] [] [long int] []
+!18 = metadata !{metadata !"0x100\00\00142\000", metadata !4, metadata !5, metadata !15} ; [ DW_TAG_auto_variable ] [] [] []
+!19 = metadata !{metadata !"0x100\00\00142\000", metadata !4, metadata !5, metadata !15} ; [ DW_TAG_auto_variable ] [] [] []
+!20 = metadata !{metadata !"0x100\00\00142\000", metadata !4, metadata !5, metadata !15} ; [ DW_TAG_auto_variable ] [] [] []
+!21 = metadata !{metadata !"0x100\00\00142\000", metadata !4, metadata !5, metadata !15} ; [ DW_TAG_auto_variable ] [] [] []
+!22 = metadata !{metadata !"0x100\00\00142\000", metadata !4, metadata !5, metadata !15} ; [ DW_TAG_auto_variable ] [] [] []
+!23 = metadata !{metadata !"0x100\00\00142\000", metadata !4, metadata !5, metadata !15} ; [ DW_TAG_auto_variable ] [] [] []
+!24 = metadata !{metadata !"0x100\00\00142\000", metadata !4, metadata !5, metadata !15} ; [ DW_TAG_auto_variable ] [ ] [] []
+!25 = metadata !{metadata !"0x100\00\00143\000", metadata !4, metadata !5, metadata !15} ; [ DW_TAG_auto_variable ] [ ] [] []
+!26 = metadata !{metadata !"0x100\00\00143\000", metadata !4, metadata !5, metadata !15} ; [ DW_TAG_auto_variable ] [ ] [] []
+!27 = metadata !{metadata !"0x100\00\00143\000", metadata !4, metadata !5, metadata !15} ; [ DW_TAG_auto_variable ] [ ] [] []
+!28 = metadata !{metadata !"0x100\00\00143\000", metadata !4, metadata !5, metadata !15} ; [ DW_TAG_auto_variable ] [ ] [] []
+!29 = metadata !{metadata !"0x100\00\00144\000", metadata !4, metadata !5, metadata !15} ; [ DW_TAG_auto_variable ] [ ] [] []
+!30 = metadata !{metadata !"0x100\00\00144\000", metadata !4, metadata !5, metadata !15} ; [ DW_TAG_auto_variable ] [ ] [] []
+!31 = metadata !{metadata !"0x100\00\00144\000", metadata !4, metadata !5, metadata !15} ; [ DW_TAG_auto_variable ] [ ] [] []
+!32 = metadata !{metadata !"0x100\00\00144\000", metadata !4, metadata !5, metadata !15} ; [ DW_TAG_auto_variable ] [ ] [] []
+!33 = metadata !{metadata !"0x100\00\00144\000", metadata !4, metadata !5, metadata !15} ; [ DW_TAG_auto_variable ] [ ] [] []
+!34 = metadata !{metadata !"0x100\00\00145\000", metadata !4, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [ ] [] []
+!35 = metadata !{metadata !"0x100\00\00146\000", metadata !4, metadata !5, metadata !11} ; [ DW_TAG_auto_variable ] [ ] [] []
+!36 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
+!37 = metadata !{i32 2, metadata !"Debug Info Version", i32 2}
+!38 = metadata !{metadata !"clang version 3.6.0 "}
+!39 = metadata !{i32 154, i32 8, metadata !40, null}
+!40 = metadata !{metadata !"0xb\00154\008\002", metadata !1, metadata !41} ; [ DW_TAG_lexical_block ] [ ] []
+!41 = metadata !{metadata !"0xb\00154\008\001", metadata !1, metadata !42} ; [ DW_TAG_lexical_block ] [ ] []
+!42 = metadata !{metadata !"0xb\00154\003\000", metadata !1, metadata !4} ; [ DW_TAG_lexical_block ] [ ] []
+!43 = metadata !{i32 157, i32 5, metadata !44, null}
+!44 = metadata !{metadata !"0xb\00154\0042\000", metadata !1, metadata !42} ; [ DW_TAG_lexical_block ] [ ] []
+!45 = metadata !{i32 159, i32 5, metadata !44, null}
+!46 = metadata !{metadata !47, metadata !47, i64 0}
+!47 = metadata !{metadata !"int", metadata !48, i64 0}
+!48 = metadata !{metadata !"omnipotent char", metadata !49, i64 0}
+!49 = metadata !{metadata !"Simple C/C++ TBAA"}
+!50 = metadata !{i32 160, i32 5, metadata !44, null}
+!51 = metadata !{i32 161, i32 5, metadata !44, null}
+!52 = metadata !{i32 188, i32 5, metadata !44, null}
+!53 = metadata !{i32 190, i32 5, metadata !44, null}
+!54 = metadata !{i32 198, i32 5, metadata !44, null}
+!55 = metadata !{i32 144, i32 13, metadata !4, null}
+!56 = metadata !{i32 200, i32 5, metadata !44, null}
+!57 = metadata !{i32 203, i32 5, metadata !44, null}
+!58 = metadata !{i32 207, i32 5, metadata !44, null}
+!59 = metadata !{i32 208, i32 5, metadata !44, null}
diff --git a/test/CodeGen/AArch64/aarch64-a57-fp-load-balancing.ll b/test/CodeGen/AArch64/aarch64-a57-fp-load-balancing.ll
index fb229fc..7108bc0 100644
--- a/test/CodeGen/AArch64/aarch64-a57-fp-load-balancing.ll
+++ b/test/CodeGen/AArch64/aarch64-a57-fp-load-balancing.ll
@@ -1,5 +1,7 @@
-; RUN: llc < %s -mcpu=cortex-a57 -aarch64-a57-fp-load-balancing-override=1 -aarch64-a57-fp-load-balancing-force-all | FileCheck %s --check-prefix CHECK --check-prefix CHECK-EVEN
-; RUN: llc < %s -mcpu=cortex-a57 -aarch64-a57-fp-load-balancing-override=2 -aarch64-a57-fp-load-balancing-force-all | FileCheck %s --check-prefix CHECK --check-prefix CHECK-ODD
+; RUN: llc < %s -mcpu=cortex-a57 -aarch64-a57-fp-load-balancing-override=1 -aarch64-a57-fp-load-balancing-force-all | FileCheck %s --check-prefix CHECK --check-prefix CHECK-A57 --check-prefix CHECK-EVEN
+; RUN: llc < %s -mcpu=cortex-a57 -aarch64-a57-fp-load-balancing-override=2 -aarch64-a57-fp-load-balancing-force-all | FileCheck %s --check-prefix CHECK --check-prefix CHECK-A57 --check-prefix CHECK-ODD
+; RUN: llc < %s -mcpu=cortex-a53 -aarch64-a57-fp-load-balancing-override=1 -aarch64-a57-fp-load-balancing-force-all | FileCheck %s --check-prefix CHECK --check-prefix CHECK-A53 --check-prefix CHECK-EVEN
+; RUN: llc < %s -mcpu=cortex-a53 -aarch64-a57-fp-load-balancing-override=2 -aarch64-a57-fp-load-balancing-force-all | FileCheck %s --check-prefix CHECK --check-prefix CHECK-A53 --check-prefix CHECK-ODD
; Test the AArch64A57FPLoadBalancing pass. This pass relies heavily on register allocation, so
; our test strategy is to:
@@ -73,7 +75,9 @@ entry:
; CHECK: fmsub [[x]]
; CHECK: fmadd [[y]]
; CHECK: fmadd [[x]]
-; CHECK: stp [[x]], [[y]]
+; CHECK-A57: stp [[x]], [[y]]
+; CHECK-A53-DAG: str [[x]]
+; CHECK-A53-DAG: str [[y]]
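+; (The -DAG variants allow the two str instructions to appear in either
+; order when they are not merged into an stp.)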
define void @f2(double* nocapture readonly %p, double* nocapture %q) #0 {
entry:
@@ -166,7 +170,9 @@ declare void @g(...) #1
; CHECK: fmsub [[x]]
; CHECK: fmadd [[y]]
; CHECK: fmadd [[x]]
-; CHECK: stp [[x]], [[y]]
+; CHECK-A57: stp [[x]], [[y]]
+; CHECK-A53-DAG: str [[x]]
+; CHECK-A53-DAG: str [[y]]
define void @f4(float* nocapture readonly %p, float* nocapture %q) #0 {
entry:
diff --git a/test/CodeGen/AArch64/aarch64-be-bv.ll b/test/CodeGen/AArch64/aarch64-be-bv.ll
new file mode 100644
index 0000000..01642a4
--- /dev/null
+++ b/test/CodeGen/AArch64/aarch64-be-bv.ll
@@ -0,0 +1,831 @@
+; RUN: llc -mtriple=aarch64_be--linux-gnu < %s | FileCheck %s
+
+@vec_v8i16 = global <8 x i16> <i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8>
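+
+; The _tN suffixes below follow the AdvSIMD modified-immediate encoding
+; types (a summary, assuming the standard type numbering):
+;   t1-t4:   32-bit element, 8-bit immediate at byte 0/1/2/3 (movi, lsl)
+;   t5-t6:   16-bit element, 8-bit immediate at byte 0/1     (movi, lsl)
+;   t7-t8:   32-bit "shifting ones" 0x0000ABFF / 0x00ABFFFF  (movi, msl)
+;   t9:      8-bit immediate replicated into every byte      (movi .16b)
+;   t10:     64-bit mask with each byte 0x00 or 0xFF         (movi .2d)
+;   t11-t12: 8-bit encoded single/double FP constant         (fmov)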
+
+; CHECK-LABEL: movi_modimm_t1:
+define i16 @movi_modimm_t1() nounwind {
+ ; CHECK: ld1 { v[[REG1:[0-9]+]].8h }, [x{{[0-9]+}}]
+ ; CHECK-NEXT: movi v[[REG2:[0-9]+]].4s, #0x1
+ ; CHECK-NEXT: add v[[REG1]].8h, v[[REG1]].8h, v[[REG2]].8h
+ ; CHECK-NEXT: umov w{{[0-9]+}}, v[[REG1]].h[0]
+ %in = load <8 x i16>* @vec_v8i16
+ %rv = add <8 x i16> %in, <i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0>
+ %el = extractelement <8 x i16> %rv, i32 0
+ ret i16 %el
+}
+
+; CHECK-LABEL: movi_modimm_t2:
+define i16 @movi_modimm_t2() nounwind {
+ ; CHECK: ld1 { v[[REG1:[0-9]+]].8h }, [x{{[0-9]+}}]
+ ; CHECK-NEXT: movi v[[REG2:[0-9]+]].4s, #0x1, lsl #8
+ ; CHECK-NEXT: add v[[REG1]].8h, v[[REG1]].8h, v[[REG2]].8h
+ ; CHECK-NEXT: umov w{{[0-9]+}}, v[[REG1]].h[0]
+ %in = load <8 x i16>* @vec_v8i16
+ %rv = add <8 x i16> %in, <i16 256, i16 0, i16 256, i16 0, i16 256, i16 0, i16 256, i16 0>
+ %el = extractelement <8 x i16> %rv, i32 0
+ ret i16 %el
+}
+
+; CHECK-LABEL: movi_modimm_t3:
+define i16 @movi_modimm_t3() nounwind {
+ ; CHECK: ld1 { v[[REG1:[0-9]+]].8h }, [x{{[0-9]+}}]
+ ; CHECK-NEXT: movi v[[REG2:[0-9]+]].4s, #0x1, lsl #16
+ ; CHECK-NEXT: add v[[REG1]].8h, v[[REG1]].8h, v[[REG2]].8h
+ ; CHECK-NEXT: umov w{{[0-9]+}}, v[[REG1]].h[0]
+ %in = load <8 x i16>* @vec_v8i16
+ %rv = add <8 x i16> %in, <i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1>
+ %el = extractelement <8 x i16> %rv, i32 0
+ ret i16 %el
+}
+
+; CHECK-LABEL: movi_modimm_t4:
+define i16 @movi_modimm_t4() nounwind {
+ ; CHECK: ld1 { v[[REG1:[0-9]+]].8h }, [x{{[0-9]+}}]
+ ; CHECK-NEXT: movi v[[REG2:[0-9]+]].4s, #0x1, lsl #24
+ ; CHECK-NEXT: add v[[REG1]].8h, v[[REG1]].8h, v[[REG2]].8h
+ ; CHECK-NEXT: umov w{{[0-9]+}}, v[[REG1]].h[0]
+ %in = load <8 x i16>* @vec_v8i16
+ %rv = add <8 x i16> %in, <i16 0, i16 256, i16 0, i16 256, i16 0, i16 256, i16 0, i16 256>
+ %el = extractelement <8 x i16> %rv, i32 0
+ ret i16 %el
+}
+
+; CHECK-LABEL: movi_modimm_t5:
+define i16 @movi_modimm_t5() nounwind {
+ ; CHECK: ld1 { v[[REG1:[0-9]+]].8h }, [x{{[0-9]+}}]
+ ; CHECK-NEXT: movi v[[REG2:[0-9]+]].8h, #0x1
+ ; CHECK-NEXT: add v[[REG1]].8h, v[[REG1]].8h, v[[REG2]].8h
+ ; CHECK-NEXT: umov w{{[0-9]+}}, v[[REG1]].h[0]
+ %in = load <8 x i16>* @vec_v8i16
+ %rv = add <8 x i16> %in, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ %el = extractelement <8 x i16> %rv, i32 0
+ ret i16 %el
+}
+
+; CHECK-LABEL: movi_modimm_t6:
+define i16 @movi_modimm_t6() nounwind {
+ ; CHECK: ld1 { v[[REG1:[0-9]+]].8h }, [x{{[0-9]+}}]
+ ; CHECK-NEXT: movi v[[REG2:[0-9]+]].8h, #0x1, lsl #8
+ ; CHECK-NEXT: add v[[REG1]].8h, v[[REG1]].8h, v[[REG2]].8h
+ ; CHECK-NEXT: umov w{{[0-9]+}}, v[[REG1]].h[0]
+ %in = load <8 x i16>* @vec_v8i16
+ %rv = add <8 x i16> %in, <i16 256, i16 256, i16 256, i16 256, i16 256, i16 256, i16 256, i16 256>
+ %el = extractelement <8 x i16> %rv, i32 0
+ ret i16 %el
+}
+
+; CHECK-LABEL: movi_modimm_t7:
+define i16 @movi_modimm_t7() nounwind {
+ ; CHECK: ld1 { v[[REG1:[0-9]+]].8h }, [x{{[0-9]+}}]
+ ; CHECK-NEXT: movi v[[REG2:[0-9]+]].4s, #0x1, msl #8
+ ; CHECK-NEXT: add v[[REG1]].8h, v[[REG1]].8h, v[[REG2]].8h
+ ; CHECK-NEXT: umov w{{[0-9]+}}, v[[REG1]].h[0]
+ %in = load <8 x i16>* @vec_v8i16
+ %rv = add <8 x i16> %in, <i16 511, i16 0, i16 511, i16 0, i16 511, i16 0, i16 511, i16 0>
+ %el = extractelement <8 x i16> %rv, i32 0
+ ret i16 %el
+}
+
+; CHECK-LABEL: movi_modimm_t8:
+define i16 @movi_modimm_t8() nounwind {
+ ; CHECK: ld1 { v[[REG1:[0-9]+]].8h }, [x{{[0-9]+}}]
+ ; CHECK-NEXT: movi v[[REG2:[0-9]+]].4s, #0x1, msl #16
+ ; CHECK-NEXT: add v[[REG1]].8h, v[[REG1]].8h, v[[REG2]].8h
+ ; CHECK-NEXT: umov w{{[0-9]+}}, v[[REG1]].h[0]
+ %in = load <8 x i16>* @vec_v8i16
+ %rv = add <8 x i16> %in, <i16 65535, i16 1, i16 65535, i16 1, i16 65535, i16 1, i16 65535, i16 1>
+ %el = extractelement <8 x i16> %rv, i32 0
+ ret i16 %el
+}
+
+; CHECK-LABEL: movi_modimm_t9:
+define i16 @movi_modimm_t9() nounwind {
+ ; CHECK: ld1 { v[[REG1:[0-9]+]].8h }, [x{{[0-9]+}}]
+ ; CHECK-NEXT: movi v[[REG2:[0-9]+]].16b, #0x1
+ ; CHECK-NEXT: add v[[REG1]].8h, v[[REG1]].8h, v[[REG2]].8h
+ ; CHECK-NEXT: umov w{{[0-9]+}}, v[[REG1]].h[0]
+ %in = load <8 x i16>* @vec_v8i16
+ %rv = add <8 x i16> %in, <i16 257, i16 257, i16 257, i16 257, i16 257, i16 257, i16 257, i16 257>
+ %el = extractelement <8 x i16> %rv, i32 0
+ ret i16 %el
+}
+
+; CHECK-LABEL: movi_modimm_t10:
+define i16 @movi_modimm_t10() nounwind {
+ ; CHECK: ld1 { v[[REG1:[0-9]+]].8h }, [x{{[0-9]+}}]
+ ; CHECK-NEXT: movi v[[REG2:[0-9]+]].2d, #0x00ffff0000ffff
+ ; CHECK-NEXT: add v[[REG1]].8h, v[[REG1]].8h, v[[REG2]].8h
+ ; CHECK-NEXT: umov w{{[0-9]+}}, v[[REG1]].h[0]
+ %in = load <8 x i16>* @vec_v8i16
+ %rv = add <8 x i16> %in, <i16 -1, i16 0, i16 -1, i16 0, i16 -1, i16 0, i16 -1, i16 0>
+ %el = extractelement <8 x i16> %rv, i32 0
+ ret i16 %el
+}
+
+; CHECK-LABEL: fmov_modimm_t11:
+define i16 @fmov_modimm_t11() nounwind {
+ ; CHECK: ld1 { v[[REG1:[0-9]+]].8h }, [x{{[0-9]+}}]
+ ; CHECK-NEXT: fmov v[[REG2:[0-9]+]].4s, #3.00000000
+ ; CHECK-NEXT: add v[[REG1]].8h, v[[REG1]].8h, v[[REG2]].8h
+ ; CHECK-NEXT: umov w{{[0-9]+}}, v[[REG1]].h[0]
+ %in = load <8 x i16>* @vec_v8i16
+ %rv = add <8 x i16> %in, <i16 0, i16 16448, i16 0, i16 16448, i16 0, i16 16448, i16 0, i16 16448>
+ %el = extractelement <8 x i16> %rv, i32 0
+ ret i16 %el
+}
+
+; CHECK-LABEL: fmov_modimm_t12:
+define i16 @fmov_modimm_t12() nounwind {
+ ; CHECK: ld1 { v[[REG1:[0-9]+]].8h }, [x{{[0-9]+}}]
+ ; CHECK-NEXT: fmov v[[REG2:[0-9]+]].2d, #0.17968750
+ ; CHECK-NEXT: add v[[REG1]].8h, v[[REG1]].8h, v[[REG2]].8h
+ ; CHECK-NEXT: umov w{{[0-9]+}}, v[[REG1]].h[0]
+ %in = load <8 x i16>* @vec_v8i16
+ %rv = add <8 x i16> %in, <i16 0, i16 0, i16 0, i16 16327, i16 0, i16 0, i16 0, i16 16327>
+ %el = extractelement <8 x i16> %rv, i32 0
+ ret i16 %el
+}
+
+; CHECK-LABEL: mvni_modimm_t1:
+define i16 @mvni_modimm_t1() nounwind {
+ ; CHECK: ld1 { v[[REG1:[0-9]+]].8h }, [x{{[0-9]+}}]
+ ; CHECK-NEXT: mvni v[[REG2:[0-9]+]].4s, #0x1
+ ; CHECK-NEXT: add v[[REG1]].8h, v[[REG1]].8h, v[[REG2]].8h
+ ; CHECK-NEXT: umov w{{[0-9]+}}, v[[REG1]].h[0]
+ %in = load <8 x i16>* @vec_v8i16
+ %rv = add <8 x i16> %in, <i16 65534, i16 65535, i16 65534, i16 65535, i16 65534, i16 65535, i16 65534, i16 65535>
+ %el = extractelement <8 x i16> %rv, i32 0
+ ret i16 %el
+}
+
+; CHECK-LABEL: mvni_modimm_t2:
+define i16 @mvni_modimm_t2() nounwind {
+ ; CHECK: ld1 { v[[REG1:[0-9]+]].8h }, [x{{[0-9]+}}]
+ ; CHECK-NEXT: mvni v[[REG2:[0-9]+]].4s, #0x1, lsl #8
+ ; CHECK-NEXT: add v[[REG1]].8h, v[[REG1]].8h, v[[REG2]].8h
+ ; CHECK-NEXT: umov w{{[0-9]+}}, v[[REG1]].h[0]
+ %in = load <8 x i16>* @vec_v8i16
+ %rv = add <8 x i16> %in, <i16 65279, i16 65535, i16 65279, i16 65535, i16 65279, i16 65535, i16 65279, i16 65535>
+ %el = extractelement <8 x i16> %rv, i32 0
+ ret i16 %el
+}
+
+; CHECK-LABEL: mvni_modimm_t3:
+define i16 @mvni_modimm_t3() nounwind {
+ ; CHECK: ld1 { v[[REG1:[0-9]+]].8h }, [x{{[0-9]+}}]
+ ; CHECK-NEXT: mvni v[[REG2:[0-9]+]].4s, #0x1, lsl #16
+ ; CHECK-NEXT: add v[[REG1]].8h, v[[REG1]].8h, v[[REG2]].8h
+ ; CHECK-NEXT: umov w{{[0-9]+}}, v[[REG1]].h[0]
+ %in = load <8 x i16>* @vec_v8i16
+ %rv = add <8 x i16> %in, <i16 65535, i16 65534, i16 65535, i16 65534, i16 65535, i16 65534, i16 65535, i16 65534>
+ %el = extractelement <8 x i16> %rv, i32 0
+ ret i16 %el
+}
+
+; CHECK-LABEL: mvni_modimm_t4:
+define i16 @mvni_modimm_t4() nounwind {
+ ; CHECK: ld1 { v[[REG1:[0-9]+]].8h }, [x{{[0-9]+}}]
+ ; CHECK-NEXT: mvni v[[REG2:[0-9]+]].4s, #0x1, lsl #24
+ ; CHECK-NEXT: add v[[REG1]].8h, v[[REG1]].8h, v[[REG2]].8h
+ ; CHECK-NEXT: umov w{{[0-9]+}}, v[[REG1]].h[0]
+ %in = load <8 x i16>* @vec_v8i16
+ %rv = add <8 x i16> %in, <i16 65535, i16 65279, i16 65535, i16 65279, i16 65535, i16 65279, i16 65535, i16 65279>
+ %el = extractelement <8 x i16> %rv, i32 0
+ ret i16 %el
+}
+
+; CHECK-LABEL: mvni_modimm_t5:
+define i16 @mvni_modimm_t5() nounwind {
+ ; CHECK: ld1 { v[[REG1:[0-9]+]].8h }, [x{{[0-9]+}}]
+ ; CHECK-NEXT: mvni v[[REG2:[0-9]+]].8h, #0x1
+ ; CHECK-NEXT: add v[[REG1]].8h, v[[REG1]].8h, v[[REG2]].8h
+ ; CHECK-NEXT: umov w{{[0-9]+}}, v[[REG1]].h[0]
+ %in = load <8 x i16>* @vec_v8i16
+ %rv = add <8 x i16> %in, <i16 65534, i16 65534, i16 65534, i16 65534, i16 65534, i16 65534, i16 65534, i16 65534>
+ %el = extractelement <8 x i16> %rv, i32 0
+ ret i16 %el
+}
+
+; CHECK-LABEL: mvni_modimm_t6:
+define i16 @mvni_modimm_t6() nounwind {
+ ; CHECK: ld1 { v[[REG1:[0-9]+]].8h }, [x{{[0-9]+}}]
+ ; CHECK-NEXT: mvni v[[REG2:[0-9]+]].8h, #0x1, lsl #8
+ ; CHECK-NEXT: add v[[REG1]].8h, v[[REG1]].8h, v[[REG2]].8h
+ ; CHECK-NEXT: umov w{{[0-9]+}}, v[[REG1]].h[0]
+ %in = load <8 x i16>* @vec_v8i16
+ %rv = add <8 x i16> %in, <i16 65279, i16 65279, i16 65279, i16 65279, i16 65279, i16 65279, i16 65279, i16 65279>
+ %el = extractelement <8 x i16> %rv, i32 0
+ ret i16 %el
+}
+
+; CHECK-LABEL: mvni_modimm_t7:
+define i16 @mvni_modimm_t7() nounwind {
+ ; CHECK: ld1 { v[[REG1:[0-9]+]].8h }, [x{{[0-9]+}}]
+ ; CHECK-NEXT: mvni v[[REG2:[0-9]+]].4s, #0x1, msl #8
+ ; CHECK-NEXT: add v[[REG1]].8h, v[[REG1]].8h, v[[REG2]].8h
+ ; CHECK-NEXT: umov w{{[0-9]+}}, v[[REG1]].h[0]
+ %in = load <8 x i16>* @vec_v8i16
+ %rv = add <8 x i16> %in, <i16 65024, i16 65535, i16 65024, i16 65535, i16 65024, i16 65535, i16 65024, i16 65535>
+ %el = extractelement <8 x i16> %rv, i32 0
+ ret i16 %el
+}
+
+; CHECK-LABEL: mvni_modimm_t8:
+define i16 @mvni_modimm_t8() nounwind {
+ ; CHECK: ld1 { v[[REG1:[0-9]+]].8h }, [x{{[0-9]+}}]
+ ; CHECK-NEXT: mvni v[[REG2:[0-9]+]].4s, #0x1, msl #16
+ ; CHECK-NEXT: add v[[REG1]].8h, v[[REG1]].8h, v[[REG2]].8h
+ ; CHECK-NEXT: umov w{{[0-9]+}}, v[[REG1]].h[0]
+ %in = load <8 x i16>* @vec_v8i16
+ %rv = add <8 x i16> %in, <i16 0, i16 65534, i16 0, i16 65534, i16 0, i16 65534, i16 0, i16 65534>
+ %el = extractelement <8 x i16> %rv, i32 0
+ ret i16 %el
+}
+
+; CHECK-LABEL: bic_modimm_t1:
+define i16 @bic_modimm_t1() nounwind {
+ ; CHECK: ld1 { v[[REG1:[0-9]+]].8h }, [x{{[0-9]+}}]
+ ; CHECK-NEXT: bic v[[REG2:[0-9]+]].4s, #0x1
+ ; CHECK-NEXT: umov w{{[0-9]+}}, v[[REG1]].h[0]
+ %in = load <8 x i16>* @vec_v8i16
+ %rv = and <8 x i16> %in, <i16 65534, i16 65535, i16 65534, i16 65535, i16 65534, i16 65535, i16 65534, i16 65535>
+ %el = extractelement <8 x i16> %rv, i32 0
+ ret i16 %el
+}
+
+; CHECK-LABEL: bic_modimm_t2:
+define i16 @bic_modimm_t2() nounwind {
+ ; CHECK: ld1 { v[[REG1:[0-9]+]].8h }, [x{{[0-9]+}}]
+ ; CHECK-NEXT: bic v[[REG2:[0-9]+]].4s, #0x1, lsl #8
+ ; CHECK-NEXT: umov w{{[0-9]+}}, v[[REG1]].h[0]
+ %in = load <8 x i16>* @vec_v8i16
+ %rv = and <8 x i16> %in, <i16 65279, i16 65535, i16 65279, i16 65535, i16 65279, i16 65535, i16 65279, i16 65535>
+ %el = extractelement <8 x i16> %rv, i32 0
+ ret i16 %el
+}
+
+; CHECK-LABEL: bic_modimm_t3:
+define i16 @bic_modimm_t3() nounwind {
+ ; CHECK: ld1 { v[[REG1:[0-9]+]].8h }, [x{{[0-9]+}}]
+ ; CHECK-NEXT: bic v[[REG2:[0-9]+]].4s, #0x1, lsl #16
+ ; CHECK-NEXT: umov w{{[0-9]+}}, v[[REG1]].h[0]
+ %in = load <8 x i16>* @vec_v8i16
+ %rv = and <8 x i16> %in, <i16 65535, i16 65534, i16 65535, i16 65534, i16 65535, i16 65534, i16 65535, i16 65534>
+ %el = extractelement <8 x i16> %rv, i32 0
+ ret i16 %el
+}
+
+; CHECK-LABEL: bic_modimm_t4:
+define i16 @bic_modimm_t4() nounwind {
+ ; CHECK: ld1 { v[[REG1:[0-9]+]].8h }, [x{{[0-9]+}}]
+ ; CHECK-NEXT: bic v[[REG2:[0-9]+]].4s, #0x1, lsl #24
+ ; CHECK-NEXT: umov w{{[0-9]+}}, v[[REG1]].h[0]
+ %in = load <8 x i16>* @vec_v8i16
+ %rv = and <8 x i16> %in, <i16 65535, i16 65279, i16 65535, i16 65279, i16 65535, i16 65279, i16 65535, i16 65279>
+ %el = extractelement <8 x i16> %rv, i32 0
+ ret i16 %el
+}
+
+; CHECK-LABEL: bic_modimm_t5:
+define i16 @bic_modimm_t5() nounwind {
+ ; CHECK: ld1 { v[[REG1:[0-9]+]].8h }, [x{{[0-9]+}}]
+ ; CHECK-NEXT: bic v[[REG2:[0-9]+]].8h, #0x1
+ ; CHECK-NEXT: umov w{{[0-9]+}}, v[[REG1]].h[0]
+ %in = load <8 x i16>* @vec_v8i16
+ %rv = and <8 x i16> %in, <i16 65534, i16 65534, i16 65534, i16 65534, i16 65534, i16 65534, i16 65534, i16 65534>
+ %el = extractelement <8 x i16> %rv, i32 0
+ ret i16 %el
+}
+
+; CHECK-LABEL: bic_modimm_t6:
+define i16 @bic_modimm_t6() nounwind {
+ ; CHECK: ld1 { v[[REG1:[0-9]+]].8h }, [x{{[0-9]+}}]
+ ; CHECK-NEXT: bic v[[REG2:[0-9]+]].8h, #0x1, lsl #8
+ ; CHECK-NEXT: umov w{{[0-9]+}}, v[[REG1]].h[0]
+ %in = load <8 x i16>* @vec_v8i16
+ %rv = and <8 x i16> %in, <i16 65279, i16 65279, i16 65279, i16 65279, i16 65279, i16 65279, i16 65279, i16 65279>
+ %el = extractelement <8 x i16> %rv, i32 0
+ ret i16 %el
+}
+
+; CHECK-LABEL: orr_modimm_t1:
+define i16 @orr_modimm_t1() nounwind {
+ ; CHECK: ld1 { v[[REG1:[0-9]+]].8h }, [x{{[0-9]+}}]
+ ; CHECK-NEXT: orr v[[REG2:[0-9]+]].4s, #0x1
+ ; CHECK-NEXT: umov w{{[0-9]+}}, v[[REG1]].h[0]
+ %in = load <8 x i16>* @vec_v8i16
+ %rv = or <8 x i16> %in, <i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0>
+ %el = extractelement <8 x i16> %rv, i32 0
+ ret i16 %el
+}
+
+; CHECK-LABEL: orr_modimm_t2:
+define i16 @orr_modimm_t2() nounwind {
+ ; CHECK: ld1 { v[[REG1:[0-9]+]].8h }, [x{{[0-9]+}}]
+ ; CHECK-NEXT: orr v[[REG2:[0-9]+]].4s, #0x1, lsl #8
+ ; CHECK-NEXT: umov w{{[0-9]+}}, v[[REG1]].h[0]
+ %in = load <8 x i16>* @vec_v8i16
+ %rv = or <8 x i16> %in, <i16 256, i16 0, i16 256, i16 0, i16 256, i16 0, i16 256, i16 0>
+ %el = extractelement <8 x i16> %rv, i32 0
+ ret i16 %el
+}
+
+; CHECK-LABEL: orr_modimm_t3:
+define i16 @orr_modimm_t3() nounwind {
+ ; CHECK: ld1 { v[[REG1:[0-9]+]].8h }, [x{{[0-9]+}}]
+ ; CHECK-NEXT: orr v[[REG2:[0-9]+]].4s, #0x1, lsl #16
+ ; CHECK-NEXT: umov w{{[0-9]+}}, v[[REG1]].h[0]
+ %in = load <8 x i16>* @vec_v8i16
+ %rv = or <8 x i16> %in, <i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1>
+ %el = extractelement <8 x i16> %rv, i32 0
+ ret i16 %el
+}
+
+; CHECK-LABEL: orr_modimm_t4:
+define i16 @orr_modimm_t4() nounwind {
+ ; CHECK: ld1 { v[[REG1:[0-9]+]].8h }, [x{{[0-9]+}}]
+ ; CHECK-NEXT: orr v[[REG2:[0-9]+]].4s, #0x1, lsl #24
+ ; CHECK-NEXT: umov w{{[0-9]+}}, v[[REG1]].h[0]
+ %in = load <8 x i16>* @vec_v8i16
+ %rv = or <8 x i16> %in, <i16 0, i16 256, i16 0, i16 256, i16 0, i16 256, i16 0, i16 256>
+ %el = extractelement <8 x i16> %rv, i32 0
+ ret i16 %el
+}
+
+; CHECK-LABEL: orr_modimm_t5:
+define i16 @orr_modimm_t5() nounwind {
+ ; CHECK: ld1 { v[[REG1:[0-9]+]].8h }, [x{{[0-9]+}}]
+ ; CHECK-NEXT: orr v[[REG2:[0-9]+]].8h, #0x1
+ ; CHECK-NEXT: umov w{{[0-9]+}}, v[[REG1]].h[0]
+ %in = load <8 x i16>* @vec_v8i16
+ %rv = or <8 x i16> %in, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ %el = extractelement <8 x i16> %rv, i32 0
+ ret i16 %el
+}
+
+; CHECK-LABEL: orr_modimm_t6:
+define i16 @orr_modimm_t6() nounwind {
+ ; CHECK: ld1 { v[[REG1:[0-9]+]].8h }, [x{{[0-9]+}}]
+ ; CHECK-NEXT: orr v[[REG2:[0-9]+]].8h, #0x1, lsl #8
+ ; CHECK-NEXT: umov w{{[0-9]+}}, v[[REG1]].h[0]
+ %in = load <8 x i16>* @vec_v8i16
+ %rv = or <8 x i16> %in, <i16 256, i16 256, i16 256, i16 256, i16 256, i16 256, i16 256, i16 256>
+ %el = extractelement <8 x i16> %rv, i32 0
+ ret i16 %el
+}
+
+declare i8 @f_v8i8(<8 x i8> %arg)
+declare i16 @f_v4i16(<4 x i16> %arg)
+declare i32 @f_v2i32(<2 x i32> %arg)
+declare i64 @f_v1i64(<1 x i64> %arg)
+declare i8 @f_v16i8(<16 x i8> %arg)
+declare i16 @f_v8i16(<8 x i16> %arg)
+declare i32 @f_v4i32(<4 x i32> %arg)
+declare i64 @f_v2i64(<2 x i64> %arg)
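+
+; For the calls below, each constant is materialized with a movi and then
+; put into the big-endian argument-passing element order: rev64 reverses
+; the elements within each 64-bit half and, for the 128-bit vectors, an
+; extra "ext ... #8" swaps the two halves.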
+
+; CHECK-LABEL: modimm_t1_call:
+define void @modimm_t1_call() {
+ ; CHECK: movi v[[REG1:[0-9]+]].2s, #0x8
+ ; CHECK-NEXT: rev64 v{{[0-9]+}}.8b, v[[REG1]].8b
+ ; CHECK-NEXT: bl f_v8i8
+ call i8 @f_v8i8(<8 x i8> <i8 8, i8 0, i8 0, i8 0, i8 8, i8 0, i8 0, i8 0>)
+ ; CHECK: movi v[[REG1:[0-9]+]].2s, #0x7
+ ; CHECK-NEXT: rev64 v{{[0-9]+}}.4h, v[[REG1]].4h
+ ; CHECK-NEXT: bl f_v4i16
+ call i16 @f_v4i16(<4 x i16> <i16 7, i16 0, i16 7, i16 0>)
+ ; CHECK: movi v[[REG1:[0-9]+]].2s, #0x6
+ ; CHECK-NEXT: rev64 v{{[0-9]+}}.2s, v[[REG1]].2s
+ ; CHECK-NEXT: bl f_v2i32
+ call i32 @f_v2i32(<2 x i32> <i32 6, i32 6>)
+ ; CHECK: movi v{{[0-9]+}}.2s, #0x5
+ ; CHECK-NEXT: bl f_v1i64
+ call i64 @f_v1i64(<1 x i64> <i64 21474836485>)
+ ; CHECK: movi v[[REG1:[0-9]+]].4s, #0x5
+ ; CHECK-NEXT: rev64 v[[REG2:[0-9]+]].16b, v[[REG1]].16b
+ ; CHECK-NEXT: ext v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
+ ; CHECK-NEXT: bl f_v16i8
+ call i8 @f_v16i8(<16 x i8> <i8 5, i8 0, i8 0, i8 0, i8 5, i8 0, i8 0, i8 0, i8 5, i8 0, i8 0, i8 0, i8 5, i8 0, i8 0, i8 0>)
+ ; CHECK: movi v[[REG1:[0-9]+]].4s, #0x4
+ ; CHECK-NEXT: rev64 v[[REG2:[0-9]+]].8h, v[[REG1]].8h
+ ; CHECK-NEXT: ext v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
+ ; CHECK-NEXT: bl f_v8i16
+ call i16 @f_v8i16(<8 x i16> <i16 4, i16 0, i16 4, i16 0, i16 4, i16 0, i16 4, i16 0>)
+ ; CHECK: movi v[[REG1:[0-9]+]].4s, #0x3
+ ; CHECK-NEXT: rev64 v[[REG2:[0-9]+]].4s, v[[REG1]].4s
+ ; CHECK-NEXT: ext v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
+ ; CHECK-NEXT: bl f_v4i32
+ call i32 @f_v4i32(<4 x i32> <i32 3, i32 3, i32 3, i32 3>)
+ ; CHECK: movi v[[REG:[0-9]+]].4s, #0x2
+ ; CHECK-NEXT: ext v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
+ ; CHECK-NEXT: bl f_v2i64
+ call i64 @f_v2i64(<2 x i64> <i64 8589934594, i64 8589934594>)
+
+ ret void
+}
+
+; CHECK-LABEL: modimm_t2_call:
+define void @modimm_t2_call() {
+ ; CHECK: movi v[[REG1:[0-9]+]].2s, #0x8, lsl #8
+ ; CHECK-NEXT: rev64 v{{[0-9]+}}.8b, v[[REG1]].8b
+ ; CHECK-NEXT: bl f_v8i8
+ call i8 @f_v8i8(<8 x i8> <i8 0, i8 8, i8 0, i8 0, i8 0, i8 8, i8 0, i8 0>)
+ ; CHECK: movi v[[REG1:[0-9]+]].2s, #0x7, lsl #8
+ ; CHECK-NEXT: rev64 v{{[0-9]+}}.4h, v[[REG1]].4h
+ ; CHECK-NEXT: bl f_v4i16
+ call i16 @f_v4i16(<4 x i16> <i16 1792, i16 0, i16 1792, i16 0>)
+ ; CHECK: movi v[[REG1:[0-9]+]].2s, #0x6, lsl #8
+ ; CHECK-NEXT: rev64 v{{[0-9]+}}.2s, v[[REG1]].2s
+ ; CHECK-NEXT: bl f_v2i32
+ call i32 @f_v2i32(<2 x i32> <i32 1536, i32 1536>)
+ ; CHECK: movi v{{[0-9]+}}.2s, #0x5, lsl #8
+ ; CHECK-NEXT: bl f_v1i64
+ call i64 @f_v1i64(<1 x i64> <i64 5497558140160>)
+ ; CHECK: movi v[[REG1:[0-9]+]].4s, #0x5, lsl #8
+ ; CHECK-NEXT: rev64 v[[REG2:[0-9]+]].16b, v[[REG1]].16b
+ ; CHECK-NEXT: ext v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
+ ; CHECK-NEXT: bl f_v16i8
+ call i8 @f_v16i8(<16 x i8> <i8 0, i8 5, i8 0, i8 0, i8 0, i8 5, i8 0, i8 0, i8 0, i8 5, i8 0, i8 0, i8 0, i8 5, i8 0, i8 0>)
+ ; CHECK: movi v[[REG1:[0-9]+]].4s, #0x4, lsl #8
+ ; CHECK-NEXT: rev64 v[[REG2:[0-9]+]].8h, v[[REG1]].8h
+ ; CHECK-NEXT: ext v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
+ ; CHECK-NEXT: bl f_v8i16
+ call i16 @f_v8i16(<8 x i16> <i16 1024, i16 0, i16 1024, i16 0, i16 1024, i16 0, i16 1024, i16 0>)
+ ; CHECK: movi v[[REG1:[0-9]+]].4s, #0x3, lsl #8
+ ; CHECK-NEXT: rev64 v[[REG2:[0-9]+]].4s, v[[REG1]].4s
+ ; CHECK-NEXT: ext v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
+ ; CHECK-NEXT: bl f_v4i32
+ call i32 @f_v4i32(<4 x i32> <i32 768, i32 768, i32 768, i32 768>)
+ ; CHECK: movi v[[REG:[0-9]+]].4s, #0x2, lsl #8
+ ; CHECK-NEXT: ext v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
+ ; CHECK-NEXT: bl f_v2i64
+ call i64 @f_v2i64(<2 x i64> <i64 2199023256064, i64 2199023256064>)
+
+ ret void
+}
+
+; CHECK-LABEL: modimm_t3_call:
+define void @modimm_t3_call() {
+ ; CHECK: movi v[[REG1:[0-9]+]].2s, #0x8, lsl #16
+ ; CHECK-NEXT: rev64 v{{[0-9]+}}.8b, v[[REG1]].8b
+ ; CHECK-NEXT: bl f_v8i8
+ call i8 @f_v8i8(<8 x i8> <i8 0, i8 0, i8 8, i8 0, i8 0, i8 0, i8 8, i8 0>)
+ ; CHECK: movi v[[REG1:[0-9]+]].2s, #0x7, lsl #16
+ ; CHECK-NEXT: rev64 v{{[0-9]+}}.4h, v[[REG1]].4h
+ ; CHECK-NEXT: bl f_v4i16
+ call i16 @f_v4i16(<4 x i16> <i16 0, i16 7, i16 0, i16 7>)
+ ; CHECK: movi v[[REG1:[0-9]+]].2s, #0x6, lsl #16
+ ; CHECK-NEXT: rev64 v{{[0-9]+}}.2s, v[[REG1]].2s
+ ; CHECK-NEXT: bl f_v2i32
+ call i32 @f_v2i32(<2 x i32> <i32 393216, i32 393216>)
+ ; CHECK: movi v{{[0-9]+}}.2s, #0x5, lsl #16
+ ; CHECK-NEXT: bl f_v1i64
+ call i64 @f_v1i64(<1 x i64> <i64 1407374883880960>)
+ ; CHECK: movi v[[REG1:[0-9]+]].4s, #0x5, lsl #16
+ ; CHECK-NEXT: rev64 v[[REG2:[0-9]+]].16b, v[[REG1]].16b
+ ; CHECK-NEXT: ext v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
+ ; CHECK-NEXT: bl f_v16i8
+ call i8 @f_v16i8(<16 x i8> <i8 0, i8 0, i8 5, i8 0, i8 0, i8 0, i8 5, i8 0, i8 0, i8 0, i8 5, i8 0, i8 0, i8 0, i8 5, i8 0>)
+ ; CHECK: movi v[[REG1:[0-9]+]].4s, #0x4, lsl #16
+ ; CHECK-NEXT: rev64 v[[REG2:[0-9]+]].8h, v[[REG1]].8h
+ ; CHECK-NEXT: ext v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
+ ; CHECK-NEXT: bl f_v8i16
+ call i16 @f_v8i16(<8 x i16> <i16 0, i16 4, i16 0, i16 4, i16 0, i16 4, i16 0, i16 4>)
+ ; CHECK: movi v[[REG1:[0-9]+]].4s, #0x3, lsl #16
+ ; CHECK-NEXT: rev64 v[[REG2:[0-9]+]].4s, v[[REG1]].4s
+ ; CHECK-NEXT: ext v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
+ ; CHECK-NEXT: bl f_v4i32
+ call i32 @f_v4i32(<4 x i32> <i32 196608, i32 196608, i32 196608, i32 196608>)
+ ; CHECK: movi v[[REG:[0-9]+]].4s, #0x2, lsl #16
+ ; CHECK-NEXT: ext v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
+ ; CHECK-NEXT: bl f_v2i64
+ call i64 @f_v2i64(<2 x i64> <i64 562949953552384, i64 562949953552384>)
+
+ ret void
+}
+
+; CHECK-LABEL: modimm_t4_call:
+define void @modimm_t4_call() {
+ ; CHECK: movi v[[REG1:[0-9]+]].2s, #0x8, lsl #24
+ ; CHECK-NEXT: rev64 v{{[0-9]+}}.8b, v[[REG1]].8b
+ ; CHECK-NEXT: bl f_v8i8
+ call i8 @f_v8i8(<8 x i8> <i8 0, i8 0, i8 0, i8 8, i8 0, i8 0, i8 0, i8 8>)
+ ; CHECK: movi v[[REG1:[0-9]+]].2s, #0x7, lsl #24
+ ; CHECK-NEXT: rev64 v{{[0-9]+}}.4h, v[[REG1]].4h
+ ; CHECK-NEXT: bl f_v4i16
+ call i16 @f_v4i16(<4 x i16> <i16 0, i16 1792, i16 0, i16 1792>)
+ ; CHECK: movi v[[REG1:[0-9]+]].2s, #0x6, lsl #24
+ ; CHECK-NEXT: rev64 v{{[0-9]+}}.2s, v[[REG1]].2s
+ ; CHECK-NEXT: bl f_v2i32
+ call i32 @f_v2i32(<2 x i32> <i32 100663296, i32 100663296>)
+ ; CHECK: movi v{{[0-9]+}}.2s, #0x5, lsl #24
+ ; CHECK-NEXT: bl f_v1i64
+ call i64 @f_v1i64(<1 x i64> <i64 360287970273525760>)
+ ; CHECK: movi v[[REG1:[0-9]+]].4s, #0x5, lsl #24
+ ; CHECK-NEXT: rev64 v[[REG2:[0-9]+]].16b, v[[REG1]].16b
+ ; CHECK-NEXT: ext v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
+ ; CHECK-NEXT: bl f_v16i8
+ call i8 @f_v16i8(<16 x i8> <i8 0, i8 0, i8 0, i8 5, i8 0, i8 0, i8 0, i8 5, i8 0, i8 0, i8 0, i8 5, i8 0, i8 0, i8 0, i8 5>)
+ ; CHECK: movi v[[REG1:[0-9]+]].4s, #0x4, lsl #24
+ ; CHECK-NEXT: rev64 v[[REG2:[0-9]+]].8h, v[[REG1]].8h
+ ; CHECK-NEXT: ext v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
+ ; CHECK-NEXT: bl f_v8i16
+ call i16 @f_v8i16(<8 x i16> <i16 0, i16 1024, i16 0, i16 1024, i16 0, i16 1024, i16 0, i16 1024>)
+ ; CHECK: movi v[[REG1:[0-9]+]].4s, #0x3, lsl #24
+ ; CHECK-NEXT: rev64 v[[REG2:[0-9]+]].4s, v[[REG1]].4s
+ ; CHECK-NEXT: ext v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
+ ; CHECK-NEXT: bl f_v4i32
+ call i32 @f_v4i32(<4 x i32> <i32 50331648, i32 50331648, i32 50331648, i32 50331648>)
+ ; CHECK: movi v[[REG:[0-9]+]].4s, #0x2, lsl #24
+ ; CHECK-NEXT: ext v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
+ ; CHECK-NEXT: bl f_v2i64
+ call i64 @f_v2i64(<2 x i64> <i64 144115188109410304, i64 144115188109410304>)
+
+ ret void
+}
+
+; CHECK-LABEL: modimm_t5_call:
+define void @modimm_t5_call() {
+ ; CHECK: movi v[[REG1:[0-9]+]].4h, #0x8
+ ; CHECK-NEXT: rev64 v{{[0-9]+}}.8b, v[[REG1]].8b
+ ; CHECK-NEXT: bl f_v8i8
+ call i8 @f_v8i8(<8 x i8> <i8 8, i8 0, i8 8, i8 0, i8 8, i8 0, i8 8, i8 0>)
+ ; CHECK: movi v[[REG1:[0-9]+]].4h, #0x7
+ ; CHECK-NEXT: rev64 v{{[0-9]+}}.4h, v[[REG1]].4h
+ ; CHECK-NEXT: bl f_v4i16
+ call i16 @f_v4i16(<4 x i16> <i16 7, i16 7, i16 7, i16 7>)
+ ; CHECK: movi v[[REG1:[0-9]+]].4h, #0x6
+ ; CHECK-NEXT: rev64 v{{[0-9]+}}.2s, v[[REG1]].2s
+ ; CHECK-NEXT: bl f_v2i32
+ call i32 @f_v2i32(<2 x i32> <i32 393222, i32 393222>)
+ ; CHECK: movi v{{[0-9]+}}.4h, #0x5
+ ; CHECK-NEXT: bl f_v1i64
+ call i64 @f_v1i64(<1 x i64> <i64 1407396358717445>)
+ ; CHECK: movi v[[REG1:[0-9]+]].8h, #0x5
+ ; CHECK-NEXT: rev64 v[[REG2:[0-9]+]].16b, v[[REG1]].16b
+ ; CHECK-NEXT: ext v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
+ ; CHECK-NEXT: bl f_v16i8
+ call i8 @f_v16i8(<16 x i8> <i8 5, i8 0, i8 5, i8 0, i8 5, i8 0, i8 5, i8 0, i8 5, i8 0, i8 5, i8 0, i8 5, i8 0, i8 5, i8 0>)
+ ; CHECK: movi v[[REG1:[0-9]+]].8h, #0x4
+ ; CHECK-NEXT: rev64 v[[REG2:[0-9]+]].8h, v[[REG1]].8h
+ ; CHECK-NEXT: ext v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
+ ; CHECK-NEXT: bl f_v8i16
+ call i16 @f_v8i16(<8 x i16> <i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4>)
+ ; CHECK: movi v[[REG1:[0-9]+]].8h, #0x3
+ ; CHECK-NEXT: rev64 v[[REG2:[0-9]+]].4s, v[[REG1]].4s
+ ; CHECK-NEXT: ext v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
+ ; CHECK-NEXT: bl f_v4i32
+ call i32 @f_v4i32(<4 x i32> <i32 196611, i32 196611, i32 196611, i32 196611>)
+ ; CHECK: movi v[[REG:[0-9]+]].8h, #0x2
+ ; CHECK-NEXT: ext v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
+ ; CHECK-NEXT: bl f_v2i64
+ call i64 @f_v2i64(<2 x i64> <i64 562958543486978, i64 562958543486978>)
+
+ ret void
+}
+
+; CHECK-LABEL: modimm_t6_call:
+define void @modimm_t6_call() {
+ ; CHECK: movi v[[REG1:[0-9]+]].4h, #0x8, lsl #8
+ ; CHECK-NEXT: rev64 v{{[0-9]+}}.8b, v[[REG1]].8b
+ ; CHECK-NEXT: bl f_v8i8
+ call i8 @f_v8i8(<8 x i8> <i8 0, i8 8, i8 0, i8 8, i8 0, i8 8, i8 0, i8 8>)
+ ; CHECK: movi v[[REG1:[0-9]+]].4h, #0x7, lsl #8
+ ; CHECK-NEXT: rev64 v{{[0-9]+}}.4h, v[[REG1]].4h
+ ; CHECK-NEXT: bl f_v4i16
+ call i16 @f_v4i16(<4 x i16> <i16 1792, i16 1792, i16 1792, i16 1792>)
+ ; CHECK: movi v[[REG1:[0-9]+]].4h, #0x6, lsl #8
+ ; CHECK-NEXT: rev64 v{{[0-9]+}}.2s, v[[REG1]].2s
+ ; CHECK-NEXT: bl f_v2i32
+ call i32 @f_v2i32(<2 x i32> <i32 100664832, i32 100664832>)
+ ; CHECK: movi v{{[0-9]+}}.4h, #0x5, lsl #8
+ ; CHECK-NEXT: bl f_v1i64
+ call i64 @f_v1i64(<1 x i64> <i64 360293467831665920>)
+ ; CHECK: movi v[[REG1:[0-9]+]].8h, #0x5, lsl #8
+ ; CHECK-NEXT: rev64 v[[REG2:[0-9]+]].16b, v[[REG1]].16b
+ ; CHECK-NEXT: ext v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
+ ; CHECK-NEXT: bl f_v16i8
+ call i8 @f_v16i8(<16 x i8> <i8 0, i8 5, i8 0, i8 5, i8 0, i8 5, i8 0, i8 5, i8 0, i8 5, i8 0, i8 5, i8 0, i8 5, i8 0, i8 5>)
+ ; CHECK: movi v[[REG1:[0-9]+]].8h, #0x4, lsl #8
+ ; CHECK-NEXT: rev64 v[[REG2:[0-9]+]].8h, v[[REG1]].8h
+ ; CHECK-NEXT: ext v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
+ ; CHECK-NEXT: bl f_v8i16
+ call i16 @f_v8i16(<8 x i16> <i16 1024, i16 1024, i16 1024, i16 1024, i16 1024, i16 1024, i16 1024, i16 1024>)
+ ; CHECK: movi v[[REG1:[0-9]+]].8h, #0x3, lsl #8
+ ; CHECK-NEXT: rev64 v[[REG2:[0-9]+]].4s, v[[REG1]].4s
+ ; CHECK-NEXT: ext v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
+ ; CHECK-NEXT: bl f_v4i32
+ call i32 @f_v4i32(<4 x i32> <i32 50332416, i32 50332416, i32 50332416, i32 50332416>)
+ ; CHECK: movi v[[REG:[0-9]+]].8h, #0x2, lsl #8
+ ; CHECK-NEXT: ext v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
+ ; CHECK-NEXT: bl f_v2i64
+ call i64 @f_v2i64(<2 x i64> <i64 144117387132666368, i64 144117387132666368>)
+
+ ret void
+}
+
+; CHECK-LABEL: modimm_t7_call:
+define void @modimm_t7_call() {
+ ; CHECK: movi v[[REG1:[0-9]+]].2s, #0x8, msl #8
+ ; CHECK-NEXT: rev64 v{{[0-9]+}}.8b, v[[REG1]].8b
+ ; CHECK-NEXT: bl f_v8i8
+ call i8 @f_v8i8(<8 x i8> <i8 255, i8 8, i8 0, i8 0, i8 255, i8 8, i8 0, i8 0>)
+ ; CHECK: movi v[[REG1:[0-9]+]].2s, #0x7, msl #8
+ ; CHECK-NEXT: rev64 v{{[0-9]+}}.4h, v[[REG1]].4h
+ ; CHECK-NEXT: bl f_v4i16
+ call i16 @f_v4i16(<4 x i16> <i16 2047, i16 0, i16 2047, i16 0>)
+ ; CHECK: movi v[[REG1:[0-9]+]].2s, #0x6, msl #8
+ ; CHECK-NEXT: rev64 v{{[0-9]+}}.2s, v[[REG1]].2s
+ ; CHECK-NEXT: bl f_v2i32
+ call i32 @f_v2i32(<2 x i32> <i32 1791, i32 1791>)
+ ; CHECK: movi v{{[0-9]+}}.2s, #0x5, msl #8
+ ; CHECK-NEXT: bl f_v1i64
+ call i64 @f_v1i64(<1 x i64> <i64 6592774800895>)
+ ; CHECK: movi v[[REG1:[0-9]+]].4s, #0x5, msl #8
+ ; CHECK-NEXT: rev64 v[[REG2:[0-9]+]].16b, v[[REG1]].16b
+ ; CHECK-NEXT: ext v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
+ ; CHECK-NEXT: bl f_v16i8
+ call i8 @f_v16i8(<16 x i8> <i8 255, i8 5, i8 0, i8 0, i8 255, i8 5, i8 0, i8 0, i8 255, i8 5, i8 0, i8 0, i8 255, i8 5, i8 0, i8 0>)
+ ; CHECK: movi v[[REG1:[0-9]+]].4s, #0x4, msl #8
+ ; CHECK-NEXT: rev64 v[[REG2:[0-9]+]].8h, v[[REG1]].8h
+ ; CHECK-NEXT: ext v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
+ ; CHECK-NEXT: bl f_v8i16
+ call i16 @f_v8i16(<8 x i16> <i16 1279, i16 0, i16 1279, i16 0, i16 1279, i16 0, i16 1279, i16 0>)
+ ; CHECK: movi v[[REG1:[0-9]+]].4s, #0x3, msl #8
+ ; CHECK-NEXT: rev64 v[[REG2:[0-9]+]].4s, v[[REG1]].4s
+ ; CHECK-NEXT: ext v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
+ ; CHECK-NEXT: bl f_v4i32
+ call i32 @f_v4i32(<4 x i32> <i32 1023, i32 1023, i32 1023, i32 1023>)
+ ; CHECK: movi v[[REG:[0-9]+]].4s, #0x2, msl #8
+ ; CHECK-NEXT: ext v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
+ ; CHECK-NEXT: bl f_v2i64
+ call i64 @f_v2i64(<2 x i64> <i64 3294239916799, i64 3294239916799>)
+
+ ret void
+}
+
+; CHECK-LABEL: modimm_t8_call:
+define void @modimm_t8_call() {
+ ; CHECK: movi v[[REG1:[0-9]+]].2s, #0x8, msl #16
+ ; CHECK-NEXT: rev64 v{{[0-9]+}}.8b, v[[REG1]].8b
+ ; CHECK-NEXT: bl f_v8i8
+ call i8 @f_v8i8(<8 x i8> <i8 255, i8 255, i8 8, i8 0, i8 255, i8 255, i8 8, i8 0>)
+ ; CHECK: movi v[[REG1:[0-9]+]].2s, #0x7, msl #16
+ ; CHECK-NEXT: rev64 v{{[0-9]+}}.4h, v[[REG1]].4h
+ ; CHECK-NEXT: bl f_v4i16
+ call i16 @f_v4i16(<4 x i16> <i16 65535, i16 7, i16 65535, i16 7>)
+ ; CHECK: movi v[[REG1:[0-9]+]].2s, #0x6, msl #16
+ ; CHECK-NEXT: rev64 v{{[0-9]+}}.2s, v[[REG1]].2s
+ ; CHECK-NEXT: bl f_v2i32
+ call i32 @f_v2i32(<2 x i32> <i32 458751, i32 458751>)
+ ; CHECK: movi v{{[0-9]+}}.2s, #0x5, msl #16
+ ; CHECK-NEXT: bl f_v1i64
+ call i64 @f_v1i64(<1 x i64> <i64 1688845565689855>)
+ ; CHECK: movi v[[REG1:[0-9]+]].4s, #0x5, msl #16
+ ; CHECK-NEXT: rev64 v[[REG2:[0-9]+]].16b, v[[REG1]].16b
+ ; CHECK-NEXT: ext v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
+ ; CHECK-NEXT: bl f_v16i8
+ call i8 @f_v16i8(<16 x i8> <i8 255, i8 255, i8 5, i8 0, i8 255, i8 255, i8 5, i8 0, i8 255, i8 255, i8 5, i8 0, i8 255, i8 255, i8 5, i8 0>)
+ ; CHECK: movi v[[REG1:[0-9]+]].4s, #0x4, msl #16
+ ; CHECK-NEXT: rev64 v[[REG2:[0-9]+]].8h, v[[REG1]].8h
+ ; CHECK-NEXT: ext v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
+ ; CHECK-NEXT: bl f_v8i16
+ call i16 @f_v8i16(<8 x i16> <i16 65535, i16 4, i16 65535, i16 4, i16 65535, i16 4, i16 65535, i16 4>)
+ ; CHECK: movi v[[REG1:[0-9]+]].4s, #0x3, msl #16
+ ; CHECK-NEXT: rev64 v[[REG2:[0-9]+]].4s, v[[REG1]].4s
+ ; CHECK-NEXT: ext v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
+ ; CHECK-NEXT: bl f_v4i32
+ call i32 @f_v4i32(<4 x i32> <i32 262143, i32 262143, i32 262143, i32 262143>)
+ ; CHECK: movi v[[REG:[0-9]+]].4s, #0x2, msl #16
+ ; CHECK-NEXT: ext v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
+ ; CHECK-NEXT: bl f_v2i64
+ call i64 @f_v2i64(<2 x i64> <i64 844420635361279, i64 844420635361279>)
+
+ ret void
+}
+
+; CHECK-LABEL: modimm_t9_call:
+define void @modimm_t9_call() {
+ ; CHECK: movi v[[REG1:[0-9]+]].8b, #0x8
+ ; CHECK-NEXT: rev64 v{{[0-9]+}}.8b, v[[REG1]].8b
+ ; CHECK-NEXT: bl f_v8i8
+ call i8 @f_v8i8(<8 x i8> <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>)
+ ; CHECK: movi v[[REG1:[0-9]+]].8b, #0x7
+ ; CHECK-NEXT: rev64 v{{[0-9]+}}.4h, v[[REG1]].4h
+ ; CHECK-NEXT: bl f_v4i16
+ call i16 @f_v4i16(<4 x i16> <i16 1799, i16 1799, i16 1799, i16 1799>)
+ ; CHECK: movi v[[REG1:[0-9]+]].8b, #0x6
+ ; CHECK-NEXT: rev64 v{{[0-9]+}}.2s, v[[REG1]].2s
+ ; CHECK-NEXT: bl f_v2i32
+ call i32 @f_v2i32(<2 x i32> <i32 101058054, i32 101058054>)
+ ; CHECK: movi v[[REG1:[0-9]+]].16b, #0x5
+ ; CHECK-NEXT: rev64 v[[REG2:[0-9]+]].16b, v[[REG1]].16b
+ ; CHECK-NEXT: ext v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
+ ; CHECK-NEXT: bl f_v16i8
+ call i8 @f_v16i8(<16 x i8> <i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5>)
+ ; CHECK: movi v[[REG1:[0-9]+]].16b, #0x4
+ ; CHECK-NEXT: rev64 v[[REG2:[0-9]+]].8h, v[[REG1]].8h
+ ; CHECK-NEXT: ext v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
+ ; CHECK-NEXT: bl f_v8i16
+ call i16 @f_v8i16(<8 x i16> <i16 1028, i16 1028, i16 1028, i16 1028, i16 1028, i16 1028, i16 1028, i16 1028>)
+ ; CHECK: movi v[[REG1:[0-9]+]].16b, #0x3
+ ; CHECK-NEXT: rev64 v[[REG2:[0-9]+]].4s, v[[REG1]].4s
+ ; CHECK-NEXT: ext v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
+ ; CHECK-NEXT: bl f_v4i32
+ call i32 @f_v4i32(<4 x i32> <i32 50529027, i32 50529027, i32 50529027, i32 50529027>)
+
+ ret void
+}
+
+; CHECK-LABEL: modimm_t10_call:
+define void @modimm_t10_call() {
+ ; CHECK: movi d[[REG1:[0-9]+]], #0x0000ff000000ff
+ ; CHECK-NEXT: rev64 v{{[0-9]+}}.8b, v[[REG1]].8b
+ ; CHECK-NEXT: bl f_v8i8
+ call i8 @f_v8i8(<8 x i8> <i8 -1, i8 0, i8 0, i8 0, i8 -1, i8 0, i8 0, i8 0>)
+ ; CHECK: movi d[[REG1:[0-9]+]], #0x00ffff0000ffff
+ ; CHECK-NEXT: rev64 v{{[0-9]+}}.4h, v[[REG1]].4h
+ ; CHECK-NEXT: bl f_v4i16
+ call i16 @f_v4i16(<4 x i16> <i16 -1, i16 0, i16 -1, i16 0>)
+ ; CHECK: movi d[[REG1:[0-9]+]], #0xffffffffffffffff
+ ; CHECK-NEXT: rev64 v{{[0-9]+}}.2s, v[[REG1]].2s
+ ; CHECK-NEXT: bl f_v2i32
+ call i32 @f_v2i32(<2 x i32> <i32 -1, i32 -1>)
+ ; CHECK: movi v[[REG1:[0-9]+]].2d, #0xffffff00ffffff
+ ; CHECK-NEXT: rev64 v[[REG2:[0-9]+]].16b, v[[REG1]].16b
+ ; CHECK-NEXT: ext v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
+ ; CHECK-NEXT: bl f_v16i8
+ call i8 @f_v16i8(<16 x i8> <i8 -1, i8 -1, i8 -1, i8 0, i8 -1, i8 -1, i8 -1, i8 0, i8 -1, i8 -1, i8 -1, i8 0, i8 -1, i8 -1, i8 -1, i8 0>)
+ ; CHECK: movi v[[REG1:[0-9]+]].2d, #0xffffffffffff0000
+ ; CHECK-NEXT: rev64 v[[REG2:[0-9]+]].8h, v[[REG1]].8h
+ ; CHECK-NEXT: ext v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
+ ; CHECK-NEXT: bl f_v8i16
+ call i16 @f_v8i16(<8 x i16> <i16 0, i16 -1, i16 -1, i16 -1, i16 0, i16 -1, i16 -1, i16 -1>)
+ ; CHECK: movi v[[REG1:[0-9]+]].2d, #0xffffffff00000000
+ ; CHECK-NEXT: rev64 v[[REG2:[0-9]+]].4s, v[[REG1]].4s
+ ; CHECK-NEXT: ext v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
+ ; CHECK-NEXT: bl f_v4i32
+ call i32 @f_v4i32(<4 x i32> <i32 0, i32 -1, i32 0, i32 -1>)
+
+ ret void
+}
+
+; CHECK-LABEL: modimm_t11_call:
+define void @modimm_t11_call() {
+ ; CHECK: fmov v[[REG1:[0-9]+]].2s, #4.00000000
+ ; CHECK-NEXT: rev64 v{{[0-9]+}}.8b, v[[REG1]].8b
+ ; CHECK-NEXT: bl f_v8i8
+ call i8 @f_v8i8(<8 x i8> <i8 0, i8 0, i8 128, i8 64, i8 0, i8 0, i8 128, i8 64>)
+ ; CHECK: fmov v[[REG1:[0-9]+]].2s, #3.75000000
+ ; CHECK-NEXT: rev64 v{{[0-9]+}}.4h, v[[REG1]].4h
+ ; CHECK-NEXT: bl f_v4i16
+ call i16 @f_v4i16(<4 x i16> <i16 0, i16 16496, i16 0, i16 16496>)
+ ; CHECK: fmov v[[REG1:[0-9]+]].2s, #3.50000000
+ ; CHECK-NEXT: rev64 v{{[0-9]+}}.2s, v[[REG1]].2s
+ ; CHECK-NEXT: bl f_v2i32
+ call i32 @f_v2i32(<2 x i32> <i32 1080033280, i32 1080033280>)
+ ; CHECK: fmov v{{[0-9]+}}.2s, #0.39062500
+ ; CHECK-NEXT: bl f_v1i64
+ call i64 @f_v1i64(<1 x i64> <i64 4523865826746957824>)
+ ; CHECK: fmov v[[REG1:[0-9]+]].4s, #3.25000000
+ ; CHECK-NEXT: rev64 v[[REG2:[0-9]+]].16b, v[[REG1]].16b
+ ; CHECK-NEXT: ext v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
+ ; CHECK-NEXT: bl f_v16i8
+ call i8 @f_v16i8(<16 x i8> <i8 0, i8 0, i8 80, i8 64, i8 0, i8 0, i8 80, i8 64, i8 0, i8 0, i8 80, i8 64, i8 0, i8 0, i8 80, i8 64>)
+ ; CHECK: fmov v[[REG1:[0-9]+]].4s, #3.00000000
+ ; CHECK-NEXT: rev64 v[[REG2:[0-9]+]].8h, v[[REG1]].8h
+ ; CHECK-NEXT: ext v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
+ ; CHECK-NEXT: bl f_v8i16
+ call i16 @f_v8i16(<8 x i16> <i16 0, i16 16448, i16 0, i16 16448, i16 0, i16 16448, i16 0, i16 16448>)
+ ; CHECK: fmov v[[REG1:[0-9]+]].4s, #2.75000000
+ ; CHECK-NEXT: rev64 v[[REG2:[0-9]+]].4s, v[[REG1]].4s
+ ; CHECK-NEXT: ext v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
+ ; CHECK-NEXT: bl f_v4i32
+ call i32 @f_v4i32(<4 x i32> <i32 1076887552, i32 1076887552, i32 1076887552, i32 1076887552>)
+ ; CHECK: fmov v[[REG:[0-9]+]].4s, #2.50000000
+ ; CHECK-NEXT: ext v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
+ ; CHECK-NEXT: bl f_v2i64
+ call i64 @f_v2i64(<2 x i64> <i64 4620693218757967872, i64 4620693218757967872>)
+
+ ret void
+}
+
+; CHECK-LABEL: modimm_t12_call:
+define void @modimm_t12_call() {
+ ; CHECK: fmov v[[REG1:[0-9]+]].2d, #0.18750000
+ ; CHECK-NEXT: rev64 v[[REG2:[0-9]+]].16b, v[[REG1]].16b
+ ; CHECK-NEXT: ext v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
+ ; CHECK-NEXT: bl f_v16i8
+ call i8 @f_v16i8(<16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 200, i8 63, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 200, i8 63>)
+ ; CHECK: fmov v[[REG1:[0-9]+]].2d, #0.17968750
+ ; CHECK-NEXT: rev64 v[[REG2:[0-9]+]].8h, v[[REG1]].8h
+ ; CHECK-NEXT: ext v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
+ ; CHECK-NEXT: bl f_v8i16
+ call i16 @f_v8i16(<8 x i16> <i16 0, i16 0, i16 0, i16 16327, i16 0, i16 0, i16 0, i16 16327>)
+ ; CHECK: fmov v[[REG1:[0-9]+]].2d, #0.17187500
+ ; CHECK-NEXT: rev64 v[[REG2:[0-9]+]].4s, v[[REG1]].4s
+ ; CHECK-NEXT: ext v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
+ ; CHECK-NEXT: bl f_v4i32
+ call i32 @f_v4i32(<4 x i32> <i32 0, i32 1069940736, i32 0, i32 1069940736>)
+
+ ret void
+}
diff --git a/test/CodeGen/AArch64/aarch64-gep-opt.ll b/test/CodeGen/AArch64/aarch64-gep-opt.ll
new file mode 100644
index 0000000..811eed9
--- /dev/null
+++ b/test/CodeGen/AArch64/aarch64-gep-opt.ll
@@ -0,0 +1,163 @@
+; RUN: llc -O3 -verify-machineinstrs %s -o - | FileCheck %s
+; RUN: llc -O3 -print-after=codegenprepare -mcpu=cyclone < %s >%t 2>&1 && FileCheck --check-prefix=CHECK-NoAA <%t %s
+; RUN: llc -O3 -print-after=codegenprepare -mcpu=cortex-a53 < %s >%t 2>&1 && FileCheck --check-prefix=CHECK-UseAA <%t %s
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-linux-gnueabi"
+
+; The following test cases test enabling the SeparateConstOffsetFromGEP pass
+; in the AArch64 backend. If useAA() returns true, the pass lowers a GEP with
+; multiple indices into GEPs with a single index; otherwise it lowers it into
+; a "ptrtoint+arithmetic+inttoptr" form.
+
+%struct = type { i32, i32, i32, i32, [20 x i32] }
+
+; Check that when two complex GEPs are used in two basic blocks, LLVM can
+; eliminate the common subexpression for the second use.
+define void @test_GEP_CSE([240 x %struct]* %string, i32* %adj, i32 %lib, i64 %idxprom) {
+ %liberties = getelementptr [240 x %struct]* %string, i64 1, i64 %idxprom, i32 3
+ %1 = load i32* %liberties, align 4
+ %cmp = icmp eq i32 %1, %lib
+ br i1 %cmp, label %if.then, label %if.end
+
+if.then: ; preds = %entry
+ %origin = getelementptr [240 x %struct]* %string, i64 1, i64 %idxprom, i32 2
+ %2 = load i32* %origin, align 4
+ store i32 %2, i32* %adj, align 4
+ br label %if.end
+
+if.end: ; preds = %if.then, %entry
+ ret void
+}
+
+; CHECK-LABEL: test_GEP_CSE:
+; CHECK: madd
+; CHECK: ldr
+; CHECK-NOT: madd
+; CHECK: ldr
+
+; CHECK-NoAA-LABEL: @test_GEP_CSE(
+; CHECK-NoAA: [[PTR0:%[a-zA-Z0-9]+]] = ptrtoint [240 x %struct]* %string to i64
+; CHECK-NoAA: [[PTR1:%[a-zA-Z0-9]+]] = mul i64 %idxprom, 96
+; CHECK-NoAA: [[PTR2:%[a-zA-Z0-9]+]] = add i64 [[PTR0]], [[PTR1]]
+; CHECK-NoAA: add i64 [[PTR2]], 23052
+; CHECK-NoAA: inttoptr
+; CHECK-NoAA: if.then:
+; CHECK-NoAA-NOT: ptrtoint
+; CHECK-NoAA-NOT: mul
+; CHECK-NoAA: add i64 [[PTR2]], 23048
+; CHECK-NoAA: inttoptr
+
+; CHECK-UseAA-LABEL: @test_GEP_CSE(
+; CHECK-UseAA: [[PTR0:%[a-zA-Z0-9]+]] = bitcast [240 x %struct]* %string to i8*
+; CHECK-UseAA: [[IDX:%[a-zA-Z0-9]+]] = mul i64 %idxprom, 96
+; CHECK-UseAA: [[PTR1:%[a-zA-Z0-9]+]] = getelementptr i8* [[PTR0]], i64 [[IDX]]
+; CHECK-UseAA: getelementptr i8* [[PTR1]], i64 23052
+; CHECK-UseAA: bitcast
+; CHECK-UseAA: if.then:
+; CHECK-UseAA: getelementptr i8* [[PTR1]], i64 23048
+; CHECK-UseAA: bitcast
+
+%class.my = type { i32, [128 x i32], i32, [256 x %struct.pt]}
+%struct.pt = type { %struct.point*, i32, i32 }
+%struct.point = type { i32, i32 }
+
+; Check that when a GEP is used across two basic blocks, LLVM can sink the
+; address calculation, and codegen can generate a better addressing mode for
+; the second use.
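+;
+; Illustrative shape of the sunk address (%base and %addr are made-up names;
+; %sunkaddr mirrors the {{%sunk...}} patterns checked below, and 528/532 are
+; the two field offsets): instead of keeping
+;   %addr = getelementptr i8* %base, i64 532
+; from the entry block live across the branch, CodeGenPrepare rematerializes
+;   %sunkaddr = getelementptr i8* %base, i64 532
+; in each using block, so the constant can be folded into the load/store
+; addressing mode, e.g. str {{w[0-9]+}}, [{{x[0-9]+}}, #532].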
+define void @test_GEP_across_BB(%class.my* %this, i64 %idx) {
+ %1 = getelementptr %class.my* %this, i64 0, i32 3, i64 %idx, i32 1
+ %2 = load i32* %1, align 4
+ %3 = getelementptr %class.my* %this, i64 0, i32 3, i64 %idx, i32 2
+ %4 = load i32* %3, align 4
+ %5 = icmp eq i32 %2, %4
+ br i1 %5, label %if.true, label %exit
+
+if.true:
+ %6 = shl i32 %4, 1
+ store i32 %6, i32* %3, align 4
+ br label %exit
+
+exit:
+ %7 = add nsw i32 %4, 1
+ store i32 %7, i32* %1, align 4
+ ret void
+}
+; CHECK-LABEL: test_GEP_across_BB:
+; CHECK: ldr {{w[0-9]+}}, [{{x[0-9]+}}, #528]
+; CHECK: ldr {{w[0-9]+}}, [{{x[0-9]+}}, #532]
+; CHECK-NOT: add
+; CHECK: str {{w[0-9]+}}, [{{x[0-9]+}}, #532]
+; CHECK: str {{w[0-9]+}}, [{{x[0-9]+}}, #528]
+
+; CHECK-NoAA-LABEL: test_GEP_across_BB(
+; CHECK-NoAA: add i64 [[TMP:%[a-zA-Z0-9]+]], 528
+; CHECK-NoAA: add i64 [[TMP]], 532
+; CHECK-NoAA: if.true:
+; CHECK-NoAA: {{%sunk[a-zA-Z0-9]+}} = add i64 [[TMP]], 532
+; CHECK-NoAA: exit:
+; CHECK-NoAA: {{%sunk[a-zA-Z0-9]+}} = add i64 [[TMP]], 528
+
+; CHECK-UseAA-LABEL: test_GEP_across_BB(
+; CHECK-UseAA: [[PTR0:%[a-zA-Z0-9]+]] = getelementptr
+; CHECK-UseAA: getelementptr i8* [[PTR0]], i64 528
+; CHECK-UseAA: getelementptr i8* [[PTR0]], i64 532
+; CHECK-UseAA: if.true:
+; CHECK-UseAA: {{%sunk[a-zA-Z0-9]+}} = getelementptr i8* [[PTR0]], i64 532
+; CHECK-UseAA: exit:
+; CHECK-UseAA: {{%sunk[a-zA-Z0-9]+}} = getelementptr i8* [[PTR0]], i64 528
+
+%struct.S = type { float, double }
+@struct_array = global [1024 x %struct.S] zeroinitializer, align 16
+
+; The following two test cases check that we can extract constants from the
+; indices of a struct type.
+; The constant offsets come from the indices "i64 %idxprom" and "i32 1". As
+; the allocation size of %struct.S is 16, and "i32 1" is the 2nd element
+; whose field offset is 8, the total constant offset is (5 * 16 + 8) = 88.
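+;
+; Worked layout, as a sketch: %struct.S is { float, double }; the double is
+; 8-byte aligned, so sizeof(%struct.S) = 16 and field 1 sits at offset 8. The
+; address is @struct_array + (%i + 5) * 16 + 8 = @struct_array + %i * 16 + 88,
+; leaving only the variable %i * 16 part in the split GEP.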
+define double* @test-struct_1(i32 %i) {
+entry:
+ %add = add nsw i32 %i, 5
+ %idxprom = sext i32 %add to i64
+ %p = getelementptr [1024 x %struct.S]* @struct_array, i64 0, i64 %idxprom, i32 1
+ ret double* %p
+}
+; CHECK-NoAA-LABEL: @test-struct_1(
+; CHECK-NoAA-NOT: getelementptr
+; CHECK-NoAA: add i64 %{{[a-zA-Z0-9]+}}, 88
+
+; CHECK-UseAA-LABEL: @test-struct_1(
+; CHECK-UseAA: getelementptr i8* %{{[a-zA-Z0-9]+}}, i64 88
+
+%struct3 = type { i64, i32 }
+%struct2 = type { %struct3, i32 }
+%struct1 = type { i64, %struct2 }
+%struct0 = type { i32, i32, i64*, [100 x %struct1] }
+
+; The constant offsets come from the indices "i32 3", "i64 %arrayidx" and
+; "i32 1". "i32 3" is the 4th element, whose field offset is 16. The
+; allocation size of %struct1 is 32. "i32 1" is the 2nd element, whose field
+; offset is 8. So the total constant offset is 16 + (-2 * 32) + 8 = -40.
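+;
+; Worked layout, as a sketch: { i32, i32, i64* } occupies 16 bytes, so the
+; [100 x %struct1] array starts at offset 16; %struct1 = { i64, %struct2 } is
+; 8 + 24 = 32 bytes (sizeof(%struct2) = 24 after padding), and its field
+; "i32 1" (the %struct2) is at offset 8; hence 16 - 2 * 32 + 8 = -40.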
+define %struct2* @test-struct_2(%struct0* %ptr, i64 %idx) {
+entry:
+ %arrayidx = add nsw i64 %idx, -2
+ %ptr2 = getelementptr %struct0* %ptr, i64 0, i32 3, i64 %arrayidx, i32 1
+ ret %struct2* %ptr2
+}
+; CHECK-NoAA-LABEL: @test-struct_2(
+; CHECK-NoAA-NOT: = getelementptr
+; CHECK-NoAA: add i64 %{{[a-zA-Z0-9]+}}, -40
+
+; CHECK-UseAA-LABEL: @test-struct_2(
+; CHECK-UseAA: getelementptr i8* %{{[a-zA-Z0-9]+}}, i64 -40
+
+; Test that when an index is computed from two constants, the
+; SeparateConstOffsetFromGEP pass does not generate an incorrect result.
+define void @test_const_add([3 x i32]* %in) {
+ %inc = add nsw i32 2, 1
+ %idxprom = sext i32 %inc to i64
+ %arrayidx = getelementptr [3 x i32]* %in, i64 %idxprom, i64 2
+ store i32 0, i32* %arrayidx, align 4
+ ret void
+}
+; CHECK-LABEL: test_const_add:
+; CHECK: str wzr, [x0, #44]
diff --git a/test/CodeGen/AArch64/aarch64-smull.ll b/test/CodeGen/AArch64/aarch64-smull.ll
new file mode 100644
index 0000000..92582d7
--- /dev/null
+++ b/test/CodeGen/AArch64/aarch64-smull.ll
@@ -0,0 +1,332 @@
+; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s -o - | FileCheck %s
+
+define <8 x i16> @smull_v8i8_v8i16(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+; CHECK-LABEL: smull_v8i8_v8i16:
+; CHECK: smull {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = sext <8 x i8> %tmp1 to <8 x i16>
+ %tmp4 = sext <8 x i8> %tmp2 to <8 x i16>
+ %tmp5 = mul <8 x i16> %tmp3, %tmp4
+ ret <8 x i16> %tmp5
+}
+
+define <4 x i32> @smull_v4i16_v4i32(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+; CHECK-LABEL: smull_v4i16_v4i32:
+; CHECK: smull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = sext <4 x i16> %tmp1 to <4 x i32>
+ %tmp4 = sext <4 x i16> %tmp2 to <4 x i32>
+ %tmp5 = mul <4 x i32> %tmp3, %tmp4
+ ret <4 x i32> %tmp5
+}
+
+define <2 x i64> @smull_v2i32_v2i64(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+; CHECK-LABEL: smull_v2i32_v2i64:
+; CHECK: smull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = sext <2 x i32> %tmp1 to <2 x i64>
+ %tmp4 = sext <2 x i32> %tmp2 to <2 x i64>
+ %tmp5 = mul <2 x i64> %tmp3, %tmp4
+ ret <2 x i64> %tmp5
+}
+
+define <8 x i16> @umull_v8i8_v8i16(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+; CHECK-LABEL: umull_v8i8_v8i16:
+; CHECK: umull {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = zext <8 x i8> %tmp1 to <8 x i16>
+ %tmp4 = zext <8 x i8> %tmp2 to <8 x i16>
+ %tmp5 = mul <8 x i16> %tmp3, %tmp4
+ ret <8 x i16> %tmp5
+}
+
+define <4 x i32> @umull_v4i16_v4i32(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+; CHECK-LABEL: umull_v4i16_v4i32:
+; CHECK: umull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = zext <4 x i16> %tmp1 to <4 x i32>
+ %tmp4 = zext <4 x i16> %tmp2 to <4 x i32>
+ %tmp5 = mul <4 x i32> %tmp3, %tmp4
+ ret <4 x i32> %tmp5
+}
+
+define <2 x i64> @umull_v2i32_v2i64(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+; CHECK-LABEL: umull_v2i32_v2i64:
+; CHECK: umull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = zext <2 x i32> %tmp1 to <2 x i64>
+ %tmp4 = zext <2 x i32> %tmp2 to <2 x i64>
+ %tmp5 = mul <2 x i64> %tmp3, %tmp4
+ ret <2 x i64> %tmp5
+}
+
+define <8 x i16> @smlal_v8i8_v8i16(<8 x i16>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
+; CHECK-LABEL: smlal_v8i8_v8i16:
+; CHECK: smlal {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = load <8 x i8>* %C
+ %tmp4 = sext <8 x i8> %tmp2 to <8 x i16>
+ %tmp5 = sext <8 x i8> %tmp3 to <8 x i16>
+ %tmp6 = mul <8 x i16> %tmp4, %tmp5
+ %tmp7 = add <8 x i16> %tmp1, %tmp6
+ ret <8 x i16> %tmp7
+}
+
+define <4 x i32> @smlal_v4i16_v4i32(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
+; CHECK-LABEL: smlal_v4i16_v4i32:
+; CHECK: smlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = load <4 x i16>* %C
+ %tmp4 = sext <4 x i16> %tmp2 to <4 x i32>
+ %tmp5 = sext <4 x i16> %tmp3 to <4 x i32>
+ %tmp6 = mul <4 x i32> %tmp4, %tmp5
+ %tmp7 = add <4 x i32> %tmp1, %tmp6
+ ret <4 x i32> %tmp7
+}
+
+define <2 x i64> @smlal_v2i32_v2i64(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
+; CHECK-LABEL: smlal_v2i32_v2i64:
+; CHECK: smlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = load <2 x i32>* %C
+ %tmp4 = sext <2 x i32> %tmp2 to <2 x i64>
+ %tmp5 = sext <2 x i32> %tmp3 to <2 x i64>
+ %tmp6 = mul <2 x i64> %tmp4, %tmp5
+ %tmp7 = add <2 x i64> %tmp1, %tmp6
+ ret <2 x i64> %tmp7
+}
+
+define <8 x i16> @umlal_v8i8_v8i16(<8 x i16>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
+; CHECK-LABEL: umlal_v8i8_v8i16:
+; CHECK: umlal {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = load <8 x i8>* %C
+ %tmp4 = zext <8 x i8> %tmp2 to <8 x i16>
+ %tmp5 = zext <8 x i8> %tmp3 to <8 x i16>
+ %tmp6 = mul <8 x i16> %tmp4, %tmp5
+ %tmp7 = add <8 x i16> %tmp1, %tmp6
+ ret <8 x i16> %tmp7
+}
+
+define <4 x i32> @umlal_v4i16_v4i32(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
+; CHECK-LABEL: umlal_v4i16_v4i32:
+; CHECK: umlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = load <4 x i16>* %C
+ %tmp4 = zext <4 x i16> %tmp2 to <4 x i32>
+ %tmp5 = zext <4 x i16> %tmp3 to <4 x i32>
+ %tmp6 = mul <4 x i32> %tmp4, %tmp5
+ %tmp7 = add <4 x i32> %tmp1, %tmp6
+ ret <4 x i32> %tmp7
+}
+
+define <2 x i64> @umlal_v2i32_v2i64(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
+; CHECK-LABEL: umlal_v2i32_v2i64:
+; CHECK: umlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = load <2 x i32>* %C
+ %tmp4 = zext <2 x i32> %tmp2 to <2 x i64>
+ %tmp5 = zext <2 x i32> %tmp3 to <2 x i64>
+ %tmp6 = mul <2 x i64> %tmp4, %tmp5
+ %tmp7 = add <2 x i64> %tmp1, %tmp6
+ ret <2 x i64> %tmp7
+}
+
+define <8 x i16> @smlsl_v8i8_v8i16(<8 x i16>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
+; CHECK-LABEL: smlsl_v8i8_v8i16:
+; CHECK: smlsl {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = load <8 x i8>* %C
+ %tmp4 = sext <8 x i8> %tmp2 to <8 x i16>
+ %tmp5 = sext <8 x i8> %tmp3 to <8 x i16>
+ %tmp6 = mul <8 x i16> %tmp4, %tmp5
+ %tmp7 = sub <8 x i16> %tmp1, %tmp6
+ ret <8 x i16> %tmp7
+}
+
+define <4 x i32> @smlsl_v4i16_v4i32(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
+; CHECK-LABEL: smlsl_v4i16_v4i32:
+; CHECK: smlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = load <4 x i16>* %C
+ %tmp4 = sext <4 x i16> %tmp2 to <4 x i32>
+ %tmp5 = sext <4 x i16> %tmp3 to <4 x i32>
+ %tmp6 = mul <4 x i32> %tmp4, %tmp5
+ %tmp7 = sub <4 x i32> %tmp1, %tmp6
+ ret <4 x i32> %tmp7
+}
+
+define <2 x i64> @smlsl_v2i32_v2i64(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
+; CHECK-LABEL: smlsl_v2i32_v2i64:
+; CHECK: smlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = load <2 x i32>* %C
+ %tmp4 = sext <2 x i32> %tmp2 to <2 x i64>
+ %tmp5 = sext <2 x i32> %tmp3 to <2 x i64>
+ %tmp6 = mul <2 x i64> %tmp4, %tmp5
+ %tmp7 = sub <2 x i64> %tmp1, %tmp6
+ ret <2 x i64> %tmp7
+}
+
+define <8 x i16> @umlsl_v8i8_v8i16(<8 x i16>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
+; CHECK-LABEL: umlsl_v8i8_v8i16:
+; CHECK: umlsl {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = load <8 x i8>* %C
+ %tmp4 = zext <8 x i8> %tmp2 to <8 x i16>
+ %tmp5 = zext <8 x i8> %tmp3 to <8 x i16>
+ %tmp6 = mul <8 x i16> %tmp4, %tmp5
+ %tmp7 = sub <8 x i16> %tmp1, %tmp6
+ ret <8 x i16> %tmp7
+}
+
+define <4 x i32> @umlsl_v4i16_v4i32(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
+; CHECK-LABEL: umlsl_v4i16_v4i32:
+; CHECK: umlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = load <4 x i16>* %C
+ %tmp4 = zext <4 x i16> %tmp2 to <4 x i32>
+ %tmp5 = zext <4 x i16> %tmp3 to <4 x i32>
+ %tmp6 = mul <4 x i32> %tmp4, %tmp5
+ %tmp7 = sub <4 x i32> %tmp1, %tmp6
+ ret <4 x i32> %tmp7
+}
+
+define <2 x i64> @umlsl_v2i32_v2i64(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
+; CHECK-LABEL: umlsl_v2i32_v2i64:
+; CHECK: umlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = load <2 x i32>* %C
+ %tmp4 = zext <2 x i32> %tmp2 to <2 x i64>
+ %tmp5 = zext <2 x i32> %tmp3 to <2 x i64>
+ %tmp6 = mul <2 x i64> %tmp4, %tmp5
+ %tmp7 = sub <2 x i64> %tmp1, %tmp6
+ ret <2 x i64> %tmp7
+}
+
+; SMULL recognizing BUILD_VECTORs with sign/zero-extended elements.
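+;
+; Informal sketch of the condition: a splat such as <8 x i16> <i16 -12, ...>
+; can be treated as sext of <8 x i8> <i8 -12, ...> because -12 fits in the
+; signed i8 range [-128, 127], so sext(%arg) * splat selects to a single
+; smull; -999 does not fit, which is why smull_noextvec_v8i8_v8i16 below
+; falls back to movz + mul.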
+define <8 x i16> @smull_extvec_v8i8_v8i16(<8 x i8> %arg) nounwind {
+; CHECK-LABEL: smull_extvec_v8i8_v8i16:
+; CHECK: smull {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+ %tmp3 = sext <8 x i8> %arg to <8 x i16>
+ %tmp4 = mul <8 x i16> %tmp3, <i16 -12, i16 -12, i16 -12, i16 -12, i16 -12, i16 -12, i16 -12, i16 -12>
+ ret <8 x i16> %tmp4
+}
+
+define <8 x i16> @smull_noextvec_v8i8_v8i16(<8 x i8> %arg) nounwind {
+; Do not use SMULL if the BUILD_VECTOR element values are too big.
+; CHECK-LABEL: smull_noextvec_v8i8_v8i16:
+; CHECK: movz
+; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+ %tmp3 = sext <8 x i8> %arg to <8 x i16>
+ %tmp4 = mul <8 x i16> %tmp3, <i16 -999, i16 -999, i16 -999, i16 -999, i16 -999, i16 -999, i16 -999, i16 -999>
+ ret <8 x i16> %tmp4
+}
+
+define <4 x i32> @smull_extvec_v4i16_v4i32(<4 x i16> %arg) nounwind {
+; CHECK-LABEL: smull_extvec_v4i16_v4i32:
+; CHECK: smull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+ %tmp3 = sext <4 x i16> %arg to <4 x i32>
+ %tmp4 = mul <4 x i32> %tmp3, <i32 -12, i32 -12, i32 -12, i32 -12>
+ ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @smull_extvec_v2i32_v2i64(<2 x i32> %arg) nounwind {
+; CHECK-LABEL: smull_extvec_v2i32_v2i64:
+; CHECK: smull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+ %tmp3 = sext <2 x i32> %arg to <2 x i64>
+ %tmp4 = mul <2 x i64> %tmp3, <i64 -1234, i64 -1234>
+ ret <2 x i64> %tmp4
+}
+
+define <8 x i16> @umull_extvec_v8i8_v8i16(<8 x i8> %arg) nounwind {
+; CHECK-LABEL: umull_extvec_v8i8_v8i16:
+; CHECK: umull {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+ %tmp3 = zext <8 x i8> %arg to <8 x i16>
+ %tmp4 = mul <8 x i16> %tmp3, <i16 12, i16 12, i16 12, i16 12, i16 12, i16 12, i16 12, i16 12>
+ ret <8 x i16> %tmp4
+}
+
+define <8 x i16> @umull_noextvec_v8i8_v8i16(<8 x i8> %arg) nounwind {
+; Do not use UMULL if the BUILD_VECTOR element values are too big.
+; CHECK-LABEL: umull_noextvec_v8i8_v8i16:
+; CHECK: movz
+; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+ %tmp3 = zext <8 x i8> %arg to <8 x i16>
+ %tmp4 = mul <8 x i16> %tmp3, <i16 999, i16 999, i16 999, i16 999, i16 999, i16 999, i16 999, i16 999>
+ ret <8 x i16> %tmp4
+}
+
+define <4 x i32> @umull_extvec_v4i16_v4i32(<4 x i16> %arg) nounwind {
+; CHECK-LABEL: umull_extvec_v4i16_v4i32:
+; CHECK: umull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+ %tmp3 = zext <4 x i16> %arg to <4 x i32>
+ %tmp4 = mul <4 x i32> %tmp3, <i32 1234, i32 1234, i32 1234, i32 1234>
+ ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @umull_extvec_v2i32_v2i64(<2 x i32> %arg) nounwind {
+; CHECK-LABEL: umull_extvec_v2i32_v2i64:
+; CHECK: umull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+ %tmp3 = zext <2 x i32> %arg to <2 x i64>
+ %tmp4 = mul <2 x i64> %tmp3, <i64 1234, i64 1234>
+ ret <2 x i64> %tmp4
+}
+
+define i16 @smullWithInconsistentExtensions(<8 x i8> %vec) {
+; If one operand has a zero-extend and the other a sign-extend, smull
+; cannot be used.
+; CHECK-LABEL: smullWithInconsistentExtensions:
+; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+ %1 = sext <8 x i8> %vec to <8 x i16>
+ %2 = mul <8 x i16> %1, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
+ %3 = extractelement <8 x i16> %2, i32 0
+ ret i16 %3
+}
+
+define void @distribute(i16* %dst, i8* %src, i32 %mul) nounwind {
+entry:
+; CHECK-LABEL: distribute:
+; CHECK: umull [[REG1:(v[0-9]+.8h)]], {{v[0-9]+}}.8b, [[REG2:(v[0-9]+.8b)]]
+; CHECK: umlal [[REG1]], {{v[0-9]+}}.8b, [[REG2]]
+ %0 = trunc i32 %mul to i8
+ %1 = insertelement <8 x i8> undef, i8 %0, i32 0
+ %2 = shufflevector <8 x i8> %1, <8 x i8> undef, <8 x i32> zeroinitializer
+ %3 = tail call <16 x i8> @llvm.aarch64.neon.vld1.v16i8(i8* %src, i32 1)
+ %4 = bitcast <16 x i8> %3 to <2 x double>
+ %5 = extractelement <2 x double> %4, i32 1
+ %6 = bitcast double %5 to <8 x i8>
+ %7 = zext <8 x i8> %6 to <8 x i16>
+ %8 = zext <8 x i8> %2 to <8 x i16>
+ %9 = extractelement <2 x double> %4, i32 0
+ %10 = bitcast double %9 to <8 x i8>
+ %11 = zext <8 x i8> %10 to <8 x i16>
+ %12 = add <8 x i16> %7, %11
+ %13 = mul <8 x i16> %12, %8
+ %14 = bitcast i16* %dst to i8*
+ tail call void @llvm.aarch64.neon.vst1.v8i16(i8* %14, <8 x i16> %13, i32 2)
+ ret void
+}
+
+declare <16 x i8> @llvm.aarch64.neon.vld1.v16i8(i8*, i32) nounwind readonly
+
+declare void @llvm.aarch64.neon.vst1.v8i16(i8*, <8 x i16>, i32) nounwind
+
diff --git a/test/CodeGen/AArch64/aarch64-wide-shuffle.ll b/test/CodeGen/AArch64/aarch64-wide-shuffle.ll
new file mode 100644
index 0000000..d06df7a
--- /dev/null
+++ b/test/CodeGen/AArch64/aarch64-wide-shuffle.ll
@@ -0,0 +1,22 @@
+; RUN: llc < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-unknown-linux-gnu"
+
+define <4 x i16> @f(<4 x i32> %vqdmlal_v3.i, <8 x i16> %x5) {
+entry:
+ ; Check that we don't just dup the input vector. The code emitted is ext, dup,
+ ; ext, ext, but we only match the last three instructions, as the first two
+ ; could be combined into a dup2 at some stage.
+ ; CHECK: dup
+ ; CHECK: ext
+ ; CHECK: ext
+ %x4 = extractelement <4 x i32> %vqdmlal_v3.i, i32 2
+ %vgetq_lane = trunc i32 %x4 to i16
+ %vecinit.i = insertelement <4 x i16> undef, i16 %vgetq_lane, i32 0
+ %vecinit2.i = insertelement <4 x i16> %vecinit.i, i16 %vgetq_lane, i32 2
+ %vecinit3.i = insertelement <4 x i16> %vecinit2.i, i16 %vgetq_lane, i32 3
+ %vgetq_lane261 = extractelement <8 x i16> %x5, i32 0
+ %vset_lane267 = insertelement <4 x i16> %vecinit3.i, i16 %vgetq_lane261, i32 1
+ ret <4 x i16> %vset_lane267
+}
diff --git a/test/CodeGen/AArch64/aarch64_f16_be.ll b/test/CodeGen/AArch64/aarch64_f16_be.ll
new file mode 100644
index 0000000..7504439
--- /dev/null
+++ b/test/CodeGen/AArch64/aarch64_f16_be.ll
@@ -0,0 +1,67 @@
+; RUN: llc -mtriple=aarch64-linux-gnuabi -O0 < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64_be-linux-gnuabi -O0 < %s | FileCheck %s --check-prefix=CHECK-BE
+
+define void @test_bitcast_v8f16_to_v4f32(<8 x half> %a) {
+; CHECK-LABEL: test_bitcast_v8f16_to_v4f32:
+; CHECK-NOT: st1
+
+; CHECK-BE-LABEL: test_bitcast_v8f16_to_v4f32:
+; CHECK-BE: st1
+
+ %x = alloca <4 x float>, align 16
+ %y = bitcast <8 x half> %a to <4 x float>
+ store <4 x float> %y, <4 x float>* %x, align 16
+ ret void
+}
+
+define void @test_bitcast_v8f16_to_v2f64(<8 x half> %a) {
+; CHECK-LABEL: test_bitcast_v8f16_to_v2f64:
+; CHECK-NOT: st1
+
+; CHECK-BE-LABEL: test_bitcast_v8f16_to_v2f64:
+; CHECK-BE: st1
+
+ %x = alloca <2 x double>, align 16
+ %y = bitcast <8 x half> %a to <2 x double>
+ store <2 x double> %y, <2 x double>* %x, align 16
+ ret void
+}
+
+define void @test_bitcast_v8f16_to_fp128(<8 x half> %a) {
+; CHECK-LABEL: test_bitcast_v8f16_to_fp128:
+; CHECK-NOT: st1
+
+; CHECK-BE-LABEL: test_bitcast_v8f16_to_fp128:
+; CHECK-BE: st1
+
+ %x = alloca fp128, align 16
+ %y = bitcast <8 x half> %a to fp128
+ store fp128 %y, fp128* %x, align 16
+ ret void
+}
+
+define void @test_bitcast_v4f16_to_v2f32(<4 x half> %a) {
+; CHECK-LABEL: test_bitcast_v4f16_to_v2f32:
+; CHECK-NOT: st1
+
+; CHECK-BE-LABEL: test_bitcast_v4f16_to_v2f32:
+; CHECK-BE: st1
+
+ %x = alloca <2 x float>, align 8
+ %y = bitcast <4 x half> %a to <2 x float>
+ store <2 x float> %y, <2 x float>* %x, align 8
+ ret void
+}
+
+define void @test_bitcast_v4f16_to_v1f64(<4 x half> %a) {
+; CHECK-LABEL: test_bitcast_v4f16_to_v1f64:
+; CHECK-NOT: st1
+
+; CHECK-BE-LABEL: test_bitcast_v4f16_to_v1f64:
+; CHECK-BE: st1
+
+ %x = alloca <1 x double>, align 8
+ %y = bitcast <4 x half> %a to <1 x double>
+ store <1 x double> %y, <1 x double>* %x, align 8
+ ret void
+}
diff --git a/test/CodeGen/AArch64/aarch64_tree_tests.ll b/test/CodeGen/AArch64/aarch64_tree_tests.ll
new file mode 100644
index 0000000..08e506a
--- /dev/null
+++ b/test/CodeGen/AArch64/aarch64_tree_tests.ll
@@ -0,0 +1,42 @@
+; RUN: llc < %s | FileCheck %s
+
+; ModuleID = 'aarch64_tree_tests.bc'
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+target triple = "arm64--linux-gnu"
+
+; CHECK-LABEL: @aarch64_tree_tests_and
+; CHECK: .hword 32768
+; CHECK: .hword 32767
+; CHECK: .hword 4664
+; CHECK: .hword 32767
+; CHECK: .hword 32768
+; CHECK: .hword 32768
+; CHECK: .hword 0
+; CHECK: .hword 0
+
+; Function Attrs: nounwind readnone
+define <8 x i16> @aarch64_tree_tests_and(<8 x i16> %a) {
+entry:
+ %and = and <8 x i16> <i16 0, i16 undef, i16 undef, i16 0, i16 0, i16 undef, i16 undef, i16 0>, %a
+ %ret = add <8 x i16> %and, <i16 -32768, i16 32767, i16 4664, i16 32767, i16 -32768, i16 -32768, i16 0, i16 0>
+ ret <8 x i16> %ret
+}
+
+; CHECK-LABEL: @aarch64_tree_tests_or
+; CHECK: .hword 32768
+; CHECK: .hword 32766
+; CHECK: .hword 4664
+; CHECK: .hword 32766
+; CHECK: .hword 32768
+; CHECK: .hword 32768
+; CHECK: .hword 65535
+; CHECK: .hword 65535
+
+; Function Attrs: nounwind readnone
+define <8 x i16> @aarch64_tree_tests_or(<8 x i16> %a) {
+entry:
+ %or = or <8 x i16> <i16 -1, i16 undef, i16 undef, i16 -1, i16 -1, i16 undef, i16 undef, i16 -1>, %a
+ %ret = add <8 x i16> %or, <i16 -32767, i16 32767, i16 4665, i16 32767, i16 -32767, i16 -32767, i16 0, i16 0>
+ ret <8 x i16> %ret
+}
+
diff --git a/test/CodeGen/AArch64/adc.ll b/test/CodeGen/AArch64/adc.ll
index 892573b..0488ee2 100644
--- a/test/CodeGen/AArch64/adc.ll
+++ b/test/CodeGen/AArch64/adc.ll
@@ -1,5 +1,5 @@
; RUN: llc -verify-machineinstrs < %s -mtriple=arm64-apple-ios7.0 | FileCheck --check-prefix=CHECK --check-prefix=CHECK-LE %s
-; RUN: llc -verify-machineinstrs < %s -mtriple=arm64_be-none-linux-gnu | FileCheck --check-prefix=CHECK --check-prefix=CHECK-BE %s
+; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64_be-none-linux-gnu | FileCheck --check-prefix=CHECK --check-prefix=CHECK-BE %s
define i128 @test_simple(i128 %a, i128 %b, i128 %c) {
; CHECK-LABEL: test_simple:
diff --git a/test/CodeGen/AArch64/analyzecmp.ll b/test/CodeGen/AArch64/analyzecmp.ll
new file mode 100644
index 0000000..8962505
--- /dev/null
+++ b/test/CodeGen/AArch64/analyzecmp.ll
@@ -0,0 +1,32 @@
+; RUN: llc -O3 -mcpu=cortex-a57 < %s | FileCheck %s
+
+; CHECK-LABEL: @test
+; CHECK: tst [[CMP:x[0-9]+]], #0x8000000000000000
+; CHECK: csel [[R0:x[0-9]+]], [[S0:x[0-9]+]], [[S1:x[0-9]+]], eq
+; CHECK: csel [[R1:x[0-9]+]], [[S2:x[0-9]+]], [[S3:x[0-9]+]], eq
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+target triple = "arm64--linux-gnueabi"
+
+define void @test(i64 %a, i64* %ptr1, i64* %ptr2) #0 align 2 {
+entry:
+ %conv = and i64 %a, 4294967295
+ %add = add nsw i64 %conv, -1
+ %div = sdiv i64 %add, 64
+ %rem = srem i64 %add, 64
+ %cmp = icmp slt i64 %rem, 0
+ br i1 %cmp, label %if.then, label %exit
+
+if.then:
+ %add2 = add nsw i64 %rem, 64
+ %add3 = add i64 %div, -1
+ br label %exit
+
+exit:
+ %__n = phi i64 [ %add3, %if.then ], [ %div, %entry ]
+ %__n.0 = phi i64 [ %add2, %if.then ], [ %rem, %entry ]
+ store i64 %__n, i64* %ptr1
+ store i64 %__n.0, i64* %ptr2
+ ret void
+}
+
+
diff --git a/test/CodeGen/AArch64/and-mask-removal.ll b/test/CodeGen/AArch64/and-mask-removal.ll
new file mode 100644
index 0000000..f803b85
--- /dev/null
+++ b/test/CodeGen/AArch64/and-mask-removal.ll
@@ -0,0 +1,269 @@
+; RUN: llc -O0 -fast-isel=false -mtriple=arm64-apple-darwin < %s | FileCheck %s
+
+@board = common global [400 x i8] zeroinitializer, align 1
+@next_string = common global i32 0, align 4
+@string_number = common global [400 x i32] zeroinitializer, align 4
+
+; Function Attrs: nounwind ssp
+define void @new_position(i32 %pos) {
+entry:
+ %idxprom = sext i32 %pos to i64
+ %arrayidx = getelementptr inbounds [400 x i8]* @board, i64 0, i64 %idxprom
+ %tmp = load i8* %arrayidx, align 1
+ %.off = add i8 %tmp, -1
+ %switch = icmp ult i8 %.off, 2
+ br i1 %switch, label %if.then, label %if.end
+
+if.then: ; preds = %entry
+ %tmp1 = load i32* @next_string, align 4
+ %arrayidx8 = getelementptr inbounds [400 x i32]* @string_number, i64 0, i64 %idxprom
+ store i32 %tmp1, i32* %arrayidx8, align 4
+ br label %if.end
+
+if.end: ; preds = %if.then, %entry
+ ret void
+; CHECK-LABEL: new_position
+; CHECK-NOT: and
+; CHECK: ret
+}
+
+define zeroext i1 @test8_0(i8 zeroext %x) align 2 {
+entry:
+ %0 = add i8 %x, 74
+ %1 = icmp ult i8 %0, -20
+ br i1 %1, label %ret_true, label %ret_false
+ret_false:
+ ret i1 false
+ret_true:
+ ret i1 true
+; CHECK-LABEL: test8_0
+; CHECK: and
+; CHECK: ret
+}
+
+define zeroext i1 @test8_1(i8 zeroext %x) align 2 {
+entry:
+ %0 = add i8 %x, 246
+ %1 = icmp uge i8 %0, 90
+ br i1 %1, label %ret_true, label %ret_false
+ret_false:
+ ret i1 false
+ret_true:
+ ret i1 true
+; CHECK-LABEL: test8_1
+; CHECK-NOT: and
+; CHECK: ret
+}
+
+define zeroext i1 @test8_2(i8 zeroext %x) align 2 {
+entry:
+ %0 = add i8 %x, 227
+ %1 = icmp ne i8 %0, 179
+ br i1 %1, label %ret_true, label %ret_false
+ret_false:
+ ret i1 false
+ret_true:
+ ret i1 true
+; CHECK-LABEL: test8_2
+; CHECK-NOT: and
+; CHECK: ret
+}
+
+define zeroext i1 @test8_3(i8 zeroext %x) align 2 {
+entry:
+ %0 = add i8 %x, 201
+ %1 = icmp eq i8 %0, 154
+ br i1 %1, label %ret_true, label %ret_false
+ret_false:
+ ret i1 false
+ret_true:
+ ret i1 true
+; CHECK-LABEL: test8_3
+; CHECK-NOT: and
+; CHECK: ret
+}
+
+define zeroext i1 @test8_4(i8 zeroext %x) align 2 {
+entry:
+ %0 = add i8 %x, -79
+ %1 = icmp ne i8 %0, -40
+ br i1 %1, label %ret_true, label %ret_false
+ret_false:
+ ret i1 false
+ret_true:
+ ret i1 true
+; CHECK-LABEL: test8_4
+; CHECK-NOT: and
+; CHECK: ret
+}
+
+define zeroext i1 @test8_5(i8 zeroext %x) align 2 {
+entry:
+ %0 = add i8 %x, 133
+ %1 = icmp uge i8 %0, -105
+ br i1 %1, label %ret_true, label %ret_false
+ret_false:
+ ret i1 false
+ret_true:
+ ret i1 true
+; CHECK-LABEL: test8_5
+; CHECK: and
+; CHECK: ret
+}
+
+define zeroext i1 @test8_6(i8 zeroext %x) align 2 {
+entry:
+ %0 = add i8 %x, -58
+ %1 = icmp uge i8 %0, 155
+ br i1 %1, label %ret_true, label %ret_false
+ret_false:
+ ret i1 false
+ret_true:
+ ret i1 true
+; CHECK-LABEL: test8_6
+; CHECK: and
+; CHECK: ret
+}
+
+define zeroext i1 @test8_7(i8 zeroext %x) align 2 {
+entry:
+ %0 = add i8 %x, 225
+ %1 = icmp ult i8 %0, 124
+ br i1 %1, label %ret_true, label %ret_false
+ret_false:
+ ret i1 false
+ret_true:
+ ret i1 true
+; CHECK-LABEL: test8_7
+; CHECK-NOT: and
+; CHECK: ret
+}
+
+
+
+define zeroext i1 @test8_8(i8 zeroext %x) align 2 {
+entry:
+ %0 = add i8 %x, 190
+ %1 = icmp uge i8 %0, 1
+ br i1 %1, label %ret_true, label %ret_false
+ret_false:
+ ret i1 false
+ret_true:
+ ret i1 true
+; CHECK-LABEL: test8_8
+; CHECK-NOT: and
+; CHECK: ret
+}
+
+define zeroext i1 @test16_0(i16 zeroext %x) align 2 {
+entry:
+ %0 = add i16 %x, -46989
+ %1 = icmp ne i16 %0, -41903
+ br i1 %1, label %ret_true, label %ret_false
+ret_false:
+ ret i1 false
+ret_true:
+ ret i1 true
+; CHECK-LABEL: test16_0
+; CHECK-NOT: and
+; CHECK: ret
+}
+
+define zeroext i1 @test16_2(i16 zeroext %x) align 2 {
+entry:
+ %0 = add i16 %x, 16882
+ %1 = icmp ule i16 %0, -24837
+ br i1 %1, label %ret_true, label %ret_false
+ret_false:
+ ret i1 false
+ret_true:
+ ret i1 true
+; CHECK-LABEL: test16_2
+; CHECK: and
+; CHECK: ret
+}
+
+define zeroext i1 @test16_3(i16 zeroext %x) align 2 {
+entry:
+ %0 = add i16 %x, 29283
+ %1 = icmp ne i16 %0, 16947
+ br i1 %1, label %ret_true, label %ret_false
+ret_false:
+ ret i1 false
+ret_true:
+ ret i1 true
+; CHECK-LABEL: test16_3
+; CHECK-NOT: and
+; CHECK: ret
+}
+
+define zeroext i1 @test16_4(i16 zeroext %x) align 2 {
+entry:
+ %0 = add i16 %x, -35551
+ %1 = icmp uge i16 %0, 15677
+ br i1 %1, label %ret_true, label %ret_false
+ret_false:
+ ret i1 false
+ret_true:
+ ret i1 true
+; CHECK-LABEL: test16_4
+; CHECK: and
+; CHECK: ret
+}
+
+define zeroext i1 @test16_5(i16 zeroext %x) align 2 {
+entry:
+ %0 = add i16 %x, -25214
+ %1 = icmp ne i16 %0, -1932
+ br i1 %1, label %ret_true, label %ret_false
+ret_false:
+ ret i1 false
+ret_true:
+ ret i1 true
+; CHECK-LABEL: test16_5
+; CHECK-NOT: and
+; CHECK: ret
+}
+
+define zeroext i1 @test16_6(i16 zeroext %x) align 2 {
+entry:
+ %0 = add i16 %x, -32194
+ %1 = icmp uge i16 %0, -41215
+ br i1 %1, label %ret_true, label %ret_false
+ret_false:
+ ret i1 false
+ret_true:
+ ret i1 true
+; CHECK-LABEL: test16_6
+; CHECK-NOT: and
+; CHECK: ret
+}
+
+define zeroext i1 @test16_7(i16 zeroext %x) align 2 {
+entry:
+ %0 = add i16 %x, 9272
+ %1 = icmp uge i16 %0, -42916
+ br i1 %1, label %ret_true, label %ret_false
+ret_false:
+ ret i1 false
+ret_true:
+ ret i1 true
+; CHECK-LABEL: test16_7
+; CHECK: and
+; CHECK: ret
+}
+
+define zeroext i1 @test16_8(i16 zeroext %x) align 2 {
+entry:
+ %0 = add i16 %x, -63749
+ %1 = icmp ne i16 %0, 6706
+ br i1 %1, label %ret_true, label %ret_false
+ret_false:
+ ret i1 false
+ret_true:
+ ret i1 true
+; CHECK-LABEL: test16_8
+; CHECK-NOT: and
+; CHECK: ret
+}
+
diff --git a/test/CodeGen/AArch64/andandshift.ll b/test/CodeGen/AArch64/andandshift.ll
new file mode 100644
index 0000000..e2c7a09
--- /dev/null
+++ b/test/CodeGen/AArch64/andandshift.ll
@@ -0,0 +1,28 @@
+; RUN: llc -O3 < %s | FileCheck %s
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+target triple = "arm64--linux-gnu"
+
+; Function Attrs: nounwind readnone
+define i32 @test1(i8 %a) {
+; CHECK-LABEL: @test1
+; CHECK: ubfx {{w[0-9]+}}, w0, #3, #5
+entry:
+ %conv = zext i8 %a to i32
+ %shr1 = lshr i32 %conv, 3
+ ret i32 %shr1
+}
+
+; Function Attrs: nounwind readnone
+define i32 @test2(i8 %a) {
+; CHECK-LABEL: @test2
+; CHECK: and {{w[0-9]+}}, w0, #0xff
+; CHECK: ubfx {{w[0-9]+}}, w0, #3, #5
+entry:
+ %conv = zext i8 %a to i32
+ %cmp = icmp ugt i8 %a, 47
+ %shr5 = lshr i32 %conv, 3
+ %retval.0 = select i1 %cmp, i32 %shr5, i32 %conv
+ ret i32 %retval.0
+}
+
+
diff --git a/test/CodeGen/AArch64/arm64-2011-03-17-AsmPrinterCrash.ll b/test/CodeGen/AArch64/arm64-2011-03-17-AsmPrinterCrash.ll
index 2b083d8..e57a8c9 100644
--- a/test/CodeGen/AArch64/arm64-2011-03-17-AsmPrinterCrash.ll
+++ b/test/CodeGen/AArch64/arm64-2011-03-17-AsmPrinterCrash.ll
@@ -11,34 +11,34 @@ if.then24: ; preds = %entry
unreachable
if.else295: ; preds = %entry
- call void @llvm.dbg.declare(metadata !{i32* %do_tab_convert}, metadata !16), !dbg !18
+ call void @llvm.dbg.declare(metadata !{i32* %do_tab_convert}, metadata !16, metadata !{metadata !"0x102"}), !dbg !18
store i32 0, i32* %do_tab_convert, align 4, !dbg !19
unreachable
}
-declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
!llvm.dbg.gv = !{!0}
!llvm.dbg.sp = !{!1, !7, !10, !11, !12}
-!0 = metadata !{i32 589876, i32 0, metadata !1, metadata !"vsplive", metadata !"vsplive", metadata !"", metadata !2, i32 617, metadata !6, i32 1, i32 1, null, null} ; [ DW_TAG_variable ]
-!1 = metadata !{i32 589870, metadata !20, metadata !2, metadata !"drt_vsprintf", metadata !"drt_vsprintf", metadata !"", i32 616, metadata !4, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!2 = metadata !{i32 589865, metadata !20} ; [ DW_TAG_file_type ]
-!3 = metadata !{i32 589841, metadata !20, i32 12, metadata !"clang version 3.0 (http://llvm.org/git/clang.git git:/git/puzzlebox/clang.git/ c4d1aea01c4444eb81bdbf391f1be309127c3cf1)", i1 true, metadata !"", i32 0, metadata !21, metadata !21, null, null, null, metadata !""} ; [ DW_TAG_compile_unit ]
-!4 = metadata !{i32 589845, metadata !20, metadata !2, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !5, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!0 = metadata !{metadata !"0x34\00vsplive\00vsplive\00\00617\001\001", metadata !1, metadata !2, metadata !6, null, null} ; [ DW_TAG_variable ]
+!1 = metadata !{metadata !"0x2e\00drt_vsprintf\00drt_vsprintf\00\00616\000\001\000\006\00256\000\000", metadata !20, metadata !2, metadata !4, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!2 = metadata !{metadata !"0x29", metadata !20} ; [ DW_TAG_file_type ]
+!3 = metadata !{metadata !"0x11\0012\00clang version 3.0 (http://llvm.org/git/clang.git git:/git/puzzlebox/clang.git/ c4d1aea01c4444eb81bdbf391f1be309127c3cf1)\001\00\000\00\000", metadata !20, metadata !21, metadata !21, null, null, null} ; [ DW_TAG_compile_unit ]
+!4 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !20, metadata !2, null, metadata !5, i32 0} ; [ DW_TAG_subroutine_type ]
!5 = metadata !{metadata !6}
-!6 = metadata !{i32 589860, null, metadata !3, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
-!7 = metadata !{i32 589870, metadata !20, metadata !2, metadata !"putc_mem", metadata !"putc_mem", metadata !"", i32 30, metadata !8, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!8 = metadata !{i32 589845, metadata !20, metadata !2, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !9, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!6 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, metadata !3} ; [ DW_TAG_base_type ]
+!7 = metadata !{metadata !"0x2e\00putc_mem\00putc_mem\00\0030\001\001\000\006\00256\000\000", metadata !20, metadata !2, metadata !8, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!8 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !20, metadata !2, null, metadata !9, i32 0} ; [ DW_TAG_subroutine_type ]
!9 = metadata !{null}
-!10 = metadata !{i32 589870, metadata !20, metadata !2, metadata !"print_double", metadata !"print_double", metadata !"", i32 203, metadata !4, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!11 = metadata !{i32 589870, metadata !20, metadata !2, metadata !"print_number", metadata !"print_number", metadata !"", i32 75, metadata !4, i1 true, i1 true, i32 0, i32 0, i32 0, i32 256, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!12 = metadata !{i32 589870, metadata !20, metadata !2, metadata !"get_flags", metadata !"get_flags", metadata !"", i32 508, metadata !8, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
+!10 = metadata !{metadata !"0x2e\00print_double\00print_double\00\00203\001\001\000\006\00256\000\000", metadata !20, metadata !2, metadata !4, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!11 = metadata !{metadata !"0x2e\00print_number\00print_number\00\0075\001\001\000\006\00256\000\000", metadata !20, metadata !2, metadata !4, i32 0, null, null, null, null} ; [ DW_TAG_subprogram ]
+!12 = metadata !{metadata !"0x2e\00get_flags\00get_flags\00\00508\001\001\000\006\00256\000\000", metadata !20, metadata !2, metadata !8, null, null, null, null, null} ; [ DW_TAG_subprogram ]
!13 = metadata !{i32 653, i32 5, metadata !14, null}
-!14 = metadata !{i32 589835, metadata !20, metadata !15, i32 652, i32 35, i32 2} ; [ DW_TAG_lexical_block ]
-!15 = metadata !{i32 589835, metadata !20, metadata !1, i32 616, i32 1, i32 0} ; [ DW_TAG_lexical_block ]
-!16 = metadata !{i32 590080, metadata !17, metadata !"do_tab_convert", metadata !2, i32 853, metadata !6, i32 0, null} ; [ DW_TAG_auto_variable ]
-!17 = metadata !{i32 589835, metadata !20, metadata !14, i32 850, i32 12, i32 33} ; [ DW_TAG_lexical_block ]
+!14 = metadata !{metadata !"0xb\00652\0035\002", metadata !20, metadata !15} ; [ DW_TAG_lexical_block ]
+!15 = metadata !{metadata !"0xb\00616\001\000", metadata !20, metadata !1} ; [ DW_TAG_lexical_block ]
+!16 = metadata !{metadata !"0x100\00do_tab_convert\00853\000", metadata !17, metadata !2, metadata !6} ; [ DW_TAG_auto_variable ]
+!17 = metadata !{metadata !"0xb\00850\0012\0033", metadata !20, metadata !14} ; [ DW_TAG_lexical_block ]
!18 = metadata !{i32 853, i32 11, metadata !17, null}
!19 = metadata !{i32 853, i32 29, metadata !17, null}
!20 = metadata !{metadata !"print.i", metadata !"/Volumes/Ebi/echeng/radars/r9146594"}
diff --git a/test/CodeGen/AArch64/arm64-2011-10-18-LdStOptBug.ll b/test/CodeGen/AArch64/arm64-2011-10-18-LdStOptBug.ll
index 8f99bc3..a83f164 100644
--- a/test/CodeGen/AArch64/arm64-2011-10-18-LdStOptBug.ll
+++ b/test/CodeGen/AArch64/arm64-2011-10-18-LdStOptBug.ll
@@ -12,7 +12,7 @@ entry:
for.body:
; CHECK: for.body
-; CHECK: ldr w{{[0-9]+}}, [x{{[0-9]+}}]
+; CHECK: ldr w{{[0-9]+}}, [x{{[0-9]+}}, x{{[0-9]+}}]
; CHECK: add x[[REG:[0-9]+]],
; CHECK: x[[REG]], #1, lsl #12
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
diff --git a/test/CodeGen/AArch64/arm64-2012-06-06-FPToUI.ll b/test/CodeGen/AArch64/arm64-2012-06-06-FPToUI.ll
index 168e921..7d880f3 100644
--- a/test/CodeGen/AArch64/arm64-2012-06-06-FPToUI.ll
+++ b/test/CodeGen/AArch64/arm64-2012-06-06-FPToUI.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=arm64 -O0 < %s | FileCheck %s
-; RUN: llc -march=arm64 -O3 < %s | FileCheck %s
+; RUN: llc -march=arm64 -O0 -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -march=arm64 -O3 -verify-machineinstrs < %s | FileCheck %s
@.str = private unnamed_addr constant [9 x i8] c"%lf %lu\0A\00", align 1
@.str1 = private unnamed_addr constant [8 x i8] c"%lf %u\0A\00", align 1
diff --git a/test/CodeGen/AArch64/arm64-AdvSIMD-Scalar.ll b/test/CodeGen/AArch64/arm64-AdvSIMD-Scalar.ll
index c4597d5..6266d1c 100644
--- a/test/CodeGen/AArch64/arm64-AdvSIMD-Scalar.ll
+++ b/test/CodeGen/AArch64/arm64-AdvSIMD-Scalar.ll
@@ -1,15 +1,36 @@
-; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple -aarch64-simd-scalar=true -asm-verbose=false | FileCheck %s
-; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=generic -aarch64-simd-scalar=true -asm-verbose=false | FileCheck %s -check-prefix=GENERIC
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple -aarch64-simd-scalar=true -asm-verbose=false -disable-adv-copy-opt=true | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-NOOPT
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple -aarch64-simd-scalar=true -asm-verbose=false -disable-adv-copy-opt=false | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-OPT
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=generic -aarch64-simd-scalar=true -asm-verbose=false -disable-adv-copy-opt=true | FileCheck %s -check-prefix=GENERIC -check-prefix=GENERIC-NOOPT
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=generic -aarch64-simd-scalar=true -asm-verbose=false -disable-adv-copy-opt=false | FileCheck %s -check-prefix=GENERIC -check-prefix=GENERIC-OPT
define <2 x i64> @bar(<2 x i64> %a, <2 x i64> %b) nounwind readnone {
; CHECK-LABEL: bar:
; CHECK: add.2d v[[REG:[0-9]+]], v0, v1
; CHECK: add d[[REG3:[0-9]+]], d[[REG]], d1
+; Without advanced copy optimization, we end up with cross-register-bank
+; copies that cannot be coalesced.
+; CHECK-NOOPT: fmov [[COPY_REG3:x[0-9]+]], d[[REG3]]
+; With advanced copy optimization, we end up with just one copy
+; to insert the computed high part into the V register.
+; CHECK-OPT-NOT: fmov
; CHECK: sub d[[REG2:[0-9]+]], d[[REG]], d1
+; CHECK: fmov [[COPY_REG2:x[0-9]+]], d[[REG2]]
+; CHECK-NOOPT: fmov d0, [[COPY_REG3]]
+; CHECK-OPT-NOT: fmov
+; CHECK: ins.d v0[1], [[COPY_REG2]]
+; CHECK-NEXT: ret
+;
; GENERIC-LABEL: bar:
; GENERIC: add v[[REG:[0-9]+]].2d, v0.2d, v1.2d
; GENERIC: add d[[REG3:[0-9]+]], d[[REG]], d1
+; GENERIC-NOOPT: fmov [[COPY_REG3:x[0-9]+]], d[[REG3]]
+; GENERIC-OPT-NOT: fmov
; GENERIC: sub d[[REG2:[0-9]+]], d[[REG]], d1
+; GENERIC: fmov [[COPY_REG2:x[0-9]+]], d[[REG2]]
+; GENERIC-NOOPT: fmov d0, [[COPY_REG3]]
+; GENERIC-OPT-NOT: fmov
+; GENERIC: ins v0.d[1], [[COPY_REG2]]
+; GENERIC-NEXT: ret
%add = add <2 x i64> %a, %b
%vgetq_lane = extractelement <2 x i64> %add, i32 0
%vgetq_lane2 = extractelement <2 x i64> %b, i32 0
@@ -65,3 +86,44 @@ define double @add_sub_su64(<2 x i64> %a, <2 x i64> %b) nounwind readnone {
%retval = bitcast i64 %sub.i to double
ret double %retval
}
+define double @and_su64(<2 x i64> %a, <2 x i64> %b) nounwind readnone {
+; CHECK-LABEL: and_su64:
+; CHECK: and.8b v0, v1, v0
+; CHECK-NEXT: ret
+; GENERIC-LABEL: and_su64:
+; GENERIC: and v0.8b, v1.8b, v0.8b
+; GENERIC-NEXT: ret
+ %vecext = extractelement <2 x i64> %a, i32 0
+ %vecext1 = extractelement <2 x i64> %b, i32 0
+ %or.i = and i64 %vecext1, %vecext
+ %retval = bitcast i64 %or.i to double
+ ret double %retval
+}
+
+define double @orr_su64(<2 x i64> %a, <2 x i64> %b) nounwind readnone {
+; CHECK-LABEL: orr_su64:
+; CHECK: orr.8b v0, v1, v0
+; CHECK-NEXT: ret
+; GENERIC-LABEL: orr_su64:
+; GENERIC: orr v0.8b, v1.8b, v0.8b
+; GENERIC-NEXT: ret
+ %vecext = extractelement <2 x i64> %a, i32 0
+ %vecext1 = extractelement <2 x i64> %b, i32 0
+ %or.i = or i64 %vecext1, %vecext
+ %retval = bitcast i64 %or.i to double
+ ret double %retval
+}
+
+define double @xorr_su64(<2 x i64> %a, <2 x i64> %b) nounwind readnone {
+; CHECK-LABEL: xorr_su64:
+; CHECK: eor.8b v0, v1, v0
+; CHECK-NEXT: ret
+; GENERIC-LABEL: xorr_su64:
+; GENERIC: eor v0.8b, v1.8b, v0.8b
+; GENERIC-NEXT: ret
+ %vecext = extractelement <2 x i64> %a, i32 0
+ %vecext1 = extractelement <2 x i64> %b, i32 0
+ %xor.i = xor i64 %vecext1, %vecext
+ %retval = bitcast i64 %xor.i to double
+ ret double %retval
+}
diff --git a/test/CodeGen/AArch64/arm64-EXT-undef-mask.ll b/test/CodeGen/AArch64/arm64-EXT-undef-mask.ll
index 1b2d543..1bb47fc 100644
--- a/test/CodeGen/AArch64/arm64-EXT-undef-mask.ll
+++ b/test/CodeGen/AArch64/arm64-EXT-undef-mask.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -O0 -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
+; RUN: llc -O0 -march=arm64 -aarch64-neon-syntax=apple -verify-machineinstrs < %s | FileCheck %s
; The following 2 test cases test shufflevector with beginning UNDEF mask.
define <8 x i16> @test_vext_undef_traverse(<8 x i16> %in) {
diff --git a/test/CodeGen/AArch64/arm64-aapcs-be.ll b/test/CodeGen/AArch64/arm64-aapcs-be.ll
new file mode 100644
index 0000000..77e2b0f
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-aapcs-be.ll
@@ -0,0 +1,24 @@
+; RUN: llc -mtriple=aarch64_be-none-eabi -fast-isel=false < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64_be-none-eabi -fast-isel=true < %s | FileCheck %s
+
+; Check narrow argument passing via stack - callee end
+define i32 @test_narrow_args_callee(i64 %x0, i64 %x1, i64 %x2, i64 %x3, i64 %x4, i64 %x5, i64 %x6, i64 %x7, i8 %c, i16 %s) #0 {
+entry:
+ %conv = zext i8 %c to i32
+ %conv1 = sext i16 %s to i32
+ %add = add nsw i32 %conv1, %conv
+; CHECK-LABEL: test_narrow_args_callee:
+; CHECK-DAG: ldrb w{{[0-9]}}, [sp, #7]
+; CHECK-DAG: ldr{{s?}}h w{{[0-9]}}, [sp, #14]
+ ret i32 %add
+}
+
+; Check narrow argument passing via stack - caller end
+define i32 @test_narrow_args_caller() #0 {
+entry:
+ %call = tail call i32 @test_narrow_args_callee(i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i8 8, i16 9)
+; CHECK-LABEL: test_narrow_args_caller:
+; CHECK-DAG: strh w{{[0-9]}}, [sp, #14]
+; CHECK-DAG: strb w{{[0-9]}}, [sp, #7]
+ ret i32 %call
+}
\ No newline at end of file
diff --git a/test/CodeGen/AArch64/arm64-aapcs.ll b/test/CodeGen/AArch64/arm64-aapcs.ll
index ccf1371..41c3ad5 100644
--- a/test/CodeGen/AArch64/arm64-aapcs.ll
+++ b/test/CodeGen/AArch64/arm64-aapcs.ll
@@ -109,3 +109,45 @@ entry:
; CHECK: ldr {{q[0-9]+}}, [sp]
ret <2 x double> %varg_stack;
}
+
+; Check that f16 can be passed and returned (ACLE 2.0 extension)
+define half @test_half(float, half %arg) {
+; CHECK-LABEL: test_half:
+; CHECK: mov v0.16b, v1.16b
+ ret half %arg;
+}
+
+; Check that f16 constants are materialized correctly
+define half @test_half_const() {
+; CHECK-LABEL: test_half_const:
+; CHECK: ldr h0, [x{{[0-9]+}}, :lo12:{{.*}}]
+ ret half 0xH4248
+}
+
+; Check that v4f16 can be passed and returned in registers
+define <4 x half> @test_v4_half_register(float, <4 x half> %arg) {
+; CHECK-LABEL: test_v4_half_register:
+; CHECK: mov v0.16b, v1.16b
+ ret <4 x half> %arg;
+}
+
+; Check that v8f16 can be passed and returned in registers
+define <8 x half> @test_v8_half_register(float, <8 x half> %arg) {
+; CHECK-LABEL: test_v8_half_register:
+; CHECK: mov v0.16b, v1.16b
+ ret <8 x half> %arg;
+}
+
+; Check that v4f16 can be passed and returned on the stack
+define <4 x half> @test_v4_half_stack([8 x <2 x double>], <4 x half> %arg) {
+; CHECK-LABEL: test_v4_half_stack:
+; CHECK: ldr d0, [sp]
+ ret <4 x half> %arg;
+}
+
+; Check that v8f16 can be passed and returned on the stack
+define <8 x half> @test_v8_half_stack([8 x <2 x double>], <8 x half> %arg) {
+; CHECK-LABEL: test_v8_half_stack:
+; CHECK: ldr q0, [sp]
+ ret <8 x half> %arg;
+}
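As a worked decoding of the f16 constant above (a side note, not from the patch): 0xH4248 splits into sign 0, biased exponent 10000b = 16, and mantissa 0x248, so its value is

;   (1 + 0x248/0x400) * 2^(16-15) = 1.5703125 * 2 = 3.140625

that is, a half-precision approximation of pi, which the checked ldr materializes from the constant pool into h0.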
diff --git a/test/CodeGen/AArch64/arm64-abi.ll b/test/CodeGen/AArch64/arm64-abi.ll
index a955029..8a6b64d 100644
--- a/test/CodeGen/AArch64/arm64-abi.ll
+++ b/test/CodeGen/AArch64/arm64-abi.ll
@@ -1,7 +1,5 @@
-; RUN: llc < %s -debug -march=arm64 -mcpu=cyclone -enable-misched=false | FileCheck %s
-; RUN: llc < %s -O0 | FileCheck -check-prefix=FAST %s
-; REQUIRES: asserts
-target triple = "arm64-apple-darwin"
+; RUN: llc -mtriple=arm64-apple-darwin -mcpu=cyclone -enable-misched=false < %s | FileCheck %s
+; RUN: llc -O0 -mtriple=arm64-apple-darwin < %s | FileCheck --check-prefix=FAST %s
; rdar://9932559
define i64 @i8i16callee(i64 %a1, i64 %a2, i64 %a3, i8 signext %a4, i16 signext %a5, i64 %a6, i64 %a7, i64 %a8, i8 signext %b1, i16 signext %b2, i8 signext %b3, i8 signext %b4) nounwind readnone noinline {
@@ -42,7 +40,7 @@ entry:
define i32 @i8i16caller() nounwind readnone {
entry:
-; CHECK: i8i16caller
+; CHECK-LABEL: i8i16caller
; The 8th, 9th, 10th and 11th arguments are passed at sp, sp+2, sp+4, sp+5.
; They are i8, i16, i8 and i8.
; CHECK-DAG: strb {{w[0-9]+}}, [sp, #5]
@@ -50,7 +48,7 @@ entry:
; CHECK-DAG: strh {{w[0-9]+}}, [sp, #2]
; CHECK-DAG: strb {{w[0-9]+}}, [sp]
; CHECK: bl
-; FAST: i8i16caller
+; FAST-LABEL: i8i16caller
; FAST: strb {{w[0-9]+}}, [sp]
; FAST: strh {{w[0-9]+}}, [sp, #2]
; FAST: strb {{w[0-9]+}}, [sp, #4]
@@ -64,7 +62,7 @@ entry:
; rdar://12651543
define double @circle_center([2 x float] %a) nounwind ssp {
%call = tail call double @ext([2 x float] %a) nounwind
-; CHECK: circle_center
+; CHECK-LABEL: circle_center
; CHECK: bl
ret double %call
}
@@ -75,10 +73,10 @@ declare double @ext([2 x float])
; A double argument will be passed on the stack, so the vector should be at sp+16.
define double @fixed_4i(<4 x i32>* nocapture %in) nounwind {
entry:
-; CHECK: fixed_4i
+; CHECK-LABEL: fixed_4i
; CHECK: str [[REG_1:q[0-9]+]], [sp, #16]
-; FAST: fixed_4i
-; FAST: sub sp, sp, #64
+; FAST-LABEL: fixed_4i
+; FAST: sub sp, sp
; FAST: mov x[[ADDR:[0-9]+]], sp
; FAST: str [[REG_1:q[0-9]+]], [x[[ADDR]], #16]
%0 = load <4 x i32>* %in, align 16
@@ -93,7 +91,7 @@ declare double @args_vec_4i(double, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>,
define void @test1(float %f1, double %d1, double %d2, double %d3, double %d4,
double %d5, double %d6, double %d7, double %d8, i32 %i) nounwind ssp {
entry:
-; CHECK: test1
+; CHECK-LABEL: test1
; CHECK: ldr [[REG_1:d[0-9]+]], [sp]
; CHECK: scvtf [[REG_2:s[0-9]+]], w0
; CHECK: fadd s0, [[REG_2]], s0
@@ -110,7 +108,7 @@ entry:
define void @test2(i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6,
i32 %i7, i32 %i8, i32 %i9, float %d1) nounwind ssp {
entry:
-; CHECK: test2
+; CHECK-LABEL: test2
; CHECK: scvtf [[REG_2:s[0-9]+]], w0
; CHECK: fadd s0, [[REG_2]], s0
; CHECK: ldr [[REG_1:s[0-9]+]], [sp]
@@ -129,9 +127,9 @@ entry:
; Check alignment on stack for v64, f64, i64, f32, i32.
define double @test3(<2 x i32>* nocapture %in) nounwind {
entry:
-; CHECK: test3
+; CHECK-LABEL: test3
; CHECK: str [[REG_1:d[0-9]+]], [sp, #8]
-; FAST: test3
+; FAST-LABEL: test3
; FAST: sub sp, sp, #32
; FAST: mov x[[ADDR:[0-9]+]], sp
; FAST: str [[REG_1:d[0-9]+]], [x[[ADDR]], #8]
@@ -146,7 +144,7 @@ declare double @args_vec_2i(double, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>,
define double @test4(double* nocapture %in) nounwind {
entry:
-; CHECK: test4
+; CHECK-LABEL: test4
; CHECK: str [[REG_1:d[0-9]+]], [sp, #8]
; CHECK: str [[REG_2:w[0-9]+]], [sp]
; CHECK: orr w0, wzr, #0x3
@@ -161,7 +159,7 @@ declare double @args_f64(double, double, double, double, double, double, double,
define i64 @test5(i64* nocapture %in) nounwind {
entry:
-; CHECK: test5
+; CHECK-LABEL: test5
; CHECK: strb [[REG_3:w[0-9]+]], [sp, #16]
; CHECK: str [[REG_1:x[0-9]+]], [sp, #8]
; CHECK: str [[REG_2:w[0-9]+]], [sp]
@@ -175,7 +173,7 @@ declare i64 @args_i64(i64, i64, i64, i64, i64, i64, i64, i64, i32, i64,
define i32 @test6(float* nocapture %in) nounwind {
entry:
-; CHECK: test6
+; CHECK-LABEL: test6
; CHECK: strb [[REG_2:w[0-9]+]], [sp, #8]
; CHECK: str [[REG_1:s[0-9]+]], [sp, #4]
; CHECK: strh [[REG_3:w[0-9]+]], [sp]
@@ -192,7 +190,7 @@ declare i32 @args_f32(i32, i32, i32, i32, i32, i32, i32, i32,
define i32 @test7(i32* nocapture %in) nounwind {
entry:
-; CHECK: test7
+; CHECK-LABEL: test7
; CHECK: strb [[REG_2:w[0-9]+]], [sp, #8]
; CHECK: str [[REG_1:w[0-9]+]], [sp, #4]
; CHECK: strh [[REG_3:w[0-9]+]], [sp]
@@ -206,13 +204,13 @@ declare i32 @args_i32(i32, i32, i32, i32, i32, i32, i32, i32, i16 signext, i32,
define i32 @test8(i32 %argc, i8** nocapture %argv) nounwind {
entry:
-; CHECK: test8
+; CHECK-LABEL: test8
; CHECK: strb {{w[0-9]+}}, [sp, #3]
; CHECK: strb wzr, [sp, #2]
; CHECK: strb {{w[0-9]+}}, [sp, #1]
; CHECK: strb wzr, [sp]
; CHECK: bl
-; FAST: test8
+; FAST-LABEL: test8
; FAST: strb {{w[0-9]+}}, [sp]
; FAST: strb {{w[0-9]+}}, [sp, #1]
; FAST: strb {{w[0-9]+}}, [sp, #2]
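The stack offsets checked in i8i16caller earlier in this file follow from natural alignment under the Darwin variant of the AAPCS, which packs small stack arguments instead of widening them to 8-byte slots (a sketch of the inference, not text from the patch):

;   i8  b1 -> [sp]       first free byte, 1-byte aligned
;   i16 b2 -> [sp, #2]   sp+1 rounded up to 2-byte alignment
;   i8  b3 -> [sp, #4]   next free byte
;   i8  b4 -> [sp, #5]   next free byte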
diff --git a/test/CodeGen/AArch64/arm64-abi_align.ll b/test/CodeGen/AArch64/arm64-abi_align.ll
index 44c5a07..deb740e 100644
--- a/test/CodeGen/AArch64/arm64-abi_align.ll
+++ b/test/CodeGen/AArch64/arm64-abi_align.ll
@@ -34,7 +34,7 @@ target triple = "arm64-apple-darwin"
; structs with size < 8 bytes, passed via i64 in x1 and x2
define i32 @f38(i32 %i, i64 %s1.coerce, i64 %s2.coerce) #0 {
entry:
-; CHECK: f38
+; CHECK-LABEL: f38
; CHECK: add w[[A:[0-9]+]], w1, w0
; CHECK: add {{w[0-9]+}}, w[[A]], w2
%s1.sroa.0.0.extract.trunc = trunc i64 %s1.coerce to i32
@@ -56,7 +56,7 @@ entry:
define i32 @caller38() #1 {
entry:
-; CHECK: caller38
+; CHECK-LABEL: caller38
; CHECK: ldr x1,
; CHECK: ldr x2,
%0 = load i64* bitcast (%struct.s38* @g38 to i64*), align 4
@@ -72,7 +72,7 @@ declare i32 @f38_stack(i32 %i, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6,
; i9 at [sp]
define i32 @caller38_stack() #1 {
entry:
-; CHECK: caller38_stack
+; CHECK-LABEL: caller38_stack
; CHECK: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #8]
; CHECK: movz w[[C:[0-9]+]], #0x9
; CHECK: str w[[C]], [sp]
@@ -87,7 +87,7 @@ entry:
; passed via i128 in x1 and x3
define i32 @f39(i32 %i, i128 %s1.coerce, i128 %s2.coerce) #0 {
entry:
-; CHECK: f39
+; CHECK-LABEL: f39
; CHECK: add w[[A:[0-9]+]], w1, w0
; CHECK: add {{w[0-9]+}}, w[[A]], w3
%s1.sroa.0.0.extract.trunc = trunc i128 %s1.coerce to i32
@@ -109,7 +109,7 @@ entry:
define i32 @caller39() #1 {
entry:
-; CHECK: caller39
+; CHECK-LABEL: caller39
; CHECK: ldp x1, x2,
; CHECK: ldp x3, x4,
%0 = load i128* bitcast (%struct.s39* @g39 to i128*), align 16
@@ -125,7 +125,7 @@ declare i32 @f39_stack(i32 %i, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6,
; passed on stack at [sp+16] and [sp+32]
define i32 @caller39_stack() #1 {
entry:
-; CHECK: caller39_stack
+; CHECK-LABEL: caller39_stack
; CHECK: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #32]
; CHECK: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #16]
; CHECK: movz w[[C:[0-9]+]], #0x9
@@ -141,7 +141,7 @@ entry:
; passed via i128 in x1 and x3
define i32 @f40(i32 %i, [2 x i64] %s1.coerce, [2 x i64] %s2.coerce) #0 {
entry:
-; CHECK: f40
+; CHECK-LABEL: f40
; CHECK: add w[[A:[0-9]+]], w1, w0
; CHECK: add {{w[0-9]+}}, w[[A]], w3
%s1.coerce.fca.0.extract = extractvalue [2 x i64] %s1.coerce, 0
@@ -165,7 +165,7 @@ entry:
define i32 @caller40() #1 {
entry:
-; CHECK: caller40
+; CHECK-LABEL: caller40
; CHECK: ldp x1, x2,
; CHECK: ldp x3, x4,
%0 = load [2 x i64]* bitcast (%struct.s40* @g40 to [2 x i64]*), align 4
@@ -181,7 +181,7 @@ declare i32 @f40_stack(i32 %i, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6,
; passed on stack at [sp+8] and [sp+24]
define i32 @caller40_stack() #1 {
entry:
-; CHECK: caller40_stack
+; CHECK-LABEL: caller40_stack
; CHECK: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #24]
; CHECK: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #8]
; CHECK: movz w[[C:[0-9]+]], #0x9
@@ -197,7 +197,7 @@ entry:
; passed via i128 in x1 and x3
define i32 @f41(i32 %i, i128 %s1.coerce, i128 %s2.coerce) #0 {
entry:
-; CHECK: f41
+; CHECK-LABEL: f41
; CHECK: add w[[A:[0-9]+]], w1, w0
; CHECK: add {{w[0-9]+}}, w[[A]], w3
%s1.sroa.0.0.extract.trunc = trunc i128 %s1.coerce to i32
@@ -219,7 +219,7 @@ entry:
define i32 @caller41() #1 {
entry:
-; CHECK: caller41
+; CHECK-LABEL: caller41
; CHECK: ldp x1, x2,
; CHECK: ldp x3, x4,
%0 = load i128* bitcast (%struct.s41* @g41 to i128*), align 16
@@ -235,7 +235,7 @@ declare i32 @f41_stack(i32 %i, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6,
; passed on stack at [sp+16] and [sp+32]
define i32 @caller41_stack() #1 {
entry:
-; CHECK: caller41_stack
+; CHECK-LABEL: caller41_stack
; CHECK: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #32]
; CHECK: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #16]
; CHECK: movz w[[C:[0-9]+]], #0x9
@@ -250,7 +250,7 @@ entry:
; structs with size of 22 bytes, passed indirectly in x1 and x2
define i32 @f42(i32 %i, %struct.s42* nocapture %s1, %struct.s42* nocapture %s2) #2 {
entry:
-; CHECK: f42
+; CHECK-LABEL: f42
; CHECK: ldr w[[A:[0-9]+]], [x1]
; CHECK: ldr w[[B:[0-9]+]], [x2]
; CHECK: add w[[C:[0-9]+]], w[[A]], w0
@@ -280,7 +280,7 @@ entry:
; For s1, we allocate a 22-byte space and pass its address via x1
define i32 @caller42() #3 {
entry:
-; CHECK: caller42
+; CHECK-LABEL: caller42
; CHECK: str {{x[0-9]+}}, [sp, #48]
; CHECK: str {{q[0-9]+}}, [sp, #32]
; CHECK: str {{x[0-9]+}}, [sp, #16]
@@ -290,7 +290,7 @@ entry:
; Space for s1 is allocated at sp+32
; Space for s2 is allocated at sp
-; FAST: caller42
+; FAST-LABEL: caller42
; FAST: sub sp, sp, #96
; Space for s1 is allocated at fp-24 = sp+72
; Space for s2 is allocated at sp+48
@@ -316,7 +316,7 @@ declare i32 @f42_stack(i32 %i, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6,
define i32 @caller42_stack() #3 {
entry:
-; CHECK: caller42_stack
+; CHECK-LABEL: caller42_stack
; CHECK: mov x29, sp
; CHECK: sub sp, sp, #96
; CHECK: stur {{x[0-9]+}}, [x29, #-16]
@@ -333,7 +333,7 @@ entry:
; CHECK: movz w[[C:[0-9]+]], #0x9
; CHECK: str w[[C]], [sp]
-; FAST: caller42_stack
+; FAST-LABEL: caller42_stack
; Space for s1 is allocated at fp-24
; Space for s2 is allocated at fp-48
; FAST: sub x[[A:[0-9]+]], x29, #24
@@ -359,12 +359,12 @@ entry:
; passed indirectly in x1 and x2
define i32 @f43(i32 %i, %struct.s43* nocapture %s1, %struct.s43* nocapture %s2) #2 {
entry:
-; CHECK: f43
+; CHECK-LABEL: f43
; CHECK: ldr w[[A:[0-9]+]], [x1]
; CHECK: ldr w[[B:[0-9]+]], [x2]
; CHECK: add w[[C:[0-9]+]], w[[A]], w0
; CHECK: add {{w[0-9]+}}, w[[C]], w[[B]]
-; FAST: f43
+; FAST-LABEL: f43
; FAST: ldr w[[A:[0-9]+]], [x1]
; FAST: ldr w[[B:[0-9]+]], [x2]
; FAST: add w[[C:[0-9]+]], w[[A]], w0
@@ -388,7 +388,7 @@ entry:
define i32 @caller43() #3 {
entry:
-; CHECK: caller43
+; CHECK-LABEL: caller43
; CHECK: str {{q[0-9]+}}, [sp, #48]
; CHECK: str {{q[0-9]+}}, [sp, #32]
; CHECK: str {{q[0-9]+}}, [sp, #16]
@@ -398,7 +398,7 @@ entry:
; Space for s1 is allocated at sp+32
; Space for s2 is allocated at sp
-; FAST: caller43
+; FAST-LABEL: caller43
; FAST: mov x29, sp
; Space for s1 is allocated at sp+32
; Space for s2 is allocated at sp
@@ -428,7 +428,7 @@ declare i32 @f43_stack(i32 %i, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6,
define i32 @caller43_stack() #3 {
entry:
-; CHECK: caller43_stack
+; CHECK-LABEL: caller43_stack
; CHECK: mov x29, sp
; CHECK: sub sp, sp, #96
; CHECK: stur {{q[0-9]+}}, [x29, #-16]
@@ -445,7 +445,7 @@ entry:
; CHECK: movz w[[C:[0-9]+]], #0x9
; CHECK: str w[[C]], [sp]
-; FAST: caller43_stack
+; FAST-LABEL: caller43_stack
; FAST: sub sp, sp, #96
; Space for s1 is allocated at fp-32 = sp+64
; Space for s2 is allocated at sp+32
@@ -481,13 +481,13 @@ declare i32 @callee_i128_split(i32 %i, i32 %i2, i32 %i3, i32 %i4, i32 %i5,
define i32 @i128_split() {
entry:
-; CHECK: i128_split
+; CHECK-LABEL: i128_split
; "i128 %0" should be on stack at [sp].
; "i32 8" should be on stack at [sp, #16].
; CHECK: str {{w[0-9]+}}, [sp, #16]
; CHECK: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp]
-; FAST: i128_split
-; FAST: sub sp, sp, #48
+; FAST-LABEL: i128_split
+; FAST: sub sp, sp
; FAST: mov x[[ADDR:[0-9]+]], sp
; FAST: str {{w[0-9]+}}, [x[[ADDR]], #16]
; Load/Store opt is disabled with -O0, so the i128 is split.
@@ -504,14 +504,16 @@ declare i32 @callee_i64(i32 %i, i32 %i2, i32 %i3, i32 %i4, i32 %i5,
define i32 @i64_split() {
entry:
-; CHECK: i64_split
+; CHECK-LABEL: i64_split
; "i64 %0" should be in register x7.
; "i32 8" should be on stack at [sp].
; CHECK: ldr x7, [{{x[0-9]+}}]
; CHECK: str {{w[0-9]+}}, [sp]
-; FAST: i64_split
+; FAST-LABEL: i64_split
; FAST: ldr x7, [{{x[0-9]+}}]
-; FAST: str {{w[0-9]+}}, [sp]
+; FAST: mov x[[R0:[0-9]+]], sp
+; FAST: orr w[[R1:[0-9]+]], wzr, #0x8
+; FAST: str w[[R1]], {{\[}}x[[R0]]{{\]}}
%0 = load i64* bitcast (%struct.s41* @g41 to i64*), align 16
%call = tail call i32 @callee_i64(i32 1, i32 2, i32 3, i32 4, i32 5,
i32 6, i32 7, i64 %0, i32 8) #5
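Almost every hunk in this file upgrades a bare CHECK to CHECK-LABEL, which is more than a style fix: FileCheck treats each CHECK-LABEL match as a barrier, so patterns intended for one function can no longer accidentally match output belonging to a neighbour. A hypothetical illustration of the failure mode being prevented:

; With plain checks, the string "f38" also occurs inside "caller38", so a
; stale or reordered pattern could match in the wrong function body:
; CHECK: f38
; With labels, each group of checks is fenced to its own function:
; CHECK-LABEL: f38
; CHECK: add w{{[0-9]+}}, w1, w0
; CHECK-LABEL: caller38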
diff --git a/test/CodeGen/AArch64/arm64-addr-mode-folding.ll b/test/CodeGen/AArch64/arm64-addr-mode-folding.ll
index 08fb8c9..74bb398 100644
--- a/test/CodeGen/AArch64/arm64-addr-mode-folding.ll
+++ b/test/CodeGen/AArch64/arm64-addr-mode-folding.ll
@@ -1,4 +1,4 @@
-; RUN: llc -O3 -mtriple arm64-apple-ios3 %s -o - | FileCheck %s
+; RUN: llc -O3 -mtriple arm64-apple-ios3 -aarch64-gep-opt=false %s -o - | FileCheck %s
; <rdar://problem/13621857>
@block = common global i8* null, align 8
diff --git a/test/CodeGen/AArch64/arm64-addrmode.ll b/test/CodeGen/AArch64/arm64-addrmode.ll
index 700fba8..5433a8c 100644
--- a/test/CodeGen/AArch64/arm64-addrmode.ll
+++ b/test/CodeGen/AArch64/arm64-addrmode.ll
@@ -37,9 +37,8 @@ define void @t3() {
; base + unsigned offset (> imm12 * size of type in bytes)
; CHECK: @t4
-; CHECK: add [[ADDREG:x[0-9]+]], x{{[0-9]+}}, #8, lsl #12
-; CHECK: ldr xzr, [
-; CHECK: [[ADDREG]]]
+; CHECK: orr w[[NUM:[0-9]+]], wzr, #0x8000
+; CHECK: ldr xzr, [x{{[0-9]+}}, x[[NUM]]]
; CHECK: ret
define void @t4() {
%incdec.ptr = getelementptr inbounds i64* @object, i64 4096
@@ -60,9 +59,8 @@ define void @t5(i64 %a) {
; base + reg + imm
; CHECK: @t6
; CHECK: add [[ADDREG:x[0-9]+]], x{{[0-9]+}}, x{{[0-9]+}}, lsl #3
-; CHECK-NEXT: add [[ADDREG]], [[ADDREG]], #8, lsl #12
-; CHECK: ldr xzr, [
-; CHECK: [[ADDREG]]]
+; CHECK-NEXT: orr w[[NUM:[0-9]+]], wzr, #0x8000
+; CHECK: ldr xzr, [x{{[0-9]+}}, x[[NUM]]]
; CHECK: ret
define void @t6(i64 %a) {
%tmp1 = getelementptr inbounds i64* @object, i64 %a
@@ -70,3 +68,114 @@ define void @t6(i64 %a) {
%tmp = load volatile i64* %incdec.ptr, align 8
ret void
}
+
+; Test base + wide immediate
+define void @t7(i64 %a) {
+; CHECK-LABEL: t7:
+; CHECK: orr w[[NUM:[0-9]+]], wzr, #0xffff
+; CHECK-NEXT: ldr xzr, [x0, x[[NUM]]]
+ %1 = add i64 %a, 65535 ;0xffff
+ %2 = inttoptr i64 %1 to i64*
+ %3 = load volatile i64* %2, align 8
+ ret void
+}
+
+define void @t8(i64 %a) {
+; CHECK-LABEL: t8:
+; CHECK: movn [[REG:x[0-9]+]], #0x1235
+; CHECK-NEXT: ldr xzr, [x0, [[REG]]]
+ %1 = sub i64 %a, 4662 ;-4662 is 0xffffffffffffedca
+ %2 = inttoptr i64 %1 to i64*
+ %3 = load volatile i64* %2, align 8
+ ret void
+}
+
+define void @t9(i64 %a) {
+; CHECK-LABEL: t9:
+; CHECK: movn [[REG:x[0-9]+]], #0x1235, lsl #16
+; CHECK-NEXT: ldr xzr, [x0, [[REG]]]
+ %1 = add i64 -305463297, %a ;-305463297 is 0xffffffffedcaffff
+ %2 = inttoptr i64 %1 to i64*
+ %3 = load volatile i64* %2, align 8
+ ret void
+}
+
+define void @t10(i64 %a) {
+; CHECK-LABEL: t10:
+; CHECK: movz [[REG:x[0-9]+]], #0x123, lsl #48
+; CHECK-NEXT: ldr xzr, [x0, [[REG]]]
+ %1 = add i64 %a, 81909218222800896 ;0x123000000000000
+ %2 = inttoptr i64 %1 to i64*
+ %3 = load volatile i64* %2, align 8
+ ret void
+}
+
+define void @t11(i64 %a) {
+; CHECK-LABEL: t11:
+; CHECK: movz w[[NUM:[0-9]+]], #0x123, lsl #16
+; CHECK: movk w[[NUM:[0-9]+]], #0x4567
+; CHECK-NEXT: ldr xzr, [x0, x[[NUM]]]
+ %1 = add i64 %a, 19088743 ;0x1234567
+ %2 = inttoptr i64 %1 to i64*
+ %3 = load volatile i64* %2, align 8
+ ret void
+}
+
+; Test some boundaries that should not use movz/movn/orr
+define void @t12(i64 %a) {
+; CHECK-LABEL: t12:
+; CHECK: add [[REG:x[0-9]+]], x0, #4095
+; CHECK-NEXT: ldr xzr, {{\[}}[[REG]]]
+ %1 = add i64 %a, 4095 ;0xfff
+ %2 = inttoptr i64 %1 to i64*
+ %3 = load volatile i64* %2, align 8
+ ret void
+}
+
+define void @t13(i64 %a) {
+; CHECK-LABEL: t13:
+; CHECK: sub [[REG:x[0-9]+]], x0, #4095
+; CHECK-NEXT: ldr xzr, {{\[}}[[REG]]]
+ %1 = add i64 %a, -4095 ;-0xfff
+ %2 = inttoptr i64 %1 to i64*
+ %3 = load volatile i64* %2, align 8
+ ret void
+}
+
+define void @t14(i64 %a) {
+; CHECK-LABEL: t14:
+; CHECK: add [[REG:x[0-9]+]], x0, #291, lsl #12
+; CHECK-NEXT: ldr xzr, {{\[}}[[REG]]]
+ %1 = add i64 %a, 1191936 ;0x123000
+ %2 = inttoptr i64 %1 to i64*
+ %3 = load volatile i64* %2, align 8
+ ret void
+}
+
+define void @t15(i64 %a) {
+; CHECK-LABEL: t15:
+; CHECK: sub [[REG:x[0-9]+]], x0, #291, lsl #12
+; CHECK-NEXT: ldr xzr, {{\[}}[[REG]]]
+ %1 = add i64 %a, -1191936 ;0xFFFFFFFFFFEDD000
+ %2 = inttoptr i64 %1 to i64*
+ %3 = load volatile i64* %2, align 8
+ ret void
+}
+
+define void @t16(i64 %a) {
+; CHECK-LABEL: t16:
+; CHECK: ldr xzr, [x0, #28672]
+ %1 = add i64 %a, 28672 ;0x7000
+ %2 = inttoptr i64 %1 to i64*
+ %3 = load volatile i64* %2, align 8
+ ret void
+}
+
+define void @t17(i64 %a) {
+; CHECK-LABEL: t17:
+; CHECK: ldur xzr, [x0, #-256]
+ %1 = add i64 %a, -256 ;-0x100
+ %2 = inttoptr i64 %1 to i64*
+ %3 = load volatile i64* %2, align 8
+ ret void
+}
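Taken together, t7 through t17 bracket the three ways an offset can be folded: a scaled unsigned 12-bit immediate (t16), a signed 9-bit unscaled immediate for ldur (t17), and otherwise either an add/sub with an optional lsl #12 (t12..t15) or a register materialized with a single orr/movz/movn (t7..t11). A sketch of the upper boundary of the scaled form for an 8-byte access, assuming this file's RUN line (function name hypothetical):

define void @t_imm12_upper(i64 %a) {
; 4095 * 8 = 32760 is the largest offset the scaled unsigned form encodes.
; CHECK: ldr xzr, [x0, #32760]
  %1 = add i64 %a, 32760 ;0x7ff8
  %2 = inttoptr i64 %1 to i64*
  %3 = load volatile i64* %2, align 8
  ret void
}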
diff --git a/test/CodeGen/AArch64/arm64-bcc.ll b/test/CodeGen/AArch64/arm64-bcc.ll
new file mode 100644
index 0000000..138ae90
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-bcc.ll
@@ -0,0 +1,60 @@
+; RUN: llc < %s -mtriple=arm64-apple-darwin | FileCheck %s
+; Checks for the conditional branch b.vs
+
+; Function Attrs: nounwind
+define i32 @add(i32, i32) {
+entry:
+ %2 = tail call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %0, i32 %1)
+ %3 = extractvalue { i32, i1 } %2, 1
+ br i1 %3, label %6, label %4
+
+; <label>:4 ; preds = %entry
+ %5 = extractvalue { i32, i1 } %2, 0
+ ret i32 %5
+
+; <label>:6 ; preds = %entry
+ tail call void @llvm.trap()
+ unreachable
+; CHECK: b.vs
+}
+
+%S64 = type <{ i64 }>
+%S32 = type <{ i32 }>
+%Sstruct = type <{ %S64, %S32 }>
+
+; Regression test for a compile-time failure when optimizing a csinc-cbz sequence
+
+define { i64, i1 } @foo(i64*, %Sstruct*, i1, i64) {
+entry:
+ %.sroa.0 = alloca i72, align 16
+ %.count.value = getelementptr inbounds %Sstruct* %1, i64 0, i32 0, i32 0
+ %4 = load i64* %.count.value, align 8
+ %.repeatedValue.value = getelementptr inbounds %Sstruct* %1, i64 0, i32 1, i32 0
+ %5 = load i32* %.repeatedValue.value, align 8
+ %6 = icmp eq i64 %4, 0
+ br label %7
+
+; <label>:7 ; preds = %entry
+ %.mask58 = and i32 %5, -2048
+ %8 = icmp eq i32 %.mask58, 55296
+ %.not134 = xor i1 %8, true
+ %9 = icmp eq i32 %5, 1114112
+ %or.cond135 = and i1 %9, %.not134
+ br i1 %or.cond135, label %10, label %.loopexit
+
+; <label>:10 ; preds = %7
+ %11 = and i32 %5, -2048
+ %12 = icmp eq i32 %11, 55296
+ br i1 %12, label %.loopexit, label %10
+
+
+.loopexit: ; preds = %7, %10
+ tail call void @llvm.trap()
+ unreachable
+}
+
+; Function Attrs: nounwind readnone
+declare { i32, i1 } @llvm.sadd.with.overflow.i32(i32, i32)
+
+; Function Attrs: noreturn nounwind
+declare void @llvm.trap()
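llvm.sadd.with.overflow reports overflow through the V flag, which is why @add expects b.vs. The unsigned counterpart signals overflow through the carry flag instead, so an analogous test would presumably check for b.hs. A hedged sketch (not part of the patch; llvm.trap is declared above):

define i32 @add_unsigned(i32, i32) {
entry:
  %2 = tail call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %0, i32 %1)
  %3 = extractvalue { i32, i1 } %2, 1
  br i1 %3, label %overflow, label %normal

normal:                                           ; preds = %entry
  %4 = extractvalue { i32, i1 } %2, 0
  ret i32 %4

overflow:                                         ; preds = %entry
  tail call void @llvm.trap()
  unreachable
; CHECK: b.hs
}

declare { i32, i1 } @llvm.uadd.with.overflow.i32(i32, i32)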
diff --git a/test/CodeGen/AArch64/arm64-big-endian-bitconverts.ll b/test/CodeGen/AArch64/arm64-big-endian-bitconverts.ll
index f0e968b..d2985f4 100644
--- a/test/CodeGen/AArch64/arm64-big-endian-bitconverts.ll
+++ b/test/CodeGen/AArch64/arm64-big-endian-bitconverts.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple arm64_be < %s -aarch64-load-store-opt=false -O1 -o - | FileCheck %s
-; RUN: llc -mtriple arm64_be < %s -aarch64-load-store-opt=false -O0 -fast-isel=true -o - | FileCheck %s
+; RUN: llc -mtriple aarch64_be < %s -aarch64-load-store-opt=false -O1 -o - | FileCheck %s
+; RUN: llc -mtriple aarch64_be < %s -aarch64-load-store-opt=false -O0 -fast-isel=true -o - | FileCheck %s
; CHECK-LABEL: test_i64_f64:
define void @test_i64_f64(double* %p, i64* %q) {
diff --git a/test/CodeGen/AArch64/arm64-big-endian-eh.ll b/test/CodeGen/AArch64/arm64-big-endian-eh.ll
index 93e7da9..a51703a 100644
--- a/test/CodeGen/AArch64/arm64-big-endian-eh.ll
+++ b/test/CodeGen/AArch64/arm64-big-endian-eh.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple arm64_be-linux-gnu -filetype obj < %s | llvm-objdump -s - | FileCheck %s
+; RUN: llc -mtriple aarch64_be-linux-gnu -filetype obj < %s | llvm-objdump -s - | FileCheck %s
; ARM EHABI for big endian
; This test case checks whether the CIE length record is laid out in big-endian format.
diff --git a/test/CodeGen/AArch64/arm64-big-endian-varargs.ll b/test/CodeGen/AArch64/arm64-big-endian-varargs.ll
index d7b26b9..db1f48c 100644
--- a/test/CodeGen/AArch64/arm64-big-endian-varargs.ll
+++ b/test/CodeGen/AArch64/arm64-big-endian-varargs.ll
@@ -3,7 +3,7 @@
; Vararg saving must save Q registers using the equivalent of STR/STP.
target datalayout = "E-m:e-i64:64-i128:128-n32:64-S128"
-target triple = "arm64_be-arm-none-eabi"
+target triple = "aarch64_be-arm-none-eabi"
%struct.__va_list = type { i8*, i8*, i8*, i32, i32 }
diff --git a/test/CodeGen/AArch64/arm64-big-endian-vector-callee.ll b/test/CodeGen/AArch64/arm64-big-endian-vector-callee.ll
index 1dcccf1..cc9badc 100644
--- a/test/CodeGen/AArch64/arm64-big-endian-vector-callee.ll
+++ b/test/CodeGen/AArch64/arm64-big-endian-vector-callee.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple arm64_be < %s -aarch64-load-store-opt=false -o - | FileCheck %s
-; RUN: llc -mtriple arm64_be < %s -fast-isel=true -aarch64-load-store-opt=false -o - | FileCheck %s
+; RUN: llc -mtriple aarch64_be < %s -aarch64-load-store-opt=false -o - | FileCheck %s
+; RUN: llc -mtriple aarch64_be < %s -fast-isel=true -aarch64-load-store-opt=false -o - | FileCheck %s
; CHECK-LABEL: test_i64_f64:
define i64 @test_i64_f64(double %p) {
diff --git a/test/CodeGen/AArch64/arm64-big-endian-vector-caller.ll b/test/CodeGen/AArch64/arm64-big-endian-vector-caller.ll
index 9a12b7a..d72d0a5 100644
--- a/test/CodeGen/AArch64/arm64-big-endian-vector-caller.ll
+++ b/test/CodeGen/AArch64/arm64-big-endian-vector-caller.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple arm64_be < %s -aarch64-load-store-opt=false -o - | FileCheck %s
-; RUN: llc -mtriple arm64_be < %s -aarch64-load-store-opt=false -fast-isel=true -O0 -o - | FileCheck %s
+; RUN: llc -mtriple aarch64_be < %s -aarch64-load-store-opt=false -o - | FileCheck %s
+; RUN: llc -mtriple aarch64_be < %s -aarch64-load-store-opt=false -fast-isel=true -O0 -o - | FileCheck %s
; CHECK-LABEL: test_i64_f64:
declare i64 @test_i64_f64_helper(double %p)
diff --git a/test/CodeGen/AArch64/arm64-cse.ll b/test/CodeGen/AArch64/arm64-cse.ll
index 5d62cfe..b74ece8 100644
--- a/test/CodeGen/AArch64/arm64-cse.ll
+++ b/test/CodeGen/AArch64/arm64-cse.ll
@@ -1,4 +1,4 @@
-; RUN: llc -O3 < %s -aarch64-atomic-cfg-tidy=0 | FileCheck %s
+; RUN: llc -O3 < %s -aarch64-atomic-cfg-tidy=0 -aarch64-gep-opt=false | FileCheck %s
target triple = "arm64-apple-ios"
; rdar://12462006
diff --git a/test/CodeGen/AArch64/arm64-dagcombiner-dead-indexed-load.ll b/test/CodeGen/AArch64/arm64-dagcombiner-dead-indexed-load.ll
index 6eed48b..2eb6307 100644
--- a/test/CodeGen/AArch64/arm64-dagcombiner-dead-indexed-load.ll
+++ b/test/CodeGen/AArch64/arm64-dagcombiner-dead-indexed-load.ll
@@ -1,8 +1,4 @@
; RUN: llc -mcpu=cyclone < %s | FileCheck %s
-
-; r208640 broke ppc64/Linux self-hosting; xfailing while this is worked on.
-; XFAIL: *
-
target datalayout = "e-i64:64-n32:64-S128"
target triple = "arm64-apple-ios"
diff --git a/test/CodeGen/AArch64/arm64-dagcombiner-indexed-load.ll b/test/CodeGen/AArch64/arm64-dagcombiner-indexed-load.ll
deleted file mode 100644
index ce132c6..0000000
--- a/test/CodeGen/AArch64/arm64-dagcombiner-indexed-load.ll
+++ /dev/null
@@ -1,46 +0,0 @@
-; RUN: llc -O3 < %s | FileCheck %s
-; RUN: llc -O3 -addr-sink-using-gep=1 < %s | FileCheck %s
-; Test case for a DAG combiner bug where we combined an indexed load
-; with an extension (sext, zext, or any) into a regular extended load,
-; i.e., dropping the indexed value.
-; <rdar://problem/16389332>
-
-target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
-target triple = "arm64-apple-ios"
-
-%class.A = type { i64, i64 }
-%class.C = type { i64 }
-
-; CHECK-LABEL: XX:
-; CHECK: ldr
-define i32 @XX(%class.A* %K, i1 %tst, i32* %addr, %class.C** %ppC, %class.C* %pC) {
-entry:
- br i1 %tst, label %if.then, label %lor.rhs.i
-
-lor.rhs.i: ; preds = %entry
- %tmp = load i32* %addr, align 4
- %y.i.i.i = getelementptr inbounds %class.A* %K, i64 0, i32 1
- %tmp1 = load i64* %y.i.i.i, align 8
- %U.sroa.3.8.extract.trunc.i = trunc i64 %tmp1 to i32
- %div11.i = sdiv i32 %U.sroa.3.8.extract.trunc.i, 17
- %add12.i = add nsw i32 0, %div11.i
- %U.sroa.3.12.extract.shift.i = lshr i64 %tmp1, 32
- %U.sroa.3.12.extract.trunc.i = trunc i64 %U.sroa.3.12.extract.shift.i to i32
- %div15.i = sdiv i32 %U.sroa.3.12.extract.trunc.i, 13
- %add16.i = add nsw i32 %add12.i, %div15.i
- %rem.i.i = srem i32 %add16.i, %tmp
- %idxprom = sext i32 %rem.i.i to i64
- %arrayidx = getelementptr inbounds %class.C** %ppC, i64 %idxprom
- %tobool533 = icmp eq %class.C* %pC, null
- br i1 %tobool533, label %while.end, label %while.body
-
-if.then: ; preds = %entry
- ret i32 42
-
-while.body: ; preds = %lor.rhs.i
- ret i32 5
-
-while.end: ; preds = %lor.rhs.i
- %tmp3 = load %class.C** %arrayidx, align 8
- ret i32 50
-}
diff --git a/test/CodeGen/AArch64/arm64-extern-weak.ll b/test/CodeGen/AArch64/arm64-extern-weak.ll
index a239403..06bd927 100644
--- a/test/CodeGen/AArch64/arm64-extern-weak.ll
+++ b/test/CodeGen/AArch64/arm64-extern-weak.ll
@@ -1,16 +1,23 @@
-; RUN: llc -mtriple=arm64-none-linux-gnu -o - < %s | FileCheck %s
+; RUN: llc -mtriple=arm64-none-linux-gnu -relocation-model=pic -o - < %s | FileCheck %s
+; RUN: llc -mtriple=arm64-none-linux-gnu -relocation-model=static -o - < %s | FileCheck --check-prefix=CHECK-STATIC %s
; RUN: llc -mtriple=arm64-none-linux-gnu -code-model=large -o - < %s | FileCheck --check-prefix=CHECK-LARGE %s
declare extern_weak i32 @var()
define i32()* @foo() {
; The usual ADRP/ADD pair can't be used for a weak reference because it must
-; evaluate to 0 if the symbol is undefined. We use a litpool entry.
+; evaluate to 0 if the symbol is undefined. We use a GOT entry for PIC,
+; otherwise a litpool entry.
ret i32()* @var
; CHECK: adrp x[[VAR:[0-9]+]], :got:var
; CHECK: ldr x0, [x[[VAR]], :got_lo12:var]
+; CHECK-STATIC: .LCPI0_0:
+; CHECK-STATIC-NEXT: .xword var
+; CHECK-STATIC: adrp x[[VAR:[0-9]+]], .LCPI0_0
+; CHECK-STATIC: ldr x0, [x[[VAR]], :lo12:.LCPI0_0]
+
; In the large model, the usual relocations are absolute and can
; materialise 0.
; CHECK-LARGE: movz x0, #:abs_g3:var
@@ -29,6 +36,11 @@ define i32* @bar() {
; CHECK: add x0, [[ARR_VAR]], #20
ret i32* %addr
+; CHECK-STATIC: .LCPI1_0:
+; CHECK-STATIC-NEXT: .xword arr_var
+; CHECK-STATIC: ldr [[BASE:x[0-9]+]], [{{x[0-9]+}}, :lo12:.LCPI1_0]
+; CHECK-STATIC: add x0, [[BASE]], #20
+
; In the large model, the usual relocations are absolute and can
; materialise 0.
; CHECK-LARGE: movz [[ARR_VAR:x[0-9]+]], #:abs_g3:arr_var
@@ -44,6 +56,9 @@ define i32* @wibble() {
; CHECK: adrp [[BASE:x[0-9]+]], defined_weak_var
; CHECK: add x0, [[BASE]], :lo12:defined_weak_var
+; CHECK-STATIC: adrp [[BASE:x[0-9]+]], defined_weak_var
+; CHECK-STATIC: add x0, [[BASE]], :lo12:defined_weak_var
+
; CHECK-LARGE: movz x0, #:abs_g3:defined_weak_var
; CHECK-LARGE: movk x0, #:abs_g2_nc:defined_weak_var
; CHECK-LARGE: movk x0, #:abs_g1_nc:defined_weak_var
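The invariant all three relocation strategies must preserve: a weak reference compares equal to null whenever the symbol is never defined, and an ADRP/ADD pair cannot produce 0. A small sketch of code that observes exactly that property (hypothetical, not from the patch):

declare extern_weak i32 @maybe_var()

define i1 @maybe_var_present() {
; Whichever form is chosen (GOT load, litpool load, or movz/movk chain),
; the materialized address must be able to be 0 so this can return false.
  %present = icmp ne i32()* @maybe_var, null
  ret i1 %present
}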
diff --git a/test/CodeGen/AArch64/arm64-fast-isel-addr-offset.ll b/test/CodeGen/AArch64/arm64-fast-isel-addr-offset.ll
index ebd847e..d81bc7c 100644
--- a/test/CodeGen/AArch64/arm64-fast-isel-addr-offset.ll
+++ b/test/CodeGen/AArch64/arm64-fast-isel-addr-offset.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -O0 -fast-isel-abort -mtriple=arm64-apple-darwin | FileCheck %s
+; RUN: llc -O0 -fast-isel-abort -verify-machineinstrs -mtriple=arm64-apple-darwin < %s | FileCheck %s
@sortlist = common global [5001 x i32] zeroinitializer, align 16
@sortlist2 = common global [5001 x i64] zeroinitializer, align 16
diff --git a/test/CodeGen/AArch64/arm64-fast-isel-alloca.ll b/test/CodeGen/AArch64/arm64-fast-isel-alloca.ll
index 1706e9e..a841702 100644
--- a/test/CodeGen/AArch64/arm64-fast-isel-alloca.ll
+++ b/test/CodeGen/AArch64/arm64-fast-isel-alloca.ll
@@ -1,5 +1,5 @@
; This test should cause TargetMaterializeAlloca to be invoked
-; RUN: llc < %s -O0 -fast-isel-abort -mtriple=arm64-apple-darwin | FileCheck %s
+; RUN: llc -O0 -fast-isel-abort -verify-machineinstrs -mtriple=arm64-apple-darwin < %s | FileCheck %s
%struct.S1Ty = type { i64 }
%struct.S2Ty = type { %struct.S1Ty, %struct.S1Ty }
@@ -15,9 +15,8 @@ define void @main() nounwind {
entry:
; CHECK: main
; CHECK: mov x29, sp
-; CHECK: mov x[[REG:[0-9]+]], sp
-; CHECK-NEXT: orr x[[REG1:[0-9]+]], xzr, #0x8
-; CHECK-NEXT: add x0, x[[REG]], x[[REG1]]
+; CHECK: mov [[REG:x[0-9]+]], sp
+; CHECK-NEXT: add x0, [[REG]], #8
%E = alloca %struct.S2Ty, align 4
%B = getelementptr inbounds %struct.S2Ty* %E, i32 0, i32 1
call void @takeS1(%struct.S1Ty* %B)
diff --git a/test/CodeGen/AArch64/arm64-fast-isel-br.ll b/test/CodeGen/AArch64/arm64-fast-isel-br.ll
index 37a8295..f896d85 100644
--- a/test/CodeGen/AArch64/arm64-fast-isel-br.ll
+++ b/test/CodeGen/AArch64/arm64-fast-isel-br.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -O0 -fast-isel-abort -mtriple=arm64-apple-darwin -mcpu=cyclone | FileCheck %s
+; RUN: llc -O0 -fast-isel-abort -mtriple=arm64-apple-darwin -mcpu=cyclone -verify-machineinstrs < %s | FileCheck %s
define void @branch1() nounwind uwtable ssp {
%x = alloca i32, align 4
@@ -95,7 +95,7 @@ entry:
store i64 %d, i64* %d.addr, align 8
%0 = load i16* %b.addr, align 2
; CHECK: and w0, w0, #0x1
-; CHECK: subs w0, w0, #0
+; CHECK: cmp w0, #0
; CHECK: b.eq LBB4_2
%conv = trunc i16 %0 to i1
br i1 %conv, label %if.then, label %if.end
@@ -107,7 +107,7 @@ if.then: ; preds = %entry
if.end: ; preds = %if.then, %entry
%1 = load i32* %c.addr, align 4
; CHECK: and w[[REG:[0-9]+]], w{{[0-9]+}}, #0x1
-; CHECK: subs w{{[0-9]+}}, w[[REG]], #0
+; CHECK: cmp w[[REG]], #0
; CHECK: b.eq LBB4_4
%conv1 = trunc i32 %1 to i1
br i1 %conv1, label %if.then3, label %if.end4
@@ -118,7 +118,7 @@ if.then3: ; preds = %if.end
if.end4: ; preds = %if.then3, %if.end
%2 = load i64* %d.addr, align 8
-; CHECK: subs w{{[0-9]+}}, w{{[0-9]+}}, #0
+; CHECK: cmp w{{[0-9]+}}, #0
; CHECK: b.eq LBB4_6
%conv5 = trunc i64 %2 to i1
br i1 %conv5, label %if.then7, label %if.end8
@@ -137,11 +137,10 @@ declare void @foo1()
; rdar://15174028
define i32 @trunc64(i64 %foo) nounwind {
; CHECK: trunc64
-; CHECK: orr [[REG:x[0-9]+]], xzr, #0x1
-; CHECK: and [[REG2:x[0-9]+]], x0, [[REG]]
-; CHECK: mov x[[REG3:[0-9]+]], [[REG2]]
-; CHECK: and [[REG4:w[0-9]+]], w[[REG3]], #0x1
-; CHECK: subs {{w[0-9]+}}, [[REG4]], #0
+; CHECK: and [[REG1:x[0-9]+]], x0, #0x1
+; CHECK: mov x[[REG2:[0-9]+]], [[REG1]]
+; CHECK: and [[REG3:w[0-9]+]], w[[REG2]], #0x1
+; CHECK: cmp [[REG3]], #0
; CHECK: b.eq LBB5_2
%a = and i64 %foo, 1
%b = trunc i64 %a to i1
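The subs-to-cmp updates in this file lean on an assembler aliasing fact: cmp Wn, #imm is the canonical printed form of subs wzr, Wn, #imm, so the compare sets the flags without clobbering a general-purpose register. In other words (an aside, not from the patch):

;   cmp w0, #0    is the alias of    subs wzr, w0, #0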
diff --git a/test/CodeGen/AArch64/arm64-fast-isel-call.ll b/test/CodeGen/AArch64/arm64-fast-isel-call.ll
index 8d756ae..f1e2c40 100644
--- a/test/CodeGen/AArch64/arm64-fast-isel-call.ll
+++ b/test/CodeGen/AArch64/arm64-fast-isel-call.ll
@@ -1,5 +1,6 @@
-; RUN: llc < %s -O0 -fast-isel-abort -mtriple=arm64-apple-darwin | FileCheck %s
-; RUN: llc < %s -O0 -fast-isel-abort -mtriple=arm64_be-linux-gnu | FileCheck %s --check-prefix=CHECK-BE
+; RUN: llc -O0 -fast-isel-abort -fast-isel-abort-args -code-model=small -verify-machineinstrs -mtriple=arm64-apple-darwin < %s | FileCheck %s
+; RUN: llc -O0 -fast-isel-abort -fast-isel-abort-args -code-model=large -verify-machineinstrs -mtriple=arm64-apple-darwin < %s | FileCheck %s --check-prefix=LARGE
+; RUN: llc -O0 -fast-isel-abort -fast-isel-abort-args -code-model=small -verify-machineinstrs -mtriple=aarch64_be-linux-gnu < %s | FileCheck %s --check-prefix=CHECK-BE
define void @call0() nounwind {
entry:
@@ -8,8 +9,12 @@ entry:
define void @foo0() nounwind {
entry:
-; CHECK: foo0
-; CHECK: bl _call0
+; CHECK-LABEL: foo0
+; CHECK: bl _call0
+; LARGE-LABEL: foo0
+; LARGE: adrp [[REG0:x[0-9]+]], _call0@GOTPAGE
+; LARGE: ldr [[REG1:x[0-9]+]], {{\[}}[[REG0]], _call0@GOTPAGEOFF{{\]}}
+; LARGE-NEXT: blr [[REG1]]
call void @call0()
ret void
}
@@ -24,10 +29,10 @@ entry:
define i32 @foo1(i32 %a) nounwind {
entry:
-; CHECK: foo1
-; CHECK: stur w0, [x29, #-4]
-; CHECK-NEXT: ldur w0, [x29, #-4]
-; CHECK-NEXT: bl _call1
+; CHECK-LABEL: foo1
+; CHECK: stur w0, [x29, #-4]
+; CHECK-NEXT: ldur w0, [x29, #-4]
+; CHECK-NEXT: bl _call1
%a.addr = alloca i32, align 4
store i32 %a, i32* %a.addr, align 4
%tmp = load i32* %a.addr, align 4
@@ -37,10 +42,10 @@ entry:
define i32 @sext_(i8 %a, i16 %b) nounwind {
entry:
-; CHECK: @sext_
-; CHECK: sxtb w0, w0
-; CHECK: sxth w1, w1
-; CHECK: bl _foo_sext_
+; CHECK-LABEL: sext_
+; CHECK: sxtb w0, w0
+; CHECK: sxth w1, w1
+; CHECK: bl _foo_sext_
call void @foo_sext_(i8 signext %a, i16 signext %b)
ret i32 0
}
@@ -49,9 +54,9 @@ declare void @foo_sext_(i8 %a, i16 %b)
define i32 @zext_(i8 %a, i16 %b) nounwind {
entry:
-; CHECK: @zext_
-; CHECK: uxtb w0, w0
-; CHECK: uxth w1, w1
+; CHECK-LABEL: zext_
+; CHECK: uxtb w0, w0
+; CHECK: uxth w1, w1
call void @foo_zext_(i8 zeroext %a, i16 zeroext %b)
ret i32 0
}
@@ -60,10 +65,10 @@ declare void @foo_zext_(i8 %a, i16 %b)
define i32 @t1(i32 %argc, i8** nocapture %argv) {
entry:
-; CHECK: @t1
+; CHECK-LABEL: @t1
; The last parameter will be passed on the stack as an i8.
-; CHECK: strb w{{[0-9]+}}, [sp]
-; CHECK-NEXT: bl _bar
+; CHECK: strb w{{[0-9]+}}, [sp]
+; CHECK: bl _bar
%call = call i32 @bar(i8 zeroext 0, i8 zeroext -8, i8 zeroext -69, i8 zeroext 28, i8 zeroext 40, i8 zeroext -70, i8 zeroext 28, i8 zeroext 39, i8 zeroext -41)
ret i32 0
}
@@ -73,18 +78,19 @@ declare i32 @bar(i8 zeroext, i8 zeroext, i8 zeroext, i8 zeroext, i8 zeroext, i8
; Test materialization of integers. The target-independent selector handles this.
define i32 @t2() {
entry:
-; CHECK: @t2
-; CHECK: movz x0, #0
-; CHECK: orr w1, wzr, #0xfffffff8
-; CHECK: orr w[[REG:[0-9]+]], wzr, #0x3ff
-; CHECK: orr w[[REG2:[0-9]+]], wzr, #0x2
-; CHECK: movz w[[REG3:[0-9]+]], #0
-; CHECK: orr w[[REG4:[0-9]+]], wzr, #0x1
-; CHECK: uxth w2, w[[REG]]
-; CHECK: sxtb w3, w[[REG2]]
-; CHECK: and w4, w[[REG3]], #0x1
-; CHECK: and w5, w[[REG4]], #0x1
-; CHECK: bl _func2
+; CHECK-LABEL: t2
+; CHECK: mov [[REG1:x[0-9]+]], xzr
+; CHECK: orr w1, wzr, #0xfffffff8
+; CHECK: orr [[REG2:w[0-9]+]], wzr, #0x3ff
+; CHECK: orr [[REG3:w[0-9]+]], wzr, #0x2
+; CHECK: mov [[REG4:w[0-9]+]], wzr
+; CHECK: orr [[REG5:w[0-9]+]], wzr, #0x1
+; CHECK: mov x0, [[REG1]]
+; CHECK: uxth w2, [[REG2]]
+; CHECK: sxtb w3, [[REG3]]
+; CHECK: and w4, [[REG4]], #0x1
+; CHECK: and w5, [[REG5]], #0x1
+; CHECK: bl _func2
%call = call i32 @func2(i64 zeroext 0, i32 signext -8, i16 zeroext 1023, i8 signext -254, i1 zeroext 0, i1 zeroext 1)
ret i32 0
}
@@ -94,7 +100,170 @@ declare i32 @func2(i64 zeroext, i32 signext, i16 zeroext, i8 signext, i1 zeroext
declare void @callee_b0f(i8 %bp10, i8 %bp11, i8 %bp12, i8 %bp13, i8 %bp14, i8 %bp15, i8 %bp17, i8 %bp18, i8 %bp19)
define void @caller_b1f() {
entry:
- ; CHECK-BE: strb w{{.*}}, [sp, #7]
+; CHECK-BE-LABEL: caller_b1f
+; CHECK-BE: strb w{{.*}}, [sp, #7]
call void @callee_b0f(i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 42)
ret void
}
+
+define zeroext i1 @call_arguments1(i1 %a1, i1 %a2, i1 %a3, i1 %a4, i1 %a5, i1 %a6, i1 %a7, i1 %a8) {
+; CHECK-LABEL: call_arguments1
+; CHECK: and {{w[0-9]+}}, w0, w1
+; CHECK-NEXT: and {{w[0-9]+}}, w2, w3
+; CHECK-NEXT: and {{w[0-9]+}}, w4, w5
+; CHECK-NEXT: and {{w[0-9]+}}, w6, w7
+ %1 = and i1 %a1, %a2
+ %2 = and i1 %a3, %a4
+ %3 = and i1 %a5, %a6
+ %4 = and i1 %a7, %a8
+ %5 = and i1 %1, %2
+ %6 = and i1 %3, %4
+ %7 = and i1 %5, %6
+ ret i1 %7
+}
+
+define i32 @call_arguments2(i8 zeroext %a1, i8 zeroext %a2, i8 zeroext %a3, i8 zeroext %a4, i8 signext %a5, i8 signext %a6, i8 signext %a7, i8 signext %a8) {
+; CHECK-LABEL: call_arguments2
+; CHECK: add {{w[0-9]+}}, w0, w1
+; CHECK-NEXT: add {{w[0-9]+}}, w2, w3
+; CHECK-NEXT: add {{w[0-9]+}}, w4, w5
+; CHECK-NEXT: add {{w[0-9]+}}, w6, w7
+ %a1z = zext i8 %a1 to i32
+ %a2z = zext i8 %a2 to i32
+ %a3z = zext i8 %a3 to i32
+ %a4z = zext i8 %a4 to i32
+ %a5s = sext i8 %a5 to i32
+ %a6s = sext i8 %a6 to i32
+ %a7s = sext i8 %a7 to i32
+ %a8s = sext i8 %a8 to i32
+ %1 = add i32 %a1z, %a2z
+ %2 = add i32 %a3z, %a4z
+ %3 = add i32 %a5s, %a6s
+ %4 = add i32 %a7s, %a8s
+ %5 = add i32 %1, %2
+ %6 = add i32 %3, %4
+ %7 = add i32 %5, %6
+ ret i32 %7
+}
+
+define i32 @call_arguments3(i16 zeroext %a1, i16 zeroext %a2, i16 zeroext %a3, i16 zeroext %a4, i16 signext %a5, i16 signext %a6, i16 signext %a7, i16 signext %a8) {
+; CHECK-LABEL: call_arguments3
+; CHECK: add {{w[0-9]+}}, w0, w1
+; CHECK-NEXT: add {{w[0-9]+}}, w2, w3
+; CHECK-NEXT: add {{w[0-9]+}}, w4, w5
+; CHECK-NEXT: add {{w[0-9]+}}, w6, w7
+ %a1z = zext i16 %a1 to i32
+ %a2z = zext i16 %a2 to i32
+ %a3z = zext i16 %a3 to i32
+ %a4z = zext i16 %a4 to i32
+ %a5s = sext i16 %a5 to i32
+ %a6s = sext i16 %a6 to i32
+ %a7s = sext i16 %a7 to i32
+ %a8s = sext i16 %a8 to i32
+ %1 = add i32 %a1z, %a2z
+ %2 = add i32 %a3z, %a4z
+ %3 = add i32 %a5s, %a6s
+ %4 = add i32 %a7s, %a8s
+ %5 = add i32 %1, %2
+ %6 = add i32 %3, %4
+ %7 = add i32 %5, %6
+ ret i32 %7
+}
+
+define i32 @call_arguments4(i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, i32 %a7, i32 %a8) {
+; CHECK-LABEL: call_arguments4
+; CHECK: add {{w[0-9]+}}, w0, w1
+; CHECK-NEXT: add {{w[0-9]+}}, w2, w3
+; CHECK-NEXT: add {{w[0-9]+}}, w4, w5
+; CHECK-NEXT: add {{w[0-9]+}}, w6, w7
+ %1 = add i32 %a1, %a2
+ %2 = add i32 %a3, %a4
+ %3 = add i32 %a5, %a6
+ %4 = add i32 %a7, %a8
+ %5 = add i32 %1, %2
+ %6 = add i32 %3, %4
+ %7 = add i32 %5, %6
+ ret i32 %7
+}
+
+define i64 @call_arguments5(i64 %a1, i64 %a2, i64 %a3, i64 %a4, i64 %a5, i64 %a6, i64 %a7, i64 %a8) {
+; CHECK-LABEL: call_arguments5
+; CHECK: add {{x[0-9]+}}, x0, x1
+; CHECK-NEXT: add {{x[0-9]+}}, x2, x3
+; CHECK-NEXT: add {{x[0-9]+}}, x4, x5
+; CHECK-NEXT: add {{x[0-9]+}}, x6, x7
+ %1 = add i64 %a1, %a2
+ %2 = add i64 %a3, %a4
+ %3 = add i64 %a5, %a6
+ %4 = add i64 %a7, %a8
+ %5 = add i64 %1, %2
+ %6 = add i64 %3, %4
+ %7 = add i64 %5, %6
+ ret i64 %7
+}
+
+define float @call_arguments6(float %a1, float %a2, float %a3, float %a4, float %a5, float %a6, float %a7, float %a8) {
+; CHECK-LABEL: call_arguments6
+; CHECK: fadd {{s[0-9]+}}, s0, s1
+; CHECK-NEXT: fadd {{s[0-9]+}}, s2, s3
+; CHECK-NEXT: fadd {{s[0-9]+}}, s4, s5
+; CHECK-NEXT: fadd {{s[0-9]+}}, s6, s7
+ %1 = fadd float %a1, %a2
+ %2 = fadd float %a3, %a4
+ %3 = fadd float %a5, %a6
+ %4 = fadd float %a7, %a8
+ %5 = fadd float %1, %2
+ %6 = fadd float %3, %4
+ %7 = fadd float %5, %6
+ ret float %7
+}
+
+define double @call_arguments7(double %a1, double %a2, double %a3, double %a4, double %a5, double %a6, double %a7, double %a8) {
+; CHECK-LABEL: call_arguments7
+; CHECK: fadd {{d[0-9]+}}, d0, d1
+; CHECK-NEXT: fadd {{d[0-9]+}}, d2, d3
+; CHECK-NEXT: fadd {{d[0-9]+}}, d4, d5
+; CHECK-NEXT: fadd {{d[0-9]+}}, d6, d7
+ %1 = fadd double %a1, %a2
+ %2 = fadd double %a3, %a4
+ %3 = fadd double %a5, %a6
+ %4 = fadd double %a7, %a8
+ %5 = fadd double %1, %2
+ %6 = fadd double %3, %4
+ %7 = fadd double %5, %6
+ ret double %7
+}
+
+define i64 @call_arguments8(i32 %a1, i64 %a2, i32 %a3, i64 %a4) {
+; CHECK-LABEL: call_arguments8
+; CHECK: ubfx [[REG1:x[0-9]+]], {{x[0-9]+}}, #0, #32
+; CHECK: ubfx [[REG2:x[0-9]+]], {{x[0-9]+}}, #0, #32
+; CHECK: add {{x[0-9]+}}, [[REG1]], x1
+; CHECK-NEXT: add {{x[0-9]+}}, [[REG2]], x3
+ %aa1 = zext i32 %a1 to i64
+ %aa3 = zext i32 %a3 to i64
+ %1 = add i64 %aa1, %a2
+ %2 = add i64 %aa3, %a4
+ %3 = add i64 %1, %2
+ ret i64 %3
+}
+
+define void @call_arguments9(i8 %a1, i16 %a2, i32 %a3, i64 %a4, float %a5, double %a6, i64 %a7, double %a8) {
+; CHECK-LABEL: call_arguments9
+ ret void
+}
+
+; Test that we use the correct register class for the branch.
+define void @call_blr(i64 %Fn, i1 %c) {
+; CHECK-LABEL: call_blr
+; CHECK: blr
+ br i1 %c, label %bb1, label %bb2
+bb1:
+ %1 = inttoptr i64 %Fn to void (i64)*
+ br label %bb2
+bb2:
+ %2 = phi void (i64)* [ %1, %bb1 ], [ undef, %0 ]
+ call void %2(i64 1)
+ ret void
+}
+
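The LARGE prefix lines above encode why the large code model changes call lowering: a direct bl reaches only +/-128 MiB, which the large model must not assume, so the callee's address is loaded from the GOT and called indirectly. The shape being matched is, roughly:

;   adrp x[[R]], _call0@GOTPAGE
;   ldr  x[[R]], [x[[R]], _call0@GOTPAGEOFF]
;   blr  x[[R]]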
diff --git a/test/CodeGen/AArch64/arm64-fast-isel-conversion.ll b/test/CodeGen/AArch64/arm64-fast-isel-conversion.ll
index c5417de..e515184 100644
--- a/test/CodeGen/AArch64/arm64-fast-isel-conversion.ll
+++ b/test/CodeGen/AArch64/arm64-fast-isel-conversion.ll
@@ -1,9 +1,9 @@
-; RUN: llc < %s -O0 -fast-isel-abort -mtriple=arm64-apple-darwin -mcpu=cyclone | FileCheck %s
+; RUN: llc -O0 -fast-isel-abort -verify-machineinstrs -mtriple=arm64-apple-darwin -mcpu=cyclone < %s | FileCheck %s
;; Test various conversions.
define zeroext i32 @trunc_(i8 zeroext %a, i16 zeroext %b, i32 %c, i64 %d) nounwind ssp {
entry:
-; CHECK: trunc_
+; CHECK-LABEL: trunc_
; CHECK: sub sp, sp, #16
; CHECK: strb w0, [sp, #15]
; CHECK: strh w1, [sp, #12]
@@ -17,7 +17,6 @@ entry:
; CHECK: ldrh w0, [sp, #12]
; CHECK: strb w0, [sp, #15]
; CHECK: ldrb w0, [sp, #15]
-; CHECK: uxtb w0, w0
; CHECK: add sp, sp, #16
; CHECK: ret
%a.addr = alloca i8, align 1
@@ -44,21 +43,18 @@ entry:
define i64 @zext_(i8 zeroext %a, i16 zeroext %b, i32 %c, i64 %d) nounwind ssp {
entry:
-; CHECK: zext_
+; CHECK-LABEL: zext_
; CHECK: sub sp, sp, #16
; CHECK: strb w0, [sp, #15]
; CHECK: strh w1, [sp, #12]
; CHECK: str w2, [sp, #8]
; CHECK: str x3, [sp]
; CHECK: ldrb w0, [sp, #15]
-; CHECK: uxtb w0, w0
; CHECK: strh w0, [sp, #12]
; CHECK: ldrh w0, [sp, #12]
-; CHECK: uxth w0, w0
; CHECK: str w0, [sp, #8]
; CHECK: ldr w0, [sp, #8]
; CHECK: mov x3, x0
-; CHECK: ubfx x3, x3, #0, #32
; CHECK: str x3, [sp]
; CHECK: ldr x0, [sp]
; CHECK: ret
@@ -85,37 +81,35 @@ entry:
define i32 @zext_i1_i32(i1 zeroext %a) nounwind ssp {
entry:
-; CHECK: @zext_i1_i32
-; CHECK: and w0, w0, #0x1
+; CHECK-LABEL: zext_i1_i32
+; CHECK-NOT: and w0, w0, #0x1
+; CHECK: ret
%conv = zext i1 %a to i32
ret i32 %conv;
}
define i64 @zext_i1_i64(i1 zeroext %a) nounwind ssp {
entry:
-; CHECK: @zext_i1_i64
-; CHECK: and w0, w0, #0x1
+; CHECK-LABEL: zext_i1_i64
+; CHECK-NOT: and w0, w0, #0x1
+; CHECK: ret
%conv = zext i1 %a to i64
ret i64 %conv;
}
define i64 @sext_(i8 signext %a, i16 signext %b, i32 %c, i64 %d) nounwind ssp {
entry:
-; CHECK: sext_
+; CHECK-LABEL: sext_
; CHECK: sub sp, sp, #16
; CHECK: strb w0, [sp, #15]
; CHECK: strh w1, [sp, #12]
; CHECK: str w2, [sp, #8]
; CHECK: str x3, [sp]
-; CHECK: ldrb w0, [sp, #15]
-; CHECK: sxtb w0, w0
+; CHECK: ldrsb w0, [sp, #15]
; CHECK: strh w0, [sp, #12]
-; CHECK: ldrh w0, [sp, #12]
-; CHECK: sxth w0, w0
+; CHECK: ldrsh w0, [sp, #12]
; CHECK: str w0, [sp, #8]
-; CHECK: ldr w0, [sp, #8]
-; CHECK: mov x3, x0
-; CHECK: sxtw x3, w3
+; CHECK: ldrsw x3, [sp, #8]
; CHECK: str x3, [sp]
; CHECK: ldr x0, [sp]
; CHECK: ret
@@ -161,8 +155,9 @@ define zeroext i64 @sext_i16_i64(i16 zeroext %in) {
; Test sext i1 to i32
define i32 @sext_i1_i32(i1 signext %a) nounwind ssp {
entry:
-; CHECK: sext_i1_i32
-; CHECK: sbfx w0, w0, #0, #1
+; CHECK-LABEL: sext_i1_i32
+; CHECK-NOT: sbfx w0, w0, #0, #1
+; CHECK: ret
%conv = sext i1 %a to i32
ret i32 %conv
}
@@ -170,7 +165,7 @@ entry:
; Test sext i1 to i16
define signext i16 @sext_i1_i16(i1 %a) nounwind ssp {
entry:
-; CHECK: sext_i1_i16
+; CHECK-LABEL: sext_i1_i16
; CHECK: sbfx w0, w0, #0, #1
%conv = sext i1 %a to i16
ret i16 %conv
@@ -179,7 +174,7 @@ entry:
; Test sext i1 to i8
define signext i8 @sext_i1_i8(i1 %a) nounwind ssp {
entry:
-; CHECK: sext_i1_i8
+; CHECK-LABEL: sext_i1_i8
; CHECK: sbfx w0, w0, #0, #1
%conv = sext i1 %a to i8
ret i8 %conv
@@ -188,7 +183,7 @@ entry:
; Test fpext
define double @fpext_(float %a) nounwind ssp {
entry:
-; CHECK: fpext_
+; CHECK-LABEL: fpext_
; CHECK: fcvt d0, s0
%conv = fpext float %a to double
ret double %conv
@@ -197,7 +192,7 @@ entry:
; Test fptrunc
define float @fptrunc_(double %a) nounwind ssp {
entry:
-; CHECK: fptrunc_
+; CHECK-LABEL: fptrunc_
; CHECK: fcvt s0, d0
%conv = fptrunc double %a to float
ret float %conv
@@ -206,7 +201,7 @@ entry:
; Test fptosi
define i32 @fptosi_ws(float %a) nounwind ssp {
entry:
-; CHECK: fptosi_ws
+; CHECK-LABEL: fptosi_ws
; CHECK: fcvtzs w0, s0
%conv = fptosi float %a to i32
ret i32 %conv
@@ -215,7 +210,7 @@ entry:
; Test fptosi
define i32 @fptosi_wd(double %a) nounwind ssp {
entry:
-; CHECK: fptosi_wd
+; CHECK-LABEL: fptosi_wd
; CHECK: fcvtzs w0, d0
%conv = fptosi double %a to i32
ret i32 %conv
@@ -224,7 +219,7 @@ entry:
; Test fptoui
define i32 @fptoui_ws(float %a) nounwind ssp {
entry:
-; CHECK: fptoui_ws
+; CHECK-LABEL: fptoui_ws
; CHECK: fcvtzu w0, s0
%conv = fptoui float %a to i32
ret i32 %conv
@@ -233,7 +228,7 @@ entry:
; Test fptoui
define i32 @fptoui_wd(double %a) nounwind ssp {
entry:
-; CHECK: fptoui_wd
+; CHECK-LABEL: fptoui_wd
; CHECK: fcvtzu w0, d0
%conv = fptoui double %a to i32
ret i32 %conv
@@ -242,7 +237,7 @@ entry:
; Test sitofp
define float @sitofp_sw_i1(i1 %a) nounwind ssp {
entry:
-; CHECK: sitofp_sw_i1
+; CHECK-LABEL: sitofp_sw_i1
; CHECK: sbfx w0, w0, #0, #1
; CHECK: scvtf s0, w0
%conv = sitofp i1 %a to float
@@ -252,7 +247,7 @@ entry:
; Test sitofp
define float @sitofp_sw_i8(i8 %a) nounwind ssp {
entry:
-; CHECK: sitofp_sw_i8
+; CHECK-LABEL: sitofp_sw_i8
; CHECK: sxtb w0, w0
; CHECK: scvtf s0, w0
%conv = sitofp i8 %a to float
@@ -262,9 +257,7 @@ entry:
; Test sitofp
define float @sitofp_sw_i16(i16 %a) nounwind ssp {
entry:
-; CHECK: sitofp_sw_i16
-; CHECK: sxth w0, w0
-; CHECK: scvtf s0, w0
+; CHECK-LABEL: sitofp_sw_i16
%conv = sitofp i16 %a to float
ret float %conv
}
@@ -272,7 +265,7 @@ entry:
; Test sitofp
define float @sitofp_sw(i32 %a) nounwind ssp {
entry:
-; CHECK: sitofp_sw
+; CHECK-LABEL: sitofp_sw
; CHECK: scvtf s0, w0
%conv = sitofp i32 %a to float
ret float %conv
@@ -281,7 +274,7 @@ entry:
; Test sitofp
define float @sitofp_sx(i64 %a) nounwind ssp {
entry:
-; CHECK: sitofp_sx
+; CHECK-LABEL: sitofp_sx
; CHECK: scvtf s0, x0
%conv = sitofp i64 %a to float
ret float %conv
@@ -290,7 +283,7 @@ entry:
; Test sitofp
define double @sitofp_dw(i32 %a) nounwind ssp {
entry:
-; CHECK: sitofp_dw
+; CHECK-LABEL: sitofp_dw
; CHECK: scvtf d0, w0
%conv = sitofp i32 %a to double
ret double %conv
@@ -299,7 +292,7 @@ entry:
; Test sitofp
define double @sitofp_dx(i64 %a) nounwind ssp {
entry:
-; CHECK: sitofp_dx
+; CHECK-LABEL: sitofp_dx
; CHECK: scvtf d0, x0
%conv = sitofp i64 %a to double
ret double %conv
@@ -308,7 +301,7 @@ entry:
; Test uitofp
define float @uitofp_sw_i1(i1 %a) nounwind ssp {
entry:
-; CHECK: uitofp_sw_i1
+; CHECK-LABEL: uitofp_sw_i1
; CHECK: and w0, w0, #0x1
; CHECK: ucvtf s0, w0
%conv = uitofp i1 %a to float
@@ -318,9 +311,7 @@ entry:
; Test uitofp
define float @uitofp_sw_i8(i8 %a) nounwind ssp {
entry:
-; CHECK: uitofp_sw_i8
-; CHECK: uxtb w0, w0
-; CHECK: ucvtf s0, w0
+; CHECK-LABEL: uitofp_sw_i8
%conv = uitofp i8 %a to float
ret float %conv
}
@@ -328,9 +319,7 @@ entry:
; Test uitofp
define float @uitofp_sw_i16(i16 %a) nounwind ssp {
entry:
-; CHECK: uitofp_sw_i16
-; CHECK: uxth w0, w0
-; CHECK: ucvtf s0, w0
+; CHECK-LABEL: uitofp_sw_i16
%conv = uitofp i16 %a to float
ret float %conv
}
@@ -338,7 +327,7 @@ entry:
; Test uitofp
define float @uitofp_sw(i32 %a) nounwind ssp {
entry:
-; CHECK: uitofp_sw
+; CHECK-LABEL: uitofp_sw
; CHECK: ucvtf s0, w0
%conv = uitofp i32 %a to float
ret float %conv
@@ -347,7 +336,7 @@ entry:
; Test uitofp
define float @uitofp_sx(i64 %a) nounwind ssp {
entry:
-; CHECK: uitofp_sx
+; CHECK-LABEL: uitofp_sx
; CHECK: ucvtf s0, x0
%conv = uitofp i64 %a to float
ret float %conv
@@ -356,7 +345,7 @@ entry:
; Test uitofp
define double @uitofp_dw(i32 %a) nounwind ssp {
entry:
-; CHECK: uitofp_dw
+; CHECK-LABEL: uitofp_dw
; CHECK: ucvtf d0, w0
%conv = uitofp i32 %a to double
ret double %conv
@@ -365,7 +354,7 @@ entry:
; Test uitofp
define double @uitofp_dx(i64 %a) nounwind ssp {
entry:
-; CHECK: uitofp_dx
+; CHECK-LABEL: uitofp_dx
; CHECK: ucvtf d0, x0
%conv = uitofp i64 %a to double
ret double %conv
@@ -373,7 +362,7 @@ entry:
define i32 @i64_trunc_i32(i64 %a) nounwind ssp {
entry:
-; CHECK: i64_trunc_i32
+; CHECK-LABEL: i64_trunc_i32
; CHECK: mov x1, x0
%conv = trunc i64 %a to i32
ret i32 %conv
@@ -381,7 +370,7 @@ entry:
define zeroext i16 @i64_trunc_i16(i64 %a) nounwind ssp {
entry:
-; CHECK: i64_trunc_i16
+; CHECK-LABEL: i64_trunc_i16
; CHECK: mov x[[REG:[0-9]+]], x0
; CHECK: and [[REG2:w[0-9]+]], w[[REG]], #0xffff
; CHECK: uxth w0, [[REG2]]
@@ -391,7 +380,7 @@ entry:
define zeroext i8 @i64_trunc_i8(i64 %a) nounwind ssp {
entry:
-; CHECK: i64_trunc_i8
+; CHECK-LABEL: i64_trunc_i8
; CHECK: mov x[[REG:[0-9]+]], x0
; CHECK: and [[REG2:w[0-9]+]], w[[REG]], #0xff
; CHECK: uxtb w0, [[REG2]]
@@ -401,7 +390,7 @@ entry:
define zeroext i1 @i64_trunc_i1(i64 %a) nounwind ssp {
entry:
-; CHECK: i64_trunc_i1
+; CHECK-LABEL: i64_trunc_i1
; CHECK: mov x[[REG:[0-9]+]], x0
; CHECK: and [[REG2:w[0-9]+]], w[[REG]], #0x1
; CHECK: and w0, [[REG2]], #0x1
@@ -411,7 +400,7 @@ entry:
; rdar://15101939
define void @stack_trunc() nounwind {
-; CHECK: stack_trunc
+; CHECK-LABEL: stack_trunc
; CHECK: sub sp, sp, #16
; CHECK: ldr [[REG:x[0-9]+]], [sp]
; CHECK: mov x[[REG2:[0-9]+]], [[REG]]
@@ -428,15 +417,36 @@ define void @stack_trunc() nounwind {
define zeroext i64 @zext_i8_i64(i8 zeroext %in) {
; CHECK-LABEL: zext_i8_i64:
-; CHECK: mov x[[TMP:[0-9]+]], x0
-; CHECK: ubfx x0, x[[TMP]], #0, #8
+; CHECK-NOT: ubfx x0, {{x[0-9]+}}, #0, #8
+; CHECK: ret
%big = zext i8 %in to i64
ret i64 %big
}
define zeroext i64 @zext_i16_i64(i16 zeroext %in) {
; CHECK-LABEL: zext_i16_i64:
-; CHECK: mov x[[TMP:[0-9]+]], x0
-; CHECK: ubfx x0, x[[TMP]], #0, #16
+; CHECK-NOT: ubfx x0, {{x[0-9]+}}, #0, #16
+; CHECK: ret
%big = zext i16 %in to i64
ret i64 %big
}
+
+define float @bitcast_i32_to_float(i32 %a) {
+ %1 = bitcast i32 %a to float
+ ret float %1
+}
+
+define double @bitcast_i64_to_double(i64 %a) {
+ %1 = bitcast i64 %a to double
+ ret double %1
+}
+
+define i32 @bitcast_float_to_i32(float %a) {
+ %1 = bitcast float %a to i32
+ ret i32 %1
+}
+
+define i64 @bitcast_double_to_i64(double %a) {
+ %1 = bitcast double %a to i64
+ ret i64 %1
+}
+
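The four bitcast tests appended above carry no CHECK lines; under -fast-isel-abort they exercise fast-isel's handling of GPR-to-FPR and FPR-to-GPR bitcasts, which on AArch64 are presumably selected as single fmov moves, e.g. (an assumption, not checked by the test):

;   bitcast i32 -> float  : fmov s0, w0
;   bitcast double -> i64 : fmov x0, d0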
diff --git a/test/CodeGen/AArch64/arm64-fast-isel-fcmp.ll b/test/CodeGen/AArch64/arm64-fast-isel-fcmp.ll
index f030596..111b6bd 100644
--- a/test/CodeGen/AArch64/arm64-fast-isel-fcmp.ll
+++ b/test/CodeGen/AArch64/arm64-fast-isel-fcmp.ll
@@ -1,146 +1,162 @@
-; RUN: llc < %s -O0 -fast-isel-abort -verify-machineinstrs -mtriple=arm64-apple-darwin | FileCheck %s
+; RUN: llc -O0 -fast-isel-abort -verify-machineinstrs -mtriple=arm64-apple-darwin < %s | FileCheck %s
-define zeroext i1 @fcmp_float1(float %a) nounwind ssp {
-entry:
-; CHECK-LABEL: @fcmp_float1
-; CHECK: fcmp s0, #0.0
-; CHECK: cset w{{[0-9]+}}, ne
- %cmp = fcmp une float %a, 0.000000e+00
- ret i1 %cmp
+define zeroext i1 @fcmp_float1(float %a) {
+; CHECK-LABEL: fcmp_float1
+; CHECK: fcmp s0, #0.0
+; CHECK-NEXT: cset {{w[0-9]+}}, ne
+ %1 = fcmp une float %a, 0.000000e+00
+ ret i1 %1
}
-define zeroext i1 @fcmp_float2(float %a, float %b) nounwind ssp {
-entry:
-; CHECK-LABEL: @fcmp_float2
-; CHECK: fcmp s0, s1
-; CHECK: cset w{{[0-9]+}}, ne
- %cmp = fcmp une float %a, %b
- ret i1 %cmp
+define zeroext i1 @fcmp_float2(float %a, float %b) {
+; CHECK-LABEL: fcmp_float2
+; CHECK: fcmp s0, s1
+; CHECK-NEXT: cset {{w[0-9]+}}, ne
+ %1 = fcmp une float %a, %b
+ ret i1 %1
}
-define zeroext i1 @fcmp_double1(double %a) nounwind ssp {
-entry:
-; CHECK-LABEL: @fcmp_double1
-; CHECK: fcmp d0, #0.0
-; CHECK: cset w{{[0-9]+}}, ne
- %cmp = fcmp une double %a, 0.000000e+00
- ret i1 %cmp
+define zeroext i1 @fcmp_double1(double %a) {
+; CHECK-LABEL: fcmp_double1
+; CHECK: fcmp d0, #0.0
+; CHECK-NEXT: cset {{w[0-9]+}}, ne
+ %1 = fcmp une double %a, 0.000000e+00
+ ret i1 %1
}
-define zeroext i1 @fcmp_double2(double %a, double %b) nounwind ssp {
-entry:
-; CHECK-LABEL: @fcmp_double2
-; CHECK: fcmp d0, d1
-; CHECK: cset w{{[0-9]+}}, ne
- %cmp = fcmp une double %a, %b
- ret i1 %cmp
+define zeroext i1 @fcmp_double2(double %a, double %b) {
+; CHECK-LABEL: fcmp_double2
+; CHECK: fcmp d0, d1
+; CHECK-NEXT: cset {{w[0-9]+}}, ne
+ %1 = fcmp une double %a, %b
+ ret i1 %1
}
; Check each fcmp condition
-define float @fcmp_oeq(float %a, float %b) nounwind ssp {
-; CHECK-LABEL: @fcmp_oeq
-; CHECK: fcmp s0, s1
-; CHECK: cset w{{[0-9]+}}, eq
- %cmp = fcmp oeq float %a, %b
- %conv = uitofp i1 %cmp to float
- ret float %conv
-}
-
-define float @fcmp_ogt(float %a, float %b) nounwind ssp {
-; CHECK-LABEL: @fcmp_ogt
-; CHECK: fcmp s0, s1
-; CHECK: cset w{{[0-9]+}}, gt
- %cmp = fcmp ogt float %a, %b
- %conv = uitofp i1 %cmp to float
- ret float %conv
-}
-
-define float @fcmp_oge(float %a, float %b) nounwind ssp {
-; CHECK-LABEL: @fcmp_oge
-; CHECK: fcmp s0, s1
-; CHECK: cset w{{[0-9]+}}, ge
- %cmp = fcmp oge float %a, %b
- %conv = uitofp i1 %cmp to float
- ret float %conv
-}
-
-define float @fcmp_olt(float %a, float %b) nounwind ssp {
-; CHECK-LABEL: @fcmp_olt
-; CHECK: fcmp s0, s1
-; CHECK: cset w{{[0-9]+}}, mi
- %cmp = fcmp olt float %a, %b
- %conv = uitofp i1 %cmp to float
- ret float %conv
-}
-
-define float @fcmp_ole(float %a, float %b) nounwind ssp {
-; CHECK-LABEL: @fcmp_ole
-; CHECK: fcmp s0, s1
-; CHECK: cset w{{[0-9]+}}, ls
- %cmp = fcmp ole float %a, %b
- %conv = uitofp i1 %cmp to float
- ret float %conv
-}
-
-define float @fcmp_ord(float %a, float %b) nounwind ssp {
-; CHECK-LABEL: @fcmp_ord
-; CHECK: fcmp s0, s1
-; CHECK: cset {{w[0-9]+}}, vc
- %cmp = fcmp ord float %a, %b
- %conv = uitofp i1 %cmp to float
- ret float %conv
-}
-
-define float @fcmp_uno(float %a, float %b) nounwind ssp {
-; CHECK-LABEL: @fcmp_uno
-; CHECK: fcmp s0, s1
-; CHECK: cset {{w[0-9]+}}, vs
- %cmp = fcmp uno float %a, %b
- %conv = uitofp i1 %cmp to float
- ret float %conv
-}
-
-define float @fcmp_ugt(float %a, float %b) nounwind ssp {
-; CHECK-LABEL: @fcmp_ugt
-; CHECK: fcmp s0, s1
-; CHECK: cset {{w[0-9]+}}, hi
- %cmp = fcmp ugt float %a, %b
- %conv = uitofp i1 %cmp to float
- ret float %conv
-}
-
-define float @fcmp_uge(float %a, float %b) nounwind ssp {
-; CHECK-LABEL: @fcmp_uge
-; CHECK: fcmp s0, s1
-; CHECK: cset {{w[0-9]+}}, pl
- %cmp = fcmp uge float %a, %b
- %conv = uitofp i1 %cmp to float
- ret float %conv
-}
-
-define float @fcmp_ult(float %a, float %b) nounwind ssp {
-; CHECK-LABEL: @fcmp_ult
-; CHECK: fcmp s0, s1
-; CHECK: cset {{w[0-9]+}}, lt
- %cmp = fcmp ult float %a, %b
- %conv = uitofp i1 %cmp to float
- ret float %conv
-}
-
-define float @fcmp_ule(float %a, float %b) nounwind ssp {
-; CHECK-LABEL: @fcmp_ule
-; CHECK: fcmp s0, s1
-; CHECK: cset {{w[0-9]+}}, le
- %cmp = fcmp ule float %a, %b
- %conv = uitofp i1 %cmp to float
- ret float %conv
-}
-
-define float @fcmp_une(float %a, float %b) nounwind ssp {
-; CHECK-LABEL: @fcmp_une
-; CHECK: fcmp s0, s1
-; CHECK: cset {{w[0-9]+}}, ne
- %cmp = fcmp une float %a, %b
- %conv = uitofp i1 %cmp to float
- ret float %conv
+define zeroext i1 @fcmp_false(float %a) {
+; CHECK-LABEL: fcmp_false
+; CHECK: mov {{w[0-9]+}}, wzr
+ %1 = fcmp ogt float %a, %a
+ ret i1 %1
+}
+
+define zeroext i1 @fcmp_oeq(float %a, float %b) {
+; CHECK-LABEL: fcmp_oeq
+; CHECK: fcmp s0, s1
+; CHECK-NEXT: cset {{w[0-9]+}}, eq
+ %1 = fcmp oeq float %a, %b
+ ret i1 %1
+}
+
+define zeroext i1 @fcmp_ogt(float %a, float %b) {
+; CHECK-LABEL: fcmp_ogt
+; CHECK: fcmp s0, s1
+; CHECK-NEXT: cset {{w[0-9]+}}, gt
+ %1 = fcmp ogt float %a, %b
+ ret i1 %1
+}
+
+define zeroext i1 @fcmp_oge(float %a, float %b) {
+; CHECK-LABEL: fcmp_oge
+; CHECK: fcmp s0, s1
+; CHECK-NEXT: cset {{w[0-9]+}}, ge
+ %1 = fcmp oge float %a, %b
+ ret i1 %1
+}
+
+define zeroext i1 @fcmp_olt(float %a, float %b) {
+; CHECK-LABEL: fcmp_olt
+; CHECK: fcmp s0, s1
+; CHECK-NEXT: cset {{w[0-9]+}}, mi
+ %1 = fcmp olt float %a, %b
+ ret i1 %1
+}
+
+define zeroext i1 @fcmp_ole(float %a, float %b) {
+; CHECK-LABEL: fcmp_ole
+; CHECK: fcmp s0, s1
+; CHECK-NEXT: cset {{w[0-9]+}}, ls
+ %1 = fcmp ole float %a, %b
+ ret i1 %1
+}
+
+define zeroext i1 @fcmp_one(float %a, float %b) {
+; CHECK-LABEL: fcmp_one
+; CHECK: fcmp s0, s1
+; CHECK-NEXT: cset [[REG:w[0-9]+]], mi
+; CHECK-NEXT: csinc {{w[0-9]+}}, [[REG]], wzr, le
+ %1 = fcmp one float %a, %b
+ ret i1 %1
+}
+
+define zeroext i1 @fcmp_ord(float %a, float %b) {
+; CHECK-LABEL: fcmp_ord
+; CHECK: fcmp s0, s1
+; CHECK-NEXT: cset {{w[0-9]+}}, vc
+ %1 = fcmp ord float %a, %b
+ ret i1 %1
+}
+
+define zeroext i1 @fcmp_uno(float %a, float %b) {
+; CHECK-LABEL: fcmp_uno
+; CHECK: fcmp s0, s1
+; CHECK-NEXT: cset {{w[0-9]+}}, vs
+ %1 = fcmp uno float %a, %b
+ ret i1 %1
+}
+
+define zeroext i1 @fcmp_ueq(float %a, float %b) {
+; CHECK-LABEL: fcmp_ueq
+; CHECK: fcmp s0, s1
+; CHECK-NEXT: cset [[REG:w[0-9]+]], eq
+; CHECK-NEXT: csinc {{w[0-9]+}}, [[REG]], wzr, vc
+ %1 = fcmp ueq float %a, %b
+ ret i1 %1
+}
+
+define zeroext i1 @fcmp_ugt(float %a, float %b) {
+; CHECK-LABEL: fcmp_ugt
+; CHECK: fcmp s0, s1
+; CHECK-NEXT: cset {{w[0-9]+}}, hi
+ %1 = fcmp ugt float %a, %b
+ ret i1 %1
+}
+
+define zeroext i1 @fcmp_uge(float %a, float %b) {
+; CHECK-LABEL: fcmp_uge
+; CHECK: fcmp s0, s1
+; CHECK-NEXT: cset {{w[0-9]+}}, pl
+ %1 = fcmp uge float %a, %b
+ ret i1 %1
+}
+
+define zeroext i1 @fcmp_ult(float %a, float %b) {
+; CHECK-LABEL: fcmp_ult
+; CHECK: fcmp s0, s1
+; CHECK-NEXT: cset {{w[0-9]+}}, lt
+ %1 = fcmp ult float %a, %b
+ ret i1 %1
+}
+
+define zeroext i1 @fcmp_ule(float %a, float %b) {
+; CHECK-LABEL: fcmp_ule
+; CHECK: fcmp s0, s1
+; CHECK-NEXT: cset {{w[0-9]+}}, le
+ %1 = fcmp ule float %a, %b
+ ret i1 %1
+}
+
+define zeroext i1 @fcmp_une(float %a, float %b) {
+; CHECK-LABEL: fcmp_une
+; CHECK: fcmp s0, s1
+; CHECK-NEXT: cset {{w[0-9]+}}, ne
+ %1 = fcmp une float %a, %b
+ ret i1 %1
+}
+
+define zeroext i1 @fcmp_true(float %a) {
+; CHECK-LABEL: fcmp_true
+; CHECK: orr {{w[0-9]+}}, wzr, #0x1
+ %1 = fcmp ueq float %a, %a
+ ret i1 %1
}
diff --git a/test/CodeGen/AArch64/arm64-fast-isel-gv.ll b/test/CodeGen/AArch64/arm64-fast-isel-gv.ll
index dc4d895..1a4e8ea 100644
--- a/test/CodeGen/AArch64/arm64-fast-isel-gv.ll
+++ b/test/CodeGen/AArch64/arm64-fast-isel-gv.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -O0 -fast-isel-abort -mtriple=arm64-apple-darwin | FileCheck %s
+; RUN: llc -O0 -fast-isel-abort -verify-machineinstrs -mtriple=arm64-apple-darwin < %s | FileCheck %s
; Test load/store of a global value through the global offset table.
@seed = common global i64 0, align 8
@@ -6,9 +6,9 @@
define void @Initrand() nounwind {
entry:
; CHECK: @Initrand
-; CHECK: adrp x[[REG:[0-9]+]], _seed@GOTPAGE
-; CHECK: ldr x[[REG2:[0-9]+]], [x[[REG]], _seed@GOTPAGEOFF]
-; CHECK: str x{{[0-9]+}}, [x[[REG2]]]
+; CHECK: adrp [[REG:x[0-9]+]], _seed@GOTPAGE
+; CHECK: ldr [[REG2:x[0-9]+]], {{\[}}[[REG]], _seed@GOTPAGEOFF{{\]}}
+; CHECK: str {{x[0-9]+}}, {{\[}}[[REG2]]{{\]}}
store i64 74755, i64* @seed, align 8
ret void
}
@@ -16,17 +16,16 @@ entry:
define i32 @Rand() nounwind {
entry:
; CHECK: @Rand
-; CHECK: adrp x[[REG:[0-9]+]], _seed@GOTPAGE
-; CHECK: ldr x[[REG2:[0-9]+]], [x[[REG]], _seed@GOTPAGEOFF]
-; CHECK: movz x[[REG3:[0-9]+]], #0x51d
-; CHECK: ldr x[[REG4:[0-9]+]], [x[[REG2]]]
-; CHECK: mul x[[REG5:[0-9]+]], x[[REG4]], x[[REG3]]
-; CHECK: movz x[[REG6:[0-9]+]], #0x3619
-; CHECK: add x[[REG7:[0-9]+]], x[[REG5]], x[[REG6]]
-; CHECK: orr x[[REG8:[0-9]+]], xzr, #0xffff
-; CHECK: and x[[REG9:[0-9]+]], x[[REG7]], x[[REG8]]
-; CHECK: str x[[REG9]], [x[[REG]]]
-; CHECK: ldr x{{[0-9]+}}, [x[[REG]]]
+; CHECK: adrp [[REG1:x[0-9]+]], _seed@GOTPAGE
+; CHECK: ldr [[REG2:x[0-9]+]], {{\[}}[[REG1]], _seed@GOTPAGEOFF{{\]}}
+; CHECK: movz [[REG3:x[0-9]+]], #0x3619
+; CHECK: movz [[REG4:x[0-9]+]], #0x51d
+; CHECK: ldr [[REG5:x[0-9]+]], {{\[}}[[REG2]]{{\]}}
+; CHECK: mul [[REG6:x[0-9]+]], [[REG5]], [[REG4]]
+; CHECK: add [[REG7:x[0-9]+]], [[REG6]], [[REG3]]
+; CHECK: and [[REG8:x[0-9]+]], [[REG7]], #0xffff
+; CHECK: str [[REG8]], {{\[}}[[REG1]]{{\]}}
+; CHECK: ldr {{x[0-9]+}}, {{\[}}[[REG1]]{{\]}}
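+; 0x51d and 0x3619 are 1309 and 13849 from the IR below; the old orr/and
+; pair for the 0xffff mask is now a single and with a logical immediate.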
%0 = load i64* @seed, align 8
%mul = mul nsw i64 %0, 1309
%add = add nsw i64 %mul, 13849
diff --git a/test/CodeGen/AArch64/arm64-fast-isel-icmp.ll b/test/CodeGen/AArch64/arm64-fast-isel-icmp.ll
index 971be5c..245c70e 100644
--- a/test/CodeGen/AArch64/arm64-fast-isel-icmp.ll
+++ b/test/CodeGen/AArch64/arm64-fast-isel-icmp.ll
@@ -1,10 +1,10 @@
-; RUN: llc < %s -O0 -fast-isel-abort -mtriple=arm64-apple-darwin | FileCheck %s
+; RUN: llc -O0 -fast-isel-abort -verify-machineinstrs -mtriple=arm64-apple-darwin < %s | FileCheck %s
define i32 @icmp_eq_imm(i32 %a) nounwind ssp {
entry:
-; CHECK: icmp_eq_imm
-; CHECK: cmp w0, #31
-; CHECK: cset w0, eq
+; CHECK-LABEL: icmp_eq_imm
+; CHECK: cmp w0, #31
+; CHECK-NEXT: cset w0, eq
%cmp = icmp eq i32 %a, 31
%conv = zext i1 %cmp to i32
ret i32 %conv
@@ -12,19 +12,19 @@ entry:
define i32 @icmp_eq_neg_imm(i32 %a) nounwind ssp {
entry:
-; CHECK: icmp_eq_neg_imm
-; CHECK: cmn w0, #7
-; CHECK: cset w0, eq
+; CHECK-LABEL: icmp_eq_neg_imm
+; CHECK: cmn w0, #7
+; CHECK-NEXT: cset w0, eq
%cmp = icmp eq i32 %a, -7
%conv = zext i1 %cmp to i32
ret i32 %conv
}
-define i32 @icmp_eq(i32 %a, i32 %b) nounwind ssp {
+define i32 @icmp_eq_i32(i32 %a, i32 %b) nounwind ssp {
entry:
-; CHECK: icmp_eq
-; CHECK: cmp w0, w1
-; CHECK: cset w0, eq
+; CHECK-LABEL: icmp_eq_i32
+; CHECK: cmp w0, w1
+; CHECK-NEXT: cset w0, eq
%cmp = icmp eq i32 %a, %b
%conv = zext i1 %cmp to i32
ret i32 %conv
@@ -32,19 +32,39 @@ entry:
define i32 @icmp_ne(i32 %a, i32 %b) nounwind ssp {
entry:
-; CHECK: icmp_ne
-; CHECK: cmp w0, w1
-; CHECK: cset w0, ne
+; CHECK-LABEL: icmp_ne
+; CHECK: cmp w0, w1
+; CHECK-NEXT: cset w0, ne
%cmp = icmp ne i32 %a, %b
%conv = zext i1 %cmp to i32
ret i32 %conv
}
+define i32 @icmp_eq_ptr(i8* %a) {
+entry:
+; CHECK-LABEL: icmp_eq_ptr
+; CHECK: cmp x0, #0
+; CHECK-NEXT: cset {{.+}}, eq
+ %cmp = icmp eq i8* %a, null
+ %conv = zext i1 %cmp to i32
+ ret i32 %conv
+}
+
+define i32 @icmp_ne_ptr(i8* %a) {
+entry:
+; CHECK-LABEL: icmp_ne_ptr
+; CHECK: cmp x0, #0
+; CHECK-NEXT: cset {{.+}}, ne
+ %cmp = icmp ne i8* %a, null
+ %conv = zext i1 %cmp to i32
+ ret i32 %conv
+}
+
define i32 @icmp_ugt(i32 %a, i32 %b) nounwind ssp {
entry:
-; CHECK: icmp_ugt
-; CHECK: cmp w0, w1
-; CHECK: cset w0, hi
+; CHECK-LABEL: icmp_ugt
+; CHECK: cmp w0, w1
+; CHECK-NEXT: cset w0, hi
%cmp = icmp ugt i32 %a, %b
%conv = zext i1 %cmp to i32
ret i32 %conv
@@ -52,9 +72,9 @@ entry:
define i32 @icmp_uge(i32 %a, i32 %b) nounwind ssp {
entry:
-; CHECK: icmp_uge
-; CHECK: cmp w0, w1
-; CHECK: cset w0, hs
+; CHECK-LABEL: icmp_uge
+; CHECK: cmp w0, w1
+; CHECK-NEXT: cset w0, hs
%cmp = icmp uge i32 %a, %b
%conv = zext i1 %cmp to i32
ret i32 %conv
@@ -62,9 +82,9 @@ entry:
define i32 @icmp_ult(i32 %a, i32 %b) nounwind ssp {
entry:
-; CHECK: icmp_ult
-; CHECK: cmp w0, w1
-; CHECK: cset w0, lo
+; CHECK-LABEL: icmp_ult
+; CHECK: cmp w0, w1
+; CHECK-NEXT: cset w0, lo
%cmp = icmp ult i32 %a, %b
%conv = zext i1 %cmp to i32
ret i32 %conv
@@ -72,9 +92,9 @@ entry:
define i32 @icmp_ule(i32 %a, i32 %b) nounwind ssp {
entry:
-; CHECK: icmp_ule
-; CHECK: cmp w0, w1
-; CHECK: cset w0, ls
+; CHECK-LABEL: icmp_ule
+; CHECK: cmp w0, w1
+; CHECK-NEXT: cset w0, ls
%cmp = icmp ule i32 %a, %b
%conv = zext i1 %cmp to i32
ret i32 %conv
@@ -82,9 +102,9 @@ entry:
define i32 @icmp_sgt(i32 %a, i32 %b) nounwind ssp {
entry:
-; CHECK: icmp_sgt
-; CHECK: cmp w0, w1
-; CHECK: cset w0, gt
+; CHECK-LABEL: icmp_sgt
+; CHECK: cmp w0, w1
+; CHECK-NEXT: cset w0, gt
%cmp = icmp sgt i32 %a, %b
%conv = zext i1 %cmp to i32
ret i32 %conv
@@ -92,9 +112,9 @@ entry:
define i32 @icmp_sge(i32 %a, i32 %b) nounwind ssp {
entry:
-; CHECK: icmp_sge
-; CHECK: cmp w0, w1
-; CHECK: cset w0, ge
+; CHECK-LABEL: icmp_sge
+; CHECK: cmp w0, w1
+; CHECK-NEXT: cset w0, ge
%cmp = icmp sge i32 %a, %b
%conv = zext i1 %cmp to i32
ret i32 %conv
@@ -102,9 +122,9 @@ entry:
define i32 @icmp_slt(i32 %a, i32 %b) nounwind ssp {
entry:
-; CHECK: icmp_slt
-; CHECK: cmp w0, w1
-; CHECK: cset w0, lt
+; CHECK-LABEL: icmp_slt
+; CHECK: cmp w0, w1
+; CHECK-NEXT: cset w0, lt
%cmp = icmp slt i32 %a, %b
%conv = zext i1 %cmp to i32
ret i32 %conv
@@ -112,9 +132,9 @@ entry:
define i32 @icmp_sle(i32 %a, i32 %b) nounwind ssp {
entry:
-; CHECK: icmp_sle
-; CHECK: cmp w0, w1
-; CHECK: cset w0, le
+; CHECK-LABEL: icmp_sle
+; CHECK: cmp w0, w1
+; CHECK-NEXT: cset w0, le
%cmp = icmp sle i32 %a, %b
%conv = zext i1 %cmp to i32
ret i32 %conv
@@ -122,9 +142,9 @@ entry:
define i32 @icmp_i64(i64 %a, i64 %b) nounwind ssp {
entry:
-; CHECK: icmp_i64
-; CHECK: cmp x0, x1
-; CHECK: cset w{{[0-9]+}}, le
+; CHECK-LABEL: icmp_i64
+; CHECK: cmp x0, x1
+; CHECK-NEXT: cset w{{[0-9]+}}, le
%cmp = icmp sle i64 %a, %b
%conv = zext i1 %cmp to i32
ret i32 %conv
@@ -132,33 +152,30 @@ entry:
define zeroext i1 @icmp_eq_i16(i16 %a, i16 %b) nounwind ssp {
entry:
-; CHECK: icmp_eq_i16
-; CHECK: sxth w0, w0
-; CHECK: sxth w1, w1
-; CHECK: cmp w0, w1
-; CHECK: cset w0, eq
+; CHECK-LABEL: icmp_eq_i16
+; CHECK: sxth w0, w0
+; CHECK: cmp w0, w1, sxth
+; CHECK-NEXT: cset w0, eq
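+; The second sxth is gone: the extension of %b is folded into the compare
+; as an extended-register operand. The i8 and unsigned variants below check
+; the same folding with sxtb and uxth.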
%cmp = icmp eq i16 %a, %b
ret i1 %cmp
}
define zeroext i1 @icmp_eq_i8(i8 %a, i8 %b) nounwind ssp {
entry:
-; CHECK: icmp_eq_i8
-; CHECK: sxtb w0, w0
-; CHECK: sxtb w1, w1
-; CHECK: cmp w0, w1
-; CHECK: cset w0, eq
+; CHECK-LABEL: icmp_eq_i8
+; CHECK: sxtb w0, w0
+; CHECK-NEXT: cmp w0, w1, sxtb
+; CHECK-NEXT: cset w0, eq
%cmp = icmp eq i8 %a, %b
ret i1 %cmp
}
define i32 @icmp_i16_unsigned(i16 %a, i16 %b) nounwind {
entry:
-; CHECK: icmp_i16_unsigned
-; CHECK: uxth w0, w0
-; CHECK: uxth w1, w1
-; CHECK: cmp w0, w1
-; CHECK: cset w0, lo
+; CHECK-LABEL: icmp_i16_unsigned
+; CHECK: uxth w0, w0
+; CHECK-NEXT: cmp w0, w1, uxth
+; CHECK-NEXT: cset w0, lo
%cmp = icmp ult i16 %a, %b
%conv2 = zext i1 %cmp to i32
ret i32 %conv2
@@ -166,24 +183,34 @@ entry:
define i32 @icmp_i8_signed(i8 %a, i8 %b) nounwind {
entry:
-; CHECK: @icmp_i8_signed
-; CHECK: sxtb w0, w0
-; CHECK: sxtb w1, w1
-; CHECK: cmp w0, w1
-; CHECK: cset w0, gt
+; CHECK-LABEL: icmp_i8_signed
+; CHECK: sxtb w0, w0
+; CHECK-NEXT: cmp w0, w1, sxtb
+; CHECK-NEXT: cset w0, gt
%cmp = icmp sgt i8 %a, %b
%conv2 = zext i1 %cmp to i32
ret i32 %conv2
}
+define i32 @icmp_i1_signed(i1 %a, i1 %b) nounwind {
+entry:
+; CHECK-LABEL: icmp_i1_signed
+; CHECK: sbfx [[REG1:w[0-9]+]], w0, #0, #1
+; CHECK-NEXT: sbfx [[REG2:w[0-9]+]], w1, #0, #1
+; CHECK-NEXT: cmp [[REG1]], [[REG2]]
+; CHECK-NEXT: cset w0, gt
+ %cmp = icmp sgt i1 %a, %b
+ %conv2 = zext i1 %cmp to i32
+ ret i32 %conv2
+}
define i32 @icmp_i16_signed_const(i16 %a) nounwind {
entry:
-; CHECK: icmp_i16_signed_const
-; CHECK: sxth w0, w0
-; CHECK: cmn w0, #233
-; CHECK: cset w0, lt
-; CHECK: and w0, w0, #0x1
+; CHECK-LABEL: icmp_i16_signed_const
+; CHECK: sxth w0, w0
+; CHECK-NEXT: cmn w0, #233
+; CHECK-NEXT: cset w0, lt
+; CHECK-NEXT: and w0, w0, #0x1
%cmp = icmp slt i16 %a, -233
%conv2 = zext i1 %cmp to i32
ret i32 %conv2
@@ -191,11 +218,11 @@ entry:
define i32 @icmp_i8_signed_const(i8 %a) nounwind {
entry:
-; CHECK: icmp_i8_signed_const
-; CHECK: sxtb w0, w0
-; CHECK: cmp w0, #124
-; CHECK: cset w0, gt
-; CHECK: and w0, w0, #0x1
+; CHECK-LABEL: icmp_i8_signed_const
+; CHECK: sxtb w0, w0
+; CHECK-NEXT: cmp w0, #124
+; CHECK-NEXT: cset w0, gt
+; CHECK-NEXT: and w0, w0, #0x1
%cmp = icmp sgt i8 %a, 124
%conv2 = zext i1 %cmp to i32
ret i32 %conv2
@@ -203,11 +230,11 @@ entry:
define i32 @icmp_i1_unsigned_const(i1 %a) nounwind {
entry:
-; CHECK: icmp_i1_unsigned_const
-; CHECK: and w0, w0, #0x1
-; CHECK: cmp w0, #0
-; CHECK: cset w0, lo
-; CHECK: and w0, w0, #0x1
+; CHECK-LABEL: icmp_i1_unsigned_const
+; CHECK: and w0, w0, #0x1
+; CHECK-NEXT: cmp w0, #0
+; CHECK-NEXT: cset w0, lo
+; CHECK-NEXT: and w0, w0, #0x1
%cmp = icmp ult i1 %a, 0
%conv2 = zext i1 %cmp to i32
ret i32 %conv2
diff --git a/test/CodeGen/AArch64/arm64-fast-isel-indirectbr.ll b/test/CodeGen/AArch64/arm64-fast-isel-indirectbr.ll
index 70335ac..a5f4524 100644
--- a/test/CodeGen/AArch64/arm64-fast-isel-indirectbr.ll
+++ b/test/CodeGen/AArch64/arm64-fast-isel-indirectbr.ll
@@ -1,10 +1,10 @@
-; RUN: llc < %s -O0 -fast-isel-abort -mtriple=arm64-apple-darwin | FileCheck %s
+; RUN: llc -O0 -fast-isel-abort -verify-machineinstrs -mtriple=arm64-apple-darwin < %s | FileCheck %s
@fn.table = internal global [2 x i8*] [i8* blockaddress(@fn, %ZERO), i8* blockaddress(@fn, %ONE)], align 8
define i32 @fn(i32 %target) nounwind {
entry:
-; CHECK: @fn
+; CHECK-LABEL: fn
%retval = alloca i32, align 4
%target.addr = alloca i32, align 4
store i32 %target, i32* %target.addr, align 4
@@ -29,8 +29,8 @@ return: ; preds = %ONE, %ZERO
ret i32 %2
indirectgoto: ; preds = %entry
-; CHECK: ldr x0, [sp]
-; CHECK: br x0
+; CHECK: ldr [[REG:x[0-9]+]], [sp]
+; CHECK-NEXT: br [[REG]]
%indirect.goto.dest = phi i8* [ %1, %entry ]
indirectbr i8* %indirect.goto.dest, [label %ZERO, label %ONE]
}
diff --git a/test/CodeGen/AArch64/arm64-fast-isel-intrinsic.ll b/test/CodeGen/AArch64/arm64-fast-isel-intrinsic.ll
index 1152988..9ac3e44 100644
--- a/test/CodeGen/AArch64/arm64-fast-isel-intrinsic.ll
+++ b/test/CodeGen/AArch64/arm64-fast-isel-intrinsic.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=arm64-apple-ios | FileCheck %s --check-prefix=ARM64
+; RUN: llc -O0 -fast-isel-abort -verify-machineinstrs -relocation-model=dynamic-no-pic -mtriple=arm64-apple-ios < %s | FileCheck %s --check-prefix=ARM64
@message = global [80 x i8] c"The LLVM Compiler Infrastructure\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00", align 16
@temp = common global [80 x i8] zeroinitializer, align 16
@@ -7,7 +7,7 @@ define void @t1() {
; ARM64-LABEL: t1
; ARM64: adrp x8, _message@PAGE
; ARM64: add x0, x8, _message@PAGEOFF
-; ARM64: movz w9, #0
+; ARM64: mov w9, wzr
; ARM64: movz x2, #0x50
; ARM64: uxtb w1, w9
; ARM64: bl _memset
diff --git a/test/CodeGen/AArch64/arm64-fast-isel-materialize.ll b/test/CodeGen/AArch64/arm64-fast-isel-materialize.ll
index ffac131..1dea5d9 100644
--- a/test/CodeGen/AArch64/arm64-fast-isel-materialize.ll
+++ b/test/CodeGen/AArch64/arm64-fast-isel-materialize.ll
@@ -1,27 +1,41 @@
-; RUN: llc < %s -O0 -fast-isel-abort -mtriple=arm64-apple-darwin | FileCheck %s
+; RUN: llc -O0 -fast-isel-abort -verify-machineinstrs -mtriple=arm64-apple-darwin < %s | FileCheck %s
; Materialize using fmov
-define void @float_(float* %value) {
-; CHECK: @float_
-; CHECK: fmov s0, #1.25000000
- store float 1.250000e+00, float* %value, align 4
- ret void
+define float @fmov_float1() {
+; CHECK-LABEL: fmov_float1
+; CHECK: fmov s0, #1.25000000
+ ret float 1.250000e+00
}
-define void @double_(double* %value) {
-; CHECK: @double_
-; CHECK: fmov d0, #1.25000000
- store double 1.250000e+00, double* %value, align 8
- ret void
+define float @fmov_float2() {
+; CHECK-LABEL: fmov_float2
+; CHECK: fmov s0, wzr
+ ret float 0.0e+00
+}
+
+define double @fmov_double1() {
+; CHECK-LABEL: fmov_double1
+; CHECK: fmov d0, #1.25000000
+ ret double 1.250000e+00
+}
+
+define double @fmov_double2() {
+; CHECK-LABEL: fmov_double2
+; CHECK: fmov d0, xzr
+ ret double 0.0e+00
}
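+; +0.0 is all-zero bits, so it can be produced by an fmov from wzr/xzr
+; instead of a constant-pool load; the cp_ tests below still need the pool.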
; Materialize from constant pool
-define float @float_cp() {
-; CHECK: @float_cp
+define float @cp_float() {
+; CHECK-LABEL: cp_float
+; CHECK: adrp [[REG:x[0-9]+]], {{lCPI[0-9]+_0}}@PAGE
+; CHECK-NEXT: ldr s0, {{\[}}[[REG]], {{lCPI[0-9]+_0}}@PAGEOFF{{\]}}
ret float 0x400921FB60000000
}
-define double @double_cp() {
-; CHECK: @double_cp
+define double @cp_double() {
+; CHECK-LABEL: cp_double
+; CHECK: adrp [[REG:x[0-9]+]], {{lCPI[0-9]+_0}}@PAGE
+; CHECK-NEXT: ldr d0, {{\[}}[[REG]], {{lCPI[0-9]+_0}}@PAGEOFF{{\]}}
ret double 0x400921FB54442D18
}
diff --git a/test/CodeGen/AArch64/arm64-fast-isel-noconvert.ll b/test/CodeGen/AArch64/arm64-fast-isel-noconvert.ll
index 483d179..81daa7c 100644
--- a/test/CodeGen/AArch64/arm64-fast-isel-noconvert.ll
+++ b/test/CodeGen/AArch64/arm64-fast-isel-noconvert.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=arm64-apple-ios -O0 %s -o - | FileCheck %s
+; RUN: llc -O0 -verify-machineinstrs -mtriple=aarch64-apple-ios < %s | FileCheck %s
; Fast-isel can't do vector conversions yet, but it was emitting some highly
; suspect UCVTFUWDri MachineInstrs.
diff --git a/test/CodeGen/AArch64/arm64-fast-isel-rem.ll b/test/CodeGen/AArch64/arm64-fast-isel-rem.ll
index d5bdbaa..26f9afa 100644
--- a/test/CodeGen/AArch64/arm64-fast-isel-rem.ll
+++ b/test/CodeGen/AArch64/arm64-fast-isel-rem.ll
@@ -1,7 +1,6 @@
-; RUN: llc < %s -O0 -fast-isel-abort -mtriple=arm64-apple-darwin | FileCheck %s
+; RUN: llc -O0 -fast-isel-abort -verify-machineinstrs -mtriple=arm64-apple-darwin < %s | FileCheck %s
; RUN: llc %s -O0 -fast-isel-abort -mtriple=arm64-apple-darwin -print-machineinstrs=expand-isel-pseudos -o /dev/null 2> %t
; RUN: FileCheck %s < %t --check-prefix=CHECK-SSA
-; REQUIRES: asserts
; CHECK-SSA-LABEL: Machine code for function t1
diff --git a/test/CodeGen/AArch64/arm64-fast-isel-ret.ll b/test/CodeGen/AArch64/arm64-fast-isel-ret.ll
index d91fd28..f84c755 100644
--- a/test/CodeGen/AArch64/arm64-fast-isel-ret.ll
+++ b/test/CodeGen/AArch64/arm64-fast-isel-ret.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -O0 -fast-isel-abort -mtriple=arm64-apple-darwin | FileCheck %s
+; RUN: llc -O0 -fast-isel-abort -verify-machineinstrs -mtriple=arm64-apple-darwin < %s | FileCheck %s
;; Test returns.
define void @t0() nounwind ssp {
diff --git a/test/CodeGen/AArch64/arm64-fast-isel-select.ll b/test/CodeGen/AArch64/arm64-fast-isel-select.ll
deleted file mode 100644
index 1cc207f..0000000
--- a/test/CodeGen/AArch64/arm64-fast-isel-select.ll
+++ /dev/null
@@ -1,63 +0,0 @@
-; RUN: llc < %s -O0 -fast-isel-abort -mtriple=arm64-apple-darwin | FileCheck %s
-
-define i32 @t1(i32 %c) nounwind readnone {
-entry:
-; CHECK: @t1
-; CHECK: and w0, w0, #0x1
-; CHECK: subs w0, w0, #0
-; CHECK: csel w0, w{{[0-9]+}}, w{{[0-9]+}}, ne
- %0 = icmp sgt i32 %c, 1
- %1 = select i1 %0, i32 123, i32 357
- ret i32 %1
-}
-
-define i64 @t2(i32 %c) nounwind readnone {
-entry:
-; CHECK: @t2
-; CHECK: and w0, w0, #0x1
-; CHECK: subs w0, w0, #0
-; CHECK: csel x0, x{{[0-9]+}}, x{{[0-9]+}}, ne
- %0 = icmp sgt i32 %c, 1
- %1 = select i1 %0, i64 123, i64 357
- ret i64 %1
-}
-
-define i32 @t3(i1 %c, i32 %a, i32 %b) nounwind readnone {
-entry:
-; CHECK: @t3
-; CHECK: and w0, w0, #0x1
-; CHECK: subs w0, w0, #0
-; CHECK: csel w0, w{{[0-9]+}}, w{{[0-9]+}}, ne
- %0 = select i1 %c, i32 %a, i32 %b
- ret i32 %0
-}
-
-define i64 @t4(i1 %c, i64 %a, i64 %b) nounwind readnone {
-entry:
-; CHECK: @t4
-; CHECK: and w0, w0, #0x1
-; CHECK: subs w0, w0, #0
-; CHECK: csel x0, x{{[0-9]+}}, x{{[0-9]+}}, ne
- %0 = select i1 %c, i64 %a, i64 %b
- ret i64 %0
-}
-
-define float @t5(i1 %c, float %a, float %b) nounwind readnone {
-entry:
-; CHECK: @t5
-; CHECK: and w0, w0, #0x1
-; CHECK: subs w0, w0, #0
-; CHECK: fcsel s0, s0, s1, ne
- %0 = select i1 %c, float %a, float %b
- ret float %0
-}
-
-define double @t6(i1 %c, double %a, double %b) nounwind readnone {
-entry:
-; CHECK: @t6
-; CHECK: and w0, w0, #0x1
-; CHECK: subs w0, w0, #0
-; CHECK: fcsel d0, d0, d1, ne
- %0 = select i1 %c, double %a, double %b
- ret double %0
-}
diff --git a/test/CodeGen/AArch64/arm64-fast-isel-store.ll b/test/CodeGen/AArch64/arm64-fast-isel-store.ll
new file mode 100644
index 0000000..9494d55
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-fast-isel-store.ll
@@ -0,0 +1,30 @@
+; RUN: llc -mtriple=aarch64-unknown-unknown -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-unknown-unknown -fast-isel -fast-isel-abort -verify-machineinstrs < %s | FileCheck %s
+
+define void @store_i8(i8* %a) {
+; CHECK-LABEL: store_i8
+; CHECK: strb wzr, [x0]
+ store i8 0, i8* %a
+ ret void
+}
+
+define void @store_i16(i16* %a) {
+; CHECK-LABEL: store_i16
+; CHECK: strh wzr, [x0]
+ store i16 0, i16* %a
+ ret void
+}
+
+define void @store_i32(i32* %a) {
+; CHECK-LABEL: store_i32
+; CHECK: str wzr, [x0]
+ store i32 0, i32* %a
+ ret void
+}
+
+define void @store_i64(i64* %a) {
+; CHECK-LABEL: store_i64
+; CHECK: str xzr, [x0]
+ store i64 0, i64* %a
+ ret void
+}
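+; In all four cases the zero is stored directly from wzr/xzr, so no
+; immediate has to be materialized into a scratch register first.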
diff --git a/test/CodeGen/AArch64/arm64-fast-isel.ll b/test/CodeGen/AArch64/arm64-fast-isel.ll
index 0194b3a..4349946 100644
--- a/test/CodeGen/AArch64/arm64-fast-isel.ll
+++ b/test/CodeGen/AArch64/arm64-fast-isel.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -O0 -fast-isel-abort -mtriple=arm64-apple-darwin | FileCheck %s
+; RUN: llc -O0 -fast-isel-abort -verify-machineinstrs -mtriple=arm64-apple-darwin < %s | FileCheck %s
define void @t0(i32 %a) nounwind {
entry:
@@ -66,8 +66,7 @@ entry:
define void @t4(i32 *%ptr) nounwind {
entry:
; CHECK-LABEL: t4:
-; CHECK: movz w8, #0
-; CHECK: stur w8, [x0, #-4]
+; CHECK: stur wzr, [x0, #-4]
; CHECK: ret
%0 = getelementptr i32 *%ptr, i32 -1
store i32 0, i32* %0, align 4
@@ -77,8 +76,7 @@ entry:
define void @t5(i32 *%ptr) nounwind {
entry:
; CHECK-LABEL: t5:
-; CHECK: movz w8, #0
-; CHECK: stur w8, [x0, #-256]
+; CHECK: stur wzr, [x0, #-256]
; CHECK: ret
%0 = getelementptr i32 *%ptr, i32 -64
store i32 0, i32* %0, align 4
diff --git a/test/CodeGen/AArch64/arm64-frameaddr.ll b/test/CodeGen/AArch64/arm64-frameaddr.ll
deleted file mode 100644
index 469078c..0000000
--- a/test/CodeGen/AArch64/arm64-frameaddr.ll
+++ /dev/null
@@ -1,15 +0,0 @@
-; RUN: llc < %s -march=arm64 | FileCheck %s
-
-define i8* @t() nounwind {
-entry:
-; CHECK-LABEL: t:
-; CHECK: stp x29, x30, [sp, #-16]!
-; CHECK: mov x29, sp
-; CHECK: mov x0, x29
-; CHECK: ldp x29, x30, [sp], #16
-; CHECK: ret
- %0 = call i8* @llvm.frameaddress(i32 0)
- ret i8* %0
-}
-
-declare i8* @llvm.frameaddress(i32) nounwind readnone
diff --git a/test/CodeGen/AArch64/arm64-indexed-memory.ll b/test/CodeGen/AArch64/arm64-indexed-memory.ll
index e501c6e..a8620f4 100644
--- a/test/CodeGen/AArch64/arm64-indexed-memory.ll
+++ b/test/CodeGen/AArch64/arm64-indexed-memory.ll
@@ -349,3 +349,15 @@ define i8* @preidx8sext64(i8* %src, i64* %out) {
store i64 %ext, i64* %out, align 4
ret i8* %ptr
}
+
+; This test checks that an illegal post-index store is not generated.
+
+define i64* @postidx_clobber(i64* %addr) nounwind noinline ssp {
+; CHECK-LABEL: postidx_clobber:
+; CHECK-NOT: str x0, [x0], #8
+; CHECK: ret
+ %paddr = bitcast i64* %addr to i64**
+ store i64* %addr, i64** %paddr
+ %newaddr = getelementptr i64* %addr, i32 1
+ ret i64* %newaddr
+}
diff --git a/test/CodeGen/AArch64/arm64-inline-asm.ll b/test/CodeGen/AArch64/arm64-inline-asm.ll
index d76cca3..9c8bcaa 100644
--- a/test/CodeGen/AArch64/arm64-inline-asm.ll
+++ b/test/CodeGen/AArch64/arm64-inline-asm.ll
@@ -87,13 +87,17 @@ entry:
ret i32 %1
}
-define i32 @constraint_J(i32 %i, i32 %j) nounwind {
+define i32 @constraint_J(i32 %i, i32 %j, i64 %k) nounwind {
entry:
; CHECK-LABEL: constraint_J:
%0 = tail call i32 asm sideeffect "sub ${0:w}, ${1:w}, $2", "=r,r,J"(i32 %i, i32 -16773120) nounwind
- ; CHECK: sub {{w[0-9]+}}, {{w[0-9]+}}, #4278194176
+ ; CHECK: sub {{w[0-9]+}}, {{w[0-9]+}}, #-16773120
%1 = tail call i32 asm sideeffect "sub ${0:w}, ${1:w}, $2", "=r,r,J"(i32 %i, i32 -1) nounwind
- ; CHECK: sub {{w[0-9]+}}, {{w[0-9]+}}, #4294967295
+ ; CHECK: sub {{w[0-9]+}}, {{w[0-9]+}}, #-1
+ %2 = tail call i64 asm sideeffect "sub ${0:x}, ${1:x}, $2", "=r,r,J"(i64 %k, i32 -1) nounwind
+ ; CHECK: sub {{x[0-9]+}}, {{x[0-9]+}}, #-1
+ %3 = tail call i64 asm sideeffect "sub ${0:x}, ${1:x}, $2", "=r,r,J"(i64 %k, i64 -1) nounwind
+ ; CHECK: sub {{x[0-9]+}}, {{x[0-9]+}}, #-1
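+ ; Negative add immediates matched by the J constraint are now printed as
+ ; signed values (here -1 and -16773120) rather than as their unsigned
+ ; 32-bit equivalents.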
ret i32 %1
}
diff --git a/test/CodeGen/AArch64/arm64-patchpoint-scratch-regs.ll b/test/CodeGen/AArch64/arm64-patchpoint-scratch-regs.ll
new file mode 100644
index 0000000..d39722b
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-patchpoint-scratch-regs.ll
@@ -0,0 +1,18 @@
+; RUN: llc -mtriple=arm64-apple-darwin -enable-misched=0 -mcpu=cyclone < %s | FileCheck %s
+
+; Test that scratch registers are spilled around patchpoints
+; CHECK: InlineAsm End
+; CHECK-NEXT: mov x{{[0-9]+}}, x16
+; CHECK-NEXT: mov x{{[0-9]+}}, x17
+; CHECK-NEXT: Ltmp
+; CHECK-NEXT: nop
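+; x16 and x17 are the only GPRs the inline asm below leaves unclobbered, so
+; %p and %v live there across it; the patchpoint may use both as scratch,
+; hence the copies checked above.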
+define void @clobberScratch(i32* %p) {
+ %v = load i32* %p
+ tail call void asm sideeffect "nop", "~{x0},~{x1},~{x2},~{x3},~{x4},~{x5},~{x6},~{x7},~{x8},~{x9},~{x10},~{x11},~{x12},~{x13},~{x14},~{x15},~{x18},~{x19},~{x20},~{x21},~{x22},~{x23},~{x24},~{x25},~{x26},~{x27},~{x28},~{x29},~{x30},~{x31}"() nounwind
+ tail call void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 5, i32 20, i8* null, i32 0, i32* %p, i32 %v)
+ store i32 %v, i32* %p
+ ret void
+}
+
+declare void @llvm.experimental.patchpoint.void(i64, i32, i8*, i32, ...)
+
diff --git a/test/CodeGen/AArch64/arm64-patchpoint-webkit_jscc.ll b/test/CodeGen/AArch64/arm64-patchpoint-webkit_jscc.ll
new file mode 100644
index 0000000..8f79f80
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-patchpoint-webkit_jscc.ll
@@ -0,0 +1,118 @@
+; RUN: llc -mtriple=arm64-apple-darwin -enable-misched=0 -mcpu=cyclone < %s | FileCheck %s
+; RUN: llc -mtriple=arm64-apple-darwin -enable-misched=0 -mcpu=cyclone -fast-isel < %s | FileCheck %s --check-prefix=FAST
+
+; One argument will be passed in a register, the other will be pushed onto the stack.
+; Return value in x0.
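+; The callee address 281474417671919 is 0xffffdeadbeef; it is materialized
+; into the scratch register x16 by the movz/movk sequence checked below.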
+define void @jscall_patchpoint_codegen(i64 %p1, i64 %p2, i64 %p3, i64 %p4) {
+entry:
+; CHECK-LABEL: jscall_patchpoint_codegen:
+; CHECK: Ltmp
+; CHECK: str x{{.+}}, [sp]
+; CHECK-NEXT: mov x0, x{{.+}}
+; CHECK: Ltmp
+; CHECK-NEXT: movz x16, #0xffff, lsl #32
+; CHECK-NEXT: movk x16, #0xdead, lsl #16
+; CHECK-NEXT: movk x16, #0xbeef
+; CHECK-NEXT: blr x16
+; FAST-LABEL: jscall_patchpoint_codegen:
+; FAST: Ltmp
+; FAST: str x{{.+}}, [sp]
+; FAST: Ltmp
+; FAST-NEXT: movz x16, #0xffff, lsl #32
+; FAST-NEXT: movk x16, #0xdead, lsl #16
+; FAST-NEXT: movk x16, #0xbeef
+; FAST-NEXT: blr x16
+ %resolveCall2 = inttoptr i64 281474417671919 to i8*
+ %result = tail call webkit_jscc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 5, i32 20, i8* %resolveCall2, i32 2, i64 %p4, i64 %p2)
+ %resolveCall3 = inttoptr i64 244837814038255 to i8*
+ tail call webkit_jscc void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 6, i32 20, i8* %resolveCall3, i32 2, i64 %p4, i64 %result)
+ ret void
+}
+
+; Test that the arguments are properly aligned and that we don't store undef arguments.
+define i64 @jscall_patchpoint_codegen2(i64 %callee) {
+entry:
+; CHECK-LABEL: jscall_patchpoint_codegen2:
+; CHECK: Ltmp
+; CHECK: orr w[[REG:[0-9]+]], wzr, #0x6
+; CHECK-NEXT: str x[[REG]], [sp, #24]
+; CHECK-NEXT: orr w[[REG:[0-9]+]], wzr, #0x4
+; CHECK-NEXT: str w[[REG]], [sp, #16]
+; CHECK-NEXT: orr w[[REG:[0-9]+]], wzr, #0x2
+; CHECK-NEXT: str x[[REG]], [sp]
+; CHECK: Ltmp
+; CHECK-NEXT: movz x16, #0xffff, lsl #32
+; CHECK-NEXT: movk x16, #0xdead, lsl #16
+; CHECK-NEXT: movk x16, #0xbeef
+; CHECK-NEXT: blr x16
+; FAST-LABEL: jscall_patchpoint_codegen2:
+; FAST: Ltmp
+; FAST: orr [[REG1:x[0-9]+]], xzr, #0x2
+; FAST-NEXT: orr [[REG2:w[0-9]+]], wzr, #0x4
+; FAST-NEXT: orr [[REG3:x[0-9]+]], xzr, #0x6
+; FAST-NEXT: str [[REG1]], [sp]
+; FAST-NEXT: str [[REG2]], [sp, #16]
+; FAST-NEXT: str [[REG3]], [sp, #24]
+; FAST: Ltmp
+; FAST-NEXT: movz x16, #0xffff, lsl #32
+; FAST-NEXT: movk x16, #0xdead, lsl #16
+; FAST-NEXT: movk x16, #0xbeef
+; FAST-NEXT: blr x16
+ %call = inttoptr i64 281474417671919 to i8*
+ %result = call webkit_jscc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 7, i32 20, i8* %call, i32 6, i64 %callee, i64 2, i64 undef, i32 4, i32 undef, i64 6)
+ ret i64 %result
+}
+
+; Test that the arguments are properly aligned and that we don't store undef arguments.
+define i64 @jscall_patchpoint_codegen3(i64 %callee) {
+entry:
+; CHECK-LABEL: jscall_patchpoint_codegen3:
+; CHECK: Ltmp
+; CHECK: movz w[[REG:[0-9]+]], #0xa
+; CHECK-NEXT: str x[[REG]], [sp, #48]
+; CHECK-NEXT: orr w[[REG:[0-9]+]], wzr, #0x8
+; CHECK-NEXT: str w[[REG]], [sp, #36]
+; CHECK-NEXT: orr w[[REG:[0-9]+]], wzr, #0x6
+; CHECK-NEXT: str x[[REG]], [sp, #24]
+; CHECK-NEXT: orr w[[REG:[0-9]+]], wzr, #0x4
+; CHECK-NEXT: str w[[REG]], [sp, #16]
+; CHECK-NEXT: orr w[[REG:[0-9]+]], wzr, #0x2
+; CHECK-NEXT: str x[[REG]], [sp]
+; CHECK: Ltmp
+; CHECK-NEXT: movz x16, #0xffff, lsl #32
+; CHECK-NEXT: movk x16, #0xdead, lsl #16
+; CHECK-NEXT: movk x16, #0xbeef
+; CHECK-NEXT: blr x16
+; FAST-LABEL: jscall_patchpoint_codegen3:
+; FAST: Ltmp
+; FAST: orr [[REG1:x[0-9]+]], xzr, #0x2
+; FAST-NEXT: orr [[REG2:w[0-9]+]], wzr, #0x4
+; FAST-NEXT: orr [[REG3:x[0-9]+]], xzr, #0x6
+; FAST-NEXT: orr [[REG4:w[0-9]+]], wzr, #0x8
+; FAST-NEXT: movz [[REG5:x[0-9]+]], #0xa
+; FAST-NEXT: str [[REG1]], [sp]
+; FAST-NEXT: str [[REG2]], [sp, #16]
+; FAST-NEXT: str [[REG3]], [sp, #24]
+; FAST-NEXT: str [[REG4]], [sp, #36]
+; FAST-NEXT: str [[REG5]], [sp, #48]
+; FAST: Ltmp
+; FAST-NEXT: movz x16, #0xffff, lsl #32
+; FAST-NEXT: movk x16, #0xdead, lsl #16
+; FAST-NEXT: movk x16, #0xbeef
+; FAST-NEXT: blr x16
+ %call = inttoptr i64 281474417671919 to i8*
+ %result = call webkit_jscc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 7, i32 20, i8* %call, i32 10, i64 %callee, i64 2, i64 undef, i32 4, i32 undef, i64 6, i32 undef, i32 8, i32 undef, i64 10)
+ ret i64 %result
+}
+
+; CHECK-LABEL: test_i16:
+; CHECK: ldrh [[BREG:w[0-9]+]], [sp]
+; CHECK: add {{w[0-9]+}}, w0, [[BREG]]
+define webkit_jscc zeroext i16 @test_i16(i16 zeroext %a, i16 zeroext %b) {
+ %sum = add i16 %a, %b
+ ret i16 %sum
+}
+
+declare void @llvm.experimental.patchpoint.void(i64, i32, i8*, i32, ...)
+declare i64 @llvm.experimental.patchpoint.i64(i64, i32, i8*, i32, ...)
+
diff --git a/test/CodeGen/AArch64/arm64-patchpoint.ll b/test/CodeGen/AArch64/arm64-patchpoint.ll
index 039cdfc..278cba5 100644
--- a/test/CodeGen/AArch64/arm64-patchpoint.ll
+++ b/test/CodeGen/AArch64/arm64-patchpoint.ll
@@ -1,4 +1,5 @@
-; RUN: llc < %s -mtriple=arm64-apple-darwin -enable-misched=0 -mcpu=cyclone | FileCheck %s
+; RUN: llc -mtriple=arm64-apple-darwin -enable-misched=0 -mcpu=cyclone < %s | FileCheck %s
+; RUN: llc -mtriple=arm64-apple-darwin -enable-misched=0 -mcpu=cyclone -fast-isel -fast-isel-abort < %s | FileCheck %s
; Trivial patchpoint codegen
;
@@ -41,73 +42,6 @@ entry:
ret void
}
-; Test the webkit_jscc calling convention.
-; One argument will be passed in register, the other will be pushed on the stack.
-; Return value in x0.
-define void @jscall_patchpoint_codegen(i64 %p1, i64 %p2, i64 %p3, i64 %p4) {
-entry:
-; CHECK-LABEL: jscall_patchpoint_codegen:
-; CHECK: Ltmp
-; CHECK: str x{{.+}}, [sp]
-; CHECK-NEXT: mov x0, x{{.+}}
-; CHECK: Ltmp
-; CHECK-NEXT: movz x16, #0xffff, lsl #32
-; CHECK-NEXT: movk x16, #0xdead, lsl #16
-; CHECK-NEXT: movk x16, #0xbeef
-; CHECK-NEXT: blr x16
- %resolveCall2 = inttoptr i64 281474417671919 to i8*
- %result = tail call webkit_jscc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 5, i32 20, i8* %resolveCall2, i32 2, i64 %p4, i64 %p2)
- %resolveCall3 = inttoptr i64 244837814038255 to i8*
- tail call webkit_jscc void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 6, i32 20, i8* %resolveCall3, i32 2, i64 %p4, i64 %result)
- ret void
-}
-
-; Test if the arguments are properly aligned and that we don't store undef arguments.
-define i64 @jscall_patchpoint_codegen2(i64 %callee) {
-entry:
-; CHECK-LABEL: jscall_patchpoint_codegen2:
-; CHECK: Ltmp
-; CHECK: orr w{{.+}}, wzr, #0x6
-; CHECK-NEXT: str x{{.+}}, [sp, #24]
-; CHECK-NEXT: orr w{{.+}}, wzr, #0x4
-; CHECK-NEXT: str w{{.+}}, [sp, #16]
-; CHECK-NEXT: orr w{{.+}}, wzr, #0x2
-; CHECK-NEXT: str x{{.+}}, [sp]
-; CHECK: Ltmp
-; CHECK-NEXT: movz x16, #0xffff, lsl #32
-; CHECK-NEXT: movk x16, #0xdead, lsl #16
-; CHECK-NEXT: movk x16, #0xbeef
-; CHECK-NEXT: blr x16
- %call = inttoptr i64 281474417671919 to i8*
- %result = call webkit_jscc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 7, i32 20, i8* %call, i32 6, i64 %callee, i64 2, i64 undef, i32 4, i32 undef, i64 6)
- ret i64 %result
-}
-
-; Test if the arguments are properly aligned and that we don't store undef arguments.
-define i64 @jscall_patchpoint_codegen3(i64 %callee) {
-entry:
-; CHECK-LABEL: jscall_patchpoint_codegen3:
-; CHECK: Ltmp
-; CHECK: movz w{{.+}}, #0xa
-; CHECK-NEXT: str x{{.+}}, [sp, #48]
-; CHECK-NEXT: orr w{{.+}}, wzr, #0x8
-; CHECK-NEXT: str w{{.+}}, [sp, #36]
-; CHECK-NEXT: orr w{{.+}}, wzr, #0x6
-; CHECK-NEXT: str x{{.+}}, [sp, #24]
-; CHECK-NEXT: orr w{{.+}}, wzr, #0x4
-; CHECK-NEXT: str w{{.+}}, [sp, #16]
-; CHECK-NEXT: orr w{{.+}}, wzr, #0x2
-; CHECK-NEXT: str x{{.+}}, [sp]
-; CHECK: Ltmp
-; CHECK-NEXT: movz x16, #0xffff, lsl #32
-; CHECK-NEXT: movk x16, #0xdead, lsl #16
-; CHECK-NEXT: movk x16, #0xbeef
-; CHECK-NEXT: blr x16
- %call = inttoptr i64 281474417671919 to i8*
- %result = call webkit_jscc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 7, i32 20, i8* %call, i32 10, i64 %callee, i64 2, i64 undef, i32 4, i32 undef, i64 6, i32 undef, i32 8, i32 undef, i64 10)
- ret i64 %result
-}
-
; Test patchpoints reusing the same TargetConstant.
; <rdar:15390785> Assertion failed: (CI.getNumArgOperands() >= NumArgs + 4)
; There is no way to verify this, since it depends on memory allocation.
@@ -144,28 +78,7 @@ entry:
ret void
}
-; Test that scratch registers are spilled around patchpoints
-; CHECK: InlineAsm End
-; CHECK-NEXT: mov x{{[0-9]+}}, x16
-; CHECK-NEXT: mov x{{[0-9]+}}, x17
-; CHECK-NEXT: Ltmp
-; CHECK-NEXT: nop
-define void @clobberScratch(i32* %p) {
- %v = load i32* %p
- tail call void asm sideeffect "nop", "~{x0},~{x1},~{x2},~{x3},~{x4},~{x5},~{x6},~{x7},~{x8},~{x9},~{x10},~{x11},~{x12},~{x13},~{x14},~{x15},~{x18},~{x19},~{x20},~{x21},~{x22},~{x23},~{x24},~{x25},~{x26},~{x27},~{x28},~{x29},~{x30},~{x31}"() nounwind
- tail call void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 5, i32 20, i8* null, i32 0, i32* %p, i32 %v)
- store i32 %v, i32* %p
- ret void
-}
-
declare void @llvm.experimental.stackmap(i64, i32, ...)
declare void @llvm.experimental.patchpoint.void(i64, i32, i8*, i32, ...)
declare i64 @llvm.experimental.patchpoint.i64(i64, i32, i8*, i32, ...)
-; CHECK-LABEL: test_i16:
-; CHECK: ldrh [[BREG:w[0-9]+]], [sp]
-; CHECK: add w0, w0, [[BREG]]
-define webkit_jscc i16 @test_i16(i16 zeroext %a, i16 zeroext %b) {
- %sum = add i16 %a, %b
- ret i16 %sum
-}
diff --git a/test/CodeGen/AArch64/arm64-popcnt.ll b/test/CodeGen/AArch64/arm64-popcnt.ll
index 2afade2..117ab3a 100644
--- a/test/CodeGen/AArch64/arm64-popcnt.ll
+++ b/test/CodeGen/AArch64/arm64-popcnt.ll
@@ -1,4 +1,5 @@
; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
+; RUN: llc < %s -march=aarch64 -mattr -neon -aarch64-neon-syntax=apple | FileCheck -check-prefix=CHECK-NONEON %s
define i32 @cnt32_advsimd(i32 %x) nounwind readnone {
%cnt = tail call i32 @llvm.ctpop.i32(i32 %x)
@@ -8,6 +9,13 @@ define i32 @cnt32_advsimd(i32 %x) nounwind readnone {
; CHECK: uaddlv.8b h0, v0
; CHECK: fmov w0, s0
; CHECK: ret
+; CHECK-NONEON-LABEL: cnt32_advsimd
+; CHECK-NONEON-NOT: 8b
+; CHECK-NONEON: and w{{[0-9]+}}, w{{[0-9]+}}, #0x55555555
+; CHECK-NONEON: and w{{[0-9]+}}, w{{[0-9]+}}, #0x33333333
+; CHECK-NONEON: and w{{[0-9]+}}, w{{[0-9]+}}, #0xf0f0f0f
+; CHECK-NONEON: mul
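+; Without NEON, ctpop falls back to the classic SWAR sequence: partial sums
+; under the 0x55/0x33/0x0f masks, then a multiply to accumulate the bytes.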
+
}
define i64 @cnt64_advsimd(i64 %x) nounwind readnone {
@@ -18,6 +26,12 @@ define i64 @cnt64_advsimd(i64 %x) nounwind readnone {
; CHECK: uaddlv.8b h0, v0
; CHECK: fmov w0, s0
; CHECK: ret
+; CHECK-NONEON-LABEL: cnt64_advsimd
+; CHECK-NONEON-NOT: 8b
+; CHECK-NONEON: and x{{[0-9]+}}, x{{[0-9]+}}, #0x5555555555555555
+; CHECK-NONEON: and x{{[0-9]+}}, x{{[0-9]+}}, #0x3333333333333333
+; CHECK-NONEON: and x{{[0-9]+}}, x{{[0-9]+}}, #0xf0f0f0f0f0f0f0f
+; CHECK-NONEON: mul
}
; Do not use AdvSIMD when -mno-implicit-float is specified.
diff --git a/test/CodeGen/AArch64/arm64-prefetch.ll b/test/CodeGen/AArch64/arm64-prefetch.ll
index b2e06ed..9dc6301 100644
--- a/test/CodeGen/AArch64/arm64-prefetch.ll
+++ b/test/CodeGen/AArch64/arm64-prefetch.ll
@@ -17,6 +17,15 @@ entry:
; CHECK: prfum pldl1keep
call void @llvm.prefetch(i8* %tmp, i32 0, i32 3, i32 1)
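+ ; A fourth operand of 0 selects instruction prefetch (pli*) rather than
+ ; data prefetch; locality 0-3 maps to l1strm, l3keep, l2keep and l1keep.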
+ ; CHECK: prfum plil1strm
+ call void @llvm.prefetch(i8* %tmp, i32 0, i32 0, i32 0)
+ ; CHECK: prfum plil3keep
+ call void @llvm.prefetch(i8* %tmp, i32 0, i32 1, i32 0)
+ ; CHECK: prfum plil2keep
+ call void @llvm.prefetch(i8* %tmp, i32 0, i32 2, i32 0)
+ ; CHECK: prfum plil1keep
+ call void @llvm.prefetch(i8* %tmp, i32 0, i32 3, i32 0)
+
; CHECK: prfum pstl1strm
call void @llvm.prefetch(i8* %tmp, i32 1, i32 0, i32 1)
; CHECK: prfum pstl3keep
@@ -57,26 +66,52 @@ entry:
%arrayidx12 = getelementptr inbounds i32* %tmp10, i64 %idxprom
%tmp11 = bitcast i32* %arrayidx12 to i8*
- ; CHECK: prfm pstl1strm
- call void @llvm.prefetch(i8* %tmp11, i32 1, i32 0, i32 1)
+
+ ; CHECK: prfm plil1strm
+ call void @llvm.prefetch(i8* %tmp11, i32 0, i32 0, i32 0)
%tmp12 = load i32** @a, align 8, !tbaa !3
%arrayidx15 = getelementptr inbounds i32* %tmp12, i64 %idxprom
- %tmp13 = bitcast i32* %arrayidx15 to i8*
+ %tmp13 = bitcast i32* %arrayidx3 to i8*
- ; CHECK: prfm pstl3keep
- call void @llvm.prefetch(i8* %tmp13, i32 1, i32 1, i32 1)
+ ; CHECK: prfm plil3keep
+ call void @llvm.prefetch(i8* %tmp13, i32 0, i32 1, i32 0)
%tmp14 = load i32** @a, align 8, !tbaa !3
%arrayidx18 = getelementptr inbounds i32* %tmp14, i64 %idxprom
- %tmp15 = bitcast i32* %arrayidx18 to i8*
+ %tmp15 = bitcast i32* %arrayidx6 to i8*
- ; CHECK: prfm pstl2keep
- call void @llvm.prefetch(i8* %tmp15, i32 1, i32 2, i32 1)
+ ; CHECK: prfm plil2keep
+ call void @llvm.prefetch(i8* %tmp15, i32 0, i32 2, i32 0)
%tmp16 = load i32** @a, align 8, !tbaa !3
%arrayidx21 = getelementptr inbounds i32* %tmp16, i64 %idxprom
- %tmp17 = bitcast i32* %arrayidx21 to i8*
+ %tmp17 = bitcast i32* %arrayidx9 to i8*
+
+ ; CHECK: prfm plil1keep
+ call void @llvm.prefetch(i8* %tmp17, i32 0, i32 3, i32 0)
+ %tmp18 = load i32** @a, align 8, !tbaa !3
+ %arrayidx24 = getelementptr inbounds i32* %tmp18, i64 %idxprom
+ %tmp19 = bitcast i32* %arrayidx12 to i8*
+
+ ; CHECK: prfm pstl1strm
+ call void @llvm.prefetch(i8* %tmp19, i32 1, i32 0, i32 1)
+ %tmp20 = load i32** @a, align 8, !tbaa !3
+ %arrayidx27 = getelementptr inbounds i32* %tmp20, i64 %idxprom
+ %tmp21 = bitcast i32* %arrayidx15 to i8*
+
+ ; CHECK: prfm pstl3keep
+ call void @llvm.prefetch(i8* %tmp21, i32 1, i32 1, i32 1)
+ %tmp22 = load i32** @a, align 8, !tbaa !3
+ %arrayidx30 = getelementptr inbounds i32* %tmp22, i64 %idxprom
+ %tmp23 = bitcast i32* %arrayidx18 to i8*
+
+ ; CHECK: prfm pstl2keep
+ call void @llvm.prefetch(i8* %tmp23, i32 1, i32 2, i32 1)
+ %tmp24 = load i32** @a, align 8, !tbaa !3
+ %arrayidx33 = getelementptr inbounds i32* %tmp24, i64 %idxprom
+ %tmp25 = bitcast i32* %arrayidx21 to i8*
; CHECK: prfm pstl1keep
- call void @llvm.prefetch(i8* %tmp17, i32 1, i32 3, i32 1)
+ call void @llvm.prefetch(i8* %tmp25, i32 1, i32 3, i32 1)
ret void
}
diff --git a/test/CodeGen/AArch64/arm64-scvt.ll b/test/CodeGen/AArch64/arm64-scvt.ll
index 2e006cf..8baaf22 100644
--- a/test/CodeGen/AArch64/arm64-scvt.ll
+++ b/test/CodeGen/AArch64/arm64-scvt.ll
@@ -1,4 +1,5 @@
-; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
+; RUN: llc < %s -march=arm64 -mcpu=cyclone -aarch64-neon-syntax=apple | FileCheck %s
+; RUN: llc < %s -march=arm64 -mcpu=cortex-a57 | FileCheck --check-prefix=CHECK-A57 %s
; rdar://13082402
define float @t1(i32* nocapture %src) nounwind ssp {
@@ -409,6 +410,10 @@ define float @sfct1(i8* nocapture %sp0) {
; CHECK-NEXT: sshll.4s v[[SEXTREG:[0-9]+]], [[SEXTREG1]], #0
; CHECK: scvtf [[REG:s[0-9]+]], s[[SEXTREG]]
; CHECK-NEXT: fmul s0, [[REG]], [[REG]]
+; CHECK-A57-LABEL: sfct1:
+; CHECK-A57: ldrsb w[[REGNUM:[0-9]+]], [x0, #1]
+; CHECK-A57-NEXT: scvtf [[REG:s[0-9]+]], w[[REGNUM]]
+; CHECK-A57-NEXT: fmul s0, [[REG]], [[REG]]
entry:
%addr = getelementptr i8* %sp0, i64 1
%pix_sp0.0.copyload = load i8* %addr, align 1
@@ -466,6 +471,10 @@ define float @sfct5(i8* nocapture %sp0, i64 %offset) {
; CHECK-NEXT: sshll.4s v[[SEXTREG:[0-9]+]], [[SEXTREG1]], #0
; CHECK: scvtf [[REG:s[0-9]+]], s[[SEXTREG]]
; CHECK-NEXT: fmul s0, [[REG]], [[REG]]
+; CHECK-A57-LABEL: sfct5:
+; CHECK-A57: ldrsb w[[REGNUM:[0-9]+]], [x0, x1]
+; CHECK-A57-NEXT: scvtf [[REG:s[0-9]+]], w[[REGNUM]]
+; CHECK-A57-NEXT: fmul s0, [[REG]], [[REG]]
entry:
%addr = getelementptr i8* %sp0, i64 %offset
%pix_sp0.0.copyload = load i8* %addr, align 1
@@ -536,6 +545,10 @@ define double @sfct10(i16* nocapture %sp0) {
; CHECK-NEXT: sshll.2d v[[SEXTREG:[0-9]+]], [[SEXTREG1]], #0
; CHECK: scvtf [[REG:d[0-9]+]], d[[SEXTREG]]
; CHECK-NEXT: fmul d0, [[REG]], [[REG]]
+; CHECK-A57-LABEL: sfct10:
+; CHECK-A57: ldrsh w[[REGNUM:[0-9]+]], [x0, #2]
+; CHECK-A57-NEXT: scvtf [[REG:d[0-9]+]], w[[REGNUM]]
+; CHECK-A57-NEXT: fmul d0, [[REG]], [[REG]]
entry:
%addr = getelementptr i16* %sp0, i64 1
%pix_sp0.0.copyload = load i16* %addr, align 1
@@ -592,6 +605,10 @@ define double @sfct14(i16* nocapture %sp0, i64 %offset) {
; CHECK-NEXT: sshll.2d v[[SEXTREG:[0-9]+]], [[SEXTREG1]], #0
; CHECK: scvtf [[REG:d[0-9]+]], d[[SEXTREG]]
; CHECK-NEXT: fmul d0, [[REG]], [[REG]]
+; CHECK-A57-LABEL: sfct14:
+; CHECK-A57: ldrsh w[[REGNUM:[0-9]+]], [x0, x1, lsl #1]
+; CHECK-A57-NEXT: scvtf [[REG:d[0-9]+]], w[[REGNUM]]
+; CHECK-A57-NEXT: fmul d0, [[REG]], [[REG]]
entry:
%addr = getelementptr i16* %sp0, i64 %offset
%pix_sp0.0.copyload = load i16* %addr, align 1
@@ -636,6 +653,10 @@ entry:
; CHECK-NEXT: sshll.4s v[[SEXTREG:[0-9]+]], [[SEXTREG1]], #0
; CHECK: scvtf [[REG:s[0-9]+]], s[[SEXTREG]]
; CHECK-NEXT: fmul s0, [[REG]], [[REG]]
+; CHECK-A57-LABEL: sfct17:
+; CHECK-A57: ldursb w[[REGNUM:[0-9]+]], [x0, #-1]
+; CHECK-A57-NEXT: scvtf [[REG:s[0-9]+]], w[[REGNUM]]
+; CHECK-A57-NEXT: fmul s0, [[REG]], [[REG]]
%bitcast = ptrtoint i8* %sp0 to i64
%add = add i64 %bitcast, -1
%addr = inttoptr i64 %add to i8*
@@ -713,6 +734,10 @@ define double @sfct22(i16* nocapture %sp0) {
; CHECK-NEXT: sshll.2d v[[SEXTREG:[0-9]+]], [[SEXTREG1]], #0
; CHECK: scvtf [[REG:d[0-9]+]], d[[SEXTREG]]
; CHECK-NEXT: fmul d0, [[REG]], [[REG]]
+; CHECK-A57-LABEL: sfct22:
+; CHECK-A57: ldursh w[[REGNUM:[0-9]+]], [x0, #1]
+; CHECK-A57-NEXT: scvtf [[REG:d[0-9]+]], w[[REGNUM]]
+; CHECK-A57-NEXT: fmul d0, [[REG]], [[REG]]
%bitcast = ptrtoint i16* %sp0 to i64
%add = add i64 %bitcast, 1
%addr = inttoptr i64 %add to i16*
diff --git a/test/CodeGen/AArch64/arm64-setcc-int-to-fp-combine.ll b/test/CodeGen/AArch64/arm64-setcc-int-to-fp-combine.ll
new file mode 100644
index 0000000..67283b6
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-setcc-int-to-fp-combine.ll
@@ -0,0 +1,47 @@
+; RUN: llc < %s -asm-verbose=false -mtriple=arm64-apple-ios | FileCheck %s
+
+define <4 x float> @foo(<4 x float> %val, <4 x float> %test) nounwind {
+; CHECK-LABEL: foo:
+; CHECK-NEXT: fcmeq.4s v0, v0, v1
+; CHECK-NEXT: fmov.4s v1, #1.00000000
+; CHECK-NEXT: and.16b v0, v0, v1
+; CHECK-NEXT: ret
+ %cmp = fcmp oeq <4 x float> %val, %test
+ %ext = zext <4 x i1> %cmp to <4 x i32>
+ %result = sitofp <4 x i32> %ext to <4 x float>
+ ret <4 x float> %result
+}
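+; fcmeq produces all-ones or all-zero lanes, so sitofp(zext(cmp)) can be
+; folded into a vector AND with a splat of 1.0f, as checked above.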
+; Make sure the operation isn't folded when the sizes don't match, as that
+; used to crash later when trying to form a bitcast operation for the
+; folded nodes.
+define void @foo1(<4 x float> %val, <4 x float> %test, <4 x double>* %p) nounwind {
+; CHECK-LABEL: foo1:
+; CHECK: movi.4s
+; CHECK: scvtf.2d
+; CHECK: scvtf.2d
+ %cmp = fcmp oeq <4 x float> %val, %test
+ %ext = zext <4 x i1> %cmp to <4 x i32>
+ %result = sitofp <4 x i32> %ext to <4 x double>
+ store <4 x double> %result, <4 x double>* %p
+ ret void
+}
+
+; Fold explicit AND operations even when the constant isn't a splat of the
+; single scalar value that the zext creates.
+define <4 x float> @foo2(<4 x float> %val, <4 x float> %test) nounwind {
+; CHECK-LABEL: lCPI2_0:
+; CHECK-NEXT: .long 1065353216
+; CHECK-NEXT: .long 0
+; CHECK-NEXT: .long 1065353216
+; CHECK-NEXT: .long 0
+; CHECK-LABEL: foo2:
+; CHECK: adrp x8, lCPI2_0@PAGE
+; CHECK: ldr q2, [x8, lCPI2_0@PAGEOFF]
+; CHECK-NEXT: fcmeq.4s v0, v0, v1
+; CHECK-NEXT: and.16b v0, v0, v2
+ %cmp = fcmp oeq <4 x float> %val, %test
+ %ext = zext <4 x i1> %cmp to <4 x i32>
+ %and = and <4 x i32> %ext, <i32 255, i32 256, i32 257, i32 258>
+ %result = sitofp <4 x i32> %and to <4 x float>
+ ret <4 x float> %result
+}
diff --git a/test/CodeGen/AArch64/arm64-shifted-sext.ll b/test/CodeGen/AArch64/arm64-shifted-sext.ll
index b7b4e5d..71f15b1 100644
--- a/test/CodeGen/AArch64/arm64-shifted-sext.ll
+++ b/test/CodeGen/AArch64/arm64-shifted-sext.ll
@@ -166,8 +166,8 @@ entry:
define i32 @extendedLeftShiftshortTointBy16(i16 signext %a) nounwind readnone ssp {
entry:
; CHECK-LABEL: extendedLeftShiftshortTointBy16:
-; CHECK: add [[REG:w[0-9]+]], w0, #1
-; CHECK: lsl w0, [[REG]], #16
+; CHECK: lsl [[REG:w[0-9]+]], w0, #16
+; CHECK: add w0, [[REG]], #16, lsl #12
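+; (%a + 1) << 16 is rewritten as (%a << 16) + 65536, with 65536 encoded as
+; the shifted add immediate #16, lsl #12.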
%inc = add i16 %a, 1
%conv2 = zext i16 %inc to i32
%shl = shl nuw i32 %conv2, 16
diff --git a/test/CodeGen/AArch64/arm64-stackmap.ll b/test/CodeGen/AArch64/arm64-stackmap.ll
index 2c7c6ae..144c2fd 100644
--- a/test/CodeGen/AArch64/arm64-stackmap.ll
+++ b/test/CodeGen/AArch64/arm64-stackmap.ll
@@ -1,4 +1,5 @@
-; RUN: llc < %s -mtriple=arm64-apple-darwin | FileCheck %s
+; RUN: llc -mtriple=arm64-apple-darwin < %s | FileCheck %s
+; RUN: llc -mtriple=arm64-apple-darwin -fast-isel -fast-isel-abort < %s | FileCheck %s
;
; Note: Print verbose stackmaps using -debug-only=stackmaps.
diff --git a/test/CodeGen/AArch64/arm64-triv-disjoint-mem-access.ll b/test/CodeGen/AArch64/arm64-triv-disjoint-mem-access.ll
new file mode 100644
index 0000000..a7f5215
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-triv-disjoint-mem-access.ll
@@ -0,0 +1,31 @@
+; RUN: llc < %s -mtriple=arm64-linux-gnu -mcpu=cortex-a53 -enable-aa-sched-mi | FileCheck %s
+; Check that the scheduler moves the load from a[1] past the store into a[2].
+@a = common global i32* null, align 8
+@m = common global i32 0, align 4
+
+; Function Attrs: nounwind
+define i32 @func(i32 %i, i32 %j, i32 %k) #0 {
+entry:
+; CHECK: ldr {{w[0-9]+}}, [x[[REG:[0-9]+]], #4]
+; CHECK: str {{w[0-9]+}}, [x[[REG]], #8]
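+; With -enable-aa-sched-mi the scheduler can tell that the fixed offsets #4
+; and #8 off the same base cannot alias, so the load is hoisted above the
+; store.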
+ %0 = load i32** @a, align 8, !tbaa !1
+ %arrayidx = getelementptr inbounds i32* %0, i64 2
+ store i32 %i, i32* %arrayidx, align 4, !tbaa !5
+ %arrayidx1 = getelementptr inbounds i32* %0, i64 1
+ %1 = load i32* %arrayidx1, align 4, !tbaa !5
+ %add = add nsw i32 %k, %i
+ store i32 %add, i32* @m, align 4, !tbaa !5
+ ret i32 %1
+}
+
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="true" "use-soft-float"="false" }
+
+!llvm.ident = !{!0}
+
+!0 = metadata !{metadata !"clang version 3.6.0 "}
+!1 = metadata !{metadata !2, metadata !2, i64 0}
+!2 = metadata !{metadata !"any pointer", metadata !3, i64 0}
+!3 = metadata !{metadata !"omnipotent char", metadata !4, i64 0}
+!4 = metadata !{metadata !"Simple C/C++ TBAA"}
+!5 = metadata !{metadata !6, metadata !6, i64 0}
+!6 = metadata !{metadata !"int", metadata !3, i64 0}
diff --git a/test/CodeGen/AArch64/arm64-vabs.ll b/test/CodeGen/AArch64/arm64-vabs.ll
index 5afc8d9..fae2b90 100644
--- a/test/CodeGen/AArch64/arm64-vabs.ll
+++ b/test/CodeGen/AArch64/arm64-vabs.ll
@@ -802,3 +802,73 @@ define <2 x i64> @sabdl_from_extract_dup(<4 x i32> %lhs, i32 %rhs) {
%res1 = zext <2 x i32> %res to <2 x i64>
ret <2 x i64> %res1
}
+
+define <2 x i32> @abspattern1(<2 x i32> %a) nounwind {
+; CHECK-LABEL: abspattern1:
+; CHECK: abs.2s
+; CHECK-NEXT: ret
+ %tmp1neg = sub <2 x i32> zeroinitializer, %a
+ %b = icmp sge <2 x i32> %a, zeroinitializer
+ %abs = select <2 x i1> %b, <2 x i32> %a, <2 x i32> %tmp1neg
+ ret <2 x i32> %abs
+}
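+; Each sub/icmp/select spelling of |a| should collapse to a single abs; the
+; patterns below vary the predicate (sge/sgt/slt/sle), the select operand
+; order and the element type.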
+
+define <4 x i16> @abspattern2(<4 x i16> %a) nounwind {
+; CHECK-LABEL: abspattern2:
+; CHECK: abs.4h
+; CHECK-NEXT: ret
+ %tmp1neg = sub <4 x i16> zeroinitializer, %a
+ %b = icmp sgt <4 x i16> %a, zeroinitializer
+ %abs = select <4 x i1> %b, <4 x i16> %a, <4 x i16> %tmp1neg
+ ret <4 x i16> %abs
+}
+
+define <8 x i8> @abspattern3(<8 x i8> %a) nounwind {
+; CHECK-LABEL: abspattern3:
+; CHECK: abs.8b
+; CHECK-NEXT: ret
+ %tmp1neg = sub <8 x i8> zeroinitializer, %a
+ %b = icmp slt <8 x i8> %a, zeroinitializer
+ %abs = select <8 x i1> %b, <8 x i8> %tmp1neg, <8 x i8> %a
+ ret <8 x i8> %abs
+}
+
+define <4 x i32> @abspattern4(<4 x i32> %a) nounwind {
+; CHECK-LABEL: abspattern4:
+; CHECK: abs.4s
+; CHECK-NEXT: ret
+ %tmp1neg = sub <4 x i32> zeroinitializer, %a
+ %b = icmp sge <4 x i32> %a, zeroinitializer
+ %abs = select <4 x i1> %b, <4 x i32> %a, <4 x i32> %tmp1neg
+ ret <4 x i32> %abs
+}
+
+define <8 x i16> @abspattern5(<8 x i16> %a) nounwind {
+; CHECK-LABEL: abspattern5:
+; CHECK: abs.8h
+; CHECK-NEXT: ret
+ %tmp1neg = sub <8 x i16> zeroinitializer, %a
+ %b = icmp sgt <8 x i16> %a, zeroinitializer
+ %abs = select <8 x i1> %b, <8 x i16> %a, <8 x i16> %tmp1neg
+ ret <8 x i16> %abs
+}
+
+define <16 x i8> @abspattern6(<16 x i8> %a) nounwind {
+; CHECK-LABEL: abspattern6:
+; CHECK: abs.16b
+; CHECK-NEXT: ret
+ %tmp1neg = sub <16 x i8> zeroinitializer, %a
+ %b = icmp slt <16 x i8> %a, zeroinitializer
+ %abs = select <16 x i1> %b, <16 x i8> %tmp1neg, <16 x i8> %a
+ ret <16 x i8> %abs
+}
+
+define <2 x i64> @abspattern7(<2 x i64> %a) nounwind {
+; CHECK-LABEL: abspattern7:
+; CHECK: abs.2d
+; CHECK-NEXT: ret
+ %tmp1neg = sub <2 x i64> zeroinitializer, %a
+ %b = icmp sle <2 x i64> %a, zeroinitializer
+ %abs = select <2 x i1> %b, <2 x i64> %tmp1neg, <2 x i64> %a
+ ret <2 x i64> %abs
+}
diff --git a/test/CodeGen/AArch64/arm64-vcvt_f.ll b/test/CodeGen/AArch64/arm64-vcvt_f.ll
index d244958..1f393c2 100644
--- a/test/CodeGen/AArch64/arm64-vcvt_f.ll
+++ b/test/CodeGen/AArch64/arm64-vcvt_f.ll
@@ -66,17 +66,17 @@ define i16 @to_half(float %in) {
; CHECK-LABEL: to_half:
; CHECK: fcvt h[[HALFVAL:[0-9]+]], s0
; CHECK: fmov {{w[0-9]+}}, {{s[0-9]+}}
- %res = call i16 @llvm.convert.to.fp16(float %in)
+ %res = call i16 @llvm.convert.to.fp16.f32(float %in)
ret i16 %res
}
define float @from_half(i16 %in) {
; CHECK-LABEL: from_half:
-; CHECK: fmov s[[HALFVAL:[0-9]+]], {{w[0-9]+}}
-; CHECK: fcvt s0, h[[HALFVAL]]
- %res = call float @llvm.convert.from.fp16(i16 %in)
+; CHECK: fmov {{s[0-9]+}}, {{w[0-9]+}}
+; CHECK: fcvt s0, {{h[0-9]+}}
+ %res = call float @llvm.convert.from.fp16.f32(i16 %in)
ret float %res
}
-declare float @llvm.convert.from.fp16(i16) #1
-declare i16 @llvm.convert.to.fp16(float) #1
+declare float @llvm.convert.from.fp16.f32(i16) #1
+declare i16 @llvm.convert.to.fp16.f32(float) #1
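+; The fp16 conversion intrinsics are now overloaded on the non-half operand
+; type, hence the explicit .f32 suffix in the declarations above.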
diff --git a/test/CodeGen/AArch64/arm64-vector-ext.ll b/test/CodeGen/AArch64/arm64-vector-ext.ll
index 650ff1e..5bee161 100644
--- a/test/CodeGen/AArch64/arm64-vector-ext.ll
+++ b/test/CodeGen/AArch64/arm64-vector-ext.ll
@@ -14,3 +14,14 @@ define void @func30(%T0_30 %v0, %T1_30* %p1) {
store %T1_30 %r, %T1_30* %p1
ret void
}
+
+; Extending from v1i1 used to crash (PR20791). Make sure we do something
+; sensible instead.
+define <1 x i32> @autogen_SD7918() {
+; CHECK-LABEL: autogen_SD7918
+; CHECK: movi d0, #0000000000000000
+; CHECK-NEXT: ret
+ %I29 = insertelement <1 x i1> zeroinitializer, i1 false, i32 0
+ %ZE = zext <1 x i1> %I29 to <1 x i32>
+ ret <1 x i32> %ZE
+}
diff --git a/test/CodeGen/AArch64/arm64-xaluo.ll b/test/CodeGen/AArch64/arm64-xaluo.ll
index 0c300de..59ce684 100644
--- a/test/CodeGen/AArch64/arm64-xaluo.ll
+++ b/test/CodeGen/AArch64/arm64-xaluo.ll
@@ -1,13 +1,14 @@
-; RUN: llc < %s -march=arm64 -aarch64-atomic-cfg-tidy=0 | FileCheck %s
+; RUN: llc -march=arm64 -aarch64-atomic-cfg-tidy=0 -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -march=arm64 -aarch64-atomic-cfg-tidy=0 -fast-isel -fast-isel-abort -verify-machineinstrs < %s | FileCheck %s
;
; Get the actual value of the overflow bit.
;
-define i1 @saddo.i32(i32 %v1, i32 %v2, i32* %res) {
+define zeroext i1 @saddo1.i32(i32 %v1, i32 %v2, i32* %res) {
entry:
-; CHECK-LABEL: saddo.i32
-; CHECK: adds w8, w0, w1
-; CHECK-NEXT: cset w0, vs
+; CHECK-LABEL: saddo1.i32
+; CHECK: adds {{w[0-9]+}}, w0, w1
+; CHECK-NEXT: cset {{w[0-9]+}}, vs
%t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %v1, i32 %v2)
%val = extractvalue {i32, i1} %t, 0
%obit = extractvalue {i32, i1} %t, 1
@@ -15,11 +16,64 @@ entry:
ret i1 %obit
}
-define i1 @saddo.i64(i64 %v1, i64 %v2, i64* %res) {
+; Test the immediate version.
+define zeroext i1 @saddo2.i32(i32 %v1, i32* %res) {
entry:
-; CHECK-LABEL: saddo.i64
-; CHECK: adds x8, x0, x1
-; CHECK-NEXT: cset w0, vs
+; CHECK-LABEL: saddo2.i32
+; CHECK: adds {{w[0-9]+}}, w0, #4
+; CHECK-NEXT: cset {{w[0-9]+}}, vs
+ %t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %v1, i32 4)
+ %val = extractvalue {i32, i1} %t, 0
+ %obit = extractvalue {i32, i1} %t, 1
+ store i32 %val, i32* %res
+ ret i1 %obit
+}
+
+; Test negative immediates.
+define zeroext i1 @saddo3.i32(i32 %v1, i32* %res) {
+entry:
+; CHECK-LABEL: saddo3.i32
+; CHECK: subs {{w[0-9]+}}, w0, #4
+; CHECK-NEXT: cset {{w[0-9]+}}, vs
+ %t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %v1, i32 -4)
+ %val = extractvalue {i32, i1} %t, 0
+ %obit = extractvalue {i32, i1} %t, 1
+ store i32 %val, i32* %res
+ ret i1 %obit
+}
+
+; Test immediates that are too large to be encoded.
+define zeroext i1 @saddo4.i32(i32 %v1, i32* %res) {
+entry:
+; CHECK-LABEL: saddo4.i32
+; CHECK: adds {{w[0-9]+}}, w0, {{w[0-9]+}}
+; CHECK-NEXT: cset {{w[0-9]+}}, vs
+ %t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %v1, i32 16777215)
+ %val = extractvalue {i32, i1} %t, 0
+ %obit = extractvalue {i32, i1} %t, 1
+ store i32 %val, i32* %res
+ ret i1 %obit
+}
+
+; Test shift folding.
+define zeroext i1 @saddo5.i32(i32 %v1, i32 %v2, i32* %res) {
+entry:
+; CHECK-LABEL: saddo5.i32
+; CHECK: adds {{w[0-9]+}}, w0, w1
+; CHECK-NEXT: cset {{w[0-9]+}}, vs
+ %lsl = shl i32 %v2, 16
+ %t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %v1, i32 %lsl)
+ %val = extractvalue {i32, i1} %t, 0
+ %obit = extractvalue {i32, i1} %t, 1
+ store i32 %val, i32* %res
+ ret i1 %obit
+}
+
+define zeroext i1 @saddo1.i64(i64 %v1, i64 %v2, i64* %res) {
+entry:
+; CHECK-LABEL: saddo1.i64
+; CHECK: adds {{x[0-9]+}}, x0, x1
+; CHECK-NEXT: cset {{w[0-9]+}}, vs
%t = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 %v1, i64 %v2)
%val = extractvalue {i64, i1} %t, 0
%obit = extractvalue {i64, i1} %t, 1
@@ -27,11 +81,35 @@ entry:
ret i1 %obit
}
-define i1 @uaddo.i32(i32 %v1, i32 %v2, i32* %res) {
+define zeroext i1 @saddo2.i64(i64 %v1, i64* %res) {
+entry:
+; CHECK-LABEL: saddo2.i64
+; CHECK: adds {{x[0-9]+}}, x0, #4
+; CHECK-NEXT: cset {{w[0-9]+}}, vs
+ %t = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 %v1, i64 4)
+ %val = extractvalue {i64, i1} %t, 0
+ %obit = extractvalue {i64, i1} %t, 1
+ store i64 %val, i64* %res
+ ret i1 %obit
+}
+
+define zeroext i1 @saddo3.i64(i64 %v1, i64* %res) {
+entry:
+; CHECK-LABEL: saddo3.i64
+; CHECK: subs {{x[0-9]+}}, x0, #4
+; CHECK-NEXT: cset {{w[0-9]+}}, vs
+ %t = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 %v1, i64 -4)
+ %val = extractvalue {i64, i1} %t, 0
+ %obit = extractvalue {i64, i1} %t, 1
+ store i64 %val, i64* %res
+ ret i1 %obit
+}
+
+define zeroext i1 @uaddo.i32(i32 %v1, i32 %v2, i32* %res) {
entry:
; CHECK-LABEL: uaddo.i32
-; CHECK: adds w8, w0, w1
-; CHECK-NEXT: cset w0, hs
+; CHECK: adds {{w[0-9]+}}, w0, w1
+; CHECK-NEXT: cset {{w[0-9]+}}, hs
%t = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %v1, i32 %v2)
%val = extractvalue {i32, i1} %t, 0
%obit = extractvalue {i32, i1} %t, 1
@@ -39,11 +117,11 @@ entry:
ret i1 %obit
}
-define i1 @uaddo.i64(i64 %v1, i64 %v2, i64* %res) {
+define zeroext i1 @uaddo.i64(i64 %v1, i64 %v2, i64* %res) {
entry:
; CHECK-LABEL: uaddo.i64
-; CHECK: adds x8, x0, x1
-; CHECK-NEXT: cset w0, hs
+; CHECK: adds {{x[0-9]+}}, x0, x1
+; CHECK-NEXT: cset {{w[0-9]+}}, hs
%t = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 %v1, i64 %v2)
%val = extractvalue {i64, i1} %t, 0
%obit = extractvalue {i64, i1} %t, 1
@@ -51,11 +129,11 @@ entry:
ret i1 %obit
}
-define i1 @ssubo.i32(i32 %v1, i32 %v2, i32* %res) {
+define zeroext i1 @ssubo1.i32(i32 %v1, i32 %v2, i32* %res) {
entry:
-; CHECK-LABEL: ssubo.i32
-; CHECK: subs w8, w0, w1
-; CHECK-NEXT: cset w0, vs
+; CHECK-LABEL: ssubo1.i32
+; CHECK: subs {{w[0-9]+}}, w0, w1
+; CHECK-NEXT: cset {{w[0-9]+}}, vs
%t = call {i32, i1} @llvm.ssub.with.overflow.i32(i32 %v1, i32 %v2)
%val = extractvalue {i32, i1} %t, 0
%obit = extractvalue {i32, i1} %t, 1
@@ -63,11 +141,23 @@ entry:
ret i1 %obit
}
-define i1 @ssubo.i64(i64 %v1, i64 %v2, i64* %res) {
+define zeroext i1 @ssubo2.i32(i32 %v1, i32* %res) {
+entry:
+; CHECK-LABEL: ssubo2.i32
+; CHECK: adds {{w[0-9]+}}, w0, #4
+; CHECK-NEXT: cset {{w[0-9]+}}, vs
+ %t = call {i32, i1} @llvm.ssub.with.overflow.i32(i32 %v1, i32 -4)
+ %val = extractvalue {i32, i1} %t, 0
+ %obit = extractvalue {i32, i1} %t, 1
+ store i32 %val, i32* %res
+ ret i1 %obit
+}
+
+define zeroext i1 @ssubo.i64(i64 %v1, i64 %v2, i64* %res) {
entry:
; CHECK-LABEL: ssubo.i64
-; CHECK: subs x8, x0, x1
-; CHECK-NEXT: cset w0, vs
+; CHECK: subs {{x[0-9]+}}, x0, x1
+; CHECK-NEXT: cset {{w[0-9]+}}, vs
%t = call {i64, i1} @llvm.ssub.with.overflow.i64(i64 %v1, i64 %v2)
%val = extractvalue {i64, i1} %t, 0
%obit = extractvalue {i64, i1} %t, 1
@@ -75,11 +165,11 @@ entry:
ret i1 %obit
}
-define i1 @usubo.i32(i32 %v1, i32 %v2, i32* %res) {
+define zeroext i1 @usubo.i32(i32 %v1, i32 %v2, i32* %res) {
entry:
; CHECK-LABEL: usubo.i32
-; CHECK: subs w8, w0, w1
-; CHECK-NEXT: cset w0, lo
+; CHECK: subs {{w[0-9]+}}, w0, w1
+; CHECK-NEXT: cset {{w[0-9]+}}, lo
%t = call {i32, i1} @llvm.usub.with.overflow.i32(i32 %v1, i32 %v2)
%val = extractvalue {i32, i1} %t, 0
%obit = extractvalue {i32, i1} %t, 1
@@ -87,11 +177,11 @@ entry:
ret i1 %obit
}
-define i1 @usubo.i64(i64 %v1, i64 %v2, i64* %res) {
+define zeroext i1 @usubo.i64(i64 %v1, i64 %v2, i64* %res) {
entry:
; CHECK-LABEL: usubo.i64
-; CHECK: subs x8, x0, x1
-; CHECK-NEXT: cset w0, lo
+; CHECK: subs {{x[0-9]+}}, x0, x1
+; CHECK-NEXT: cset {{w[0-9]+}}, lo
%t = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %v1, i64 %v2)
%val = extractvalue {i64, i1} %t, 0
%obit = extractvalue {i64, i1} %t, 1
@@ -99,13 +189,13 @@ entry:
ret i1 %obit
}
-define i1 @smulo.i32(i32 %v1, i32 %v2, i32* %res) {
+define zeroext i1 @smulo.i32(i32 %v1, i32 %v2, i32* %res) {
entry:
; CHECK-LABEL: smulo.i32
-; CHECK: smull x8, w0, w1
-; CHECK-NEXT: lsr x9, x8, #32
-; CHECK-NEXT: cmp w9, w8, asr #31
-; CHECK-NEXT: cset w0, ne
+; CHECK: smull x[[MREG:[0-9]+]], w0, w1
+; CHECK-NEXT: lsr x[[SREG:[0-9]+]], x[[MREG]], #32
+; CHECK-NEXT: cmp w[[SREG]], w[[MREG]], asr #31
+; CHECK-NEXT: cset {{w[0-9]+}}, ne
%t = call {i32, i1} @llvm.smul.with.overflow.i32(i32 %v1, i32 %v2)
%val = extractvalue {i32, i1} %t, 0
%obit = extractvalue {i32, i1} %t, 1
@@ -113,13 +203,13 @@ entry:
ret i1 %obit
}
-define i1 @smulo.i64(i64 %v1, i64 %v2, i64* %res) {
+define zeroext i1 @smulo.i64(i64 %v1, i64 %v2, i64* %res) {
entry:
; CHECK-LABEL: smulo.i64
-; CHECK: mul x8, x0, x1
-; CHECK-NEXT: smulh x9, x0, x1
-; CHECK-NEXT: cmp x9, x8, asr #63
-; CHECK-NEXT: cset w0, ne
+; CHECK: mul [[MREG:x[0-9]+]], x0, x1
+; CHECK-NEXT: smulh [[HREG:x[0-9]+]], x0, x1
+; CHECK-NEXT: cmp [[HREG]], [[MREG]], asr #63
+; CHECK-NEXT: cset {{w[0-9]+}}, ne
%t = call {i64, i1} @llvm.smul.with.overflow.i64(i64 %v1, i64 %v2)
%val = extractvalue {i64, i1} %t, 0
%obit = extractvalue {i64, i1} %t, 1
@@ -127,12 +217,24 @@ entry:
ret i1 %obit
}
-define i1 @umulo.i32(i32 %v1, i32 %v2, i32* %res) {
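+; A signed multiply by the constant 2 is expected to fold into an add of the value
+; to itself, so overflow can be read directly from the V flag.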
+define zeroext i1 @smulo2.i64(i64 %v1, i64* %res) {
+entry:
+; CHECK-LABEL: smulo2.i64
+; CHECK: adds [[MREG:x[0-9]+]], x0, x0
+; CHECK-NEXT: cset {{w[0-9]+}}, vs
+ %t = call {i64, i1} @llvm.smul.with.overflow.i64(i64 %v1, i64 2)
+ %val = extractvalue {i64, i1} %t, 0
+ %obit = extractvalue {i64, i1} %t, 1
+ store i64 %val, i64* %res
+ ret i1 %obit
+}
+
+define zeroext i1 @umulo.i32(i32 %v1, i32 %v2, i32* %res) {
entry:
; CHECK-LABEL: umulo.i32
-; CHECK: umull x8, w0, w1
-; CHECK-NEXT: cmp xzr, x8, lsr #32
-; CHECK-NEXT: cset w0, ne
+; CHECK: umull [[MREG:x[0-9]+]], w0, w1
+; CHECK-NEXT: cmp xzr, [[MREG]], lsr #32
+; CHECK-NEXT: cset {{w[0-9]+}}, ne
%t = call {i32, i1} @llvm.umul.with.overflow.i32(i32 %v1, i32 %v2)
%val = extractvalue {i32, i1} %t, 0
%obit = extractvalue {i32, i1} %t, 1
@@ -140,13 +242,12 @@ entry:
ret i1 %obit
}
-define i1 @umulo.i64(i64 %v1, i64 %v2, i64* %res) {
+define zeroext i1 @umulo.i64(i64 %v1, i64 %v2, i64* %res) {
entry:
; CHECK-LABEL: umulo.i64
-; CHECK: umulh x8, x0, x1
-; CHECK-NEXT: cmp xzr, x8
-; CHECK-NEXT: cset w8, ne
-; CHECK-NEXT: mul x9, x0, x1
+; CHECK: umulh [[MREG:x[0-9]+]], x0, x1
+; CHECK-NEXT: cmp xzr, [[MREG]]
+; CHECK-NEXT: cset {{w[0-9]+}}, ne
%t = call {i64, i1} @llvm.umul.with.overflow.i64(i64 %v1, i64 %v2)
%val = extractvalue {i64, i1} %t, 0
%obit = extractvalue {i64, i1} %t, 1
@@ -154,6 +255,18 @@ entry:
ret i1 %obit
}
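+; The unsigned multiply by 2 folds into the same add, but the overflow bit is the
+; carry flag (hs) instead of V.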
+define zeroext i1 @umulo2.i64(i64 %v1, i64* %res) {
+entry:
+; CHECK-LABEL: umulo2.i64
+; CHECK: adds [[MREG:x[0-9]+]], x0, x0
+; CHECK-NEXT: cset {{w[0-9]+}}, hs
+ %t = call {i64, i1} @llvm.umul.with.overflow.i64(i64 %v1, i64 2)
+ %val = extractvalue {i64, i1} %t, 0
+ %obit = extractvalue {i64, i1} %t, 1
+ store i64 %val, i64* %res
+ ret i1 %obit
+}
+
;
; Check the use of the overflow bit in combination with a select instruction.
@@ -249,9 +362,9 @@ entry:
define i32 @smulo.select.i32(i32 %v1, i32 %v2) {
entry:
; CHECK-LABEL: smulo.select.i32
-; CHECK: smull x8, w0, w1
-; CHECK-NEXT: lsr x9, x8, #32
-; CHECK-NEXT: cmp w9, w8, asr #31
+; CHECK: smull x[[MREG:[0-9]+]], w0, w1
+; CHECK-NEXT: lsr x[[SREG:[0-9]+]], x[[MREG]], #32
+; CHECK-NEXT: cmp w[[SREG]], w[[MREG]], asr #31
; CHECK-NEXT: csel w0, w0, w1, ne
%t = call {i32, i1} @llvm.smul.with.overflow.i32(i32 %v1, i32 %v2)
%obit = extractvalue {i32, i1} %t, 1
@@ -262,9 +375,9 @@ entry:
define i64 @smulo.select.i64(i64 %v1, i64 %v2) {
entry:
; CHECK-LABEL: smulo.select.i64
-; CHECK: mul x8, x0, x1
-; CHECK-NEXT: smulh x9, x0, x1
-; CHECK-NEXT: cmp x9, x8, asr #63
+; CHECK: mul [[MREG:x[0-9]+]], x0, x1
+; CHECK-NEXT: smulh [[HREG:x[0-9]+]], x0, x1
+; CHECK-NEXT: cmp [[HREG]], [[MREG]], asr #63
; CHECK-NEXT: csel x0, x0, x1, ne
%t = call {i64, i1} @llvm.smul.with.overflow.i64(i64 %v1, i64 %v2)
%obit = extractvalue {i64, i1} %t, 1
@@ -275,8 +388,8 @@ entry:
define i32 @umulo.select.i32(i32 %v1, i32 %v2) {
entry:
; CHECK-LABEL: umulo.select.i32
-; CHECK: umull x8, w0, w1
-; CHECK-NEXT: cmp xzr, x8, lsr #32
+; CHECK: umull [[MREG:x[0-9]+]], w0, w1
+; CHECK-NEXT: cmp xzr, [[MREG]], lsr #32
; CHECK-NEXT: csel w0, w0, w1, ne
%t = call {i32, i1} @llvm.umul.with.overflow.i32(i32 %v1, i32 %v2)
%obit = extractvalue {i32, i1} %t, 1
@@ -287,8 +400,8 @@ entry:
define i64 @umulo.select.i64(i64 %v1, i64 %v2) {
entry:
; CHECK-LABEL: umulo.select.i64
-; CHECK: umulh x8, x0, x1
-; CHECK-NEXT: cmp xzr, x8
+; CHECK: umulh [[MREG:x[0-9]+]], x0, x1
+; CHECK-NEXT: cmp xzr, [[MREG]]
; CHECK-NEXT: csel x0, x0, x1, ne
%t = call {i64, i1} @llvm.umul.with.overflow.i64(i64 %v1, i64 %v2)
%obit = extractvalue {i64, i1} %t, 1
@@ -300,7 +413,7 @@ entry:
;
; Check the use of the overflow bit in combination with a branch instruction.
;
-define i1 @saddo.br.i32(i32 %v1, i32 %v2) {
+define zeroext i1 @saddo.br.i32(i32 %v1, i32 %v2) {
entry:
; CHECK-LABEL: saddo.br.i32
; CHECK: cmn w0, w1
@@ -317,7 +430,7 @@ continue:
ret i1 true
}
-define i1 @saddo.br.i64(i64 %v1, i64 %v2) {
+define zeroext i1 @saddo.br.i64(i64 %v1, i64 %v2) {
entry:
; CHECK-LABEL: saddo.br.i64
; CHECK: cmn x0, x1
@@ -334,7 +447,7 @@ continue:
ret i1 true
}
-define i1 @uaddo.br.i32(i32 %v1, i32 %v2) {
+define zeroext i1 @uaddo.br.i32(i32 %v1, i32 %v2) {
entry:
; CHECK-LABEL: uaddo.br.i32
; CHECK: cmn w0, w1
@@ -351,7 +464,7 @@ continue:
ret i1 true
}
-define i1 @uaddo.br.i64(i64 %v1, i64 %v2) {
+define zeroext i1 @uaddo.br.i64(i64 %v1, i64 %v2) {
entry:
; CHECK-LABEL: uaddo.br.i64
; CHECK: cmn x0, x1
@@ -368,7 +481,7 @@ continue:
ret i1 true
}
-define i1 @ssubo.br.i32(i32 %v1, i32 %v2) {
+define zeroext i1 @ssubo.br.i32(i32 %v1, i32 %v2) {
entry:
; CHECK-LABEL: ssubo.br.i32
; CHECK: cmp w0, w1
@@ -385,7 +498,7 @@ continue:
ret i1 true
}
-define i1 @ssubo.br.i64(i64 %v1, i64 %v2) {
+define zeroext i1 @ssubo.br.i64(i64 %v1, i64 %v2) {
entry:
; CHECK-LABEL: ssubo.br.i64
; CHECK: cmp x0, x1
@@ -402,7 +515,7 @@ continue:
ret i1 true
}
-define i1 @usubo.br.i32(i32 %v1, i32 %v2) {
+define zeroext i1 @usubo.br.i32(i32 %v1, i32 %v2) {
entry:
; CHECK-LABEL: usubo.br.i32
; CHECK: cmp w0, w1
@@ -419,7 +532,7 @@ continue:
ret i1 true
}
-define i1 @usubo.br.i64(i64 %v1, i64 %v2) {
+define zeroext i1 @usubo.br.i64(i64 %v1, i64 %v2) {
entry:
; CHECK-LABEL: usubo.br.i64
; CHECK: cmp x0, x1
@@ -436,12 +549,12 @@ continue:
ret i1 true
}
-define i1 @smulo.br.i32(i32 %v1, i32 %v2) {
+define zeroext i1 @smulo.br.i32(i32 %v1, i32 %v2) {
entry:
; CHECK-LABEL: smulo.br.i32
-; CHECK: smull x8, w0, w1
-; CHECK-NEXT: lsr x9, x8, #32
-; CHECK-NEXT: cmp w9, w8, asr #31
+; CHECK: smull x[[MREG:[0-9]+]], w0, w1
+; CHECK-NEXT: lsr x[[SREG:[0-9]+]], x[[MREG]], #32
+; CHECK-NEXT: cmp w[[SREG]], w[[MREG]], asr #31
; CHECK-NEXT: b.eq
%t = call {i32, i1} @llvm.smul.with.overflow.i32(i32 %v1, i32 %v2)
%val = extractvalue {i32, i1} %t, 0
@@ -455,12 +568,12 @@ continue:
ret i1 true
}
-define i1 @smulo.br.i64(i64 %v1, i64 %v2) {
+define zeroext i1 @smulo.br.i64(i64 %v1, i64 %v2) {
entry:
; CHECK-LABEL: smulo.br.i64
-; CHECK: mul x8, x0, x1
-; CHECK-NEXT: smulh x9, x0, x1
-; CHECK-NEXT: cmp x9, x8, asr #63
+; CHECK: mul [[MREG:x[0-9]+]], x0, x1
+; CHECK-NEXT: smulh [[HREG:x[0-9]+]], x0, x1
+; CHECK-NEXT: cmp [[HREG]], [[MREG]], asr #63
; CHECK-NEXT: b.eq
%t = call {i64, i1} @llvm.smul.with.overflow.i64(i64 %v1, i64 %v2)
%val = extractvalue {i64, i1} %t, 0
@@ -474,11 +587,28 @@ continue:
ret i1 true
}
-define i1 @umulo.br.i32(i32 %v1, i32 %v2) {
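+; When only a branch consumes the overflow bit, the multiply-by-2 folds further into
+; "cmn x0, x0" (a flag-setting add whose result is discarded) plus "b.vc" for the
+; no-overflow path.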
+define zeroext i1 @smulo2.br.i64(i64 %v1) {
+entry:
+; CHECK-LABEL: smulo2.br.i64
+; CHECK: cmn x0, x0
+; CHECK-NEXT: b.vc
+ %t = call {i64, i1} @llvm.smul.with.overflow.i64(i64 %v1, i64 2)
+ %val = extractvalue {i64, i1} %t, 0
+ %obit = extractvalue {i64, i1} %t, 1
+ br i1 %obit, label %overflow, label %continue
+
+overflow:
+ ret i1 false
+
+continue:
+ ret i1 true
+}
+
+define zeroext i1 @umulo.br.i32(i32 %v1, i32 %v2) {
entry:
; CHECK-LABEL: umulo.br.i32
-; CHECK: umull x8, w0, w1
-; CHECK-NEXT: cmp xzr, x8, lsr #32
+; CHECK: umull [[MREG:x[0-9]+]], w0, w1
+; CHECK-NEXT: cmp xzr, [[MREG]], lsr #32
; CHECK-NEXT: b.eq
%t = call {i32, i1} @llvm.umul.with.overflow.i32(i32 %v1, i32 %v2)
%val = extractvalue {i32, i1} %t, 0
@@ -492,11 +622,11 @@ continue:
ret i1 true
}
-define i1 @umulo.br.i64(i64 %v1, i64 %v2) {
+define zeroext i1 @umulo.br.i64(i64 %v1, i64 %v2) {
entry:
; CHECK-LABEL: umulo.br.i64
-; CHECK: umulh x8, x0, x1
-; CHECK-NEXT: cbz
+; CHECK: umulh [[REG:x[0-9]+]], x0, x1
+; CHECK-NEXT: {{cbz|cmp}}
%t = call {i64, i1} @llvm.umul.with.overflow.i64(i64 %v1, i64 %v2)
%val = extractvalue {i64, i1} %t, 0
%obit = extractvalue {i64, i1} %t, 1
@@ -509,6 +639,23 @@ continue:
ret i1 true
}
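+; Same fold for the unsigned case; the no-overflow branch now tests the carry flag
+; with "b.lo".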
+define zeroext i1 @umulo2.br.i64(i64 %v1) {
+entry:
+; CHECK-LABEL: umulo2.br.i64
+; CHECK: cmn x0, x0
+; CHECK-NEXT: b.lo
+ %t = call {i64, i1} @llvm.umul.with.overflow.i64(i64 %v1, i64 2)
+ %val = extractvalue {i64, i1} %t, 0
+ %obit = extractvalue {i64, i1} %t, 1
+ br i1 %obit, label %overflow, label %continue
+
+overflow:
+ ret i1 false
+
+continue:
+ ret i1 true
+}
+
declare {i32, i1} @llvm.sadd.with.overflow.i32(i32, i32) nounwind readnone
declare {i64, i1} @llvm.sadd.with.overflow.i64(i64, i64) nounwind readnone
declare {i32, i1} @llvm.uadd.with.overflow.i32(i32, i32) nounwind readnone
diff --git a/test/CodeGen/AArch64/atomic-ops.ll b/test/CodeGen/AArch64/atomic-ops.ll
index 26301b9..ef209e9 100644
--- a/test/CodeGen/AArch64/atomic-ops.ll
+++ b/test/CodeGen/AArch64/atomic-ops.ll
@@ -509,7 +509,7 @@ define i8 @test_atomic_load_min_i8(i8 %offset) nounwind {
; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
; CHECK-NOT: dmb
-; CHECK: mov {{[xw]}}0, {{[xw]}}[[OLD]]
+; CHECK: mov {{[xw]}}0, {{[xw]}}[[OLD_EXT]]
ret i8 %old
}
@@ -534,7 +534,7 @@ define i16 @test_atomic_load_min_i16(i16 %offset) nounwind {
; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
; CHECK-NOT: dmb
-; CHECK: mov {{[xw]}}0, {{[xw]}}[[OLD]]
+; CHECK: mov {{[xw]}}0, {{[xw]}}[[OLD_EXT]]
ret i16 %old
}
@@ -607,7 +607,7 @@ define i8 @test_atomic_load_max_i8(i8 %offset) nounwind {
; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
; CHECK-NOT: dmb
-; CHECK: mov {{[xw]}}0, {{[xw]}}[[OLD]]
+; CHECK: mov {{[xw]}}0, {{[xw]}}[[OLD_EXT]]
ret i8 %old
}
@@ -632,7 +632,7 @@ define i16 @test_atomic_load_max_i16(i16 %offset) nounwind {
; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
; CHECK-NOT: dmb
-; CHECK: mov {{[xw]}}0, {{[xw]}}[[OLD]]
+; CHECK: mov {{[xw]}}0, {{[xw]}}[[OLD_EXT]]
ret i16 %old
}
diff --git a/test/CodeGen/AArch64/br-undef-cond.ll b/test/CodeGen/AArch64/br-undef-cond.ll
new file mode 100644
index 0000000..12d0da2
--- /dev/null
+++ b/test/CodeGen/AArch64/br-undef-cond.ll
@@ -0,0 +1,26 @@
+; RUN: llc < %s -verify-machineinstrs
+
+; Make sure we don't end up with a CBNZ of an undef v-/phys-reg.
+
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+target triple = "arm64-apple-ios"
+
+declare void @bar(i8*)
+
+define void @foo(i8* %m, i32 %off0) {
+.thread1653:
+ br i1 undef, label %0, label %.thread1880
+
+ %1 = icmp eq i32 undef, 0
+ %.not = xor i1 %1, true
+ %brmerge = or i1 %.not, undef
+ br i1 %brmerge, label %.thread1880, label %.thread1705
+
+.thread1705:
+ ret void
+
+.thread1880:
+ %m1652.ph = phi i8* [ %m, %0 ], [ null, %.thread1653 ]
+ call void @bar(i8* %m1652.ph)
+ ret void
+}
diff --git a/test/CodeGen/AArch64/cmp-const-max.ll b/test/CodeGen/AArch64/cmp-const-max.ll
new file mode 100644
index 0000000..0431e39
--- /dev/null
+++ b/test/CodeGen/AArch64/cmp-const-max.ll
@@ -0,0 +1,36 @@
+; RUN: llc -verify-machineinstrs -aarch64-atomic-cfg-tidy=0 < %s -mtriple=aarch64-none-eabihf -fast-isel=false | FileCheck %s
+
+
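+; Comparisons against the all-ones constant (UINT64_MAX) must not be adjusted into a
+; neighbouring constant (that would wrap); the immediate should instead be matched
+; as "cmn x0, #1".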
+define i32 @ule_64_max(i64 %p) {
+entry:
+; CHECK-LABEL: ule_64_max:
+; CHECK: cmn x0, #1
+; CHECK: b.hi [[RET_ZERO:.LBB[0-9]+_[0-9]+]]
+ %cmp = icmp ule i64 %p, 18446744073709551615 ; 0xffffffffffffffff
+ br i1 %cmp, label %ret_one, label %ret_zero
+
+ret_one:
+ ret i32 1
+
+ret_zero:
+; CHECK: [[RET_ZERO]]:
+; CHECK-NEXT: mov w0, wzr
+ ret i32 0
+}
+
+define i32 @ugt_64_max(i64 %p) {
+entry:
+; CHECK-LABEL: ugt_64_max:
+; CHECK: cmn x0, #1
+; CHECK: b.ls [[RET_ZERO:.LBB[0-9]+_[0-9]+]]
+ %cmp = icmp ugt i64 %p, 18446744073709551615 ; 0xffffffffffffffff
+ br i1 %cmp, label %ret_one, label %ret_zero
+
+ret_one:
+ ret i32 1
+
+ret_zero:
+; CHECK: [[RET_ZERO]]:
+; CHECK-NEXT: mov w0, wzr
+ ret i32 0
+}
diff --git a/test/CodeGen/AArch64/cmpwithshort.ll b/test/CodeGen/AArch64/cmpwithshort.ll
new file mode 100644
index 0000000..14efdcc
--- /dev/null
+++ b/test/CodeGen/AArch64/cmpwithshort.ll
@@ -0,0 +1,46 @@
+; RUN: llc -O3 -march=aarch64 < %s | FileCheck %s
+
+define i16 @test_1cmp_signed_1(i16* %ptr1) {
+; CHECK-LABEL: test_1cmp_signed_1
+; CHECK: ldrsh
+; CHECK-NEXT: cmn
+entry:
+ %addr = getelementptr inbounds i16* %ptr1, i16 0
+ %val = load i16* %addr, align 2
+ %cmp = icmp eq i16 %val, -1
+ br i1 %cmp, label %if, label %if.then
+if:
+ ret i16 1
+if.then:
+ ret i16 0
+}
+
+define i16 @test_1cmp_signed_2(i16* %ptr1) {
+; CHECK-LABEL: test_1cmp_signed_2
+; CHECK: ldrsh
+; CHECK-NEXT: cmn
+entry:
+ %addr = getelementptr inbounds i16* %ptr1, i16 0
+ %val = load i16* %addr, align 2
+ %cmp = icmp sge i16 %val, -1
+ br i1 %cmp, label %if, label %if.then
+if:
+ ret i16 1
+if.then:
+ ret i16 0
+}
+
+define i16 @test_1cmp_unsigned_1(i16* %ptr1) {
+; CHECK-LABEL: test_1cmp_unsigned_1
+; CHECK: ldrsh
+; CHECK-NEXT: cmn
+entry:
+ %addr = getelementptr inbounds i16* %ptr1, i16 0
+ %val = load i16* %addr, align 2
+ %cmp = icmp uge i16 %val, -1
+ br i1 %cmp, label %if, label %if.then
+if:
+ ret i16 1
+if.then:
+ ret i16 0
+}
diff --git a/test/CodeGen/AArch64/combine-comparisons-by-cse.ll b/test/CodeGen/AArch64/combine-comparisons-by-cse.ll
new file mode 100644
index 0000000..df8dc87
--- /dev/null
+++ b/test/CodeGen/AArch64/combine-comparisons-by-cse.ll
@@ -0,0 +1,413 @@
+; RUN: llc < %s -march=aarch64 -mtriple=aarch64-linux-gnu | FileCheck %s
+
+; marked as external to prevent possible optimizations
+@a = external global i32
+@b = external global i32
+@c = external global i32
+@d = external global i32
+
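+; The combine_* tests below compare the same value against two nearby constants and
+; expect the first compare to be adjusted so the second cmp/cmn becomes redundant and
+; is removed (enforced by the CHECK-NOT lines); the do_nothing_* tests check that the
+; combine backs off when the two compares cannot be reconciled.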
+; (a > 10 && b == c) || (a >= 10 && b == d)
+define i32 @combine_gt_ge_10() #0 {
+; CHECK-LABEL: combine_gt_ge_10
+; CHECK: cmp
+; CHECK: b.le
+; CHECK: ret
+; CHECK-NOT: cmp
+; CHECK: b.lt
+entry:
+ %0 = load i32* @a, align 4
+ %cmp = icmp sgt i32 %0, 10
+ br i1 %cmp, label %land.lhs.true, label %lor.lhs.false
+
+land.lhs.true: ; preds = %entry
+ %1 = load i32* @b, align 4
+ %2 = load i32* @c, align 4
+ %cmp1 = icmp eq i32 %1, %2
+ br i1 %cmp1, label %return, label %land.lhs.true3
+
+lor.lhs.false: ; preds = %entry
+ %cmp2 = icmp sgt i32 %0, 9
+ br i1 %cmp2, label %land.lhs.true3, label %if.end
+
+land.lhs.true3: ; preds = %lor.lhs.false, %land.lhs.true
+ %3 = load i32* @b, align 4
+ %4 = load i32* @d, align 4
+ %cmp4 = icmp eq i32 %3, %4
+ br i1 %cmp4, label %return, label %if.end
+
+if.end: ; preds = %land.lhs.true3, %lor.lhs.false
+ br label %return
+
+return: ; preds = %if.end, %land.lhs.true3, %land.lhs.true
+ %retval.0 = phi i32 [ 0, %if.end ], [ 1, %land.lhs.true3 ], [ 1, %land.lhs.true ]
+ ret i32 %retval.0
+}
+
+; (a > 5 && b == c) || (a < 5 && b == d)
+define i32 @combine_gt_lt_5() #0 {
+; CHECK-LABEL: combine_gt_lt_5
+; CHECK: cmp
+; CHECK: b.le
+; CHECK: ret
+; CHECK-NOT: cmp
+; CHECK: b.ge
+entry:
+ %0 = load i32* @a, align 4
+ %cmp = icmp sgt i32 %0, 5
+ br i1 %cmp, label %land.lhs.true, label %lor.lhs.false
+
+land.lhs.true: ; preds = %entry
+ %1 = load i32* @b, align 4
+ %2 = load i32* @c, align 4
+ %cmp1 = icmp eq i32 %1, %2
+ br i1 %cmp1, label %return, label %if.end
+
+lor.lhs.false: ; preds = %entry
+ %cmp2 = icmp slt i32 %0, 5
+ br i1 %cmp2, label %land.lhs.true3, label %if.end
+
+land.lhs.true3: ; preds = %lor.lhs.false
+ %3 = load i32* @b, align 4
+ %4 = load i32* @d, align 4
+ %cmp4 = icmp eq i32 %3, %4
+ br i1 %cmp4, label %return, label %if.end
+
+if.end: ; preds = %land.lhs.true3, %lor.lhs.false, %land.lhs.true
+ br label %return
+
+return: ; preds = %if.end, %land.lhs.true3, %land.lhs.true
+ %retval.0 = phi i32 [ 0, %if.end ], [ 1, %land.lhs.true3 ], [ 1, %land.lhs.true ]
+ ret i32 %retval.0
+}
+
+; (a < 5 && b == c) || (a <= 5 && b == d)
+define i32 @combine_lt_ge_5() #0 {
+; CHECK-LABEL: combine_lt_ge_5
+; CHECK: cmp
+; CHECK: b.ge
+; CHECK: ret
+; CHECK-NOT: cmp
+; CHECK: b.gt
+entry:
+ %0 = load i32* @a, align 4
+ %cmp = icmp slt i32 %0, 5
+ br i1 %cmp, label %land.lhs.true, label %lor.lhs.false
+
+land.lhs.true: ; preds = %entry
+ %1 = load i32* @b, align 4
+ %2 = load i32* @c, align 4
+ %cmp1 = icmp eq i32 %1, %2
+ br i1 %cmp1, label %return, label %land.lhs.true3
+
+lor.lhs.false: ; preds = %entry
+ %cmp2 = icmp slt i32 %0, 6
+ br i1 %cmp2, label %land.lhs.true3, label %if.end
+
+land.lhs.true3: ; preds = %lor.lhs.false, %land.lhs.true
+ %3 = load i32* @b, align 4
+ %4 = load i32* @d, align 4
+ %cmp4 = icmp eq i32 %3, %4
+ br i1 %cmp4, label %return, label %if.end
+
+if.end: ; preds = %land.lhs.true3, %lor.lhs.false
+ br label %return
+
+return: ; preds = %if.end, %land.lhs.true3, %land.lhs.true
+ %retval.0 = phi i32 [ 0, %if.end ], [ 1, %land.lhs.true3 ], [ 1, %land.lhs.true ]
+ ret i32 %retval.0
+}
+
+; (a < 5 && b == c) || (a > 5 && b == d)
+define i32 @combine_lt_gt_5() #0 {
+; CHECK-LABEL: combine_lt_gt_5
+; CHECK: cmp
+; CHECK: b.ge
+; CHECK: ret
+; CHECK-NOT: cmp
+; CHECK: b.le
+entry:
+ %0 = load i32* @a, align 4
+ %cmp = icmp slt i32 %0, 5
+ br i1 %cmp, label %land.lhs.true, label %lor.lhs.false
+
+land.lhs.true: ; preds = %entry
+ %1 = load i32* @b, align 4
+ %2 = load i32* @c, align 4
+ %cmp1 = icmp eq i32 %1, %2
+ br i1 %cmp1, label %return, label %if.end
+
+lor.lhs.false: ; preds = %entry
+ %cmp2 = icmp sgt i32 %0, 5
+ br i1 %cmp2, label %land.lhs.true3, label %if.end
+
+land.lhs.true3: ; preds = %lor.lhs.false
+ %3 = load i32* @b, align 4
+ %4 = load i32* @d, align 4
+ %cmp4 = icmp eq i32 %3, %4
+ br i1 %cmp4, label %return, label %if.end
+
+if.end: ; preds = %land.lhs.true3, %lor.lhs.false, %land.lhs.true
+ br label %return
+
+return: ; preds = %if.end, %land.lhs.true3, %land.lhs.true
+ %retval.0 = phi i32 [ 0, %if.end ], [ 1, %land.lhs.true3 ], [ 1, %land.lhs.true ]
+ ret i32 %retval.0
+}
+
+; (a > -5 && b == c) || (a < -5 && b == d)
+define i32 @combine_gt_lt_n5() #0 {
+; CHECK-LABEL: combine_gt_lt_n5
+; CHECK: cmn
+; CHECK: b.le
+; CHECK: ret
+; CHECK-NOT: cmn
+; CHECK: b.ge
+entry:
+ %0 = load i32* @a, align 4
+ %cmp = icmp sgt i32 %0, -5
+ br i1 %cmp, label %land.lhs.true, label %lor.lhs.false
+
+land.lhs.true: ; preds = %entry
+ %1 = load i32* @b, align 4
+ %2 = load i32* @c, align 4
+ %cmp1 = icmp eq i32 %1, %2
+ br i1 %cmp1, label %return, label %if.end
+
+lor.lhs.false: ; preds = %entry
+ %cmp2 = icmp slt i32 %0, -5
+ br i1 %cmp2, label %land.lhs.true3, label %if.end
+
+land.lhs.true3: ; preds = %lor.lhs.false
+ %3 = load i32* @b, align 4
+ %4 = load i32* @d, align 4
+ %cmp4 = icmp eq i32 %3, %4
+ br i1 %cmp4, label %return, label %if.end
+
+if.end: ; preds = %land.lhs.true3, %lor.lhs.false, %land.lhs.true
+ br label %return
+
+return: ; preds = %if.end, %land.lhs.true3, %land.lhs.true
+ %retval.0 = phi i32 [ 0, %if.end ], [ 1, %land.lhs.true3 ], [ 1, %land.lhs.true ]
+ ret i32 %retval.0
+}
+
+; (a < -5 && b == c) || (a > -5 && b == d)
+define i32 @combine_lt_gt_n5() #0 {
+; CHECK-LABEL: combine_lt_gt_n5
+; CHECK: cmn
+; CHECK: b.ge
+; CHECK: ret
+; CHECK-NOT: cmn
+; CHECK: b.le
+entry:
+ %0 = load i32* @a, align 4
+ %cmp = icmp slt i32 %0, -5
+ br i1 %cmp, label %land.lhs.true, label %lor.lhs.false
+
+land.lhs.true: ; preds = %entry
+ %1 = load i32* @b, align 4
+ %2 = load i32* @c, align 4
+ %cmp1 = icmp eq i32 %1, %2
+ br i1 %cmp1, label %return, label %if.end
+
+lor.lhs.false: ; preds = %entry
+ %cmp2 = icmp sgt i32 %0, -5
+ br i1 %cmp2, label %land.lhs.true3, label %if.end
+
+land.lhs.true3: ; preds = %lor.lhs.false
+ %3 = load i32* @b, align 4
+ %4 = load i32* @d, align 4
+ %cmp4 = icmp eq i32 %3, %4
+ br i1 %cmp4, label %return, label %if.end
+
+if.end: ; preds = %land.lhs.true3, %lor.lhs.false, %land.lhs.true
+ br label %return
+
+return: ; preds = %if.end, %land.lhs.true3, %land.lhs.true
+ %retval.0 = phi i32 [ 0, %if.end ], [ 1, %land.lhs.true3 ], [ 1, %land.lhs.true ]
+ ret i32 %retval.0
+}
+
+%struct.Struct = type { i64, i64 }
+
+@glob = internal unnamed_addr global %struct.Struct* null, align 8
+
+declare %struct.Struct* @Update(%struct.Struct*) #1
+
+; No checks for this case; it just needs to compile without errors.
+define void @combine_non_adjacent_cmp_br(%struct.Struct* nocapture readonly %hdCall) #0 {
+entry:
+ %size = getelementptr inbounds %struct.Struct* %hdCall, i64 0, i32 0
+ %0 = load i64* %size, align 8
+ br label %land.rhs
+
+land.rhs:
+ %rp.06 = phi i64 [ %0, %entry ], [ %sub, %while.body ]
+ %1 = load i64* inttoptr (i64 24 to i64*), align 8
+ %cmp2 = icmp sgt i64 %1, 0
+ br i1 %cmp2, label %while.body, label %while.end
+
+while.body:
+ %2 = load %struct.Struct** @glob, align 8
+ %call = tail call %struct.Struct* @Update(%struct.Struct* %2) #2
+ %sub = add nsw i64 %rp.06, -2
+ %cmp = icmp slt i64 %0, %rp.06
+ br i1 %cmp, label %land.rhs, label %while.end
+
+while.end:
+ ret void
+}
+
+; undefined external to prevent possible optimizations
+declare void @do_something() #1
+
+define i32 @do_nothing_if_resultant_opcodes_would_differ() #0 {
+; CHECK-LABEL: do_nothing_if_resultant_opcodes_would_differ
+; CHECK: cmn
+; CHECK: b.gt
+; CHECK: cmp
+; CHECK: b.gt
+entry:
+ %0 = load i32* @a, align 4
+ %cmp4 = icmp slt i32 %0, -1
+ br i1 %cmp4, label %while.body.preheader, label %while.end
+
+while.body.preheader: ; preds = %entry
+ br label %while.body
+
+while.body: ; preds = %while.body, %while.body.preheader
+ %i.05 = phi i32 [ %inc, %while.body ], [ %0, %while.body.preheader ]
+ tail call void @do_something() #2
+ %inc = add nsw i32 %i.05, 1
+ %cmp = icmp slt i32 %i.05, 0
+ br i1 %cmp, label %while.body, label %while.cond.while.end_crit_edge
+
+while.cond.while.end_crit_edge: ; preds = %while.body
+ %.pre = load i32* @a, align 4
+ br label %while.end
+
+while.end: ; preds = %while.cond.while.end_crit_edge, %entry
+ %1 = phi i32 [ %.pre, %while.cond.while.end_crit_edge ], [ %0, %entry ]
+ %cmp1 = icmp slt i32 %1, 2
+ br i1 %cmp1, label %land.lhs.true, label %if.end
+
+land.lhs.true: ; preds = %while.end
+ %2 = load i32* @b, align 4
+ %3 = load i32* @d, align 4
+ %cmp2 = icmp eq i32 %2, %3
+ br i1 %cmp2, label %return, label %if.end
+
+if.end: ; preds = %land.lhs.true, %while.end
+ br label %return
+
+return: ; preds = %if.end, %land.lhs.true
+ %retval.0 = phi i32 [ 0, %if.end ], [ 123, %land.lhs.true ]
+ ret i32 %retval.0
+}
+
+define i32 @do_nothing_if_compares_can_not_be_adjusted_to_each_other() #0 {
+; CHECK-LABEL: do_nothing_if_compares_can_not_be_adjusted_to_each_other
+; CHECK: cmp
+; CHECK: b.gt
+; CHECK: cmn
+; CHECK: b.lt
+entry:
+ %0 = load i32* @a, align 4
+ %cmp4 = icmp slt i32 %0, 1
+ br i1 %cmp4, label %while.body.preheader, label %while.end
+
+while.body.preheader: ; preds = %entry
+ br label %while.body
+
+while.body: ; preds = %while.body, %while.body.preheader
+ %i.05 = phi i32 [ %inc, %while.body ], [ %0, %while.body.preheader ]
+ tail call void @do_something() #2
+ %inc = add nsw i32 %i.05, 1
+ %cmp = icmp slt i32 %i.05, 0
+ br i1 %cmp, label %while.body, label %while.end.loopexit
+
+while.end.loopexit: ; preds = %while.body
+ br label %while.end
+
+while.end: ; preds = %while.end.loopexit, %entry
+ %1 = load i32* @c, align 4
+ %cmp1 = icmp sgt i32 %1, -3
+ br i1 %cmp1, label %land.lhs.true, label %if.end
+
+land.lhs.true: ; preds = %while.end
+ %2 = load i32* @b, align 4
+ %3 = load i32* @d, align 4
+ %cmp2 = icmp eq i32 %2, %3
+ br i1 %cmp2, label %return, label %if.end
+
+if.end: ; preds = %land.lhs.true, %while.end
+ br label %return
+
+return: ; preds = %if.end, %land.lhs.true
+ %retval.0 = phi i32 [ 0, %if.end ], [ 123, %land.lhs.true ]
+ ret i32 %retval.0
+}
+
+; Test that in the following case we don't hit the 'cmp' and trigger a false positive:
+; cmp w19, #0
+; cinc w0, w19, gt
+; ...
+; fcmp d8, #0.0
+; b.gt .LBB0_5
+
+define i32 @fcmpri(i32 %argc, i8** nocapture readonly %argv) {
+
+; CHECK-LABEL: fcmpri:
+; CHECK: cmp w0, #2
+; CHECK: b.lt .LBB9_3
+; CHECK-NOT: cmp w0, #1
+; CHECK-NOT: b.le .LBB9_3
+
+; CHECK-LABEL: .LBB9_3:
+; CHECK: cmp w19, #0
+; CHECK: fcmp d8, #0.0
+; CHECK: b.gt .LBB9_5
+; CHECK-NOT: cmp w19, #1
+; CHECK-NOT: b.ge .LBB9_5
+
+entry:
+ %cmp = icmp sgt i32 %argc, 1
+ br i1 %cmp, label %land.lhs.true, label %if.end
+
+land.lhs.true: ; preds = %entry
+ %arrayidx = getelementptr inbounds i8** %argv, i64 1
+ %0 = load i8** %arrayidx, align 8
+ %cmp1 = icmp eq i8* %0, null
+ br i1 %cmp1, label %if.end, label %return
+
+if.end: ; preds = %land.lhs.true, %entry
+ %call = call i32 @zoo(i32 1)
+ %call2 = call double @yoo(i32 -1)
+ %cmp4 = icmp sgt i32 %call, 0
+ %add = zext i1 %cmp4 to i32
+ %cond = add nsw i32 %add, %call
+ %call7 = call i32 @xoo(i32 %cond, i32 2)
+ %cmp9 = fcmp ogt double %call2, 0.000000e+00
+ br i1 %cmp9, label %cond.end14, label %cond.false12
+
+cond.false12: ; preds = %if.end
+ %sub = fadd fast double %call2, -1.000000e+00
+ br label %cond.end14
+
+cond.end14: ; preds = %if.end, %cond.false12
+ %cond15 = phi double [ %sub, %cond.false12 ], [ %call2, %if.end ]
+ %call16 = call i32 @woo(double %cond15, double -2.000000e+00)
+ br label %return
+
+return: ; preds = %land.lhs.true, %cond.end14
+ %retval.0 = phi i32 [ 4, %cond.end14 ], [ 3, %land.lhs.true ]
+ ret i32 %retval.0
+}
+
+declare i32 @zoo(i32)
+
+declare double @yoo(i32)
+
+declare i32 @xoo(i32, i32)
+
+declare i32 @woo(double, double)
diff --git a/test/CodeGen/AArch64/cond-sel.ll b/test/CodeGen/AArch64/cond-sel.ll
index 5f81cba..dfc83aa 100644
--- a/test/CodeGen/AArch64/cond-sel.ll
+++ b/test/CodeGen/AArch64/cond-sel.ll
@@ -214,3 +214,20 @@ define void @test_csetm(i32 %lhs, i32 %rhs, i64 %lhs64) {
ret void
; CHECK: ret
}
+
+define <1 x i1> @test_wide_comparison(i32 %in) {
+; CHECK-LABEL: test_wide_comparison:
+; CHECK: cmp w0, #1234
+; CHECK: cset
+
+ %tmp = icmp sgt i32 %in, 1234
+ %res = select i1 %tmp, <1 x i1> <i1 1>, <1 x i1> zeroinitializer
+ ret <1 x i1> %res
+}
+
+define i32 @test_select_undef() {
+; CHECK-LABEL: test_select_undef:
+; CHECK: ret
+ %res = select i1 undef, i32 0, i32 42
+ ret i32 %res
+}
diff --git a/test/CodeGen/AArch64/dag-combine-invaraints.ll b/test/CodeGen/AArch64/dag-combine-invaraints.ll
new file mode 100644
index 0000000..115fc64
--- /dev/null
+++ b/test/CodeGen/AArch64/dag-combine-invaraints.ll
@@ -0,0 +1,36 @@
+; RUN: llc -mtriple=arm64-apple-darwin8.0 -relocation-model=pic -O1 < %s | FileCheck %s
+
+@.str2 = private unnamed_addr constant [9 x i8] c"_%d____\0A\00", align 1
+
+; Function Attrs: nounwind ssp
+define i32 @main(i32 %argc, i8** %argv) #0 {
+main_:
+ %tmp = alloca i32, align 4
+ %i32T = alloca i32, align 4
+ %i32F = alloca i32, align 4
+ %i32X = alloca i32, align 4
+ store i32 0, i32* %tmp
+ store i32 15, i32* %i32T, align 4
+ store i32 5, i32* %i32F, align 4
+ %tmp6 = load i32* %tmp, align 4
+ %tmp7 = icmp ne i32 %tmp6, 0
+ %tmp8 = xor i1 %tmp7, true
+ %tmp9 = load i32* %i32T, align 4
+ %tmp10 = load i32* %i32F, align 4
+ %DHSelect = select i1 %tmp8, i32 %tmp9, i32 %tmp10
+ store i32 %DHSelect, i32* %i32X, align 4
+ %tmp15 = load i32* %i32X, align 4
+ %tmp17 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([9 x i8]* @.str2, i32 0, i32 0), i32 %tmp15)
+ ret i32 0
+
+; CHECK: main:
+; CHECK-DAG: movz
+; CHECK-DAG: orr
+; CHECK: csel
+}
+
+
+declare i32 @printf(i8*, ...) #1
+
+attributes #0 = { nounwind ssp "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/test/CodeGen/AArch64/dont-take-over-the-world.ll b/test/CodeGen/AArch64/dont-take-over-the-world.ll
new file mode 100644
index 0000000..d9e13b7
--- /dev/null
+++ b/test/CodeGen/AArch64/dont-take-over-the-world.ll
@@ -0,0 +1,7 @@
+; RUN: not llc -mtriple=x86-64 < %s 2>&1 | FileCheck %s
+
+; To support "arm64" as a -march option, we need to register a second AArch64
+; target, but we have to be careful how we do that so that it doesn't become the
+; target of last resort when the specified triple is completely wrong.
+
+; CHECK: unable to get target for 'x86-64', see --version and --triple.
diff --git a/test/CodeGen/AArch64/extern-weak.ll b/test/CodeGen/AArch64/extern-weak.ll
index ce5c0f6..f647c4b 100644
--- a/test/CodeGen/AArch64/extern-weak.ll
+++ b/test/CodeGen/AArch64/extern-weak.ll
@@ -1,17 +1,24 @@
-; RUN: llc -mtriple=aarch64-none-linux-gnu -o - %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-none-linux-gnu -relocation-model=pic -o - %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-none-linux-gnu -relocation-model=static -o - < %s | FileCheck --check-prefix=CHECK-STATIC %s
; RUN: llc -mtriple=aarch64-none-linux-gnu -code-model=large -o - %s | FileCheck --check-prefix=CHECK-LARGE %s
declare extern_weak i32 @var()
define i32()* @foo() {
; The usual ADRP/ADD pair can't be used for a weak reference because it must
-; evaluate to 0 if the symbol is undefined. We use a litpool entry.
+; evaluate to 0 if the symbol is undefined. We use a GOT entry for PIC,
+; otherwise a litpool entry.
ret i32()* @var
; CHECK: adrp x[[ADDRHI:[0-9]+]], :got:var
; CHECK: ldr x0, [x[[ADDRHI]], :got_lo12:var]
+; CHECK-STATIC: .LCPI0_0:
+; CHECK-STATIC-NEXT: .xword var
+; CHECK-STATIC: adrp x[[VAR:[0-9]+]], .LCPI0_0
+; CHECK-STATIC: ldr x0, [x[[VAR]], :lo12:.LCPI0_0]
+
; In the large model, the usual relocations are absolute and can
; materialise 0.
; CHECK-LARGE: movz x0, #:abs_g3:var
@@ -31,6 +38,11 @@ define i32* @bar() {
; CHECK: ldr [[BASE:x[0-9]+]], [x[[ADDRHI]], :got_lo12:arr_var]
; CHECK: add x0, [[BASE]], #20
+; CHECK-STATIC: .LCPI1_0:
+; CHECK-STATIC-NEXT: .xword arr_var
+; CHECK-STATIC: ldr [[BASE:x[0-9]+]], [{{x[0-9]+}}, :lo12:.LCPI1_0]
+; CHECK-STATIC: add x0, [[BASE]], #20
+
ret i32* %addr
; In the large model, the usual relocations are absolute and can
@@ -49,6 +61,9 @@ define i32* @wibble() {
; CHECK: adrp [[BASE:x[0-9]+]], defined_weak_var
; CHECK: add x0, [[BASE]], :lo12:defined_weak_var
+; CHECK-STATIC: adrp [[BASE:x[0-9]+]], defined_weak_var
+; CHECK-STATIC: add x0, [[BASE]], :lo12:defined_weak_var
+
; CHECK-LARGE: movz x0, #:abs_g3:defined_weak_var
; CHECK-LARGE: movk x0, #:abs_g2_nc:defined_weak_var
; CHECK-LARGE: movk x0, #:abs_g1_nc:defined_weak_var
diff --git a/test/CodeGen/AArch64/f16-convert.ll b/test/CodeGen/AArch64/f16-convert.ll
index 6fabdc5..12412d4 100644
--- a/test/CodeGen/AArch64/f16-convert.ll
+++ b/test/CodeGen/AArch64/f16-convert.ll
@@ -7,7 +7,7 @@ define float @load0(i16* nocapture readonly %a) nounwind {
; CHECK-NEXT: ret
%tmp = load i16* %a, align 2
- %tmp1 = tail call float @llvm.convert.from.fp16(i16 %tmp)
+ %tmp1 = tail call float @llvm.convert.from.fp16.f32(i16 %tmp)
ret float %tmp1
}
@@ -18,8 +18,7 @@ define double @load1(i16* nocapture readonly %a) nounwind {
; CHECK-NEXT: ret
%tmp = load i16* %a, align 2
- %tmp1 = tail call float @llvm.convert.from.fp16(i16 %tmp)
- %conv = fpext float %tmp1 to double
+ %conv = tail call double @llvm.convert.from.fp16.f64(i16 %tmp)
ret double %conv
}
@@ -32,7 +31,7 @@ define float @load2(i16* nocapture readonly %a, i32 %i) nounwind {
%idxprom = sext i32 %i to i64
%arrayidx = getelementptr inbounds i16* %a, i64 %idxprom
%tmp = load i16* %arrayidx, align 2
- %tmp1 = tail call float @llvm.convert.from.fp16(i16 %tmp)
+ %tmp1 = tail call float @llvm.convert.from.fp16.f32(i16 %tmp)
ret float %tmp1
}
@@ -45,8 +44,7 @@ define double @load3(i16* nocapture readonly %a, i32 %i) nounwind {
%idxprom = sext i32 %i to i64
%arrayidx = getelementptr inbounds i16* %a, i64 %idxprom
%tmp = load i16* %arrayidx, align 2
- %tmp1 = tail call float @llvm.convert.from.fp16(i16 %tmp)
- %conv = fpext float %tmp1 to double
+ %conv = tail call double @llvm.convert.from.fp16.f64(i16 %tmp)
ret double %conv
}
@@ -58,7 +56,7 @@ define float @load4(i16* nocapture readonly %a, i64 %i) nounwind {
%arrayidx = getelementptr inbounds i16* %a, i64 %i
%tmp = load i16* %arrayidx, align 2
- %tmp1 = tail call float @llvm.convert.from.fp16(i16 %tmp)
+ %tmp1 = tail call float @llvm.convert.from.fp16.f32(i16 %tmp)
ret float %tmp1
}
@@ -70,8 +68,7 @@ define double @load5(i16* nocapture readonly %a, i64 %i) nounwind {
%arrayidx = getelementptr inbounds i16* %a, i64 %i
%tmp = load i16* %arrayidx, align 2
- %tmp1 = tail call float @llvm.convert.from.fp16(i16 %tmp)
- %conv = fpext float %tmp1 to double
+ %conv = tail call double @llvm.convert.from.fp16.f64(i16 %tmp)
ret double %conv
}
@@ -83,7 +80,7 @@ define float @load6(i16* nocapture readonly %a) nounwind {
%arrayidx = getelementptr inbounds i16* %a, i64 10
%tmp = load i16* %arrayidx, align 2
- %tmp1 = tail call float @llvm.convert.from.fp16(i16 %tmp)
+ %tmp1 = tail call float @llvm.convert.from.fp16.f32(i16 %tmp)
ret float %tmp1
}
@@ -95,8 +92,7 @@ define double @load7(i16* nocapture readonly %a) nounwind {
%arrayidx = getelementptr inbounds i16* %a, i64 10
%tmp = load i16* %arrayidx, align 2
- %tmp1 = tail call float @llvm.convert.from.fp16(i16 %tmp)
- %conv = fpext float %tmp1 to double
+ %conv = tail call double @llvm.convert.from.fp16.f64(i16 %tmp)
ret double %conv
}
@@ -108,7 +104,7 @@ define float @load8(i16* nocapture readonly %a) nounwind {
%arrayidx = getelementptr inbounds i16* %a, i64 -10
%tmp = load i16* %arrayidx, align 2
- %tmp1 = tail call float @llvm.convert.from.fp16(i16 %tmp)
+ %tmp1 = tail call float @llvm.convert.from.fp16.f32(i16 %tmp)
ret float %tmp1
}
@@ -120,8 +116,7 @@ define double @load9(i16* nocapture readonly %a) nounwind {
%arrayidx = getelementptr inbounds i16* %a, i64 -10
%tmp = load i16* %arrayidx, align 2
- %tmp1 = tail call float @llvm.convert.from.fp16(i16 %tmp)
- %conv = fpext float %tmp1 to double
+ %conv = tail call double @llvm.convert.from.fp16.f64(i16 %tmp)
ret double %conv
}
@@ -131,7 +126,7 @@ define void @store0(i16* nocapture %a, float %val) nounwind {
; CHECK-NEXT: str h0, [x0]
; CHECK-NEXT: ret
- %tmp = tail call i16 @llvm.convert.to.fp16(float %val)
+ %tmp = tail call i16 @llvm.convert.to.fp16.f32(float %val)
store i16 %tmp, i16* %a, align 2
ret void
}
@@ -143,7 +138,7 @@ define void @store1(i16* nocapture %a, double %val) nounwind {
; CHECK-NEXT: ret
%conv = fptrunc double %val to float
- %tmp = tail call i16 @llvm.convert.to.fp16(float %conv)
+ %tmp = tail call i16 @llvm.convert.to.fp16.f32(float %conv)
store i16 %tmp, i16* %a, align 2
ret void
}
@@ -154,7 +149,7 @@ define void @store2(i16* nocapture %a, i32 %i, float %val) nounwind {
; CHECK-NEXT: str h0, [x0, w1, sxtw #1]
; CHECK-NEXT: ret
- %tmp = tail call i16 @llvm.convert.to.fp16(float %val)
+ %tmp = tail call i16 @llvm.convert.to.fp16.f32(float %val)
%idxprom = sext i32 %i to i64
%arrayidx = getelementptr inbounds i16* %a, i64 %idxprom
store i16 %tmp, i16* %arrayidx, align 2
@@ -168,7 +163,7 @@ define void @store3(i16* nocapture %a, i32 %i, double %val) nounwind {
; CHECK-NEXT: ret
%conv = fptrunc double %val to float
- %tmp = tail call i16 @llvm.convert.to.fp16(float %conv)
+ %tmp = tail call i16 @llvm.convert.to.fp16.f32(float %conv)
%idxprom = sext i32 %i to i64
%arrayidx = getelementptr inbounds i16* %a, i64 %idxprom
store i16 %tmp, i16* %arrayidx, align 2
@@ -181,7 +176,7 @@ define void @store4(i16* nocapture %a, i64 %i, float %val) nounwind {
; CHECK-NEXT: str h0, [x0, x1, lsl #1]
; CHECK-NEXT: ret
- %tmp = tail call i16 @llvm.convert.to.fp16(float %val)
+ %tmp = tail call i16 @llvm.convert.to.fp16.f32(float %val)
%arrayidx = getelementptr inbounds i16* %a, i64 %i
store i16 %tmp, i16* %arrayidx, align 2
ret void
@@ -194,7 +189,7 @@ define void @store5(i16* nocapture %a, i64 %i, double %val) nounwind {
; CHECK-NEXT: ret
%conv = fptrunc double %val to float
- %tmp = tail call i16 @llvm.convert.to.fp16(float %conv)
+ %tmp = tail call i16 @llvm.convert.to.fp16.f32(float %conv)
%arrayidx = getelementptr inbounds i16* %a, i64 %i
store i16 %tmp, i16* %arrayidx, align 2
ret void
@@ -206,7 +201,7 @@ define void @store6(i16* nocapture %a, float %val) nounwind {
; CHECK-NEXT: str h0, [x0, #20]
; CHECK-NEXT: ret
- %tmp = tail call i16 @llvm.convert.to.fp16(float %val)
+ %tmp = tail call i16 @llvm.convert.to.fp16.f32(float %val)
%arrayidx = getelementptr inbounds i16* %a, i64 10
store i16 %tmp, i16* %arrayidx, align 2
ret void
@@ -219,7 +214,7 @@ define void @store7(i16* nocapture %a, double %val) nounwind {
; CHECK-NEXT: ret
%conv = fptrunc double %val to float
- %tmp = tail call i16 @llvm.convert.to.fp16(float %conv)
+ %tmp = tail call i16 @llvm.convert.to.fp16.f32(float %conv)
%arrayidx = getelementptr inbounds i16* %a, i64 10
store i16 %tmp, i16* %arrayidx, align 2
ret void
@@ -231,7 +226,7 @@ define void @store8(i16* nocapture %a, float %val) nounwind {
; CHECK-NEXT: stur h0, [x0, #-20]
; CHECK-NEXT: ret
- %tmp = tail call i16 @llvm.convert.to.fp16(float %val)
+ %tmp = tail call i16 @llvm.convert.to.fp16.f32(float %val)
%arrayidx = getelementptr inbounds i16* %a, i64 -10
store i16 %tmp, i16* %arrayidx, align 2
ret void
@@ -244,11 +239,13 @@ define void @store9(i16* nocapture %a, double %val) nounwind {
; CHECK-NEXT: ret
%conv = fptrunc double %val to float
- %tmp = tail call i16 @llvm.convert.to.fp16(float %conv)
+ %tmp = tail call i16 @llvm.convert.to.fp16.f32(float %conv)
%arrayidx = getelementptr inbounds i16* %a, i64 -10
store i16 %tmp, i16* %arrayidx, align 2
ret void
}
-declare i16 @llvm.convert.to.fp16(float) nounwind readnone
-declare float @llvm.convert.from.fp16(i16) nounwind readnone
+declare i16 @llvm.convert.to.fp16.f32(float) nounwind readnone
+declare float @llvm.convert.from.fp16.f32(i16) nounwind readnone
+declare i16 @llvm.convert.to.fp16.f64(double) nounwind readnone
+declare double @llvm.convert.from.fp16.f64(i16) nounwind readnone
diff --git a/test/CodeGen/AArch64/fast-isel-addressing-modes.ll b/test/CodeGen/AArch64/fast-isel-addressing-modes.ll
new file mode 100644
index 0000000..d86f00d
--- /dev/null
+++ b/test/CodeGen/AArch64/fast-isel-addressing-modes.ll
@@ -0,0 +1,627 @@
+; RUN: llc -mtriple=aarch64-apple-darwin -verify-machineinstrs < %s | FileCheck %s --check-prefix=CHECK --check-prefix=SDAG
+; RUN: llc -mtriple=aarch64-apple-darwin -fast-isel -fast-isel-abort -verify-machineinstrs < %s | FileCheck %s --check-prefix=CHECK --check-prefix=FAST
+
+; Load / Store Base Register only
+define zeroext i1 @load_breg_i1(i1* %a) {
+; CHECK-LABEL: load_breg_i1
+; CHECK: ldrb {{w[0-9]+}}, [x0]
+ %1 = load i1* %a
+ ret i1 %1
+}
+
+define zeroext i8 @load_breg_i8(i8* %a) {
+; CHECK-LABEL: load_breg_i8
+; CHECK: ldrb {{w[0-9]+}}, [x0]
+ %1 = load i8* %a
+ ret i8 %1
+}
+
+define zeroext i16 @load_breg_i16(i16* %a) {
+; CHECK-LABEL: load_breg_i16
+; CHECK: ldrh {{w[0-9]+}}, [x0]
+ %1 = load i16* %a
+ ret i16 %1
+}
+
+define i32 @load_breg_i32(i32* %a) {
+; CHECK-LABEL: load_breg_i32
+; CHECK: ldr {{w[0-9]+}}, [x0]
+ %1 = load i32* %a
+ ret i32 %1
+}
+
+define i64 @load_breg_i64(i64* %a) {
+; CHECK-LABEL: load_breg_i64
+; CHECK: ldr {{x[0-9]+}}, [x0]
+ %1 = load i64* %a
+ ret i64 %1
+}
+
+define float @load_breg_f32(float* %a) {
+; CHECK-LABEL: load_breg_f32
+; CHECK: ldr {{s[0-9]+}}, [x0]
+ %1 = load float* %a
+ ret float %1
+}
+
+define double @load_breg_f64(double* %a) {
+; CHECK-LABEL: load_breg_f64
+; CHECK: ldr {{d[0-9]+}}, [x0]
+ %1 = load double* %a
+ ret double %1
+}
+
+define void @store_breg_i1(i1* %a) {
+; CHECK-LABEL: store_breg_i1
+; CHECK: strb wzr, [x0]
+ store i1 0, i1* %a
+ ret void
+}
+
+define void @store_breg_i1_2(i1* %a) {
+; CHECK-LABEL: store_breg_i1_2
+; CHECK: strb {{w[0-9]+}}, [x0]
+ store i1 true, i1* %a
+ ret void
+}
+
+define void @store_breg_i8(i8* %a) {
+; CHECK-LABEL: store_breg_i8
+; CHECK: strb wzr, [x0]
+ store i8 0, i8* %a
+ ret void
+}
+
+define void @store_breg_i16(i16* %a) {
+; CHECK-LABEL: store_breg_i16
+; CHECK: strh wzr, [x0]
+ store i16 0, i16* %a
+ ret void
+}
+
+define void @store_breg_i32(i32* %a) {
+; CHECK-LABEL: store_breg_i32
+; CHECK: str wzr, [x0]
+ store i32 0, i32* %a
+ ret void
+}
+
+define void @store_breg_i64(i64* %a) {
+; CHECK-LABEL: store_breg_i64
+; CHECK: str xzr, [x0]
+ store i64 0, i64* %a
+ ret void
+}
+
+define void @store_breg_f32(float* %a) {
+; CHECK-LABEL: store_breg_f32
+; CHECK: str wzr, [x0]
+ store float 0.0, float* %a
+ ret void
+}
+
+define void @store_breg_f64(double* %a) {
+; CHECK-LABEL: store_breg_f64
+; CHECK: str xzr, [x0]
+ store double 0.0, double* %a
+ ret void
+}
+
+; Load Immediate
+define i32 @load_immoff_1() {
+; CHECK-LABEL: load_immoff_1
+; CHECK: orr {{w|x}}[[REG:[0-9]+]], {{wzr|xzr}}, #0x80
+; CHECK: ldr {{w[0-9]+}}, {{\[}}x[[REG]]{{\]}}
+ %1 = inttoptr i64 128 to i32*
+ %2 = load i32* %1
+ ret i32 %2
+}
+
+; Load / Store Base Register + Immediate Offset
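+; AArch64 loads/stores accept either a 9-bit signed unscaled offset ([-256, 255],
+; selecting ldur/stur) or a 12-bit unsigned scaled offset (a multiple of the access
+; size, up to 16380 for 32-bit accesses, selecting ldr/str); anything outside those
+; ranges needs a separate add/sub to materialize the address.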
+; Max supported negative offset
+define i32 @load_breg_immoff_1(i64 %a) {
+; CHECK-LABEL: load_breg_immoff_1
+; CHECK: ldur {{w[0-9]+}}, [x0, #-256]
+ %1 = add i64 %a, -256
+ %2 = inttoptr i64 %1 to i32*
+ %3 = load i32* %2
+ ret i32 %3
+}
+
+; Min not-supported negative offset
+define i32 @load_breg_immoff_2(i64 %a) {
+; CHECK-LABEL: load_breg_immoff_2
+; CHECK: sub [[REG:x[0-9]+]], x0, #257
+; CHECK-NEXT: ldr {{w[0-9]+}}, {{\[}}[[REG]]{{\]}}
+ %1 = add i64 %a, -257
+ %2 = inttoptr i64 %1 to i32*
+ %3 = load i32* %2
+ ret i32 %3
+}
+
+; Max supported unscaled offset
+define i32 @load_breg_immoff_3(i64 %a) {
+; CHECK-LABEL: load_breg_immoff_3
+; CHECK: ldur {{w[0-9]+}}, [x0, #255]
+ %1 = add i64 %a, 255
+ %2 = inttoptr i64 %1 to i32*
+ %3 = load i32* %2
+ ret i32 %3
+}
+
+; Min un-supported unscaled offset
+define i32 @load_breg_immoff_4(i64 %a) {
+; CHECK-LABEL: load_breg_immoff_4
+; CHECK: add [[REG:x[0-9]+]], x0, #257
+; CHECK-NEXT: ldr {{w[0-9]+}}, {{\[}}[[REG]]{{\]}}
+ %1 = add i64 %a, 257
+ %2 = inttoptr i64 %1 to i32*
+ %3 = load i32* %2
+ ret i32 %3
+}
+
+; Max supported scaled offset
+define i32 @load_breg_immoff_5(i64 %a) {
+; CHECK-LABEL: load_breg_immoff_5
+; CHECK: ldr {{w[0-9]+}}, [x0, #16380]
+ %1 = add i64 %a, 16380
+ %2 = inttoptr i64 %1 to i32*
+ %3 = load i32* %2
+ ret i32 %3
+}
+
+; Min un-supported scaled offset
+define i32 @load_breg_immoff_6(i64 %a) {
+; SDAG-LABEL: load_breg_immoff_6
+; SDAG: orr w[[NUM:[0-9]+]], wzr, #0x4000
+; SDAG-NEXT: ldr {{w[0-9]+}}, [x0, x[[NUM]]]
+; FAST-LABEL: load_breg_immoff_6
+; FAST: add [[REG:x[0-9]+]], x0, #4, lsl #12
+; FAST-NEXT: ldr {{w[0-9]+}}, {{\[}}[[REG]]{{\]}}
+ %1 = add i64 %a, 16384
+ %2 = inttoptr i64 %1 to i32*
+ %3 = load i32* %2
+ ret i32 %3
+}
+
+; Max supported negative offset
+define void @store_breg_immoff_1(i64 %a) {
+; CHECK-LABEL: store_breg_immoff_1
+; CHECK: stur wzr, [x0, #-256]
+ %1 = add i64 %a, -256
+ %2 = inttoptr i64 %1 to i32*
+ store i32 0, i32* %2
+ ret void
+}
+
+; Min not-supported negative offset
+define void @store_breg_immoff_2(i64 %a) {
+; CHECK-LABEL: store_breg_immoff_2
+; CHECK: sub [[REG:x[0-9]+]], x0, #257
+; CHECK-NEXT: str wzr, {{\[}}[[REG]]{{\]}}
+ %1 = add i64 %a, -257
+ %2 = inttoptr i64 %1 to i32*
+ store i32 0, i32* %2
+ ret void
+}
+
+; Max supported unscaled offset
+define void @store_breg_immoff_3(i64 %a) {
+; CHECK-LABEL: store_breg_immoff_3
+; CHECK: stur wzr, [x0, #255]
+ %1 = add i64 %a, 255
+ %2 = inttoptr i64 %1 to i32*
+ store i32 0, i32* %2
+ ret void
+}
+
+; Min un-supported unscaled offset
+define void @store_breg_immoff_4(i64 %a) {
+; CHECK-LABEL: store_breg_immoff_4
+; CHECK: add [[REG:x[0-9]+]], x0, #257
+; CHECK-NEXT: str wzr, {{\[}}[[REG]]{{\]}}
+ %1 = add i64 %a, 257
+ %2 = inttoptr i64 %1 to i32*
+ store i32 0, i32* %2
+ ret void
+}
+
+; Max supported scaled offset
+define void @store_breg_immoff_5(i64 %a) {
+; CHECK-LABEL: store_breg_immoff_5
+; CHECK: str wzr, [x0, #16380]
+ %1 = add i64 %a, 16380
+ %2 = inttoptr i64 %1 to i32*
+ store i32 0, i32* %2
+ ret void
+}
+
+; Min un-supported scaled offset
+define void @store_breg_immoff_6(i64 %a) {
+; SDAG-LABEL: store_breg_immoff_6
+; SDAG: orr w[[NUM:[0-9]+]], wzr, #0x4000
+; SDAG-NEXT: str wzr, [x0, x[[NUM]]]
+; FAST-LABEL: store_breg_immoff_6
+; FAST: add [[REG:x[0-9]+]], x0, #4, lsl #12
+; FAST-NEXT: str wzr, {{\[}}[[REG]]{{\]}}
+ %1 = add i64 %a, 16384
+ %2 = inttoptr i64 %1 to i32*
+ store i32 0, i32* %2
+ ret void
+}
+
+define i64 @load_breg_immoff_7(i64 %a) {
+; CHECK-LABEL: load_breg_immoff_7
+; CHECK: ldr {{x[0-9]+}}, [x0, #48]
+ %1 = add i64 %a, 48
+ %2 = inttoptr i64 %1 to i64*
+ %3 = load i64* %2
+ ret i64 %3
+}
+
+; Flip add operands
+define i64 @load_breg_immoff_8(i64 %a) {
+; CHECK-LABEL: load_breg_immoff_8
+; CHECK: ldr {{x[0-9]+}}, [x0, #48]
+ %1 = add i64 48, %a
+ %2 = inttoptr i64 %1 to i64*
+ %3 = load i64* %2
+ ret i64 %3
+}
+
+; Load Base Register + Register Offset
+define i64 @load_breg_offreg_1(i64 %a, i64 %b) {
+; CHECK-LABEL: load_breg_offreg_1
+; CHECK: ldr {{x[0-9]+}}, [x0, x1]
+ %1 = add i64 %a, %b
+ %2 = inttoptr i64 %1 to i64*
+ %3 = load i64* %2
+ ret i64 %3
+}
+
+; Flip add operands
+define i64 @load_breg_offreg_2(i64 %a, i64 %b) {
+; CHECK-LABEL: load_breg_offreg_2
+; CHECK: ldr {{x[0-9]+}}, [x1, x0]
+ %1 = add i64 %b, %a
+ %2 = inttoptr i64 %1 to i64*
+ %3 = load i64* %2
+ ret i64 %3
+}
+
+; Load Base Register + Register Offset + Immediate Offset
+define i64 @load_breg_offreg_immoff_1(i64 %a, i64 %b) {
+; CHECK-LABEL: load_breg_offreg_immoff_1
+; CHECK: add [[REG:x[0-9]+]], x0, x1
+; CHECK-NEXT: ldr x0, {{\[}}[[REG]], #48{{\]}}
+ %1 = add i64 %a, %b
+ %2 = add i64 %1, 48
+ %3 = inttoptr i64 %2 to i64*
+ %4 = load i64* %3
+ ret i64 %4
+}
+
+define i64 @load_breg_offreg_immoff_2(i64 %a, i64 %b) {
+; SDAG-LABEL: load_breg_offreg_immoff_2
+; SDAG: add [[REG1:x[0-9]+]], x0, x1
+; SDAG-NEXT: orr w[[NUM:[0-9]+]], wzr, #0xf000
+; SDAG-NEXT: ldr x0, {{\[}}[[REG1]], x[[NUM]]]
+; FAST-LABEL: load_breg_offreg_immoff_2
+; FAST: add [[REG:x[0-9]+]], x0, #15, lsl #12
+; FAST-NEXT: ldr x0, {{\[}}[[REG]], x1{{\]}}
+ %1 = add i64 %a, %b
+ %2 = add i64 %1, 61440
+ %3 = inttoptr i64 %2 to i64*
+ %4 = load i64* %3
+ ret i64 %4
+}
+
+; Load Scaled Register Offset
+define i32 @load_shift_offreg_1(i64 %a) {
+; CHECK-LABEL: load_shift_offreg_1
+; CHECK: lsl [[REG:x[0-9]+]], x0, #2
+; CHECK: ldr {{w[0-9]+}}, {{\[}}[[REG]]{{\]}}
+ %1 = shl i64 %a, 2
+ %2 = inttoptr i64 %1 to i32*
+ %3 = load i32* %2
+ ret i32 %3
+}
+
+define i32 @load_mul_offreg_1(i64 %a) {
+; CHECK-LABEL: load_mul_offreg_1
+; CHECK: lsl [[REG:x[0-9]+]], x0, #2
+; CHECK: ldr {{w[0-9]+}}, {{\[}}[[REG]]{{\]}}
+ %1 = mul i64 %a, 4
+ %2 = inttoptr i64 %1 to i32*
+ %3 = load i32* %2
+ ret i32 %3
+}
+
+; Load Base Register + Scaled Register Offset
+define i32 @load_breg_shift_offreg_1(i64 %a, i64 %b) {
+; CHECK-LABEL: load_breg_shift_offreg_1
+; CHECK: ldr {{w[0-9]+}}, [x1, x0, lsl #2]
+ %1 = shl i64 %a, 2
+ %2 = add i64 %1, %b
+ %3 = inttoptr i64 %2 to i32*
+ %4 = load i32* %3
+ ret i32 %4
+}
+
+define i32 @load_breg_shift_offreg_2(i64 %a, i64 %b) {
+; CHECK-LABEL: load_breg_shift_offreg_2
+; CHECK: ldr {{w[0-9]+}}, [x1, x0, lsl #2]
+ %1 = shl i64 %a, 2
+ %2 = add i64 %b, %1
+ %3 = inttoptr i64 %2 to i32*
+ %4 = load i32* %3
+ ret i32 %4
+}
+
+define i32 @load_breg_shift_offreg_3(i64 %a, i64 %b) {
+; SDAG-LABEL: load_breg_shift_offreg_3
+; SDAG: lsl [[REG:x[0-9]+]], x0, #2
+; SDAG-NEXT: ldr {{w[0-9]+}}, {{\[}}[[REG]], x1, lsl #2{{\]}}
+; FAST-LABEL: load_breg_shift_offreg_3
+; FAST: lsl [[REG:x[0-9]+]], x1, #2
+; FAST-NEXT: ldr {{w[0-9]+}}, {{\[}}[[REG]], x0, lsl #2{{\]}}
+ %1 = shl i64 %a, 2
+ %2 = shl i64 %b, 2
+ %3 = add i64 %1, %2
+ %4 = inttoptr i64 %3 to i32*
+ %5 = load i32* %4
+ ret i32 %5
+}
+
+define i32 @load_breg_shift_offreg_4(i64 %a, i64 %b) {
+; SDAG-LABEL: load_breg_shift_offreg_4
+; SDAG: lsl [[REG:x[0-9]+]], x1, #2
+; SDAG-NEXT: ldr {{w[0-9]+}}, {{\[}}[[REG]], x0, lsl #2{{\]}}
+; FAST-LABEL: load_breg_shift_offreg_4
+; FAST: lsl [[REG:x[0-9]+]], x0, #2
+; FAST-NEXT: ldr {{w[0-9]+}}, {{\[}}[[REG]], x1, lsl #2{{\]}}
+ %1 = shl i64 %a, 2
+ %2 = shl i64 %b, 2
+ %3 = add i64 %2, %1
+ %4 = inttoptr i64 %3 to i32*
+ %5 = load i32* %4
+ ret i32 %5
+}
+
+define i32 @load_breg_shift_offreg_5(i64 %a, i64 %b) {
+; SDAG-LABEL: load_breg_shift_offreg_5
+; SDAG: lsl [[REG:x[0-9]+]], x1, #3
+; SDAG-NEXT: ldr {{w[0-9]+}}, {{\[}}[[REG]], x0, lsl #2{{\]}}
+; FAST-LABEL: load_breg_shift_offreg_5
+; FAST: lsl [[REG:x[0-9]+]], x1, #3
+; FAST-NEXT: ldr {{w[0-9]+}}, {{\[}}[[REG]], x0, lsl #2{{\]}}
+ %1 = shl i64 %a, 2
+ %2 = shl i64 %b, 3
+ %3 = add i64 %1, %2
+ %4 = inttoptr i64 %3 to i32*
+ %5 = load i32* %4
+ ret i32 %5
+}
+
+define i32 @load_breg_mul_offreg_1(i64 %a, i64 %b) {
+; CHECK-LABEL: load_breg_mul_offreg_1
+; CHECK: ldr {{w[0-9]+}}, [x1, x0, lsl #2]
+ %1 = mul i64 %a, 4
+ %2 = add i64 %1, %b
+ %3 = inttoptr i64 %2 to i32*
+ %4 = load i32* %3
+ ret i32 %4
+}
+
+define zeroext i8 @load_breg_and_offreg_1(i64 %a, i64 %b) {
+; CHECK-LABEL: load_breg_and_offreg_1
+; CHECK: ldrb {{w[0-9]+}}, [x1, w0, uxtw]
+ %1 = and i64 %a, 4294967295
+ %2 = add i64 %1, %b
+ %3 = inttoptr i64 %2 to i8*
+ %4 = load i8* %3
+ ret i8 %4
+}
+
+define zeroext i16 @load_breg_and_offreg_2(i64 %a, i64 %b) {
+; CHECK-LABEL: load_breg_and_offreg_2
+; CHECK: ldrh {{w[0-9]+}}, [x1, w0, uxtw #1]
+ %1 = and i64 %a, 4294967295
+ %2 = shl i64 %1, 1
+ %3 = add i64 %2, %b
+ %4 = inttoptr i64 %3 to i16*
+ %5 = load i16* %4
+ ret i16 %5
+}
+
+define i32 @load_breg_and_offreg_3(i64 %a, i64 %b) {
+; CHECK-LABEL: load_breg_and_offreg_3
+; CHECK: ldr {{w[0-9]+}}, [x1, w0, uxtw #2]
+ %1 = and i64 %a, 4294967295
+ %2 = shl i64 %1, 2
+ %3 = add i64 %2, %b
+ %4 = inttoptr i64 %3 to i32*
+ %5 = load i32* %4
+ ret i32 %5
+}
+
+define i64 @load_breg_and_offreg_4(i64 %a, i64 %b) {
+; CHECK-LABEL: load_breg_and_offreg_4
+; CHECK: ldr {{x[0-9]+}}, [x1, w0, uxtw #3]
+ %1 = and i64 %a, 4294967295
+ %2 = shl i64 %1, 3
+ %3 = add i64 %2, %b
+ %4 = inttoptr i64 %3 to i64*
+ %5 = load i64* %4
+ ret i64 %5
+}
+
+; Not all 'and' instructions have immediates.
+define i64 @load_breg_and_offreg_5(i64 %a, i64 %b, i64 %c) {
+; CHECK-LABEL: load_breg_and_offreg_5
+; CHECK: and [[REG:x[0-9]+]], x0, x2
+; CHECK-NEXT: ldr {{x[0-9]+}}, {{\[}}[[REG]], x1{{\]}}
+ %1 = and i64 %a, %c
+ %2 = add i64 %1, %b
+ %3 = inttoptr i64 %2 to i64*
+ %4 = load i64* %3
+ ret i64 %4
+}
+
+define i64 @load_breg_and_offreg_6(i64 %a, i64 %b, i64 %c) {
+; CHECK-LABEL: load_breg_and_offreg_6
+; CHECK: and [[REG:x[0-9]+]], x0, x2
+; CHECK-NEXT: ldr {{x[0-9]+}}, {{\[}}x1, [[REG]], lsl #3{{\]}}
+ %1 = and i64 %a, %c
+ %2 = shl i64 %1, 3
+ %3 = add i64 %2, %b
+ %4 = inttoptr i64 %3 to i64*
+ %5 = load i64* %4
+ ret i64 %5
+}
+
+; Load Base Register + Scaled Register Offset + Sign/Zero extension
+define i32 @load_breg_zext_shift_offreg_1(i32 %a, i64 %b) {
+; CHECK-LABEL: load_breg_zext_shift_offreg_1
+; CHECK: ldr {{w[0-9]+}}, [x1, w0, uxtw #2]
+ %1 = zext i32 %a to i64
+ %2 = shl i64 %1, 2
+ %3 = add i64 %2, %b
+ %4 = inttoptr i64 %3 to i32*
+ %5 = load i32* %4
+ ret i32 %5
+}
+
+define i32 @load_breg_zext_shift_offreg_2(i32 %a, i64 %b) {
+; CHECK-LABEL: load_breg_zext_shift_offreg_2
+; CHECK: ldr {{w[0-9]+}}, [x1, w0, uxtw #2]
+ %1 = zext i32 %a to i64
+ %2 = shl i64 %1, 2
+ %3 = add i64 %b, %2
+ %4 = inttoptr i64 %3 to i32*
+ %5 = load i32* %4
+ ret i32 %5
+}
+
+define i32 @load_breg_zext_mul_offreg_1(i32 %a, i64 %b) {
+; CHECK-LABEL: load_breg_zext_mul_offreg_1
+; CHECK: ldr {{w[0-9]+}}, [x1, w0, uxtw #2]
+ %1 = zext i32 %a to i64
+ %2 = mul i64 %1, 4
+ %3 = add i64 %2, %b
+ %4 = inttoptr i64 %3 to i32*
+ %5 = load i32* %4
+ ret i32 %5
+}
+
+define i32 @load_breg_sext_shift_offreg_1(i32 %a, i64 %b) {
+; CHECK-LABEL: load_breg_sext_shift_offreg_1
+; CHECK: ldr {{w[0-9]+}}, [x1, w0, sxtw #2]
+ %1 = sext i32 %a to i64
+ %2 = shl i64 %1, 2
+ %3 = add i64 %2, %b
+ %4 = inttoptr i64 %3 to i32*
+ %5 = load i32* %4
+ ret i32 %5
+}
+
+define i32 @load_breg_sext_shift_offreg_2(i32 %a, i64 %b) {
+; CHECK-LABEL: load_breg_sext_shift_offreg_2
+; CHECK: ldr {{w[0-9]+}}, [x1, w0, sxtw #2]
+ %1 = sext i32 %a to i64
+ %2 = shl i64 %1, 2
+ %3 = add i64 %b, %2
+ %4 = inttoptr i64 %3 to i32*
+ %5 = load i32* %4
+ ret i32 %5
+}
+
+; Make sure that we don't drop the first 'add' instruction.
+define i32 @load_breg_sext_shift_offreg_3(i32 %a, i64 %b) {
+; CHECK-LABEL: load_breg_sext_shift_offreg_3
+; CHECK: add [[REG:w[0-9]+]], w0, #4
+; CHECK: ldr {{w[0-9]+}}, {{\[}}x1, [[REG]], sxtw #2{{\]}}
+ %1 = add i32 %a, 4
+ %2 = sext i32 %1 to i64
+ %3 = shl i64 %2, 2
+ %4 = add i64 %b, %3
+ %5 = inttoptr i64 %4 to i32*
+ %6 = load i32* %5
+ ret i32 %6
+}
+
+
+define i32 @load_breg_sext_mul_offreg_1(i32 %a, i64 %b) {
+; CHECK-LABEL: load_breg_sext_mul_offreg_1
+; CHECK: ldr {{w[0-9]+}}, [x1, w0, sxtw #2]
+ %1 = sext i32 %a to i64
+ %2 = mul i64 %1, 4
+ %3 = add i64 %2, %b
+ %4 = inttoptr i64 %3 to i32*
+ %5 = load i32* %4
+ ret i32 %5
+}
+
+; Load Scaled Register Offset + Immediate Offset + Sign/Zero extension
+define i64 @load_sext_shift_offreg_imm1(i32 %a) {
+; CHECK-LABEL: load_sext_shift_offreg_imm1
+; CHECK: sbfiz [[REG:x[0-9]+]], {{x[0-9]+}}, #3, #32
+; CHECK-NEXT: ldr {{x[0-9]+}}, {{\[}}[[REG]], #8{{\]}}
+ %1 = sext i32 %a to i64
+ %2 = shl i64 %1, 3
+ %3 = add i64 %2, 8
+ %4 = inttoptr i64 %3 to i64*
+ %5 = load i64* %4
+ ret i64 %5
+}
+
+; Load Base Register + Scaled Register Offset + Immediate Offset + Sign/Zero extension
+define i64 @load_breg_sext_shift_offreg_imm1(i32 %a, i64 %b) {
+; CHECK-LABEL: load_breg_sext_shift_offreg_imm1
+; CHECK: add [[REG:x[0-9]+]], x1, w0, sxtw #3
+; CHECK-NEXT: ldr {{x[0-9]+}}, {{\[}}[[REG]], #8{{\]}}
+ %1 = sext i32 %a to i64
+ %2 = shl i64 %1, 3
+ %3 = add i64 %b, %2
+ %4 = add i64 %3, 8
+ %5 = inttoptr i64 %4 to i64*
+ %6 = load i64* %5
+ ret i64 %6
+}
+
+; Test that the kill flag is not set - the machine instruction verifier does that for us.
+define i64 @kill_reg(i64 %a) {
+ %1 = sub i64 %a, 8
+ %2 = add i64 %1, 96
+ %3 = inttoptr i64 %2 to i64*
+ %4 = load i64* %3
+ %5 = add i64 %2, %4
+ ret i64 %5
+}
+
+define void @store_fi(i64 %i) {
+; CHECK-LABEL: store_fi
+; CHECK: mov [[REG:x[0-9]+]], sp
+; CHECK: str {{w[0-9]+}}, {{\[}}[[REG]], x0, lsl #2{{\]}}
+ %1 = alloca [8 x i32]
+ %2 = ptrtoint [8 x i32]* %1 to i64
+ %3 = mul i64 %i, 4
+ %4 = add i64 %2, %3
+ %5 = inttoptr i64 %4 to i32*
+ store i32 47, i32* %5, align 4
+ ret void
+}
+
+define i32 @load_fi(i64 %i) {
+; CHECK-LABEL: load_fi
+; CHECK: mov [[REG:x[0-9]+]], sp
+; CHECK: ldr {{w[0-9]+}}, {{\[}}[[REG]], x0, lsl #2{{\]}}
+ %1 = alloca [8 x i32]
+ %2 = ptrtoint [8 x i32]* %1 to i64
+ %3 = mul i64 %i, 4
+ %4 = add i64 %2, %3
+ %5 = inttoptr i64 %4 to i32*
+ %6 = load i32* %5, align 4
+ ret i32 %6
+}
diff --git a/test/CodeGen/AArch64/fast-isel-branch_weights.ll b/test/CodeGen/AArch64/fast-isel-branch_weights.ll
new file mode 100644
index 0000000..5b22476
--- /dev/null
+++ b/test/CodeGen/AArch64/fast-isel-branch_weights.ll
@@ -0,0 +1,19 @@
+; RUN: llc -mtriple=arm64-apple-darwin -aarch64-atomic-cfg-tidy=0 -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=arm64-apple-darwin -aarch64-atomic-cfg-tidy=0 -fast-isel -fast-isel-abort -verify-machineinstrs < %s | FileCheck %s
+
+; Test whether the BBs are reordered according to their branch weights.
+define i64 @branch_weights_test(i64 %a, i64 %b) {
+; CHECK-LABEL: branch_weights_test
+; CHECK-LABEL: success
+; CHECK-LABEL: fail
+ %1 = icmp ult i64 %a, %b
+ br i1 %1, label %fail, label %success, !prof !0
+
+fail:
+ ret i64 -1
+
+success:
+ ret i64 0
+}
+
+!0 = metadata !{metadata !"branch_weights", i32 0, i32 2147483647}
diff --git a/test/CodeGen/AArch64/fast-isel-call-return.ll b/test/CodeGen/AArch64/fast-isel-call-return.ll
new file mode 100644
index 0000000..9b10969
--- /dev/null
+++ b/test/CodeGen/AArch64/fast-isel-call-return.ll
@@ -0,0 +1,12 @@
+; RUN: llc -fast-isel -fast-isel-abort -verify-machineinstrs < %s | FileCheck %s
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-linux-gnu"
+
+define i8* @test_call_return_type(i64 %size) {
+entry:
+; CHECK: bl xmalloc
+ %0 = call noalias i8* @xmalloc(i64 undef)
+ ret i8* %0
+}
+
+declare noalias i8* @xmalloc(i64)
diff --git a/test/CodeGen/AArch64/fast-isel-cbz.ll b/test/CodeGen/AArch64/fast-isel-cbz.ll
new file mode 100644
index 0000000..6e31a04
--- /dev/null
+++ b/test/CodeGen/AArch64/fast-isel-cbz.ll
@@ -0,0 +1,70 @@
+; RUN: llc -fast-isel -fast-isel-abort -aarch64-atomic-cfg-tidy=0 -verify-machineinstrs -mtriple=aarch64-apple-darwin < %s | FileCheck %s
+
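+; i8/i16 arguments are not guaranteed to have their upper bits zeroed, so they must
+; be zero-extended (uxtb/uxth) before cbz; an i1 compare reduces to a single-bit test
+; (tbz), while i32/i64/pointer compares can use cbz directly.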
+define i32 @icmp_eq_i1(i1 %a) {
+; CHECK-LABEL: icmp_eq_i1
+; CHECK: tbz w0, #0, {{LBB.+_2}}
+ %1 = icmp eq i1 %a, 0
+ br i1 %1, label %bb1, label %bb2
+bb2:
+ ret i32 1
+bb1:
+ ret i32 0
+}
+
+define i32 @icmp_eq_i8(i8 %a) {
+; CHECK-LABEL: icmp_eq_i8
+; CHECK: uxtb [[REG:w[0-9]+]], w0
+; CHECK: cbz [[REG]], {{LBB.+_2}}
+ %1 = icmp eq i8 %a, 0
+ br i1 %1, label %bb1, label %bb2
+bb2:
+ ret i32 1
+bb1:
+ ret i32 0
+}
+
+define i32 @icmp_eq_i16(i16 %a) {
+; CHECK-LABEL: icmp_eq_i16
+; CHECK: uxth [[REG:w[0-9]+]], w0
+; CHECK: cbz [[REG]], {{LBB.+_2}}
+ %1 = icmp eq i16 %a, 0
+ br i1 %1, label %bb1, label %bb2
+bb2:
+ ret i32 1
+bb1:
+ ret i32 0
+}
+
+define i32 @icmp_eq_i32(i32 %a) {
+; CHECK-LABEL: icmp_eq_i32
+; CHECK: cbz w0, {{LBB.+_2}}
+ %1 = icmp eq i32 %a, 0
+ br i1 %1, label %bb1, label %bb2
+bb2:
+ ret i32 1
+bb1:
+ ret i32 0
+}
+
+define i32 @icmp_eq_i64(i64 %a) {
+; CHECK-LABEL: icmp_eq_i64
+; CHECK: cbz x0, {{LBB.+_2}}
+ %1 = icmp eq i64 %a, 0
+ br i1 %1, label %bb1, label %bb2
+bb2:
+ ret i32 1
+bb1:
+ ret i32 0
+}
+
+define i32 @icmp_eq_ptr(i8* %a) {
+; CHECK-LABEL: icmp_eq_ptr
+; CHECK: cbz x0, {{LBB.+_2}}
+ %1 = icmp eq i8* %a, null
+ br i1 %1, label %bb1, label %bb2
+bb2:
+ ret i32 1
+bb1:
+ ret i32 0
+}
+
diff --git a/test/CodeGen/AArch64/fast-isel-cmp-branch.ll b/test/CodeGen/AArch64/fast-isel-cmp-branch.ll
new file mode 100644
index 0000000..3651f19
--- /dev/null
+++ b/test/CodeGen/AArch64/fast-isel-cmp-branch.ll
@@ -0,0 +1,293 @@
+; RUN: llc -aarch64-atomic-cfg-tidy=0 -mtriple=aarch64-apple-darwin < %s | FileCheck %s
+; RUN: llc -fast-isel -fast-isel-abort -aarch64-atomic-cfg-tidy=0 -mtriple=aarch64-apple-darwin < %s | FileCheck %s
+
+define i32 @fcmp_oeq(float %x, float %y) {
+; CHECK-LABEL: fcmp_oeq
+; CHECK: fcmp s0, s1
+; CHECK-NEXT: b.ne {{LBB.+_2}}
+ %1 = fcmp oeq float %x, %y
+ br i1 %1, label %bb1, label %bb2
+bb2:
+ ret i32 1
+bb1:
+ ret i32 0
+}
+
+define i32 @fcmp_ogt(float %x, float %y) {
+; CHECK-LABEL: fcmp_ogt
+; CHECK: fcmp s0, s1
+; CHECK-NEXT: b.le {{LBB.+_2}}
+ %1 = fcmp ogt float %x, %y
+ br i1 %1, label %bb1, label %bb2
+bb2:
+ ret i32 1
+bb1:
+ ret i32 0
+}
+
+define i32 @fcmp_oge(float %x, float %y) {
+; CHECK-LABEL: fcmp_oge
+; CHECK: fcmp s0, s1
+; CHECK-NEXT: b.lt {{LBB.+_2}}
+ %1 = fcmp oge float %x, %y
+ br i1 %1, label %bb1, label %bb2
+bb2:
+ ret i32 1
+bb1:
+ ret i32 0
+}
+
+define i32 @fcmp_olt(float %x, float %y) {
+; CHECK-LABEL: fcmp_olt
+; CHECK: fcmp s0, s1
+; CHECK-NEXT: b.pl {{LBB.+_2}}
+ %1 = fcmp olt float %x, %y
+ br i1 %1, label %bb1, label %bb2
+bb2:
+ ret i32 1
+bb1:
+ ret i32 0
+}
+
+define i32 @fcmp_ole(float %x, float %y) {
+; CHECK-LABEL: fcmp_ole
+; CHECK: fcmp s0, s1
+; CHECK-NEXT: b.hi {{LBB.+_2}}
+ %1 = fcmp ole float %x, %y
+ br i1 %1, label %bb1, label %bb2
+bb2:
+ ret i32 1
+bb1:
+ ret i32 0
+}
+
+define i32 @fcmp_one(float %x, float %y) {
+; CHECK-LABEL: fcmp_one
+; CHECK: fcmp s0, s1
+; CHECK-NEXT: b.mi
+; CHECK-NEXT: b.gt
+ %1 = fcmp one float %x, %y
+ br i1 %1, label %bb1, label %bb2
+bb2:
+ ret i32 1
+bb1:
+ ret i32 0
+}
+
+define i32 @fcmp_ord(float %x, float %y) {
+; CHECK-LABEL: fcmp_ord
+; CHECK: fcmp s0, s1
+; CHECK-NEXT: b.vs {{LBB.+_2}}
+ %1 = fcmp ord float %x, %y
+ br i1 %1, label %bb1, label %bb2
+bb2:
+ ret i32 1
+bb1:
+ ret i32 0
+}
+
+define i32 @fcmp_uno(float %x, float %y) {
+; CHECK-LABEL: fcmp_uno
+; CHECK: fcmp s0, s1
+; CHECK-NEXT: b.vs {{LBB.+_2}}
+ %1 = fcmp uno float %x, %y
+ br i1 %1, label %bb1, label %bb2
+bb2:
+ ret i32 1
+bb1:
+ ret i32 0
+}
+
+define i32 @fcmp_ueq(float %x, float %y) {
+; CHECK-LABEL: fcmp_ueq
+; CHECK: fcmp s0, s1
+; CHECK-NEXT: b.eq {{LBB.+_2}}
+; CHECK-NEXT: b.vs {{LBB.+_2}}
+ %1 = fcmp ueq float %x, %y
+ br i1 %1, label %bb1, label %bb2
+bb2:
+ ret i32 1
+bb1:
+ ret i32 0
+}
+
+define i32 @fcmp_ugt(float %x, float %y) {
+; CHECK-LABEL: fcmp_ugt
+; CHECK: fcmp s0, s1
+; CHECK-NEXT: b.ls {{LBB.+_2}}
+ %1 = fcmp ugt float %x, %y
+ br i1 %1, label %bb1, label %bb2
+bb2:
+ ret i32 1
+bb1:
+ ret i32 0
+}
+
+define i32 @fcmp_uge(float %x, float %y) {
+; CHECK-LABEL: fcmp_uge
+; CHECK: fcmp s0, s1
+; CHECK-NEXT: b.mi {{LBB.+_2}}
+ %1 = fcmp uge float %x, %y
+ br i1 %1, label %bb1, label %bb2
+bb2:
+ ret i32 1
+bb1:
+ ret i32 0
+}
+
+define i32 @fcmp_ult(float %x, float %y) {
+; CHECK-LABEL: fcmp_ult
+; CHECK: fcmp s0, s1
+; CHECK-NEXT: b.ge {{LBB.+_2}}
+ %1 = fcmp ult float %x, %y
+ br i1 %1, label %bb1, label %bb2
+bb2:
+ ret i32 1
+bb1:
+ ret i32 0
+}
+
+define i32 @fcmp_ule(float %x, float %y) {
+; CHECK-LABEL: fcmp_ule
+; CHECK: fcmp s0, s1
+; CHECK-NEXT: b.gt {{LBB.+_2}}
+ %1 = fcmp ule float %x, %y
+ br i1 %1, label %bb1, label %bb2
+bb2:
+ ret i32 1
+bb1:
+ ret i32 0
+}
+
+define i32 @fcmp_une(float %x, float %y) {
+; CHECK-LABEL: fcmp_une
+; CHECK: fcmp s0, s1
+; CHECK-NEXT: b.eq {{LBB.+_2}}
+ %1 = fcmp une float %x, %y
+ br i1 %1, label %bb1, label %bb2
+bb2:
+ ret i32 1
+bb1:
+ ret i32 0
+}
+
+define i32 @icmp_eq(i32 %x, i32 %y) {
+; CHECK-LABEL: icmp_eq
+; CHECK: cmp w0, w1
+; CHECK-NEXT: b.ne {{LBB.+_2}}
+ %1 = icmp eq i32 %x, %y
+ br i1 %1, label %bb1, label %bb2
+bb2:
+ ret i32 1
+bb1:
+ ret i32 0
+}
+
+define i32 @icmp_ne(i32 %x, i32 %y) {
+; CHECK-LABEL: icmp_ne
+; CHECK: cmp w0, w1
+; CHECK-NEXT: b.eq {{LBB.+_2}}
+ %1 = icmp ne i32 %x, %y
+ br i1 %1, label %bb1, label %bb2
+bb2:
+ ret i32 1
+bb1:
+ ret i32 0
+}
+
+define i32 @icmp_ugt(i32 %x, i32 %y) {
+; CHECK-LABEL: icmp_ugt
+; CHECK: cmp w0, w1
+; CHECK-NEXT: b.ls {{LBB.+_2}}
+ %1 = icmp ugt i32 %x, %y
+ br i1 %1, label %bb1, label %bb2
+bb2:
+ ret i32 1
+bb1:
+ ret i32 0
+}
+
+define i32 @icmp_uge(i32 %x, i32 %y) {
+; CHECK-LABEL: icmp_uge
+; CHECK: cmp w0, w1
+; CHECK-NEXT: b.lo {{LBB.+_2}}
+ %1 = icmp uge i32 %x, %y
+ br i1 %1, label %bb1, label %bb2
+bb2:
+ ret i32 1
+bb1:
+ ret i32 0
+}
+
+define i32 @icmp_ult(i32 %x, i32 %y) {
+; CHECK-LABEL: icmp_ult
+; CHECK: cmp w0, w1
+; CHECK-NEXT: b.hs {{LBB.+_2}}
+ %1 = icmp ult i32 %x, %y
+ br i1 %1, label %bb1, label %bb2
+bb2:
+ ret i32 1
+bb1:
+ ret i32 0
+}
+
+define i32 @icmp_ule(i32 %x, i32 %y) {
+; CHECK-LABEL: icmp_ule
+; CHECK: cmp w0, w1
+; CHECK-NEXT: b.hi {{LBB.+_2}}
+ %1 = icmp ule i32 %x, %y
+ br i1 %1, label %bb1, label %bb2
+bb2:
+ ret i32 1
+bb1:
+ ret i32 0
+}
+
+define i32 @icmp_sgt(i32 %x, i32 %y) {
+; CHECK-LABEL: icmp_sgt
+; CHECK: cmp w0, w1
+; CHECK-NEXT: b.le {{LBB.+_2}}
+ %1 = icmp sgt i32 %x, %y
+ br i1 %1, label %bb1, label %bb2
+bb2:
+ ret i32 1
+bb1:
+ ret i32 0
+}
+
+define i32 @icmp_sge(i32 %x, i32 %y) {
+; CHECK-LABEL: icmp_sge
+; CHECK: cmp w0, w1
+; CHECK-NEXT: b.lt {{LBB.+_2}}
+ %1 = icmp sge i32 %x, %y
+ br i1 %1, label %bb1, label %bb2
+bb2:
+ ret i32 1
+bb1:
+ ret i32 0
+}
+
+define i32 @icmp_slt(i32 %x, i32 %y) {
+; CHECK-LABEL: icmp_slt
+; CHECK: cmp w0, w1
+; CHECK-NEXT: b.ge {{LBB.+_2}}
+ %1 = icmp slt i32 %x, %y
+ br i1 %1, label %bb1, label %bb2
+bb2:
+ ret i32 1
+bb1:
+ ret i32 0
+}
+
+define i32 @icmp_sle(i32 %x, i32 %y) {
+; CHECK-LABEL: icmp_sle
+; CHECK: cmp w0, w1
+; CHECK-NEXT: b.gt {{LBB.+_2}}
+ %1 = icmp sle i32 %x, %y
+ br i1 %1, label %bb1, label %bb2
+bb2:
+ ret i32 1
+bb1:
+ ret i32 0
+}
+
diff --git a/test/CodeGen/AArch64/fast-isel-folding.ll b/test/CodeGen/AArch64/fast-isel-folding.ll
new file mode 100644
index 0000000..6b524ff
--- /dev/null
+++ b/test/CodeGen/AArch64/fast-isel-folding.ll
@@ -0,0 +1,54 @@
+; RUN: llc -mtriple=aarch64-apple-darwin -O0 -fast-isel-abort -verify-machineinstrs < %s
+
+; Test that we don't fold the shift into the compare; it is defined in a different basic block.
+define i64 @fold_shift_test(i64 %a, i1 %c) {
+ %1 = sub i64 %a, 8
+ %2 = ashr i64 %1, 3
+ br i1 %c, label %bb1, label %bb2
+bb1:
+ %3 = icmp ult i64 0, %2
+ br i1 %3, label %bb2, label %bb3
+bb2:
+ ret i64 1
+bb3:
+ ret i64 2
+}
+
+; Test that we don't fold the sign-extend into the compare; it is defined in a different basic block.
+define i64 @fold_sext_test1(i32 %a, i1 %c) {
+ %1 = sub i32 %a, 8
+ %2 = sext i32 %1 to i64
+ br i1 %c, label %bb1, label %bb2
+bb1:
+ %3 = icmp ult i64 0, %2
+ br i1 %3, label %bb2, label %bb3
+bb2:
+ ret i64 1
+bb3:
+ ret i64 2
+}
+
+; Test that we don't fold the sign-extend into the shift; the extended value is used in more than one basic block.
+define i64 @fold_sext_test2(i32 %a, i1 %c) {
+ %1 = sub i32 %a, 8
+ %2 = sext i32 %1 to i64
+ br i1 %c, label %bb1, label %bb2
+bb1:
+ %3 = shl i64 %2, 4
+ ret i64 %3
+bb2:
+ ret i64 %2
+}
+
+; Test that we clear the kill flag.
+define i32 @fold_kill_test(i32 %a) {
+ %1 = sub i32 %a, 8
+ %2 = shl i32 %1, 3
+ %3 = icmp ult i32 0, %2
+ br i1 %3, label %bb1, label %bb2
+bb1:
+ ret i32 %2
+bb2:
+ %4 = add i32 %2, 4
+ ret i32 %4
+}
diff --git a/test/CodeGen/AArch64/fast-isel-gep.ll b/test/CodeGen/AArch64/fast-isel-gep.ll
new file mode 100644
index 0000000..4dc0a05
--- /dev/null
+++ b/test/CodeGen/AArch64/fast-isel-gep.ll
@@ -0,0 +1,49 @@
+; RUN: llc -mtriple=aarch64-apple-darwin -fast-isel -fast-isel-abort -verify-machineinstrs < %s | FileCheck %s
+
+%struct.foo = type { i32, i64, float, double }
+
+define double* @test_struct(%struct.foo* %f) {
+; CHECK-LABEL: test_struct
+; CHECK: add x0, x0, #24
+ %1 = getelementptr inbounds %struct.foo* %f, i64 0, i32 3
+ ret double* %1
+}
+
+define i32* @test_array1(i32* %a, i64 %i) {
+; CHECK-LABEL: test_array1
+; CHECK: orr [[REG:x[0-9]+]], xzr, #0x4
+; CHECK-NEXT: madd x0, x1, [[REG]], x0
+ %1 = getelementptr inbounds i32* %a, i64 %i
+ ret i32* %1
+}
+
+define i32* @test_array2(i32* %a) {
+; CHECK-LABEL: test_array2
+; CHECK: add x0, x0, #16
+ %1 = getelementptr inbounds i32* %a, i64 4
+ ret i32* %1
+}
+
+define i32* @test_array3(i32* %a) {
+; CHECK-LABEL: test_array3
+; CHECK: add x0, x0, #1, lsl #12
+ %1 = getelementptr inbounds i32* %a, i64 1024
+ ret i32* %1
+}
+
+define i32* @test_array4(i32* %a) {
+; CHECK-LABEL: test_array4
+; CHECK: movz [[REG:x[0-9]+]], #0x1008
+; CHECK-NEXT: add x0, x0, [[REG]]
+ %1 = getelementptr inbounds i32* %a, i64 1026
+ ret i32* %1
+}
+
+define i32* @test_array5(i32* %a, i32 %i) {
+; CHECK-LABEL: test_array5
+; CHECK: sxtw [[REG1:x[0-9]+]], w1
+; CHECK-NEXT: orr [[REG2:x[0-9]+]], xzr, #0x4
+; CHECK-NEXT: madd {{x[0-9]+}}, [[REG1]], [[REG2]], x0
+ %1 = getelementptr inbounds i32* %a, i32 %i
+ ret i32* %1
+}
diff --git a/test/CodeGen/AArch64/fast-isel-int-ext.ll b/test/CodeGen/AArch64/fast-isel-int-ext.ll
new file mode 100644
index 0000000..866feba
--- /dev/null
+++ b/test/CodeGen/AArch64/fast-isel-int-ext.ll
@@ -0,0 +1,491 @@
+; RUN: llc -mtriple=aarch64-apple-darwin -fast-isel -fast-isel-abort -verify-machineinstrs < %s | FileCheck %s
+
+;
+; Test that we only use the sign-/zero-extend in the address calculation when
+; necessary.
+;
+; SHIFT
+;
+define i64 @load_addr_shift_zext1(i32 %a, i64 %b) {
+; CHECK-LABEL: load_addr_shift_zext1
+; CHECK: ldr {{x[0-9]+}}, [x1, w0, uxtw #3]
+ %1 = zext i32 %a to i64
+ %2 = shl i64 %1, 3
+ %3 = add i64 %b, %2
+ %4 = inttoptr i64 %3 to i64*
+ %5 = load i64* %4
+ ret i64 %5
+}
+
+define i64 @load_addr_shift_zext2(i32 zeroext %a, i64 %b) {
+; CHECK-LABEL: load_addr_shift_zext2
+; CHECK: ldr {{x[0-9]+}}, [x1, x0, lsl #3]
+ %1 = zext i32 %a to i64
+ %2 = shl i64 %1, 3
+ %3 = add i64 %b, %2
+ %4 = inttoptr i64 %3 to i64*
+ %5 = load i64* %4
+ ret i64 %5
+}
+
+define i64 @load_addr_shift_zext3(i32 signext %a, i64 %b) {
+; CHECK-LABEL: load_addr_shift_zext3
+; CHECK: ldr {{x[0-9]+}}, [x1, w0, uxtw #3]
+ %1 = zext i32 %a to i64
+ %2 = shl i64 %1, 3
+ %3 = add i64 %b, %2
+ %4 = inttoptr i64 %3 to i64*
+ %5 = load i64* %4
+ ret i64 %5
+}
+
+define i64 @load_addr_shift_sext1(i32 %a, i64 %b) {
+; CHECK-LABEL: load_addr_shift_sext1
+; CHECK: ldr {{x[0-9]+}}, [x1, w0, sxtw #3]
+ %1 = sext i32 %a to i64
+ %2 = shl i64 %1, 3
+ %3 = add i64 %b, %2
+ %4 = inttoptr i64 %3 to i64*
+ %5 = load i64* %4
+ ret i64 %5
+}
+
+define i64 @load_addr_shift_sext2(i32 zeroext %a, i64 %b) {
+; CHECK-LABEL: load_addr_shift_sext2
+; CHECK: ldr {{x[0-9]+}}, [x1, w0, sxtw #3]
+ %1 = sext i32 %a to i64
+ %2 = shl i64 %1, 3
+ %3 = add i64 %b, %2
+ %4 = inttoptr i64 %3 to i64*
+ %5 = load i64* %4
+ ret i64 %5
+}
+
+define i64 @load_addr_shift_sext3(i32 signext %a, i64 %b) {
+; CHECK-LABEL: load_addr_shift_sext3
+; CHECK: ldr {{x[0-9]+}}, [x1, x0, lsl #3]
+ %1 = sext i32 %a to i64
+ %2 = shl i64 %1, 3
+ %3 = add i64 %b, %2
+ %4 = inttoptr i64 %3 to i64*
+ %5 = load i64* %4
+ ret i64 %5
+}
+
+;
+; MUL
+;
+define i64 @load_addr_mul_zext1(i32 %a, i64 %b) {
+; CHECK-LABEL: load_addr_mul_zext1
+; CHECK: ldr {{x[0-9]+}}, [x1, w0, uxtw #3]
+ %1 = zext i32 %a to i64
+ %2 = mul i64 %1, 8
+ %3 = add i64 %b, %2
+ %4 = inttoptr i64 %3 to i64*
+ %5 = load i64* %4
+ ret i64 %5
+}
+
+define i64 @load_addr_mul_zext2(i32 zeroext %a, i64 %b) {
+; CHECK-LABEL: load_addr_mul_zext2
+; CHECK: ldr {{x[0-9]+}}, [x1, x0, lsl #3]
+ %1 = zext i32 %a to i64
+ %2 = mul i64 %1, 8
+ %3 = add i64 %b, %2
+ %4 = inttoptr i64 %3 to i64*
+ %5 = load i64* %4
+ ret i64 %5
+}
+
+define i64 @load_addr_mul_zext3(i32 signext %a, i64 %b) {
+; CHECK-LABEL: load_addr_mul_zext3
+; CHECK: ldr {{x[0-9]+}}, [x1, w0, uxtw #3]
+ %1 = zext i32 %a to i64
+ %2 = mul i64 %1, 8
+ %3 = add i64 %b, %2
+ %4 = inttoptr i64 %3 to i64*
+ %5 = load i64* %4
+ ret i64 %5
+}
+
+define i64 @load_addr_mul_sext1(i32 %a, i64 %b) {
+; CHECK-LABEL: load_addr_mul_sext1
+; CHECK: ldr {{x[0-9]+}}, [x1, w0, sxtw #3]
+ %1 = sext i32 %a to i64
+ %2 = mul i64 %1, 8
+ %3 = add i64 %b, %2
+ %4 = inttoptr i64 %3 to i64*
+ %5 = load i64* %4
+ ret i64 %5
+}
+
+define i64 @load_addr_mul_sext2(i32 zeroext %a, i64 %b) {
+; CHECK-LABEL: load_addr_mul_sext2
+; CHECK: ldr {{x[0-9]+}}, [x1, w0, sxtw #3]
+ %1 = sext i32 %a to i64
+ %2 = mul i64 %1, 8
+ %3 = add i64 %b, %2
+ %4 = inttoptr i64 %3 to i64*
+ %5 = load i64* %4
+ ret i64 %5
+}
+
+define i64 @load_addr_mul_sext3(i32 signext %a, i64 %b) {
+; CHECK-LABEL: load_addr_mul_sext3
+; CHECK: ldr {{x[0-9]+}}, [x1, x0, lsl #3]
+ %1 = sext i32 %a to i64
+ %2 = mul i64 %1, 8
+ %3 = add i64 %b, %2
+ %4 = inttoptr i64 %3 to i64*
+ %5 = load i64* %4
+ ret i64 %5
+}
+
+
+;
+; Test folding of the sign-/zero-extend into the load instruction.
+;
+
+; Unscaled
+define i32 @load_unscaled_zext_i8_to_i32(i64 %a) {
+; CHECK-LABEL: load_unscaled_zext_i8_to_i32
+; CHECK: ldurb w0, [x0, #-8]
+; CHECK-NOT: uxtb
+ %1 = sub i64 %a, 8
+ %2 = inttoptr i64 %1 to i8*
+ %3 = load i8* %2
+ %4 = zext i8 %3 to i32
+ ret i32 %4
+}
+
+define i32 @load_unscaled_zext_i16_to_i32(i64 %a) {
+; CHECK-LABEL: load_unscaled_zext_i16_to_i32
+; CHECK: ldurh w0, [x0, #-8]
+; CHECK-NOT: uxth
+ %1 = sub i64 %a, 8
+ %2 = inttoptr i64 %1 to i16*
+ %3 = load i16* %2
+ %4 = zext i16 %3 to i32
+ ret i32 %4
+}
+
+define i64 @load_unscaled_zext_i8_to_i64(i64 %a) {
+; CHECK-LABEL: load_unscaled_zext_i8_to_i64
+; CHECK: ldurb w0, [x0, #-8]
+; CHECK-NOT: uxtb
+ %1 = sub i64 %a, 8
+ %2 = inttoptr i64 %1 to i8*
+ %3 = load i8* %2
+ %4 = zext i8 %3 to i64
+ ret i64 %4
+}
+
+define i64 @load_unscaled_zext_i16_to_i64(i64 %a) {
+; CHECK-LABEL: load_unscaled_zext_i16_to_i64
+; CHECK: ldurh w0, [x0, #-8]
+; CHECK-NOT: uxth
+ %1 = sub i64 %a, 8
+ %2 = inttoptr i64 %1 to i16*
+ %3 = load i16* %2
+ %4 = zext i16 %3 to i64
+ ret i64 %4
+}
+
+define i64 @load_unscaled_zext_i32_to_i64(i64 %a) {
+; CHECK-LABEL: load_unscaled_zext_i32_to_i64
+; CHECK: ldur w0, [x0, #-8]
+; CHECK-NOT: uxtw
+ %1 = sub i64 %a, 8
+ %2 = inttoptr i64 %1 to i32*
+ %3 = load i32* %2
+ %4 = zext i32 %3 to i64
+ ret i64 %4
+}
+
+define i32 @load_unscaled_sext_i8_to_i32(i64 %a) {
+; CHECK-LABEL: load_unscaled_sext_i8_to_i32
+; CHECK: ldursb w0, [x0, #-8]
+; CHECK-NOT: sxtb
+ %1 = sub i64 %a, 8
+ %2 = inttoptr i64 %1 to i8*
+ %3 = load i8* %2
+ %4 = sext i8 %3 to i32
+ ret i32 %4
+}
+
+define i32 @load_unscaled_sext_i16_to_i32(i64 %a) {
+; CHECK-LABEL: load_unscaled_sext_i16_to_i32
+; CHECK: ldursh w0, [x0, #-8]
+; CHECK-NOT: sxth
+ %1 = sub i64 %a, 8
+ %2 = inttoptr i64 %1 to i16*
+ %3 = load i16* %2
+ %4 = sext i16 %3 to i32
+ ret i32 %4
+}
+
+define i64 @load_unscaled_sext_i8_to_i64(i64 %a) {
+; CHECK-LABEL: load_unscaled_sext_i8_to_i64
+; CHECK: ldursb x0, [x0, #-8]
+; CHECK-NOT: sxtb
+ %1 = sub i64 %a, 8
+ %2 = inttoptr i64 %1 to i8*
+ %3 = load i8* %2
+ %4 = sext i8 %3 to i64
+ ret i64 %4
+}
+
+define i64 @load_unscaled_sext_i16_to_i64(i64 %a) {
+; CHECK-LABEL: load_unscaled_sext_i16_to_i64
+; CHECK: ldursh x0, [x0, #-8]
+; CHECK-NOT: sxth
+ %1 = sub i64 %a, 8
+ %2 = inttoptr i64 %1 to i16*
+ %3 = load i16* %2
+ %4 = sext i16 %3 to i64
+ ret i64 %4
+}
+
+define i64 @load_unscaled_sext_i32_to_i64(i64 %a) {
+; CHECK-LABEL: load_unscaled_sext_i32_to_i64
+; CHECK: ldursw x0, [x0, #-8]
+; CHECK-NOT: sxtw
+ %1 = sub i64 %a, 8
+ %2 = inttoptr i64 %1 to i32*
+ %3 = load i32* %2
+ %4 = sext i32 %3 to i64
+ ret i64 %4
+}
+
+; Register
+define i32 @load_register_zext_i8_to_i32(i64 %a, i64 %b) {
+; CHECK-LABEL: load_register_zext_i8_to_i32
+; CHECK: ldrb w0, [x0, x1]
+; CHECK-NOT: uxtb
+ %1 = add i64 %a, %b
+ %2 = inttoptr i64 %1 to i8*
+ %3 = load i8* %2
+ %4 = zext i8 %3 to i32
+ ret i32 %4
+}
+
+define i32 @load_register_zext_i16_to_i32(i64 %a, i64 %b) {
+; CHECK-LABEL: load_register_zext_i16_to_i32
+; CHECK: ldrh w0, [x0, x1]
+; CHECK-NOT: uxth
+ %1 = add i64 %a, %b
+ %2 = inttoptr i64 %1 to i16*
+ %3 = load i16* %2
+ %4 = zext i16 %3 to i32
+ ret i32 %4
+}
+
+define i64 @load_register_zext_i8_to_i64(i64 %a, i64 %b) {
+; CHECK-LABEL: load_register_zext_i8_to_i64
+; CHECK: ldrb w0, [x0, x1]
+; CHECK-NOT: uxtb
+ %1 = add i64 %a, %b
+ %2 = inttoptr i64 %1 to i8*
+ %3 = load i8* %2
+ %4 = zext i8 %3 to i64
+ ret i64 %4
+}
+
+define i64 @load_register_zext_i16_to_i64(i64 %a, i64 %b) {
+; CHECK-LABEL: load_register_zext_i16_to_i64
+; CHECK: ldrh w0, [x0, x1]
+; CHECK-NOT: uxth
+ %1 = add i64 %a, %b
+ %2 = inttoptr i64 %1 to i16*
+ %3 = load i16* %2
+ %4 = zext i16 %3 to i64
+ ret i64 %4
+}
+
+define i64 @load_register_zext_i32_to_i64(i64 %a, i64 %b) {
+; CHECK-LABEL: load_register_zext_i32_to_i64
+; CHECK: ldr w0, [x0, x1]
+; CHECK-NOT: uxtw
+ %1 = add i64 %a, %b
+ %2 = inttoptr i64 %1 to i32*
+ %3 = load i32* %2
+ %4 = zext i32 %3 to i64
+ ret i64 %4
+}
+
+define i32 @load_register_sext_i8_to_i32(i64 %a, i64 %b) {
+; CHECK-LABEL: load_register_sext_i8_to_i32
+; CHECK: ldrsb w0, [x0, x1]
+; CHECK-NOT: sxtb
+ %1 = add i64 %a, %b
+ %2 = inttoptr i64 %1 to i8*
+ %3 = load i8* %2
+ %4 = sext i8 %3 to i32
+ ret i32 %4
+}
+
+define i32 @load_register_sext_i16_to_i32(i64 %a, i64 %b) {
+; CHECK-LABEL: load_register_sext_i16_to_i32
+; CHECK: ldrsh w0, [x0, x1]
+; CHECK-NOT: sxth
+ %1 = add i64 %a, %b
+ %2 = inttoptr i64 %1 to i16*
+ %3 = load i16* %2
+ %4 = sext i16 %3 to i32
+ ret i32 %4
+}
+
+define i64 @load_register_sext_i8_to_i64(i64 %a, i64 %b) {
+; CHECK-LABEL: load_register_sext_i8_to_i64
+; CHECK: ldrsb x0, [x0, x1]
+; CHECK-NOT: sxtb
+ %1 = add i64 %a, %b
+ %2 = inttoptr i64 %1 to i8*
+ %3 = load i8* %2
+ %4 = sext i8 %3 to i64
+ ret i64 %4
+}
+
+define i64 @load_register_sext_i16_to_i64(i64 %a, i64 %b) {
+; CHECK-LABEL: load_register_sext_i16_to_i64
+; CHECK: ldrsh x0, [x0, x1]
+; CHECK-NOT: sxth
+ %1 = add i64 %a, %b
+ %2 = inttoptr i64 %1 to i16*
+ %3 = load i16* %2
+ %4 = sext i16 %3 to i64
+ ret i64 %4
+}
+
+define i64 @load_register_sext_i32_to_i64(i64 %a, i64 %b) {
+; CHECK-LABEL: load_register_sext_i32_to_i64
+; CHECK: ldrsw x0, [x0, x1]
+; CHECK-NOT: sxtw
+ %1 = add i64 %a, %b
+ %2 = inttoptr i64 %1 to i32*
+ %3 = load i32* %2
+ %4 = sext i32 %3 to i64
+ ret i64 %4
+}
+
+; Extend
+define i32 @load_extend_zext_i8_to_i32(i64 %a, i32 %b) {
+; CHECK-LABEL: load_extend_zext_i8_to_i32
+; CHECK: ldrb w0, [x0, w1, sxtw]
+; CHECK-NOT: uxtb
+ %1 = sext i32 %b to i64
+ %2 = add i64 %a, %1
+ %3 = inttoptr i64 %2 to i8*
+ %4 = load i8* %3
+ %5 = zext i8 %4 to i32
+ ret i32 %5
+}
+
+define i32 @load_extend_zext_i16_to_i32(i64 %a, i32 %b) {
+; CHECK-LABEL: load_extend_zext_i16_to_i32
+; CHECK: ldrh w0, [x0, w1, sxtw]
+; CHECK-NOT: uxth
+ %1 = sext i32 %b to i64
+ %2 = add i64 %a, %1
+ %3 = inttoptr i64 %2 to i16*
+ %4 = load i16* %3
+ %5 = zext i16 %4 to i32
+ ret i32 %5
+}
+
+define i64 @load_extend_zext_i8_to_i64(i64 %a, i32 %b) {
+; CHECK-LABEL: load_extend_zext_i8_to_i64
+; CHECK: ldrb w0, [x0, w1, sxtw]
+; CHECK-NOT: uxtb
+ %1 = sext i32 %b to i64
+ %2 = add i64 %a, %1
+ %3 = inttoptr i64 %2 to i8*
+ %4 = load i8* %3
+ %5 = zext i8 %4 to i64
+ ret i64 %5
+}
+
+define i64 @load_extend_zext_i16_to_i64(i64 %a, i32 %b) {
+; CHECK-LABEL: load_extend_zext_i16_to_i64
+; CHECK: ldrh w0, [x0, w1, sxtw]
+; CHECK-NOT: uxth
+ %1 = sext i32 %b to i64
+ %2 = add i64 %a, %1
+ %3 = inttoptr i64 %2 to i16*
+ %4 = load i16* %3
+ %5 = zext i16 %4 to i64
+ ret i64 %5
+}
+
+define i64 @load_extend_zext_i32_to_i64(i64 %a, i32 %b) {
+; CHECK-LABEL: load_extend_zext_i32_to_i64
+; CHECK: ldr w0, [x0, w1, sxtw]
+; CHECK-NOT: uxtw
+ %1 = sext i32 %b to i64
+ %2 = add i64 %a, %1
+ %3 = inttoptr i64 %2 to i32*
+ %4 = load i32* %3
+ %5 = zext i32 %4 to i64
+ ret i64 %5
+}
+
+define i32 @load_extend_sext_i8_to_i32(i64 %a, i32 %b) {
+; CHECK-LABEL: load_extend_sext_i8_to_i32
+; CHECK: ldrsb w0, [x0, w1, sxtw]
+; CHECK-NOT: sxtb
+ %1 = sext i32 %b to i64
+ %2 = add i64 %a, %1
+ %3 = inttoptr i64 %2 to i8*
+ %4 = load i8* %3
+ %5 = sext i8 %4 to i32
+ ret i32 %5
+}
+
+define i32 @load_extend_sext_i16_to_i32(i64 %a, i32 %b) {
+; CHECK-LABEL: load_extend_sext_i16_to_i32
+; CHECK: ldrsh w0, [x0, w1, sxtw]
+; CHECK-NOT: sxth
+ %1 = sext i32 %b to i64
+ %2 = add i64 %a, %1
+ %3 = inttoptr i64 %2 to i16*
+ %4 = load i16* %3
+ %5 = sext i16 %4 to i32
+ ret i32 %5
+}
+
+define i64 @load_extend_sext_i8_to_i64(i64 %a, i32 %b) {
+; CHECK-LABEL: load_extend_sext_i8_to_i64
+; CHECK: ldrsb x0, [x0, w1, sxtw]
+; CHECK-NOT: sxtb
+ %1 = sext i32 %b to i64
+ %2 = add i64 %a, %1
+ %3 = inttoptr i64 %2 to i8*
+ %4 = load i8* %3
+ %5 = sext i8 %4 to i64
+ ret i64 %5
+}
+
+define i64 @load_extend_sext_i16_to_i64(i64 %a, i32 %b) {
+; CHECK-LABEL: load_extend_sext_i16_to_i64
+; CHECK: ldrsh x0, [x0, w1, sxtw]
+; CHECK-NOT: sxth
+ %1 = sext i32 %b to i64
+ %2 = add i64 %a, %1
+ %3 = inttoptr i64 %2 to i16*
+ %4 = load i16* %3
+ %5 = sext i16 %4 to i64
+ ret i64 %5
+}
+
+define i64 @load_extend_sext_i32_to_i64(i64 %a, i32 %b) {
+; CHECK-LABEL: load_extend_sext_i32_to_i64
+; CHECK: ldrsw x0, [x0, w1, sxtw]
+; CHECK-NOT: sxtw
+ %1 = sext i32 %b to i64
+ %2 = add i64 %a, %1
+ %3 = inttoptr i64 %2 to i32*
+ %4 = load i32* %3
+ %5 = sext i32 %4 to i64
+ ret i64 %5
+}
+
diff --git a/test/CodeGen/AArch64/fast-isel-int-ext2.ll b/test/CodeGen/AArch64/fast-isel-int-ext2.ll
new file mode 100644
index 0000000..8df26b2
--- /dev/null
+++ b/test/CodeGen/AArch64/fast-isel-int-ext2.ll
@@ -0,0 +1,439 @@
+; RUN: llc -mtriple=aarch64-apple-darwin -fast-isel -fast-isel-abort -aarch64-atomic-cfg-tidy=false -disable-cgp-branch-opts -verify-machineinstrs < %s | FileCheck %s
+
+;
+; Test folding of the sign-/zero-extend into the load instruction.
+;
+
+; Unscaled
+define i32 @load_unscaled_zext_i8_to_i32(i64 %a) {
+; CHECK-LABEL: load_unscaled_zext_i8_to_i32
+; CHECK: ldurb w0, [x0, #-8]
+; CHECK-NOT: uxtb
+ %1 = sub i64 %a, 8
+ %2 = inttoptr i64 %1 to i8*
+ %3 = load i8* %2
+ br label %bb2
+
+bb2:
+ %4 = zext i8 %3 to i32
+ ret i32 %4
+}
+
+define i32 @load_unscaled_zext_i16_to_i32(i64 %a) {
+; CHECK-LABEL: load_unscaled_zext_i16_to_i32
+; CHECK: ldurh w0, [x0, #-8]
+; CHECK-NOT: uxth
+ %1 = sub i64 %a, 8
+ %2 = inttoptr i64 %1 to i16*
+ %3 = load i16* %2
+ br label %bb2
+
+bb2:
+ %4 = zext i16 %3 to i32
+ ret i32 %4
+}
+
+define i64 @load_unscaled_zext_i8_to_i64(i64 %a) {
+; CHECK-LABEL: load_unscaled_zext_i8_to_i64
+; CHECK: ldurb w0, [x0, #-8]
+; CHECK-NOT: uxtb
+ %1 = sub i64 %a, 8
+ %2 = inttoptr i64 %1 to i8*
+ %3 = load i8* %2
+ br label %bb2
+
+bb2:
+ %4 = zext i8 %3 to i64
+ ret i64 %4
+}
+
+define i64 @load_unscaled_zext_i16_to_i64(i64 %a) {
+; CHECK-LABEL: load_unscaled_zext_i16_to_i64
+; CHECK: ldurh w0, [x0, #-8]
+; CHECK-NOT: uxth
+ %1 = sub i64 %a, 8
+ %2 = inttoptr i64 %1 to i16*
+ %3 = load i16* %2
+ br label %bb2
+
+bb2:
+ %4 = zext i16 %3 to i64
+ ret i64 %4
+}
+
+define i64 @load_unscaled_zext_i32_to_i64(i64 %a) {
+; CHECK-LABEL: load_unscaled_zext_i32_to_i64
+; CHECK: ldur w0, [x0, #-8]
+; CHECK-NOT: uxtw
+ %1 = sub i64 %a, 8
+ %2 = inttoptr i64 %1 to i32*
+ %3 = load i32* %2
+ br label %bb2
+
+bb2:
+ %4 = zext i32 %3 to i64
+ ret i64 %4
+}
+
+define i32 @load_unscaled_sext_i8_to_i32(i64 %a) {
+; CHECK-LABEL: load_unscaled_sext_i8_to_i32
+; CHECK: ldursb w0, [x0, #-8]
+; CHECK-NOT: sxtb
+ %1 = sub i64 %a, 8
+ %2 = inttoptr i64 %1 to i8*
+ %3 = load i8* %2
+ br label %bb2
+
+bb2:
+ %4 = sext i8 %3 to i32
+ ret i32 %4
+}
+
+define i32 @load_unscaled_sext_i16_to_i32(i64 %a) {
+; CHECK-LABEL: load_unscaled_sext_i16_to_i32
+; CHECK: ldursh w0, [x0, #-8]
+; CHECK-NOT: sxth
+ %1 = sub i64 %a, 8
+ %2 = inttoptr i64 %1 to i16*
+ %3 = load i16* %2
+ br label %bb2
+
+bb2:
+ %4 = sext i16 %3 to i32
+ ret i32 %4
+}
+
+define i64 @load_unscaled_sext_i8_to_i64(i64 %a) {
+; CHECK-LABEL: load_unscaled_sext_i8_to_i64
+; CHECK: ldursb x0, [x0, #-8]
+; CHECK-NOT: sxtb
+ %1 = sub i64 %a, 8
+ %2 = inttoptr i64 %1 to i8*
+ %3 = load i8* %2
+ br label %bb2
+
+bb2:
+ %4 = sext i8 %3 to i64
+ ret i64 %4
+}
+
+define i64 @load_unscaled_sext_i16_to_i64(i64 %a) {
+; CHECK-LABEL: load_unscaled_sext_i16_to_i64
+; CHECK: ldursh x0, [x0, #-8]
+; CHECK-NOT: sxth
+ %1 = sub i64 %a, 8
+ %2 = inttoptr i64 %1 to i16*
+ %3 = load i16* %2
+ br label %bb2
+
+bb2:
+ %4 = sext i16 %3 to i64
+ ret i64 %4
+}
+
+define i64 @load_unscaled_sext_i32_to_i64(i64 %a) {
+; CHECK-LABEL: load_unscaled_sext_i32_to_i64
+; CHECK: ldursw x0, [x0, #-8]
+; CHECK-NOT: sxtw
+ %1 = sub i64 %a, 8
+ %2 = inttoptr i64 %1 to i32*
+ %3 = load i32* %2
+ br label %bb2
+
+bb2:
+ %4 = sext i32 %3 to i64
+ ret i64 %4
+}
+
+; Register
+define i32 @load_register_zext_i8_to_i32(i64 %a, i64 %b) {
+; CHECK-LABEL: load_register_zext_i8_to_i32
+; CHECK: ldrb w0, [x0, x1]
+; CHECK-NOT: uxtb
+ %1 = add i64 %a, %b
+ %2 = inttoptr i64 %1 to i8*
+ %3 = load i8* %2
+ br label %bb2
+
+bb2:
+ %4 = zext i8 %3 to i32
+ ret i32 %4
+}
+
+define i32 @load_register_zext_i16_to_i32(i64 %a, i64 %b) {
+; CHECK-LABEL: load_register_zext_i16_to_i32
+; CHECK: ldrh w0, [x0, x1]
+; CHECK-NOT: uxth
+ %1 = add i64 %a, %b
+ %2 = inttoptr i64 %1 to i16*
+ %3 = load i16* %2
+ br label %bb2
+
+bb2:
+ %4 = zext i16 %3 to i32
+ ret i32 %4
+}
+
+define i64 @load_register_zext_i8_to_i64(i64 %a, i64 %b) {
+; CHECK-LABEL: load_register_zext_i8_to_i64
+; CHECK: ldrb w0, [x0, x1]
+; CHECK-NOT: uxtb
+ %1 = add i64 %a, %b
+ %2 = inttoptr i64 %1 to i8*
+ %3 = load i8* %2
+ br label %bb2
+
+bb2:
+ %4 = zext i8 %3 to i64
+ ret i64 %4
+}
+
+define i64 @load_register_zext_i16_to_i64(i64 %a, i64 %b) {
+; CHECK-LABEL: load_register_zext_i16_to_i64
+; CHECK: ldrh w0, [x0, x1]
+; CHECK-NOT: uxth
+ %1 = add i64 %a, %b
+ %2 = inttoptr i64 %1 to i16*
+ %3 = load i16* %2
+ br label %bb2
+
+bb2:
+ %4 = zext i16 %3 to i64
+ ret i64 %4
+}
+
+define i64 @load_register_zext_i32_to_i64(i64 %a, i64 %b) {
+; CHECK-LABEL: load_register_zext_i32_to_i64
+; CHECK: ldr w0, [x0, x1]
+; CHECK-NOT: uxtw
+ %1 = add i64 %a, %b
+ %2 = inttoptr i64 %1 to i32*
+ %3 = load i32* %2
+ br label %bb2
+
+bb2:
+ %4 = zext i32 %3 to i64
+ ret i64 %4
+}
+
+define i32 @load_register_sext_i8_to_i32(i64 %a, i64 %b) {
+; CHECK-LABEL: load_register_sext_i8_to_i32
+; CHECK: ldrsb w0, [x0, x1]
+; CHECK-NOT: sxtb
+ %1 = add i64 %a, %b
+ %2 = inttoptr i64 %1 to i8*
+ %3 = load i8* %2
+ br label %bb2
+
+bb2:
+ %4 = sext i8 %3 to i32
+ ret i32 %4
+}
+
+define i32 @load_register_sext_i16_to_i32(i64 %a, i64 %b) {
+; CHECK-LABEL: load_register_sext_i16_to_i32
+; CHECK: ldrsh w0, [x0, x1]
+; CHECK-NOT: sxth
+ %1 = add i64 %a, %b
+ %2 = inttoptr i64 %1 to i16*
+ %3 = load i16* %2
+ br label %bb2
+
+bb2:
+ %4 = sext i16 %3 to i32
+ ret i32 %4
+}
+
+define i64 @load_register_sext_i8_to_i64(i64 %a, i64 %b) {
+; CHECK-LABEL: load_register_sext_i8_to_i64
+; CHECK: ldrsb x0, [x0, x1]
+; CHECK-NOT: sxtb
+ %1 = add i64 %a, %b
+ %2 = inttoptr i64 %1 to i8*
+ %3 = load i8* %2
+ br label %bb2
+
+bb2:
+ %4 = sext i8 %3 to i64
+ ret i64 %4
+}
+
+define i64 @load_register_sext_i16_to_i64(i64 %a, i64 %b) {
+; CHECK-LABEL: load_register_sext_i16_to_i64
+; CHECK: ldrsh x0, [x0, x1]
+; CHECK-NOT: sxth
+ %1 = add i64 %a, %b
+ %2 = inttoptr i64 %1 to i16*
+ %3 = load i16* %2
+ br label %bb2
+
+bb2:
+ %4 = sext i16 %3 to i64
+ ret i64 %4
+}
+
+define i64 @load_register_sext_i32_to_i64(i64 %a, i64 %b) {
+; CHECK-LABEL: load_register_sext_i32_to_i64
+; CHECK: ldrsw x0, [x0, x1]
+; CHECK-NOT: sxtw
+ %1 = add i64 %a, %b
+ %2 = inttoptr i64 %1 to i32*
+ %3 = load i32* %2
+ br label %bb2
+
+bb2:
+ %4 = sext i32 %3 to i64
+ ret i64 %4
+}
+
+; Extend
+define i32 @load_extend_zext_i8_to_i32(i64 %a, i32 %b) {
+; CHECK-LABEL: load_extend_zext_i8_to_i32
+; CHECK: ldrb w0, [x0, w1, sxtw]
+; CHECK-NOT: uxtb
+ %1 = sext i32 %b to i64
+ %2 = add i64 %a, %1
+ %3 = inttoptr i64 %2 to i8*
+ %4 = load i8* %3
+ br label %bb2
+
+bb2:
+ %5 = zext i8 %4 to i32
+ ret i32 %5
+}
+
+define i32 @load_extend_zext_i16_to_i32(i64 %a, i32 %b) {
+; CHECK-LABEL: load_extend_zext_i16_to_i32
+; CHECK: ldrh w0, [x0, w1, sxtw]
+; CHECK-NOT: uxth
+ %1 = sext i32 %b to i64
+ %2 = add i64 %a, %1
+ %3 = inttoptr i64 %2 to i16*
+ %4 = load i16* %3
+ br label %bb2
+
+bb2:
+ %5 = zext i16 %4 to i32
+ ret i32 %5
+}
+
+define i64 @load_extend_zext_i8_to_i64(i64 %a, i32 %b) {
+; CHECK-LABEL: load_extend_zext_i8_to_i64
+; CHECK: ldrb w0, [x0, w1, sxtw]
+; CHECK-NOT: uxtb
+ %1 = sext i32 %b to i64
+ %2 = add i64 %a, %1
+ %3 = inttoptr i64 %2 to i8*
+ %4 = load i8* %3
+ br label %bb2
+
+bb2:
+ %5 = zext i8 %4 to i64
+ ret i64 %5
+}
+
+define i64 @load_extend_zext_i16_to_i64(i64 %a, i32 %b) {
+; CHECK-LABEL: load_extend_zext_i16_to_i64
+; CHECK: ldrh w0, [x0, w1, sxtw]
+; CHECK-NOT: uxth
+ %1 = sext i32 %b to i64
+ %2 = add i64 %a, %1
+ %3 = inttoptr i64 %2 to i16*
+ %4 = load i16* %3
+ br label %bb2
+
+bb2:
+ %5 = zext i16 %4 to i64
+ ret i64 %5
+}
+
+define i64 @load_extend_zext_i32_to_i64(i64 %a, i32 %b) {
+; CHECK-LABEL: load_extend_zext_i32_to_i64
+; CHECK: ldr w0, [x0, w1, sxtw]
+; CHECK-NOT: uxtw
+ %1 = sext i32 %b to i64
+ %2 = add i64 %a, %1
+ %3 = inttoptr i64 %2 to i32*
+ %4 = load i32* %3
+ br label %bb2
+
+bb2:
+ %5 = zext i32 %4 to i64
+ ret i64 %5
+}
+
+define i32 @load_extend_sext_i8_to_i32(i64 %a, i32 %b) {
+; CHECK-LABEL: load_extend_sext_i8_to_i32
+; CHECK: ldrsb w0, [x0, w1, sxtw]
+; CHECK-NOT: sxtb
+ %1 = sext i32 %b to i64
+ %2 = add i64 %a, %1
+ %3 = inttoptr i64 %2 to i8*
+ %4 = load i8* %3
+ br label %bb2
+
+bb2:
+ %5 = sext i8 %4 to i32
+ ret i32 %5
+}
+
+define i32 @load_extend_sext_i16_to_i32(i64 %a, i32 %b) {
+; CHECK-LABEL: load_extend_sext_i16_to_i32
+; CHECK: ldrsh w0, [x0, w1, sxtw]
+; CHECK-NOT: sxth
+ %1 = sext i32 %b to i64
+ %2 = add i64 %a, %1
+ %3 = inttoptr i64 %2 to i16*
+ %4 = load i16* %3
+ br label %bb2
+
+bb2:
+ %5 = sext i16 %4 to i32
+ ret i32 %5
+}
+
+define i64 @load_extend_sext_i8_to_i64(i64 %a, i32 %b) {
+; CHECK-LABEL: load_extend_sext_i8_to_i64
+; CHECK: ldrsb x0, [x0, w1, sxtw]
+; CHECK-NOT: sxtb
+ %1 = sext i32 %b to i64
+ %2 = add i64 %a, %1
+ %3 = inttoptr i64 %2 to i8*
+ %4 = load i8* %3
+ br label %bb2
+
+bb2:
+ %5 = sext i8 %4 to i64
+ ret i64 %5
+}
+
+define i64 @load_extend_sext_i16_to_i64(i64 %a, i32 %b) {
+; CHECK-LABEL: load_extend_sext_i16_to_i64
+; CHECK: ldrsh x0, [x0, w1, sxtw]
+; CHECK-NOT: sxth
+ %1 = sext i32 %b to i64
+ %2 = add i64 %a, %1
+ %3 = inttoptr i64 %2 to i16*
+ %4 = load i16* %3
+ br label %bb2
+
+bb2:
+ %5 = sext i16 %4 to i64
+ ret i64 %5
+}
+
+define i64 @load_extend_sext_i32_to_i64(i64 %a, i32 %b) {
+; CHECK-LABEL: load_extend_sext_i32_to_i64
+; CHECK: ldrsw x0, [x0, w1, sxtw]
+; CHECK-NOT: sxtw
+ %1 = sext i32 %b to i64
+ %2 = add i64 %a, %1
+ %3 = inttoptr i64 %2 to i32*
+ %4 = load i32* %3
+ br label %bb2
+
+bb2:
+ %5 = sext i32 %4 to i64
+ ret i64 %5
+}
+
diff --git a/test/CodeGen/AArch64/fast-isel-int-ext3.ll b/test/CodeGen/AArch64/fast-isel-int-ext3.ll
new file mode 100644
index 0000000..5d55a6b
--- /dev/null
+++ b/test/CodeGen/AArch64/fast-isel-int-ext3.ll
@@ -0,0 +1,117 @@
+; RUN: llc -mtriple=aarch64-apple-darwin -fast-isel -verify-machineinstrs < %s | FileCheck %s
+
+;
+; Test folding of the sign-/zero-extend into the load instruction.
+;
+
+; Unscaled
+define i32 @load_unscaled_zext_i8_to_i32(i64 %a) {
+; CHECK-LABEL: load_unscaled_zext_i8_to_i32
+; CHECK: ldurb [[REG:w[0-9]+]], [x0, #-8]
+; CHECK: uxtb w0, [[REG]]
+ %1 = sub i64 %a, 8
+ %2 = inttoptr i64 %1 to i8 addrspace(256)*
+ %3 = load i8 addrspace(256)* %2
+ %4 = zext i8 %3 to i32
+ ret i32 %4
+}
+
+define i32 @load_unscaled_zext_i16_to_i32(i64 %a) {
+; CHECK-LABEL: load_unscaled_zext_i16_to_i32
+; CHECK: ldurh [[REG:w[0-9]+]], [x0, #-8]
+; CHECK: uxth w0, [[REG]]
+ %1 = sub i64 %a, 8
+ %2 = inttoptr i64 %1 to i16 addrspace(256)*
+ %3 = load i16 addrspace(256)* %2
+ %4 = zext i16 %3 to i32
+ ret i32 %4
+}
+
+define i64 @load_unscaled_zext_i8_to_i64(i64 %a) {
+; CHECK-LABEL: load_unscaled_zext_i8_to_i64
+; CHECK: ldurb w[[REG:[0-9]+]], [x0, #-8]
+; CHECK: ubfx x0, x[[REG]], #0, #8
+ %1 = sub i64 %a, 8
+ %2 = inttoptr i64 %1 to i8 addrspace(256)*
+ %3 = load i8 addrspace(256)* %2
+ %4 = zext i8 %3 to i64
+ ret i64 %4
+}
+
+define i64 @load_unscaled_zext_i16_to_i64(i64 %a) {
+; CHECK-LABEL: load_unscaled_zext_i16_to_i64
+; CHECK: ldurh w[[REG:[0-9]+]], [x0, #-8]
+; CHECK: ubfx x0, x[[REG]], #0, #16
+ %1 = sub i64 %a, 8
+ %2 = inttoptr i64 %1 to i16 addrspace(256)*
+ %3 = load i16 addrspace(256)* %2
+ %4 = zext i16 %3 to i64
+ ret i64 %4
+}
+
+define i64 @load_unscaled_zext_i32_to_i64(i64 %a) {
+; CHECK-LABEL: load_unscaled_zext_i32_to_i64
+; CHECK: ldur w[[REG:[0-9]+]], [x0, #-8]
+; CHECK: ubfx x0, x[[REG]], #0, #32
+ %1 = sub i64 %a, 8
+ %2 = inttoptr i64 %1 to i32 addrspace(256)*
+ %3 = load i32 addrspace(256)* %2
+ %4 = zext i32 %3 to i64
+ ret i64 %4
+}
+
+define i32 @load_unscaled_sext_i8_to_i32(i64 %a) {
+; CHECK-LABEL: load_unscaled_sext_i8_to_i32
+; CHECK: ldurb [[REG:w[0-9]+]], [x0, #-8]
+; CHECK: sxtb w0, [[REG]]
+ %1 = sub i64 %a, 8
+ %2 = inttoptr i64 %1 to i8 addrspace(256)*
+ %3 = load i8 addrspace(256)* %2
+ %4 = sext i8 %3 to i32
+ ret i32 %4
+}
+
+define i32 @load_unscaled_sext_i16_to_i32(i64 %a) {
+; CHECK-LABEL: load_unscaled_sext_i16_to_i32
+; CHECK: ldurh [[REG:w[0-9]+]], [x0, #-8]
+; CHECK: sxth w0, [[REG]]
+ %1 = sub i64 %a, 8
+ %2 = inttoptr i64 %1 to i16 addrspace(256)*
+ %3 = load i16 addrspace(256)* %2
+ %4 = sext i16 %3 to i32
+ ret i32 %4
+}
+
+define i64 @load_unscaled_sext_i8_to_i64(i64 %a) {
+; CHECK-LABEL: load_unscaled_sext_i8_to_i64
+; CHECK: ldurb [[REG:w[0-9]+]], [x0, #-8]
+; CHECK: sxtb x0, [[REG]]
+ %1 = sub i64 %a, 8
+ %2 = inttoptr i64 %1 to i8 addrspace(256)*
+ %3 = load i8 addrspace(256)* %2
+ %4 = sext i8 %3 to i64
+ ret i64 %4
+}
+
+define i64 @load_unscaled_sext_i16_to_i64(i64 %a) {
+; CHECK-LABEL: load_unscaled_sext_i16_to_i64
+; CHECK: ldurh [[REG:w[0-9]+]], [x0, #-8]
+; CHECK: sxth x0, [[REG]]
+ %1 = sub i64 %a, 8
+ %2 = inttoptr i64 %1 to i16 addrspace(256)*
+ %3 = load i16 addrspace(256)* %2
+ %4 = sext i16 %3 to i64
+ ret i64 %4
+}
+
+define i64 @load_unscaled_sext_i32_to_i64(i64 %a) {
+; CHECK-LABEL: load_unscaled_sext_i32_to_i64
+; CHECK: ldur [[REG:w[0-9]+]], [x0, #-8]
+; CHECK: sxtw x0, [[REG]]
+ %1 = sub i64 %a, 8
+ %2 = inttoptr i64 %1 to i32 addrspace(256)*
+ %3 = load i32 addrspace(256)* %2
+ %4 = sext i32 %3 to i64
+ ret i64 %4
+}
+
diff --git a/test/CodeGen/AArch64/fast-isel-int-ext4.ll b/test/CodeGen/AArch64/fast-isel-int-ext4.ll
new file mode 100644
index 0000000..f25bb98
--- /dev/null
+++ b/test/CodeGen/AArch64/fast-isel-int-ext4.ll
@@ -0,0 +1,20 @@
+; RUN: llc -mtriple=aarch64-apple-darwin -fast-isel -fast-isel-abort -verify-machineinstrs < %s | FileCheck %s
+
+define i32 @kill_flag(i16 signext %a) {
+; CHECK-LABEL: kill_flag
+entry:
+ %0 = sext i16 %a to i32
+ br label %bb1
+
+bb1:
+ %1 = icmp slt i32 undef, %0
+ br i1 %1, label %loop, label %exit
+
+loop:
+ %2 = sext i16 %a to i32
+ %3 = icmp slt i32 undef, %2
+ br i1 %3, label %bb1, label %exit
+
+exit:
+ ret i32 0
+}
diff --git a/test/CodeGen/AArch64/fast-isel-intrinsic.ll b/test/CodeGen/AArch64/fast-isel-intrinsic.ll
new file mode 100644
index 0000000..fd1198a
--- /dev/null
+++ b/test/CodeGen/AArch64/fast-isel-intrinsic.ll
@@ -0,0 +1,19 @@
+; RUN: llc -mtriple=aarch64-apple-darwin -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-apple-darwin -fast-isel -verify-machineinstrs < %s | FileCheck %s
+
+define float @fabs_f32(float %a) {
+; CHECK-LABEL: fabs_f32
+; CHECK: fabs s0, s0
+ %1 = call float @llvm.fabs.f32(float %a)
+ ret float %1
+}
+
+define double @fabs_f64(double %a) {
+; CHECK-LABEL: fabs_f64
+; CHECK: fabs d0, d0
+ %1 = call double @llvm.fabs.f64(double %a)
+ ret double %1
+}
+
+declare double @llvm.fabs.f64(double)
+declare float @llvm.fabs.f32(float)
diff --git a/test/CodeGen/AArch64/fast-isel-logic-op.ll b/test/CodeGen/AArch64/fast-isel-logic-op.ll
new file mode 100644
index 0000000..2c7486e
--- /dev/null
+++ b/test/CodeGen/AArch64/fast-isel-logic-op.ll
@@ -0,0 +1,362 @@
+; RUN: llc -mtriple=aarch64-apple-darwin -fast-isel=0 -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-apple-darwin -fast-isel=1 -fast-isel-abort -verify-machineinstrs < %s | FileCheck %s
+
+; AND
+define zeroext i1 @and_rr_i1(i1 signext %a, i1 signext %b) {
+; CHECK-LABEL: and_rr_i1
+; CHECK: and [[REG:w[0-9]+]], w0, w1
+ %1 = and i1 %a, %b
+ ret i1 %1
+}
+
+define zeroext i8 @and_rr_i8(i8 signext %a, i8 signext %b) {
+; CHECK-LABEL: and_rr_i8
+; CHECK: and [[REG:w[0-9]+]], w0, w1
+; CHECK-NEXT: and {{w[0-9]+}}, [[REG]], #0xff
+ %1 = and i8 %a, %b
+ ret i8 %1
+}
+
+define zeroext i16 @and_rr_i16(i16 signext %a, i16 signext %b) {
+; CHECK-LABEL: and_rr_i16
+; CHECK: and [[REG:w[0-9]+]], w0, w1
+; CHECK-NEXT: and {{w[0-9]+}}, [[REG]], #0xffff
+ %1 = and i16 %a, %b
+ ret i16 %1
+}
+
+define i32 @and_rr_i32(i32 %a, i32 %b) {
+; CHECK-LABEL: and_rr_i32
+; CHECK: and w0, w0, w1
+ %1 = and i32 %a, %b
+ ret i32 %1
+}
+
+define i64 @and_rr_i64(i64 %a, i64 %b) {
+; CHECK-LABEL: and_rr_i64
+; CHECK: and x0, x0, x1
+ %1 = and i64 %a, %b
+ ret i64 %1
+}
+
+define zeroext i1 @and_ri_i1(i1 signext %a) {
+; CHECK-LABEL: and_ri_i1
+; CHECK: and {{w[0-9]+}}, w0, #0x1
+ %1 = and i1 %a, 1
+ ret i1 %1
+}
+
+define zeroext i8 @and_ri_i8(i8 signext %a) {
+; CHECK-LABEL: and_ri_i8
+; CHECK: and {{w[0-9]+}}, w0, #0xf
+ %1 = and i8 %a, 15
+ ret i8 %1
+}
+
+define zeroext i16 @and_ri_i16(i16 signext %a) {
+; CHECK-LABEL: and_ri_i16
+; CHECK: and {{w[0-9]+}}, w0, #0xff
+ %1 = and i16 %a, 255
+ ret i16 %1
+}
+
+define i32 @and_ri_i32(i32 %a) {
+; CHECK-LABEL: and_ri_i32
+; CHECK: and w0, w0, #0xff
+ %1 = and i32 %a, 255
+ ret i32 %1
+}
+
+define i64 @and_ri_i64(i64 %a) {
+; CHECK-LABEL: and_ri_i64
+; CHECK: and x0, x0, #0xff
+ %1 = and i64 %a, 255
+ ret i64 %1
+}
+
+define zeroext i8 @and_rs_i8(i8 signext %a, i8 signext %b) {
+; CHECK-LABEL: and_rs_i8
+; CHECK: and [[REG:w[0-9]+]], w0, w1, lsl #4
+; CHECK-NEXT: and {{w[0-9]+}}, [[REG]], {{#0xff|#0xf0}}
+ %1 = shl i8 %b, 4
+ %2 = and i8 %a, %1
+ ret i8 %2
+}
+
+define zeroext i16 @and_rs_i16(i16 signext %a, i16 signext %b) {
+; CHECK-LABEL: and_rs_i16
+; CHECK: and [[REG:w[0-9]+]], w0, w1, lsl #8
+; CHECK-NEXT: and {{w[0-9]+}}, [[REG]], {{#0xffff|#0xff00}}
+ %1 = shl i16 %b, 8
+ %2 = and i16 %a, %1
+ ret i16 %2
+}
+
+define i32 @and_rs_i32(i32 %a, i32 %b) {
+; CHECK-LABEL: and_rs_i32
+; CHECK: and w0, w0, w1, lsl #8
+ %1 = shl i32 %b, 8
+ %2 = and i32 %a, %1
+ ret i32 %2
+}
+
+define i64 @and_rs_i64(i64 %a, i64 %b) {
+; CHECK-LABEL: and_rs_i64
+; CHECK: and x0, x0, x1, lsl #8
+ %1 = shl i64 %b, 8
+ %2 = and i64 %a, %1
+ ret i64 %2
+}
+
+define i32 @and_mul_i32(i32 %a, i32 %b) {
+; CHECK-LABEL: and_mul_i32
+; CHECK: and w0, w0, w1, lsl #2
+ %1 = mul i32 %b, 4
+ %2 = and i32 %a, %1
+ ret i32 %2
+}
+
+define i64 @and_mul_i64(i64 %a, i64 %b) {
+; CHECK-LABEL: and_mul_i64
+; CHECK: and x0, x0, x1, lsl #2
+ %1 = mul i64 %b, 4
+ %2 = and i64 %a, %1
+ ret i64 %2
+}
+
+; OR
+define zeroext i1 @or_rr_i1(i1 signext %a, i1 signext %b) {
+; CHECK-LABEL: or_rr_i1
+; CHECK: orr [[REG:w[0-9]+]], w0, w1
+ %1 = or i1 %a, %b
+ ret i1 %1
+}
+
+define zeroext i8 @or_rr_i8(i8 signext %a, i8 signext %b) {
+; CHECK-LABEL: or_rr_i8
+; CHECK: orr [[REG:w[0-9]+]], w0, w1
+; CHECK-NEXT: and {{w[0-9]+}}, [[REG]], #0xff
+ %1 = or i8 %a, %b
+ ret i8 %1
+}
+
+define zeroext i16 @or_rr_i16(i16 signext %a, i16 signext %b) {
+; CHECK-LABEL: or_rr_i16
+; CHECK: orr [[REG:w[0-9]+]], w0, w1
+; CHECK-NEXT: and {{w[0-9]+}}, [[REG]], #0xffff
+ %1 = or i16 %a, %b
+ ret i16 %1
+}
+
+define i32 @or_rr_i32(i32 %a, i32 %b) {
+; CHECK-LABEL: or_rr_i32
+; CHECK: orr w0, w0, w1
+ %1 = or i32 %a, %b
+ ret i32 %1
+}
+
+define i64 @or_rr_i64(i64 %a, i64 %b) {
+; CHECK-LABEL: or_rr_i64
+; CHECK: orr x0, x0, x1
+ %1 = or i64 %a, %b
+ ret i64 %1
+}
+
+define zeroext i8 @or_ri_i8(i8 %a) {
+; CHECK-LABEL: or_ri_i8
+; CHECK: orr [[REG:w[0-9]+]], w0, #0xf
+; CHECK-NEXT: and {{w[0-9]+}}, [[REG]], #0xff
+ %1 = or i8 %a, 15
+ ret i8 %1
+}
+
+define zeroext i16 @or_ri_i16(i16 %a) {
+; CHECK-LABEL: or_ri_i16
+; CHECK: orr [[REG:w[0-9]+]], w0, #0xff
+; CHECK-NEXT: and {{w[0-9]+}}, [[REG]], #0xffff
+ %1 = or i16 %a, 255
+ ret i16 %1
+}
+
+define i32 @or_ri_i32(i32 %a) {
+; CHECK-LABEL: or_ri_i32
+; CHECK: orr w0, w0, #0xff
+ %1 = or i32 %a, 255
+ ret i32 %1
+}
+
+define i64 @or_ri_i64(i64 %a) {
+; CHECK-LABEL: or_ri_i64
+; CHECK: orr x0, x0, #0xff
+ %1 = or i64 %a, 255
+ ret i64 %1
+}
+
+define zeroext i8 @or_rs_i8(i8 signext %a, i8 signext %b) {
+; CHECK-LABEL: or_rs_i8
+; CHECK: orr [[REG:w[0-9]+]], w0, w1, lsl #4
+; CHECK-NEXT: and {{w[0-9]+}}, [[REG]], {{#0xff|#0xf0}}
+ %1 = shl i8 %b, 4
+ %2 = or i8 %a, %1
+ ret i8 %2
+}
+
+define zeroext i16 @or_rs_i16(i16 signext %a, i16 signext %b) {
+; CHECK-LABEL: or_rs_i16
+; CHECK: orr [[REG:w[0-9]+]], w0, w1, lsl #8
+; CHECK-NEXT: and {{w[0-9]+}}, [[REG]], {{#0xffff|#0xff00}}
+ %1 = shl i16 %b, 8
+ %2 = or i16 %a, %1
+ ret i16 %2
+}
+
+define i32 @or_rs_i32(i32 %a, i32 %b) {
+; CHECK-LABEL: or_rs_i32
+; CHECK: orr w0, w0, w1, lsl #8
+ %1 = shl i32 %b, 8
+ %2 = or i32 %a, %1
+ ret i32 %2
+}
+
+define i64 @or_rs_i64(i64 %a, i64 %b) {
+; CHECK-LABEL: or_rs_i64
+; CHECK: orr x0, x0, x1, lsl #8
+ %1 = shl i64 %b, 8
+ %2 = or i64 %a, %1
+ ret i64 %2
+}
+
+define i32 @or_mul_i32(i32 %a, i32 %b) {
+; CHECK-LABEL: or_mul_i32
+; CHECK: orr w0, w0, w1, lsl #2
+ %1 = mul i32 %b, 4
+ %2 = or i32 %a, %1
+ ret i32 %2
+}
+
+define i64 @or_mul_i64(i64 %a, i64 %b) {
+; CHECK-LABEL: or_mul_i64
+; CHECK: orr x0, x0, x1, lsl #2
+ %1 = mul i64 %b, 4
+ %2 = or i64 %a, %1
+ ret i64 %2
+}
+
+; XOR
+define zeroext i1 @xor_rr_i1(i1 signext %a, i1 signext %b) {
+; CHECK-LABEL: xor_rr_i1
+; CHECK: eor [[REG:w[0-9]+]], w0, w1
+ %1 = xor i1 %a, %b
+ ret i1 %1
+}
+
+define zeroext i8 @xor_rr_i8(i8 signext %a, i8 signext %b) {
+; CHECK-LABEL: xor_rr_i8
+; CHECK: eor [[REG:w[0-9]+]], w0, w1
+; CHECK-NEXT: and {{w[0-9]+}}, [[REG]], #0xff
+ %1 = xor i8 %a, %b
+ ret i8 %1
+}
+
+define zeroext i16 @xor_rr_i16(i16 signext %a, i16 signext %b) {
+; CHECK-LABEL: xor_rr_i16
+; CHECK: eor [[REG:w[0-9]+]], w0, w1
+; CHECK-NEXT: and {{w[0-9]+}}, [[REG]], #0xffff
+ %1 = xor i16 %a, %b
+ ret i16 %1
+}
+
+define i32 @xor_rr_i32(i32 %a, i32 %b) {
+; CHECK-LABEL: xor_rr_i32
+; CHECK: eor w0, w0, w1
+ %1 = xor i32 %a, %b
+ ret i32 %1
+}
+
+define i64 @xor_rr_i64(i64 %a, i64 %b) {
+; CHECK-LABEL: xor_rr_i64
+; CHECK: eor x0, x0, x1
+ %1 = xor i64 %a, %b
+ ret i64 %1
+}
+
+define zeroext i8 @xor_ri_i8(i8 signext %a) {
+; CHECK-LABEL: xor_ri_i8
+; CHECK: eor [[REG:w[0-9]+]], w0, #0xf
+; CHECK-NEXT: and {{w[0-9]+}}, [[REG]], #0xff
+ %1 = xor i8 %a, 15
+ ret i8 %1
+}
+
+define zeroext i16 @xor_ri_i16(i16 signext %a) {
+; CHECK-LABEL: xor_ri_i16
+; CHECK: eor [[REG:w[0-9]+]], w0, #0xff
+; CHECK-NEXT: and {{w[0-9]+}}, [[REG]], #0xffff
+ %1 = xor i16 %a, 255
+ ret i16 %1
+}
+
+define i32 @xor_ri_i32(i32 %a) {
+; CHECK-LABEL: xor_ri_i32
+; CHECK: eor w0, w0, #0xff
+ %1 = xor i32 %a, 255
+ ret i32 %1
+}
+
+define i64 @xor_ri_i64(i64 %a) {
+; CHECK-LABEL: xor_ri_i64
+; CHECK: eor x0, x0, #0xff
+ %1 = xor i64 %a, 255
+ ret i64 %1
+}
+
+define zeroext i8 @xor_rs_i8(i8 %a, i8 %b) {
+; CHECK-LABEL: xor_rs_i8
+; CHECK: eor [[REG:w[0-9]+]], w0, w1, lsl #4
+; CHECK-NEXT: and {{w[0-9]+}}, [[REG]], {{#0xff|#0xf0}}
+ %1 = shl i8 %b, 4
+ %2 = xor i8 %a, %1
+ ret i8 %2
+}
+
+define zeroext i16 @xor_rs_i16(i16 %a, i16 %b) {
+; CHECK-LABEL: xor_rs_i16
+; CHECK: eor [[REG:w[0-9]+]], w0, w1, lsl #8
+; CHECK-NEXT: and {{w[0-9]+}}, [[REG]], {{#0xffff|#0xff00}}
+ %1 = shl i16 %b, 8
+ %2 = xor i16 %a, %1
+ ret i16 %2
+}
+
+define i32 @xor_rs_i32(i32 %a, i32 %b) {
+; CHECK-LABEL: xor_rs_i32
+; CHECK: eor w0, w0, w1, lsl #8
+ %1 = shl i32 %b, 8
+ %2 = xor i32 %a, %1
+ ret i32 %2
+}
+
+define i64 @xor_rs_i64(i64 %a, i64 %b) {
+; CHECK-LABEL: xor_rs_i64
+; CHECK: eor x0, x0, x1, lsl #8
+ %1 = shl i64 %b, 8
+ %2 = xor i64 %a, %1
+ ret i64 %2
+}
+
+define i32 @xor_mul_i32(i32 %a, i32 %b) {
+; CHECK-LABEL: xor_mul_i32
+; CHECK: eor w0, w0, w1, lsl #2
+ %1 = mul i32 %b, 4
+ %2 = xor i32 %a, %1
+ ret i32 %2
+}
+
+define i64 @xor_mul_i64(i64 %a, i64 %b) {
+; CHECK-LABEL: xor_mul_i64
+; CHECK: eor x0, x0, x1, lsl #2
+ %1 = mul i64 %b, 4
+ %2 = xor i64 %a, %1
+ ret i64 %2
+}
+
diff --git a/test/CodeGen/AArch64/fast-isel-mul.ll b/test/CodeGen/AArch64/fast-isel-mul.ll
index d02c67f..f2fda27 100644
--- a/test/CodeGen/AArch64/fast-isel-mul.ll
+++ b/test/CodeGen/AArch64/fast-isel-mul.ll
@@ -1,40 +1,44 @@
-; RUN: llc -fast-isel -fast-isel-abort -mtriple=aarch64 -o - %s | FileCheck %s
+; RUN: llc -fast-isel -fast-isel-abort -verify-machineinstrs -mtriple=aarch64-apple-darwin < %s | FileCheck %s
-@var8 = global i8 0
-@var16 = global i16 0
-@var32 = global i32 0
-@var64 = global i64 0
-
-define void @test_mul8(i8 %lhs, i8 %rhs) {
+define zeroext i8 @test_mul8(i8 %lhs, i8 %rhs) {
; CHECK-LABEL: test_mul8:
-; CHECK: mul w0, w0, w1
-; %lhs = load i8* @var8
-; %rhs = load i8* @var8
- %prod = mul i8 %lhs, %rhs
- store i8 %prod, i8* @var8
- ret void
+; CHECK: mul {{w[0-9]+}}, w0, w1
+ %1 = mul i8 %lhs, %rhs
+ ret i8 %1
}
-define void @test_mul16(i16 %lhs, i16 %rhs) {
+define zeroext i16 @test_mul16(i16 %lhs, i16 %rhs) {
; CHECK-LABEL: test_mul16:
-; CHECK: mul w0, w0, w1
- %prod = mul i16 %lhs, %rhs
- store i16 %prod, i16* @var16
- ret void
+; CHECK: mul {{w[0-9]+}}, w0, w1
+ %1 = mul i16 %lhs, %rhs
+ ret i16 %1
}
-define void @test_mul32(i32 %lhs, i32 %rhs) {
+define i32 @test_mul32(i32 %lhs, i32 %rhs) {
; CHECK-LABEL: test_mul32:
-; CHECK: mul w0, w0, w1
- %prod = mul i32 %lhs, %rhs
- store i32 %prod, i32* @var32
- ret void
+; CHECK: mul {{w[0-9]+}}, w0, w1
+ %1 = mul i32 %lhs, %rhs
+ ret i32 %1
}
-define void @test_mul64(i64 %lhs, i64 %rhs) {
+define i64 @test_mul64(i64 %lhs, i64 %rhs) {
; CHECK-LABEL: test_mul64:
-; CHECK: mul x0, x0, x1
- %prod = mul i64 %lhs, %rhs
- store i64 %prod, i64* @var64
- ret void
+; CHECK: mul {{x[0-9]+}}, x0, x1
+ %1 = mul i64 %lhs, %rhs
+ ret i64 %1
+}
+
+define i32 @test_mul2shift_i32(i32 %a) {
+; CHECK-LABEL: test_mul2shift_i32:
+; CHECK: lsl {{w[0-9]+}}, w0, #2
+ %1 = mul i32 %a, 4
+ ret i32 %1
}
+
+define i64 @test_mul2shift_i64(i64 %a) {
+; CHECK-LABEL: test_mul2shift_i64:
+; CHECK: lsl {{x[0-9]+}}, x0, #3
+ %1 = mul i64 %a, 8
+ ret i64 %1
+}
+
diff --git a/test/CodeGen/AArch64/fast-isel-runtime-libcall.ll b/test/CodeGen/AArch64/fast-isel-runtime-libcall.ll
new file mode 100644
index 0000000..8d2d39a
--- /dev/null
+++ b/test/CodeGen/AArch64/fast-isel-runtime-libcall.ll
@@ -0,0 +1,96 @@
+; RUN: llc -mtriple=aarch64-apple-darwin -fast-isel -fast-isel-abort -code-model=small -verify-machineinstrs < %s | FileCheck %s --check-prefix=SMALL
+; RUN: llc -mtriple=aarch64-apple-darwin -fast-isel -fast-isel-abort -code-model=large -verify-machineinstrs < %s | FileCheck %s --check-prefix=LARGE
+
+define float @frem_f32(float %a, float %b) {
+; SMALL-LABEL: frem_f32
+; SMALL: bl _fmodf
+; LARGE-LABEL: frem_f32
+; LARGE: adrp [[REG:x[0-9]+]], _fmodf@GOTPAGE
+; LARGE: ldr [[REG]], {{\[}}[[REG]], _fmodf@GOTPAGEOFF{{\]}}
+; LARGE-NEXT: blr [[REG]]
+ %1 = frem float %a, %b
+ ret float %1
+}
+
+define double @frem_f64(double %a, double %b) {
+; SMALL-LABEL: frem_f64
+; SMALL: bl _fmod
+; LARGE-LABEL: frem_f64
+; LARGE: adrp [[REG:x[0-9]+]], _fmod@GOTPAGE
+; LARGE: ldr [[REG]], {{\[}}[[REG]], _fmod@GOTPAGEOFF{{\]}}
+; LARGE-NEXT: blr [[REG]]
+ %1 = frem double %a, %b
+ ret double %1
+}
+
+define float @sin_f32(float %a) {
+; SMALL-LABEL: sin_f32
+; SMALL: bl _sinf
+; LARGE-LABEL: sin_f32
+; LARGE: adrp [[REG:x[0-9]+]], _sinf@GOTPAGE
+; LARGE: ldr [[REG]], {{\[}}[[REG]], _sinf@GOTPAGEOFF{{\]}}
+; LARGE-NEXT: blr [[REG]]
+ %1 = call float @llvm.sin.f32(float %a)
+ ret float %1
+}
+
+define double @sin_f64(double %a) {
+; SMALL-LABEL: sin_f64
+; SMALL: bl _sin
+; LARGE-LABEL: sin_f64
+; LARGE: adrp [[REG:x[0-9]+]], _sin@GOTPAGE
+; LARGE: ldr [[REG]], {{\[}}[[REG]], _sin@GOTPAGEOFF{{\]}}
+; LARGE-NEXT: blr [[REG]]
+ %1 = call double @llvm.sin.f64(double %a)
+ ret double %1
+}
+
+define float @cos_f32(float %a) {
+; SMALL-LABEL: cos_f32
+; SMALL: bl _cosf
+; LARGE-LABEL: cos_f32
+; LARGE: adrp [[REG:x[0-9]+]], _cosf@GOTPAGE
+; LARGE: ldr [[REG]], {{\[}}[[REG]], _cosf@GOTPAGEOFF{{\]}}
+; LARGE-NEXT: blr [[REG]]
+ %1 = call float @llvm.cos.f32(float %a)
+ ret float %1
+}
+
+define double @cos_f64(double %a) {
+; SMALL-LABEL: cos_f64
+; SMALL: bl _cos
+; LARGE-LABEL: cos_f64
+; LARGE: adrp [[REG:x[0-9]+]], _cos@GOTPAGE
+; LARGE: ldr [[REG]], {{\[}}[[REG]], _cos@GOTPAGEOFF{{\]}}
+; LARGE-NEXT: blr [[REG]]
+ %1 = call double @llvm.cos.f64(double %a)
+ ret double %1
+}
+
+define float @pow_f32(float %a, float %b) {
+; SMALL-LABEL: pow_f32
+; SMALL: bl _powf
+; LARGE-LABEL: pow_f32
+; LARGE: adrp [[REG:x[0-9]+]], _powf@GOTPAGE
+; LARGE: ldr [[REG]], {{\[}}[[REG]], _powf@GOTPAGEOFF{{\]}}
+; LARGE-NEXT: blr [[REG]]
+ %1 = call float @llvm.pow.f32(float %a, float %b)
+ ret float %1
+}
+
+define double @pow_f64(double %a, double %b) {
+; SMALL-LABEL: pow_f64
+; SMALL: bl _pow
+; LARGE-LABEL: pow_f64
+; LARGE: adrp [[REG:x[0-9]+]], _pow@GOTPAGE
+; LARGE: ldr [[REG]], {{\[}}[[REG]], _pow@GOTPAGEOFF{{\]}}
+; LARGE-NEXT: blr [[REG]]
+ %1 = call double @llvm.pow.f64(double %a, double %b)
+ ret double %1
+}
+declare float @llvm.sin.f32(float)
+declare double @llvm.sin.f64(double)
+declare float @llvm.cos.f32(float)
+declare double @llvm.cos.f64(double)
+declare float @llvm.pow.f32(float, float)
+declare double @llvm.pow.f64(double, double)
diff --git a/test/CodeGen/AArch64/fast-isel-sdiv.ll b/test/CodeGen/AArch64/fast-isel-sdiv.ll
new file mode 100644
index 0000000..3080776
--- /dev/null
+++ b/test/CodeGen/AArch64/fast-isel-sdiv.ll
@@ -0,0 +1,56 @@
+; RUN: llc -mtriple=aarch64-apple-darwin -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-apple-darwin -fast-isel -fast-isel-abort -verify-machineinstrs < %s | FileCheck %s
+
+define i32 @sdiv_i32_exact(i32 %a) {
+; CHECK-LABEL: sdiv_i32_exact
+; CHECK: asr {{w[0-9]+}}, w0, #3
+ %1 = sdiv exact i32 %a, 8
+ ret i32 %1
+}
+
+define i32 @sdiv_i32_pos(i32 %a) {
+; CHECK-LABEL: sdiv_i32_pos
+; CHECK: add [[REG1:w[0-9]+]], w0, #7
+; CHECK-NEXT: cmp w0, #0
+; CHECK-NEXT: csel [[REG2:w[0-9]+]], [[REG1]], w0, lt
+; CHECK-NEXT: asr {{w[0-9]+}}, [[REG2]], #3
+ %1 = sdiv i32 %a, 8
+ ret i32 %1
+}
+
+define i32 @sdiv_i32_neg(i32 %a) {
+; CHECK-LABEL: sdiv_i32_neg
+; CHECK: add [[REG1:w[0-9]+]], w0, #7
+; CHECK-NEXT: cmp w0, #0
+; CHECK-NEXT: csel [[REG2:w[0-9]+]], [[REG1]], w0, lt
+; CHECK-NEXT: neg {{w[0-9]+}}, [[REG2]], asr #3
+ %1 = sdiv i32 %a, -8
+ ret i32 %1
+}
+
+define i64 @sdiv_i64_exact(i64 %a) {
+; CHECK-LABEL: sdiv_i64_exact
+; CHECK: asr {{x[0-9]+}}, x0, #4
+ %1 = sdiv exact i64 %a, 16
+ ret i64 %1
+}
+
+define i64 @sdiv_i64_pos(i64 %a) {
+; CHECK-LABEL: sdiv_i64_pos
+; CHECK: add [[REG1:x[0-9]+]], x0, #15
+; CHECK-NEXT: cmp x0, #0
+; CHECK-NEXT: csel [[REG2:x[0-9]+]], [[REG1]], x0, lt
+; CHECK-NEXT: asr {{x[0-9]+}}, [[REG2]], #4
+ %1 = sdiv i64 %a, 16
+ ret i64 %1
+}
+
+define i64 @sdiv_i64_neg(i64 %a) {
+; CHECK-LABEL: sdiv_i64_neg
+; CHECK: add [[REG1:x[0-9]+]], x0, #15
+; CHECK-NEXT: cmp x0, #0
+; CHECK-NEXT: csel [[REG2:x[0-9]+]], [[REG1]], x0, lt
+; CHECK-NEXT: neg {{x[0-9]+}}, [[REG2]], asr #4
+ %1 = sdiv i64 %a, -16
+ ret i64 %1
+}
diff --git a/test/CodeGen/AArch64/fast-isel-select.ll b/test/CodeGen/AArch64/fast-isel-select.ll
new file mode 100644
index 0000000..928e9d4
--- /dev/null
+++ b/test/CodeGen/AArch64/fast-isel-select.ll
@@ -0,0 +1,316 @@
+; RUN: llc -mtriple=aarch64-apple-darwin -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-apple-darwin -fast-isel -fast-isel-abort -verify-machineinstrs < %s | FileCheck %s
+
+; First test the different supported value types for select.
+define zeroext i1 @select_i1(i1 zeroext %c, i1 zeroext %a, i1 zeroext %b) {
+; CHECK-LABEL: select_i1
+; CHECK: {{cmp w0, #0|tst w0, #0x1}}
+; CHECK-NEXT: csel {{w[0-9]+}}, w1, w2, ne
+ %1 = select i1 %c, i1 %a, i1 %b
+ ret i1 %1
+}
+
+define zeroext i8 @select_i8(i1 zeroext %c, i8 zeroext %a, i8 zeroext %b) {
+; CHECK-LABEL: select_i8
+; CHECK: {{cmp w0, #0|tst w0, #0x1}}
+; CHECK-NEXT: csel {{w[0-9]+}}, w1, w2, ne
+ %1 = select i1 %c, i8 %a, i8 %b
+ ret i8 %1
+}
+
+define zeroext i16 @select_i16(i1 zeroext %c, i16 zeroext %a, i16 zeroext %b) {
+; CHECK-LABEL: select_i16
+; CHECK: {{cmp w0, #0|tst w0, #0x1}}
+; CHECK-NEXT: csel {{w[0-9]+}}, w1, w2, ne
+ %1 = select i1 %c, i16 %a, i16 %b
+ ret i16 %1
+}
+
+define i32 @select_i32(i1 zeroext %c, i32 %a, i32 %b) {
+; CHECK-LABEL: select_i32
+; CHECK: {{cmp w0, #0|tst w0, #0x1}}
+; CHECK-NEXT: csel {{w[0-9]+}}, w1, w2, ne
+ %1 = select i1 %c, i32 %a, i32 %b
+ ret i32 %1
+}
+
+define i64 @select_i64(i1 zeroext %c, i64 %a, i64 %b) {
+; CHECK-LABEL: select_i64
+; CHECK: {{cmp w0, #0|tst w0, #0x1}}
+; CHECK-NEXT: csel {{x[0-9]+}}, x1, x2, ne
+ %1 = select i1 %c, i64 %a, i64 %b
+ ret i64 %1
+}
+
+define float @select_f32(i1 zeroext %c, float %a, float %b) {
+; CHECK-LABEL: select_f32
+; CHECK: {{cmp w0, #0|tst w0, #0x1}}
+; CHECK-NEXT: fcsel {{s[0-9]+}}, s0, s1, ne
+ %1 = select i1 %c, float %a, float %b
+ ret float %1
+}
+
+define double @select_f64(i1 zeroext %c, double %a, double %b) {
+; CHECK-LABEL: select_f64
+; CHECK: {{cmp w0, #0|tst w0, #0x1}}
+; CHECK-NEXT: fcsel {{d[0-9]+}}, d0, d1, ne
+ %1 = select i1 %c, double %a, double %b
+ ret double %1
+}
+
+; Now test the folding of all compares.
+define float @select_fcmp_false(float %x, float %a, float %b) {
+; CHECK-LABEL: select_fcmp_false
+; CHECK: mov.16b {{v[0-9]+}}, v2
+ %1 = fcmp ogt float %x, %x
+ %2 = select i1 %1, float %a, float %b
+ ret float %2
+}
+
+define float @select_fcmp_ogt(float %x, float %y, float %a, float %b) {
+; CHECK-LABEL: select_fcmp_ogt
+; CHECK: fcmp s0, s1
+; CHECK-NEXT: fcsel {{s[0-9]+}}, s2, s3, gt
+ %1 = fcmp ogt float %x, %y
+ %2 = select i1 %1, float %a, float %b
+ ret float %2
+}
+
+define float @select_fcmp_oge(float %x, float %y, float %a, float %b) {
+; CHECK-LABEL: select_fcmp_oge
+; CHECK: fcmp s0, s1
+; CHECK-NEXT: fcsel {{s[0-9]+}}, s2, s3, ge
+ %1 = fcmp oge float %x, %y
+ %2 = select i1 %1, float %a, float %b
+ ret float %2
+}
+
+define float @select_fcmp_olt(float %x, float %y, float %a, float %b) {
+; CHECK-LABEL: select_fcmp_olt
+; CHECK: fcmp s0, s1
+; CHECK-NEXT: fcsel {{s[0-9]+}}, s2, s3, mi
+ %1 = fcmp olt float %x, %y
+ %2 = select i1 %1, float %a, float %b
+ ret float %2
+}
+
+define float @select_fcmp_ole(float %x, float %y, float %a, float %b) {
+; CHECK-LABEL: select_fcmp_ole
+; CHECK: fcmp s0, s1
+; CHECK-NEXT: fcsel {{s[0-9]+}}, s2, s3, ls
+ %1 = fcmp ole float %x, %y
+ %2 = select i1 %1, float %a, float %b
+ ret float %2
+}
+
+define float @select_fcmp_one(float %x, float %y, float %a, float %b) {
+; CHECK-LABEL: select_fcmp_one
+; CHECK: fcmp s0, s1
+; CHECK-NEXT: fcsel [[REG:s[0-9]+]], s2, s3, mi
+; CHECK-NEXT: fcsel {{s[0-9]+}}, s2, [[REG]], gt
+ %1 = fcmp one float %x, %y
+ %2 = select i1 %1, float %a, float %b
+ ret float %2
+}
+
+define float @select_fcmp_ord(float %x, float %y, float %a, float %b) {
+; CHECK-LABEL: select_fcmp_ord
+; CHECK: fcmp s0, s1
+; CHECK-NEXT: fcsel {{s[0-9]+}}, s2, s3, vc
+ %1 = fcmp ord float %x, %y
+ %2 = select i1 %1, float %a, float %b
+ ret float %2
+}
+
+define float @select_fcmp_uno(float %x, float %y, float %a, float %b) {
+; CHECK-LABEL: select_fcmp_uno
+; CHECK: fcmp s0, s1
+; CHECK-NEXT: fcsel {{s[0-9]+}}, s2, s3, vs
+ %1 = fcmp uno float %x, %y
+ %2 = select i1 %1, float %a, float %b
+ ret float %2
+}
+
+define float @select_fcmp_ueq(float %x, float %y, float %a, float %b) {
+; CHECK-LABEL: select_fcmp_ueq
+; CHECK: fcmp s0, s1
+; CHECK-NEXT: fcsel [[REG:s[0-9]+]], s2, s3, eq
+; CHECK-NEXT: fcsel {{s[0-9]+}}, s2, [[REG]], vs
+ %1 = fcmp ueq float %x, %y
+ %2 = select i1 %1, float %a, float %b
+ ret float %2
+}
+
+define float @select_fcmp_ugt(float %x, float %y, float %a, float %b) {
+; CHECK-LABEL: select_fcmp_ugt
+; CHECK: fcmp s0, s1
+; CHECK-NEXT: fcsel {{s[0-9]+}}, s2, s3, hi
+ %1 = fcmp ugt float %x, %y
+ %2 = select i1 %1, float %a, float %b
+ ret float %2
+}
+
+define float @select_fcmp_uge(float %x, float %y, float %a, float %b) {
+; CHECK-LABEL: select_fcmp_uge
+; CHECK: fcmp s0, s1
+; CHECK-NEXT: fcsel {{s[0-9]+}}, s2, s3, pl
+ %1 = fcmp uge float %x, %y
+ %2 = select i1 %1, float %a, float %b
+ ret float %2
+}
+
+define float @select_fcmp_ult(float %x, float %y, float %a, float %b) {
+; CHECK-LABEL: select_fcmp_ult
+; CHECK: fcmp s0, s1
+; CHECK-NEXT: fcsel {{s[0-9]+}}, s2, s3, lt
+ %1 = fcmp ult float %x, %y
+ %2 = select i1 %1, float %a, float %b
+ ret float %2
+}
+
+define float @select_fcmp_ule(float %x, float %y, float %a, float %b) {
+; CHECK-LABEL: select_fcmp_ule
+; CHECK: fcmp s0, s1
+; CHECK-NEXT: fcsel {{s[0-9]+}}, s2, s3, le
+ %1 = fcmp ule float %x, %y
+ %2 = select i1 %1, float %a, float %b
+ ret float %2
+}
+
+define float @select_fcmp_une(float %x, float %y, float %a, float %b) {
+; CHECK-LABEL: select_fcmp_une
+; CHECK: fcmp s0, s1
+; CHECK-NEXT: fcsel {{s[0-9]+}}, s2, s3, ne
+ %1 = fcmp une float %x, %y
+ %2 = select i1 %1, float %a, float %b
+ ret float %2
+}
+
+define float @select_fcmp_true(float %x, float %a, float %b) {
+; CHECK-LABEL: select_fcmp_true
+; CHECK: mov.16b {{v[0-9]+}}, v1
+ %1 = fcmp ueq float %x, %x
+ %2 = select i1 %1, float %a, float %b
+ ret float %2
+}
+
+define float @select_icmp_eq(i32 %x, i32 %y, float %a, float %b) {
+; CHECK-LABEL: select_icmp_eq
+; CHECK: cmp w0, w1
+; CHECK-NEXT: fcsel {{s[0-9]+}}, s0, s1, eq
+ %1 = icmp eq i32 %x, %y
+ %2 = select i1 %1, float %a, float %b
+ ret float %2
+}
+
+define float @select_icmp_ne(i32 %x, i32 %y, float %a, float %b) {
+; CHECK-LABEL: select_icmp_ne
+; CHECK: cmp w0, w1
+; CHECK-NEXT: fcsel {{s[0-9]+}}, s0, s1, ne
+ %1 = icmp ne i32 %x, %y
+ %2 = select i1 %1, float %a, float %b
+ ret float %2
+}
+
+define float @select_icmp_ugt(i32 %x, i32 %y, float %a, float %b) {
+; CHECK-LABEL: select_icmp_ugt
+; CHECK: cmp w0, w1
+; CHECK-NEXT: fcsel {{s[0-9]+}}, s0, s1, hi
+ %1 = icmp ugt i32 %x, %y
+ %2 = select i1 %1, float %a, float %b
+ ret float %2
+}
+
+define float @select_icmp_uge(i32 %x, i32 %y, float %a, float %b) {
+; CHECK-LABEL: select_icmp_uge
+; CHECK: cmp w0, w1
+; CHECK-NEXT: fcsel {{s[0-9]+}}, s0, s1, hs
+ %1 = icmp uge i32 %x, %y
+ %2 = select i1 %1, float %a, float %b
+ ret float %2
+}
+
+define float @select_icmp_ult(i32 %x, i32 %y, float %a, float %b) {
+; CHECK-LABEL: select_icmp_ult
+; CHECK: cmp w0, w1
+; CHECK-NEXT: fcsel {{s[0-9]+}}, s0, s1, lo
+ %1 = icmp ult i32 %x, %y
+ %2 = select i1 %1, float %a, float %b
+ ret float %2
+}
+
+define float @select_icmp_ule(i32 %x, i32 %y, float %a, float %b) {
+; CHECK-LABEL: select_icmp_ule
+; CHECK: cmp w0, w1
+; CHECK-NEXT: fcsel {{s[0-9]+}}, s0, s1, ls
+ %1 = icmp ule i32 %x, %y
+ %2 = select i1 %1, float %a, float %b
+ ret float %2
+}
+
+define float @select_icmp_sgt(i32 %x, i32 %y, float %a, float %b) {
+; CHECK-LABEL: select_icmp_sgt
+; CHECK: cmp w0, w1
+; CHECK-NEXT: fcsel {{s[0-9]+}}, s0, s1, gt
+ %1 = icmp sgt i32 %x, %y
+ %2 = select i1 %1, float %a, float %b
+ ret float %2
+}
+
+define float @select_icmp_sge(i32 %x, i32 %y, float %a, float %b) {
+; CHECK-LABEL: select_icmp_sge
+; CHECK: cmp w0, w1
+; CHECK-NEXT: fcsel {{s[0-9]+}}, s0, s1, ge
+ %1 = icmp sge i32 %x, %y
+ %2 = select i1 %1, float %a, float %b
+ ret float %2
+}
+
+define float @select_icmp_slt(i32 %x, i32 %y, float %a, float %b) {
+; CHECK-LABEL: select_icmp_slt
+; CHECK: cmp w0, w1
+; CHECK-NEXT: fcsel {{s[0-9]+}}, s0, s1, lt
+ %1 = icmp slt i32 %x, %y
+ %2 = select i1 %1, float %a, float %b
+ ret float %2
+}
+
+define float @select_icmp_sle(i32 %x, i32 %y, float %a, float %b) {
+; CHECK-LABEL: select_icmp_sle
+; CHECK: cmp w0, w1
+; CHECK-NEXT: fcsel {{s[0-9]+}}, s0, s1, le
+ %1 = icmp sle i32 %x, %y
+ %2 = select i1 %1, float %a, float %b
+ ret float %2
+}
+
+; Test peephole optimizations that turn selects with constant true/false arms
+; into plain logical operations.
+define zeroext i1 @select_opt1(i1 zeroext %c, i1 zeroext %a) {
+; CHECK-LABEL: select_opt1
+; CHECK: orr {{w[0-9]+}}, w0, w1
+ %1 = select i1 %c, i1 true, i1 %a
+ ret i1 %1
+}
+
+define zeroext i1 @select_opt2(i1 zeroext %c, i1 zeroext %a) {
+; CHECK-LABEL: select_opt2
+; CHECK: eor [[REG:w[0-9]+]], w0, #0x1
+; CHECK: orr {{w[0-9]+}}, [[REG]], w1
+ %1 = select i1 %c, i1 %a, i1 true
+ ret i1 %1
+}
+
+define zeroext i1 @select_opt3(i1 zeroext %c, i1 zeroext %a) {
+; CHECK-LABEL: select_opt3
+; CHECK: bic {{w[0-9]+}}, w1, w0
+ %1 = select i1 %c, i1 false, i1 %a
+ ret i1 %1
+}
+
+define zeroext i1 @select_opt4(i1 zeroext %c, i1 zeroext %a) {
+; CHECK-LABEL: select_opt4
+; CHECK: and {{w[0-9]+}}, w0, w1
+ %1 = select i1 %c, i1 %a, i1 false
+ ret i1 %1
+}
diff --git a/test/CodeGen/AArch64/fast-isel-shift.ll b/test/CodeGen/AArch64/fast-isel-shift.ll
new file mode 100644
index 0000000..ce4ba49
--- /dev/null
+++ b/test/CodeGen/AArch64/fast-isel-shift.ll
@@ -0,0 +1,545 @@
+; RUN: llc -fast-isel -fast-isel-abort -mtriple=aarch64-apple-darwin -verify-machineinstrs < %s | FileCheck %s
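+
+; Test FastISel lowering of constant and variable shifts, including i1, i8,
+; and i16 values that first require an implicit zero- or sign-extension.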
+
+; CHECK-LABEL: asr_zext_i1_i16
+; CHECK: uxth {{w[0-9]*}}, wzr
+define zeroext i16 @asr_zext_i1_i16(i1 %b) {
+ %1 = zext i1 %b to i16
+ %2 = ashr i16 %1, 1
+ ret i16 %2
+}
+
+; CHECK-LABEL: asr_sext_i1_i16
+; CHECK: sbfx [[REG1:w[0-9]+]], {{w[0-9]*}}, #0, #1
+; CHECK-NEXT: sxth {{w[0-9]*}}, [[REG1]]
+define signext i16 @asr_sext_i1_i16(i1 %b) {
+ %1 = sext i1 %b to i16
+ %2 = ashr i16 %1, 1
+ ret i16 %2
+}
+
+; CHECK-LABEL: asr_zext_i1_i32
+; CHECK: mov {{w[0-9]*}}, wzr
+define i32 @asr_zext_i1_i32(i1 %b) {
+ %1 = zext i1 %b to i32
+ %2 = ashr i32 %1, 1
+ ret i32 %2
+}
+
+; CHECK-LABEL: asr_sext_i1_i32
+; CHECK: sbfx {{w[0-9]*}}, {{w[0-9]*}}, #0, #1
+define i32 @asr_sext_i1_i32(i1 %b) {
+ %1 = sext i1 %b to i32
+ %2 = ashr i32 %1, 1
+ ret i32 %2
+}
+
+; CHECK-LABEL: asr_zext_i1_i64
+; CHECK: mov {{x[0-9]*}}, xzr
+define i64 @asr_zext_i1_i64(i1 %b) {
+ %1 = zext i1 %b to i64
+ %2 = ashr i64 %1, 1
+ ret i64 %2
+}
+
+; CHECK-LABEL: asr_sext_i1_i64
+; CHECK: sbfx {{x[0-9]*}}, {{x[0-9]*}}, #0, #1
+define i64 @asr_sext_i1_i64(i1 %b) {
+ %1 = sext i1 %b to i64
+ %2 = ashr i64 %1, 1
+ ret i64 %2
+}
+
+; CHECK-LABEL: lsr_zext_i1_i16
+; CHECK: uxth {{w[0-9]*}}, wzr
+define zeroext i16 @lsr_zext_i1_i16(i1 %b) {
+ %1 = zext i1 %b to i16
+ %2 = lshr i16 %1, 1
+ ret i16 %2
+}
+
+; CHECK-LABEL: lsr_sext_i1_i16
+; CHECK: sbfx [[REG1:w[0-9]+]], {{w[0-9]*}}, #0, #1
+; CHECK-NEXT: ubfx [[REG2:w[0-9]+]], [[REG1]], #1, #15
+; CHECK-NEXT: sxth {{w[0-9]*}}, [[REG2]]
+define signext i16 @lsr_sext_i1_i16(i1 %b) {
+ %1 = sext i1 %b to i16
+ %2 = lshr i16 %1, 1
+ ret i16 %2
+}
+
+; CHECK-LABEL: lsr_zext_i1_i32
+; CHECK: mov {{w[0-9]*}}, wzr
+define i32 @lsr_zext_i1_i32(i1 %b) {
+ %1 = zext i1 %b to i32
+ %2 = lshr i32 %1, 1
+ ret i32 %2
+}
+
+; CHECK-LABEL: lsr_sext_i1_i32
+; CHECK: sbfx [[REG1:w[0-9]+]], {{w[0-9]*}}, #0, #1
+; CHECK-NEXT: lsr {{w[0-9]*}}, [[REG1]], #1
+define i32 @lsr_sext_i1_i32(i1 %b) {
+ %1 = sext i1 %b to i32
+ %2 = lshr i32 %1, 1
+ ret i32 %2
+}
+
+; CHECK-LABEL: lsr_zext_i1_i64
+; CHECK: mov {{x[0-9]*}}, xzr
+define i64 @lsr_zext_i1_i64(i1 %b) {
+ %1 = zext i1 %b to i64
+ %2 = lshr i64 %1, 1
+ ret i64 %2
+}
+
+; CHECK-LABEL: lsl_zext_i1_i16
+; CHECK: ubfiz {{w[0-9]*}}, {{w[0-9]*}}, #4, #1
+define zeroext i16 @lsl_zext_i1_i16(i1 %b) {
+ %1 = zext i1 %b to i16
+ %2 = shl i16 %1, 4
+ ret i16 %2
+}
+
+; CHECK-LABEL: lsl_sext_i1_i16
+; CHECK: sbfiz {{w[0-9]*}}, {{w[0-9]*}}, #4, #1
+define signext i16 @lsl_sext_i1_i16(i1 %b) {
+ %1 = sext i1 %b to i16
+ %2 = shl i16 %1, 4
+ ret i16 %2
+}
+
+; CHECK-LABEL: lsl_zext_i1_i32
+; CHECK: ubfiz {{w[0-9]*}}, {{w[0-9]*}}, #4, #1
+define i32 @lsl_zext_i1_i32(i1 %b) {
+ %1 = zext i1 %b to i32
+ %2 = shl i32 %1, 4
+ ret i32 %2
+}
+
+; CHECK-LABEL: lsl_sext_i1_i32
+; CHECK: sbfiz {{w[0-9]*}}, {{w[0-9]*}}, #4, #1
+define i32 @lsl_sext_i1_i32(i1 %b) {
+ %1 = sext i1 %b to i32
+ %2 = shl i32 %1, 4
+ ret i32 %2
+}
+
+; CHECK-LABEL: lsl_zext_i1_i64
+; CHECK: ubfiz {{x[0-9]*}}, {{x[0-9]*}}, #4, #1
+define i64 @lsl_zext_i1_i64(i1 %b) {
+ %1 = zext i1 %b to i64
+ %2 = shl i64 %1, 4
+ ret i64 %2
+}
+
+; CHECK-LABEL: lsl_sext_i1_i64
+; CHECK: sbfiz {{x[0-9]*}}, {{x[0-9]*}}, #4, #1
+define i64 @lsl_sext_i1_i64(i1 %b) {
+ %1 = sext i1 %b to i64
+ %2 = shl i64 %1, 4
+ ret i64 %2
+}
+
+; CHECK-LABEL: lslv_i8
+; CHECK: and [[REG1:w[0-9]+]], w1, #0xff
+; CHECK-NEXT: lsl [[REG2:w[0-9]+]], w0, [[REG1]]
+; CHECK-NEXT: and {{w[0-9]+}}, [[REG2]], #0xff
+define zeroext i8 @lslv_i8(i8 %a, i8 %b) {
+ %1 = shl i8 %a, %b
+ ret i8 %1
+}
+
+; CHECK-LABEL: lsl_i8
+; CHECK: ubfiz {{w[0-9]*}}, {{w[0-9]*}}, #4, #4
+define zeroext i8 @lsl_i8(i8 %a) {
+ %1 = shl i8 %a, 4
+ ret i8 %1
+}
+
+; CHECK-LABEL: lsl_zext_i8_i16
+; CHECK: ubfiz {{w[0-9]*}}, {{w[0-9]*}}, #4, #8
+define zeroext i16 @lsl_zext_i8_i16(i8 %b) {
+ %1 = zext i8 %b to i16
+ %2 = shl i16 %1, 4
+ ret i16 %2
+}
+
+; CHECK-LABEL: lsl_sext_i8_i16
+; CHECK: sbfiz {{w[0-9]*}}, {{w[0-9]*}}, #4, #8
+define signext i16 @lsl_sext_i8_i16(i8 %b) {
+ %1 = sext i8 %b to i16
+ %2 = shl i16 %1, 4
+ ret i16 %2
+}
+
+; CHECK-LABEL: lsl_zext_i8_i32
+; CHECK: ubfiz {{w[0-9]*}}, {{w[0-9]*}}, #4, #8
+define i32 @lsl_zext_i8_i32(i8 %b) {
+ %1 = zext i8 %b to i32
+ %2 = shl i32 %1, 4
+ ret i32 %2
+}
+
+; CHECK-LABEL: lsl_sext_i8_i32
+; CHECK: sbfiz {{w[0-9]*}}, {{w[0-9]*}}, #4, #8
+define i32 @lsl_sext_i8_i32(i8 %b) {
+ %1 = sext i8 %b to i32
+ %2 = shl i32 %1, 4
+ ret i32 %2
+}
+
+; CHECK-LABEL: lsl_zext_i8_i64
+; CHECK: ubfiz {{x[0-9]*}}, {{x[0-9]*}}, #4, #8
+define i64 @lsl_zext_i8_i64(i8 %b) {
+ %1 = zext i8 %b to i64
+ %2 = shl i64 %1, 4
+ ret i64 %2
+}
+
+; CHECK-LABEL: lsl_sext_i8_i64
+; CHECK: sbfiz {{x[0-9]*}}, {{x[0-9]*}}, #4, #8
+define i64 @lsl_sext_i8_i64(i8 %b) {
+ %1 = sext i8 %b to i64
+ %2 = shl i64 %1, 4
+ ret i64 %2
+}
+
+; CHECK-LABEL: lslv_i16
+; CHECK: and [[REG1:w[0-9]+]], w1, #0xffff
+; CHECK-NEXT: lsl [[REG2:w[0-9]+]], w0, [[REG1]]
+; CHECK-NEXT: and {{w[0-9]+}}, [[REG2]], #0xffff
+define zeroext i16 @lslv_i16(i16 %a, i16 %b) {
+ %1 = shl i16 %a, %b
+ ret i16 %1
+}
+
+; CHECK-LABEL: lsl_i16
+; CHECK: ubfiz {{w[0-9]*}}, {{w[0-9]*}}, #8, #8
+define zeroext i16 @lsl_i16(i16 %a) {
+ %1 = shl i16 %a, 8
+ ret i16 %1
+}
+
+; CHECK-LABEL: lsl_zext_i16_i32
+; CHECK: ubfiz {{w[0-9]*}}, {{w[0-9]*}}, #8, #16
+define i32 @lsl_zext_i16_i32(i16 %b) {
+ %1 = zext i16 %b to i32
+ %2 = shl i32 %1, 8
+ ret i32 %2
+}
+
+; CHECK-LABEL: lsl_sext_i16_i32
+; CHECK: sbfiz {{w[0-9]*}}, {{w[0-9]*}}, #8, #16
+define i32 @lsl_sext_i16_i32(i16 %b) {
+ %1 = sext i16 %b to i32
+ %2 = shl i32 %1, 8
+ ret i32 %2
+}
+
+; CHECK-LABEL: lsl_zext_i16_i64
+; CHECK: ubfiz {{x[0-9]*}}, {{x[0-9]*}}, #8, #16
+define i64 @lsl_zext_i16_i64(i16 %b) {
+ %1 = zext i16 %b to i64
+ %2 = shl i64 %1, 8
+ ret i64 %2
+}
+
+; CHECK-LABEL: lsl_sext_i16_i64
+; CHECK: sbfiz {{x[0-9]*}}, {{x[0-9]*}}, #8, #16
+define i64 @lsl_sext_i16_i64(i16 %b) {
+ %1 = sext i16 %b to i64
+ %2 = shl i64 %1, 8
+ ret i64 %2
+}
+
+; CHECK-LABEL: lslv_i32
+; CHECK: lsl {{w[0-9]*}}, w0, w1
+define zeroext i32 @lslv_i32(i32 %a, i32 %b) {
+ %1 = shl i32 %a, %b
+ ret i32 %1
+}
+
+; CHECK-LABEL: lsl_i32
+; CHECK: lsl {{w[0-9]*}}, {{w[0-9]*}}, #16
+define zeroext i32 @lsl_i32(i32 %a) {
+ %1 = shl i32 %a, 16
+ ret i32 %1
+}
+
+; CHECK-LABEL: lsl_zext_i32_i64
+; CHECK: ubfiz {{x[0-9]+}}, {{x[0-9]+}}, #16, #32
+define i64 @lsl_zext_i32_i64(i32 %b) {
+ %1 = zext i32 %b to i64
+ %2 = shl i64 %1, 16
+ ret i64 %2
+}
+
+; CHECK-LABEL: lsl_sext_i32_i64
+; CHECK: sbfiz {{x[0-9]+}}, {{x[0-9]+}}, #16, #32
+define i64 @lsl_sext_i32_i64(i32 %b) {
+ %1 = sext i32 %b to i64
+ %2 = shl i64 %1, 16
+ ret i64 %2
+}
+
+; CHECK-LABEL: lslv_i64
+; CHECK: lsl {{x[0-9]*}}, x0, x1
+define i64 @lslv_i64(i64 %a, i64 %b) {
+ %1 = shl i64 %a, %b
+ ret i64 %1
+}
+
+; CHECK-LABEL: lsl_i64
+; CHECK: lsl {{x[0-9]*}}, {{x[0-9]*}}, #32
+define i64 @lsl_i64(i64 %a) {
+ %1 = shl i64 %a, 32
+ ret i64 %1
+}
+
+; CHECK-LABEL: lsrv_i8
+; CHECK: and [[REG1:w[0-9]+]], w0, #0xff
+; CHECK-NEXT: and [[REG2:w[0-9]+]], w1, #0xff
+; CHECK-NEXT: lsr [[REG3:w[0-9]+]], [[REG1]], [[REG2]]
+; CHECK-NEXT: and {{w[0-9]+}}, [[REG3]], #0xff
+define zeroext i8 @lsrv_i8(i8 %a, i8 %b) {
+ %1 = lshr i8 %a, %b
+ ret i8 %1
+}
+
+; CHECK-LABEL: lsr_i8
+; CHECK: ubfx {{w[0-9]*}}, {{w[0-9]*}}, #4, #4
+define zeroext i8 @lsr_i8(i8 %a) {
+ %1 = lshr i8 %a, 4
+ ret i8 %1
+}
+
+; CHECK-LABEL: lsr_zext_i8_i16
+; CHECK: ubfx {{w[0-9]*}}, {{w[0-9]*}}, #4, #4
+define zeroext i16 @lsr_zext_i8_i16(i8 %b) {
+ %1 = zext i8 %b to i16
+ %2 = lshr i16 %1, 4
+ ret i16 %2
+}
+
+; CHECK-LABEL: lsr_sext_i8_i16
+; CHECK: sxtb [[REG:w[0-9]+]], w0
+; CHECK-NEXT: ubfx {{w[0-9]*}}, [[REG]], #4, #12
+define signext i16 @lsr_sext_i8_i16(i8 %b) {
+ %1 = sext i8 %b to i16
+ %2 = lshr i16 %1, 4
+ ret i16 %2
+}
+
+; CHECK-LABEL: lsr_zext_i8_i32
+; CHECK: ubfx {{w[0-9]*}}, {{w[0-9]*}}, #4, #4
+define i32 @lsr_zext_i8_i32(i8 %b) {
+ %1 = zext i8 %b to i32
+ %2 = lshr i32 %1, 4
+ ret i32 %2
+}
+
+; CHECK-LABEL: lsr_sext_i8_i32
+; CHECK: sxtb [[REG:w[0-9]+]], w0
+; CHECK-NEXT: lsr {{w[0-9]*}}, [[REG]], #4
+define i32 @lsr_sext_i8_i32(i8 %b) {
+ %1 = sext i8 %b to i32
+ %2 = lshr i32 %1, 4
+ ret i32 %2
+}
+
+; CHECK-LABEL: lsrv_i16
+; CHECK: and [[REG1:w[0-9]+]], w0, #0xffff
+; CHECK-NEXT: and [[REG2:w[0-9]+]], w1, #0xffff
+; CHECK-NEXT: lsr [[REG3:w[0-9]+]], [[REG1]], [[REG2]]
+; CHECK-NEXT: and {{w[0-9]+}}, [[REG3]], #0xffff
+define zeroext i16 @lsrv_i16(i16 %a, i16 %b) {
+ %1 = lshr i16 %a, %b
+ ret i16 %1
+}
+
+; CHECK-LABEL: lsr_i16
+; CHECK: ubfx {{w[0-9]*}}, {{w[0-9]*}}, #8, #8
+define zeroext i16 @lsr_i16(i16 %a) {
+ %1 = lshr i16 %a, 8
+ ret i16 %1
+}
+
+; CHECK-LABEL: lsrv_i32
+; CHECK: lsr {{w[0-9]*}}, w0, w1
+define zeroext i32 @lsrv_i32(i32 %a, i32 %b) {
+ %1 = lshr i32 %a, %b
+ ret i32 %1
+}
+
+; CHECK-LABEL: lsr_i32
+; CHECK: lsr {{w[0-9]*}}, {{w[0-9]*}}, #16
+define zeroext i32 @lsr_i32(i32 %a) {
+ %1 = lshr i32 %a, 16
+ ret i32 %1
+}
+
+; CHECK-LABEL: lsrv_i64
+; CHECK: lsr {{x[0-9]*}}, x0, x1
+define i64 @lsrv_i64(i64 %a, i64 %b) {
+ %1 = lshr i64 %a, %b
+ ret i64 %1
+}
+
+; CHECK-LABEL: lsr_i64
+; CHECK: lsr {{x[0-9]*}}, {{x[0-9]*}}, #32
+define i64 @lsr_i64(i64 %a) {
+ %1 = lshr i64 %a, 32
+ ret i64 %1
+}
+
+; CHECK-LABEL: asrv_i8
+; CHECK: sxtb [[REG1:w[0-9]+]], w0
+; CHECK-NEXT: and [[REG2:w[0-9]+]], w1, #0xff
+; CHECK-NEXT: asr [[REG3:w[0-9]+]], [[REG1]], [[REG2]]
+; CHECK-NEXT: and {{w[0-9]+}}, [[REG3]], #0xff
+define zeroext i8 @asrv_i8(i8 %a, i8 %b) {
+ %1 = ashr i8 %a, %b
+ ret i8 %1
+}
+
+; CHECK-LABEL: asr_i8
+; CHECK: sbfx {{w[0-9]*}}, {{w[0-9]*}}, #4, #4
+define zeroext i8 @asr_i8(i8 %a) {
+ %1 = ashr i8 %a, 4
+ ret i8 %1
+}
+
+; CHECK-LABEL: asr_zext_i8_i16
+; CHECK: ubfx {{w[0-9]*}}, {{w[0-9]*}}, #4, #4
+define zeroext i16 @asr_zext_i8_i16(i8 %b) {
+ %1 = zext i8 %b to i16
+ %2 = ashr i16 %1, 4
+ ret i16 %2
+}
+
+; CHECK-LABEL: asr_sext_i8_i16
+; CHECK: sbfx {{w[0-9]*}}, {{w[0-9]*}}, #4, #4
+define signext i16 @asr_sext_i8_i16(i8 %b) {
+ %1 = sext i8 %b to i16
+ %2 = ashr i16 %1, 4
+ ret i16 %2
+}
+
+; CHECK-LABEL: asr_zext_i8_i32
+; CHECK: ubfx {{w[0-9]*}}, {{w[0-9]*}}, #4, #4
+define i32 @asr_zext_i8_i32(i8 %b) {
+ %1 = zext i8 %b to i32
+ %2 = ashr i32 %1, 4
+ ret i32 %2
+}
+
+; CHECK-LABEL: asr_sext_i8_i32
+; CHECK: sbfx {{w[0-9]*}}, {{w[0-9]*}}, #4, #4
+define i32 @asr_sext_i8_i32(i8 %b) {
+ %1 = sext i8 %b to i32
+ %2 = ashr i32 %1, 4
+ ret i32 %2
+}
+
+; CHECK-LABEL: asrv_i16
+; CHECK: sxth [[REG1:w[0-9]+]], w0
+; CHECK-NEXT: and [[REG2:w[0-9]+]], w1, #0xffff
+; CHECK-NEXT: asr [[REG3:w[0-9]+]], [[REG1]], [[REG2]]
+; CHECK-NEXT: and {{w[0-9]+}}, [[REG3]], #0xffff
+define zeroext i16 @asrv_i16(i16 %a, i16 %b) {
+ %1 = ashr i16 %a, %b
+ ret i16 %1
+}
+
+; CHECK-LABEL: asr_i16
+; CHECK: sbfx {{w[0-9]*}}, {{w[0-9]*}}, #8, #8
+define zeroext i16 @asr_i16(i16 %a) {
+ %1 = ashr i16 %a, 8
+ ret i16 %1
+}
+
+; CHECK-LABEL: asrv_i32
+; CHECK: asr {{w[0-9]*}}, w0, w1
+define zeroext i32 @asrv_i32(i32 %a, i32 %b) {
+ %1 = ashr i32 %a, %b
+ ret i32 %1
+}
+
+; CHECK-LABEL: asr_i32
+; CHECK: asr {{w[0-9]*}}, {{w[0-9]*}}, #16
+define zeroext i32 @asr_i32(i32 %a) {
+ %1 = ashr i32 %a, 16
+ ret i32 %1
+}
+
+; CHECK-LABEL: asrv_i64
+; CHECK: asr {{x[0-9]*}}, x0, x1
+define i64 @asrv_i64(i64 %a, i64 %b) {
+ %1 = ashr i64 %a, %b
+ ret i64 %1
+}
+
+; CHECK-LABEL: asr_i64
+; CHECK: asr {{x[0-9]*}}, {{x[0-9]*}}, #32
+define i64 @asr_i64(i64 %a) {
+ %1 = ashr i64 %a, 32
+ ret i64 %1
+}
+
+; CHECK-LABEL: shift_test1
+; CHECK: ubfiz {{w[0-9]*}}, {{w[0-9]*}}, #4, #4
+; CHECK-NEXT: sbfx {{w[0-9]*}}, {{w[0-9]*}}, #4, #4
+define i32 @shift_test1(i8 %a) {
+ %1 = shl i8 %a, 4
+ %2 = ashr i8 %1, 4
+ %3 = sext i8 %2 to i32
+ ret i32 %3
+}
+
+; Test that shifts by zero are folded away entirely; in the *_zext_zero cases
+; only the zero-extension survives.
+
+; CHECK-LABEL: shl_zero
+; CHECK-NOT: lsl
+define i32 @shl_zero(i32 %a) {
+ %1 = shl i32 %a, 0
+ ret i32 %1
+}
+
+; CHECK-LABEL: lshr_zero
+; CHECK-NOT: lsr
+define i32 @lshr_zero(i32 %a) {
+ %1 = lshr i32 %a, 0
+ ret i32 %1
+}
+
+; CHECK-LABEL: ashr_zero
+; CHECK-NOT: asr
+define i32 @ashr_zero(i32 %a) {
+ %1 = ashr i32 %a, 0
+ ret i32 %1
+}
+
+; CHECK-LABEL: shl_zext_zero
+; CHECK: ubfx x0, x0, #0, #32
+define i64 @shl_zext_zero(i32 %a) {
+ %1 = zext i32 %a to i64
+ %2 = shl i64 %1, 0
+ ret i64 %2
+}
+
+; CHECK-LABEL: lshr_zext_zero
+; CHECK: ubfx x0, x0, #0, #32
+define i64 @lshr_zext_zero(i32 %a) {
+ %1 = zext i32 %a to i64
+ %2 = lshr i64 %1, 0
+ ret i64 %2
+}
+
+; CHECK-LABEL: ashr_zext_zero
+; CHECK: ubfx x0, x0, #0, #32
+define i64 @ashr_zext_zero(i32 %a) {
+ %1 = zext i32 %a to i64
+ %2 = ashr i64 %1, 0
+ ret i64 %2
+}
+
diff --git a/test/CodeGen/AArch64/fast-isel-sqrt.ll b/test/CodeGen/AArch64/fast-isel-sqrt.ll
new file mode 100644
index 0000000..1331d5c
--- /dev/null
+++ b/test/CodeGen/AArch64/fast-isel-sqrt.ll
@@ -0,0 +1,20 @@
+; RUN: llc -mtriple=arm64-apple-darwin -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=arm64-apple-darwin -fast-isel -fast-isel-abort -verify-machineinstrs < %s | FileCheck %s
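+
+; Test that both SelectionDAG and FastISel lower llvm.sqrt.f32/f64 to a
+; single fsqrt instruction.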
+
+define float @test_sqrt_f32(float %a) {
+; CHECK-LABEL: test_sqrt_f32
+; CHECK: fsqrt s0, s0
+ %res = call float @llvm.sqrt.f32(float %a)
+ ret float %res
+}
+declare float @llvm.sqrt.f32(float) nounwind readnone
+
+define double @test_sqrt_f64(double %a) {
+; CHECK-LABEL: test_sqrt_f64
+; CHECK: fsqrt d0, d0
+ %res = call double @llvm.sqrt.f64(double %a)
+ ret double %res
+}
+declare double @llvm.sqrt.f64(double) nounwind readnone
+
diff --git a/test/CodeGen/AArch64/fast-isel-switch-phi.ll b/test/CodeGen/AArch64/fast-isel-switch-phi.ll
new file mode 100644
index 0000000..c4f871c
--- /dev/null
+++ b/test/CodeGen/AArch64/fast-isel-switch-phi.ll
@@ -0,0 +1,25 @@
+; RUN: llc -mtriple=aarch64-apple-darwin -fast-isel -verify-machineinstrs < %s
+
+; Test that the Machine Instruction PHI node doesn't have more than one operand
+; from the same predecessor.
+define i32 @foo(i32 %a, i32 %b, i1 %c) {
+entry:
+ br i1 %c, label %switch, label %direct
+
+switch:
+ switch i32 %a, label %exit [
+ i32 43, label %continue
+ i32 45, label %continue
+ ]
+
+direct:
+ %var = add i32 %b, 1
+ br label %continue
+
+continue:
+ %var.phi = phi i32 [ %var, %direct ], [ 0, %switch ], [ 0, %switch ]
+ ret i32 %var.phi
+
+exit:
+ ret i32 1
+}
diff --git a/test/CodeGen/AArch64/fast-isel-tbz.ll b/test/CodeGen/AArch64/fast-isel-tbz.ll
new file mode 100644
index 0000000..d7f46b2
--- /dev/null
+++ b/test/CodeGen/AArch64/fast-isel-tbz.ll
@@ -0,0 +1,141 @@
+; RUN: llc -aarch64-atomic-cfg-tidy=0 -verify-machineinstrs -mtriple=aarch64-apple-darwin < %s | FileCheck %s
+; RUN: llc -fast-isel -fast-isel-abort -aarch64-atomic-cfg-tidy=0 -verify-machineinstrs -mtriple=aarch64-apple-darwin < %s | FileCheck %s
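+
+; Test that a single-bit 'and' mask feeding an eq/ne compare against zero is
+; combined into a tbz/tbnz on that bit.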
+
+define i32 @icmp_eq_i8(i8 zeroext %a) {
+; CHECK-LABEL: icmp_eq_i8
+; CHECK: tbz {{w[0-9]+}}, #0, {{LBB.+_2}}
+ %1 = and i8 %a, 1
+ %2 = icmp eq i8 %1, 0
+ br i1 %2, label %bb1, label %bb2, !prof !0
+bb1:
+ ret i32 1
+bb2:
+ ret i32 0
+}
+
+define i32 @icmp_eq_i16(i16 zeroext %a) {
+; CHECK-LABEL: icmp_eq_i16
+; CHECK: tbz w0, #1, {{LBB.+_2}}
+ %1 = and i16 %a, 2
+ %2 = icmp eq i16 %1, 0
+ br i1 %2, label %bb1, label %bb2, !prof !0
+bb1:
+ ret i32 1
+bb2:
+ ret i32 0
+}
+
+define i32 @icmp_eq_i32(i32 %a) {
+; CHECK-LABEL: icmp_eq_i32
+; CHECK: tbz w0, #2, {{LBB.+_2}}
+ %1 = and i32 %a, 4
+ %2 = icmp eq i32 %1, 0
+ br i1 %2, label %bb1, label %bb2, !prof !0
+bb1:
+ ret i32 1
+bb2:
+ ret i32 0
+}
+
+define i32 @icmp_eq_i64_1(i64 %a) {
+; CHECK-LABEL: icmp_eq_i64_1
+; CHECK: tbz w0, #3, {{LBB.+_2}}
+ %1 = and i64 %a, 8
+ %2 = icmp eq i64 %1, 0
+ br i1 %2, label %bb1, label %bb2, !prof !0
+bb1:
+ ret i32 1
+bb2:
+ ret i32 0
+}
+
+define i32 @icmp_eq_i64_2(i64 %a) {
+; CHECK-LABEL: icmp_eq_i64_2
+; CHECK: tbz x0, #32, {{LBB.+_2}}
+ %1 = and i64 %a, 4294967296
+ %2 = icmp eq i64 %1, 0
+ br i1 %2, label %bb1, label %bb2, !prof !0
+bb1:
+ ret i32 1
+bb2:
+ ret i32 0
+}
+
+define i32 @icmp_ne_i8(i8 zeroext %a) {
+; CHECK-LABEL: icmp_ne_i8
+; CHECK: tbnz w0, #0, {{LBB.+_2}}
+ %1 = and i8 %a, 1
+ %2 = icmp ne i8 %1, 0
+ br i1 %2, label %bb1, label %bb2, !prof !0
+bb1:
+ ret i32 1
+bb2:
+ ret i32 0
+}
+
+define i32 @icmp_ne_i16(i16 zeroext %a) {
+; CHECK-LABEL: icmp_ne_i16
+; CHECK: tbnz w0, #1, {{LBB.+_2}}
+ %1 = and i16 %a, 2
+ %2 = icmp ne i16 %1, 0
+ br i1 %2, label %bb1, label %bb2, !prof !0
+bb1:
+ ret i32 1
+bb2:
+ ret i32 0
+}
+
+define i32 @icmp_ne_i32(i32 %a) {
+; CHECK-LABEL: icmp_ne_i32
+; CHECK: tbnz w0, #2, {{LBB.+_2}}
+ %1 = and i32 %a, 4
+ %2 = icmp ne i32 %1, 0
+ br i1 %2, label %bb1, label %bb2, !prof !0
+bb1:
+ ret i32 1
+bb2:
+ ret i32 0
+}
+
+define i32 @icmp_ne_i64_1(i64 %a) {
+; CHECK-LABEL: icmp_ne_i64_1
+; CHECK: tbnz w0, #3, {{LBB.+_2}}
+ %1 = and i64 %a, 8
+ %2 = icmp ne i64 %1, 0
+ br i1 %2, label %bb1, label %bb2, !prof !0
+bb1:
+ ret i32 1
+bb2:
+ ret i32 0
+}
+
+define i32 @icmp_ne_i64_2(i64 %a) {
+; CHECK-LABEL: icmp_ne_i64_2
+; CHECK: tbnz x0, #32, {{LBB.+_2}}
+ %1 = and i64 %a, 4294967296
+ %2 = icmp ne i64 %1, 0
+ br i1 %2, label %bb1, label %bb2, !prof !0
+bb1:
+ ret i32 1
+bb2:
+ ret i32 0
+}
+
+; Test that the 'and' instruction is not folded into the compare here, since
+; it is defined in a different basic block than the branch that consumes it.
+define i32 @icmp_eq_and_i32(i32 %a, i1 %c) {
+; CHECK-LABEL: icmp_eq_and_i32
+; CHECK: and [[REG:w[0-9]+]], w0, #0x4
+; CHECK-NEXT: cbz [[REG]], {{LBB.+_3}}
+ %1 = and i32 %a, 4
+ br i1 %c, label %bb0, label %bb2
+bb0:
+ %2 = icmp eq i32 %1, 0
+ br i1 %2, label %bb1, label %bb2, !prof !0
+bb1:
+ ret i32 1
+bb2:
+ ret i32 0
+}
+
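+; The branch weights mark the first successor as cold, keeping the block
+; layout (and thus the tbz/tbnz targets checked above) stable.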
+!0 = metadata !{metadata !"branch_weights", i32 0, i32 2147483647}
+!1 = metadata !{metadata !"branch_weights", i32 2147483647, i32 0}
diff --git a/test/CodeGen/AArch64/fast-isel-trunc.ll b/test/CodeGen/AArch64/fast-isel-trunc.ll
new file mode 100644
index 0000000..55937eb
--- /dev/null
+++ b/test/CodeGen/AArch64/fast-isel-trunc.ll
@@ -0,0 +1,12 @@
+; RUN: llc -mtriple=aarch64-apple-darwin -fast-isel -fast-isel-abort -verify-machineinstrs < %s
+
+; Test that %1 doesn't get the kill flag set before its last use.
+define i32 @test_trunc(i32 %a) {
+ %1 = add i32 %a, 1
+ %2 = trunc i32 %1 to i16
+ %3 = icmp ult i16 1, %2
+ %4 = add i32 %1, 1
+ %5 = sext i1 %3 to i32
+ %6 = and i32 %4, %5
+ ret i32 %6
+}
diff --git a/test/CodeGen/AArch64/fast-isel-vector-arithmetic.ll b/test/CodeGen/AArch64/fast-isel-vector-arithmetic.ll
new file mode 100644
index 0000000..eaa0db5
--- /dev/null
+++ b/test/CodeGen/AArch64/fast-isel-vector-arithmetic.ll
@@ -0,0 +1,74 @@
+; RUN: llc -mtriple=aarch64-apple-darwin -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-apple-darwin -fast-isel -fast-isel-abort -fast-isel-abort-args -verify-machineinstrs < %s | FileCheck %s
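+
+; Test that FastISel selects basic vector adds directly; -fast-isel-abort and
+; -fast-isel-abort-args make any fallback to SelectionDAG a test failure.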
+
+; Vector Integer Add
+define <8 x i8> @add_v8i8_rr(<8 x i8> %a, <8 x i8> %b) {
+; CHECK: add_v8i8_rr
+; CHECK: add.8b v0, v0, v1
+ %1 = add <8 x i8> %a, %b
+ ret <8 x i8> %1
+}
+
+define <16 x i8> @add_v16i8_rr(<16 x i8> %a, <16 x i8> %b) {
+; CHECK: add_v16i8_rr
+; CHECK: add.16b v0, v0, v1
+ %1 = add <16 x i8> %a, %b
+ ret <16 x i8> %1
+}
+
+define <4 x i16> @add_v4i16_rr(<4 x i16> %a, <4 x i16> %b) {
+; CHECK: add_v4i16_rr
+; CHECK: add.4h v0, v0, v1
+ %1 = add <4 x i16> %a, %b
+ ret <4 x i16> %1
+}
+
+define <8 x i16> @add_v8i16_rr(<8 x i16> %a, <8 x i16> %b) {
+; CHECK: add_v8i16_rr
+; CHECK: add.8h v0, v0, v1
+ %1 = add <8 x i16> %a, %b
+ ret <8 x i16> %1
+}
+
+define <2 x i32> @add_v2i32_rr(<2 x i32> %a, <2 x i32> %b) {
+; CHECK: add_v2i32_rr
+; CHECK: add.2s v0, v0, v1
+ %1 = add <2 x i32> %a, %b
+ ret <2 x i32> %1
+}
+
+define <4 x i32> @add_v4i32_rr(<4 x i32> %a, <4 x i32> %b) {
+; CHECK: add_v4i32_rr
+; CHECK: add.4s v0, v0, v1
+ %1 = add <4 x i32> %a, %b
+ ret <4 x i32> %1
+}
+
+define <2 x i64> @add_v2i64_rr(<2 x i64> %a, <2 x i64> %b) {
+; CHECK: add_v2i64_rr
+; CHECK: add.2d v0, v0, v1
+ %1 = add <2 x i64> %a, %b
+ ret <2 x i64> %1
+}
+
+; Vector Floating-point Add
+define <2 x float> @add_v2f32_rr(<2 x float> %a, <2 x float> %b) {
+; CHECK: add_v2f32_rr
+; CHECK: fadd.2s v0, v0, v1
+ %1 = fadd <2 x float> %a, %b
+ ret <2 x float> %1
+}
+
+define <4 x float> @add_v4f32_rr(<4 x float> %a, <4 x float> %b) {
+; CHECK: add_v4f32_rr
+; CHECK: fadd.4s v0, v0, v1
+ %1 = fadd <4 x float> %a, %b
+ ret <4 x float> %1
+}
+
+define <2 x double> @add_v2f64_rr(<2 x double> %a, <2 x double> %b) {
+; CHECK: add_v2f64_rr
+; CHECK: fadd.2d v0, v0, v1
+ %1 = fadd <2 x double> %a, %b
+ ret <2 x double> %1
+}
diff --git a/test/CodeGen/AArch64/fast-isel-vret.ll b/test/CodeGen/AArch64/fast-isel-vret.ll
new file mode 100644
index 0000000..9ad9227
--- /dev/null
+++ b/test/CodeGen/AArch64/fast-isel-vret.ll
@@ -0,0 +1,9 @@
+; RUN: llc -mtriple=aarch64-apple-darwin -fast-isel -fast-isel-abort -verify-machineinstrs < %s | FileCheck %s
+
+; Test that fast-isel does not abort on a vector return.
+define <8 x i8> @ret_v8i8(<8 x i8> %a, <8 x i8> %b) {
+; CHECK-LABEL: ret_v8i8
+; CHECK: add.8b v0, v0, v1
+ %1 = add <8 x i8> %a, %b
+ ret <8 x i8> %1
+}
diff --git a/test/CodeGen/AArch64/fp16-instructions.ll b/test/CodeGen/AArch64/fp16-instructions.ll
new file mode 100644
index 0000000..7a44cd1
--- /dev/null
+++ b/test/CodeGen/AArch64/fp16-instructions.ll
@@ -0,0 +1,109 @@
+; RUN: llc < %s -mtriple=aarch64-none-eabi | FileCheck %s
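+
+; The base architecture has no native fp16 arithmetic, so scalar 'half'
+; operations are promoted to single precision (fcvt), computed there, and
+; truncated back.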
+
+define half @add_h(half %a, half %b) {
+entry:
+; CHECK-LABEL: add_h:
+; CHECK-DAG: fcvt [[OP1:s[0-9]+]], h0
+; CHECK-DAG: fcvt [[OP2:s[0-9]+]], h1
+; CHECK: fadd [[RES:s[0-9]+]], [[OP1]], [[OP2]]
+; CHECK: fcvt h0, [[RES]]
+ %0 = fadd half %a, %b
+ ret half %0
+}
+
+
+define half @sub_h(half %a, half %b) {
+entry:
+; CHECK-LABEL: sub_h:
+; CHECK-DAG: fcvt [[OP1:s[0-9]+]], h0
+; CHECK-DAG: fcvt [[OP2:s[0-9]+]], h1
+; CHECK: fsub [[RES:s[0-9]+]], [[OP1]], [[OP2]]
+; CHECK: fcvt h0, [[RES]]
+ %0 = fsub half %a, %b
+ ret half %0
+}
+
+
+define half @mul_h(half %a, half %b) {
+entry:
+; CHECK-LABEL: mul_h:
+; CHECK-DAG: fcvt [[OP1:s[0-9]+]], h0
+; CHECK-DAG: fcvt [[OP2:s[0-9]+]], h1
+; CHECK: fmul [[RES:s[0-9]+]], [[OP1]], [[OP2]]
+; CHECK: fcvt h0, [[RES]]
+ %0 = fmul half %a, %b
+ ret half %0
+}
+
+
+define half @div_h(half %a, half %b) {
+entry:
+; CHECK-LABEL: div_h:
+; CHECK-DAG: fcvt [[OP1:s[0-9]+]], h0
+; CHECK-DAG: fcvt [[OP2:s[0-9]+]], h1
+; CHECK: fdiv [[RES:s[0-9]+]], [[OP1]], [[OP2]]
+; CHECK: fcvt h0, [[RES]]
+ %0 = fdiv half %a, %b
+ ret half %0
+}
+
+
+define half @load_h(half* %a) {
+entry:
+; CHECK-LABEL: load_h:
+; CHECK: ldr h0, [x0]
+ %0 = load half* %a, align 4
+ ret half %0
+}
+
+
+define void @store_h(half* %a, half %b) {
+entry:
+; CHECK-LABEL: store_h:
+; CHECK: str h0, [x0]
+ store half %b, half* %a, align 4
+ ret void
+}
+
+define half @s_to_h(float %a) {
+; CHECK-LABEL: s_to_h:
+; CHECK: fcvt h0, s0
+ %1 = fptrunc float %a to half
+ ret half %1
+}
+
+define half @d_to_h(double %a) {
+; CHECK-LABEL: d_to_h:
+; CHECK: fcvt h0, d0
+ %1 = fptrunc double %a to half
+ ret half %1
+}
+
+define float @h_to_s(half %a) {
+; CHECK-LABEL: h_to_s:
+; CHECK: fcvt s0, h0
+ %1 = fpext half %a to float
+ ret float %1
+}
+
+define double @h_to_d(half %a) {
+; CHECK-LABEL: h_to_d:
+; CHECK: fcvt d0, h0
+ %1 = fpext half %a to double
+ ret double %1
+}
+
+define half @bitcast_i_to_h(i16 %a) {
+; CHECK-LABEL: bitcast_i_to_h:
+; CHECK: fmov s0, w0
+ %1 = bitcast i16 %a to half
+ ret half %1
+}
+
+
+define i16 @bitcast_h_to_i(half %a) {
+; CHECK-LABEL: bitcast_h_to_i:
+; CHECK: fmov w0, s0
+ %1 = bitcast half %a to i16
+ ret i16 %1
+}
diff --git a/test/CodeGen/AArch64/fp16-v4-instructions.ll b/test/CodeGen/AArch64/fp16-v4-instructions.ll
new file mode 100644
index 0000000..8e89681
--- /dev/null
+++ b/test/CodeGen/AArch64/fp16-v4-instructions.ll
@@ -0,0 +1,122 @@
+; RUN: llc < %s -mtriple=aarch64-none-eabi | FileCheck %s
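+
+; v4f16 arithmetic is promoted to v4f32 via fcvtl/fcvtn, since the base
+; architecture has no fp16 vector arithmetic instructions.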
+
+define <4 x half> @add_h(<4 x half> %a, <4 x half> %b) {
+entry:
+; CHECK-LABEL: add_h:
+; CHECK-DAG: fcvtl [[OP1:v[0-9]+\.4s]], v0.4h
+; CHECK-DAG: fcvtl [[OP2:v[0-9]+\.4s]], v1.4h
+; CHECK: fadd [[RES:v[0-9]+.4s]], [[OP1]], [[OP2]]
+; CHECK: fcvtn v0.4h, [[RES]]
+ %0 = fadd <4 x half> %a, %b
+ ret <4 x half> %0
+}
+
+
+define <4 x half> @sub_h(<4 x half> %a, <4 x half> %b) {
+entry:
+; CHECK-LABEL: sub_h:
+; CHECK-DAG: fcvtl [[OP1:v[0-9]+\.4s]], v0.4h
+; CHECK-DAG: fcvtl [[OP2:v[0-9]+\.4s]], v1.4h
+; CHECK: fsub [[RES:v[0-9]+.4s]], [[OP1]], [[OP2]]
+; CHECK: fcvtn v0.4h, [[RES]]
+ %0 = fsub <4 x half> %a, %b
+ ret <4 x half> %0
+}
+
+
+define <4 x half> @mul_h(<4 x half> %a, <4 x half> %b) {
+entry:
+; CHECK-LABEL: mul_h:
+; CHECK-DAG: fcvtl [[OP1:v[0-9]+\.4s]], v0.4h
+; CHECK-DAG: fcvtl [[OP2:v[0-9]+\.4s]], v1.4h
+; CHECK: fmul [[RES:v[0-9]+.4s]], [[OP1]], [[OP2]]
+; CHECK: fcvtn v0.4h, [[RES]]
+ %0 = fmul <4 x half> %a, %b
+ ret <4 x half> %0
+}
+
+
+define <4 x half> @div_h(<4 x half> %a, <4 x half> %b) {
+entry:
+; CHECK-LABEL: div_h:
+; CHECK-DAG: fcvtl [[OP1:v[0-9]+\.4s]], v0.4h
+; CHECK-DAG: fcvtl [[OP2:v[0-9]+\.4s]], v1.4h
+; CHECK: fdiv [[RES:v[0-9]+.4s]], [[OP1]], [[OP2]]
+; CHECK: fcvtn v0.4h, [[RES]]
+ %0 = fdiv <4 x half> %a, %b
+ ret <4 x half> %0
+}
+
+
+define <4 x half> @load_h(<4 x half>* %a) {
+entry:
+; CHECK-LABEL: load_h:
+; CHECK: ldr d0, [x0]
+ %0 = load <4 x half>* %a, align 4
+ ret <4 x half> %0
+}
+
+
+define void @store_h(<4 x half>* %a, <4 x half> %b) {
+entry:
+; CHECK-LABEL: store_h:
+; CHECK: str d0, [x0]
+ store <4 x half> %b, <4 x half>* %a, align 4
+ ret void
+}
+
+define <4 x half> @s_to_h(<4 x float> %a) {
+; CHECK-LABEL: s_to_h:
+; CHECK: fcvtn v0.4h, v0.4s
+ %1 = fptrunc <4 x float> %a to <4 x half>
+ ret <4 x half> %1
+}
+
+define <4 x half> @d_to_h(<4 x double> %a) {
+; CHECK-LABEL: d_to_h:
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: ins
+; CHECK-DAG: ins
+; CHECK-DAG: ins
+; CHECK-DAG: ins
+ %1 = fptrunc <4 x double> %a to <4 x half>
+ ret <4 x half> %1
+}
+
+define <4 x float> @h_to_s(<4 x half> %a) {
+; CHECK-LABEL: h_to_s:
+; CHECK: fcvtl v0.4s, v0.4h
+ %1 = fpext <4 x half> %a to <4 x float>
+ ret <4 x float> %1
+}
+
+define <4 x double> @h_to_d(<4 x half> %a) {
+; CHECK-LABEL: h_to_d:
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: ins
+; CHECK-DAG: ins
+; CHECK-DAG: ins
+; CHECK-DAG: ins
+ %1 = fpext <4 x half> %a to <4 x double>
+ ret <4 x double> %1
+}
+
+define <4 x half> @bitcast_i_to_h(float, <4 x i16> %a) {
+; CHECK-LABEL: bitcast_i_to_h:
+; CHECK: mov v0.16b, v1.16b
+ %2 = bitcast <4 x i16> %a to <4 x half>
+ ret <4 x half> %2
+}
+
+define <4 x i16> @bitcast_h_to_i(float, <4 x half> %a) {
+; CHECK-LABEL: bitcast_h_to_i:
+; CHECK: mov v0.16b, v1.16b
+ %2 = bitcast <4 x half> %a to <4 x i16>
+ ret <4 x i16> %2
+}
diff --git a/test/CodeGen/AArch64/fp16-v8-instructions.ll b/test/CodeGen/AArch64/fp16-v8-instructions.ll
new file mode 100644
index 0000000..9ee2296
--- /dev/null
+++ b/test/CodeGen/AArch64/fp16-v8-instructions.ll
@@ -0,0 +1,255 @@
+; RUN: llc < %s -mtriple=aarch64-none-eabi | FileCheck %s
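+
+; v8f16 arithmetic is currently scalarized: each lane is converted to single
+; precision, operated on, and converted back, hence the long fcvt sequences.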
+
+define <8 x half> @add_h(<8 x half> %a, <8 x half> %b) {
+entry:
+; CHECK-LABEL: add_h:
+; CHECK: fcvt
+; CHECK: fcvt
+; CHECK-DAG: fadd
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: fadd
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: fadd
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: fadd
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: fadd
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: fadd
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: fadd
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: fadd
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK: fcvt
+ %0 = fadd <8 x half> %a, %b
+ ret <8 x half> %0
+}
+
+
+define <8 x half> @sub_h(<8 x half> %a, <8 x half> %b) {
+entry:
+; CHECK-LABEL: sub_h:
+; CHECK: fcvt
+; CHECK: fcvt
+; CHECK-DAG: fsub
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: fsub
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: fsub
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: fsub
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: fsub
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: fsub
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: fsub
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: fsub
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK: fcvt
+ %0 = fsub <8 x half> %a, %b
+ ret <8 x half> %0
+}
+
+
+define <8 x half> @mul_h(<8 x half> %a, <8 x half> %b) {
+entry:
+; CHECK-LABEL: mul_h:
+; CHECK: fcvt
+; CHECK: fcvt
+; CHECK-DAG: fmul
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: fmul
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: fmul
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: fmul
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: fmul
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: fmul
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: fmul
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: fmul
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK: fcvt
+ %0 = fmul <8 x half> %a, %b
+ ret <8 x half> %0
+}
+
+
+define <8 x half> @div_h(<8 x half> %a, <8 x half> %b) {
+entry:
+; CHECK-LABEL: div_h:
+; CHECK: fcvt
+; CHECK: fcvt
+; CHECK-DAG: fdiv
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: fdiv
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: fdiv
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: fdiv
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: fdiv
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: fdiv
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: fdiv
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: fdiv
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK: fcvt
+ %0 = fdiv <8 x half> %a, %b
+ ret <8 x half> %0
+}
+
+
+define <8 x half> @load_h(<8 x half>* %a) {
+entry:
+; CHECK-LABEL: load_h:
+; CHECK: ldr q0, [x0]
+ %0 = load <8 x half>* %a, align 4
+ ret <8 x half> %0
+}
+
+
+define void @store_h(<8 x half>* %a, <8 x half> %b) {
+entry:
+; CHECK-LABEL: store_h:
+; CHECK: str q0, [x0]
+ store <8 x half> %b, <8 x half>* %a, align 4
+ ret void
+}
+
+define <8 x half> @s_to_h(<8 x float> %a) {
+; CHECK-LABEL: s_to_h:
+; CHECK-DAG: fcvtn v0.4h, v0.4s
+; CHECK-DAG: fcvtn [[REG:v[0-9]+]].4h, v1.4s
+; CHECK: ins v0.d[1], [[REG]].d[0]
+ %1 = fptrunc <8 x float> %a to <8 x half>
+ ret <8 x half> %1
+}
+
+define <8 x half> @d_to_h(<8 x double> %a) {
+; CHECK-LABEL: d_to_h:
+; CHECK-DAG: ins v{{[0-9]+}}.d
+; CHECK-DAG: ins v{{[0-9]+}}.d
+; CHECK-DAG: ins v{{[0-9]+}}.d
+; CHECK-DAG: ins v{{[0-9]+}}.d
+; CHECK-DAG: fcvt h
+; CHECK-DAG: fcvt h
+; CHECK-DAG: fcvt h
+; CHECK-DAG: fcvt h
+; CHECK-DAG: fcvt h
+; CHECK-DAG: fcvt h
+; CHECK-DAG: fcvt h
+; CHECK-DAG: fcvt h
+; CHECK-DAG: ins v{{[0-9]+}}.h
+; CHECK-DAG: ins v{{[0-9]+}}.h
+; CHECK-DAG: ins v{{[0-9]+}}.h
+; CHECK-DAG: ins v{{[0-9]+}}.h
+; CHECK-DAG: ins v{{[0-9]+}}.h
+; CHECK-DAG: ins v{{[0-9]+}}.h
+; CHECK-DAG: ins v{{[0-9]+}}.h
+; CHECK-DAG: ins v{{[0-9]+}}.h
+ %1 = fptrunc <8 x double> %a to <8 x half>
+ ret <8 x half> %1
+}
+
+define <8 x float> @h_to_s(<8 x half> %a) {
+; CHECK-LABEL: h_to_s:
+; CHECK: fcvtl2 v1.4s, v0.8h
+; CHECK: fcvtl v0.4s, v0.4h
+ %1 = fpext <8 x half> %a to <8 x float>
+ ret <8 x float> %1
+}
+
+define <8 x double> @h_to_d(<8 x half> %a) {
+; CHECK-LABEL: h_to_d:
+; CHECK-DAG: fcvt d
+; CHECK-DAG: fcvt d
+; CHECK-DAG: fcvt d
+; CHECK-DAG: fcvt d
+; CHECK-DAG: fcvt d
+; CHECK-DAG: fcvt d
+; CHECK-DAG: fcvt d
+; CHECK-DAG: fcvt d
+; CHECK-DAG: ins
+; CHECK-DAG: ins
+; CHECK-DAG: ins
+; CHECK-DAG: ins
+ %1 = fpext <8 x half> %a to <8 x double>
+ ret <8 x double> %1
+}
+
+
+define <8 x half> @bitcast_i_to_h(float, <8 x i16> %a) {
+; CHECK-LABEL: bitcast_i_to_h:
+; CHECK: mov v0.16b, v1.16b
+ %2 = bitcast <8 x i16> %a to <8 x half>
+ ret <8 x half> %2
+}
+
+define <8 x i16> @bitcast_h_to_i(float, <8 x half> %a) {
+; CHECK-LABEL: bitcast_h_to_i:
+; CHECK: mov v0.16b, v1.16b
+ %2 = bitcast <8 x half> %a to <8 x i16>
+ ret <8 x i16> %2
+}
+
diff --git a/test/CodeGen/AArch64/fp16-vector-bitcast.ll b/test/CodeGen/AArch64/fp16-vector-bitcast.ll
new file mode 100644
index 0000000..421a4f5
--- /dev/null
+++ b/test/CodeGen/AArch64/fp16-vector-bitcast.ll
@@ -0,0 +1,203 @@
+; RUN: llc < %s -mtriple=aarch64-none-eabi | FileCheck %s
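+
+; Bitcasts between half vectors and other 64-/128-bit types should be free.
+; The unused leading float argument occupies v0, so %a arrives in v1 and the
+; expected register copy (mov v0.16b, v1.16b) is visible in the output.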
+
+define <4 x i16> @v4f16_to_v4i16(float, <4 x half> %a) #0 {
+; CHECK-LABEL: v4f16_to_v4i16:
+; CHECK: mov v0.16b, v1.16b
+entry:
+ %1 = bitcast <4 x half> %a to <4 x i16>
+ ret <4 x i16> %1
+}
+
+define <2 x i32> @v4f16_to_v2i32(float, <4 x half> %a) #0 {
+; CHECK-LABEL: v4f16_to_v2i32:
+; CHECK: mov v0.16b, v1.16b
+entry:
+ %1 = bitcast <4 x half> %a to <2 x i32>
+ ret <2 x i32> %1
+}
+
+define <1 x i64> @v4f16_to_v1i64(float, <4 x half> %a) #0 {
+; CHECK-LABEL: v4f16_to_v1i64:
+; CHECK: mov v0.16b, v1.16b
+entry:
+ %1 = bitcast <4 x half> %a to <1 x i64>
+ ret <1 x i64> %1
+}
+
+define i64 @v4f16_to_i64(float, <4 x half> %a) #0 {
+; CHECK-LABEL: v4f16_to_i64:
+; CHECK: fmov x0, d1
+entry:
+ %1 = bitcast <4 x half> %a to i64
+ ret i64 %1
+}
+
+define <2 x float> @v4f16_to_v2float(float, <4 x half> %a) #0 {
+; CHECK-LABEL: v4f16_to_v2float:
+; CHECK: mov v0.16b, v1.16b
+entry:
+ %1 = bitcast <4 x half> %a to <2 x float>
+ ret <2 x float> %1
+}
+
+define <1 x double> @v4f16_to_v1double(float, <4 x half> %a) #0 {
+; CHECK-LABEL: v4f16_to_v1double:
+; CHECK: mov v0.16b, v1.16b
+entry:
+ %1 = bitcast <4 x half> %a to <1 x double>
+ ret <1 x double> %1
+}
+
+define double @v4f16_to_double(float, <4 x half> %a) #0 {
+; CHECK-LABEL: v4f16_to_double:
+; CHECK: mov v0.16b, v1.16b
+entry:
+ %1 = bitcast <4 x half> %a to double
+ ret double %1
+}
+
+
+define <4 x half> @v4i16_to_v4f16(float, <4 x i16> %a) #0 {
+; CHECK-LABEL: v4i16_to_v4f16:
+; CHECK: mov v0.16b, v1.16b
+entry:
+ %1 = bitcast <4 x i16> %a to <4 x half>
+ ret <4 x half> %1
+}
+
+define <4 x half> @v2i32_to_v4f16(float, <2 x i32> %a) #0 {
+; CHECK-LABEL: v2i32_to_v4f16:
+; CHECK: mov v0.16b, v1.16b
+entry:
+ %1 = bitcast <2 x i32> %a to <4 x half>
+ ret <4 x half> %1
+}
+
+define <4 x half> @v1i64_to_v4f16(float, <1 x i64> %a) #0 {
+; CHECK-LABEL: v1i64_to_v4f16:
+; CHECK: mov v0.16b, v1.16b
+entry:
+ %1 = bitcast <1 x i64> %a to <4 x half>
+ ret <4 x half> %1
+}
+
+define <4 x half> @i64_to_v4f16(float, i64 %a) #0 {
+; CHECK-LABEL: i64_to_v4f16:
+; CHECK: fmov d0, x0
+entry:
+ %1 = bitcast i64 %a to <4 x half>
+ ret <4 x half> %1
+}
+
+define <4 x half> @v2float_to_v4f16(float, <2 x float> %a) #0 {
+; CHECK-LABEL: v2float_to_v4f16:
+; CHECK: mov v0.16b, v1.16b
+entry:
+ %1 = bitcast <2 x float> %a to <4 x half>
+ ret <4 x half> %1
+}
+
+define <4 x half> @v1double_to_v4f16(float, <1 x double> %a) #0 {
+; CHECK-LABEL: v1double_to_v4f16:
+; CHECK: mov v0.16b, v1.16b
+entry:
+ %1 = bitcast <1 x double> %a to <4 x half>
+ ret <4 x half> %1
+}
+
+define <4 x half> @double_to_v4f16(float, double %a) #0 {
+; CHECK-LABEL: double_to_v4f16:
+; CHECK: mov v0.16b, v1.16b
+entry:
+ %1 = bitcast double %a to <4 x half>
+ ret <4 x half> %1
+}
+
+
+define <8 x i16> @v8f16_to_v8i16(float, <8 x half> %a) #0 {
+; CHECK-LABEL: v8f16_to_v8i16:
+; CHECK: mov v0.16b, v1.16b
+entry:
+ %1 = bitcast <8 x half> %a to <8 x i16>
+ ret <8 x i16> %1
+}
+
+define <4 x i32> @v8f16_to_v4i32(float, <8 x half> %a) #0 {
+; CHECK-LABEL: v8f16_to_v4i32:
+; CHECK: mov v0.16b, v1.16b
+entry:
+ %1 = bitcast <8 x half> %a to <4 x i32>
+ ret <4 x i32> %1
+}
+
+define <2 x i64> @v8f16_to_v2i64(float, <8 x half> %a) #0 {
+; CHECK-LABEL: v8f16_to_v2i64:
+; CHECK: mov v0.16b, v1.16b
+entry:
+ %1 = bitcast <8 x half> %a to <2 x i64>
+ ret <2 x i64> %1
+}
+
+define <4 x float> @v8f16_to_v4float(float, <8 x half> %a) #0 {
+; CHECK-LABEL: v8f16_to_v4float:
+; CHECK: mov v0.16b, v1.16b
+entry:
+ %1 = bitcast <8 x half> %a to <4 x float>
+ ret <4 x float> %1
+}
+
+define <2 x double> @v8f16_to_v2double(float, <8 x half> %a) #0 {
+; CHECK-LABEL: v8f16_to_v2double:
+; CHECK: mov v0.16b, v1.16b
+entry:
+ %1 = bitcast <8 x half> %a to <2 x double>
+ ret <2 x double> %1
+}
+
+define <8 x half> @v8i16_to_v8f16(float, <8 x i16> %a) #0 {
+; CHECK-LABEL: v8i16_to_v8f16:
+; CHECK: mov v0.16b, v1.16b
+entry:
+ %1 = bitcast <8 x i16> %a to <8 x half>
+ ret <8 x half> %1
+}
+
+define <8 x half> @v4i32_to_v8f16(float, <4 x i32> %a) #0 {
+; CHECK-LABEL: v4i32_to_v8f16:
+; CHECK: mov v0.16b, v1.16b
+entry:
+ %1 = bitcast <4 x i32> %a to <8 x half>
+ ret <8 x half> %1
+}
+
+define <8 x half> @v2i64_to_v8f16(float, <2 x i64> %a) #0 {
+; CHECK-LABEL: v2i64_to_v8f16:
+; CHECK: mov v0.16b, v1.16b
+entry:
+ %1 = bitcast <2 x i64> %a to <8 x half>
+ ret <8 x half> %1
+}
+
+define <8 x half> @v4float_to_v8f16(float, <4 x float> %a) #0 {
+; CHECK-LABEL: v4float_to_v8f16:
+; CHECK: mov v0.16b, v1.16b
+entry:
+ %1 = bitcast <4 x float> %a to <8 x half>
+ ret <8 x half> %1
+}
+
+define <8 x half> @v2double_to_v8f16(float, <2 x double> %a) #0 {
+; CHECK-LABEL: v2double_to_v8f16:
+; CHECK: mov v0.16b, v1.16b
+entry:
+ %1 = bitcast <2 x double> %a to <8 x half>
+ ret <8 x half> %1
+}
diff --git a/test/CodeGen/AArch64/fp16-vector-load-store.ll b/test/CodeGen/AArch64/fp16-vector-load-store.ll
new file mode 100644
index 0000000..edbbffe
--- /dev/null
+++ b/test/CodeGen/AArch64/fp16-vector-load-store.ll
@@ -0,0 +1,528 @@
+; RUN: llc < %s -mtriple=aarch64-none-eabi | FileCheck %s
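+
+; Test fp16 vector loads and stores, both through plain IR and through the
+; NEON structured load/store intrinsics (ld2/st2 and friends).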
+
+; Simple load of v4f16
+define <4 x half> @load_64(<4 x half>* nocapture readonly %a) #0 {
+; CHECK-LABEL: load_64:
+; CHECK: ldr d0, [x0]
+entry:
+ %0 = load <4 x half>* %a, align 8
+ ret <4 x half> %0
+}
+
+; Simple load of v8f16
+define <8 x half> @load_128(<8 x half>* nocapture readonly %a) #0 {
+; CHECK-LABEL: load_128:
+; CHECK: ldr q0, [x0]
+entry:
+ %0 = load <8 x half>* %a, align 16
+ ret <8 x half> %0
+}
+
+; Duplicating load to v4f16
+define <4 x half> @load_dup_64(half* nocapture readonly %a) #0 {
+; CHECK-LABEL: load_dup_64:
+; CHECK: ld1r { v0.4h }, [x0]
+entry:
+ %0 = load half* %a, align 2
+ %1 = insertelement <4 x half> undef, half %0, i32 0
+ %2 = shufflevector <4 x half> %1, <4 x half> undef, <4 x i32> zeroinitializer
+ ret <4 x half> %2
+}
+
+; Duplicating load to v8f16
+define <8 x half> @load_dup_128(half* nocapture readonly %a) #0 {
+; CHECK-LABEL: load_dup_128:
+; CHECK: ld1r { v0.8h }, [x0]
+entry:
+ %0 = load half* %a, align 2
+ %1 = insertelement <8 x half> undef, half %0, i32 0
+ %2 = shufflevector <8 x half> %1, <8 x half> undef, <8 x i32> zeroinitializer
+ ret <8 x half> %2
+}
+
+; Load to one lane of v4f16
+define <4 x half> @load_lane_64(half* nocapture readonly %a, <4 x half> %b) #0 {
+; CHECK-LABEL: load_lane_64:
+; CHECK: ld1 { v0.h }[2], [x0]
+entry:
+ %0 = load half* %a, align 2
+ %1 = insertelement <4 x half> %b, half %0, i32 2
+ ret <4 x half> %1
+}
+
+; Load to one lane of v8f16
+define <8 x half> @load_lane_128(half* nocapture readonly %a, <8 x half> %b) #0 {
+; CHECK-LABEL: load_lane_128:
+; CHECK: ld1 { v0.h }[5], [x0]
+entry:
+ %0 = load half* %a, align 2
+ %1 = insertelement <8 x half> %b, half %0, i32 5
+ ret <8 x half> %1
+}
+
+; Simple store of v4f16
+define void @store_64(<4 x half>* nocapture %a, <4 x half> %b) #1 {
+; CHECK-LABEL: store_64:
+; CHECK: str d0, [x0]
+entry:
+ store <4 x half> %b, <4 x half>* %a, align 8
+ ret void
+}
+
+; Simple store of v8f16
+define void @store_128(<8 x half>* nocapture %a, <8 x half> %b) #1 {
+; CHECK-LABEL: store_128:
+; CHECK: str q0, [x0]
+entry:
+ store <8 x half> %b, <8 x half>* %a, align 16
+ ret void
+}
+
+; Store from one lane of v4f16
+define void @store_lane_64(half* nocapture %a, <4 x half> %b) #1 {
+; CHECK-LABEL: store_lane_64:
+; CHECK: st1 { v0.h }[2], [x0]
+entry:
+ %0 = extractelement <4 x half> %b, i32 2
+ store half %0, half* %a, align 2
+ ret void
+}
+
+; Store from one lane of v8f16
+define void @store_lane_128(half* nocapture %a, <8 x half> %b) #1 {
+; CHECK-LABEL: store_lane_128:
+; CHECK: st1 { v0.h }[5], [x0]
+entry:
+ %0 = extractelement <8 x half> %b, i32 5
+ store half %0, half* %a, align 2
+ ret void
+}
+
+; NEON intrinsics - (de-)interleaving loads and stores
+declare { <4 x half>, <4 x half> } @llvm.aarch64.neon.ld2.v4f16.p0v4f16(<4 x half>*)
+declare { <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld3.v4f16.p0v4f16(<4 x half>*)
+declare { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld4.v4f16.p0v4f16(<4 x half>*)
+declare void @llvm.aarch64.neon.st2.v4f16.p0v4f16(<4 x half>, <4 x half>, <4 x half>*)
+declare void @llvm.aarch64.neon.st3.v4f16.p0v4f16(<4 x half>, <4 x half>, <4 x half>, <4 x half>*)
+declare void @llvm.aarch64.neon.st4.v4f16.p0v4f16(<4 x half>, <4 x half>, <4 x half>, <4 x half>, <4 x half>*)
+declare { <8 x half>, <8 x half> } @llvm.aarch64.neon.ld2.v8f16.p0v8f16(<8 x half>*)
+declare { <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld3.v8f16.p0v8f16(<8 x half>*)
+declare { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld4.v8f16.p0v8f16(<8 x half>*)
+declare void @llvm.aarch64.neon.st2.v8f16.p0v8f16(<8 x half>, <8 x half>, <8 x half>*)
+declare void @llvm.aarch64.neon.st3.v8f16.p0v8f16(<8 x half>, <8 x half>, <8 x half>, <8 x half>*)
+declare void @llvm.aarch64.neon.st4.v8f16.p0v8f16(<8 x half>, <8 x half>, <8 x half>, <8 x half>, <8 x half>*)
+
+; Load 2 x v4f16 with de-interleaving
+define { <4 x half>, <4 x half> } @load_interleave_64_2(<4 x half>* %a) #0 {
+; CHECK-LABEL: load_interleave_64_2:
+; CHECK: ld2 { v0.4h, v1.4h }, [x0]
+entry:
+ %0 = tail call { <4 x half>, <4 x half> } @llvm.aarch64.neon.ld2.v4f16.p0v4f16(<4 x half>* %a)
+ ret { <4 x half>, <4 x half> } %0
+}
+
+; Load 3 x v4f16 with de-interleaving
+define { <4 x half>, <4 x half>, <4 x half> } @load_interleave_64_3(<4 x half>* %a) #0 {
+; CHECK-LABEL: load_interleave_64_3:
+; CHECK: ld3 { v0.4h, v1.4h, v2.4h }, [x0]
+entry:
+ %0 = tail call { <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld3.v4f16.p0v4f16(<4 x half>* %a)
+ ret { <4 x half>, <4 x half>, <4 x half> } %0
+}
+
+; Load 4 x v4f16 with de-interleaving
+define { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @load_interleave_64_4(<4 x half>* %a) #0 {
+; CHECK-LABEL: load_interleave_64_4:
+; CHECK: ld4 { v0.4h, v1.4h, v2.4h, v3.4h }, [x0]
+entry:
+ %0 = tail call { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld4.v4f16.p0v4f16(<4 x half>* %a)
+ ret { <4 x half>, <4 x half>, <4 x half>, <4 x half> } %0
+}
+
+; Store 2 x v4f16 with interleaving
+define void @store_interleave_64_2(<4 x half>* %a, <4 x half> %b, <4 x half> %c) #0 {
+; CHECK-LABEL: store_interleave_64_2:
+; CHECK: st2 { v0.4h, v1.4h }, [x0]
+entry:
+ tail call void @llvm.aarch64.neon.st2.v4f16.p0v4f16(<4 x half> %b, <4 x half> %c, <4 x half>* %a)
+ ret void
+}
+
+; Store 3 x v4f16 with interleaving
+define void @store_interleave_64_3(<4 x half>* %a, <4 x half> %b, <4 x half> %c, <4 x half> %d) #0 {
+; CHECK-LABEL: store_interleave_64_3:
+; CHECK: st3 { v0.4h, v1.4h, v2.4h }, [x0]
+entry:
+ tail call void @llvm.aarch64.neon.st3.v4f16.p0v4f16(<4 x half> %b, <4 x half> %c, <4 x half> %d, <4 x half>* %a)
+ ret void
+}
+
+; Store 4 x v4f16 with interleaving
+define void @store_interleave_64_4(<4 x half>* %a, <4 x half> %b, <4 x half> %c, <4 x half> %d, <4 x half> %e) #0 {
+; CHECK-LABEL: store_interleave_64_4:
+; CHECK: st4 { v0.4h, v1.4h, v2.4h, v3.4h }, [x0]
+entry:
+ tail call void @llvm.aarch64.neon.st4.v4f16.p0v4f16(<4 x half> %b, <4 x half> %c, <4 x half> %d, <4 x half> %e, <4 x half>* %a)
+ ret void
+}
+
+; Load 2 x v8f16 with de-interleaving
+define { <8 x half>, <8 x half> } @load_interleave_128_2(<8 x half>* %a) #0 {
+; CHECK-LABEL: load_interleave_128_2:
+; CHECK: ld2 { v0.8h, v1.8h }, [x0]
+entry:
+ %0 = tail call { <8 x half>, <8 x half> } @llvm.aarch64.neon.ld2.v8f16.p0v8f16(<8 x half>* %a)
+ ret { <8 x half>, <8 x half> } %0
+}
+
+; Load 3 x v8f16 with de-interleaving
+define { <8 x half>, <8 x half>, <8 x half> } @load_interleave_128_3(<8 x half>* %a) #0 {
+; CHECK-LABEL: load_interleave_128_3:
+; CHECK: ld3 { v0.8h, v1.8h, v2.8h }, [x0]
+entry:
+ %0 = tail call { <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld3.v8f16.p0v8f16(<8 x half>* %a)
+ ret { <8 x half>, <8 x half>, <8 x half> } %0
+}
+
+; Load 4 x v8f16 with de-interleaving
+define { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @load_interleave_128_4(<8 x half>* %a) #0 {
+; CHECK-LABEL: load_interleave_128_4:
+; CHECK: ld4 { v0.8h, v1.8h, v2.8h, v3.8h }, [x0]
+entry:
+ %0 = tail call { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld4.v8f16.p0v8f16(<8 x half>* %a)
+ ret { <8 x half>, <8 x half>, <8 x half>, <8 x half> } %0
+}
+
+; Store 2 x v8f16 with interleaving
+define void @store_interleave_128_2(<8 x half>* %a, <8 x half> %b, <8 x half> %c) #0 {
+; CHECK-LABEL: store_interleave_128_2:
+; CHECK: st2 { v0.8h, v1.8h }, [x0]
+entry:
+ tail call void @llvm.aarch64.neon.st2.v8f16.p0v8f16(<8 x half> %b, <8 x half> %c, <8 x half>* %a)
+ ret void
+}
+
+; Store 3 x v8f16 with interleaving
+define void @store_interleave_128_3(<8 x half>* %a, <8 x half> %b, <8 x half> %c, <8 x half> %d) #0 {
+; CHECK-LABEL: store_interleave_128_3:
+; CHECK: st3 { v0.8h, v1.8h, v2.8h }, [x0]
+entry:
+ tail call void @llvm.aarch64.neon.st3.v8f16.p0v8f16(<8 x half> %b, <8 x half> %c, <8 x half> %d, <8 x half>* %a)
+ ret void
+}
+
+; Store 4 x v8f16 with interleaving
+define void @store_interleave_128_4(<8 x half>* %a, <8 x half> %b, <8 x half> %c, <8 x half> %d, <8 x half> %e) #0 {
+; CHECK-LABEL: store_interleave_128_4:
+; CHECK: st4 { v0.8h, v1.8h, v2.8h, v3.8h }, [x0]
+entry:
+ tail call void @llvm.aarch64.neon.st4.v8f16.p0v8f16(<8 x half> %b, <8 x half> %c, <8 x half> %d, <8 x half> %e, <8 x half>* %a)
+ ret void
+}
+
+; NEON intrinsics - duplicating loads
+declare { <4 x half>, <4 x half> } @llvm.aarch64.neon.ld2r.v4f16.p0f16(half*)
+declare { <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld3r.v4f16.p0f16(half*)
+declare { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld4r.v4f16.p0f16(half*)
+declare { <8 x half>, <8 x half> } @llvm.aarch64.neon.ld2r.v8f16.p0f16(half*)
+declare { <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld3r.v8f16.p0f16(half*)
+declare { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld4r.v8f16.p0f16(half*)
+
+; Load 2 x v4f16 with duplication
+define { <4 x half>, <4 x half> } @load_dup_64_2(half* %a) #0 {
+; CHECK-LABEL: load_dup_64_2:
+; CHECK: ld2r { v0.4h, v1.4h }, [x0]
+entry:
+ %0 = tail call { <4 x half>, <4 x half> } @llvm.aarch64.neon.ld2r.v4f16.p0f16(half* %a)
+ ret { <4 x half>, <4 x half> } %0
+}
+
+; Load 3 x v4f16 with duplication
+define { <4 x half>, <4 x half>, <4 x half> } @load_dup_64_3(half* %a) #0 {
+; CHECK-LABEL: load_dup_64_3:
+; CHECK: ld3r { v0.4h, v1.4h, v2.4h }, [x0]
+entry:
+ %0 = tail call { <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld3r.v4f16.p0f16(half* %a)
+ ret { <4 x half>, <4 x half>, <4 x half> } %0
+}
+
+; Load 4 x v4f16 with duplication
+define { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @load_dup_64_4(half* %a) #0 {
+; CHECK-LABEL: load_dup_64_4:
+; CHECK: ld4r { v0.4h, v1.4h, v2.4h, v3.4h }, [x0]
+entry:
+ %0 = tail call { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld4r.v4f16.p0f16(half* %a)
+ ret { <4 x half>, <4 x half>, <4 x half>, <4 x half> } %0
+}
+
+; Load 2 x v8f16 with duplication
+define { <8 x half>, <8 x half> } @load_dup_128_2(half* %a) #0 {
+; CHECK-LABEL: load_dup_128_2:
+; CHECK: ld2r { v0.8h, v1.8h }, [x0]
+entry:
+ %0 = tail call { <8 x half>, <8 x half> } @llvm.aarch64.neon.ld2r.v8f16.p0f16(half* %a)
+ ret { <8 x half>, <8 x half> } %0
+}
+
+; Load 3 x v8f16 with duplication
+define { <8 x half>, <8 x half>, <8 x half> } @load_dup_128_3(half* %a) #0 {
+; CHECK-LABEL: load_dup_128_3:
+; CHECK: ld3r { v0.8h, v1.8h, v2.8h }, [x0]
+entry:
+ %0 = tail call { <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld3r.v8f16.p0f16(half* %a)
+ ret { <8 x half>, <8 x half>, <8 x half> } %0
+}
+
+; Load 4 x v8f16 with duplication
+define { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @load_dup_128_4(half* %a) #0 {
+; CHECK-LABEL: load_dup_128_4:
+; CHECK: ld4r { v0.8h, v1.8h, v2.8h, v3.8h }, [x0]
+entry:
+ %0 = tail call { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld4r.v8f16.p0f16(half* %a)
+ ret { <8 x half>, <8 x half>, <8 x half>, <8 x half> } %0
+}
+
+
+; NEON intrinsics - loads and stores to/from one lane
+declare { <4 x half>, <4 x half> } @llvm.aarch64.neon.ld2lane.v4f16.p0f16(<4 x half>, <4 x half>, i64, half*)
+declare { <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld3lane.v4f16.p0f16(<4 x half>, <4 x half>, <4 x half>, i64, half*)
+declare { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld4lane.v4f16.p0f16(<4 x half>, <4 x half>, <4 x half>, <4 x half>, i64, half*)
+declare void @llvm.aarch64.neon.st2lane.v4f16.p0f16(<4 x half>, <4 x half>, i64, half*)
+declare void @llvm.aarch64.neon.st3lane.v4f16.p0f16(<4 x half>, <4 x half>, <4 x half>, i64, half*)
+declare void @llvm.aarch64.neon.st4lane.v4f16.p0f16(<4 x half>, <4 x half>, <4 x half>, <4 x half>, i64, half*)
+declare { <8 x half>, <8 x half> } @llvm.aarch64.neon.ld2lane.v8f16.p0f16(<8 x half>, <8 x half>, i64, half*)
+declare { <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld3lane.v8f16.p0f16(<8 x half>, <8 x half>, <8 x half>, i64, half*)
+declare { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld4lane.v8f16.p0f16(<8 x half>, <8 x half>, <8 x half>, <8 x half>, i64, half*)
+declare void @llvm.aarch64.neon.st2lane.v8f16.p0f16(<8 x half>, <8 x half>, i64, half*)
+declare void @llvm.aarch64.neon.st3lane.v8f16.p0f16(<8 x half>, <8 x half>, <8 x half>, i64, half*)
+declare void @llvm.aarch64.neon.st4lane.v8f16.p0f16(<8 x half>, <8 x half>, <8 x half>, <8 x half>, i64, half*)
+
+; Load one lane of 2 x v4f16
+define { <4 x half>, <4 x half> } @load_lane_64_2(half* %a, <4 x half> %b, <4 x half> %c) #0 {
+; CHECK-LABEL: load_lane_64_2:
+; CHECK: ld2 { v0.h, v1.h }[2], [x0]
+entry:
+ %0 = tail call { <4 x half>, <4 x half> } @llvm.aarch64.neon.ld2lane.v4f16.p0f16(<4 x half> %b, <4 x half> %c, i64 2, half* %a)
+ ret { <4 x half>, <4 x half> } %0
+}
+
+; Load one lane of 3 x v4f16
+define { <4 x half>, <4 x half>, <4 x half> } @load_lane_64_3(half* %a, <4 x half> %b, <4 x half> %c, <4 x half> %d) #0 {
+; CHECK-LABEL: load_lane_64_3:
+; CHECK: ld3 { v0.h, v1.h, v2.h }[2], [x0]
+entry:
+ %0 = tail call { <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld3lane.v4f16.p0f16(<4 x half> %b, <4 x half> %c, <4 x half> %d, i64 2, half* %a)
+ ret { <4 x half>, <4 x half>, <4 x half> } %0
+}
+
+; Load one lane of 4 x v4f16
+define { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @load_lane_64_4(half* %a, <4 x half> %b, <4 x half> %c, <4 x half> %d, <4 x half> %e) #0 {
+; CHECK-LABEL: load_lane_64_4:
+; CHECK: ld4 { v0.h, v1.h, v2.h, v3.h }[2], [x0]
+entry:
+ %0 = tail call { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld4lane.v4f16.p0f16(<4 x half> %b, <4 x half> %c, <4 x half> %d, <4 x half> %e, i64 2, half* %a)
+ ret { <4 x half>, <4 x half>, <4 x half>, <4 x half> } %0
+}
+
+; Store one lane of 2 x v4f16
+define void @store_lane_64_2(half* %a, <4 x half> %b, <4 x half> %c) #0 {
+; CHECK-LABEL: store_lane_64_2:
+; CHECK: st2 { v0.h, v1.h }[2], [x0]
+entry:
+ tail call void @llvm.aarch64.neon.st2lane.v4f16.p0f16(<4 x half> %b, <4 x half> %c, i64 2, half* %a)
+ ret void
+}
+
+; Store one lane of 3 x v4f16
+define void @store_lane_64_3(half* %a, <4 x half> %b, <4 x half> %c, <4 x half> %d) #0 {
+; CHECK-LABEL: store_lane_64_3:
+; CHECK: st3 { v0.h, v1.h, v2.h }[2], [x0]
+entry:
+ tail call void @llvm.aarch64.neon.st3lane.v4f16.p0f16(<4 x half> %b, <4 x half> %c, <4 x half> %d, i64 2, half* %a)
+ ret void
+}
+
+; Store one lane of 4 x v4f16
+define void @store_lane_64_4(half* %a, <4 x half> %b, <4 x half> %c, <4 x half> %d, <4 x half> %e) #0 {
+; CHECK-LABEL: store_lane_64_4:
+; CHECK: st4 { v0.h, v1.h, v2.h, v3.h }[2], [x0]
+entry:
+ tail call void @llvm.aarch64.neon.st4lane.v4f16.p0f16(<4 x half> %b, <4 x half> %c, <4 x half> %d, <4 x half> %e, i64 2, half* %a)
+ ret void
+}
+
+; Load one lane of 2 x v8f16
+define { <8 x half>, <8 x half> } @load_lane_128_2(half* %a, <8 x half> %b, <8 x half> %c) #0 {
+; CHECK-LABEL: load_lane_128_2:
+; CHECK: ld2 { v0.h, v1.h }[2], [x0]
+entry:
+ %0 = tail call { <8 x half>, <8 x half> } @llvm.aarch64.neon.ld2lane.v8f16.p0f16(<8 x half> %b, <8 x half> %c, i64 2, half* %a)
+ ret { <8 x half>, <8 x half> } %0
+}
+
+; Load one lane of 3 x v8f16
+define { <8 x half>, <8 x half>, <8 x half> } @load_lane_128_3(half* %a, <8 x half> %b, <8 x half> %c, <8 x half> %d) #0 {
+; CHECK-LABEL: load_lane_128_3:
+; CHECK: ld3 { v0.h, v1.h, v2.h }[2], [x0]
+entry:
+ %0 = tail call { <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld3lane.v8f16.p0f16(<8 x half> %b, <8 x half> %c, <8 x half> %d, i64 2, half* %a)
+ ret { <8 x half>, <8 x half>, <8 x half> } %0
+}
+
+; Load one lane of 4 x v8f16
+define { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @load_lane_128_4(half* %a, <8 x half> %b, <8 x half> %c, <8 x half> %d, <8 x half> %e) #0 {
+; CHECK-LABEL: load_lane_128_4:
+; CHECK: ld4 { v0.h, v1.h, v2.h, v3.h }[2], [x0]
+entry:
+ %0 = tail call { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld4lane.v8f16.p0f16(<8 x half> %b, <8 x half> %c, <8 x half> %d, <8 x half> %e, i64 2, half* %a)
+ ret { <8 x half>, <8 x half>, <8 x half>, <8 x half> } %0
+}
+
+; Store one lane of 2 x v8f16
+define void @store_lane_128_2(half* %a, <8 x half> %b, <8 x half> %c) #0 {
+; CHECK-LABEL: store_lane_128_2:
+; CHECK: st2 { v0.h, v1.h }[2], [x0]
+entry:
+ tail call void @llvm.aarch64.neon.st2lane.v8f16.p0f16(<8 x half> %b, <8 x half> %c, i64 2, half* %a)
+ ret void
+}
+
+; Store one lane of 3 x v8f16
+define void @store_lane_128_3(half* %a, <8 x half> %b, <8 x half> %c, <8 x half> %d) #0 {
+; CHECK-LABEL: store_lane_128_3:
+; CHECK: st3 { v0.h, v1.h, v2.h }[2], [x0]
+entry:
+ tail call void @llvm.aarch64.neon.st3lane.v8f16.p0f16(<8 x half> %b, <8 x half> %c, <8 x half> %d, i64 2, half* %a)
+ ret void
+}
+
+; Store one lane of 4 x v8f16
+define void @store_lane_128_4(half* %a, <8 x half> %b, <8 x half> %c, <8 x half> %d, <8 x half> %e) #0 {
+; CHECK-LABEL: store_lane_128_4:
+; CHECK: st4 { v0.h, v1.h, v2.h, v3.h }[2], [x0]
+entry:
+ tail call void @llvm.aarch64.neon.st4lane.v8f16.p0f16(<8 x half> %b, <8 x half> %c, <8 x half> %d, <8 x half> %e, i64 2, half* %a)
+ ret void
+}
+
+; NEON intrinsics - load/store without interleaving
+declare { <4 x half>, <4 x half> } @llvm.aarch64.neon.ld1x2.v4f16.p0v4f16(<4 x half>*)
+declare { <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld1x3.v4f16.p0v4f16(<4 x half>*)
+declare { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld1x4.v4f16.p0v4f16(<4 x half>*)
+declare void @llvm.aarch64.neon.st1x2.v4f16.p0v4f16(<4 x half>, <4 x half>, <4 x half>*)
+declare void @llvm.aarch64.neon.st1x3.v4f16.p0v4f16(<4 x half>, <4 x half>, <4 x half>, <4 x half>*)
+declare void @llvm.aarch64.neon.st1x4.v4f16.p0v4f16(<4 x half>, <4 x half>, <4 x half>, <4 x half>, <4 x half>*)
+declare { <8 x half>, <8 x half> } @llvm.aarch64.neon.ld1x2.v8f16.p0v8f16(<8 x half>*)
+declare { <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld1x3.v8f16.p0v8f16(<8 x half>*)
+declare { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld1x4.v8f16.p0v8f16(<8 x half>*)
+declare void @llvm.aarch64.neon.st1x2.v8f16.p0v8f16(<8 x half>, <8 x half>, <8 x half>*)
+declare void @llvm.aarch64.neon.st1x3.v8f16.p0v8f16(<8 x half>, <8 x half>, <8 x half>, <8 x half>*)
+declare void @llvm.aarch64.neon.st1x4.v8f16.p0v8f16(<8 x half>, <8 x half>, <8 x half>, <8 x half>, <8 x half>*)
+
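+; ld1xN/st1xN move N whole consecutive vectors with no element interleaving,
+; so they select the multi-register ld1/st1 forms checked below.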
+; Load 2 x v4f16 without de-interleaving
+define { <4 x half>, <4 x half> } @load_64_2(<4 x half>* %a) #0 {
+; CHECK-LABEL: load_64_2:
+; CHECK: ld1 { v0.4h, v1.4h }, [x0]
+entry:
+ %0 = tail call { <4 x half>, <4 x half> } @llvm.aarch64.neon.ld1x2.v4f16.p0v4f16(<4 x half>* %a)
+ ret { <4 x half>, <4 x half> } %0
+}
+
+; Load 3 x v4f16 without de-interleaving
+define { <4 x half>, <4 x half>, <4 x half> } @load_64_3(<4 x half>* %a) #0 {
+; CHECK-LABEL: load_64_3:
+; CHECK: ld1 { v0.4h, v1.4h, v2.4h }, [x0]
+entry:
+ %0 = tail call { <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld1x3.v4f16.p0v4f16(<4 x half>* %a)
+ ret { <4 x half>, <4 x half>, <4 x half> } %0
+}
+
+; Load 4 x v4f16 without de-interleaving
+define { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @load_64_4(<4 x half>* %a) #0 {
+; CHECK-LABEL: load_64_4:
+; CHECK: ld1 { v0.4h, v1.4h, v2.4h, v3.4h }, [x0]
+entry:
+ %0 = tail call { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld1x4.v4f16.p0v4f16(<4 x half>* %a)
+ ret { <4 x half>, <4 x half>, <4 x half>, <4 x half> } %0
+}
+
+; Store 2 x v4f16 without interleaving
+define void @store_64_2(<4 x half>* %a, <4 x half> %b, <4 x half> %c) #0 {
+; CHECK-LABEL: store_64_2:
+; CHECK: st1 { v0.4h, v1.4h }, [x0]
+entry:
+ tail call void @llvm.aarch64.neon.st1x2.v4f16.p0v4f16(<4 x half> %b, <4 x half> %c, <4 x half>* %a)
+ ret void
+}
+
+; Store 3 x v4f16 without interleaving
+define void @store_64_3(<4 x half>* %a, <4 x half> %b, <4 x half> %c, <4 x half> %d) #0 {
+; CHECK-LABEL: store_64_3:
+; CHECK: st1 { v0.4h, v1.4h, v2.4h }, [x0]
+entry:
+ tail call void @llvm.aarch64.neon.st1x3.v4f16.p0v4f16(<4 x half> %b, <4 x half> %c, <4 x half> %d, <4 x half>* %a)
+ ret void
+}
+
+; Store 4 x v4f16 without interleaving
+define void @store_64_4(<4 x half>* %a, <4 x half> %b, <4 x half> %c, <4 x half> %d, <4 x half> %e) #0 {
+; CHECK-LABEL: store_64_4:
+; CHECK: st1 { v0.4h, v1.4h, v2.4h, v3.4h }, [x0]
+entry:
+ tail call void @llvm.aarch64.neon.st1x4.v4f16.p0v4f16(<4 x half> %b, <4 x half> %c, <4 x half> %d, <4 x half> %e, <4 x half>* %a)
+ ret void
+}
+
+; Load 2 x v8f16 without de-interleaving
+define { <8 x half>, <8 x half> } @load_128_2(<8 x half>* %a) #0 {
+; CHECK-LABEL: load_128_2:
+; CHECK: ld1 { v0.8h, v1.8h }, [x0]
+entry:
+ %0 = tail call { <8 x half>, <8 x half> } @llvm.aarch64.neon.ld1x2.v8f16.p0v8f16(<8 x half>* %a)
+ ret { <8 x half>, <8 x half> } %0
+}
+
+; Load 3 x v8f16 without de-interleaving
+define { <8 x half>, <8 x half>, <8 x half> } @load_128_3(<8 x half>* %a) #0 {
+; CHECK-LABEL: load_128_3:
+; CHECK: ld1 { v0.8h, v1.8h, v2.8h }, [x0]
+entry:
+ %0 = tail call { <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld1x3.v8f16.p0v8f16(<8 x half>* %a)
+ ret { <8 x half>, <8 x half>, <8 x half> } %0
+}
+
+; Load 4 x v8f16 without de-interleaving
+define { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @load_128_4(<8 x half>* %a) #0 {
+; CHECK-LABEL: load_128_4:
+; CHECK: ld1 { v0.8h, v1.8h, v2.8h, v3.8h }, [x0]
+entry:
+ %0 = tail call { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld1x4.v8f16.p0v8f16(<8 x half>* %a)
+ ret { <8 x half>, <8 x half>, <8 x half>, <8 x half> } %0
+}
+
+; Store 2 x v8f16 without interleaving
+define void @store_128_2(<8 x half>* %a, <8 x half> %b, <8 x half> %c) #0 {
+; CHECK-LABEL: store_128_2:
+; CHECK: st1 { v0.8h, v1.8h }, [x0]
+entry:
+ tail call void @llvm.aarch64.neon.st1x2.v8f16.p0v8f16(<8 x half> %b, <8 x half> %c, <8 x half>* %a)
+ ret void
+}
+
+; Store 3 x v8f16 without interleaving
+define void @store_128_3(<8 x half>* %a, <8 x half> %b, <8 x half> %c, <8 x half> %d) #0 {
+; CHECK-LABEL: store_128_3:
+; CHECK: st1 { v0.8h, v1.8h, v2.8h }, [x0]
+entry:
+ tail call void @llvm.aarch64.neon.st1x3.v8f16.p0v8f16(<8 x half> %b, <8 x half> %c, <8 x half> %d, <8 x half>* %a)
+ ret void
+}
+
+; Store 4 x v8f16 without interleaving
+define void @store_128_4(<8 x half>* %a, <8 x half> %b, <8 x half> %c, <8 x half> %d, <8 x half> %e) #0 {
+; CHECK-LABEL: store_128_4:
+; CHECK: st1 { v0.8h, v1.8h, v2.8h, v3.8h }, [x0]
+entry:
+ tail call void @llvm.aarch64.neon.st1x4.v8f16.p0v8f16(<8 x half> %b, <8 x half> %c, <8 x half> %d, <8 x half> %e, <8 x half>* %a)
+ ret void
+}
diff --git a/test/CodeGen/AArch64/fp16-vector-shuffle.ll b/test/CodeGen/AArch64/fp16-vector-shuffle.ll
new file mode 100644
index 0000000..74d1b43
--- /dev/null
+++ b/test/CodeGen/AArch64/fp16-vector-shuffle.ll
@@ -0,0 +1,301 @@
+; RUN: llc < %s -mtriple=aarch64-none-eabi | FileCheck %s
+
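+; vbsl(c, a, b) computes (a & c) | (b & ~c) bitwise, which maps to the BSL
+; instruction; the IR below spells out exactly that and/xor/or pattern.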
+; float16x4_t select_64(float16x4_t a, float16x4_t b, uint16x4_t c) { return vbsl_u16(c, a, b); }
+define <4 x half> @select_64(<4 x half> %a, <4 x half> %b, <4 x i16> %c) #0 {
+; CHECK-LABEL: select_64:
+; CHECK: bsl
+entry:
+ %0 = bitcast <4 x half> %a to <4 x i16>
+ %1 = bitcast <4 x half> %b to <4 x i16>
+ %vbsl3.i = and <4 x i16> %0, %c
+ %2 = xor <4 x i16> %c, <i16 -1, i16 -1, i16 -1, i16 -1>
+ %vbsl4.i = and <4 x i16> %1, %2
+ %vbsl5.i = or <4 x i16> %vbsl3.i, %vbsl4.i
+ %3 = bitcast <4 x i16> %vbsl5.i to <4 x half>
+ ret <4 x half> %3
+}
+
+; float16x8_t select_128(float16x8_t a, float16x8_t b, uint16x8_t c) { return vbslq_u16(c, a, b); }
+define <8 x half> @select_128(<8 x half> %a, <8 x half> %b, <8 x i16> %c) #0 {
+; CHECK-LABEL: select_128:
+; CHECK: bsl
+entry:
+ %0 = bitcast <8 x half> %a to <8 x i16>
+ %1 = bitcast <8 x half> %b to <8 x i16>
+ %vbsl3.i = and <8 x i16> %0, %c
+ %2 = xor <8 x i16> %c, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
+ %vbsl4.i = and <8 x i16> %1, %2
+ %vbsl5.i = or <8 x i16> %vbsl3.i, %vbsl4.i
+ %3 = bitcast <8 x i16> %vbsl5.i to <8 x half>
+ ret <8 x half> %3
+}
+
+; float16x4_t lane_64_64(float16x4_t a, float16x4_t b) {
+; return vcopy_lane_s16(a, 1, b, 2);
+; }
+define <4 x half> @lane_64_64(<4 x half> %a, <4 x half> %b) #0 {
+; CHECK-LABEL: lane_64_64:
+; CHECK: ins
+entry:
+ %0 = shufflevector <4 x half> %a, <4 x half> %b, <4 x i32> <i32 0, i32 6, i32 2, i32 3>
+ ret <4 x half> %0
+}
+
+; float16x8_t lane_128_64(float16x8_t a, float16x4_t b) {
+; return vcopyq_lane_s16(a, 1, b, 2);
+; }
+define <8 x half> @lane_128_64(<8 x half> %a, <4 x half> %b) #0 {
+; CHECK-LABEL: lane_128_64:
+; CHECK: ins
+entry:
+ %0 = bitcast <4 x half> %b to <4 x i16>
+ %vget_lane = extractelement <4 x i16> %0, i32 2
+ %1 = bitcast <8 x half> %a to <8 x i16>
+ %vset_lane = insertelement <8 x i16> %1, i16 %vget_lane, i32 1
+ %2 = bitcast <8 x i16> %vset_lane to <8 x half>
+ ret <8 x half> %2
+}
+
+; float16x4_t lane_64_128(float16x4_t a, float16x8_t b) {
+; return vcopy_laneq_s16(a, 3, b, 5);
+; }
+define <4 x half> @lane_64_128(<4 x half> %a, <8 x half> %b) #0 {
+; CHECK-LABEL: lane_64_128:
+; CHECK: ins
+entry:
+ %0 = bitcast <8 x half> %b to <8 x i16>
+ %vgetq_lane = extractelement <8 x i16> %0, i32 5
+ %1 = bitcast <4 x half> %a to <4 x i16>
+ %vset_lane = insertelement <4 x i16> %1, i16 %vgetq_lane, i32 3
+ %2 = bitcast <4 x i16> %vset_lane to <4 x half>
+ ret <4 x half> %2
+}
+
+; float16x8_t lane_128_128(float16x8_t a, float16x8_t b) {
+; return vcopyq_laneq_s16(a, 3, b, 5);
+; }
+define <8 x half> @lane_128_128(<8 x half> %a, <8 x half> %b) #0 {
+; CHECK-LABEL: lane_128_128:
+; CHECK: ins
+entry:
+ %0 = shufflevector <8 x half> %a, <8 x half> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 13, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x half> %0
+}
+
+; float16x4_t ext_64(float16x4_t a, float16x4_t b) {
+; return vext_s16(a, b, 3);
+; }
+define <4 x half> @ext_64(<4 x half> %a, <4 x half> %b) #0 {
+; CHECK-LABEL: ext_64:
+; CHECK: ext
+entry:
+ %0 = shufflevector <4 x half> %a, <4 x half> %b, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+ ret <4 x half> %0
+}
+
+; float16x8_t ext_128(float16x8_t a, float16x8_t b) {
+; return vextq_s16(a, b, 3);
+; }
+define <8 x half> @ext_128(<8 x half> %a, <8 x half> %b) #0 {
+; CHECK-LABEL: ext_128:
+; CHECK: ext
+entry:
+ %0 = shufflevector <8 x half> %a, <8 x half> %b, <8 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10>
+ ret <8 x half> %0
+}
+
+; float16x4_t rev32_64(float16x4_t a) {
+; return vrev32_s16(a);
+; }
+define <4 x half> @rev32_64(<4 x half> %a) #0 {
+entry:
+; CHECK-LABEL: rev32_64:
+; CHECK: rev32
+ %0 = shufflevector <4 x half> %a, <4 x half> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+ ret <4 x half> %0
+}
+
+; float16x4_t rev64_64(float16x4_t a) {
+; return vrev64_s16(a);
+; }
+define <4 x half> @rev64_64(<4 x half> %a) #0 {
+entry:
+; CHECK-LABEL: rev64_64:
+; CHECK: rev64
+ %0 = shufflevector <4 x half> %a, <4 x half> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+ ret <4 x half> %0
+}
+
+; float16x8_t rev32_128(float16x8_t a) {
+; return vrev32q_s16(a);
+; }
+define <8 x half> @rev32_128(<8 x half> %a) #0 {
+entry:
+; CHECK-LABEL: rev32_128:
+; CHECK: rev32
+ %0 = shufflevector <8 x half> %a, <8 x half> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+ ret <8 x half> %0
+}
+
+; float16x8_t rev64_128(float16x8_t a) {
+; return vrev64q_s16(a);
+; }
+define <8 x half> @rev64_128(<8 x half> %a) #0 {
+entry:
+; CHECK-LABEL: rev64_128:
+; CHECK: rev64
+ %0 = shufflevector <8 x half> %a, <8 x half> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+ ret <8 x half> %0
+}
+
+; float16x4_t create_64(long long a) { return vcreate_f16(a); }
+define <4 x half> @create_64(i64 %a) #0 {
+; CHECK-LABEL: create_64:
+; CHECK: fmov
+entry:
+ %0 = bitcast i64 %a to <4 x half>
+ ret <4 x half> %0
+}
+
+; float16x4_t dup_64(__fp16 a) { return vdup_n_f16(a); }
+define <4 x half> @dup_64(half %a) #0 {
+; CHECK-LABEL: dup_64:
+; CHECK: dup
+entry:
+ %vecinit = insertelement <4 x half> undef, half %a, i32 0
+ %vecinit1 = insertelement <4 x half> %vecinit, half %a, i32 1
+ %vecinit2 = insertelement <4 x half> %vecinit1, half %a, i32 2
+ %vecinit3 = insertelement <4 x half> %vecinit2, half %a, i32 3
+ ret <4 x half> %vecinit3
+}
+
+; float16x8_t dup_128(__fp16 a) { return vdupq_n_f16(a); }
+define <8 x half> @dup_128(half %a) #0 {
+entry:
+; CHECK-LABEL: dup_128:
+; CHECK: dup
+ %vecinit = insertelement <8 x half> undef, half %a, i32 0
+ %vecinit1 = insertelement <8 x half> %vecinit, half %a, i32 1
+ %vecinit2 = insertelement <8 x half> %vecinit1, half %a, i32 2
+ %vecinit3 = insertelement <8 x half> %vecinit2, half %a, i32 3
+ %vecinit4 = insertelement <8 x half> %vecinit3, half %a, i32 4
+ %vecinit5 = insertelement <8 x half> %vecinit4, half %a, i32 5
+ %vecinit6 = insertelement <8 x half> %vecinit5, half %a, i32 6
+ %vecinit7 = insertelement <8 x half> %vecinit6, half %a, i32 7
+ ret <8 x half> %vecinit7
+}
+
+; float16x4_t dup_lane_64(float16x4_t a) { return vdup_lane_f16(a, 2); }
+define <4 x half> @dup_lane_64(<4 x half> %a) #0 {
+entry:
+; CHECK-LABEL: dup_lane_64:
+; CHECK: dup
+ %shuffle = shufflevector <4 x half> %a, <4 x half> undef, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
+ ret <4 x half> %shuffle
+}
+
+; float16x8_t dup_lane_128(float16x4_t a) { return vdupq_lane_f16(a, 2); }
+define <8 x half> @dup_lane_128(<4 x half> %a) #0 {
+entry:
+; CHECK-LABEL: dup_lane_128:
+; CHECK: dup
+ %shuffle = shufflevector <4 x half> %a, <4 x half> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
+ ret <8 x half> %shuffle
+}
+
+; float16x4_t dup_laneq_64(float16x8_t a) { return vdup_laneq_f16(a, 2); }
+define <4 x half> @dup_laneq_64(<8 x half> %a) #0 {
+entry:
+; CHECK-LABEL: dup_laneq_64:
+; CHECK: dup
+ %shuffle = shufflevector <8 x half> %a, <8 x half> undef, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
+ ret <4 x half> %shuffle
+}
+
+; float16x8_t dup_laneq_128(float16x8_t a) { return vdupq_laneq_f16(a, 2); }
+define <8 x half> @dup_laneq_128(<8 x half> %a) #0 {
+entry:
+; CHECK-LABEL: dup_laneq_128:
+; CHECK: dup
+ %shuffle = shufflevector <8 x half> %a, <8 x half> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
+ ret <8 x half> %shuffle
+}
+
+; float16x8_t vcombine(float16x4_t a, float16x4_t b) { return vcombine_f16(a, b); }
+define <8 x half> @vcombine(<4 x half> %a, <4 x half> %b) #0 {
+entry:
+; CHECK-LABEL: vcombine:
+; CHECK: ins
+ %shuffle.i = shufflevector <4 x half> %a, <4 x half> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x half> %shuffle.i
+}
+
+; float16x4_t get_high(float16x8_t a) { return vget_high_f16(a); }
+define <4 x half> @get_high(<8 x half> %a) #0 {
+; CHECK-LABEL: get_high:
+; CHECK: ext
+entry:
+ %shuffle.i = shufflevector <8 x half> %a, <8 x half> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ ret <4 x half> %shuffle.i
+}
+
+
+; float16x4_t get_low(float16x8_t a) { return vget_low_f16(a); }
+define <4 x half> @get_low(<8 x half> %a) #0 {
+; CHECK-LABEL: get_low:
+; CHECK-NOT: ext
+entry:
+ %shuffle.i = shufflevector <8 x half> %a, <8 x half> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ ret <4 x half> %shuffle.i
+}
+
+; float16x4_t set_lane_64(float16x4_t a, __fp16 b) { return vset_lane_f16(b, a, 2); }
+define <4 x half> @set_lane_64(<4 x half> %a, half %b) #0 {
+; CHECK-LABEL: set_lane_64:
+; CHECK: fmov
+; CHECK: ins
+entry:
+ %0 = bitcast half %b to i16
+ %1 = bitcast <4 x half> %a to <4 x i16>
+ %vset_lane = insertelement <4 x i16> %1, i16 %0, i32 2
+ %2 = bitcast <4 x i16> %vset_lane to <4 x half>
+ ret <4 x half> %2
+}
+
+
+; float16x8_t set_lane_128(float16x8_t a, __fp16 b) { return vsetq_lane_f16(b, a, 2); }
+define <8 x half> @set_lane_128(<8 x half> %a, half %b) #0 {
+; CHECK-LABEL: set_lane_128:
+; CHECK: fmov
+; CHECK: ins
+entry:
+ %0 = bitcast half %b to i16
+ %1 = bitcast <8 x half> %a to <8 x i16>
+ %vset_lane = insertelement <8 x i16> %1, i16 %0, i32 2
+ %2 = bitcast <8 x i16> %vset_lane to <8 x half>
+ ret <8 x half> %2
+}
+
+; __fp16 get_lane_64(float16x4_t a) { return vget_lane_f16(a, 2); }
+define half @get_lane_64(<4 x half> %a) #0 {
+; CHECK-LABEL: get_lane_64:
+; CHECK: umov
+; CHECK: fmov
+entry:
+ %0 = bitcast <4 x half> %a to <4 x i16>
+ %vget_lane = extractelement <4 x i16> %0, i32 2
+ %1 = bitcast i16 %vget_lane to half
+ ret half %1
+}
+
+; __fp16 get_lane_128(float16x8_t a) { return vgetq_lane_f16(a, 2); }
+define half @get_lane_128(<8 x half> %a) #0 {
+; CHECK-LABEL: get_lane_128:
+; CHECK: umov
+; CHECK: fmov
+entry:
+ %0 = bitcast <8 x half> %a to <8 x i16>
+ %vgetq_lane = extractelement <8 x i16> %0, i32 2
+ %1 = bitcast i16 %vgetq_lane to half
+ ret half %1
+}
diff --git a/test/CodeGen/AArch64/fpconv-vector-op-scalarize.ll b/test/CodeGen/AArch64/fpconv-vector-op-scalarize.ll
new file mode 100644
index 0000000..56e0b4a
--- /dev/null
+++ b/test/CodeGen/AArch64/fpconv-vector-op-scalarize.ll
@@ -0,0 +1,44 @@
+; RUN: llc < %s -mtriple=arm64-apple-darwin | FileCheck %s
+
+; PR20778
+; Check that the legalizer doesn't crash when scalarizing FP conversion
+; instructions' operands. The operands are all illegal on AArch64,
+; ensuring they are legalized. The results are all legal.
+
+define <1 x double> @test_sitofp(<1 x i1> %in) {
+; CHECK-LABEL: test_sitofp:
+; CHECK: sbfx [[GPR:w[0-9]+]], w0, #0, #1
+; CHECK-NEXT: scvtf d0, [[GPR]]
+; CHECK-NEXT: ret
+entry:
+ %0 = sitofp <1 x i1> %in to <1 x double>
+ ret <1 x double> %0
+}
+
+define <1 x double> @test_uitofp(<1 x i1> %in) {
+; CHECK-LABEL: test_uitofp:
+; CHECK: and [[GPR:w[0-9]+]], w0, #0x1
+; CHECK-NEXT: ucvtf d0, [[GPR]]
+; CHECK-NEXT: ret
+entry:
+ %0 = uitofp <1 x i1> %in to <1 x double>
+ ret <1 x double> %0
+}
+
+define <1 x i64> @test_fptosi(<1 x fp128> %in) {
+; CHECK-LABEL: test_fptosi:
+; CHECK: bl ___fixtfdi
+; CHECK-NEXT: fmov d0, x0
+entry:
+ %0 = fptosi <1 x fp128> %in to <1 x i64>
+ ret <1 x i64> %0
+}
+
+define <1 x i64> @test_fptoui(<1 x fp128> %in) {
+; CHECK-LABEL: test_fptoui:
+; CHECK: bl ___fixunstfdi
+; CHECK-NEXT: fmov d0, x0
+entry:
+ %0 = fptoui <1 x fp128> %in to <1 x i64>
+ ret <1 x i64> %0
+}
diff --git a/test/CodeGen/AArch64/frameaddr.ll b/test/CodeGen/AArch64/frameaddr.ll
index 85d95e2..d6bb50e 100644
--- a/test/CodeGen/AArch64/frameaddr.ll
+++ b/test/CodeGen/AArch64/frameaddr.ll
@@ -1,20 +1,29 @@
-; RUN: llc -o - %s -mtriple=arm64-apple-ios7.0 | FileCheck %s
+; RUN: llc -mtriple=aarch64-apple-darwin -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-apple-darwin -fast-isel -fast-isel-abort -verify-machineinstrs < %s | FileCheck %s
-define i8* @t() nounwind {
+define i8* @test_frameaddress0() nounwind {
entry:
-; CHECK-LABEL: t:
+; CHECK-LABEL: test_frameaddress0:
+; CHECK: stp x29, x30, [sp, #-16]!
+; CHECK: mov x29, sp
; CHECK: mov x0, x29
- %0 = call i8* @llvm.frameaddress(i32 0)
- ret i8* %0
+; CHECK: ldp x29, x30, [sp], #16
+; CHECK: ret
+ %0 = call i8* @llvm.frameaddress(i32 0)
+ ret i8* %0
}
-define i8* @t2() nounwind {
+define i8* @test_frameaddress2() nounwind {
entry:
-; CHECK-LABEL: t2:
+; CHECK-LABEL: test_frameaddress2:
+; CHECK: stp x29, x30, [sp, #-16]!
+; CHECK: mov x29, sp
; CHECK: ldr x[[reg:[0-9]+]], [x29]
-; CHECK: ldr {{x[0-9]+}}, [x[[reg]]]
- %0 = call i8* @llvm.frameaddress(i32 2)
- ret i8* %0
+; CHECK: ldr x0, [x[[reg]]]
+; CHECK: ldp x29, x30, [sp], #16
+; CHECK: ret
+ %0 = call i8* @llvm.frameaddress(i32 2)
+ ret i8* %0
}
declare i8* @llvm.frameaddress(i32) nounwind readnone
diff --git a/test/CodeGen/AArch64/func-calls.ll b/test/CodeGen/AArch64/func-calls.ll
index 422c576..51979f0 100644
--- a/test/CodeGen/AArch64/func-calls.ll
+++ b/test/CodeGen/AArch64/func-calls.ll
@@ -1,7 +1,7 @@
; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s --check-prefix=CHECK
; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -mattr=-neon | FileCheck --check-prefix=CHECK-NONEON %s
; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -mattr=-fp-armv8 | FileCheck --check-prefix=CHECK-NOFP %s
-; RUN: llc -verify-machineinstrs < %s -mtriple=arm64_be-none-linux-gnu | FileCheck --check-prefix=CHECK-BE %s
+; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64_be-none-linux-gnu | FileCheck --check-prefix=CHECK-BE %s
%myStruct = type { i64 , i8, i32 }
diff --git a/test/CodeGen/AArch64/half.ll b/test/CodeGen/AArch64/half.ll
new file mode 100644
index 0000000..a46094b
--- /dev/null
+++ b/test/CodeGen/AArch64/half.ll
@@ -0,0 +1,83 @@
+; RUN: llc < %s -mtriple=arm64-apple-ios7.0 | FileCheck %s
+
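+; Basic IEEE half support: loads and stores use the h registers, bitcasts
+; between half and i16 either fold into integer loads/stores or go through
+; fmov, and fpext/fptrunc lower to fcvt.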
+define void @test_load_store(half* %in, half* %out) {
+; CHECK-LABEL: test_load_store:
+; CHECK: ldr [[TMP:h[0-9]+]], [x0]
+; CHECK: str [[TMP]], [x1]
+ %val = load half* %in
+ store half %val, half* %out
+ ret void
+}
+
+define i16 @test_bitcast_from_half(half* %addr) {
+; CHECK-LABEL: test_bitcast_from_half:
+; CHECK: ldrh w0, [x0]
+ %val = load half* %addr
+ %val_int = bitcast half %val to i16
+ ret i16 %val_int
+}
+
+define i16 @test_reg_bitcast_from_half(half %in) {
+; CHECK-LABEL: test_reg_bitcast_from_half:
+; CHECK-NOT: str
+; CHECK-NOT: ldr
+; CHECK-DAG: fmov w0, s0
+; CHECK: ret
+ %val = bitcast half %in to i16
+ ret i16 %val
+}
+
+define void @test_bitcast_to_half(half* %addr, i16 %in) {
+; CHECK-LABEL: test_bitcast_to_half:
+; CHECK: strh w1, [x0]
+ %val_fp = bitcast i16 %in to half
+ store half %val_fp, half* %addr
+ ret void
+}
+
+define half @test_reg_bitcast_to_half(i16 %in) {
+; CHECK-LABEL: test_reg_bitcast_to_half:
+; CHECK-NOT: str
+; CHECK-NOT: ldr
+; CHECK-DAG: fmov s0, w0
+; CHECK: ret
+
+ %val = bitcast i16 %in to half
+ ret half %val
+}
+
+define float @test_extend32(half* %addr) {
+; CHECK-LABEL: test_extend32:
+; CHECK: fcvt {{s[0-9]+}}, {{h[0-9]+}}
+
+ %val16 = load half* %addr
+ %val32 = fpext half %val16 to float
+ ret float %val32
+}
+
+define double @test_extend64(half* %addr) {
+; CHECK-LABEL: test_extend64:
+; CHECK: fcvt {{d[0-9]+}}, {{h[0-9]+}}
+
+ %val16 = load half* %addr
+ %val32 = fpext half %val16 to double
+ ret double %val32
+}
+
+define void @test_trunc32(float %in, half* %addr) {
+; CHECK-LABEL: test_trunc32:
+; CHECK: fcvt {{h[0-9]+}}, {{s[0-9]+}}
+
+ %val16 = fptrunc float %in to half
+ store half %val16, half* %addr
+ ret void
+}
+
+define void @test_trunc64(double %in, half* %addr) {
+; CHECK-LABEL: test_trunc64:
+; CHECK: fcvt {{h[0-9]+}}, {{d[0-9]+}}
+
+ %val16 = fptrunc double %in to half
+ store half %val16, half* %addr
+ ret void
+}
diff --git a/test/CodeGen/AArch64/hints.ll b/test/CodeGen/AArch64/hints.ll
new file mode 100644
index 0000000..d7d9e23
--- /dev/null
+++ b/test/CodeGen/AArch64/hints.ll
@@ -0,0 +1,67 @@
+; RUN: llc -mtriple aarch64-eabi -o - %s | FileCheck %s
+
+declare void @llvm.aarch64.hint(i32) nounwind
+
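+; Each call lowers to a HINT instruction with the given immediate. Allocated
+; hints print under their aliases (nop, yield, wfe, wfi, sev, sevl);
+; unallocated values such as 8 print as a raw "hint #N".
+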
+define void @hint_nop() {
+entry:
+ tail call void @llvm.aarch64.hint(i32 0) nounwind
+ ret void
+}
+
+; CHECK-LABEL: hint_nop
+; CHECK: nop
+
+define void @hint_yield() {
+entry:
+ tail call void @llvm.aarch64.hint(i32 1) nounwind
+ ret void
+}
+
+; CHECK-LABEL: hint_yield
+; CHECK: yield
+
+define void @hint_wfe() {
+entry:
+ tail call void @llvm.aarch64.hint(i32 2) nounwind
+ ret void
+}
+
+; CHECK-LABEL: hint_wfe
+; CHECK: wfe
+
+define void @hint_wfi() {
+entry:
+ tail call void @llvm.aarch64.hint(i32 3) nounwind
+ ret void
+}
+
+; CHECK-LABEL: hint_wfi
+; CHECK: wfi
+
+define void @hint_sev() {
+entry:
+ tail call void @llvm.aarch64.hint(i32 4) nounwind
+ ret void
+}
+
+; CHECK-LABEL: hint_sev
+; CHECK: sev
+
+define void @hint_sevl() {
+entry:
+ tail call void @llvm.aarch64.hint(i32 5) nounwind
+ ret void
+}
+
+; CHECK-LABEL: hint_sevl
+; CHECK: sevl
+
+define void @hint_undefined() {
+entry:
+ tail call void @llvm.aarch64.hint(i32 8) nounwind
+ ret void
+}
+
+; CHECK-LABEL: hint_undefined
+; CHECK: hint #0x8
+
diff --git a/test/CodeGen/AArch64/init-array.ll b/test/CodeGen/AArch64/init-array.ll
index f47b490..a275e7e 100644
--- a/test/CodeGen/AArch64/init-array.ll
+++ b/test/CodeGen/AArch64/init-array.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs -use-init-array -o - %s | FileCheck %s
-; RUN: llc -mtriple=aarch64-none-none-eabi -verify-machineinstrs -use-init-array -o - %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs -o - %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-none-none-eabi -verify-machineinstrs -o - %s | FileCheck %s
define internal void @_GLOBAL__I_a() section ".text.startup" {
ret void
diff --git a/test/CodeGen/AArch64/intrinsics-memory-barrier.ll b/test/CodeGen/AArch64/intrinsics-memory-barrier.ll
new file mode 100644
index 0000000..09e34ae
--- /dev/null
+++ b/test/CodeGen/AArch64/intrinsics-memory-barrier.ll
@@ -0,0 +1,57 @@
+; RUN: llc < %s -mtriple=aarch64-eabi -O=3 | FileCheck %s
+
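+; The i32 argument is the 4-bit barrier-option encoding from the ARMv8 ARM,
+; e.g. 15 = sy, 9 = ishld, 3 = osh, as reflected in the CHECK lines.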
+define void @test() {
+ ; CHECK: dmb sy
+ call void @llvm.aarch64.dmb(i32 15)
+ ; CHECK: dmb osh
+ call void @llvm.aarch64.dmb(i32 3)
+ ; CHECK: dsb sy
+ call void @llvm.aarch64.dsb(i32 15)
+ ; CHECK: dsb ishld
+ call void @llvm.aarch64.dsb(i32 9)
+ ; CHECK: isb
+ call void @llvm.aarch64.isb(i32 15)
+ ret void
+}
+
+; The important point is that the compiler should not reorder memory access
+; instructions around the DMB; if it did, the two STRs below would be
+; merged into a single STP.
+define void @test_dmb_reordering(i32 %a, i32 %b, i32* %d) {
+ store i32 %a, i32* %d ; CHECK: str {{w[0-9]+}}, [{{x[0-9]+}}]
+
+ call void @llvm.aarch64.dmb(i32 15); CHECK: dmb sy
+
+ %d1 = getelementptr i32* %d, i64 1
+ store i32 %b, i32* %d1 ; CHECK: str {{w[0-9]+}}, [{{x[0-9]+}}, #4]
+
+ ret void
+}
+
+; Similarly for DSB.
+define void @test_dsb_reordering(i32 %a, i32 %b, i32* %d) {
+ store i32 %a, i32* %d ; CHECK: str {{w[0-9]+}}, [{{x[0-9]+}}]
+
+ call void @llvm.aarch64.dsb(i32 15); CHECK: dsb sy
+
+ %d1 = getelementptr i32* %d, i64 1
+ store i32 %b, i32* %d1 ; CHECK: str {{w[0-9]+}}, [{{x[0-9]+}}, #4]
+
+ ret void
+}
+
+; And ISB.
+define void @test_isb_reordering(i32 %a, i32 %b, i32* %d) {
+ store i32 %a, i32* %d ; CHECK: str {{w[0-9]+}}, [{{x[0-9]+}}]
+
+ call void @llvm.aarch64.isb(i32 15); CHECK: isb
+
+ %d1 = getelementptr i32* %d, i64 1
+ store i32 %b, i32* %d1 ; CHECK: str {{w[0-9]+}}, [{{x[0-9]+}}, #4]
+
+ ret void
+}
+
+declare void @llvm.aarch64.dmb(i32)
+declare void @llvm.aarch64.dsb(i32)
+declare void @llvm.aarch64.isb(i32)
diff --git a/test/CodeGen/AArch64/jump-table.ll b/test/CodeGen/AArch64/jump-table.ll
index 69fbd99..16682e9 100644
--- a/test/CodeGen/AArch64/jump-table.ll
+++ b/test/CodeGen/AArch64/jump-table.ll
@@ -56,10 +56,11 @@ lbl4:
; CHECK-NEXT: .xword
; CHECK-PIC-NOT: .data_region
+; CHECK-PIC-NOT: .LJTI0_0
; CHECK-PIC: .LJTI0_0:
-; CHECK-PIC-NEXT: .word
-; CHECK-PIC-NEXT: .word
-; CHECK-PIC-NEXT: .word
-; CHECK-PIC-NEXT: .word
-; CHECK-PIC-NEXT: .word
+; CHECK-PIC-NEXT: .word .LBB{{.*}}-.LJTI0_0
+; CHECK-PIC-NEXT: .word .LBB{{.*}}-.LJTI0_0
+; CHECK-PIC-NEXT: .word .LBB{{.*}}-.LJTI0_0
+; CHECK-PIC-NEXT: .word .LBB{{.*}}-.LJTI0_0
+; CHECK-PIC-NEXT: .word .LBB{{.*}}-.LJTI0_0
; CHECK-PIC-NOT: .end_data_region
diff --git a/test/CodeGen/AArch64/legalize-bug-bogus-cpu.ll b/test/CodeGen/AArch64/legalize-bug-bogus-cpu.ll
new file mode 100644
index 0000000..b785a8f
--- /dev/null
+++ b/test/CodeGen/AArch64/legalize-bug-bogus-cpu.ll
@@ -0,0 +1,8 @@
+; RUN: llc -march=aarch64 -mcpu=bogus -o - %s
+
+; Regression test for PR20557: with -mcpu set to a bogus name, llc used to
+; crash during type legalization.
+define <4 x float> @fneg4(<4 x float> %x) {
+ %sub = fsub <4 x float> zeroinitializer, %x
+ ret <4 x float> %sub
+}
diff --git a/test/CodeGen/AArch64/machine_cse.ll b/test/CodeGen/AArch64/machine_cse.ll
new file mode 100644
index 0000000..bc9ab10
--- /dev/null
+++ b/test/CodeGen/AArch64/machine_cse.ll
@@ -0,0 +1,45 @@
+; RUN: llc < %s -mtriple=aarch64-linux-gnuabi -O2 | FileCheck %s
+
+; The globals are marked external to prevent possible optimizations.
+@a = external global i32
+@b = external global i32
+@c = external global i32
+@d = external global i32
+@e = external global i32
+
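+; The slt and sgt compares of %a and %e lower to the same SUBS, so
+; MachineCSE should leave a single cmp feeding both conditional branches.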
+define void @combine-sign-comparisons-by-cse(i32 *%arg) {
+; CHECK: cmp
+; CHECK: b.ge
+; CHECK-NOT: cmp
+; CHECK: b.le
+
+entry:
+ %a = load i32* @a, align 4
+ %b = load i32* @b, align 4
+ %c = load i32* @c, align 4
+ %d = load i32* @d, align 4
+ %e = load i32* @e, align 4
+
+ %cmp = icmp slt i32 %a, %e
+ br i1 %cmp, label %land.lhs.true, label %lor.lhs.false
+
+land.lhs.true:
+ %cmp1 = icmp eq i32 %b, %c
+ br i1 %cmp1, label %return, label %if.end
+
+lor.lhs.false:
+ %cmp2 = icmp sgt i32 %a, %e
+ br i1 %cmp2, label %land.lhs.true3, label %if.end
+
+land.lhs.true3:
+ %cmp4 = icmp eq i32 %b, %d
+ br i1 %cmp4, label %return, label %if.end
+
+if.end:
+ br label %return
+
+return:
+ %retval.0 = phi i32 [ 0, %if.end ], [ 1, %land.lhs.true3 ], [ 1, %land.lhs.true ]
+ store i32 %a, i32 *%arg
+ ret void
+}
diff --git a/test/CodeGen/AArch64/madd-combiner.ll b/test/CodeGen/AArch64/madd-combiner.ll
new file mode 100644
index 0000000..7c9787a
--- /dev/null
+++ b/test/CodeGen/AArch64/madd-combiner.ll
@@ -0,0 +1,37 @@
+; RUN: llc -mtriple=aarch64-apple-darwin -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-apple-darwin -fast-isel -verify-machineinstrs < %s | FileCheck %s
+
+; Test that we use the correct register class.
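+; madd/msub compute Ra + Rn*Rm and Ra - Rn*Rm, so the constant 4 must be
+; materialized into a W register first (the orr from wzr below).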
+define i32 @mul_add_imm(i32 %a, i32 %b) {
+; CHECK-LABEL: mul_add_imm
+; CHECK: orr [[REG:w[0-9]+]], wzr, #0x4
+; CHECK-NEXT: madd {{w[0-9]+}}, w0, w1, [[REG]]
+ %1 = mul i32 %a, %b
+ %2 = add i32 %1, 4
+ ret i32 %2
+}
+
+define i32 @mul_sub_imm1(i32 %a, i32 %b) {
+; CHECK-LABEL: mul_sub_imm1
+; CHECK: orr [[REG:w[0-9]+]], wzr, #0x4
+; CHECK-NEXT: msub {{w[0-9]+}}, w0, w1, [[REG]]
+ %1 = mul i32 %a, %b
+ %2 = sub i32 4, %1
+ ret i32 %2
+}
+
+; Bugpoint-reduced test case. This only checks that we pass the MI verifier.
+define void @mul_add_imm2() {
+entry:
+ br label %for.body
+for.body:
+ br i1 undef, label %for.body, label %for.body8
+for.body8:
+ %0 = mul i64 undef, -3
+ %mul1971 = add i64 %0, -3
+ %cmp7 = icmp slt i64 %mul1971, 1390451930000
+ br i1 %cmp7, label %for.body8, label %for.end20
+for.end20:
+ ret void
+}
+
diff --git a/test/CodeGen/AArch64/madd-lohi.ll b/test/CodeGen/AArch64/madd-lohi.ll
new file mode 100644
index 0000000..550a8cb
--- /dev/null
+++ b/test/CodeGen/AArch64/madd-lohi.ll
@@ -0,0 +1,19 @@
+; RUN: llc -mtriple=arm64-apple-ios7.0 %s -o - | FileCheck %s
+; RUN: llc -mtriple=aarch64_be-linux-gnu %s -o - | FileCheck --check-prefix=CHECK-BE %s
+
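+; Splitting a 128-bit multiply into 64-bit pieces gives
+;   lo = lo(lhs.lo * rhs.lo)
+;   hi = umulh(lhs.lo, rhs.lo) + lhs.lo * rhs.hi + lhs.hi * rhs.lo
+; which is the umulh plus two madds checked below.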
+define i128 @test_128bitmul(i128 %lhs, i128 %rhs) {
+; CHECK-LABEL: test_128bitmul:
+; CHECK-DAG: umulh [[CARRY:x[0-9]+]], x0, x2
+; CHECK-DAG: madd [[PART1:x[0-9]+]], x0, x3, [[CARRY]]
+; CHECK: madd x1, x1, x2, [[PART1]]
+; CHECK: mul x0, x0, x2
+
+; CHECK-BE-LABEL: test_128bitmul:
+; CHECK-BE-DAG: umulh [[CARRY:x[0-9]+]], x1, x3
+; CHECK-BE-DAG: madd [[PART1:x[0-9]+]], x1, x2, [[CARRY]]
+; CHECK-BE: madd x0, x0, x3, [[PART1]]
+; CHECK-BE: mul x1, x1, x3
+
+ %prod = mul i128 %lhs, %rhs
+ ret i128 %prod
+}
diff --git a/test/CodeGen/AArch64/mul-lohi.ll b/test/CodeGen/AArch64/mul-lohi.ll
index 0689fbd..4515697 100644
--- a/test/CodeGen/AArch64/mul-lohi.ll
+++ b/test/CodeGen/AArch64/mul-lohi.ll
@@ -1,17 +1,16 @@
-; RUN: llc -mtriple=arm64-apple-ios7.0 %s -o - | FileCheck %s
-; RUN: llc -mtriple=arm64_be-linux-gnu %s -o - | FileCheck --check-prefix=CHECK-BE %s
-
+; RUN: llc -mtriple=arm64-apple-ios7.0 -mcpu=cyclone %s -o - | FileCheck %s
+; RUN: llc -mtriple=aarch64_be-linux-gnu -mcpu=cyclone %s -o - | FileCheck --check-prefix=CHECK-BE %s
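+
+; With the Cyclone scheduling model the machine combiner is expected to
+; split the madd chain into independent muls (the adds themselves are not
+; checked), presumably to shorten the critical path.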
define i128 @test_128bitmul(i128 %lhs, i128 %rhs) {
; CHECK-LABEL: test_128bitmul:
+; CHECK-DAG: mul [[PART1:x[0-9]+]], x0, x3
; CHECK-DAG: umulh [[CARRY:x[0-9]+]], x0, x2
-; CHECK-DAG: madd [[PART1:x[0-9]+]], x0, x3, [[CARRY]]
-; CHECK: madd x1, x1, x2, [[PART1]]
+; CHECK: mul [[PART2:x[0-9]+]], x1, x2
; CHECK: mul x0, x0, x2
; CHECK-BE-LABEL: test_128bitmul:
+; CHECK-BE-DAG: mul [[PART1:x[0-9]+]], x1, x2
; CHECK-BE-DAG: umulh [[CARRY:x[0-9]+]], x1, x3
-; CHECK-BE-DAG: madd [[PART1:x[0-9]+]], x1, x2, [[CARRY]]
-; CHECK-BE: madd x0, x0, x3, [[PART1]]
+; CHECK-BE: mul [[PART2:x[0-9]+]], x0, x3
; CHECK-BE: mul x1, x1, x3
%prod = mul i128 %lhs, %rhs
diff --git a/test/CodeGen/AArch64/neon-perm.ll b/test/CodeGen/AArch64/neon-perm.ll
index 4f8571d..41e391d 100644
--- a/test/CodeGen/AArch64/neon-perm.ll
+++ b/test/CodeGen/AArch64/neon-perm.ll
@@ -1387,6 +1387,13 @@ entry:
ret <8 x i16> %shuffle.i
}
+define <4 x i8> @test_vzip1_v4i8(<8 x i8> %p) {
+; CHECK-LABEL: test_vzip1_v4i8:
+; CHECK: zip1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+ %lo = shufflevector <8 x i8> %p, <8 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ ret <4 x i8> %lo
+}
+
define <8 x i8> @test_same_vzip2_s8(<8 x i8> %a) {
; CHECK-LABEL: test_same_vzip2_s8:
; CHECK: zip2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
diff --git a/test/CodeGen/AArch64/neon-scalar-copy.ll b/test/CodeGen/AArch64/neon-scalar-copy.ll
index a01df32..6afac31 100644
--- a/test/CodeGen/AArch64/neon-scalar-copy.ll
+++ b/test/CodeGen/AArch64/neon-scalar-copy.ll
@@ -101,3 +101,20 @@ define <1 x i64> @test_vector_copy_dup_dv2D(<1 x i64> %a, <2 x i64> %c) {
ret <1 x i64> %vset_lane
}
+; Undefined behaviour, so we really don't care what actually gets emitted, just
+; as long as we don't crash (since it could be dynamically unreachable).
+define i32 @test_out_of_range_extract(<4 x i32> %vec) {
+; CHECK-LABEL: test_out_of_range_extract:
+; CHECK: ret
+ %elt = extractelement <4 x i32> %vec, i32 4
+ ret i32 %elt
+}
+
+; Undefined behaviour, so we really don't care what actually gets emitted, just
+; as long as we don't crash (since it could be dynamically unreachable).
+define void @test_out_of_range_insert(<4 x i32> %vec, i32 %elt) {
+; CHECK-LABEL: test_out_of_range_insert:
+; CHECK: ret
+ insertelement <4 x i32> %vec, i32 %elt, i32 4
+ ret void
+}
diff --git a/test/CodeGen/AArch64/paired-load.ll b/test/CodeGen/AArch64/paired-load.ll
new file mode 100644
index 0000000..3dddb9e
--- /dev/null
+++ b/test/CodeGen/AArch64/paired-load.ll
@@ -0,0 +1,16 @@
+; RUN: llc < %s | FileCheck %s
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-linux-gnu"
+
+; Ensure we're generating ldp instructions instead of ldr Q.
+; CHECK: ldp
+; CHECK: stp
+define void @f(i64* %p, i64* %q) {
+ %addr2 = getelementptr i64* %q, i32 1
+ %addr = getelementptr i64* %p, i32 1
+ %x = load i64* %p
+ %y = load i64* %addr
+ store i64 %x, i64* %q
+ store i64 %y, i64* %addr2
+ ret void
+}
diff --git a/test/CodeGen/AArch64/pic-eh-stubs.ll b/test/CodeGen/AArch64/pic-eh-stubs.ll
index e8c7625..93ee0e6 100644
--- a/test/CodeGen/AArch64/pic-eh-stubs.ll
+++ b/test/CodeGen/AArch64/pic-eh-stubs.ll
@@ -1,5 +1,5 @@
; RUN: llc -mtriple=aarch64-none-linux-gnu -relocation-model=pic -o - %s | FileCheck %s
-; RUN: llc -mtriple=arm64_be-none-linux-gnu -relocation-model=pic -o - %s | FileCheck %s
+; RUN: llc -mtriple=aarch64_be-none-linux-gnu -relocation-model=pic -o - %s | FileCheck %s
; Make sure exception-handling PIC code can be linked correctly. An alternative
; to the sequence described below would have .gcc_except_table itself writable
diff --git a/test/CodeGen/AArch64/postra-mi-sched.ll b/test/CodeGen/AArch64/postra-mi-sched.ll
new file mode 100644
index 0000000..5a40724
--- /dev/null
+++ b/test/CodeGen/AArch64/postra-mi-sched.ll
@@ -0,0 +1,31 @@
+; RUN: llc < %s -O3 -march=aarch64 -mcpu=cortex-a53 | FileCheck %s
+
+; With cortex-a53, fmul and fcvt each have a latency of 6 cycles. After the
+; pre-RA MI scheduler, the fmul, fcvt and fdiv will be consecutive. The
+; top-down post-RA MI scheduler will clean this up.
+
+@d1 = common global double 0.000000e+00, align 8
+
+define i32 @test1(float %s2, float %s3, double %d, i32 %i2, i32 %i3) {
+entry:
+; CHECK-LABEL: @test1
+; CHECK: fmul
+; CHECK-NEXT: add
+; CHECK: fcvt
+; CHECK-NEXT: mul
+ %mul = fmul float %s2, %s3
+ %conv = fpext float %mul to double
+ %div = fdiv double %d, %conv
+ store double %div, double* @d1, align 8
+ %factor = shl i32 %i3, 1
+ %add1 = add i32 %i2, 4
+ %add2 = add i32 %add1, %factor
+ %add3 = add nsw i32 %add2, %i2
+ %add4 = add nsw i32 %add3, %add2
+ %mul5 = mul i32 %add3, %add3
+ %mul6 = mul i32 %mul5, %add4
+ %mul7 = shl i32 %add4, 1
+ %factor18 = mul i32 %mul7, %mul6
+ %add9 = add i32 %factor18, %mul6
+ ret i32 %add9
+}
diff --git a/test/CodeGen/AArch64/rbit.ll b/test/CodeGen/AArch64/rbit.ll
new file mode 100644
index 0000000..3404ae4
--- /dev/null
+++ b/test/CodeGen/AArch64/rbit.ll
@@ -0,0 +1,20 @@
+; RUN: llc -mtriple=aarch64-eabi %s -o - | FileCheck %s
+
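+; llvm.aarch64.rbit reverses the bit order of its operand and selects the
+; single RBIT instruction.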
+; CHECK-LABEL: rbit32
+; CHECK: rbit w0, w0
+define i32 @rbit32(i32 %t) {
+entry:
+ %rbit.i = call i32 @llvm.aarch64.rbit.i32(i32 %t)
+ ret i32 %rbit.i
+}
+
+; CHECK-LABEL: rbit64
+; CHECK: rbit x0, x0
+define i64 @rbit64(i64 %t) {
+entry:
+ %rbit.i = call i64 @llvm.aarch64.rbit.i64(i64 %t)
+ ret i64 %rbit.i
+}
+
+declare i64 @llvm.aarch64.rbit.i64(i64)
+declare i32 @llvm.aarch64.rbit.i32(i32)
diff --git a/test/CodeGen/AArch64/rm_redundant_cmp.ll b/test/CodeGen/AArch64/rm_redundant_cmp.ll
new file mode 100644
index 0000000..36dc118
--- /dev/null
+++ b/test/CodeGen/AArch64/rm_redundant_cmp.ll
@@ -0,0 +1,254 @@
+; RUN: llc < %s -mtriple=aarch64-linux-gnuabi -O2 | FileCheck %s
+
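+; When both branches compare the same two operands, the second cmp is
+; redundant: its branch can reuse the NZCV flags set by the first cmp
+; (e.g. sgt then eq needs just one cmp followed by b.gt and b.ne).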
+; The following cases are for i16
+
+%struct.s_signed_i16 = type { i16, i16, i16 }
+%struct.s_unsigned_i16 = type { i16, i16, i16 }
+
+@cost_s_i8_i16 = common global %struct.s_signed_i16 zeroinitializer, align 2
+@cost_u_i16 = common global %struct.s_unsigned_i16 zeroinitializer, align 2
+
+define void @test_i16_2cmp_signed_1() {
+; CHECK-LABEL: test_i16_2cmp_signed_1
+; CHECK: cmp {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK-NEXT: b.gt
+; CHECK-NOT: cmp
+; CHECK: b.ne
+entry:
+ %0 = load i16* getelementptr inbounds (%struct.s_signed_i16* @cost_s_i8_i16, i64 0, i32 1), align 2
+ %1 = load i16* getelementptr inbounds (%struct.s_signed_i16* @cost_s_i8_i16, i64 0, i32 2), align 2
+ %cmp = icmp sgt i16 %0, %1
+ br i1 %cmp, label %if.then, label %if.else
+
+if.then: ; preds = %entry
+ store i16 %0, i16* getelementptr inbounds (%struct.s_signed_i16* @cost_s_i8_i16, i64 0, i32 0), align 2
+ br label %if.end8
+
+if.else: ; preds = %entry
+ %cmp5 = icmp eq i16 %0, %1
+ br i1 %cmp5, label %if.then7, label %if.end8
+
+if.then7: ; preds = %if.else
+ store i16 %0, i16* getelementptr inbounds (%struct.s_signed_i16* @cost_s_i8_i16, i64 0, i32 0), align 2
+ br label %if.end8
+
+if.end8: ; preds = %if.else, %if.then7, %if.then
+ ret void
+}
+
+define void @test_i16_2cmp_signed_2() {
+; CHECK-LABEL: test_i16_2cmp_signed_2
+; CHECK: cmp {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK-NEXT: b.le
+; CHECK-NOT: cmp
+; CHECK: b.ge
+entry:
+ %0 = load i16* getelementptr inbounds (%struct.s_signed_i16* @cost_s_i8_i16, i64 0, i32 1), align 2
+ %1 = load i16* getelementptr inbounds (%struct.s_signed_i16* @cost_s_i8_i16, i64 0, i32 2), align 2
+ %cmp = icmp sgt i16 %0, %1
+ br i1 %cmp, label %if.then, label %if.else
+
+if.then: ; preds = %entry
+ store i16 %0, i16* getelementptr inbounds (%struct.s_signed_i16* @cost_s_i8_i16, i64 0, i32 0), align 2
+ br label %if.end8
+
+if.else: ; preds = %entry
+ %cmp5 = icmp slt i16 %0, %1
+ br i1 %cmp5, label %if.then7, label %if.end8
+
+if.then7: ; preds = %if.else
+ store i16 %1, i16* getelementptr inbounds (%struct.s_signed_i16* @cost_s_i8_i16, i64 0, i32 0), align 2
+ br label %if.end8
+
+if.end8: ; preds = %if.else, %if.then7, %if.then
+ ret void
+}
+
+define void @test_i16_2cmp_unsigned_1() {
+; CHECK-LABEL: test_i16_2cmp_unsigned_1
+; CHECK: cmp {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK-NEXT: b.hi
+; CHECK-NOT: cmp
+; CHECK: b.ne
+entry:
+ %0 = load i16* getelementptr inbounds (%struct.s_unsigned_i16* @cost_u_i16, i64 0, i32 1), align 2
+ %1 = load i16* getelementptr inbounds (%struct.s_unsigned_i16* @cost_u_i16, i64 0, i32 2), align 2
+ %cmp = icmp ugt i16 %0, %1
+ br i1 %cmp, label %if.then, label %if.else
+
+if.then: ; preds = %entry
+ store i16 %0, i16* getelementptr inbounds (%struct.s_unsigned_i16* @cost_u_i16, i64 0, i32 0), align 2
+ br label %if.end8
+
+if.else: ; preds = %entry
+ %cmp5 = icmp eq i16 %0, %1
+ br i1 %cmp5, label %if.then7, label %if.end8
+
+if.then7: ; preds = %if.else
+ store i16 %0, i16* getelementptr inbounds (%struct.s_unsigned_i16* @cost_u_i16, i64 0, i32 0), align 2
+ br label %if.end8
+
+if.end8: ; preds = %if.else, %if.then7, %if.then
+ ret void
+}
+
+define void @test_i16_2cmp_unsigned_2() {
+; CHECK-LABEL: test_i16_2cmp_unsigned_2
+; CHECK: cmp {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK-NEXT: b.ls
+; CHECK-NOT: cmp
+; CHECK: b.hs
+entry:
+ %0 = load i16* getelementptr inbounds (%struct.s_unsigned_i16* @cost_u_i16, i64 0, i32 1), align 2
+ %1 = load i16* getelementptr inbounds (%struct.s_unsigned_i16* @cost_u_i16, i64 0, i32 2), align 2
+ %cmp = icmp ugt i16 %0, %1
+ br i1 %cmp, label %if.then, label %if.else
+
+if.then: ; preds = %entry
+ store i16 %0, i16* getelementptr inbounds (%struct.s_unsigned_i16* @cost_u_i16, i64 0, i32 0), align 2
+ br label %if.end8
+
+if.else: ; preds = %entry
+ %cmp5 = icmp ult i16 %0, %1
+ br i1 %cmp5, label %if.then7, label %if.end8
+
+if.then7: ; preds = %if.else
+ store i16 %1, i16* getelementptr inbounds (%struct.s_unsigned_i16* @cost_u_i16, i64 0, i32 0), align 2
+ br label %if.end8
+
+if.end8: ; preds = %if.else, %if.then7, %if.then
+ ret void
+}
+
+; The following cases are for i8
+
+%struct.s_signed_i8 = type { i8, i8, i8 }
+%struct.s_unsigned_i8 = type { i8, i8, i8 }
+
+@cost_s = common global %struct.s_signed_i8 zeroinitializer, align 2
+@cost_u_i8 = common global %struct.s_unsigned_i8 zeroinitializer, align 2
+
+
+define void @test_i8_2cmp_signed_1() {
+; CHECK-LABEL: test_i8_2cmp_signed_1
+; CHECK: cmp {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK-NEXT: b.gt
+; CHECK-NOT: cmp
+; CHECK: b.ne
+entry:
+ %0 = load i8* getelementptr inbounds (%struct.s_signed_i8* @cost_s, i64 0, i32 1), align 2
+ %1 = load i8* getelementptr inbounds (%struct.s_signed_i8* @cost_s, i64 0, i32 2), align 2
+ %cmp = icmp sgt i8 %0, %1
+ br i1 %cmp, label %if.then, label %if.else
+
+if.then: ; preds = %entry
+ store i8 %0, i8* getelementptr inbounds (%struct.s_signed_i8* @cost_s, i64 0, i32 0), align 2
+ br label %if.end8
+
+if.else: ; preds = %entry
+ %cmp5 = icmp eq i8 %0, %1
+ br i1 %cmp5, label %if.then7, label %if.end8
+
+if.then7: ; preds = %if.else
+ store i8 %0, i8* getelementptr inbounds (%struct.s_signed_i8* @cost_s, i64 0, i32 0), align 2
+ br label %if.end8
+
+if.end8: ; preds = %if.else, %if.then7, %if.then
+ ret void
+}
+
+define void @test_i8_2cmp_signed_2() {
+; CHECK-LABEL: test_i8_2cmp_signed_2
+; CHECK: cmp {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK-NEXT: b.le
+; CHECK-NOT: cmp
+; CHECK: b.ge
+entry:
+ %0 = load i8* getelementptr inbounds (%struct.s_signed_i8* @cost_s, i64 0, i32 1), align 2
+ %1 = load i8* getelementptr inbounds (%struct.s_signed_i8* @cost_s, i64 0, i32 2), align 2
+ %cmp = icmp sgt i8 %0, %1
+ br i1 %cmp, label %if.then, label %if.else
+
+if.then: ; preds = %entry
+ store i8 %0, i8* getelementptr inbounds (%struct.s_signed_i8* @cost_s, i64 0, i32 0), align 2
+ br label %if.end8
+
+if.else: ; preds = %entry
+ %cmp5 = icmp slt i8 %0, %1
+ br i1 %cmp5, label %if.then7, label %if.end8
+
+if.then7: ; preds = %if.else
+ store i8 %1, i8* getelementptr inbounds (%struct.s_signed_i8* @cost_s, i64 0, i32 0), align 2
+ br label %if.end8
+
+if.end8: ; preds = %if.else, %if.then7, %if.then
+ ret void
+}
+
+define void @test_i8_2cmp_unsigned_1() {
+; CHECK-LABEL: test_i8_2cmp_unsigned_1
+; CHECK: cmp {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK-NEXT: b.hi
+; CHECK-NOT: cmp
+; CHECK: b.ne
+entry:
+ %0 = load i8* getelementptr inbounds (%struct.s_unsigned_i8* @cost_u_i8, i64 0, i32 1), align 2
+ %1 = load i8* getelementptr inbounds (%struct.s_unsigned_i8* @cost_u_i8, i64 0, i32 2), align 2
+ %cmp = icmp ugt i8 %0, %1
+ br i1 %cmp, label %if.then, label %if.else
+
+if.then: ; preds = %entry
+ store i8 %0, i8* getelementptr inbounds (%struct.s_unsigned_i8* @cost_u_i8, i64 0, i32 0), align 2
+ br label %if.end8
+
+if.else: ; preds = %entry
+ %cmp5 = icmp eq i8 %0, %1
+ br i1 %cmp5, label %if.then7, label %if.end8
+
+if.then7: ; preds = %if.else
+ store i8 %0, i8* getelementptr inbounds (%struct.s_unsigned_i8* @cost_u_i8, i64 0, i32 0), align 2
+ br label %if.end8
+
+if.end8: ; preds = %if.else, %if.then7, %if.then
+ ret void
+}
+
+define void @test_i8_2cmp_unsigned_2() {
+; CHECK-LABEL: test_i8_2cmp_unsigned_2
+; CHECK: cmp {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK-NEXT: b.ls
+; CHECK-NOT: cmp
+; CHECK: b.hs
+entry:
+ %0 = load i8* getelementptr inbounds (%struct.s_unsigned_i8* @cost_u_i8, i64 0, i32 1), align 2
+ %1 = load i8* getelementptr inbounds (%struct.s_unsigned_i8* @cost_u_i8, i64 0, i32 2), align 2
+ %cmp = icmp ugt i8 %0, %1
+ br i1 %cmp, label %if.then, label %if.else
+
+if.then: ; preds = %entry
+ store i8 %0, i8* getelementptr inbounds (%struct.s_unsigned_i8* @cost_u_i8, i64 0, i32 0), align 2
+ br label %if.end8
+
+if.else: ; preds = %entry
+ %cmp5 = icmp ult i8 %0, %1
+ br i1 %cmp5, label %if.then7, label %if.end8
+
+if.then7: ; preds = %if.else
+ store i8 %1, i8* getelementptr inbounds (%struct.s_unsigned_i8* @cost_u_i8, i64 0, i32 0), align 2
+ br label %if.end8
+
+if.end8: ; preds = %if.else, %if.then7, %if.then
+ ret void
+}
+
+; Make sure the case below does not crash.
+
+; The ZERO_EXTEND/SIGN_EXTEND optimization in the type legalization stage
+; must not assume that the operand of a SETCC is always a TRUNCATE.
+
+define i1 @foo(float %inl, float %inr) {
+ %lval = fptosi float %inl to i8
+ %rval = fptosi float %inr to i8
+ %sum = icmp eq i8 %lval, %rval
+ ret i1 %sum
+}
diff --git a/test/CodeGen/AArch64/sdivpow2.ll b/test/CodeGen/AArch64/sdivpow2.ll
new file mode 100644
index 0000000..6c02ea9
--- /dev/null
+++ b/test/CodeGen/AArch64/sdivpow2.ll
@@ -0,0 +1,74 @@
+; RUN: llc -mtriple=arm64-linux-gnu -fast-isel=0 -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=arm64-linux-gnu -fast-isel=1 -verify-machineinstrs < %s | FileCheck %s
+
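+; Signed division rounds towards zero, so sdiv by 2^k cannot be a plain
+; arithmetic shift (which rounds towards -inf). Instead:
+;   sdiv x, 2^k  ==  (x < 0 ? x + (2^k - 1) : x) asr k
+; hence the add/cmp/csel/asr sequences checked below.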
+define i32 @test1(i32 %x) {
+; CHECK-LABEL: test1
+; CHECK: add w8, w0, #7
+; CHECK: cmp w0, #0
+; CHECK: csel w8, w8, w0, lt
+; CHECK: asr w0, w8, #3
+ %div = sdiv i32 %x, 8
+ ret i32 %div
+}
+
+define i32 @test2(i32 %x) {
+; CHECK-LABEL: test2
+; CHECK: add w8, w0, #7
+; CHECK: cmp w0, #0
+; CHECK: csel w8, w8, w0, lt
+; CHECK: neg w0, w8, asr #3
+ %div = sdiv i32 %x, -8
+ ret i32 %div
+}
+
+define i32 @test3(i32 %x) {
+; CHECK-LABEL: test3
+; CHECK: add w8, w0, #31
+; CHECK: cmp w0, #0
+; CHECK: csel w8, w8, w0, lt
+; CHECK: asr w0, w8, #5
+ %div = sdiv i32 %x, 32
+ ret i32 %div
+}
+
+define i64 @test4(i64 %x) {
+; CHECK-LABEL: test4
+; CHECK: add x8, x0, #7
+; CHECK: cmp x0, #0
+; CHECK: csel x8, x8, x0, lt
+; CHECK: asr x0, x8, #3
+ %div = sdiv i64 %x, 8
+ ret i64 %div
+}
+
+define i64 @test5(i64 %x) {
+; CHECK-LABEL: test5
+; CHECK: add x8, x0, #7
+; CHECK: cmp x0, #0
+; CHECK: csel x8, x8, x0, lt
+; CHECK: neg x0, x8, asr #3
+ %div = sdiv i64 %x, -8
+ ret i64 %div
+}
+
+define i64 @test6(i64 %x) {
+; CHECK-LABEL: test6
+; CHECK: add x8, x0, #63
+; CHECK: cmp x0, #0
+; CHECK: csel x8, x8, x0, lt
+; CHECK: asr x0, x8, #6
+ %div = sdiv i64 %x, 64
+ ret i64 %div
+}
+
+define i64 @test7(i64 %x) {
+; CHECK-LABEL: test7
+; CHECK: orr [[REG:x[0-9]+]], xzr, #0xffffffffffff
+; CHECK: add x8, x0, [[REG]]
+; CHECK: cmp x0, #0
+; CHECK: csel x8, x8, x0, lt
+; CHECK: asr x0, x8, #48
+ %div = sdiv i64 %x, 281474976710656
+ ret i64 %div
+}
+
diff --git a/test/CodeGen/AArch64/stack-guard-remat-bitcast.ll b/test/CodeGen/AArch64/stack-guard-remat-bitcast.ll
new file mode 100644
index 0000000..bedbf5f
--- /dev/null
+++ b/test/CodeGen/AArch64/stack-guard-remat-bitcast.ll
@@ -0,0 +1,26 @@
+; RUN: llc < %s -mtriple=arm64-apple-ios -relocation-model=pic | FileCheck %s
+
+@__stack_chk_guard = external global i64*
+
+; PR20558
+
+; CHECK: adrp [[R0:x[0-9]+]], ___stack_chk_guard@GOTPAGE
+; CHECK: ldr [[R1:x[0-9]+]], {{\[}}[[R0]], ___stack_chk_guard@GOTPAGEOFF{{\]}}
+; CHECK: ldr [[R2:x[0-9]+]], {{\[}}[[R1]]{{\]}}
+; CHECK: stur [[R2]], {{\[}}x29, [[SLOT0:[0-9#\-]+]]{{\]}}
+; CHECK: ldur [[R3:x[0-9]+]], {{\[}}x29, [[SLOT0]]{{\]}}
+; CHECK: sub [[R4:x[0-9]+]], [[R2]], [[R3]]
+; CHECK: cbnz [[R4]], LBB
+
+define i32 @test_stack_guard_remat2() {
+entry:
+ %StackGuardSlot = alloca i8*
+ %StackGuard = load i8** bitcast (i64** @__stack_chk_guard to i8**)
+ call void @llvm.stackprotector(i8* %StackGuard, i8** %StackGuardSlot)
+ %container = alloca [32 x i8], align 1
+ call void @llvm.stackprotectorcheck(i8** bitcast (i64** @__stack_chk_guard to i8**))
+ ret i32 -1
+}
+
+declare void @llvm.stackprotector(i8*, i8**)
+declare void @llvm.stackprotectorcheck(i8**)
diff --git a/test/CodeGen/AArch64/stack_guard_remat.ll b/test/CodeGen/AArch64/stack_guard_remat.ll
new file mode 100644
index 0000000..cee7266
--- /dev/null
+++ b/test/CodeGen/AArch64/stack_guard_remat.ll
@@ -0,0 +1,48 @@
+; RUN: llc < %s -mtriple=arm64-apple-ios -relocation-model=pic -no-integrated-as | FileCheck %s -check-prefix=DARWIN
+; RUN: llc < %s -mtriple=arm64-apple-ios -relocation-model=static -no-integrated-as | FileCheck %s -check-prefix=DARWIN
+; RUN: llc < %s -mtriple=aarch64-linux-gnu -relocation-model=pic -no-integrated-as | FileCheck %s -check-prefix=PIC-LINUX
+; RUN: llc < %s -mtriple=aarch64-linux-gnu -relocation-model=static -code-model=large -no-integrated-as | FileCheck %s -check-prefix=STATIC-LARGE
+; RUN: llc < %s -mtriple=aarch64-linux-gnu -relocation-model=static -code-model=small -no-integrated-as | FileCheck %s -check-prefix=STATIC-SMALL
+
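+; The inline asm below clobbers every general-purpose register, so the
+; stack guard address cannot stay live in a register and its computation
+; must be rematerialized in the per-model forms checked here.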
+; DARWIN: foo2
+; DARWIN: adrp [[R0:x[0-9]+]], ___stack_chk_guard@GOTPAGE
+; DARWIN: ldr [[R1:x[0-9]+]], {{\[}}[[R0]], ___stack_chk_guard@GOTPAGEOFF{{\]}}
+; DARWIN: ldr {{x[0-9]+}}, {{\[}}[[R1]]{{\]}}
+
+; PIC-LINUX: foo2
+; PIC-LINUX: adrp [[R0:x[0-9]+]], :got:__stack_chk_guard
+; PIC-LINUX: ldr [[R1:x[0-9]+]], {{\[}}[[R0]], :got_lo12:__stack_chk_guard{{\]}}
+; PIC-LINUX: ldr {{x[0-9]+}}, {{\[}}[[R1]]{{\]}}
+
+; STATIC-LARGE: foo2
+; STATIC-LARGE: movz [[R0:x[0-9]+]], #:abs_g3:__stack_chk_guard
+; STATIC-LARGE: movk [[R0]], #:abs_g2_nc:__stack_chk_guard
+; STATIC-LARGE: movk [[R0]], #:abs_g1_nc:__stack_chk_guard
+; STATIC-LARGE: movk [[R0]], #:abs_g0_nc:__stack_chk_guard
+; STATIC-LARGE: ldr {{x[0-9]+}}, {{\[}}[[R0]]{{\]}}
+
+; STATIC-SMALL: foo2
+; STATIC-SMALL: adrp [[R0:x[0-9]+]], __stack_chk_guard
+; STATIC-SMALL: ldr {{x[0-9]+}}, {{\[}}[[R0]], :lo12:__stack_chk_guard{{\]}}
+
+define i32 @test_stack_guard_remat() #0 {
+entry:
+ %a1 = alloca [256 x i32], align 4
+ %0 = bitcast [256 x i32]* %a1 to i8*
+ call void @llvm.lifetime.start(i64 1024, i8* %0)
+ %arraydecay = getelementptr inbounds [256 x i32]* %a1, i64 0, i64 0
+ call void @foo3(i32* %arraydecay)
+ call void asm sideeffect "foo2", "~{w0},~{w1},~{w2},~{w3},~{w4},~{w5},~{w6},~{w7},~{w8},~{w9},~{w10},~{w11},~{w12},~{w13},~{w14},~{w15},~{w16},~{w17},~{w18},~{w19},~{w20},~{w21},~{w22},~{w23},~{w24},~{w25},~{w26},~{w27},~{w28},~{w29},~{w30}"()
+ call void @llvm.lifetime.end(i64 1024, i8* %0)
+ ret i32 0
+}
+
+; Function Attrs: nounwind
+declare void @llvm.lifetime.start(i64, i8* nocapture)
+
+declare void @foo3(i32*)
+
+; Function Attrs: nounwind
+declare void @llvm.lifetime.end(i64, i8* nocapture)
+
+attributes #0 = { nounwind sspstrong "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/test/CodeGen/AArch64/tail-call.ll b/test/CodeGen/AArch64/tail-call.ll
index 8aab842..7fb3954 100644
--- a/test/CodeGen/AArch64/tail-call.ll
+++ b/test/CodeGen/AArch64/tail-call.ll
@@ -3,6 +3,7 @@
declare fastcc void @callee_stack0()
declare fastcc void @callee_stack8([8 x i32], i64)
declare fastcc void @callee_stack16([8 x i32], i64, i64)
+declare extern_weak fastcc void @callee_weak()
define fastcc void @caller_to0_from0() nounwind {
; CHECK-LABEL: caller_to0_from0:
@@ -92,3 +93,13 @@ define fastcc void @caller_to16_from16([8 x i32], i64 %a, i64 %b) {
; CHECK-NEXT: add sp, sp, #16
; CHECK-NEXT: b callee_stack16
}
+
+
+; Weakly-referenced extern functions cannot be tail-called, as AAELF does
+; not define the behaviour of branch instructions to undefined weak symbols.
+define fastcc void @caller_weak() {
+; CHECK-LABEL: caller_weak:
+; CHECK: bl callee_weak
+ tail call void @callee_weak()
+ ret void
+}
diff --git a/test/CodeGen/AArch64/tailcall-fastisel.ll b/test/CodeGen/AArch64/tailcall-fastisel.ll
new file mode 100644
index 0000000..3ba6391
--- /dev/null
+++ b/test/CodeGen/AArch64/tailcall-fastisel.ll
@@ -0,0 +1,11 @@
+; RUN: llc < %s -mtriple=arm64-apple-darwin -O0 | FileCheck %s
+
+; CHECK: b _foo0
+
+define i32 @foo1() {
+entry:
+ %call = tail call i32 @foo0()
+ ret i32 %call
+}
+
+declare i32 @foo0()
diff --git a/test/CodeGen/AArch64/tbz-tbnz.ll b/test/CodeGen/AArch64/tbz-tbnz.ll
new file mode 100644
index 0000000..c77043c
--- /dev/null
+++ b/test/CodeGen/AArch64/tbz-tbnz.ll
@@ -0,0 +1,258 @@
+; RUN: llc -O1 -march=aarch64 < %s | FileCheck %s
+
+declare void @t()
+
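+; A signed comparison against zero only inspects the sign bit, so these
+; branches can use tbz/tbnz on bit 31 (bit 63 for i64) instead of a cmp
+; followed by a conditional branch.
+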
+define void @test1(i32 %a) {
+; CHECK-LABEL: @test1
+entry:
+ %sub = add nsw i32 %a, -12
+ %cmp = icmp slt i32 %sub, 0
+ br i1 %cmp, label %if.then, label %if.end
+
+; CHECK: sub [[CMP:w[0-9]+]], w0, #12
+; CHECK: tbz [[CMP]], #31
+
+if.then:
+ call void @t()
+ br label %if.end
+
+if.end:
+ ret void
+}
+
+define void @test2(i64 %a) {
+; CHECK-LABEL: @test2
+entry:
+ %sub = add nsw i64 %a, -12
+ %cmp = icmp slt i64 %sub, 0
+ br i1 %cmp, label %if.then, label %if.end
+
+; CHECK: sub [[CMP:x[0-9]+]], x0, #12
+; CHECK: tbz [[CMP]], #63
+
+if.then:
+ call void @t()
+ br label %if.end
+
+if.end:
+ ret void
+}
+
+define void @test3(i32 %a) {
+; CHECK-LABEL: @test3
+entry:
+ %sub = add nsw i32 %a, -12
+ %cmp = icmp sgt i32 %sub, -1
+ br i1 %cmp, label %if.then, label %if.end
+
+; CHECK: sub [[CMP:w[0-9]+]], w0, #12
+; CHECK: tbnz [[CMP]], #31
+
+if.then:
+ call void @t()
+ br label %if.end
+
+if.end:
+ ret void
+}
+
+define void @test4(i64 %a) {
+; CHECK-LABEL: @test4
+entry:
+ %sub = add nsw i64 %a, -12
+ %cmp = icmp sgt i64 %sub, -1
+ br i1 %cmp, label %if.then, label %if.end
+
+; CHECK: sub [[CMP:x[0-9]+]], x0, #12
+; CHECK: tbnz [[CMP]], #63
+
+if.then:
+ call void @t()
+ br label %if.end
+
+if.end:
+ ret void
+}
+
+define void @test5(i32 %a) {
+; CHECK-LABEL: @test5
+entry:
+ %sub = add nsw i32 %a, -12
+ %cmp = icmp sge i32 %sub, 0
+ br i1 %cmp, label %if.then, label %if.end
+
+; CHECK: sub [[CMP:w[0-9]+]], w0, #12
+; CHECK: tbnz [[CMP]], #31
+
+if.then:
+ call void @t()
+ br label %if.end
+
+if.end:
+ ret void
+}
+
+define void @test6(i64 %a) {
+; CHECK-LABEL: @test6
+entry:
+ %sub = add nsw i64 %a, -12
+ %cmp = icmp sge i64 %sub, 0
+ br i1 %cmp, label %if.then, label %if.end
+
+; CHECK: sub [[CMP:x[0-9]+]], x0, #12
+; CHECK: tbnz [[CMP]], #63
+
+if.then:
+ call void @t()
+ br label %if.end
+
+if.end:
+ ret void
+}
+
+define void @test7(i32 %a) {
+; CHECK-LABEL: @test7
+entry:
+ %sub = sub nsw i32 %a, 12
+ %cmp = icmp slt i32 %sub, 0
+ br i1 %cmp, label %if.then, label %if.end
+
+; CHECK: sub [[CMP:w[0-9]+]], w0, #12
+; CHECK: tbz [[CMP]], #31
+
+if.then:
+ call void @t()
+ br label %if.end
+
+if.end:
+ ret void
+}
+
+define void @test8(i64 %val1, i64 %val2, i64 %val3) {
+; CHECK-LABEL: @test8
+ %and1 = and i64 %val1, %val2
+ %tst1 = icmp slt i64 %and1, 0
+ br i1 %tst1, label %if.then1, label %if.end
+
+; CHECK: tst x0, x1
+; CHECK-NEXT: b.ge
+
+if.then1:
+ %and2 = and i64 %val2, %val3
+ %tst2 = icmp sge i64 %and2, 0
+ br i1 %tst2, label %if.then2, label %if.end
+
+; CHECK: and [[CMP:x[0-9]+]], x1, x2
+; CHECK-NOT: cmp
+; CHECK: tbnz [[CMP]], #63
+
+if.then2:
+ %shifted_op1 = shl i64 %val2, 63
+ %shifted_and1 = and i64 %val1, %shifted_op1
+ %tst3 = icmp slt i64 %shifted_and1, 0
+ br i1 %tst3, label %if.then3, label %if.end
+
+; CHECK: tst x0, x1, lsl #63
+; CHECK: b.ge
+
+if.then3:
+ %shifted_op2 = shl i64 %val2, 62
+ %shifted_and2 = and i64 %val1, %shifted_op2
+ %tst4 = icmp sge i64 %shifted_and2, 0
+ br i1 %tst4, label %if.then4, label %if.end
+
+; CHECK: tst x0, x1, lsl #62
+; CHECK: b.lt
+
+if.then4:
+ call void @t()
+ br label %if.end
+
+if.end:
+ ret void
+}
+
+define void @test9(i64 %val1) {
+; CHECK-LABEL: @test9
+ %tst = icmp slt i64 %val1, 0
+ br i1 %tst, label %if.then, label %if.end
+
+; CHECK-NOT: cmp
+; CHECK: tbz x0, #63
+
+if.then:
+ call void @t()
+ br label %if.end
+
+if.end:
+ ret void
+}
+
+define void @test10(i64 %val1) {
+; CHECK-LABEL: @test10
+ %tst = icmp slt i64 %val1, 0
+ br i1 %tst, label %if.then, label %if.end
+
+; CHECK-NOT: cmp
+; CHECK: tbz x0, #63
+
+if.then:
+ call void @t()
+ br label %if.end
+
+if.end:
+ ret void
+}
+
+define void @test11(i64 %val1, i64* %ptr) {
+; CHECK-LABEL: @test11
+
+; CHECK: ldr [[CMP:x[0-9]+]], [x1]
+; CHECK-NOT: cmp
+; CHECK: tbz [[CMP]], #63
+
+ %val = load i64* %ptr
+ %tst = icmp slt i64 %val, 0
+ br i1 %tst, label %if.then, label %if.end
+
+if.then:
+ call void @t()
+ br label %if.end
+
+if.end:
+ ret void
+}
+
+define void @test12(i64 %val1) {
+; CHECK-LABEL: @test12
+ %tst = icmp slt i64 %val1, 0
+ br i1 %tst, label %if.then, label %if.end
+
+; CHECK-NOT: cmp
+; CHECK: tbz x0, #63
+
+if.then:
+ call void @t()
+ br label %if.end
+
+if.end:
+ ret void
+}
+
+define void @test13(i64 %val1, i64 %val2) {
+; CHECK-LABEL: @test13
+ %or = or i64 %val1, %val2
+ %tst = icmp slt i64 %or, 0
+ br i1 %tst, label %if.then, label %if.end
+
+; CHECK: orr [[CMP:x[0-9]+]], x0, x1
+; CHECK-NOT: cmp
+; CHECK: tbz [[CMP]], #63
+
+if.then:
+ call void @t()
+ br label %if.end
+
+if.end:
+ ret void
+}
diff --git a/test/CodeGen/AArch64/trunc-v1i64.ll b/test/CodeGen/AArch64/trunc-v1i64.ll
index 159b8e0..19efd2f 100644
--- a/test/CodeGen/AArch64/trunc-v1i64.ll
+++ b/test/CodeGen/AArch64/trunc-v1i64.ll
@@ -60,4 +60,23 @@ define <8 x i8> @test_v1i8_1(<1 x i64> %in0) {
%1 = shufflevector <1 x i64> %in0, <1 x i64> undef, <8 x i32> <i32 undef, i32 undef, i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%2 = trunc <8 x i64> %1 to <8 x i8>
ret <8 x i8> %2
-} \ No newline at end of file
+}
+
+; PR20777: v1i1 is also problematic, but we can't widen it, so we extract_elt
+; the i64 out of the v1i64 operand, and truncate that scalar instead.
+
+define <1 x i1> @test_v1i1_0(<1 x i64> %in0) {
+; CHECK-LABEL: test_v1i1_0:
+; CHECK: fmov w0, s0
+ %1 = trunc <1 x i64> %in0 to <1 x i1>
+ ret <1 x i1> %1
+}
+
+define i1 @test_v1i1_1(<1 x i64> %in0) {
+; CHECK-LABEL: test_v1i1_1:
+; CHECK: fmov [[REG:w[0-9]+]], s0
+ %1 = trunc <1 x i64> %in0 to <1 x i1>
+; CHECK: and w0, [[REG]], #0x1
+ %2 = extractelement <1 x i1> %1, i32 0
+ ret i1 %2
+}