path: root/test/CodeGen/ARM
Diffstat (limited to 'test/CodeGen/ARM')
-rw-r--r--  test/CodeGen/ARM/2011-03-23-PeepholeBug.ll        |   2
-rw-r--r--  test/CodeGen/ARM/2012-03-13-DAGCombineBug.ll      |   3
-rw-r--r--  test/CodeGen/ARM/2012-03-26-FoldImmBug.ll         |  33
-rw-r--r--  test/CodeGen/ARM/2012-04-02-TwoAddrInstrCrash.ll  |  24
-rw-r--r--  test/CodeGen/ARM/2012-04-10-DAGCombine.ll         |  31
-rw-r--r--  test/CodeGen/ARM/call-tc.ll                       |  67
-rw-r--r--  test/CodeGen/ARM/commute-movcc.ll                 |  67
-rw-r--r--  test/CodeGen/ARM/dagcombine-anyexttozeroext.ll    |  30
-rw-r--r--  test/CodeGen/ARM/fast-isel-br-const.ll            |   3
-rw-r--r--  test/CodeGen/ARM/fast-isel.ll                     |   9
-rw-r--r--  test/CodeGen/ARM/fcopysign.ll                     |  20
-rw-r--r--  test/CodeGen/ARM/fusedMAC.ll                      |  87
-rw-r--r--  test/CodeGen/ARM/ldr_post.ll                      |  19
-rw-r--r--  test/CodeGen/ARM/ldr_pre.ll                       |  10
-rw-r--r--  test/CodeGen/ARM/ldrd.ll                          |   3
-rw-r--r--  test/CodeGen/ARM/lit.local.cfg                    |   9
-rw-r--r--  test/CodeGen/ARM/lsr-icmp-imm.ll                  |  33
-rw-r--r--  test/CodeGen/ARM/lsr-on-unrolled-loops.ll         | 640
-rw-r--r--  test/CodeGen/ARM/machine-cse-cmp.ll               |   2
-rw-r--r--  test/CodeGen/ARM/opt-shuff-tstore.ll              |  19
-rw-r--r--  test/CodeGen/ARM/reg_asc_order.ll                 |  16
-rw-r--r--  test/CodeGen/ARM/reg_sequence.ll                  |   2
-rw-r--r--  test/CodeGen/ARM/select-imm.ll                    |   2
-rw-r--r--  test/CodeGen/ARM/select.ll                        |  10
-rw-r--r--  test/CodeGen/ARM/tail-opts.ll                     |   2
-rw-r--r--  test/CodeGen/ARM/vdiv_combine.ll                  |  12
-rw-r--r--  test/CodeGen/ARM/vector-extend-narrow.ll          |  60
-rw-r--r--  test/CodeGen/ARM/vrev.ll                          |   6
-rw-r--r--  test/CodeGen/ARM/widen-vmovs.ll                   |   9
29 files changed, 528 insertions, 702 deletions
diff --git a/test/CodeGen/ARM/2011-03-23-PeepholeBug.ll b/test/CodeGen/ARM/2011-03-23-PeepholeBug.ll
index 7c9af6f..0fe88bd 100644
--- a/test/CodeGen/ARM/2011-03-23-PeepholeBug.ll
+++ b/test/CodeGen/ARM/2011-03-23-PeepholeBug.ll
@@ -26,7 +26,7 @@ bb2: ; preds = %bb1, %entry
; CHECK: bb2
; CHECK: subs [[REG:r[0-9]+]], #1
; CHECK: cmp [[REG]], #0
-; CHECK: bgt
+; CHECK: ble
%indvar = phi i32 [ %indvar.next, %bb1 ], [ 0, %entry ]
%tries.0 = sub i32 2147483647, %indvar
%tmp1 = icmp sgt i32 %tries.0, 0
diff --git a/test/CodeGen/ARM/2012-03-13-DAGCombineBug.ll b/test/CodeGen/ARM/2012-03-13-DAGCombineBug.ll
index 6d596df..6206cd7 100644
--- a/test/CodeGen/ARM/2012-03-13-DAGCombineBug.ll
+++ b/test/CodeGen/ARM/2012-03-13-DAGCombineBug.ll
@@ -6,8 +6,7 @@
; (i32 extload $addr+c*sizeof(i16))
define void @test_hi_short3(<3 x i16> * nocapture %srcA, <2 x i16> * nocapture %dst) nounwind {
entry:
-; CHECK: ldrh [[REG:r[0-9]+]]
-; CHECK: strh [[REG]]
+; CHECK: vst1.32
%0 = load <3 x i16> * %srcA, align 8
%1 = shufflevector <3 x i16> %0, <3 x i16> undef, <2 x i32> <i32 2, i32 undef>
store <2 x i16> %1, <2 x i16> * %dst, align 4
diff --git a/test/CodeGen/ARM/2012-03-26-FoldImmBug.ll b/test/CodeGen/ARM/2012-03-26-FoldImmBug.ll
new file mode 100644
index 0000000..0ff4f51
--- /dev/null
+++ b/test/CodeGen/ARM/2012-03-26-FoldImmBug.ll
@@ -0,0 +1,33 @@
+; RUN: llc < %s -mtriple=thumbv7-apple-ios -mcpu=cortex-a8 | FileCheck %s
+
+; ARM has a peephole optimization which looks for a def / use pair. The def
+; produces a 32-bit immediate which is consumed by the use. It tries to
+; fold the immediate by breaking it into two parts and folding them into the
+; immediate fields of two uses, e.g.
+; movw r2, #40885
+; movt r3, #46540
+; add r0, r0, r3
+; =>
+; add.w r0, r0, #3019898880
+; add.w r0, r0, #30146560
+;
+; However, this transformation is incorrect if the use sets the flags, e.g.
+; movw r2, #40885
+; movt r3, #46540
+; adds r0, r0, r3
+; =>
+; add.w r0, r0, #3019898880
+; adds.w r0, r0, #30146560
+; Note the adds.w may not set the carry flag even if the original sequence
+; would.
+;
+; rdar://11116189
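+;
+; As an illustration of the split (not checked below): the movt value
+; 0xB5CC0000 (46540 << 16) breaks into 0xB4000000 (0xB4 << 24) plus
+; 0x01CC0000 (0xE6 << 17), each encodable as a Thumb-2 modified immediate.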
+define i64 @t(i64 %aInput) nounwind {
+; CHECK: t:
+; CHECK: movs [[REG:(r[0-9]+)]], #0
+; CHECK: movt [[REG]], #46540
+; CHECK: adds r{{[0-9]+}}, r{{[0-9]+}}, [[REG]]
+ %1 = mul i64 %aInput, 1000000
+ %2 = add i64 %1, -7952618389194932224
+ ret i64 %2
+}
diff --git a/test/CodeGen/ARM/2012-04-02-TwoAddrInstrCrash.ll b/test/CodeGen/ARM/2012-04-02-TwoAddrInstrCrash.ll
new file mode 100644
index 0000000..33ad187
--- /dev/null
+++ b/test/CodeGen/ARM/2012-04-02-TwoAddrInstrCrash.ll
@@ -0,0 +1,24 @@
+; RUN: llc < %s
+; PR11861
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:64:128-a0:0:64-n32-S64"
+target triple = "armv7-none-linux-gnueabi"
+
+define arm_aapcs_vfpcc void @foo() nounwind align 2 {
+ br i1 undef, label %5, label %1
+
+; <label>:1 ; preds = %0
+ %2 = shufflevector <1 x i64> zeroinitializer, <1 x i64> undef, <2 x i32> <i32 0, i32 1>
+ %3 = bitcast <2 x i64> %2 to <4 x float>
+ store <4 x float> zeroinitializer, <4 x float>* undef, align 16, !tbaa !0
+ store <4 x float> zeroinitializer, <4 x float>* undef, align 16, !tbaa !0
+ store <4 x float> %3, <4 x float>* undef, align 16, !tbaa !0
+ %4 = insertelement <4 x float> %3, float 8.000000e+00, i32 2
+ store <4 x float> %4, <4 x float>* undef, align 16, !tbaa !0
+ unreachable
+
+; <label>:5 ; preds = %0
+ ret void
+}
+
+!0 = metadata !{metadata !"omnipotent char", metadata !1}
+!1 = metadata !{metadata !"Simple C/C++ TBAA", null}
diff --git a/test/CodeGen/ARM/2012-04-10-DAGCombine.ll b/test/CodeGen/ARM/2012-04-10-DAGCombine.ll
new file mode 100644
index 0000000..6f50f27
--- /dev/null
+++ b/test/CodeGen/ARM/2012-04-10-DAGCombine.ll
@@ -0,0 +1,31 @@
+; RUN: llc < %s -march=arm -mcpu=cortex-a9 -enable-unsafe-fp-math
+;target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:64:128-a0:0:64-n32-S64"
+;target triple = "armv7-none-linux-gnueabi"
+
+define arm_aapcs_vfpcc void @foo(<4 x float> %arg) nounwind align 2 {
+bb4:
+ %tmp = extractelement <2 x float> undef, i32 0
+ br i1 undef, label %bb18, label %bb5
+
+bb5: ; preds = %bb4
+ %tmp6 = fadd float %tmp, -1.500000e+01
+ %tmp7 = fdiv float %tmp6, 2.000000e+01
+ %tmp8 = fadd float %tmp7, 1.000000e+00
+ %tmp9 = fdiv float 1.000000e+00, %tmp8
+ %tmp10 = fsub float 1.000000e+00, %tmp9
+ %tmp11 = fmul float %tmp10, 1.000000e+01
+ %tmp12 = fadd float %tmp11, 1.500000e+01
+ %tmp13 = fdiv float %tmp12, %tmp
+ %tmp14 = insertelement <2 x float> undef, float %tmp13, i32 0
+ %tmp15 = shufflevector <2 x float> %tmp14, <2 x float> undef, <4 x i32> zeroinitializer
+ %tmp16 = fmul <4 x float> zeroinitializer, %tmp15
+ %tmp17 = fadd <4 x float> %tmp16, %arg
+ store <4 x float> %tmp17, <4 x float>* undef, align 8, !tbaa !0
+ br label %bb18
+
+bb18: ; preds = %bb5, %bb4
+ ret void
+}
+
+!0 = metadata !{metadata !"omnipotent char", metadata !1}
+!1 = metadata !{metadata !"Simple C/C++ TBAA", null}
diff --git a/test/CodeGen/ARM/call-tc.ll b/test/CodeGen/ARM/call-tc.ll
index be3e105..94edff5 100644
--- a/test/CodeGen/ARM/call-tc.ll
+++ b/test/CodeGen/ARM/call-tc.ll
@@ -96,3 +96,70 @@ bb:
tail call void @foo() nounwind
ret void
}
+
+; Make sure codegenprep is duplicating ret instructions to enable tail calls.
+; rdar://11140249
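+; If the shared ret is duplicated into each predecessor, every call becomes
+; a tail call (the b.w/bne.w branches checked below) and no frame push is
+; needed, which is what CHECKT2D-NOT: push verifies.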
+define i32 @t8(i32 %x) nounwind ssp {
+entry:
+; CHECKT2D: t8:
+; CHECKT2D-NOT: push
+ %and = and i32 %x, 1
+ %tobool = icmp eq i32 %and, 0
+ br i1 %tobool, label %if.end, label %if.then
+
+if.then: ; preds = %entry
+; CHECKT2D: bne.w _a
+ %call = tail call i32 @a(i32 %x) nounwind
+ br label %return
+
+if.end: ; preds = %entry
+ %and1 = and i32 %x, 2
+ %tobool2 = icmp eq i32 %and1, 0
+ br i1 %tobool2, label %if.end5, label %if.then3
+
+if.then3: ; preds = %if.end
+; CHECKT2D: bne.w _b
+ %call4 = tail call i32 @b(i32 %x) nounwind
+ br label %return
+
+if.end5: ; preds = %if.end
+; CHECKT2D: b.w _c
+ %call6 = tail call i32 @c(i32 %x) nounwind
+ br label %return
+
+return: ; preds = %if.end5, %if.then3, %if.then
+ %retval.0 = phi i32 [ %call, %if.then ], [ %call4, %if.then3 ], [ %call6, %if.end5 ]
+ ret i32 %retval.0
+}
+
+declare i32 @a(i32)
+
+declare i32 @b(i32)
+
+declare i32 @c(i32)
+
+; PR12419
+; rdar://11195178
+; Use the correct input chain for the tailcall node or else the call to
+; _ZN9MutexLockD1Ev would be lost.
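+; (The sdiv below expands to a ___divsi3 libcall; with the wrong input
+; chain, the destructor call that precedes it could be dropped as dead.)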
+%class.MutexLock = type { i8 }
+
+@x = external global i32, align 4
+
+define i32 @t9() nounwind {
+; CHECKT2D: t9:
+; CHECKT2D: blx __ZN9MutexLockC1Ev
+; CHECKT2D: blx __ZN9MutexLockD1Ev
+; CHECKT2D: b.w ___divsi3
+ %lock = alloca %class.MutexLock, align 1
+ %1 = call %class.MutexLock* @_ZN9MutexLockC1Ev(%class.MutexLock* %lock)
+ %2 = load i32* @x, align 4
+ %3 = sdiv i32 1000, %2
+ %4 = call %class.MutexLock* @_ZN9MutexLockD1Ev(%class.MutexLock* %lock)
+ ret i32 %3
+}
+
+declare %class.MutexLock* @_ZN9MutexLockC1Ev(%class.MutexLock*) unnamed_addr nounwind align 2
+
+declare %class.MutexLock* @_ZN9MutexLockD1Ev(%class.MutexLock*) unnamed_addr nounwind align 2
diff --git a/test/CodeGen/ARM/commute-movcc.ll b/test/CodeGen/ARM/commute-movcc.ll
new file mode 100644
index 0000000..7316452
--- /dev/null
+++ b/test/CodeGen/ARM/commute-movcc.ll
@@ -0,0 +1,67 @@
+; RUN: llc -mtriple=thumbv7-apple-ios -disable-code-place < %s | FileCheck %s
+; RUN: llc -mtriple=armv7-apple-ios -disable-code-place < %s | FileCheck %s
+
+; LLVM IR optimizers canonicalize icmp+select this way.
+; Make sure that TwoAddressInstructionPass can commute the corresponding
+; MOVCC instructions to avoid excessive copies in one of the if blocks.
+;
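+; A minimal sketch of the canonical pattern (illustrative only; the real
+; selects are in the function below):
+;   %c      = icmp ult i32 %new, %best
+;   %best.1 = select i1 %c, i32 %new, i32 %best
+;   %idx.1  = select i1 %c, i32 %i, i32 %idx
+; Both selects share one compare, so ISel emits a cmp plus two predicated
+; moves; commuting a MOVCC's tied operands avoids the extra copies.
+;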
+; CHECK: %if.then
+; CHECK-NOT: mov
+; CHECK: movlo
+; CHECK: movlo
+; CHECK-NOT: mov
+
+; CHECK: %if.else
+; CHECK-NOT: mov
+; CHECK: movls
+; CHECK: movls
+; CHECK-NOT: mov
+
+; This is really an LSR test: Make sure that cmp is using the incremented
+; induction variable.
+; CHECK: %if.end8
+; CHECK: add{{(s|\.w)?}} [[IV:r[0-9]+]], {{.*}}#1
+; CHECK: cmp [[IV]], #
+
+define i32 @f(i32* nocapture %a, i32 %Pref) nounwind ssp {
+entry:
+ br label %for.body
+
+for.body: ; preds = %entry, %if.end8
+ %i.012 = phi i32 [ 0, %entry ], [ %inc, %if.end8 ]
+ %BestCost.011 = phi i32 [ -1, %entry ], [ %BestCost.1, %if.end8 ]
+ %BestIdx.010 = phi i32 [ 0, %entry ], [ %BestIdx.1, %if.end8 ]
+ %arrayidx = getelementptr inbounds i32* %a, i32 %i.012
+ %0 = load i32* %arrayidx, align 4, !tbaa !0
+ %mul = mul i32 %0, %0
+ %sub = add nsw i32 %i.012, -5
+ %cmp2 = icmp eq i32 %sub, %Pref
+ br i1 %cmp2, label %if.else, label %if.then
+
+if.then: ; preds = %for.body
+ %cmp3 = icmp ult i32 %mul, %BestCost.011
+ %i.0.BestIdx.0 = select i1 %cmp3, i32 %i.012, i32 %BestIdx.010
+ %mul.BestCost.0 = select i1 %cmp3, i32 %mul, i32 %BestCost.011
+ br label %if.end8
+
+if.else: ; preds = %for.body
+ %cmp5 = icmp ugt i32 %mul, %BestCost.011
+ %BestIdx.0.i.0 = select i1 %cmp5, i32 %BestIdx.010, i32 %i.012
+ %BestCost.0.mul = select i1 %cmp5, i32 %BestCost.011, i32 %mul
+ br label %if.end8
+
+if.end8: ; preds = %if.else, %if.then
+ %BestIdx.1 = phi i32 [ %i.0.BestIdx.0, %if.then ], [ %BestIdx.0.i.0, %if.else ]
+ %BestCost.1 = phi i32 [ %mul.BestCost.0, %if.then ], [ %BestCost.0.mul, %if.else ]
+ store i32 %mul, i32* %arrayidx, align 4, !tbaa !0
+ %inc = add i32 %i.012, 1
+ %cmp = icmp eq i32 %inc, 11
+ br i1 %cmp, label %for.end, label %for.body
+
+for.end: ; preds = %if.end8
+ ret i32 %BestIdx.1
+}
+
+!0 = metadata !{metadata !"int", metadata !1}
+!1 = metadata !{metadata !"omnipotent char", metadata !2}
+!2 = metadata !{metadata !"Simple C/C++ TBAA", null}
diff --git a/test/CodeGen/ARM/dagcombine-anyexttozeroext.ll b/test/CodeGen/ARM/dagcombine-anyexttozeroext.ll
new file mode 100644
index 0000000..18f57ea
--- /dev/null
+++ b/test/CodeGen/ARM/dagcombine-anyexttozeroext.ll
@@ -0,0 +1,30 @@
+; RUN: llc -mtriple armv7 %s -o - | FileCheck %s
+
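+; The widened u16->u32 extension is done with vmovl.u16, which already
+; zeroes the high lanes, so the any_extend can be combined into a
+; zero_extend and no separate vand mask should survive.
+;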
+; CHECK: f:
+define float @f(<4 x i16>* nocapture %in) {
+ ; CHECK: vldr
+ ; CHECK: vmovl.u16
+ ; CHECK-NOT: vand
+ %1 = load <4 x i16>* %in
+ ; CHECK: vcvt.f32.u32
+ %2 = uitofp <4 x i16> %1 to <4 x float>
+ %3 = extractelement <4 x float> %2, i32 0
+ %4 = extractelement <4 x float> %2, i32 1
+ %5 = extractelement <4 x float> %2, i32 2
+
+ ; CHECK: vadd.f32
+ %6 = fadd float %3, %4
+ %7 = fadd float %6, %5
+
+ ret float %7
+}
+
+define float @g(<4 x i16>* nocapture %in) {
+ ; CHECK: vldr
+ %1 = load <4 x i16>* %in
+ ; CHECK-NOT: uxth
+ %2 = extractelement <4 x i16> %1, i32 0
+ ; CHECK: vcvt.f32.u32
+ %3 = uitofp i16 %2 to float
+ ret float %3
+}
diff --git a/test/CodeGen/ARM/fast-isel-br-const.ll b/test/CodeGen/ARM/fast-isel-br-const.ll
index 625adc2..7c532d5 100644
--- a/test/CodeGen/ARM/fast-isel-br-const.ll
+++ b/test/CodeGen/ARM/fast-isel-br-const.ll
@@ -5,7 +5,7 @@ define i32 @t1(i32 %a, i32 %b) nounwind uwtable ssp {
entry:
; THUMB: t1:
; ARM: t1:
-
+ %x = add i32 %a, %b
br i1 1, label %if.then, label %if.else
; THUMB-NOT: b LBB0_1
; ARM-NOT: b LBB0_1
@@ -24,6 +24,7 @@ if.then2: ; preds = %if.else
br label %if.end6
if.else3: ; preds = %if.else
+ %y = sub i32 %a, %b
br i1 1, label %if.then5, label %if.end
; THUMB-NOT: b LBB0_5
; ARM-NOT: b LBB0_5
diff --git a/test/CodeGen/ARM/fast-isel.ll b/test/CodeGen/ARM/fast-isel.ll
index 905543a..417e2d9 100644
--- a/test/CodeGen/ARM/fast-isel.ll
+++ b/test/CodeGen/ARM/fast-isel.ll
@@ -217,3 +217,12 @@ entry:
; THUMB: vcmpe.f32 s0, #0
ret i1 %4
}
+
+; ARM: @urem_fold
+; THUMB: @urem_fold
+; ARM: and r0, r0, #31
+; THUMB: and r0, r0, #31
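+; (For unsigned a, a urem 32 equals a & 31, so a single and suffices.)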
+define i32 @urem_fold(i32 %a) nounwind {
+ %rem = urem i32 %a, 32
+ ret i32 %rem
+}
diff --git a/test/CodeGen/ARM/fcopysign.ll b/test/CodeGen/ARM/fcopysign.ll
index 87115cc..27fa2b0 100644
--- a/test/CodeGen/ARM/fcopysign.ll
+++ b/test/CodeGen/ARM/fcopysign.ll
@@ -40,26 +40,10 @@ entry:
ret double %1
}
-; rdar://9059537
-define i32 @test4() ssp {
-entry:
-; SOFT: test4:
-; SOFT: vmov.f64 [[REG4:(d[0-9]+)]], #1.000000e+00
-; This S-reg must be the first sub-reg of the last D-reg on vbsl.
-; SOFT: vcvt.f32.f64 {{s1?[02468]}}, [[REG4]]
-; SOFT: vshr.u64 [[REG4]], [[REG4]], #32
-; SOFT: vmov.i32 [[REG5:(d[0-9]+)]], #0x80000000
-; SOFT: vbsl [[REG5]], [[REG4]], {{d[0-9]+}}
- %call80 = tail call double @copysign(double 1.000000e+00, double undef)
- %conv81 = fptrunc double %call80 to float
- %tmp88 = bitcast float %conv81 to i32
- ret i32 %tmp88
-}
-
; rdar://9287902
-define float @test5() nounwind {
+define float @test4() nounwind {
entry:
-; SOFT: test5:
+; SOFT: test4:
; SOFT: vmov [[REG7:(d[0-9]+)]], r0, r1
; SOFT: vmov.i32 [[REG6:(d[0-9]+)]], #0x80000000
; SOFT: vshr.u64 [[REG7]], [[REG7]], #32
diff --git a/test/CodeGen/ARM/fusedMAC.ll b/test/CodeGen/ARM/fusedMAC.ll
index 40e8bb2..802d1b8 100644
--- a/test/CodeGen/ARM/fusedMAC.ll
+++ b/test/CodeGen/ARM/fusedMAC.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm -mattr=+neon,+vfp4 | FileCheck %s
+; RUN: llc < %s -mtriple=armv7-eabi -mattr=+neon,+vfp4 | FileCheck %s
; Check generated fused MAC and MLS.
define double @fusedMACTest1(double %d1, double %d2, double %d3) {
@@ -98,3 +98,88 @@ define <4 x float> @fusedMACTest12(<4 x float> %a, <4 x float> %b) {
%sub = fsub <4 x float> %a, %mul
ret <4 x float> %sub
}
+
+define float @test_fma_f32(float %a, float %b, float %c) nounwind readnone ssp {
+entry:
+; CHECK: test_fma_f32
+; CHECK: vfma.f32
+ %tmp1 = tail call float @llvm.fma.f32(float %a, float %b, float %c) nounwind readnone
+ ret float %tmp1
+}
+
+define double @test_fma_f64(double %a, double %b, double %c) nounwind readnone ssp {
+entry:
+; CHECK: test_fma_f64
+; CHECK: vfma.f64
+ %tmp1 = tail call double @llvm.fma.f64(double %a, double %b, double %c) nounwind readnone
+ ret double %tmp1
+}
+
+define <2 x float> @test_fma_v2f32(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone ssp {
+entry:
+; CHECK: test_fma_v2f32
+; CHECK: vfma.f32
+ %tmp1 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind
+ ret <2 x float> %tmp1
+}
+
+define double @test_fms_f64(double %a, double %b, double %c) nounwind readnone ssp {
+entry:
+; CHECK: test_fms_f64
+; CHECK: vfms.f64
+ %tmp1 = fsub double -0.0, %a
+ %tmp2 = tail call double @llvm.fma.f64(double %tmp1, double %b, double %c) nounwind readnone
+ ret double %tmp2
+}
+
+define double @test_fms_f64_2(double %a, double %b, double %c) nounwind readnone ssp {
+entry:
+; CHECK: test_fms_f64_2
+; CHECK: vfms.f64
+ %tmp1 = fsub double -0.0, %b
+ %tmp2 = tail call double @llvm.fma.f64(double %a, double %tmp1, double %c) nounwind readnone
+ %tmp3 = fsub double -0.0, %tmp2
+ ret double %tmp3
+}
+
+define double @test_fnms_f64(double %a, double %b, double %c) nounwind readnone ssp {
+entry:
+; CHECK: test_fnms_f64
+; CHECK: vfnms.f64
+ %tmp1 = fsub double -0.0, %a
+ %tmp2 = tail call double @llvm.fma.f64(double %tmp1, double %b, double %c) nounwind readnone
+ %tmp3 = fsub double -0.0, %tmp2
+ ret double %tmp3
+}
+
+define double @test_fnms_f64_2(double %a, double %b, double %c) nounwind readnone ssp {
+entry:
+; CHECK: test_fnms_f64_2
+; CHECK: vfnms.f64
+ %tmp1 = fsub double -0.0, %b
+ %tmp2 = tail call double @llvm.fma.f64(double %a, double %tmp1, double %c) nounwind readnone
+ ret double %tmp2
+}
+
+define double @test_fnma_f64(double %a, double %b, double %c) nounwind readnone ssp {
+entry:
+; CHECK: test_fnma_f64
+; CHECK: vfnma.f64
+ %tmp1 = tail call double @llvm.fma.f64(double %a, double %b, double %c) nounwind readnone
+ %tmp2 = fsub double -0.0, %tmp1
+ ret double %tmp2
+}
+
+define double @test_fnma_f64_2(double %a, double %b, double %c) nounwind readnone ssp {
+entry:
+; CHECK: test_fnma_f64_2
+; CHECK: vfnma.f64
+ %tmp1 = fsub double -0.0, %a
+ %tmp2 = fsub double -0.0, %c
+ %tmp3 = tail call double @llvm.fma.f64(double %tmp1, double %b, double %tmp2) nounwind readnone
+ ret double %tmp3
+}
+
+declare float @llvm.fma.f32(float, float, float) nounwind readnone
+declare double @llvm.fma.f64(double, double, double) nounwind readnone
+declare <2 x float> @llvm.fma.v2f32(<2 x float>, <2 x float>, <2 x float>) nounwind readnone
diff --git a/test/CodeGen/ARM/ldr_post.ll b/test/CodeGen/ARM/ldr_post.ll
index 97a48e1..8ddf025 100644
--- a/test/CodeGen/ARM/ldr_post.ll
+++ b/test/CodeGen/ARM/ldr_post.ll
@@ -1,7 +1,9 @@
-; RUN: llc < %s -march=arm | \
-; RUN: grep {ldr.*\\\[.*\],} | count 1
+; RUN: llc < %s -march=arm | FileCheck %s
-define i32 @test(i32 %a, i32 %b, i32 %c) {
+; CHECK: test1:
+; CHECK: ldr {{.*, \[.*]}}, -r2
+; CHECK-NOT: ldr
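+; (Post-indexed addressing loads through the base register and then writes
+; base minus offset back to it, so the separate sub folds into the ldr.)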
+define i32 @test1(i32 %a, i32 %b, i32 %c) {
%tmp1 = mul i32 %a, %b ; <i32> [#uses=2]
%tmp2 = inttoptr i32 %tmp1 to i32* ; <i32*> [#uses=1]
%tmp3 = load i32* %tmp2 ; <i32> [#uses=1]
@@ -10,3 +12,14 @@ define i32 @test(i32 %a, i32 %b, i32 %c) {
ret i32 %tmp5
}
+; CHECK: test2:
+; CHECK: ldr {{.*, \[.*\]}}, #-16
+; CHECK-NOT: ldr
+define i32 @test2(i32 %a, i32 %b) {
+ %tmp1 = mul i32 %a, %b ; <i32> [#uses=2]
+ %tmp2 = inttoptr i32 %tmp1 to i32* ; <i32*> [#uses=1]
+ %tmp3 = load i32* %tmp2 ; <i32> [#uses=1]
+ %tmp4 = sub i32 %tmp1, 16 ; <i32> [#uses=1]
+ %tmp5 = mul i32 %tmp4, %tmp3 ; <i32> [#uses=1]
+ ret i32 %tmp5
+}
diff --git a/test/CodeGen/ARM/ldr_pre.ll b/test/CodeGen/ARM/ldr_pre.ll
index 7c44284..e904e5f 100644
--- a/test/CodeGen/ARM/ldr_pre.ll
+++ b/test/CodeGen/ARM/ldr_pre.ll
@@ -1,6 +1,8 @@
-; RUN: llc < %s -march=arm | \
-; RUN: grep {ldr.*\\!} | count 2
+; RUN: llc < %s -march=arm | FileCheck %s
+; CHECK: test1:
+; CHECK: ldr {{.*!}}
+; CHECK-NOT: ldr
define i32* @test1(i32* %X, i32* %dest) {
%Y = getelementptr i32* %X, i32 4 ; <i32*> [#uses=2]
%A = load i32* %Y ; <i32> [#uses=1]
@@ -8,6 +10,9 @@ define i32* @test1(i32* %X, i32* %dest) {
ret i32* %Y
}
+; CHECK: test2:
+; CHECK: ldr {{.*!}}
+; CHECK-NOT: ldr
define i32 @test2(i32 %a, i32 %b, i32 %c) {
%tmp1 = sub i32 %a, %b ; <i32> [#uses=2]
%tmp2 = inttoptr i32 %tmp1 to i32* ; <i32*> [#uses=1]
@@ -16,4 +21,3 @@ define i32 @test2(i32 %a, i32 %b, i32 %c) {
%tmp5 = add i32 %tmp4, %tmp3 ; <i32> [#uses=1]
ret i32 %tmp5
}
-
diff --git a/test/CodeGen/ARM/ldrd.ll b/test/CodeGen/ARM/ldrd.ll
index a588bc3..3f8fd75 100644
--- a/test/CodeGen/ARM/ldrd.ll
+++ b/test/CodeGen/ARM/ldrd.ll
@@ -44,8 +44,7 @@ entry:
; BASIC: str
; GREEDY: @f
; GREEDY: %bb
-; GREEDY: ldr
-; GREEDY: ldr
+; GREEDY: ldrd
; GREEDY: str
define void @f(i32* nocapture %a, i32* nocapture %b, i32 %n) nounwind {
entry:
diff --git a/test/CodeGen/ARM/lit.local.cfg b/test/CodeGen/ARM/lit.local.cfg
index dd6c50d..cb77b09 100644
--- a/test/CodeGen/ARM/lit.local.cfg
+++ b/test/CodeGen/ARM/lit.local.cfg
@@ -1,13 +1,6 @@
config.suffixes = ['.ll', '.c', '.cpp']
-def getRoot(config):
- if not config.parent:
- return config
- return getRoot(config.parent)
-
-root = getRoot(config)
-
-targets = set(root.targets_to_build.split())
+targets = set(config.root.targets_to_build.split())
if not 'ARM' in targets:
config.unsupported = True
diff --git a/test/CodeGen/ARM/lsr-icmp-imm.ll b/test/CodeGen/ARM/lsr-icmp-imm.ll
new file mode 100644
index 0000000..5283f57
--- /dev/null
+++ b/test/CodeGen/ARM/lsr-icmp-imm.ll
@@ -0,0 +1,33 @@
+; RUN: llc -mtriple=thumbv7-apple-ios -disable-code-place < %s | FileCheck %s
+; RUN: llc -mtriple=armv7-apple-ios -disable-code-place < %s | FileCheck %s
+
+; LSR should compare against the post-incremented induction variable.
+; In this case, the immediate value is -2, which requires a cmn instruction.
+;
+; CHECK: f:
+; CHECK: %for.body
+; CHECK: sub{{.*}}[[IV:r[0-9]+]], #2
+; CHECK: cmn{{.*}}[[IV]], #2
+; CHECK: bne
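+; (cmn rN, #2 sets flags for rN + 2, i.e. a compare against -2, which cmp
+; cannot encode as an immediate.)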
+define i32 @f(i32* nocapture %a, i32 %i) nounwind readonly ssp {
+entry:
+ %cmp3 = icmp eq i32 %i, -2
+ br i1 %cmp3, label %for.end, label %for.body
+
+for.body: ; preds = %entry, %for.body
+ %bi.06 = phi i32 [ %i.addr.0.bi.0, %for.body ], [ 0, %entry ]
+ %i.addr.05 = phi i32 [ %sub, %for.body ], [ %i, %entry ]
+ %b.04 = phi i32 [ %.b.0, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds i32* %a, i32 %i.addr.05
+ %0 = load i32* %arrayidx, align 4
+ %cmp1 = icmp sgt i32 %0, %b.04
+ %.b.0 = select i1 %cmp1, i32 %0, i32 %b.04
+ %i.addr.0.bi.0 = select i1 %cmp1, i32 %i.addr.05, i32 %bi.06
+ %sub = add nsw i32 %i.addr.05, -2
+ %cmp = icmp eq i32 %i.addr.05, 0
+ br i1 %cmp, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ %bi.0.lcssa = phi i32 [ 0, %entry ], [ %i.addr.0.bi.0, %for.body ]
+ ret i32 %bi.0.lcssa
+}
diff --git a/test/CodeGen/ARM/lsr-on-unrolled-loops.ll b/test/CodeGen/ARM/lsr-on-unrolled-loops.ll
deleted file mode 100644
index ea5ae8f..0000000
--- a/test/CodeGen/ARM/lsr-on-unrolled-loops.ll
+++ /dev/null
@@ -1,640 +0,0 @@
-; RUN: llc -mtriple=thumbv7-apple-darwin10 -mcpu=cortex-a8 -enable-lsr-nested < %s | FileCheck %s
-
-; LSR should recognize that this is an unrolled loop which can use
-; constant offset addressing, so that each of the following stores
-; uses the same register.
-
-; CHECK: vstr s{{.*}}, [{{(r[0-9]+)|(lr)}}, #32]
-; CHECK: vstr s{{.*}}, [{{(r[0-9]+)|(lr)}}, #64]
-; CHECK: vstr s{{.*}}, [{{(r[0-9]+)|(lr)}}, #96]
-
-; We can also save a register in the outer loop, but that requires
-; performing LSR on the outer loop.
-
-target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:64:64-v128:128:128-a0:0:32-n32"
-
-%0 = type { %1*, %3*, %6*, i8*, i32, i32, %8*, i32, i32, i32, i32, i32, i32, i32, double, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i8**, i32, i32, i32, i32, i32, [64 x i32]*, [4 x %9*], [4 x %10*], [4 x %10*], i32, %11*, i32, i32, [16 x i8], [16 x i8], [16 x i8], i32, i32, i8, i8, i8, i16, i16, i32, i8, i32, %12*, i32, i32, i32, i32, i8*, i32, [4 x %11*], i32, i32, i32, [10 x i32], i32, i32, i32, i32, i32, %13*, %14*, %15*, %16*, %17*, %18*, %19*, %20*, %21*, %22*, %23* }
-%1 = type { void (%2*)*, void (%2*, i32)*, void (%2*)*, void (%2*, i8*)*, void (%2*)*, i32, %7, i32, i32, i8**, i32, i8**, i32, i32 }
-%2 = type { %1*, %3*, %6*, i8*, i32, i32 }
-%3 = type { i8* (%2*, i32, i32)*, i8* (%2*, i32, i32)*, i8** (%2*, i32, i32, i32)*, [64 x i16]** (%2*, i32, i32, i32)*, %4* (%2*, i32, i32, i32, i32, i32)*, %5* (%2*, i32, i32, i32, i32, i32)*, void (%2*)*, i8** (%2*, %4*, i32, i32, i32)*, [64 x i16]** (%2*, %5*, i32, i32, i32)*, void (%2*, i32)*, void (%2*)*, i32, i32 }
-%4 = type opaque
-%5 = type opaque
-%6 = type { void (%2*)*, i32, i32, i32, i32 }
-%7 = type { [8 x i32], [12 x i32] }
-%8 = type { i8*, i32, void (%0*)*, i32 (%0*)*, void (%0*, i32)*, i32 (%0*, i32)*, void (%0*)* }
-%9 = type { [64 x i16], i32 }
-%10 = type { [17 x i8], [256 x i8], i32 }
-%11 = type { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, %9*, i8* }
-%12 = type { %12*, i8, i32, i32, i8* }
-%13 = type { void (%0*)*, void (%0*)*, i32 }
-%14 = type { void (%0*, i32)*, void (%0*, i8**, i32*, i32)* }
-%15 = type { void (%0*)*, i32 (%0*)*, void (%0*)*, i32 (%0*, i8***)*, %5** }
-%16 = type { void (%0*, i32)*, void (%0*, i8***, i32*, i32, i8**, i32*, i32)* }
-%17 = type { i32 (%0*)*, void (%0*)*, void (%0*)*, void (%0*)*, i32, i32 }
-%18 = type { void (%0*)*, i32 (%0*)*, i32 (%0*)*, i32, i32, i32, i32 }
-%19 = type { void (%0*)*, i32 (%0*, [64 x i16]**)*, i32 }
-%20 = type { void (%0*)*, [10 x void (%0*, %11*, i16*, i8**, i32)*] }
-%21 = type { void (%0*)*, void (%0*, i8***, i32*, i32, i8**, i32*, i32)*, i32 }
-%22 = type { void (%0*)*, void (%0*, i8***, i32, i8**, i32)* }
-%23 = type { void (%0*, i32)*, void (%0*, i8**, i8**, i32)*, void (%0*)*, void (%0*)* }
-
-define void @test(%0* nocapture %a0, %11* nocapture %a1, i16* nocapture %a2, i8** nocapture %a3, i32 %a4) nounwind {
-bb:
- %t = alloca [64 x float], align 4
- %t5 = getelementptr inbounds %0* %a0, i32 0, i32 65
- %t6 = load i8** %t5, align 4
- %t7 = getelementptr inbounds %11* %a1, i32 0, i32 20
- %t8 = load i8** %t7, align 4
- br label %bb9
-
-bb9:
- %t10 = phi i32 [ 0, %bb ], [ %t157, %bb156 ]
- %t11 = add i32 %t10, 8
- %t12 = getelementptr [64 x float]* %t, i32 0, i32 %t11
- %t13 = add i32 %t10, 16
- %t14 = getelementptr [64 x float]* %t, i32 0, i32 %t13
- %t15 = add i32 %t10, 24
- %t16 = getelementptr [64 x float]* %t, i32 0, i32 %t15
- %t17 = add i32 %t10, 32
- %t18 = getelementptr [64 x float]* %t, i32 0, i32 %t17
- %t19 = add i32 %t10, 40
- %t20 = getelementptr [64 x float]* %t, i32 0, i32 %t19
- %t21 = add i32 %t10, 48
- %t22 = getelementptr [64 x float]* %t, i32 0, i32 %t21
- %t23 = add i32 %t10, 56
- %t24 = getelementptr [64 x float]* %t, i32 0, i32 %t23
- %t25 = getelementptr [64 x float]* %t, i32 0, i32 %t10
- %t26 = shl i32 %t10, 5
- %t27 = or i32 %t26, 8
- %t28 = getelementptr i8* %t8, i32 %t27
- %t29 = bitcast i8* %t28 to float*
- %t30 = or i32 %t26, 16
- %t31 = getelementptr i8* %t8, i32 %t30
- %t32 = bitcast i8* %t31 to float*
- %t33 = or i32 %t26, 24
- %t34 = getelementptr i8* %t8, i32 %t33
- %t35 = bitcast i8* %t34 to float*
- %t36 = or i32 %t26, 4
- %t37 = getelementptr i8* %t8, i32 %t36
- %t38 = bitcast i8* %t37 to float*
- %t39 = or i32 %t26, 12
- %t40 = getelementptr i8* %t8, i32 %t39
- %t41 = bitcast i8* %t40 to float*
- %t42 = or i32 %t26, 20
- %t43 = getelementptr i8* %t8, i32 %t42
- %t44 = bitcast i8* %t43 to float*
- %t45 = or i32 %t26, 28
- %t46 = getelementptr i8* %t8, i32 %t45
- %t47 = bitcast i8* %t46 to float*
- %t48 = getelementptr i8* %t8, i32 %t26
- %t49 = bitcast i8* %t48 to float*
- %t50 = shl i32 %t10, 3
- %t51 = or i32 %t50, 1
- %t52 = getelementptr i16* %a2, i32 %t51
- %t53 = or i32 %t50, 2
- %t54 = getelementptr i16* %a2, i32 %t53
- %t55 = or i32 %t50, 3
- %t56 = getelementptr i16* %a2, i32 %t55
- %t57 = or i32 %t50, 4
- %t58 = getelementptr i16* %a2, i32 %t57
- %t59 = or i32 %t50, 5
- %t60 = getelementptr i16* %a2, i32 %t59
- %t61 = or i32 %t50, 6
- %t62 = getelementptr i16* %a2, i32 %t61
- %t63 = or i32 %t50, 7
- %t64 = getelementptr i16* %a2, i32 %t63
- %t65 = getelementptr i16* %a2, i32 %t50
- %t66 = load i16* %t52, align 2
- %t67 = icmp eq i16 %t66, 0
- %t68 = load i16* %t54, align 2
- %t69 = icmp eq i16 %t68, 0
- %t70 = and i1 %t67, %t69
- br i1 %t70, label %bb71, label %bb91
-
-bb71:
- %t72 = load i16* %t56, align 2
- %t73 = icmp eq i16 %t72, 0
- br i1 %t73, label %bb74, label %bb91
-
-bb74:
- %t75 = load i16* %t58, align 2
- %t76 = icmp eq i16 %t75, 0
- br i1 %t76, label %bb77, label %bb91
-
-bb77:
- %t78 = load i16* %t60, align 2
- %t79 = icmp eq i16 %t78, 0
- br i1 %t79, label %bb80, label %bb91
-
-bb80:
- %t81 = load i16* %t62, align 2
- %t82 = icmp eq i16 %t81, 0
- br i1 %t82, label %bb83, label %bb91
-
-bb83:
- %t84 = load i16* %t64, align 2
- %t85 = icmp eq i16 %t84, 0
- br i1 %t85, label %bb86, label %bb91
-
-bb86:
- %t87 = load i16* %t65, align 2
- %t88 = sitofp i16 %t87 to float
- %t89 = load float* %t49, align 4
- %t90 = fmul float %t88, %t89
- store float %t90, float* %t25, align 4
- store float %t90, float* %t12, align 4
- store float %t90, float* %t14, align 4
- store float %t90, float* %t16, align 4
- store float %t90, float* %t18, align 4
- store float %t90, float* %t20, align 4
- store float %t90, float* %t22, align 4
- store float %t90, float* %t24, align 4
- br label %bb156
-
-bb91:
- %t92 = load i16* %t65, align 2
- %t93 = sitofp i16 %t92 to float
- %t94 = load float* %t49, align 4
- %t95 = fmul float %t93, %t94
- %t96 = sitofp i16 %t68 to float
- %t97 = load float* %t29, align 4
- %t98 = fmul float %t96, %t97
- %t99 = load i16* %t58, align 2
- %t100 = sitofp i16 %t99 to float
- %t101 = load float* %t32, align 4
- %t102 = fmul float %t100, %t101
- %t103 = load i16* %t62, align 2
- %t104 = sitofp i16 %t103 to float
- %t105 = load float* %t35, align 4
- %t106 = fmul float %t104, %t105
- %t107 = fadd float %t95, %t102
- %t108 = fsub float %t95, %t102
- %t109 = fadd float %t98, %t106
- %t110 = fsub float %t98, %t106
- %t111 = fmul float %t110, 0x3FF6A09E60000000
- %t112 = fsub float %t111, %t109
- %t113 = fadd float %t107, %t109
- %t114 = fsub float %t107, %t109
- %t115 = fadd float %t108, %t112
- %t116 = fsub float %t108, %t112
- %t117 = sitofp i16 %t66 to float
- %t118 = load float* %t38, align 4
- %t119 = fmul float %t117, %t118
- %t120 = load i16* %t56, align 2
- %t121 = sitofp i16 %t120 to float
- %t122 = load float* %t41, align 4
- %t123 = fmul float %t121, %t122
- %t124 = load i16* %t60, align 2
- %t125 = sitofp i16 %t124 to float
- %t126 = load float* %t44, align 4
- %t127 = fmul float %t125, %t126
- %t128 = load i16* %t64, align 2
- %t129 = sitofp i16 %t128 to float
- %t130 = load float* %t47, align 4
- %t131 = fmul float %t129, %t130
- %t132 = fadd float %t127, %t123
- %t133 = fsub float %t127, %t123
- %t134 = fadd float %t119, %t131
- %t135 = fsub float %t119, %t131
- %t136 = fadd float %t134, %t132
- %t137 = fsub float %t134, %t132
- %t138 = fmul float %t137, 0x3FF6A09E60000000
- %t139 = fadd float %t133, %t135
- %t140 = fmul float %t139, 0x3FFD906BC0000000
- %t141 = fmul float %t135, 0x3FF1517A80000000
- %t142 = fsub float %t141, %t140
- %t143 = fmul float %t133, 0xC004E7AEA0000000
- %t144 = fadd float %t143, %t140
- %t145 = fsub float %t144, %t136
- %t146 = fsub float %t138, %t145
- %t147 = fadd float %t142, %t146
- %t148 = fadd float %t113, %t136
- store float %t148, float* %t25, align 4
- %t149 = fsub float %t113, %t136
- store float %t149, float* %t24, align 4
- %t150 = fadd float %t115, %t145
- store float %t150, float* %t12, align 4
- %t151 = fsub float %t115, %t145
- store float %t151, float* %t22, align 4
- %t152 = fadd float %t116, %t146
- store float %t152, float* %t14, align 4
- %t153 = fsub float %t116, %t146
- store float %t153, float* %t20, align 4
- %t154 = fadd float %t114, %t147
- store float %t154, float* %t18, align 4
- %t155 = fsub float %t114, %t147
- store float %t155, float* %t16, align 4
- br label %bb156
-
-bb156:
- %t157 = add i32 %t10, 1
- %t158 = icmp eq i32 %t157, 8
- br i1 %t158, label %bb159, label %bb9
-
-bb159:
- %t160 = add i32 %a4, 7
- %t161 = add i32 %a4, 1
- %t162 = add i32 %a4, 6
- %t163 = add i32 %a4, 2
- %t164 = add i32 %a4, 5
- %t165 = add i32 %a4, 4
- %t166 = add i32 %a4, 3
- br label %bb167
-
-bb167:
- %t168 = phi i32 [ 0, %bb159 ], [ %t293, %bb167 ]
- %t169 = getelementptr i8** %a3, i32 %t168
- %t170 = shl i32 %t168, 3
- %t171 = or i32 %t170, 4
- %t172 = getelementptr [64 x float]* %t, i32 0, i32 %t171
- %t173 = or i32 %t170, 2
- %t174 = getelementptr [64 x float]* %t, i32 0, i32 %t173
- %t175 = or i32 %t170, 6
- %t176 = getelementptr [64 x float]* %t, i32 0, i32 %t175
- %t177 = or i32 %t170, 5
- %t178 = getelementptr [64 x float]* %t, i32 0, i32 %t177
- %t179 = or i32 %t170, 3
- %t180 = getelementptr [64 x float]* %t, i32 0, i32 %t179
- %t181 = or i32 %t170, 1
- %t182 = getelementptr [64 x float]* %t, i32 0, i32 %t181
- %t183 = or i32 %t170, 7
- %t184 = getelementptr [64 x float]* %t, i32 0, i32 %t183
- %t185 = getelementptr [64 x float]* %t, i32 0, i32 %t170
- %t186 = load i8** %t169, align 4
- %t187 = getelementptr inbounds i8* %t186, i32 %a4
- %t188 = load float* %t185, align 4
- %t189 = load float* %t172, align 4
- %t190 = fadd float %t188, %t189
- %t191 = fsub float %t188, %t189
- %t192 = load float* %t174, align 4
- %t193 = load float* %t176, align 4
- %t194 = fadd float %t192, %t193
- %t195 = fsub float %t192, %t193
- %t196 = fmul float %t195, 0x3FF6A09E60000000
- %t197 = fsub float %t196, %t194
- %t198 = fadd float %t190, %t194
- %t199 = fsub float %t190, %t194
- %t200 = fadd float %t191, %t197
- %t201 = fsub float %t191, %t197
- %t202 = load float* %t178, align 4
- %t203 = load float* %t180, align 4
- %t204 = fadd float %t202, %t203
- %t205 = fsub float %t202, %t203
- %t206 = load float* %t182, align 4
- %t207 = load float* %t184, align 4
- %t208 = fadd float %t206, %t207
- %t209 = fsub float %t206, %t207
- %t210 = fadd float %t208, %t204
- %t211 = fsub float %t208, %t204
- %t212 = fmul float %t211, 0x3FF6A09E60000000
- %t213 = fadd float %t205, %t209
- %t214 = fmul float %t213, 0x3FFD906BC0000000
- %t215 = fmul float %t209, 0x3FF1517A80000000
- %t216 = fsub float %t215, %t214
- %t217 = fmul float %t205, 0xC004E7AEA0000000
- %t218 = fadd float %t217, %t214
- %t219 = fsub float %t218, %t210
- %t220 = fsub float %t212, %t219
- %t221 = fadd float %t216, %t220
- %t222 = fadd float %t198, %t210
- %t223 = fptosi float %t222 to i32
- %t224 = add nsw i32 %t223, 4
- %t225 = lshr i32 %t224, 3
- %t226 = and i32 %t225, 1023
- %t227 = add i32 %t226, 128
- %t228 = getelementptr inbounds i8* %t6, i32 %t227
- %t229 = load i8* %t228, align 1
- store i8 %t229, i8* %t187, align 1
- %t230 = fsub float %t198, %t210
- %t231 = fptosi float %t230 to i32
- %t232 = add nsw i32 %t231, 4
- %t233 = lshr i32 %t232, 3
- %t234 = and i32 %t233, 1023
- %t235 = add i32 %t234, 128
- %t236 = getelementptr inbounds i8* %t6, i32 %t235
- %t237 = load i8* %t236, align 1
- %t238 = getelementptr inbounds i8* %t186, i32 %t160
- store i8 %t237, i8* %t238, align 1
- %t239 = fadd float %t200, %t219
- %t240 = fptosi float %t239 to i32
- %t241 = add nsw i32 %t240, 4
- %t242 = lshr i32 %t241, 3
- %t243 = and i32 %t242, 1023
- %t244 = add i32 %t243, 128
- %t245 = getelementptr inbounds i8* %t6, i32 %t244
- %t246 = load i8* %t245, align 1
- %t247 = getelementptr inbounds i8* %t186, i32 %t161
- store i8 %t246, i8* %t247, align 1
- %t248 = fsub float %t200, %t219
- %t249 = fptosi float %t248 to i32
- %t250 = add nsw i32 %t249, 4
- %t251 = lshr i32 %t250, 3
- %t252 = and i32 %t251, 1023
- %t253 = add i32 %t252, 128
- %t254 = getelementptr inbounds i8* %t6, i32 %t253
- %t255 = load i8* %t254, align 1
- %t256 = getelementptr inbounds i8* %t186, i32 %t162
- store i8 %t255, i8* %t256, align 1
- %t257 = fadd float %t201, %t220
- %t258 = fptosi float %t257 to i32
- %t259 = add nsw i32 %t258, 4
- %t260 = lshr i32 %t259, 3
- %t261 = and i32 %t260, 1023
- %t262 = add i32 %t261, 128
- %t263 = getelementptr inbounds i8* %t6, i32 %t262
- %t264 = load i8* %t263, align 1
- %t265 = getelementptr inbounds i8* %t186, i32 %t163
- store i8 %t264, i8* %t265, align 1
- %t266 = fsub float %t201, %t220
- %t267 = fptosi float %t266 to i32
- %t268 = add nsw i32 %t267, 4
- %t269 = lshr i32 %t268, 3
- %t270 = and i32 %t269, 1023
- %t271 = add i32 %t270, 128
- %t272 = getelementptr inbounds i8* %t6, i32 %t271
- %t273 = load i8* %t272, align 1
- %t274 = getelementptr inbounds i8* %t186, i32 %t164
- store i8 %t273, i8* %t274, align 1
- %t275 = fadd float %t199, %t221
- %t276 = fptosi float %t275 to i32
- %t277 = add nsw i32 %t276, 4
- %t278 = lshr i32 %t277, 3
- %t279 = and i32 %t278, 1023
- %t280 = add i32 %t279, 128
- %t281 = getelementptr inbounds i8* %t6, i32 %t280
- %t282 = load i8* %t281, align 1
- %t283 = getelementptr inbounds i8* %t186, i32 %t165
- store i8 %t282, i8* %t283, align 1
- %t284 = fsub float %t199, %t221
- %t285 = fptosi float %t284 to i32
- %t286 = add nsw i32 %t285, 4
- %t287 = lshr i32 %t286, 3
- %t288 = and i32 %t287, 1023
- %t289 = add i32 %t288, 128
- %t290 = getelementptr inbounds i8* %t6, i32 %t289
- %t291 = load i8* %t290, align 1
- %t292 = getelementptr inbounds i8* %t186, i32 %t166
- store i8 %t291, i8* %t292, align 1
- %t293 = add nsw i32 %t168, 1
- %t294 = icmp eq i32 %t293, 8
- br i1 %t294, label %bb295, label %bb167
-
-bb295:
- ret void
-}
-
-%struct.ct_data_s = type { %union.anon, %union.anon }
-%struct.gz_header = type { i32, i32, i32, i32, i8*, i32, i32, i8*, i32, i8*, i32, i32, i32 }
-%struct.internal_state = type { %struct.z_stream*, i32, i8*, i32, i8*, i32, i32, %struct.gz_header*, i32, i8, i32, i32, i32, i32, i8*, i32, i16*, i16*, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, [573 x %struct.ct_data_s], [61 x %struct.ct_data_s], [39 x %struct.ct_data_s], %struct.tree_desc_s, %struct.tree_desc_s, %struct.tree_desc_s, [16 x i16], [573 x i32], i32, i32, [573 x i8], i8*, i32, i32, i16*, i32, i32, i32, i32, i16, i32 }
-%struct.static_tree_desc = type { i32 }
-%struct.tree_desc_s = type { %struct.ct_data_s*, i32, %struct.static_tree_desc* }
-%struct.z_stream = type { i8*, i32, i32, i8*, i32, i32, i8*, %struct.internal_state*, i8* (i8*, i32, i32)*, void (i8*, i8*)*, i8*, i32, i32, i32 }
-%union.anon = type { i16 }
-
-define i32 @longest_match(%struct.internal_state* %s, i32 %cur_match) nounwind optsize {
-entry:
- %0 = getelementptr inbounds %struct.internal_state* %s, i32 0, i32 31 ; <i32*> [#uses=1]
- %1 = load i32* %0, align 4 ; <i32> [#uses=2]
- %2 = getelementptr inbounds %struct.internal_state* %s, i32 0, i32 14 ; <i8**> [#uses=1]
- %3 = load i8** %2, align 4 ; <i8*> [#uses=27]
- %4 = getelementptr inbounds %struct.internal_state* %s, i32 0, i32 27 ; <i32*> [#uses=1]
- %5 = load i32* %4, align 4 ; <i32> [#uses=17]
- %6 = getelementptr inbounds i8* %3, i32 %5 ; <i8*> [#uses=1]
- %7 = getelementptr inbounds %struct.internal_state* %s, i32 0, i32 30 ; <i32*> [#uses=1]
- %8 = load i32* %7, align 4 ; <i32> [#uses=4]
- %9 = getelementptr inbounds %struct.internal_state* %s, i32 0, i32 36 ; <i32*> [#uses=1]
- %10 = load i32* %9, align 4 ; <i32> [#uses=2]
- %11 = getelementptr inbounds %struct.internal_state* %s, i32 0, i32 11 ; <i32*> [#uses=1]
- %12 = load i32* %11, align 4 ; <i32> [#uses=2]
- %13 = add i32 %12, -262 ; <i32> [#uses=1]
- %14 = icmp ugt i32 %5, %13 ; <i1> [#uses=1]
- br i1 %14, label %bb, label %bb2
-
-bb: ; preds = %entry
- %15 = add i32 %5, 262 ; <i32> [#uses=1]
- %16 = sub i32 %15, %12 ; <i32> [#uses=1]
- br label %bb2
-
-bb2: ; preds = %bb, %entry
- %iftmp.48.0 = phi i32 [ %16, %bb ], [ 0, %entry ] ; <i32> [#uses=1]
- %17 = getelementptr inbounds %struct.internal_state* %s, i32 0, i32 16 ; <i16**> [#uses=1]
- %18 = load i16** %17, align 4 ; <i16*> [#uses=1]
- %19 = getelementptr inbounds %struct.internal_state* %s, i32 0, i32 13 ; <i32*> [#uses=1]
- %20 = load i32* %19, align 4 ; <i32> [#uses=1]
- %.sum = add i32 %5, 258 ; <i32> [#uses=2]
- %21 = getelementptr inbounds i8* %3, i32 %.sum ; <i8*> [#uses=1]
- %22 = add nsw i32 %5, -1 ; <i32> [#uses=1]
- %.sum30 = add i32 %22, %8 ; <i32> [#uses=1]
- %23 = getelementptr inbounds i8* %3, i32 %.sum30 ; <i8*> [#uses=1]
- %24 = load i8* %23, align 1 ; <i8> [#uses=1]
- %.sum31 = add i32 %8, %5 ; <i32> [#uses=1]
- %25 = getelementptr inbounds i8* %3, i32 %.sum31 ; <i8*> [#uses=1]
- %26 = load i8* %25, align 1 ; <i8> [#uses=1]
- %27 = getelementptr inbounds %struct.internal_state* %s, i32 0, i32 35 ; <i32*> [#uses=1]
- %28 = load i32* %27, align 4 ; <i32> [#uses=1]
- %29 = lshr i32 %1, 2 ; <i32> [#uses=1]
- %30 = icmp ult i32 %8, %28 ; <i1> [#uses=1]
- %. = select i1 %30, i32 %1, i32 %29 ; <i32> [#uses=1]
- %31 = getelementptr inbounds %struct.internal_state* %s, i32 0, i32 29 ; <i32*> [#uses=1]
- %32 = load i32* %31, align 4 ; <i32> [#uses=4]
- %33 = icmp ugt i32 %10, %32 ; <i1> [#uses=1]
- %nice_match.0.ph = select i1 %33, i32 %32, i32 %10 ; <i32> [#uses=1]
- %34 = getelementptr inbounds %struct.internal_state* %s, i32 0, i32 28 ; <i32*> [#uses=1]
- %35 = ptrtoint i8* %21 to i32 ; <i32> [#uses=1]
- %36 = add nsw i32 %5, 257 ; <i32> [#uses=1]
- %tmp81 = add i32 %., -1 ; <i32> [#uses=1]
- br label %bb6
-
-bb6: ; preds = %bb24, %bb2
- %indvar78 = phi i32 [ 0, %bb2 ], [ %indvar.next79, %bb24 ] ; <i32> [#uses=2]
- %best_len.2 = phi i32 [ %8, %bb2 ], [ %best_len.0, %bb24 ] ; <i32> [#uses=8]
- %scan_end1.1 = phi i8 [ %24, %bb2 ], [ %scan_end1.0, %bb24 ] ; <i8> [#uses=6]
- %cur_match_addr.0 = phi i32 [ %cur_match, %bb2 ], [ %90, %bb24 ] ; <i32> [#uses=14]
- %scan_end.1 = phi i8 [ %26, %bb2 ], [ %scan_end.0, %bb24 ] ; <i8> [#uses=6]
- %37 = getelementptr inbounds i8* %3, i32 %cur_match_addr.0 ; <i8*> [#uses=1]
- %.sum32 = add i32 %cur_match_addr.0, %best_len.2 ; <i32> [#uses=1]
- %38 = getelementptr inbounds i8* %3, i32 %.sum32 ; <i8*> [#uses=1]
- %39 = load i8* %38, align 1 ; <i8> [#uses=1]
- %40 = icmp eq i8 %39, %scan_end.1 ; <i1> [#uses=1]
- br i1 %40, label %bb7, label %bb23
-
-bb7: ; preds = %bb6
- %41 = add nsw i32 %best_len.2, -1 ; <i32> [#uses=1]
- %.sum33 = add i32 %41, %cur_match_addr.0 ; <i32> [#uses=1]
- %42 = getelementptr inbounds i8* %3, i32 %.sum33 ; <i8*> [#uses=1]
- %43 = load i8* %42, align 1 ; <i8> [#uses=1]
- %44 = icmp eq i8 %43, %scan_end1.1 ; <i1> [#uses=1]
- br i1 %44, label %bb8, label %bb23
-
-bb8: ; preds = %bb7
- %45 = load i8* %37, align 1 ; <i8> [#uses=1]
- %46 = load i8* %6, align 1 ; <i8> [#uses=1]
- %47 = icmp eq i8 %45, %46 ; <i1> [#uses=1]
- br i1 %47, label %bb9, label %bb23
-
-bb9: ; preds = %bb8
- %.sum34 = add i32 %cur_match_addr.0, 1 ; <i32> [#uses=1]
- %48 = getelementptr inbounds i8* %3, i32 %.sum34 ; <i8*> [#uses=1]
- %49 = load i8* %48, align 1 ; <i8> [#uses=1]
- %.sum88 = add i32 %5, 1 ; <i32> [#uses=1]
- %50 = getelementptr inbounds i8* %3, i32 %.sum88 ; <i8*> [#uses=1]
- %51 = load i8* %50, align 1 ; <i8> [#uses=1]
- %52 = icmp eq i8 %49, %51 ; <i1> [#uses=1]
- br i1 %52, label %bb10, label %bb23
-
-bb10: ; preds = %bb9
- %tmp39 = add i32 %cur_match_addr.0, 10 ; <i32> [#uses=1]
- %tmp41 = add i32 %cur_match_addr.0, 9 ; <i32> [#uses=1]
- %tmp44 = add i32 %cur_match_addr.0, 8 ; <i32> [#uses=1]
- %tmp47 = add i32 %cur_match_addr.0, 7 ; <i32> [#uses=1]
- %tmp50 = add i32 %cur_match_addr.0, 6 ; <i32> [#uses=1]
- %tmp53 = add i32 %cur_match_addr.0, 5 ; <i32> [#uses=1]
- %tmp56 = add i32 %cur_match_addr.0, 4 ; <i32> [#uses=1]
- %tmp59 = add i32 %cur_match_addr.0, 3 ; <i32> [#uses=1]
- br label %bb11
-
-bb11: ; preds = %bb18, %bb10
- %indvar = phi i32 [ %indvar.next, %bb18 ], [ 0, %bb10 ] ; <i32> [#uses=2]
- %tmp = shl i32 %indvar, 3 ; <i32> [#uses=16]
- %tmp40 = add i32 %tmp39, %tmp ; <i32> [#uses=1]
- %scevgep = getelementptr i8* %3, i32 %tmp40 ; <i8*> [#uses=1]
- %tmp42 = add i32 %tmp41, %tmp ; <i32> [#uses=1]
- %scevgep43 = getelementptr i8* %3, i32 %tmp42 ; <i8*> [#uses=1]
- %tmp45 = add i32 %tmp44, %tmp ; <i32> [#uses=1]
- %scevgep46 = getelementptr i8* %3, i32 %tmp45 ; <i8*> [#uses=1]
- %tmp48 = add i32 %tmp47, %tmp ; <i32> [#uses=1]
- %scevgep49 = getelementptr i8* %3, i32 %tmp48 ; <i8*> [#uses=1]
- %tmp51 = add i32 %tmp50, %tmp ; <i32> [#uses=1]
- %scevgep52 = getelementptr i8* %3, i32 %tmp51 ; <i8*> [#uses=1]
- %tmp54 = add i32 %tmp53, %tmp ; <i32> [#uses=1]
- %scevgep55 = getelementptr i8* %3, i32 %tmp54 ; <i8*> [#uses=1]
- %tmp60 = add i32 %tmp59, %tmp ; <i32> [#uses=1]
- %scevgep61 = getelementptr i8* %3, i32 %tmp60 ; <i8*> [#uses=1]
- %tmp62 = add i32 %tmp, 10 ; <i32> [#uses=1]
- %.sum89 = add i32 %5, %tmp62 ; <i32> [#uses=2]
- %scevgep63 = getelementptr i8* %3, i32 %.sum89 ; <i8*> [#uses=2]
- %tmp64 = add i32 %tmp, 9 ; <i32> [#uses=1]
- %.sum90 = add i32 %5, %tmp64 ; <i32> [#uses=1]
- %scevgep65 = getelementptr i8* %3, i32 %.sum90 ; <i8*> [#uses=2]
- %tmp66 = add i32 %tmp, 8 ; <i32> [#uses=1]
- %.sum91 = add i32 %5, %tmp66 ; <i32> [#uses=1]
- %scevgep67 = getelementptr i8* %3, i32 %.sum91 ; <i8*> [#uses=2]
- %tmp6883 = or i32 %tmp, 7 ; <i32> [#uses=1]
- %.sum92 = add i32 %5, %tmp6883 ; <i32> [#uses=1]
- %scevgep69 = getelementptr i8* %3, i32 %.sum92 ; <i8*> [#uses=2]
- %tmp7084 = or i32 %tmp, 6 ; <i32> [#uses=1]
- %.sum93 = add i32 %5, %tmp7084 ; <i32> [#uses=1]
- %scevgep71 = getelementptr i8* %3, i32 %.sum93 ; <i8*> [#uses=2]
- %tmp7285 = or i32 %tmp, 5 ; <i32> [#uses=1]
- %.sum94 = add i32 %5, %tmp7285 ; <i32> [#uses=1]
- %scevgep73 = getelementptr i8* %3, i32 %.sum94 ; <i8*> [#uses=2]
- %tmp7486 = or i32 %tmp, 4 ; <i32> [#uses=1]
- %.sum95 = add i32 %5, %tmp7486 ; <i32> [#uses=1]
- %scevgep75 = getelementptr i8* %3, i32 %.sum95 ; <i8*> [#uses=2]
- %tmp7687 = or i32 %tmp, 3 ; <i32> [#uses=1]
- %.sum96 = add i32 %5, %tmp7687 ; <i32> [#uses=1]
- %scevgep77 = getelementptr i8* %3, i32 %.sum96 ; <i8*> [#uses=2]
- %53 = load i8* %scevgep77, align 1 ; <i8> [#uses=1]
- %54 = load i8* %scevgep61, align 1 ; <i8> [#uses=1]
- %55 = icmp eq i8 %53, %54 ; <i1> [#uses=1]
- br i1 %55, label %bb12, label %bb20
-
-bb12: ; preds = %bb11
- %tmp57 = add i32 %tmp56, %tmp ; <i32> [#uses=1]
- %scevgep58 = getelementptr i8* %3, i32 %tmp57 ; <i8*> [#uses=1]
- %56 = load i8* %scevgep75, align 1 ; <i8> [#uses=1]
- %57 = load i8* %scevgep58, align 1 ; <i8> [#uses=1]
- %58 = icmp eq i8 %56, %57 ; <i1> [#uses=1]
- br i1 %58, label %bb13, label %bb20
-
-bb13: ; preds = %bb12
- %59 = load i8* %scevgep73, align 1 ; <i8> [#uses=1]
- %60 = load i8* %scevgep55, align 1 ; <i8> [#uses=1]
- %61 = icmp eq i8 %59, %60 ; <i1> [#uses=1]
- br i1 %61, label %bb14, label %bb20
-
-bb14: ; preds = %bb13
- %62 = load i8* %scevgep71, align 1 ; <i8> [#uses=1]
- %63 = load i8* %scevgep52, align 1 ; <i8> [#uses=1]
- %64 = icmp eq i8 %62, %63 ; <i1> [#uses=1]
- br i1 %64, label %bb15, label %bb20
-
-bb15: ; preds = %bb14
- %65 = load i8* %scevgep69, align 1 ; <i8> [#uses=1]
- %66 = load i8* %scevgep49, align 1 ; <i8> [#uses=1]
- %67 = icmp eq i8 %65, %66 ; <i1> [#uses=1]
- br i1 %67, label %bb16, label %bb20
-
-bb16: ; preds = %bb15
- %68 = load i8* %scevgep67, align 1 ; <i8> [#uses=1]
- %69 = load i8* %scevgep46, align 1 ; <i8> [#uses=1]
- %70 = icmp eq i8 %68, %69 ; <i1> [#uses=1]
- br i1 %70, label %bb17, label %bb20
-
-bb17: ; preds = %bb16
- %71 = load i8* %scevgep65, align 1 ; <i8> [#uses=1]
- %72 = load i8* %scevgep43, align 1 ; <i8> [#uses=1]
- %73 = icmp eq i8 %71, %72 ; <i1> [#uses=1]
- br i1 %73, label %bb18, label %bb20
-
-bb18: ; preds = %bb17
- %74 = load i8* %scevgep63, align 1 ; <i8> [#uses=1]
- %75 = load i8* %scevgep, align 1 ; <i8> [#uses=1]
- %76 = icmp eq i8 %74, %75 ; <i1> [#uses=1]
- %77 = icmp slt i32 %.sum89, %.sum ; <i1> [#uses=1]
- %or.cond = and i1 %76, %77 ; <i1> [#uses=1]
- %indvar.next = add i32 %indvar, 1 ; <i32> [#uses=1]
- br i1 %or.cond, label %bb11, label %bb20
-
-bb20: ; preds = %bb18, %bb17, %bb16, %bb15, %bb14, %bb13, %bb12, %bb11
- %scan.3 = phi i8* [ %scevgep77, %bb11 ], [ %scevgep75, %bb12 ], [ %scevgep73, %bb13 ], [ %scevgep71, %bb14 ], [ %scevgep69, %bb15 ], [ %scevgep67, %bb16 ], [ %scevgep65, %bb17 ], [ %scevgep63, %bb18 ] ; <i8*> [#uses=1]
- %78 = ptrtoint i8* %scan.3 to i32 ; <i32> [#uses=1]
- %79 = sub nsw i32 %78, %35 ; <i32> [#uses=2]
- %80 = add i32 %79, 258 ; <i32> [#uses=5]
- %81 = icmp sgt i32 %80, %best_len.2 ; <i1> [#uses=1]
- br i1 %81, label %bb21, label %bb23
-
-bb21: ; preds = %bb20
- store i32 %cur_match_addr.0, i32* %34, align 4
- %82 = icmp slt i32 %80, %nice_match.0.ph ; <i1> [#uses=1]
- br i1 %82, label %bb22, label %bb25
-
-bb22: ; preds = %bb21
- %.sum37 = add i32 %36, %79 ; <i32> [#uses=1]
- %83 = getelementptr inbounds i8* %3, i32 %.sum37 ; <i8*> [#uses=1]
- %84 = load i8* %83, align 1 ; <i8> [#uses=1]
- %.sum38 = add i32 %80, %5 ; <i32> [#uses=1]
- %85 = getelementptr inbounds i8* %3, i32 %.sum38 ; <i8*> [#uses=1]
- %86 = load i8* %85, align 1 ; <i8> [#uses=1]
- br label %bb23
-
-bb23: ; preds = %bb22, %bb20, %bb9, %bb8, %bb7, %bb6
- %best_len.0 = phi i32 [ %best_len.2, %bb6 ], [ %best_len.2, %bb7 ], [ %best_len.2, %bb8 ], [ %best_len.2, %bb9 ], [ %80, %bb22 ], [ %best_len.2, %bb20 ] ; <i32> [#uses=3]
- %scan_end1.0 = phi i8 [ %scan_end1.1, %bb6 ], [ %scan_end1.1, %bb7 ], [ %scan_end1.1, %bb8 ], [ %scan_end1.1, %bb9 ], [ %84, %bb22 ], [ %scan_end1.1, %bb20 ] ; <i8> [#uses=1]
- %scan_end.0 = phi i8 [ %scan_end.1, %bb6 ], [ %scan_end.1, %bb7 ], [ %scan_end.1, %bb8 ], [ %scan_end.1, %bb9 ], [ %86, %bb22 ], [ %scan_end.1, %bb20 ] ; <i8> [#uses=1]
- %87 = and i32 %cur_match_addr.0, %20 ; <i32> [#uses=1]
- %88 = getelementptr inbounds i16* %18, i32 %87 ; <i16*> [#uses=1]
- %89 = load i16* %88, align 2 ; <i16> [#uses=1]
- %90 = zext i16 %89 to i32 ; <i32> [#uses=2]
- %91 = icmp ugt i32 %90, %iftmp.48.0 ; <i1> [#uses=1]
- br i1 %91, label %bb24, label %bb25
-
-bb24: ; preds = %bb23
-
-; LSR should use count-down iteration to avoid requiring the trip count
-; in a register.
-
-; CHECK: @ %bb24
-; CHECK: subs{{.*}} {{(r[0-9]+)|(lr)}}, #1
-; CHECK: bne.w
-
- %92 = icmp eq i32 %tmp81, %indvar78 ; <i1> [#uses=1]
- %indvar.next79 = add i32 %indvar78, 1 ; <i32> [#uses=1]
- br i1 %92, label %bb25, label %bb6
-
-bb25: ; preds = %bb24, %bb23, %bb21
- %best_len.1 = phi i32 [ %best_len.0, %bb23 ], [ %best_len.0, %bb24 ], [ %80, %bb21 ] ; <i32> [#uses=2]
- %93 = icmp ugt i32 %best_len.1, %32 ; <i1> [#uses=1]
- %merge = select i1 %93, i32 %32, i32 %best_len.1 ; <i32> [#uses=1]
- ret i32 %merge
-}
diff --git a/test/CodeGen/ARM/machine-cse-cmp.ll b/test/CodeGen/ARM/machine-cse-cmp.ll
index f566974..3ac7d77 100644
--- a/test/CodeGen/ARM/machine-cse-cmp.ll
+++ b/test/CodeGen/ARM/machine-cse-cmp.ll
@@ -10,7 +10,7 @@ entry:
; CHECK: cmp
; CHECK: moveq
; CHECK-NOT: cmp
-; CHECK: moveq
+; CHECK: mov{{eq|ne}}
%tmp1 = icmp eq i32 %cond1, 0
%tmp2 = select i1 %tmp1, i32 %x1, i32 %x2
%tmp3 = select i1 %tmp1, i32 %x2, i32 %x3
diff --git a/test/CodeGen/ARM/opt-shuff-tstore.ll b/test/CodeGen/ARM/opt-shuff-tstore.ll
new file mode 100644
index 0000000..b4da552
--- /dev/null
+++ b/test/CodeGen/ARM/opt-shuff-tstore.ll
@@ -0,0 +1,19 @@
+; RUN: llc -mcpu=cortex-a9 -mtriple=arm-linux-unknown -promote-elements -mattr=+neon < %s | FileCheck %s
+
+; CHECK: func_4_8
+; CHECK: vst1.32
+; CHECK-NEXT: bx lr
+define void @func_4_8(<4 x i8> %param, <4 x i8>* %p) {
+ %r = add <4 x i8> %param, <i8 1, i8 2, i8 3, i8 4>
+ store <4 x i8> %r, <4 x i8>* %p
+ ret void
+}
+
+; CHECK: func_2_16
+; CHECK: vst1.32
+; CHECK-NEXT: bx lr
+define void @func_2_16(<2 x i16> %param, <2 x i16>* %p) {
+ %r = add <2 x i16> %param, <i16 1, i16 2>
+ store <2 x i16> %r, <2 x i16>* %p
+ ret void
+}
diff --git a/test/CodeGen/ARM/reg_asc_order.ll b/test/CodeGen/ARM/reg_asc_order.ll
new file mode 100644
index 0000000..d1d0ee5
--- /dev/null
+++ b/test/CodeGen/ARM/reg_asc_order.ll
@@ -0,0 +1,16 @@
+; RUN: llc < %s -march=arm -mcpu=cortex-a8 | FileCheck %s
+; Check that memcpy gets lowered to ldm/stm, at least in this very simple case.
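+; A 16-byte, 4-byte-aligned POD copy is four words, so it can be a single
+; ldm/stm pair of four registers instead of four ldr/str pairs.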
+
+%struct.Foo = type { i32, i32, i32, i32 }
+
+define void @_Z10CopyStructP3FooS0_(%struct.Foo* nocapture %a, %struct.Foo* nocapture %b) nounwind {
+entry:
+;CHECK: ldm
+;CHECK: stm
+ %0 = bitcast %struct.Foo* %a to i8*
+ %1 = bitcast %struct.Foo* %b to i8*
+ tail call void @llvm.memcpy.p0i8.p0i8.i32(i8* %0, i8* %1, i32 16, i32 4, i1 false)
+ ret void
+}
+
+declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture, i32, i32, i1) nounwind
diff --git a/test/CodeGen/ARM/reg_sequence.ll b/test/CodeGen/ARM/reg_sequence.ll
index 78b4e7e..05794e4 100644
--- a/test/CodeGen/ARM/reg_sequence.ll
+++ b/test/CodeGen/ARM/reg_sequence.ll
@@ -273,7 +273,7 @@ define arm_aapcs_vfpcc i32 @t10() nounwind {
entry:
; CHECK: t10:
; CHECK: vmov.i32 q[[Q0:[0-9]+]], #0x3f000000
-; CHECK: vmul.f32 q8, q8, d0[0]
+; CHECK: vmul.f32 q8, q8, d[[DREG:[0-1]+]]
; CHECK: vadd.f32 q8, q8, q8
%0 = shufflevector <4 x float> zeroinitializer, <4 x float> undef, <4 x i32> zeroinitializer ; <<4 x float>> [#uses=1]
%1 = insertelement <4 x float> %0, float undef, i32 1 ; <<4 x float>> [#uses=1]
diff --git a/test/CodeGen/ARM/select-imm.ll b/test/CodeGen/ARM/select-imm.ll
index e927b39..c9ac66a 100644
--- a/test/CodeGen/ARM/select-imm.ll
+++ b/test/CodeGen/ARM/select-imm.ll
@@ -64,7 +64,7 @@ define i32 @t4(i32 %a, i32 %b, i32 %x) nounwind {
entry:
; ARM: t4:
; ARM: ldr
-; ARM: movlt
+; ARM: mov{{lt|ge}}
; ARMT2: t4:
; ARMT2: movwlt [[R0:r[0-9]+]], #65365
diff --git a/test/CodeGen/ARM/select.ll b/test/CodeGen/ARM/select.ll
index f1bd7ee..3e07da8 100644
--- a/test/CodeGen/ARM/select.ll
+++ b/test/CodeGen/ARM/select.ll
@@ -76,12 +76,12 @@ define double @f7(double %a, double %b) {
; block generated, odds are good that we have close to the ideal code for this:
;
; CHECK-NEON: _f8:
-; CHECK-NEON: adr r2, LCPI7_0
-; CHECK-NEON-NEXT: movw r3, #1123
-; CHECK-NEON-NEXT: adds r1, r2, #4
-; CHECK-NEON-NEXT: cmp r0, r3
+; CHECK-NEON: adr [[R2:r[0-9]+]], LCPI7_0
+; CHECK-NEON-NEXT: movw [[R3:r[0-9]+]], #1123
+; CHECK-NEON-NEXT: adds {{r.*}}, [[R2]], #4
+; CHECK-NEON-NEXT: cmp r0, [[R3]]
; CHECK-NEON-NEXT: it ne
-; CHECK-NEON-NEXT: movne r1, r2
+; CHECK-NEON-NEXT: movne {{r.*}}, [[R2]]
; CHECK-NEON-NEXT: ldr
; CHECK-NEON: bx
diff --git a/test/CodeGen/ARM/tail-opts.ll b/test/CodeGen/ARM/tail-opts.ll
index 3dc77e2..220b0f1 100644
--- a/test/CodeGen/ARM/tail-opts.ll
+++ b/test/CodeGen/ARM/tail-opts.ll
@@ -16,11 +16,11 @@ declare i8* @choose(i8*, i8*)
; CHECK: tail_duplicate_me:
; CHECK: qux
-; CHECK: qux
; CHECK: movw r{{[0-9]+}}, :lower16:_GHJK
; CHECK: movt r{{[0-9]+}}, :upper16:_GHJK
; CHECK: str r
; CHECK-NEXT: bx r
+; CHECK: qux
; CHECK: movw r{{[0-9]+}}, :lower16:_GHJK
; CHECK: movt r{{[0-9]+}}, :upper16:_GHJK
; CHECK: str r
diff --git a/test/CodeGen/ARM/vdiv_combine.ll b/test/CodeGen/ARM/vdiv_combine.ll
index 1387393..7fddbed 100644
--- a/test/CodeGen/ARM/vdiv_combine.ll
+++ b/test/CodeGen/ARM/vdiv_combine.ll
@@ -8,7 +8,7 @@ declare void @foo_int32x4_t(<4 x i32>)
; Test signed conversion.
; CHECK: t1
-; CHECK-NOT: vdiv
+; CHECK-NOT: {{vdiv|vmul}}
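+; (A divide by 2^n after sint_to_fp folds to one vcvt.f32.s32 with #n
+; fraction bits, so neither a vdiv nor a vmul-by-reciprocal should appear.)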
define void @t1() nounwind {
entry:
%tmp = load i32* @iin, align 4, !tbaa !3
@@ -24,7 +24,7 @@ declare void @foo_float32x2_t(<2 x float>)
; Test unsigned conversion.
; CHECK: t2
-; CHECK-NOT: vdiv
+; CHECK-NOT: {{vdiv|vmul}}
define void @t2() nounwind {
entry:
%tmp = load i32* @uin, align 4, !tbaa !3
@@ -38,7 +38,7 @@ entry:
; Test which should not fold due to non-power of 2.
; CHECK: t3
-; CHECK: vdiv
+; CHECK: {{vdiv|vmul}}
define void @t3() nounwind {
entry:
%tmp = load i32* @iin, align 4, !tbaa !3
@@ -52,7 +52,7 @@ entry:
; Test which should not fold due to power of 2 out of range.
; CHECK: t4
-; CHECK: vdiv
+; CHECK: {{vdiv|vmul}}
define void @t4() nounwind {
entry:
%tmp = load i32* @iin, align 4, !tbaa !3
@@ -66,7 +66,7 @@ entry:
; Test case where const is max power of 2 (i.e., 2^32).
; CHECK: t5
-; CHECK-NOT: vdiv
+; CHECK-NOT: {{vdiv|vmul}}
define void @t5() nounwind {
entry:
%tmp = load i32* @iin, align 4, !tbaa !3
@@ -80,7 +80,7 @@ entry:
; Test quadword.
; CHECK: t6
-; CHECK-NOT: vdiv
+; CHECK-NOT: {{vdiv|vmul}}
define void @t6() nounwind {
entry:
%tmp = load i32* @iin, align 4, !tbaa !3
diff --git a/test/CodeGen/ARM/vector-extend-narrow.ll b/test/CodeGen/ARM/vector-extend-narrow.ll
new file mode 100644
index 0000000..1ec36da
--- /dev/null
+++ b/test/CodeGen/ARM/vector-extend-narrow.ll
@@ -0,0 +1,60 @@
+; RUN: llc -mtriple armv7 %s -o - | FileCheck %s
+
+; CHECK: f:
+define float @f(<4 x i16>* nocapture %in) {
+ ; CHECK: vldr
+ ; CHECK: vmovl.u16
+ %1 = load <4 x i16>* %in
+ ; CHECK: vcvt.f32.u32
+ %2 = uitofp <4 x i16> %1 to <4 x float>
+ %3 = extractelement <4 x float> %2, i32 0
+ %4 = extractelement <4 x float> %2, i32 1
+ %5 = extractelement <4 x float> %2, i32 2
+
+ ; CHECK: vadd.f32
+ %6 = fadd float %3, %4
+ %7 = fadd float %6, %5
+
+ ret float %7
+}
+
+; CHECK: g:
+define float @g(<4 x i8>* nocapture %in) {
+ ; CHECK: vldr
+ ; CHECK: vmovl.u8
+ ; CHECK: vmovl.u16
+ %1 = load <4 x i8>* %in
+ ; CHECK: vcvt.f32.u32
+ %2 = uitofp <4 x i8> %1 to <4 x float>
+ %3 = extractelement <4 x float> %2, i32 0
+ %4 = extractelement <4 x float> %2, i32 1
+ %5 = extractelement <4 x float> %2, i32 2
+
+ ; CHECK: vadd.f32
+ %6 = fadd float %3, %4
+ %7 = fadd float %6, %5
+
+ ret float %7
+}
+
+; CHECK: h:
+define <4 x i8> @h(<4 x float> %v) {
+ ; CHECK: vcvt.{{[us]}}32.f32
+ ; CHECK: vmovn.i32
+ %1 = fptoui <4 x float> %v to <4 x i8>
+ ret <4 x i8> %1
+}
+
+; CHECK: i:
+define <4 x i8> @i(<4 x i8>* %x) {
+ ; CHECK: vldr
+ ; CHECK: vmovl.s8
+ ; CHECK: vmovl.s16
+ ; CHECK: vrecpe
+ ; CHECK: vrecps
+ ; CHECK: vmul
+ ; CHECK: vmovn
+ %1 = load <4 x i8>* %x, align 4
+ %2 = sdiv <4 x i8> zeroinitializer, %1
+ ret <4 x i8> %2
+}
diff --git a/test/CodeGen/ARM/vrev.ll b/test/CodeGen/ARM/vrev.ll
index e154334..122ec03 100644
--- a/test/CodeGen/ARM/vrev.ll
+++ b/test/CodeGen/ARM/vrev.ll
@@ -149,12 +149,10 @@ define void @test_with_vcombine(<4 x float>* %v) nounwind {
}
; The type <2 x i16> is legalized to <2 x i32> and needs to be trunc-stored
-; to <2 x i16> when stored to memory. Currently ARM scalarizes these stores.
-; See PR 11158
+; to <2 x i16> when stored to memory.
define void @test_vrev64(<4 x i16>* nocapture %source, <2 x i16>* nocapture %dst) nounwind ssp {
; CHECK: test_vrev64:
-; CHECK: vst1.16
-; CHECK: vst1.16
+; CHECK: vst1.32
entry:
%0 = bitcast <4 x i16>* %source to <8 x i16>*
%tmp2 = load <8 x i16>* %0, align 4
diff --git a/test/CodeGen/ARM/widen-vmovs.ll b/test/CodeGen/ARM/widen-vmovs.ll
index 1f5113e..679e3f4 100644
--- a/test/CodeGen/ARM/widen-vmovs.ll
+++ b/test/CodeGen/ARM/widen-vmovs.ll
@@ -1,7 +1,7 @@
-; RUN: llc < %s -widen-vmovs -mcpu=cortex-a8 -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -widen-vmovs -mcpu=cortex-a8 -verify-machineinstrs -disable-code-place | FileCheck %s
target triple = "thumbv7-apple-ios"
-; The 0.0 constant is loaded from the constant pool and kept in a register.
+; The 1.0e+10 constant is loaded from the constant pool and kept in a register.
; CHECK: %entry
; CHECK: vldr s
; The float loop variable is initialized with a vmovs from the constant register.
@@ -10,6 +10,7 @@ target triple = "thumbv7-apple-ios"
; CHECK: , [[DN]]
; CHECK: %for.body.i
; CHECK: vadd.f32 [[DL]], [[DL]], [[DN]]
+; CHECK: %rInnerproduct.exit
;
; This test is verifying:
; - The VMOVS widening is happening.
@@ -24,8 +25,8 @@ for.body4:
br label %for.body.i
for.body.i:
- %tmp3.i = phi float [ 0.000000e+00, %for.body4 ], [ %add.i, %for.body.i ]
- %add.i = fadd float %tmp3.i, 0.000000e+00
+ %tmp3.i = phi float [ 1.000000e+10, %for.body4 ], [ %add.i, %for.body.i ]
+ %add.i = fadd float %tmp3.i, 1.000000e+10
%exitcond.i = icmp eq i32 undef, 41
br i1 %exitcond.i, label %rInnerproduct.exit, label %for.body.i