From 65457b679ae240c1a37da82c5484dac478c47b6d Mon Sep 17 00:00:00 2001 From: Arnold Schwaighofer Date: Tue, 17 Sep 2013 18:06:50 +0000 Subject: Costmodel: Add support for horizontal vector reductions Upcoming SLP vectorization improvements will want to be able to estimate costs of horizontal reductions. Add infrastructure to support this. We model reductions as a series of (shufflevector,add) tuples ultimately followed by an extractelement. For example, for an add-reduction of <4 x float> we could generate the following sequence: (v0, v1, v2, v3) \ \ / / \ \ / + + (v0+v2, v1+v3, undef, undef) \ / ((v0+v2) + (v1+v3), undef, undef, undef) %rdx.shuf = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef> %bin.rdx = fadd <4 x float> %rdx, %rdx.shuf %rdx.shuf7 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef> %bin.rdx8 = fadd <4 x float> %bin.rdx, %rdx.shuf7 %r = extractelement <4 x float> %bin.rdx8, i32 0 This commit adds a cost model interface "getReductionCost(Opcode, Ty, Pairwise)" that will allow clients to ask for the cost of such a reduction (as backends might generate more efficient code than the cost of the individual instructions summed up). This interface is exercised by the CostModel analysis pass which looks for reduction patterns like the one above - starting at extractelements - and if it sees a matching sequence will call the cost model interface. We will also support a second form of pairwise reduction that is well supported on common architectures (haddps, vpadd, faddp). 
(v0, v1, v2, v3) \ / \ / (v0+v1, v2+v3, undef, undef) \ / ((v0+v1)+(v2+v3), undef, undef, undef) %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef> %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef> %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1 %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef> %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef> %bin.rdx.1 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1 %r = extractelement <4 x float> %bin.rdx.1, i32 0 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@190876 91177308-0d34-0410-b5e6-96231b3b80d8 --- test/Analysis/CostModel/X86/reduction.ll | 94 ++++++++++++++++++++++++++++++++ 1 file changed, 94 insertions(+) create mode 100644 test/Analysis/CostModel/X86/reduction.ll (limited to 'test/Analysis') diff --git a/test/Analysis/CostModel/X86/reduction.ll b/test/Analysis/CostModel/X86/reduction.ll new file mode 100644 index 0000000..37d5f24 --- /dev/null +++ b/test/Analysis/CostModel/X86/reduction.ll @@ -0,0 +1,94 @@ +; RUN: opt < %s -cost-model -costmodel-reduxcost=true -analyze -mcpu=core2 -mtriple=x86_64-apple-darwin | FileCheck %s + +define fastcc float @reduction_cost_float(<4 x float> %rdx) { + %rdx.shuf = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> + %bin.rdx = fadd <4 x float> %rdx, %rdx.shuf + %rdx.shuf7 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> + %bin.rdx8 = fadd <4 x float> %bin.rdx, %rdx.shuf7 + +; Check that we recognize the tree starting at the extractelement as a +; reduction. 
+; CHECK-LABEL: reduction_cost +; CHECK: cost of 9 {{.*}} extractelement + + %r = extractelement <4 x float> %bin.rdx8, i32 0 + ret float %r +} + +define fastcc i32 @reduction_cost_int(<8 x i32> %rdx) { + %rdx.shuf = shufflevector <8 x i32> %rdx, <8 x i32> undef, + <8 x i32> + %bin.rdx = add <8 x i32> %rdx, %rdx.shuf + %rdx.shuf.2 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, + <8 x i32> + %bin.rdx.2 = add <8 x i32> %bin.rdx, %rdx.shuf.2 + %rdx.shuf.3 = shufflevector <8 x i32> %bin.rdx.2, <8 x i32> undef, + <8 x i32> + %bin.rdx.3 = add <8 x i32> %bin.rdx.2, %rdx.shuf.3 + +; CHECK-LABEL: reduction_cost_int +; CHECK: cost of 23 {{.*}} extractelement + + %r = extractelement <8 x i32> %bin.rdx.3, i32 0 + ret i32 %r +} + +define fastcc float @pairwise_hadd(<4 x float> %rdx, float %f1) { + %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef, + <4 x i32> + %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef, + <4 x i32> + %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1 + %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, + <4 x i32> + %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, + <4 x i32> + %bin.rdx.1 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1 + +; CHECK-LABEL: pairwise_hadd +; CHECK: cost of 11 {{.*}} extractelement + + %r = extractelement <4 x float> %bin.rdx.1, i32 0 + %r2 = fadd float %r, %f1 + ret float %r2 +} +define fastcc float @pairwise_hadd_assoc(<4 x float> %rdx, float %f1) { + %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef, + <4 x i32> + %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef, + <4 x i32> + %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.1, %rdx.shuf.0.0 + %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, + <4 x i32> + %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, + <4 x i32> + %bin.rdx.1 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1 + +; CHECK-LABEL: 
pairwise_hadd_assoc +; CHECK: cost of 11 {{.*}} extractelement + + %r = extractelement <4 x float> %bin.rdx.1, i32 0 + %r2 = fadd float %r, %f1 + ret float %r2 +} + +define fastcc float @pairwise_hadd_skip_first(<4 x float> %rdx, float %f1) { + %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef, + <4 x i32> + %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef, + <4 x i32> + %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1 + %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, + <4 x i32> + %bin.rdx.1 = fadd <4 x float> %bin.rdx.0, %rdx.shuf.1.1 + +; CHECK-LABEL: pairwise_hadd_skip_first +; CHECK: cost of 11 {{.*}} extractelement + + %r = extractelement <4 x float> %bin.rdx.1, i32 0 + %r2 = fadd float %r, %f1 + ret float %r2 +} -- cgit v1.1