-rw-r--r--  include/llvm/Analysis/TargetTransformInfo.h |  16
-rw-r--r--  lib/Analysis/CostModel.cpp                   | 272
-rw-r--r--  lib/Analysis/TargetTransformInfo.cpp         |   9
-rw-r--r--  lib/CodeGen/BasicTargetTransformInfo.cpp     |  15
-rw-r--r--  test/Analysis/CostModel/X86/reduction.ll     |  94
5 files changed, 406 insertions(+), 0 deletions(-)
diff --git a/include/llvm/Analysis/TargetTransformInfo.h b/include/llvm/Analysis/TargetTransformInfo.h
index eb70e9c..9104fd4 100644
--- a/include/llvm/Analysis/TargetTransformInfo.h
+++ b/include/llvm/Analysis/TargetTransformInfo.h
@@ -368,6 +368,22 @@ public:
unsigned Alignment,
unsigned AddressSpace) const;
+ /// \brief Calculate the cost of performing a vector reduction.
+ ///
+ /// This is the cost of reducing the vector value of type \p Ty to a scalar
+ /// value using the operation denoted by \p Opcode. The form of the reduction
+ /// can either be a pairwise reduction or a reduction that splits the vector
+ /// at every reduction level.
+ ///
+ /// Pairwise:
+ /// (v0, v1, v2, v3)
+ /// ((v0+v1), (v2+v3), undef, undef)
+ /// Split:
+ /// (v0, v1, v2, v3)
+ /// ((v0+v2), (v1+v3), undef, undef)
+ virtual unsigned getReductionCost(unsigned Opcode, Type *Ty,
+ bool IsPairwiseForm) const;
+
/// \returns The cost of Intrinsic instructions.
virtual unsigned getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
ArrayRef<Type *> Tys) const;
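
As a rough illustration of the new hook (not part of the patch), a client such as a
vectorizer could query both reduction forms for a given vector type. The names Ctx,
TTI, and VecTy below are placeholders for whatever context, TargetTransformInfo
handle, and type the caller already has, with the usual IR and TargetTransformInfo
headers assumed to be included:

  // Sketch only: cost of reducing a <4 x float> with fadd in both forms.
  Type *VecTy = VectorType::get(Type::getFloatTy(Ctx), 4);
  unsigned SplitCost    = TTI->getReductionCost(Instruction::FAdd, VecTy,
                                                /*IsPairwiseForm=*/false);
  unsigned PairwiseCost = TTI->getReductionCost(Instruction::FAdd, VecTy,
                                                /*IsPairwiseForm=*/true);
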
diff --git a/lib/Analysis/CostModel.cpp b/lib/Analysis/CostModel.cpp
index 927508e..6970b87 100644
--- a/lib/Analysis/CostModel.cpp
+++ b/lib/Analysis/CostModel.cpp
@@ -19,6 +19,7 @@
#define CM_NAME "cost-model"
#define DEBUG_TYPE CM_NAME
+#include "llvm/ADT/STLExtras.h"
#include "llvm/Analysis/Passes.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Function.h"
@@ -26,10 +27,15 @@
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Value.h"
#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
+static cl::opt<bool> EnableReduxCost("costmodel-reduxcost", cl::init(false),
+ cl::Hidden,
+ cl::desc("Recognize reduction patterns."));
+
namespace {
class CostModelAnalysis : public FunctionPass {
@@ -105,6 +111,261 @@ static TargetTransformInfo::OperandValueKind getOperandInfo(Value *V) {
return OpInfo;
}
+static bool matchMask(SmallVectorImpl<int> &M1, SmallVectorImpl<int> &M2) {
+ if (M1.size() != M2.size())
+ return false;
+
+ for (unsigned i = 0, e = M1.size(); i != e; ++i)
+ if (M1[i] != M2[i])
+ return false;
+
+ return true;
+}
+
+static bool matchPairwiseShuffleMask(ShuffleVectorInst *SI, bool IsLeft,
+ unsigned Level) {
+ // We don't need a shuffle if we just want to have element 0 in position 0 of
+ // the vector.
+ if (!SI && Level == 0 && IsLeft)
+ return true;
+ else if (!SI)
+ return false;
+
+ SmallVector<int, 32> Mask(SI->getType()->getVectorNumElements(), -1);
+
+ // Build a mask of 0, 2, ... (left) or 1, 3, ... (right) depending on whether
+ // we look at the left or right side.
+ for (unsigned i = 0, e = (1 << Level), val = !IsLeft; i != e; ++i, val += 2)
+ Mask[i] = val;
+
+ SmallVector<int, 16> ActualMask = SI->getShuffleMask();
+ if (!matchMask(Mask, ActualMask))
+ return false;
+
+ return true;
+}
+
+static bool matchPairwiseReductionAtLevel(const BinaryOperator *BinOp,
+ unsigned Level, unsigned NumLevels) {
+ // Match one level of pairwise operations.
+ // %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef,
+ // <4 x i32> <i32 0, i32 2 , i32 undef, i32 undef>
+ // %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef,
+ // <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
+ // %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1
+ if (BinOp == 0)
+ return false;
+
+ Type *VecTy = BinOp->getType();
+ assert(VecTy->isVectorTy() && "Expecting a vector type");
+
+ unsigned Opcode = BinOp->getOpcode();
+ Value *L = BinOp->getOperand(0);
+ Value *R = BinOp->getOperand(1);
+
+ ShuffleVectorInst *LS = dyn_cast<ShuffleVectorInst>(L);
+ if (!LS && Level)
+ return false;
+ ShuffleVectorInst *RS = dyn_cast<ShuffleVectorInst>(R);
+ if (!RS && Level)
+ return false;
+
+ // On level 0 we can omit one shufflevector instruction.
+ if (!Level && !RS && !LS)
+ return false;
+
+ // Shuffle inputs must match.
+ Value *NextLevelOpL = LS ? LS->getOperand(0) : 0;
+ Value *NextLevelOpR = RS ? RS->getOperand(0) : 0;
+ Value *NextLevelOp = 0;
+ if (NextLevelOpR && NextLevelOpL) {
+ // If we have two shuffles their operands must match.
+ if (NextLevelOpL != NextLevelOpR)
+ return false;
+
+ NextLevelOp = NextLevelOpL;
+ } else if (Level == 0 && (NextLevelOpR || NextLevelOpL)) {
+ // On the first level we can omit the shufflevector <0, undef,...>. So the
+ // input to the other shufflevector <1, undef> must match one of the
+ // inputs to the current binary operation.
+ // Example:
+ // %NextLevelOpL = shufflevector %R, <1, undef ...>
+ // %BinOp = fadd %NextLevelOpL, %R
+ if (NextLevelOpL && NextLevelOpL != R)
+ return false;
+ else if (NextLevelOpR && NextLevelOpR != L)
+ return false;
+
+ NextLevelOp = NextLevelOpL ? R : L;
+ } else
+ return false;
+
+ // Check that the next level's binary operation exists and matches the
+ // current one.
+ BinaryOperator *NextLevelBinOp = 0;
+ if (Level + 1 != NumLevels) {
+ if (!(NextLevelBinOp = dyn_cast<BinaryOperator>(NextLevelOp)))
+ return false;
+ else if (NextLevelBinOp->getOpcode() != Opcode)
+ return false;
+ }
+
+ // Shuffle mask for pairwise operation must match.
+ if (matchPairwiseShuffleMask(LS, true, Level)) {
+ if (!matchPairwiseShuffleMask(RS, false, Level))
+ return false;
+ } else if (matchPairwiseShuffleMask(RS, true, Level)) {
+ if (!matchPairwiseShuffleMask(LS, false, Level))
+ return false;
+ } else
+ return false;
+
+ if (++Level == NumLevels)
+ return true;
+
+ // Match next level.
+ return matchPairwiseReductionAtLevel(NextLevelBinOp, Level, NumLevels);
+}
+
+static bool matchPairwiseReduction(const ExtractElementInst *ReduxRoot,
+ unsigned &Opcode, Type *&Ty) {
+ if (!EnableReduxCost)
+ return false;
+
+ // Need to extract the first element.
+ ConstantInt *CI = dyn_cast<ConstantInt>(ReduxRoot->getOperand(1));
+ unsigned Idx = ~0u;
+ if (CI)
+ Idx = CI->getZExtValue();
+ if (Idx != 0)
+ return false;
+
+ BinaryOperator *RdxStart = dyn_cast<BinaryOperator>(ReduxRoot->getOperand(0));
+ if (!RdxStart)
+ return false;
+
+ Type *VecTy = ReduxRoot->getOperand(0)->getType();
+ unsigned NumVecElems = VecTy->getVectorNumElements();
+ if (!isPowerOf2_32(NumVecElems))
+ return false;
+
+ // We look for a sequence of shuffle, shuffle, add triples like the following
+ // that builds a pairwise reduction tree.
+ //
+ // (X0, X1, X2, X3)
+ // (X0 + X1, X2 + X3, undef, undef)
+ // ((X0 + X1) + (X2 + X3), undef, undef, undef)
+ //
+ // %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef,
+ // <4 x i32> <i32 0, i32 2 , i32 undef, i32 undef>
+ // %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef,
+ // <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
+ // %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1
+ // %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef,
+ // <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
+ // %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef,
+ // <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+ // %bin.rdx8 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1
+ // %r = extractelement <4 x float> %bin.rdx8, i32 0
+ if (!matchPairwiseReductionAtLevel(RdxStart, 0, Log2_32(NumVecElems)))
+ return false;
+
+ Opcode = RdxStart->getOpcode();
+ Ty = VecTy;
+
+ return true;
+}
+
+static std::pair<Value *, ShuffleVectorInst *>
+getShuffleAndOtherOprd(BinaryOperator *B) {
+
+ Value *L = B->getOperand(0);
+ Value *R = B->getOperand(1);
+ ShuffleVectorInst *S = 0;
+
+ if ((S = dyn_cast<ShuffleVectorInst>(L)))
+ return std::make_pair(R, S);
+
+ S = dyn_cast<ShuffleVectorInst>(R);
+ return std::make_pair(L, S);
+}
+
+static bool matchVectorSplittingReduction(const ExtractElementInst *ReduxRoot,
+ unsigned &Opcode, Type *&Ty) {
+ if (!EnableReduxCost)
+ return false;
+
+ // Need to extract the first element.
+ ConstantInt *CI = dyn_cast<ConstantInt>(ReduxRoot->getOperand(1));
+ unsigned Idx = ~0u;
+ if (CI)
+ Idx = CI->getZExtValue();
+ if (Idx != 0)
+ return false;
+
+ BinaryOperator *RdxStart = dyn_cast<BinaryOperator>(ReduxRoot->getOperand(0));
+ if (!RdxStart)
+ return false;
+ unsigned RdxOpcode = RdxStart->getOpcode();
+
+ Type *VecTy = ReduxRoot->getOperand(0)->getType();
+ unsigned NumVecElems = VecTy->getVectorNumElements();
+ if (!isPowerOf2_32(NumVecElems))
+ return false;
+
+ // We look for a sequence of shuffles and adds like the following, matching one
+ // fadd and shufflevector pair at a time.
+ //
+ // %rdx.shuf = shufflevector <4 x float> %rdx, <4 x float> undef,
+ // <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+ // %bin.rdx = fadd <4 x float> %rdx, %rdx.shuf
+ // %rdx.shuf7 = shufflevector <4 x float> %bin.rdx, <4 x float> undef,
+ // <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+ // %bin.rdx8 = fadd <4 x float> %bin.rdx, %rdx.shuf7
+ // %r = extractelement <4 x float> %bin.rdx8, i32 0
+
+ unsigned MaskStart = 1;
+ Value *RdxOp = RdxStart;
+ SmallVector<int, 32> ShuffleMask(NumVecElems, 0);
+ unsigned NumVecElemsRemain = NumVecElems;
+ while (NumVecElemsRemain - 1) {
+ // Check for the right reduction operation.
+ BinaryOperator *BinOp;
+ if (!(BinOp = dyn_cast<BinaryOperator>(RdxOp)))
+ return false;
+ if (BinOp->getOpcode() != RdxOpcode)
+ return false;
+
+ Value *NextRdxOp;
+ ShuffleVectorInst *Shuffle;
+ tie(NextRdxOp, Shuffle) = getShuffleAndOtherOprd(BinOp);
+
+ // Check that the reduction operation and the shuffle use the same value.
+ if (Shuffle == 0)
+ return false;
+ if (Shuffle->getOperand(0) != NextRdxOp)
+ return false;
+
+ // Check that the shuffle mask matches.
+ for (unsigned j = 0; j != MaskStart; ++j)
+ ShuffleMask[j] = MaskStart + j;
+ // Fill the rest of the mask with -1 for undef.
+ std::fill(&ShuffleMask[MaskStart], ShuffleMask.end(), -1);
+
+ SmallVector<int, 16> Mask = Shuffle->getShuffleMask();
+ if (!matchMask(ShuffleMask, Mask))
+ return false;
+
+ RdxOp = NextRdxOp;
+ NumVecElemsRemain /= 2;
+ MaskStart *= 2;
+ }
+
+ Opcode = RdxOpcode;
+ Ty = VecTy;
+ return true;
+}
+
unsigned CostModelAnalysis::getInstructionCost(const Instruction *I) const {
if (!TTI)
return -1;
@@ -189,6 +450,17 @@ unsigned CostModelAnalysis::getInstructionCost(const Instruction *I) const {
unsigned Idx = -1;
if (CI)
Idx = CI->getZExtValue();
+
+ // Try to match a reduction sequence (series of shufflevector and vector
+ // adds followed by an extractelement).
+ unsigned ReduxOpCode;
+ Type *ReduxType;
+
+ if (matchVectorSplittingReduction(EEI, ReduxOpCode, ReduxType))
+ return TTI->getReductionCost(ReduxOpCode, ReduxType, false);
+ else if (matchPairwiseReduction(EEI, ReduxOpCode, ReduxType))
+ return TTI->getReductionCost(ReduxOpCode, ReduxType, true);
+
return TTI->getVectorInstrCost(I->getOpcode(),
EEI->getOperand(0)->getType(), Idx);
}
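
Note that both matchers bail out early unless the hidden -costmodel-reduxcost flag
defined at the top of this file is enabled, so existing cost-model output is
unchanged by default. To exercise the new path, mirror the RUN line of the test
added below, e.g.:

  opt < test/Analysis/CostModel/X86/reduction.ll -cost-model -costmodel-reduxcost=true -analyze
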
diff --git a/lib/Analysis/TargetTransformInfo.cpp b/lib/Analysis/TargetTransformInfo.cpp
index b23223d..0353295 100644
--- a/lib/Analysis/TargetTransformInfo.cpp
+++ b/lib/Analysis/TargetTransformInfo.cpp
@@ -224,6 +224,11 @@ unsigned TargetTransformInfo::getAddressComputationCost(Type *Tp,
return PrevTTI->getAddressComputationCost(Tp, IsComplex);
}
+unsigned TargetTransformInfo::getReductionCost(unsigned Opcode, Type *Ty,
+ bool IsPairwise) const {
+ return PrevTTI->getReductionCost(Opcode, Ty, IsPairwise);
+}
+
namespace {
struct NoTTI : ImmutablePass, TargetTransformInfo {
@@ -592,6 +597,10 @@ struct NoTTI : ImmutablePass, TargetTransformInfo {
unsigned getAddressComputationCost(Type *Tp, bool) const {
return 0;
}
+
+ unsigned getReductionCost(unsigned, Type *, bool) const {
+ return 1;
+ }
};
} // end anonymous namespace
diff --git a/lib/CodeGen/BasicTargetTransformInfo.cpp b/lib/CodeGen/BasicTargetTransformInfo.cpp
index 9c4b49a..24aa1ab 100644
--- a/lib/CodeGen/BasicTargetTransformInfo.cpp
+++ b/lib/CodeGen/BasicTargetTransformInfo.cpp
@@ -113,6 +113,7 @@ public:
ArrayRef<Type*> Tys) const;
virtual unsigned getNumberOfParts(Type *Tp) const;
virtual unsigned getAddressComputationCost(Type *Ty, bool IsComplex) const;
+ virtual unsigned getReductionCost(unsigned Opcode, Type *Ty, bool IsPairwise) const;
/// @}
};
@@ -510,3 +511,17 @@ unsigned BasicTTI::getNumberOfParts(Type *Tp) const {
unsigned BasicTTI::getAddressComputationCost(Type *Ty, bool IsComplex) const {
return 0;
}
+
+unsigned BasicTTI::getReductionCost(unsigned Opcode, Type *Ty,
+ bool IsPairwise) const {
+ assert(Ty->isVectorTy() && "Expect a vector type");
+ unsigned NumVecElts = Ty->getVectorNumElements();
+ unsigned NumReduxLevels = Log2_32(NumVecElts);
+ unsigned ArithCost = NumReduxLevels *
+ TopTTI->getArithmeticInstrCost(Opcode, Ty);
+ // A pairwise reduction needs two shuffles per level, a splitting reduction one.
+ unsigned ShuffleCost =
+ NumReduxLevels * (IsPairwise + 1) *
+ TopTTI->getShuffleCost(SK_ExtractSubvector, Ty, NumVecElts / 2, Ty);
+ return ShuffleCost + ArithCost + getScalarizationOverhead(Ty, false, true);
+}
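
A back-of-the-envelope reading of the default formula above, with C_arith and C_shuf
standing for whatever the target reports via getArithmeticInstrCost and
getShuffleCost (the concrete values checked in the X86 test below depend on those
per-operation target costs, not on this sketch):

  // Reducing a <4 x float> fadd: NumReduxLevels = Log2_32(4) = 2.
  //   Splitting form: ArithCost = 2 * C_arith, ShuffleCost = 2 * (0 + 1) * C_shuf
  //   Pairwise form:  ArithCost = 2 * C_arith, ShuffleCost = 2 * (1 + 1) * C_shuf
  // Both totals also include getScalarizationOverhead(Ty, false, true), which
  // charges for extracting elements from the vector result.
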
diff --git a/test/Analysis/CostModel/X86/reduction.ll b/test/Analysis/CostModel/X86/reduction.ll
new file mode 100644
index 0000000..37d5f24
--- /dev/null
+++ b/test/Analysis/CostModel/X86/reduction.ll
@@ -0,0 +1,94 @@
+; RUN: opt < %s -cost-model -costmodel-reduxcost=true -analyze -mcpu=core2 -mtriple=x86_64-apple-darwin | FileCheck %s
+
+define fastcc float @reduction_cost_float(<4 x float> %rdx) {
+ %rdx.shuf = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+ %bin.rdx = fadd <4 x float> %rdx, %rdx.shuf
+ %rdx.shuf7 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+ %bin.rdx8 = fadd <4 x float> %bin.rdx, %rdx.shuf7
+
+; Check that we recognize the tree starting at the extractelement as a
+; reduction.
+; CHECK-LABEL: reduction_cost_float
+; CHECK: cost of 9 {{.*}} extractelement
+
+ %r = extractelement <4 x float> %bin.rdx8, i32 0
+ ret float %r
+}
+
+define fastcc i32 @reduction_cost_int(<8 x i32> %rdx) {
+ %rdx.shuf = shufflevector <8 x i32> %rdx, <8 x i32> undef,
+ <8 x i32> <i32 4 , i32 5, i32 6, i32 7,
+ i32 undef, i32 undef, i32 undef, i32 undef>
+ %bin.rdx = add <8 x i32> %rdx, %rdx.shuf
+ %rdx.shuf.2 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef,
+ <8 x i32> <i32 2 , i32 3, i32 undef, i32 undef,
+ i32 undef, i32 undef, i32 undef, i32 undef>
+ %bin.rdx.2 = add <8 x i32> %bin.rdx, %rdx.shuf.2
+ %rdx.shuf.3 = shufflevector <8 x i32> %bin.rdx.2, <8 x i32> undef,
+ <8 x i32> <i32 1 , i32 undef, i32 undef, i32 undef,
+ i32 undef, i32 undef, i32 undef, i32 undef>
+ %bin.rdx.3 = add <8 x i32> %bin.rdx.2, %rdx.shuf.3
+
+; CHECK-LABEL: reduction_cost_int
+; CHECK: cost of 23 {{.*}} extractelement
+
+ %r = extractelement <8 x i32> %bin.rdx.3, i32 0
+ ret i32 %r
+}
+
+define fastcc float @pairwise_hadd(<4 x float> %rdx, float %f1) {
+ %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef,
+ <4 x i32> <i32 0, i32 2 , i32 undef, i32 undef>
+ %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef,
+ <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
+ %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1
+ %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef,
+ <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
+ %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef,
+ <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+ %bin.rdx.1 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1
+
+; CHECK-LABEL: pairwise_hadd
+; CHECK: cost of 11 {{.*}} extractelement
+
+ %r = extractelement <4 x float> %bin.rdx.1, i32 0
+ %r2 = fadd float %r, %f1
+ ret float %r2
+}
+define fastcc float @pairwise_hadd_assoc(<4 x float> %rdx, float %f1) {
+ %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef,
+ <4 x i32> <i32 0, i32 2 , i32 undef, i32 undef>
+ %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef,
+ <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
+ %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.1, %rdx.shuf.0.0
+ %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef,
+ <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
+ %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef,
+ <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+ %bin.rdx.1 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1
+
+; CHECK-LABEL: pairwise_hadd_assoc
+; CHECK: cost of 11 {{.*}} extractelement
+
+ %r = extractelement <4 x float> %bin.rdx.1, i32 0
+ %r2 = fadd float %r, %f1
+ ret float %r2
+}
+
+define fastcc float @pairwise_hadd_skip_first(<4 x float> %rdx, float %f1) {
+ %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef,
+ <4 x i32> <i32 0, i32 2 , i32 undef, i32 undef>
+ %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef,
+ <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
+ %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1
+ %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef,
+ <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+ %bin.rdx.1 = fadd <4 x float> %bin.rdx.0, %rdx.shuf.1.1
+
+; CHECK-LABEL: pairwise_hadd_skip_first
+; CHECK: cost of 11 {{.*}} extractelement
+
+ %r = extractelement <4 x float> %bin.rdx.1, i32 0
+ %r2 = fadd float %r, %f1
+ ret float %r2
+}