Diffstat (limited to 'lib/Target/X86/X86TargetTransformInfo.cpp')
-rw-r--r--  lib/Target/X86/X86TargetTransformInfo.cpp  159
1 file changed, 48 insertions(+), 111 deletions(-)
diff --git a/lib/Target/X86/X86TargetTransformInfo.cpp b/lib/Target/X86/X86TargetTransformInfo.cpp
index c04964d..91b9d40 100644
--- a/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -14,37 +14,24 @@
///
//===----------------------------------------------------------------------===//
-#define DEBUG_TYPE "x86tti"
#include "X86.h"
#include "X86TargetMachine.h"
-#include "llvm/ADT/DepthFirstIterator.h"
-#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Target/CostTable.h"
#include "llvm/Target/TargetLowering.h"
using namespace llvm;
+#define DEBUG_TYPE "x86tti"
+
// Declare the pass initialization routine locally as target-specific passes
-// don't havve a target-wide initialization entry point, and so we rely on the
+// don't have a target-wide initialization entry point, and so we rely on the
// pass constructor initialization.
namespace llvm {
void initializeX86TTIPass(PassRegistry &);
}
-static cl::opt<bool>
-UsePartialUnrolling("x86-use-partial-unrolling", cl::init(true),
- cl::desc("Use partial unrolling for some X86 targets"), cl::Hidden);
-static cl::opt<unsigned>
-PartialUnrollingThreshold("x86-partial-unrolling-threshold", cl::init(0),
- cl::desc("Threshold for X86 partial unrolling"), cl::Hidden);
-static cl::opt<unsigned>
-PartialUnrollingMaxBranches("x86-partial-max-branches", cl::init(2),
- cl::desc("Threshold for taken branches in X86 partial unrolling"),
- cl::Hidden);
-
namespace {
class X86TTI final : public ImmutablePass, public TargetTransformInfo {
@@ -56,7 +43,7 @@ class X86TTI final : public ImmutablePass, public TargetTransformInfo {
unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) const;
public:
- X86TTI() : ImmutablePass(ID), ST(0), TLI(0) {
+ X86TTI() : ImmutablePass(ID), ST(nullptr), TLI(nullptr) {
llvm_unreachable("This pass cannot be directly constructed");
}
@@ -87,8 +74,6 @@ public:
/// \name Scalar TTI Implementations
/// @{
PopcntSupportKind getPopcntSupport(unsigned TyWidth) const override;
- void getUnrollingPreferences(Loop *L,
- UnrollingPreferences &UP) const override;
/// @}
@@ -153,93 +138,6 @@ X86TTI::PopcntSupportKind X86TTI::getPopcntSupport(unsigned TyWidth) const {
return ST->hasPOPCNT() ? PSK_FastHardware : PSK_Software;
}
-void X86TTI::getUnrollingPreferences(Loop *L, UnrollingPreferences &UP) const {
- if (!UsePartialUnrolling)
- return;
- // According to the Intel 64 and IA-32 Architectures Optimization Reference
- // Manual, Intel Core models and later have a loop stream detector
- // (and associated uop queue) that can benefit from partial unrolling.
- // The relevant requirements are:
- // - The loop must have no more than 4 (8 for Nehalem and later) branches
- // taken, and none of them may be calls.
- // - The loop can have no more than 18 (28 for Nehalem and later) uops.
-
- // According to the Software Optimization Guide for AMD Family 15h Processors,
- // models 30h-4fh (Steamroller and later) have a loop predictor and loop
- // buffer which can benefit from partial unrolling.
- // The relevant requirements are:
- // - The loop must have fewer than 16 branches
- // - The loop must have less than 40 uops in all executed loop branches
-
- unsigned MaxBranches, MaxOps;
- if (PartialUnrollingThreshold.getNumOccurrences() > 0) {
- MaxBranches = PartialUnrollingMaxBranches;
- MaxOps = PartialUnrollingThreshold;
- } else if (ST->isAtom()) {
- // On the Atom, the throughput for taken branches is 2 cycles. For small
- // simple loops, expand by a small factor to hide the backedge cost.
- MaxBranches = 2;
- MaxOps = 10;
- } else if (ST->hasFSGSBase() && ST->hasXOP() /* Steamroller and later */) {
- MaxBranches = 16;
- MaxOps = 40;
- } else if (ST->hasFMA4() /* Any other recent AMD */) {
- return;
- } else if (ST->hasAVX() || ST->hasSSE42() /* Nehalem and later */) {
- MaxBranches = 8;
- MaxOps = 28;
- } else if (ST->hasSSSE3() /* Intel Core */) {
- MaxBranches = 4;
- MaxOps = 18;
- } else {
- return;
- }
-
- // Scan the loop: don't unroll loops with calls, and count the potential
- // number of taken branches (this is somewhat conservative because we're
- // counting all block transitions as potential branches while in reality some
- // of these will become implicit via block placement).
- unsigned MaxDepth = 0;
- for (df_iterator<BasicBlock*> DI = df_begin(L->getHeader()),
- DE = df_end(L->getHeader()); DI != DE;) {
- if (!L->contains(*DI)) {
- DI.skipChildren();
- continue;
- }
-
- MaxDepth = std::max(MaxDepth, DI.getPathLength());
- if (MaxDepth > MaxBranches)
- return;
-
- for (BasicBlock::iterator I = DI->begin(), IE = DI->end(); I != IE; ++I)
- if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
- ImmutableCallSite CS(I);
- if (const Function *F = CS.getCalledFunction()) {
- if (!isLoweredToCall(F))
- continue;
- }
-
- return;
- }
-
- ++DI;
- }
-
- // Enable runtime and partial unrolling up to the specified size.
- UP.Partial = UP.Runtime = true;
- UP.PartialThreshold = UP.PartialOptSizeThreshold = MaxOps;
-
- // Set the maximum count based on the loop depth. The maximum number of
- // branches taken in a loop (including the backedge) is equal to the maximum
- // loop depth (the DFS path length from the loop header to any block in the
- // loop). When the loop is unrolled, this depth (except for the backedge
- // itself) is multiplied by the unrolling factor. This new unrolled depth
- // must be less than the target-specific maximum branch count (which limits
- // the number of taken branches in the uop buffer).
- if (MaxDepth > 1)
- UP.MaxCount = (MaxBranches-1)/(MaxDepth-1);
-}
-
unsigned X86TTI::getNumberOfRegisters(bool Vector) const {
if (Vector && !ST->hasSSE1())
return 0;
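
For reference, the unrolling cap that the deleted getUnrollingPreferences computed can be restated in isolation. This is only an illustrative sketch of the removed arithmetic; maxUnrollCount, MaxBranches, and MaxDepth are stand-ins for the values the deleted code derived from the subtarget checks and the depth-first walk over the loop body.

// Illustrative restatement of the deleted UP.MaxCount computation. Each
// iteration of the original loop contributes (MaxDepth - 1) taken branches
// plus one shared backedge, so the unroll factor is bounded by how many
// copies fit under the target's taken-branch budget.
unsigned maxUnrollCount(unsigned MaxBranches, unsigned MaxDepth) {
  if (MaxDepth <= 1)
    return 0; // the deleted code left UP.MaxCount unset in this case
  return (MaxBranches - 1) / (MaxDepth - 1);
}

// Example: Nehalem-class limits (MaxBranches = 8) and a deepest DFS path of
// 3 blocks give (8 - 1) / (3 - 1) = 3 copies of the loop body.
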
@@ -283,6 +181,21 @@ unsigned X86TTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
int ISD = TLI->InstructionOpcodeToISD(Opcode);
assert(ISD && "Invalid opcode");
+ static const CostTblEntry<MVT::SimpleValueType>
+ AVX2UniformConstCostTable[] = {
+ { ISD::SDIV, MVT::v16i16, 6 }, // vpmulhw sequence
+ { ISD::UDIV, MVT::v16i16, 6 }, // vpmulhuw sequence
+ { ISD::SDIV, MVT::v8i32, 15 }, // vpmuldq sequence
+ { ISD::UDIV, MVT::v8i32, 15 }, // vpmuludq sequence
+ };
+
+ if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
+ ST->hasAVX2()) {
+ int Idx = CostTableLookup(AVX2UniformConstCostTable, ISD, LT.second);
+ if (Idx != -1)
+ return LT.first * AVX2UniformConstCostTable[Idx].Cost;
+ }
+
static const CostTblEntry<MVT::SimpleValueType> AVX2CostTable[] = {
// Shifts on v4i64/v8i32 on AVX2 is legal even though we declare to
// customize them to detect the cases where shift amount is a scalar one.
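
The new AVX2 uniform-constant table is consulted the same way as the existing cost tables: a linear scan keyed on the ISD opcode and the legalized vector type, with the matching cost scaled by LT.first (the number of legalized pieces). A minimal sketch of that lookup under simplified types; Op, VT, and CostEntry here are placeholders for the real ISD, MVT::SimpleValueType, and CostTblEntry machinery.

// Hypothetical stand-ins for the ISD opcodes and MVTs used by the table.
enum Op { SDIV, UDIV };
enum VT { v16i16, v8i32 };
struct CostEntry { Op Opc; VT Ty; unsigned Cost; };

static const CostEntry AVX2UniformConstCosts[] = {
    {SDIV, v16i16, 6}, {UDIV, v16i16, 6},
    {SDIV, v8i32, 15}, {UDIV, v8i32, 15},
};

// Mirrors the CostTableLookup pattern: index of the first matching row,
// or -1 when the (opcode, type) pair is not covered.
int lookupAVX2UniformConst(Op Opc, VT Ty) {
  for (unsigned I = 0; I != sizeof(AVX2UniformConstCosts) /
                            sizeof(AVX2UniformConstCosts[0]); ++I)
    if (AVX2UniformConstCosts[I].Opc == Opc &&
        AVX2UniformConstCosts[I].Ty == Ty)
      return int(I);
  return -1;
}

For example, an sdiv of an <8 x i32> vector by a uniform constant that legalizes in one piece (LT.first == 1) is now costed at 1 * 15 instead of falling through to the generic scalarized estimate.
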
@@ -350,10 +263,19 @@ unsigned X86TTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
{ ISD::SRA, MVT::v16i8, 4 }, // psrlw, pand, pxor, psubb.
{ ISD::SRA, MVT::v8i16, 1 }, // psraw.
{ ISD::SRA, MVT::v4i32, 1 }, // psrad.
+
+ { ISD::SDIV, MVT::v8i16, 6 }, // pmulhw sequence
+ { ISD::UDIV, MVT::v8i16, 6 }, // pmulhuw sequence
+ { ISD::SDIV, MVT::v4i32, 19 }, // pmuludq sequence
+ { ISD::UDIV, MVT::v4i32, 15 }, // pmuludq sequence
};
if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
ST->hasSSE2()) {
+ // pmuldq sequence.
+ if (ISD == ISD::SDIV && LT.second == MVT::v4i32 && ST->hasSSE41())
+ return LT.first * 15;
+
int Idx = CostTableLookup(SSE2UniformConstCostTable, ISD, LT.second);
if (Idx != -1)
return LT.first * SSE2UniformConstCostTable[Idx].Cost;
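
The SSE2 rows cover divides by a uniform constant that the backend lowers through pmulhw/pmulhuw/pmuludq sequences, and the check in front of the table lookup prefers the cheaper pmuldq lowering for signed v4i32 division when SSE4.1 is available. A hedged sketch of just that selection; the flag and the constants stand in for ST->hasSSE41() and the table entries above.

// Cost of a v4i32 sdiv by a uniform constant, per the entries in this hunk:
// 15 for the SSE4.1 pmuldq sequence, 19 for the SSE2 pmuludq fallback.
// LTFirst models LT.first, the number of legalized pieces.
unsigned sdivV4I32ByConstCost(unsigned LTFirst, bool HasSSE41) {
  const unsigned PMULDQSeqCost = 15;  // SSE4.1
  const unsigned PMULUDQSeqCost = 19; // SSE2
  return LTFirst * (HasSSE41 ? PMULDQSeqCost : PMULUDQSeqCost);
}
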
@@ -893,6 +815,13 @@ unsigned X86TTI::getIntImmCost(const APInt &Imm, Type *Ty) const {
if (BitSize == 0)
return ~0U;
+ // Never hoist constants larger than 128bit, because this might lead to
+ // incorrect code generation or assertions in codegen.
+ // Fixme: Create a cost model for types larger than i128 once the codegen
+ // issues have been fixed.
+ if (BitSize > 128)
+ return TCC_Free;
+
if (Imm == 0)
return TCC_Free;
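
Returning TCC_Free for wide constants keeps the constant hoisting pass from touching integer types the backend does not yet handle well; a constant reported as free is never considered worth hoisting. A minimal sketch of the guard, using a plain bit-width parameter and local cost constants that mirror LLVM's TargetCostConstants rather than the real APInt/Type interface.

enum { TCC_Free = 0, TCC_Basic = 1 };

// Sketch of the early exits in this hunk: unknown-width constants still
// return ~0U, anything wider than i128 and the constant zero are free.
unsigned intImmCostSketch(unsigned BitWidth, bool IsZero) {
  if (BitWidth == 0)
    return ~0U;
  if (BitWidth > 128)
    return TCC_Free; // see the FIXME: no cost model for > i128 yet
  if (IsZero)
    return TCC_Free;
  return TCC_Basic;  // placeholder for the remaining, unchanged cost logic
}
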
@@ -908,8 +837,10 @@ unsigned X86TTI::getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
assert(Ty->isIntegerTy());
unsigned BitSize = Ty->getPrimitiveSizeInBits();
+ // There is no cost model for constants with a bit size of 0. Return TCC_Free
+ // here, so that constant hoisting will ignore this constant.
if (BitSize == 0)
- return ~0U;
+ return TCC_Free;
unsigned ImmIdx = ~0U;
switch (Opcode) {
@@ -931,15 +862,19 @@ unsigned X86TTI::getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
case Instruction::SDiv:
case Instruction::URem:
case Instruction::SRem:
- case Instruction::Shl:
- case Instruction::LShr:
- case Instruction::AShr:
case Instruction::And:
case Instruction::Or:
case Instruction::Xor:
case Instruction::ICmp:
ImmIdx = 1;
break;
+ // Always return TCC_Free for the shift value of a shift instruction.
+ case Instruction::Shl:
+ case Instruction::LShr:
+ case Instruction::AShr:
+ if (Idx == 1)
+ return TCC_Free;
+ break;
case Instruction::Trunc:
case Instruction::ZExt:
case Instruction::SExt:
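
x86 shift instructions encode their immediate shift amount directly, so an immediate in operand 1 of shl/lshr/ashr is now reported as free rather than routed through the generic ImmIdx == 1 costing used by the other binary operators. A small sketch of that dispatch; OpcodeKind and immIsAlwaysFree are illustrative names, not part of the real interface.

enum OpcodeKind { Shl, LShr, AShr, Add, Sub, And, Or, Xor, ICmp };

// Mirrors the new special case: the shift amount (operand index 1) can always
// be folded into the instruction encoding, so hoisting it gains nothing.
bool immIsAlwaysFree(OpcodeKind Opc, unsigned Idx) {
  switch (Opc) {
  case Shl:
  case LShr:
  case AShr:
    return Idx == 1;
  default:
    return false; // other opcodes fall through to the ImmIdx-based costing
  }
}
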
@@ -966,8 +901,10 @@ unsigned X86TTI::getIntImmCost(Intrinsic::ID IID, unsigned Idx,
assert(Ty->isIntegerTy());
unsigned BitSize = Ty->getPrimitiveSizeInBits();
+ // There is no cost model for constants with a bit size of 0. Return TCC_Free
+ // here, so that constant hoisting will ignore this constant.
if (BitSize == 0)
- return ~0U;
+ return TCC_Free;
switch (IID) {
default: return TCC_Free;