Diffstat (limited to 'lib/Target/X86/X86TargetTransformInfo.cpp')
-rw-r--r-- | lib/Target/X86/X86TargetTransformInfo.cpp | 159
1 file changed, 48 insertions, 111 deletions
diff --git a/lib/Target/X86/X86TargetTransformInfo.cpp b/lib/Target/X86/X86TargetTransformInfo.cpp
index c04964d..91b9d40 100644
--- a/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -14,37 +14,24 @@
 ///
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "x86tti"
 #include "X86.h"
 #include "X86TargetMachine.h"
-#include "llvm/ADT/DepthFirstIterator.h"
-#include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/IR/IntrinsicInst.h"
-#include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Target/CostTable.h"
 #include "llvm/Target/TargetLowering.h"
 using namespace llvm;
 
+#define DEBUG_TYPE "x86tti"
+
 // Declare the pass initialization routine locally as target-specific passes
-// don't havve a target-wide initialization entry point, and so we rely on the
+// don't have a target-wide initialization entry point, and so we rely on the
 // pass constructor initialization.
 namespace llvm {
 void initializeX86TTIPass(PassRegistry &);
 }
 
-static cl::opt<bool>
-UsePartialUnrolling("x86-use-partial-unrolling", cl::init(true),
-  cl::desc("Use partial unrolling for some X86 targets"), cl::Hidden);
-static cl::opt<unsigned>
-PartialUnrollingThreshold("x86-partial-unrolling-threshold", cl::init(0),
-  cl::desc("Threshold for X86 partial unrolling"), cl::Hidden);
-static cl::opt<unsigned>
-PartialUnrollingMaxBranches("x86-partial-max-branches", cl::init(2),
-  cl::desc("Threshold for taken branches in X86 partial unrolling"),
-  cl::Hidden);
-
 namespace {
 
 class X86TTI final : public ImmutablePass, public TargetTransformInfo {
@@ -56,7 +43,7 @@ class X86TTI final : public ImmutablePass, public TargetTransformInfo {
   unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) const;
 
 public:
-  X86TTI() : ImmutablePass(ID), ST(0), TLI(0) {
+  X86TTI() : ImmutablePass(ID), ST(nullptr), TLI(nullptr) {
     llvm_unreachable("This pass cannot be directly constructed");
   }
 
@@ -87,8 +74,6 @@ public:
   /// \name Scalar TTI Implementations
   /// @{
   PopcntSupportKind getPopcntSupport(unsigned TyWidth) const override;
-  void getUnrollingPreferences(Loop *L,
-                               UnrollingPreferences &UP) const override;
 
   /// @}
 
@@ -153,93 +138,6 @@ X86TTI::PopcntSupportKind X86TTI::getPopcntSupport(unsigned TyWidth) const {
   return ST->hasPOPCNT() ? PSK_FastHardware : PSK_Software;
 }
 
-void X86TTI::getUnrollingPreferences(Loop *L, UnrollingPreferences &UP) const {
-  if (!UsePartialUnrolling)
-    return;
-  // According to the Intel 64 and IA-32 Architectures Optimization Reference
-  // Manual, Intel Core models and later have a loop stream detector
-  // (and associated uop queue) that can benefit from partial unrolling.
-  // The relevant requirements are:
-  //  - The loop must have no more than 4 (8 for Nehalem and later) branches
-  //    taken, and none of them may be calls.
-  //  - The loop can have no more than 18 (28 for Nehalem and later) uops.
-
-  // According to the Software Optimization Guide for AMD Family 15h Processors,
-  // models 30h-4fh (Steamroller and later) have a loop predictor and loop
-  // buffer which can benefit from partial unrolling.
-  // The relevant requirements are:
-  //  - The loop must have fewer than 16 branches
-  //  - The loop must have less than 40 uops in all executed loop branches
-
-  unsigned MaxBranches, MaxOps;
-  if (PartialUnrollingThreshold.getNumOccurrences() > 0) {
-    MaxBranches = PartialUnrollingMaxBranches;
-    MaxOps = PartialUnrollingThreshold;
-  } else if (ST->isAtom()) {
-    // On the Atom, the throughput for taken branches is 2 cycles. For small
-    // simple loops, expand by a small factor to hide the backedge cost.
-    MaxBranches = 2;
-    MaxOps = 10;
-  } else if (ST->hasFSGSBase() && ST->hasXOP() /* Steamroller and later */) {
-    MaxBranches = 16;
-    MaxOps = 40;
-  } else if (ST->hasFMA4() /* Any other recent AMD */) {
-    return;
-  } else if (ST->hasAVX() || ST->hasSSE42() /* Nehalem and later */) {
-    MaxBranches = 8;
-    MaxOps = 28;
-  } else if (ST->hasSSSE3() /* Intel Core */) {
-    MaxBranches = 4;
-    MaxOps = 18;
-  } else {
-    return;
-  }
-
-  // Scan the loop: don't unroll loops with calls, and count the potential
-  // number of taken branches (this is somewhat conservative because we're
-  // counting all block transitions as potential branches while in reality some
-  // of these will become implicit via block placement).
-  unsigned MaxDepth = 0;
-  for (df_iterator<BasicBlock*> DI = df_begin(L->getHeader()),
-       DE = df_end(L->getHeader()); DI != DE;) {
-    if (!L->contains(*DI)) {
-      DI.skipChildren();
-      continue;
-    }
-
-    MaxDepth = std::max(MaxDepth, DI.getPathLength());
-    if (MaxDepth > MaxBranches)
-      return;
-
-    for (BasicBlock::iterator I = DI->begin(), IE = DI->end(); I != IE; ++I)
-      if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
-        ImmutableCallSite CS(I);
-        if (const Function *F = CS.getCalledFunction()) {
-          if (!isLoweredToCall(F))
-            continue;
-        }
-
-        return;
-      }
-
-    ++DI;
-  }
-
-  // Enable runtime and partial unrolling up to the specified size.
-  UP.Partial = UP.Runtime = true;
-  UP.PartialThreshold = UP.PartialOptSizeThreshold = MaxOps;
-
-  // Set the maximum count based on the loop depth. The maximum number of
-  // branches taken in a loop (including the backedge) is equal to the maximum
-  // loop depth (the DFS path length from the loop header to any block in the
-  // loop). When the loop is unrolled, this depth (except for the backedge
-  // itself) is multiplied by the unrolling factor. This new unrolled depth
-  // must be less than the target-specific maximum branch count (which limits
-  // the number of taken branches in the uop buffer).
-  if (MaxDepth > 1)
-    UP.MaxCount = (MaxBranches-1)/(MaxDepth-1);
-}
-
 unsigned X86TTI::getNumberOfRegisters(bool Vector) const {
   if (Vector && !ST->hasSSE1())
     return 0;
@@ -283,6 +181,21 @@ unsigned X86TTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
   int ISD = TLI->InstructionOpcodeToISD(Opcode);
   assert(ISD && "Invalid opcode");
 
+  static const CostTblEntry<MVT::SimpleValueType>
+  AVX2UniformConstCostTable[] = {
+    { ISD::SDIV, MVT::v16i16,  6 }, // vpmulhw sequence
+    { ISD::UDIV, MVT::v16i16,  6 }, // vpmulhuw sequence
+    { ISD::SDIV, MVT::v8i32,  15 }, // vpmuldq sequence
+    { ISD::UDIV, MVT::v8i32,  15 }, // vpmuludq sequence
+  };
+
+  if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
+      ST->hasAVX2()) {
+    int Idx = CostTableLookup(AVX2UniformConstCostTable, ISD, LT.second);
+    if (Idx != -1)
+      return LT.first * AVX2UniformConstCostTable[Idx].Cost;
+  }
+
   static const CostTblEntry<MVT::SimpleValueType> AVX2CostTable[] = {
     // Shifts on v4i64/v8i32 on AVX2 is legal even though we declare to
     // customize them to detect the cases where shift amount is a scalar one.
@@ -350,10 +263,19 @@ unsigned X86TTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
     { ISD::SRA,  MVT::v16i8,   4 }, // psrlw, pand, pxor, psubb.
     { ISD::SRA,  MVT::v8i16,   1 }, // psraw.
     { ISD::SRA,  MVT::v4i32,   1 }, // psrad.
+
+    { ISD::SDIV, MVT::v8i16,   6 }, // pmulhw sequence
+    { ISD::UDIV, MVT::v8i16,   6 }, // pmulhuw sequence
+    { ISD::SDIV, MVT::v4i32,  19 }, // pmuludq sequence
+    { ISD::UDIV, MVT::v4i32,  15 }, // pmuludq sequence
   };
 
   if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
      ST->hasSSE2()) {
+    // pmuldq sequence.
+    if (ISD == ISD::SDIV && LT.second == MVT::v4i32 && ST->hasSSE41())
+      return LT.first * 15;
+
     int Idx = CostTableLookup(SSE2UniformConstCostTable, ISD, LT.second);
     if (Idx != -1)
       return LT.first * SSE2UniformConstCostTable[Idx].Cost;
@@ -893,6 +815,13 @@ unsigned X86TTI::getIntImmCost(const APInt &Imm, Type *Ty) const {
   if (BitSize == 0)
     return ~0U;
 
+  // Never hoist constants larger than 128bit, because this might lead to
+  // incorrect code generation or assertions in codegen.
+  // Fixme: Create a cost model for types larger than i128 once the codegen
+  // issues have been fixed.
+  if (BitSize > 128)
+    return TCC_Free;
+
   if (Imm == 0)
     return TCC_Free;
 
@@ -908,8 +837,10 @@ unsigned X86TTI::getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
   assert(Ty->isIntegerTy());
 
   unsigned BitSize = Ty->getPrimitiveSizeInBits();
+  // There is no cost model for constants with a bit size of 0. Return TCC_Free
+  // here, so that constant hoisting will ignore this constant.
   if (BitSize == 0)
-    return ~0U;
+    return TCC_Free;
 
   unsigned ImmIdx = ~0U;
   switch (Opcode) {
@@ -931,15 +862,19 @@ unsigned X86TTI::getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
   case Instruction::SDiv:
   case Instruction::URem:
   case Instruction::SRem:
-  case Instruction::Shl:
-  case Instruction::LShr:
-  case Instruction::AShr:
   case Instruction::And:
   case Instruction::Or:
   case Instruction::Xor:
   case Instruction::ICmp:
     ImmIdx = 1;
     break;
+  // Always return TCC_Free for the shift value of a shift instruction.
+  case Instruction::Shl:
+  case Instruction::LShr:
+  case Instruction::AShr:
+    if (Idx == 1)
+      return TCC_Free;
+    break;
   case Instruction::Trunc:
   case Instruction::ZExt:
   case Instruction::SExt:
@@ -966,8 +901,10 @@ unsigned X86TTI::getIntImmCost(Intrinsic::ID IID, unsigned Idx,
   assert(Ty->isIntegerTy());
 
   unsigned BitSize = Ty->getPrimitiveSizeInBits();
+  // There is no cost model for constants with a bit size of 0. Return TCC_Free
+  // here, so that constant hoisting will ignore this constant.
   if (BitSize == 0)
-    return ~0U;
+    return TCC_Free;
 
   switch (IID) {
   default: return TCC_Free;
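
A note on the removed unrolling heuristic: the deleted code capped the unroll count at (MaxBranches-1)/(MaxDepth-1) so that the unrolled body, plus the one backedge, still fits the loop buffer's taken-branch limit. Below is a minimal standalone sketch of that arithmetic using the Nehalem limits from the deleted comment; the main() harness is illustrative only, not LLVM API.

#include <cassert>

// Worked instance of the removed MaxCount formula. A loop whose DFS
// depth from the header is 3 takes 3 branches per iteration, counting
// the backedge. After unrolling by a factor C, the taken-branch count
// is roughly C * (MaxDepth - 1) + 1, which must stay within the
// target's MaxBranches budget (8 on Nehalem and later).
int main() {
  unsigned MaxBranches = 8, MaxDepth = 3;
  unsigned MaxCount = (MaxBranches - 1) / (MaxDepth - 1); // == 3
  assert(MaxCount * (MaxDepth - 1) + 1 <= MaxBranches);
  return 0;
}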
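The new SDIV/UDIV cost-table entries price the multiply-high expansions ("pmulhw sequence", "pmuludq sequence") that the backend emits for vector division by a uniform constant. The following is a scalar sketch of the same trick for unsigned 16-bit division by 3, using the well-known magic constant 0xAAAB (3 * 0xAAAB == 2^17 + 1); the udiv3 helper is illustrative, and the vector form simply performs the multiply-high lane-wise with pmulhuw.

#include <cassert>
#include <cstdint>

// Division by the constant 3 rewritten as multiply-high-and-shift.
// The product x * 0xAAAB equals x * (2^17 + 1) / 3, which exceeds
// x * 2^17 / 3 by less than 2^17 for all 16-bit x, so shifting right
// by 17 recovers the exact quotient.
static uint16_t udiv3(uint16_t x) {
  return static_cast<uint16_t>((static_cast<uint32_t>(x) * 0xAAABu) >> 17);
}

int main() {
  for (uint32_t x = 0; x <= 0xFFFF; ++x)
    assert(udiv3(static_cast<uint16_t>(x)) == x / 3);
  return 0;
}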
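The getIntImmCost changes all serve the constant hoisting pass: returning TCC_Free tells the pass to leave an immediate where it is. Shift amounts fold directly into the x86 instruction encoding, so hoisting them into a register can only hurt, and constants wider than 128 bits are reported free to keep them out of the pass until codegen handles them. A hypothetical distillation of that policy follows; the enums and the immCost helper are illustrative stand-ins, not LLVM's actual interfaces.

#include <cassert>

enum Cost { TCC_Free = 0, TCC_Basic = 1 };
enum Opcode { Shl, LShr, AShr, Add };

// Sketch of the patched decision logic: zero-width and over-wide
// constants are ignored, shift amounts (operand index 1) are always
// free, and everything else falls through to a per-opcode cost.
static int immCost(Opcode Op, unsigned Idx, unsigned BitSize) {
  if (BitSize == 0 || BitSize > 128)
    return TCC_Free;
  switch (Op) {
  case Shl: case LShr: case AShr:
    if (Idx == 1)          // the shift amount is encoded as an immediate
      return TCC_Free;
    break;
  default:
    break;
  }
  return TCC_Basic;        // placeholder for the real per-opcode table
}

int main() {
  assert(immCost(Shl, 1, 64) == TCC_Free);   // shift amount: never hoist
  assert(immCost(Add, 1, 64) == TCC_Basic);  // ordinary immediate
  assert(immCost(Add, 1, 256) == TCC_Free);  // wider than i128: ignored
  return 0;
}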