author     Stephen Hines <srhines@google.com>    2014-04-23 16:57:46 -0700
committer  Stephen Hines <srhines@google.com>    2014-04-24 15:53:16 -0700
commit     36b56886974eae4f9c5ebc96befd3e7bfe5de338 (patch)
tree       e6cfb69fbbd937f450eeb83bfb83b9da3b01275a /lib/Target/X86/X86TargetTransformInfo.cpp
parent     69a8640022b04415ae9fac62f8ab090601d8f889 (diff)
Update to LLVM 3.5a.
Change-Id: Ifadecab779f128e62e430c2b4f6ddd84953ed617
Diffstat (limited to 'lib/Target/X86/X86TargetTransformInfo.cpp')
-rw-r--r--  lib/Target/X86/X86TargetTransformInfo.cpp  410
1 file changed, 355 insertions, 55 deletions
diff --git a/lib/Target/X86/X86TargetTransformInfo.cpp b/lib/Target/X86/X86TargetTransformInfo.cpp
index f88a666..c04964d 100644
--- a/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -17,10 +17,14 @@
 #define DEBUG_TYPE "x86tti"
 #include "X86.h"
 #include "X86TargetMachine.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
-#include "llvm/Target/TargetLowering.h"
 #include "llvm/Target/CostTable.h"
+#include "llvm/Target/TargetLowering.h"
 using namespace llvm;
 
 // Declare the pass initialization routine locally as target-specific passes
@@ -30,9 +34,20 @@
 namespace llvm {
 void initializeX86TTIPass(PassRegistry &);
 }
 
+static cl::opt<bool>
+UsePartialUnrolling("x86-use-partial-unrolling", cl::init(true),
+  cl::desc("Use partial unrolling for some X86 targets"), cl::Hidden);
+static cl::opt<unsigned>
+PartialUnrollingThreshold("x86-partial-unrolling-threshold", cl::init(0),
+  cl::desc("Threshold for X86 partial unrolling"), cl::Hidden);
+static cl::opt<unsigned>
+PartialUnrollingMaxBranches("x86-partial-max-branches", cl::init(2),
+  cl::desc("Threshold for taken branches in X86 partial unrolling"),
+  cl::Hidden);
+
 namespace {
 
-class X86TTI : public ImmutablePass, public TargetTransformInfo {
+class X86TTI final : public ImmutablePass, public TargetTransformInfo {
   const X86Subtarget *ST;
   const X86TargetLowering *TLI;
 
@@ -46,20 +61,16 @@ public:
   }
 
   X86TTI(const X86TargetMachine *TM)
-    : ImmutablePass(ID), ST(TM->getSubtargetImpl()),
-      TLI(TM->getTargetLowering()) {
+      : ImmutablePass(ID), ST(TM->getSubtargetImpl()),
+        TLI(TM->getTargetLowering()) {
     initializeX86TTIPass(*PassRegistry::getPassRegistry());
   }
 
-  virtual void initializePass() {
+  void initializePass() override {
    pushTTIStack(this);
  }
 
-  virtual void finalizePass() {
-    popTTIStack();
-  }
-
-  virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
     TargetTransformInfo::getAnalysisUsage(AU);
   }
 
@@ -67,7 +78,7 @@ public:
   static char ID;
 
   /// Provide necessary pointer adjustments for the two base classes.
-  virtual void *getAdjustedAnalysisPointer(const void *ID) {
+  void *getAdjustedAnalysisPointer(const void *ID) override {
     if (ID == &TargetTransformInfo::ID)
       return (TargetTransformInfo*)this;
     return this;
@@ -75,35 +86,43 @@
   /// \name Scalar TTI Implementations
   /// @{
-  virtual PopcntSupportKind getPopcntSupport(unsigned TyWidth) const;
+  PopcntSupportKind getPopcntSupport(unsigned TyWidth) const override;
+  void getUnrollingPreferences(Loop *L,
+                               UnrollingPreferences &UP) const override;
 
   /// @}
 
   /// \name Vector TTI Implementations
   /// @{
 
-  virtual unsigned getNumberOfRegisters(bool Vector) const;
-  virtual unsigned getRegisterBitWidth(bool Vector) const;
-  virtual unsigned getMaximumUnrollFactor() const;
-  virtual unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty,
-                                          OperandValueKind,
-                                          OperandValueKind) const;
-  virtual unsigned getShuffleCost(ShuffleKind Kind, Type *Tp,
-                                  int Index, Type *SubTp) const;
-  virtual unsigned getCastInstrCost(unsigned Opcode, Type *Dst,
-                                    Type *Src) const;
-  virtual unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
-                                      Type *CondTy) const;
-  virtual unsigned getVectorInstrCost(unsigned Opcode, Type *Val,
-                                      unsigned Index) const;
-  virtual unsigned getMemoryOpCost(unsigned Opcode, Type *Src,
-                                   unsigned Alignment,
-                                   unsigned AddressSpace) const;
-
-  virtual unsigned getAddressComputationCost(Type *PtrTy, bool IsComplex) const;
-
-  virtual unsigned getReductionCost(unsigned Opcode, Type *Ty,
-                                    bool IsPairwiseForm) const;
+  unsigned getNumberOfRegisters(bool Vector) const override;
+  unsigned getRegisterBitWidth(bool Vector) const override;
+  unsigned getMaximumUnrollFactor() const override;
+  unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty, OperandValueKind,
+                                  OperandValueKind) const override;
+  unsigned getShuffleCost(ShuffleKind Kind, Type *Tp,
+                          int Index, Type *SubTp) const override;
+  unsigned getCastInstrCost(unsigned Opcode, Type *Dst,
+                            Type *Src) const override;
+  unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
+                              Type *CondTy) const override;
+  unsigned getVectorInstrCost(unsigned Opcode, Type *Val,
+                              unsigned Index) const override;
+  unsigned getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
+                           unsigned AddressSpace) const override;
+
+  unsigned getAddressComputationCost(Type *PtrTy,
+                                     bool IsComplex) const override;
+
+  unsigned getReductionCost(unsigned Opcode, Type *Ty,
+                            bool IsPairwiseForm) const override;
+
+  unsigned getIntImmCost(const APInt &Imm, Type *Ty) const override;
+
+  unsigned getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
+                         Type *Ty) const override;
+  unsigned getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
+                         Type *Ty) const override;
 
   /// @}
 };
@@ -134,6 +153,93 @@ X86TTI::PopcntSupportKind X86TTI::getPopcntSupport(unsigned TyWidth) const {
   return ST->hasPOPCNT() ? PSK_FastHardware : PSK_Software;
 }
 
+void X86TTI::getUnrollingPreferences(Loop *L, UnrollingPreferences &UP) const {
+  if (!UsePartialUnrolling)
+    return;
+  // According to the Intel 64 and IA-32 Architectures Optimization Reference
+  // Manual, Intel Core models and later have a loop stream detector
+  // (and associated uop queue) that can benefit from partial unrolling.
+  // The relevant requirements are:
+  //  - The loop must have no more than 4 (8 for Nehalem and later) branches
+  //    taken, and none of them may be calls.
+  //  - The loop can have no more than 18 (28 for Nehalem and later) uops.
+
+  // According to the Software Optimization Guide for AMD Family 15h Processors,
+  // models 30h-4fh (Steamroller and later) have a loop predictor and loop
+  // buffer which can benefit from partial unrolling.
+  // The relevant requirements are:
+  //  - The loop must have fewer than 16 branches
+  //  - The loop must have less than 40 uops in all executed loop branches
+
+  unsigned MaxBranches, MaxOps;
+  if (PartialUnrollingThreshold.getNumOccurrences() > 0) {
+    MaxBranches = PartialUnrollingMaxBranches;
+    MaxOps = PartialUnrollingThreshold;
+  } else if (ST->isAtom()) {
+    // On the Atom, the throughput for taken branches is 2 cycles. For small
+    // simple loops, expand by a small factor to hide the backedge cost.
+    MaxBranches = 2;
+    MaxOps = 10;
+  } else if (ST->hasFSGSBase() && ST->hasXOP() /* Steamroller and later */) {
+    MaxBranches = 16;
+    MaxOps = 40;
+  } else if (ST->hasFMA4() /* Any other recent AMD */) {
+    return;
+  } else if (ST->hasAVX() || ST->hasSSE42() /* Nehalem and later */) {
+    MaxBranches = 8;
+    MaxOps = 28;
+  } else if (ST->hasSSSE3() /* Intel Core */) {
+    MaxBranches = 4;
+    MaxOps = 18;
+  } else {
+    return;
+  }
+
+  // Scan the loop: don't unroll loops with calls, and count the potential
+  // number of taken branches (this is somewhat conservative because we're
+  // counting all block transitions as potential branches while in reality some
+  // of these will become implicit via block placement).
+  unsigned MaxDepth = 0;
+  for (df_iterator<BasicBlock*> DI = df_begin(L->getHeader()),
+       DE = df_end(L->getHeader()); DI != DE;) {
+    if (!L->contains(*DI)) {
+      DI.skipChildren();
+      continue;
+    }
+
+    MaxDepth = std::max(MaxDepth, DI.getPathLength());
+    if (MaxDepth > MaxBranches)
+      return;
+
+    for (BasicBlock::iterator I = DI->begin(), IE = DI->end(); I != IE; ++I)
+      if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
+        ImmutableCallSite CS(I);
+        if (const Function *F = CS.getCalledFunction()) {
+          if (!isLoweredToCall(F))
+            continue;
+        }
+
+        return;
+      }
+
+    ++DI;
+  }
+
+  // Enable runtime and partial unrolling up to the specified size.
+  UP.Partial = UP.Runtime = true;
+  UP.PartialThreshold = UP.PartialOptSizeThreshold = MaxOps;
+
+  // Set the maximum count based on the loop depth. The maximum number of
+  // branches taken in a loop (including the backedge) is equal to the maximum
+  // loop depth (the DFS path length from the loop header to any block in the
+  // loop). When the loop is unrolled, this depth (except for the backedge
+  // itself) is multiplied by the unrolling factor. This new unrolled depth
+  // must be less than the target-specific maximum branch count (which limits
+  // the number of taken branches in the uop buffer).
+  if (MaxDepth > 1)
+    UP.MaxCount = (MaxBranches-1)/(MaxDepth-1);
+}
+
 unsigned X86TTI::getNumberOfRegisters(bool Vector) const {
   if (Vector && !ST->hasSSE1())
     return 0;
@@ -214,6 +320,13 @@ unsigned X86TTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
 
   // Look for AVX2 lowering tricks.
   if (ST->hasAVX2()) {
+    if (ISD == ISD::SHL && LT.second == MVT::v16i16 &&
+        (Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
+         Op2Info == TargetTransformInfo::OK_NonUniformConstantValue))
+      // On AVX2, a packed v16i16 shift left by a constant build_vector
+      // is lowered into a vector multiply (vpmullw).
+      return LT.first;
+
     int Idx = CostTableLookup(AVX2CostTable, ISD, LT.second);
     if (Idx != -1)
       return LT.first * AVX2CostTable[Idx].Cost;
@@ -246,6 +359,20 @@ unsigned X86TTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
       return LT.first * SSE2UniformConstCostTable[Idx].Cost;
   }
 
+  if (ISD == ISD::SHL &&
+      Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) {
+    EVT VT = LT.second;
+    if ((VT == MVT::v8i16 && ST->hasSSE2()) ||
+        (VT == MVT::v4i32 && ST->hasSSE41()))
+      // Vector shift left by non uniform constant can be lowered
+      // into vector multiply (pmullw/pmulld).
+      return LT.first;
+    if (VT == MVT::v4i32 && ST->hasSSE2())
+      // A vector shift left by non uniform constant is converted
+      // into a vector multiply; the new multiply is eventually
+      // lowered into a sequence of shuffles and 2 x pmuludq.
+      ISD = ISD::MUL;
+  }
 
   static const CostTblEntry<MVT::SimpleValueType> SSE2CostTable[] = {
     // We don't correctly identify costs of casts because they are marked as
@@ -260,6 +387,7 @@ unsigned X86TTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
     { ISD::SHL, MVT::v8i16, 8*10 }, // Scalarized.
     { ISD::SHL, MVT::v4i32, 2*5 }, // We optimized this using mul.
     { ISD::SHL, MVT::v2i64, 2*10 }, // Scalarized.
+    { ISD::SHL, MVT::v4i64, 4*10 }, // Scalarized.
 
     { ISD::SRL, MVT::v16i8, 16*10 }, // Scalarized.
     { ISD::SRL, MVT::v8i16, 8*10 }, // Scalarized.
@@ -297,6 +425,7 @@ unsigned X86TTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
     // We don't have to scalarize unsupported ops. We can issue two half-sized
     // operations and we only need to extract the upper YMM half.
     // Two ops + 1 extract + 1 insert = 4.
+    { ISD::MUL, MVT::v16i16, 4 },
     { ISD::MUL, MVT::v8i32, 4 },
     { ISD::SUB, MVT::v8i32, 4 },
     { ISD::ADD, MVT::v8i32, 4 },
@@ -312,7 +441,15 @@ unsigned X86TTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
 
   // Look for AVX1 lowering tricks.
   if (ST->hasAVX() && !ST->hasAVX2()) {
-    int Idx = CostTableLookup(AVX1CostTable, ISD, LT.second);
+    EVT VT = LT.second;
+
+    // v16i16 and v8i32 shifts by non-uniform constants are lowered into a
+    // sequence of extract + two vector multiply + insert.
+    if (ISD == ISD::SHL && (VT == MVT::v8i32 || VT == MVT::v16i16) &&
+        Op2Info == TargetTransformInfo::OK_NonUniformConstantValue)
+      ISD = ISD::MUL;
+
+    int Idx = CostTableLookup(AVX1CostTable, ISD, VT);
     if (Idx != -1)
       return LT.first * AVX1CostTable[Idx].Cost;
   }
@@ -332,7 +469,7 @@ unsigned X86TTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
   // 2x pmuludq, 2x shuffle.
   if (ISD == ISD::MUL && LT.second == MVT::v4i32 && ST->hasSSE2() &&
       !ST->hasSSE41())
-    return 6;
+    return LT.first * 6;
 
   // Fallback to the default implementation.
   return TargetTransformInfo::getArithmeticInstrCost(Opcode, Ty, Op1Info,
@@ -400,16 +537,58 @@ unsigned X86TTI::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) const {
     return TargetTransformInfo::getCastInstrCost(Opcode, Dst, Src);
 
   static const TypeConversionCostTblEntry<MVT::SimpleValueType>
+  AVX2ConversionTbl[] = {
+    { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 1 },
+    { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 1 },
+    { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 3 },
+    { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 3 },
+    { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
+    { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
+    { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 1 },
+    { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 1 },
+    { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 3 },
+    { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 3 },
+    { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8, 3 },
+    { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i8, 3 },
+    { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
+    { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
+    { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 1 },
+    { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 1 },
+
+    { ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, 2 },
+    { ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, 2 },
+    { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 2 },
+    { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 2 },
+    { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 2 },
+    { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 4 },
+  };
+
+  static const TypeConversionCostTblEntry<MVT::SimpleValueType>
   AVXConversionTbl[] = {
-    { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 1 },
-    { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 1 },
-    { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 1 },
-    { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 1 },
-    { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 1 },
-    { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 1 },
-    { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1 },
-    { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 1 },
-    { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 2 },
+    { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 4 },
+    { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 4 },
+    { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 7 },
+    { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 4 },
+    { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 7 },
+    { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 4 },
+    { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 4 },
+    { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 4 },
+    { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 6 },
+    { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 4 },
+    { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8, 6 },
+    { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i8, 4 },
+    { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 6 },
+    { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
+    { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 4 },
+    { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 4 },
+
+    { ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, 4 },
+    { ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, 4 },
+    { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 4 },
+    { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 4 },
+    { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 5 },
+    { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 4 },
+    { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 9 },
 
     { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i1, 8 },
     { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i8, 8 },
@@ -436,17 +615,32 @@ unsigned X86TTI::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) const {
     { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i8, 2 },
     { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i16, 2 },
     { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 6 },
-
-    { ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f32, 1 },
+    // The generic code to compute the scalar overhead is currently broken.
+    // Workaround this limitation by estimating the scalarization overhead
+    // here. We have roughly 10 instructions per scalar element.
+    // Multiply that by the vector width.
+    // FIXME: remove that when PR19268 is fixed.
+    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 2*10 },
+    { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 4*10 },
+
+    { ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f32, 7 },
     { ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 1 },
 
-    { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 6 },
-    { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 9 },
-    { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 8 },
-    { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8, 6 },
-    { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 6 },
-    { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 3 },
+    // This node is expanded into scalarized operations but BasicTTI is overly
+    // optimistic estimating its cost. It computes 3 per element (one
+    // vector-extract, one scalar conversion and one vector-insert). The
+    // problem is that the inserts form a read-modify-write chain so latency
+    // should be factored in too. Inflating the cost per element by 1.
+    { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, 8*4 },
+    { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, 4*4 },
   };
 
+  if (ST->hasAVX2()) {
+    int Idx = ConvertCostTableLookup(AVX2ConversionTbl, ISD,
+                                     DstTy.getSimpleVT(), SrcTy.getSimpleVT());
+    if (Idx != -1)
+      return AVX2ConversionTbl[Idx].Cost;
+  }
+
   if (ST->hasAVX()) {
     int Idx = ConvertCostTableLookup(AVXConversionTbl, ISD,
                                      DstTy.getSimpleVT(), SrcTy.getSimpleVT());
@@ -555,7 +749,7 @@ unsigned X86TTI::getScalarizationOverhead(Type *Ty, bool Insert,
 
 unsigned X86TTI::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
                                  unsigned AddressSpace) const {
-  // Handle non power of two vectors such as <3 x float>
+  // Handle non-power-of-two vectors such as <3 x float>
   if (VectorType *VTy = dyn_cast<VectorType>(Src)) {
     unsigned NumElem = VTy->getVectorNumElements();
 
@@ -570,7 +764,7 @@ unsigned X86TTI::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
       // Cost = 128 bit store + unpack + 64 bit store.
       return 3;
 
-    // Assume that all other non power-of-two numbers are scalarized.
+    // Assume that all other non-power-of-two numbers are scalarized.
     if (!isPowerOf2_32(NumElem)) {
       unsigned Cost = TargetTransformInfo::getMemoryOpCost(Opcode,
                                                            VTy->getScalarType(),
@@ -692,3 +886,109 @@ unsigned X86TTI::getReductionCost(unsigned Opcode, Type *ValTy,
 
   return TargetTransformInfo::getReductionCost(Opcode, ValTy, IsPairwise);
 }
+
+unsigned X86TTI::getIntImmCost(const APInt &Imm, Type *Ty) const {
+  assert(Ty->isIntegerTy());
+
+  unsigned BitSize = Ty->getPrimitiveSizeInBits();
+  if (BitSize == 0)
+    return ~0U;
+
+  if (Imm == 0)
+    return TCC_Free;
+
+  if (Imm.getBitWidth() <= 64 &&
+      (isInt<32>(Imm.getSExtValue()) || isUInt<32>(Imm.getZExtValue())))
+    return TCC_Basic;
+  else
+    return 2 * TCC_Basic;
+}
+
+unsigned X86TTI::getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
+                               Type *Ty) const {
+  assert(Ty->isIntegerTy());
+
+  unsigned BitSize = Ty->getPrimitiveSizeInBits();
+  if (BitSize == 0)
+    return ~0U;
+
+  unsigned ImmIdx = ~0U;
+  switch (Opcode) {
+  default: return TCC_Free;
+  case Instruction::GetElementPtr:
+    // Always hoist the base address of a GetElementPtr. This prevents the
+    // creation of new constants for every base constant that gets constant
+    // folded with the offset.
+    if (Idx == 0)
+      return 2 * TCC_Basic;
+    return TCC_Free;
+  case Instruction::Store:
+    ImmIdx = 0;
+    break;
+  case Instruction::Add:
+  case Instruction::Sub:
+  case Instruction::Mul:
+  case Instruction::UDiv:
+  case Instruction::SDiv:
+  case Instruction::URem:
+  case Instruction::SRem:
+  case Instruction::Shl:
+  case Instruction::LShr:
+  case Instruction::AShr:
+  case Instruction::And:
+  case Instruction::Or:
+  case Instruction::Xor:
+  case Instruction::ICmp:
+    ImmIdx = 1;
+    break;
+  case Instruction::Trunc:
+  case Instruction::ZExt:
+  case Instruction::SExt:
+  case Instruction::IntToPtr:
+  case Instruction::PtrToInt:
+  case Instruction::BitCast:
+  case Instruction::PHI:
+  case Instruction::Call:
+  case Instruction::Select:
+  case Instruction::Ret:
+  case Instruction::Load:
+    break;
+  }
+
+  if ((Idx == ImmIdx) &&
+      Imm.getBitWidth() <= 64 && isInt<32>(Imm.getSExtValue()))
+    return TCC_Free;
+
+  return X86TTI::getIntImmCost(Imm, Ty);
+}
+
+unsigned X86TTI::getIntImmCost(Intrinsic::ID IID, unsigned Idx,
+                               const APInt &Imm, Type *Ty) const {
+  assert(Ty->isIntegerTy());
+
+  unsigned BitSize = Ty->getPrimitiveSizeInBits();
+  if (BitSize == 0)
+    return ~0U;
+
+  switch (IID) {
+  default: return TCC_Free;
+  case Intrinsic::sadd_with_overflow:
+  case Intrinsic::uadd_with_overflow:
+  case Intrinsic::ssub_with_overflow:
+  case Intrinsic::usub_with_overflow:
+  case Intrinsic::smul_with_overflow:
+  case Intrinsic::umul_with_overflow:
+    if ((Idx == 1) && Imm.getBitWidth() <= 64 && isInt<32>(Imm.getSExtValue()))
+      return TCC_Free;
+    break;
+  case Intrinsic::experimental_stackmap:
+    if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
+      return TCC_Free;
+    break;
+  case Intrinsic::experimental_patchpoint_void:
+  case Intrinsic::experimental_patchpoint_i64:
+    if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
+      return TCC_Free;
+    break;
+  }
+  return X86TTI::getIntImmCost(Imm, Ty);
+}
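
Editor's note on the new X86TTI::getUnrollingPreferences() above: the unroll-count cap falls out of the taken-branch budget via UP.MaxCount = (MaxBranches-1)/(MaxDepth-1). The sketch below is not part of the commit; it just recomputes that cap with illustrative numbers (a Nehalem-class target with MaxBranches = 8 and MaxOps = 28, and a hypothetical loop whose deepest header-to-block DFS path is 2 blocks), so the arithmetic can be checked in isolation.

    // Illustrative sketch only -- re-derives the unroll cap from the patch's
    // formula; the target/loop numbers are assumptions, not measured data.
    #include <cstdio>

    int main() {
      unsigned MaxBranches = 8; // taken-branch budget of the uop buffer (Nehalem-class)
      unsigned MaxOps = 28;     // uop budget; becomes UP.PartialThreshold
      unsigned MaxDepth = 2;    // DFS path length from the loop header

      // Same formula as the patch: each unrolled copy of the body contributes
      // (MaxDepth - 1) taken branches on top of the single backedge, so the
      // count is capped to keep the total within MaxBranches.
      unsigned MaxCount = 0;
      if (MaxDepth > 1)
        MaxCount = (MaxBranches - 1) / (MaxDepth - 1);

      std::printf("PartialThreshold = %u, MaxCount = %u\n", MaxOps, MaxCount);
      // Prints: PartialThreshold = 28, MaxCount = 7
      return 0;
    }

So for this assumed loop shape the unroller may replicate the body up to 7 times before the taken-branch count would overflow the loop stream detector's buffer.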
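Editor's note on the new X86TTI::getIntImmCost(const APInt &, Type *) above: it classifies an immediate as free (zero), one basic cost unit (fits a sign- or zero-extended 32-bit immediate), or two units (needs a full 64-bit materialization). The standalone sketch below mirrors that classification with plain integers instead of APInt; the TCC_Free/TCC_Basic values (0 and 1) follow the TargetTransformInfo cost enum, and everything else is an assumption for illustration.

    // Standalone sketch only -- mirrors the immediate-cost buckets of the patch.
    #include <cstdint>
    #include <cstdio>

    enum { TCC_Free = 0, TCC_Basic = 1 };

    static unsigned immCost(int64_t Imm) {
      if (Imm == 0)
        return TCC_Free;                                   // zero idiom, no immediate needed
      if (Imm >= INT32_MIN && Imm <= INT32_MAX)
        return TCC_Basic;                                  // fits a sign-extended imm32
      if (Imm >= 0 && static_cast<uint64_t>(Imm) <= UINT32_MAX)
        return TCC_Basic;                                  // fits a zero-extended imm32
      return 2 * TCC_Basic;                                // needs a 64-bit materialization
    }

    int main() {
      std::printf("%u %u %u\n",
                  immCost(0), immCost(-42), immCost(int64_t(1) << 40));
      // Prints: 0 1 2
      return 0;
    }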