diff options
-rw-r--r-- | lib/Analysis/CostModel.cpp | 11 | ||||
-rw-r--r-- | lib/CodeGen/BasicTargetTransformInfo.cpp | 79 | ||||
-rw-r--r-- | test/Analysis/CostModel/X86/intrinsic-cost.ll | 32 |
3 files changed, 110 insertions, 12 deletions
diff --git a/lib/Analysis/CostModel.cpp b/lib/Analysis/CostModel.cpp index 44684a9..df70d15 100644 --- a/lib/Analysis/CostModel.cpp +++ b/lib/Analysis/CostModel.cpp @@ -23,6 +23,7 @@ #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/Function.h" #include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Value.h" #include "llvm/Pass.h" #include "llvm/Support/Debug.h" @@ -189,6 +190,16 @@ unsigned CostModelAnalysis::getInstructionCost(const Instruction *I) const { 0); return -1; } + case Instruction::Call: + if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) { + SmallVector<Type*, 4> Tys; + for (unsigned J = 0, JE = II->getNumArgOperands(); J != JE; ++J) + Tys.push_back(II->getArgOperand(J)->getType()); + + return TTI->getIntrinsicInstrCost(II->getIntrinsicID(), II->getType(), + Tys); + } + return -1; default: // We don't have any information on this instruction. return -1; diff --git a/lib/CodeGen/BasicTargetTransformInfo.cpp b/lib/CodeGen/BasicTargetTransformInfo.cpp index e8b5b4f..4cd1b80 100644 --- a/lib/CodeGen/BasicTargetTransformInfo.cpp +++ b/lib/CodeGen/BasicTargetTransformInfo.cpp @@ -379,22 +379,77 @@ unsigned BasicTTI::getMemoryOpCost(unsigned Opcode, Type *Src, return LT.first; } -unsigned BasicTTI::getIntrinsicInstrCost(Intrinsic::ID, Type *RetTy, +unsigned BasicTTI::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy, ArrayRef<Type *> Tys) const { - // assume that we need to scalarize this intrinsic. - unsigned ScalarizationCost = 0; - unsigned ScalarCalls = 1; - if (RetTy->isVectorTy()) { - ScalarizationCost = getScalarizationOverhead(RetTy, true, false); - ScalarCalls = std::max(ScalarCalls, RetTy->getVectorNumElements()); - } - for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) { - if (Tys[i]->isVectorTy()) { - ScalarizationCost += getScalarizationOverhead(Tys[i], false, true); + unsigned ISD = 0; + switch (IID) { + default: { + // Assume that we need to scalarize this intrinsic. + unsigned ScalarizationCost = 0; + unsigned ScalarCalls = 1; + if (RetTy->isVectorTy()) { + ScalarizationCost = getScalarizationOverhead(RetTy, true, false); ScalarCalls = std::max(ScalarCalls, RetTy->getVectorNumElements()); } + for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) { + if (Tys[i]->isVectorTy()) { + ScalarizationCost += getScalarizationOverhead(Tys[i], false, true); + ScalarCalls = std::max(ScalarCalls, RetTy->getVectorNumElements()); + } + } + + return ScalarCalls + ScalarizationCost; + } + // Look for intrinsics that can be lowered directly or turned into a scalar + // intrinsic call. + case Intrinsic::sqrt: ISD = ISD::FSQRT; break; + case Intrinsic::sin: ISD = ISD::FSIN; break; + case Intrinsic::cos: ISD = ISD::FCOS; break; + case Intrinsic::exp: ISD = ISD::FEXP; break; + case Intrinsic::exp2: ISD = ISD::FEXP2; break; + case Intrinsic::log: ISD = ISD::FLOG; break; + case Intrinsic::log10: ISD = ISD::FLOG10; break; + case Intrinsic::log2: ISD = ISD::FLOG2; break; + case Intrinsic::fabs: ISD = ISD::FABS; break; + case Intrinsic::floor: ISD = ISD::FFLOOR; break; + case Intrinsic::ceil: ISD = ISD::FCEIL; break; + case Intrinsic::trunc: ISD = ISD::FTRUNC; break; + case Intrinsic::rint: ISD = ISD::FRINT; break; + case Intrinsic::pow: ISD = ISD::FPOW; break; + case Intrinsic::fma: ISD = ISD::FMA; break; + case Intrinsic::fmuladd: ISD = ISD::FMA; break; // FIXME: mul + add? + } + + std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(RetTy); + + if (TLI->isOperationLegalOrPromote(ISD, LT.second)) { + // The operation is legal. Assume it costs 1. + // If the type is split to multiple registers, assume that thre is some + // overhead to this. + // TODO: Once we have extract/insert subvector cost we need to use them. + if (LT.first > 1) + return LT.first * 2; + return LT.first * 1; } - return ScalarCalls + ScalarizationCost; + + if (!TLI->isOperationExpand(ISD, LT.second)) { + // If the operation is custom lowered then assume + // thare the code is twice as expensive. + return LT.first * 2; + } + + // Else, assume that we need to scalarize this intrinsic. For math builtins + // this will emit a costly libcall, adding call overhead and spills. Make it + // very expensive. + if (RetTy->isVectorTy()) { + unsigned Num = RetTy->getVectorNumElements(); + unsigned Cost = TopTTI->getIntrinsicInstrCost(IID, RetTy->getScalarType(), + Tys); + return 10 * Cost * Num; + } + + // This is going to be turned into a library call, make it expensive. + return 10; } unsigned BasicTTI::getNumberOfParts(Type *Tp) const { diff --git a/test/Analysis/CostModel/X86/intrinsic-cost.ll b/test/Analysis/CostModel/X86/intrinsic-cost.ll new file mode 100644 index 0000000..e235a36 --- /dev/null +++ b/test/Analysis/CostModel/X86/intrinsic-cost.ll @@ -0,0 +1,32 @@ +; RUN: opt -S -mtriple=x86_64-apple-darwin -mcpu=core2 -cost-model -analyze < %s | FileCheck %s -check-prefix=CORE2 +; RUN: opt -S -mtriple=x86_64-apple-darwin -mcpu=corei7 -cost-model -analyze < %s | FileCheck %s -check-prefix=COREI7 + +; If SSE4.1 roundps instruction is available it is cheap to lower, otherwise +; it'll be scalarized into calls which are expensive. +define void @test1(float* nocapture %f) nounwind { +vector.ph: + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %0 = getelementptr inbounds float* %f, i64 %index + %1 = bitcast float* %0 to <4 x float>* + %wide.load = load <4 x float>* %1, align 4 + %2 = call <4 x float> @llvm.ceil.v4f32(<4 x float> %wide.load) + store <4 x float> %2, <4 x float>* %1, align 4 + %index.next = add i64 %index, 4 + %3 = icmp eq i64 %index.next, 1024 + br i1 %3, label %for.end, label %vector.body + +for.end: ; preds = %vector.body + ret void + +; CORE2: Printing analysis 'Cost Model Analysis' for function 'test1': +; CORE2: Cost Model: Found an estimated cost of 400 for instruction: %2 = call <4 x float> @llvm.ceil.v4f32(<4 x float> %wide.load) + +; COREI7: Printing analysis 'Cost Model Analysis' for function 'test1': +; COREI7: Cost Model: Found an estimated cost of 1 for instruction: %2 = call <4 x float> @llvm.ceil.v4f32(<4 x float> %wide.load) + +} + +declare <4 x float> @llvm.ceil.v4f32(<4 x float>) nounwind readnone |