 include/llvm/Analysis/TargetTransformInfo.h  |  6
 lib/Analysis/CostModel.cpp                   |  5
 lib/Analysis/TargetTransformInfo.cpp         |  7
 lib/CodeGen/BasicTargetTransformInfo.cpp     |  5
 lib/Target/ARM/ARMTargetTransformInfo.cpp    | 13
 lib/Transforms/Vectorize/LoopVectorize.cpp   | 22
 test/Analysis/CostModel/ARM/gep.ll           | 43
 test/Analysis/CostModel/ARM/insertelement.ll |  6
 test/Analysis/CostModel/X86/gep.ll           | 40
9 files changed, 134 insertions, 13 deletions
diff --git a/include/llvm/Analysis/TargetTransformInfo.h b/include/llvm/Analysis/TargetTransformInfo.h
index 681b838..e1331a1 100644
--- a/include/llvm/Analysis/TargetTransformInfo.h
+++ b/include/llvm/Analysis/TargetTransformInfo.h
@@ -314,6 +314,12 @@ public:
   /// split during legalization. Zero is returned when the answer is unknown.
   virtual unsigned getNumberOfParts(Type *Tp) const;
 
+  /// \returns The cost of the address computation. For most targets this can be
+  /// merged into the instruction indexing mode. Some targets might want to
+  /// distinguish between address computation for memory operations on vector
+  /// types and scalar types. Such targets should override this function.
+  virtual unsigned getAddressComputationCost(Type *Ty) const;
+
   /// @}
 
   /// Analysis group identification.
diff --git a/lib/Analysis/CostModel.cpp b/lib/Analysis/CostModel.cpp
index 1784512..8435e39 100644
--- a/lib/Analysis/CostModel.cpp
+++ b/lib/Analysis/CostModel.cpp
@@ -85,6 +85,11 @@ unsigned CostModelAnalysis::getInstructionCost(const Instruction *I) const {
     return -1;
 
   switch (I->getOpcode()) {
+  case Instruction::GetElementPtr:{
+    Type *ValTy = I->getOperand(0)->getType()->getPointerElementType();
+    return TTI->getAddressComputationCost(ValTy);
+  }
+
   case Instruction::Ret:
   case Instruction::PHI:
   case Instruction::Br: {
diff --git a/lib/Analysis/TargetTransformInfo.cpp b/lib/Analysis/TargetTransformInfo.cpp
index 9fc21fd..72421a0 100644
--- a/lib/Analysis/TargetTransformInfo.cpp
+++ b/lib/Analysis/TargetTransformInfo.cpp
@@ -196,6 +196,9 @@ unsigned TargetTransformInfo::getNumberOfParts(Type *Tp) const {
   return PrevTTI->getNumberOfParts(Tp);
 }
 
+unsigned TargetTransformInfo::getAddressComputationCost(Type *Tp) const {
+  return PrevTTI->getAddressComputationCost(Tp);
+}
 
 namespace {
 
@@ -535,6 +538,10 @@ struct NoTTI : ImmutablePass, TargetTransformInfo {
   unsigned getNumberOfParts(Type *Tp) const {
     return 0;
   }
+
+  unsigned getAddressComputationCost(Type *Tp) const {
+    return 0;
+  }
 };
 
 } // end anonymous namespace
diff --git a/lib/CodeGen/BasicTargetTransformInfo.cpp b/lib/CodeGen/BasicTargetTransformInfo.cpp
index ea5e937..e8b5b4f 100644
--- a/lib/CodeGen/BasicTargetTransformInfo.cpp
+++ b/lib/CodeGen/BasicTargetTransformInfo.cpp
@@ -101,6 +101,7 @@ public:
   virtual unsigned getIntrinsicInstrCost(Intrinsic::ID, Type *RetTy,
                                          ArrayRef<Type*> Tys) const;
   virtual unsigned getNumberOfParts(Type *Tp) const;
+  virtual unsigned getAddressComputationCost(Type *Ty) const;
 
   /// @}
 };
@@ -400,3 +401,7 @@ unsigned BasicTTI::getNumberOfParts(Type *Tp) const {
   std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Tp);
   return LT.first;
 }
+
+unsigned BasicTTI::getAddressComputationCost(Type *Ty) const {
+  return 0;
+}
diff --git a/lib/Target/ARM/ARMTargetTransformInfo.cpp b/lib/Target/ARM/ARMTargetTransformInfo.cpp
index 1f91e0e..f6fa319 100644
--- a/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -120,6 +120,8 @@ public:
   unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
                               Type *CondTy) const;
 
   unsigned getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) const;
+
+  unsigned getAddressComputationCost(Type *Val) const;
   /// @}
 };
 
@@ -304,12 +306,13 @@ unsigned ARMTTI::getCastInstrCost(unsigned Opcode, Type *Dst,
 
 unsigned ARMTTI::getVectorInstrCost(unsigned Opcode, Type *ValTy,
                                     unsigned Index) const {
-  // Penalize inserting into an D-subregister.
+  // Penalize inserting into an D-subregister. We end up with a three times
+  // lower estimated throughput on swift.
   if (ST->isSwift() &&
       Opcode == Instruction::InsertElement &&
       ValTy->isVectorTy() &&
       ValTy->getScalarSizeInBits() <= 32)
-    return 2;
+    return 3;
 
   return TargetTransformInfo::getVectorInstrCost(Opcode, ValTy, Index);
 }
@@ -326,3 +329,9 @@ unsigned ARMTTI::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
 
   return TargetTransformInfo::getCmpSelInstrCost(Opcode, ValTy, CondTy);
 }
+
+unsigned ARMTTI::getAddressComputationCost(Type *Ty) const {
+  // In many cases the address computation is not merged into the instruction
+  // addressing mode.
+  return 1;
+}
diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 91d5659..f12b0bf 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -3056,9 +3056,10 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
   // TODO: We need to estimate the cost of intrinsic calls.
   switch (I->getOpcode()) {
   case Instruction::GetElementPtr:
-    // We mark this instruction as zero-cost because scalar GEPs are usually
-    // lowered to the intruction addressing mode. At the moment we don't
-    // generate vector geps.
+    // We mark this instruction as zero-cost because the cost of GEPs in
+    // vectorized code depends on whether the corresponding memory instruction
+    // is scalarized or not. Therefore, we handle GEPs with the memory
+    // instruction cost.
     return 0;
   case Instruction::Br: {
     return TTI.getCFInstrCost(I->getOpcode());
@@ -3113,9 +3114,12 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
     unsigned AS = SI ? SI->getPointerAddressSpace() :
       LI->getPointerAddressSpace();
     Value *Ptr = SI ? SI->getPointerOperand() : LI->getPointerOperand();
-
+    // We add the cost of address computation here instead of with the gep
+    // instruction because only here we know whether the operation is
+    // scalarized.
     if (VF == 1)
-      return TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS);
+      return TTI.getAddressComputationCost(VectorTy) +
+        TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS);
 
     // Scalarized loads/stores.
     int Stride = Legal->isConsecutivePtr(Ptr);
@@ -3135,15 +3139,17 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
                                                  VectorTy, i);
       }
 
-      // The cost of the scalar stores.
+      // The cost of the scalar loads/stores.
+      Cost += VF * TTI.getAddressComputationCost(ValTy->getScalarType());
       Cost += VF * TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(),
                                        Alignment, AS);
       return Cost;
     }
 
     // Wide load/stores.
-    unsigned Cost = TTI.getMemoryOpCost(I->getOpcode(), VectorTy,
-                                        Alignment, AS);
+    unsigned Cost = TTI.getAddressComputationCost(VectorTy);
+    Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS);
+
     if (Reverse)
       Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse,
                                  VectorTy, 0);
diff --git a/test/Analysis/CostModel/ARM/gep.ll b/test/Analysis/CostModel/ARM/gep.ll
new file mode 100644
index 0000000..a63b87d
--- /dev/null
+++ b/test/Analysis/CostModel/ARM/gep.ll
@@ -0,0 +1,43 @@
+; RUN: opt -cost-model -analyze -mtriple=thumbv7-apple-ios6.0.0 -mcpu=swift < %s | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32"
+target triple = "thumbv7-apple-ios6.0.0"
+
+define void @test_geps() {
+  ; Cost of scalar integer geps should be one. We can't always expect it to be
+  ; folded into the instruction addressing mode.
+;CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds i8*
+  %a0 = getelementptr inbounds i8* undef, i32 0
+;CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds i16*
+  %a1 = getelementptr inbounds i16* undef, i32 0
+;CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds i32*
+  %a2 = getelementptr inbounds i32* undef, i32 0
+
+;CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds i64*
+  %a3 = getelementptr inbounds i64* undef, i32 0
+
+  ; Cost of scalar floating point geps should be one. We cannot fold the address
+  ; computation.
+;CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds float*
+  %a4 = getelementptr inbounds float* undef, i32 0
+;CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds double*
+  %a5 = getelementptr inbounds double* undef, i32 0
+
+
+  ; Cost of vector geps should be one. We cannot fold the address computation.
+;CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds <4 x i8>*
+  %a7 = getelementptr inbounds <4 x i8>* undef, i32 0
+;CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds <4 x i16>*
+  %a8 = getelementptr inbounds <4 x i16>* undef, i32 0
+;CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds <4 x i32>*
+  %a9 = getelementptr inbounds <4 x i32>* undef, i32 0
+;CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds <4 x i64>*
+  %a10 = getelementptr inbounds <4 x i64>* undef, i32 0
+;CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds <4 x float>*
+  %a11 = getelementptr inbounds <4 x float>* undef, i32 0
+;CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds <4 x double>*
+  %a12 = getelementptr inbounds <4 x double>* undef, i32 0
+
+
+  ret void
+}
diff --git a/test/Analysis/CostModel/ARM/insertelement.ll b/test/Analysis/CostModel/ARM/insertelement.ll
index 2d43e4d..f951b08 100644
--- a/test/Analysis/CostModel/ARM/insertelement.ll
+++ b/test/Analysis/CostModel/ARM/insertelement.ll
@@ -12,7 +12,7 @@ define void @insertelement_i8(%T_i8* %saddr,
                               %T_i8v* %vaddr) {
   %v0 = load %T_i8v* %vaddr
   %v1 = load %T_i8* %saddr
-;CHECK: estimated cost of 2 for {{.*}} insertelement <8 x i8>
+;CHECK: estimated cost of 3 for {{.*}} insertelement <8 x i8>
   %v2 = insertelement %T_i8v %v0, %T_i8 %v1, i32 1
   store %T_i8v %v2, %T_i8v* %vaddr
   ret void
@@ -26,7 +26,7 @@ define void @insertelement_i16(%T_i16* %saddr,
                                %T_i16v* %vaddr) {
   %v0 = load %T_i16v* %vaddr
   %v1 = load %T_i16* %saddr
-;CHECK: estimated cost of 2 for {{.*}} insertelement <4 x i16>
+;CHECK: estimated cost of 3 for {{.*}} insertelement <4 x i16>
   %v2 = insertelement %T_i16v %v0, %T_i16 %v1, i32 1
   store %T_i16v %v2, %T_i16v* %vaddr
   ret void
@@ -39,7 +39,7 @@ define void @insertelement_i32(%T_i32* %saddr,
                                %T_i32v* %vaddr) {
   %v0 = load %T_i32v* %vaddr
   %v1 = load %T_i32* %saddr
-;CHECK: estimated cost of 2 for {{.*}} insertelement <2 x i32>
+;CHECK: estimated cost of 3 for {{.*}} insertelement <2 x i32>
   %v2 = insertelement %T_i32v %v0, %T_i32 %v1, i32 1
   store %T_i32v %v2, %T_i32v* %vaddr
   ret void
diff --git a/test/Analysis/CostModel/X86/gep.ll b/test/Analysis/CostModel/X86/gep.ll
new file mode 100644
index 0000000..877184a
--- /dev/null
+++ b/test/Analysis/CostModel/X86/gep.ll
@@ -0,0 +1,40 @@
+; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+
+define void @test_geps() {
+  ; Cost of should be zero. We expect it to be folded into
+  ; the instruction addressing mode.
+;CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds i8*
+  %a0 = getelementptr inbounds i8* undef, i32 0
+;CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds i16*
+  %a1 = getelementptr inbounds i16* undef, i32 0
+;CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds i32*
+  %a2 = getelementptr inbounds i32* undef, i32 0
+;CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds i64*
+  %a3 = getelementptr inbounds i64* undef, i32 0
+
+;CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds float*
+  %a4 = getelementptr inbounds float* undef, i32 0
+;CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds double*
+  %a5 = getelementptr inbounds double* undef, i32 0
+
+  ; Vector geps should also have zero cost.
+;CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds <4 x i8>*
+  %a7 = getelementptr inbounds <4 x i8>* undef, i32 0
+;CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds <4 x i16>*
+  %a8 = getelementptr inbounds <4 x i16>* undef, i32 0
+;CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds <4 x i32>*
+  %a9 = getelementptr inbounds <4 x i32>* undef, i32 0
+;CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds <4 x i64>*
+  %a10 = getelementptr inbounds <4 x i64>* undef, i32 0
+;CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds <4 x float>*
+  %a11 = getelementptr inbounds <4 x float>* undef, i32 0
+;CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds <4 x double>*
+  %a12 = getelementptr inbounds <4 x double>* undef, i32 0
+
+
+  ret void
+}