diff options
-rw-r--r-- | lib/Target/TargetTransformImpl.cpp | 67 | ||||
-rw-r--r-- | test/Transforms/LoopVectorize/X86/conversion-cost.ll | 48 |
2 files changed, 89 insertions, 26 deletions
diff --git a/lib/Target/TargetTransformImpl.cpp b/lib/Target/TargetTransformImpl.cpp index 38c704f..dbbf37a 100644 --- a/lib/Target/TargetTransformImpl.cpp +++ b/lib/Target/TargetTransformImpl.cpp @@ -211,40 +211,55 @@ unsigned VectorTargetTransformImpl::getCastInstrCost(unsigned Opcode, Type *Dst, std::pair<unsigned, EVT> DstLT = getTypeLegalizationCost(Dst->getContext(), TLI->getValueType(Dst)); - // If the cast is between same-sized registers, then the check is simple. - if (SrcLT.first == DstLT.first && - SrcLT.second.getSizeInBits() == DstLT.second.getSizeInBits()) { - // Just check the op cost: - if (!TLI->isOperationExpand(ISD, DstLT.second)) { - // The operation is legal. Assume it costs 1. Multiply - // by the type-legalization overhead. - return SrcLT.first * 1; - } + // Handle scalar conversions. + if (!Src->isVectorTy() && !Dst->isVectorTy()) { + // Just check the op cost. If the operation is legal then assume it costs 1. + if (!TLI->isOperationExpand(ISD, DstLT.second)) + return 1; + + // Assume that illegal scalar instruction are expensive. + return 4; } - unsigned ScalarizationCost = 1; + // Check vector-to-vector casts. + if (Dst->isVectorTy() && Src->isVectorTy()) { - // Otherwise, assume that the cast is scalarized. - if (Dst->isVectorTy()) { - unsigned Num = Dst->getVectorNumElements(); - unsigned Cost = getCastInstrCost(Opcode, Src->getScalarType(), - Dst->getScalarType()); - // return the cost of multiple scalar invocation plus the cost of inserting - // and extracting the values. - ScalarizationCost *= getScalarizationOverhead(Dst, true, true) + Num * Cost; - } + // If the cast is between same-sized registers, then the check is simple. + if (SrcLT.first == DstLT.first && + SrcLT.second.getSizeInBits() == DstLT.second.getSizeInBits()) { - if (Src->isVectorTy()) { - unsigned Num = Src->getVectorNumElements(); + // Bitcast between types that are legalized to the same type are free. 
+ if (Opcode == Instruction::BitCast) + return 0; + + // Just check the op cost. If the operation is legal then assume it costs + // 1 and multiply by the type-legalization overhead. + if (!TLI->isOperationExpand(ISD, DstLT.second)) + return SrcLT.first * 1; + } + + // If we are converting vectors and the operation is illegal, or + // if the vectors are legalized to different types, estimate the + // scalarization costs. + unsigned Num = Dst->getVectorNumElements(); unsigned Cost = getCastInstrCost(Opcode, Dst->getScalarType(), Src->getScalarType()); - // return the cost of multiple scalar invocation plus the cost of inserting - // and extracting the values. - ScalarizationCost *= getScalarizationOverhead(Src, true, true) + Num * Cost; + + // Return the cost of multiple scalar invocations plus the cost of + // inserting and extracting the values. + return getScalarizationOverhead(Dst, true, true) + Num * Cost; }
getScalarizationOverhead(Dst, true, false):0); + + llvm_unreachable("Unhandled cast"); + } unsigned VectorTargetTransformImpl::getCFInstrCost(unsigned Opcode) const { return 1; diff --git a/test/Transforms/LoopVectorize/X86/conversion-cost.ll b/test/Transforms/LoopVectorize/X86/conversion-cost.ll new file mode 100644 index 0000000..8582613 --- /dev/null +++ b/test/Transforms/LoopVectorize/X86/conversion-cost.ll @@ -0,0 +1,48 @@ +; RUN: opt < %s -loop-vectorize -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx -S | FileCheck %s + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.8.0" + +;CHECK: @conversion_cost1 +;CHECK: store <8 x i8> +;CHECK: ret +define i32 @conversion_cost1(i32 %n, i8* nocapture %A, float* nocapture %B) nounwind uwtable ssp { + %1 = icmp sgt i32 %n, 3 + br i1 %1, label %.lr.ph, label %._crit_edge + +.lr.ph: ; preds = %0, %.lr.ph + %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 3, %0 ] + %2 = trunc i64 %indvars.iv to i8 + %3 = getelementptr inbounds i8* %A, i64 %indvars.iv + store i8 %2, i8* %3, align 1 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %n + br i1 %exitcond, label %._crit_edge, label %.lr.ph + +._crit_edge: ; preds = %.lr.ph, %0 + ret i32 undef +} + +;CHECK: @conversion_cost2 +;CHECK: store <8 x float> +;CHECK: ret +define i32 @conversion_cost2(i32 %n, i8* nocapture %A, float* nocapture %B) nounwind uwtable ssp { + %1 = icmp sgt i32 %n, 9 + br i1 %1, label %.lr.ph, label %._crit_edge + +.lr.ph: ; preds = %0, %.lr.ph + %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 9, %0 ] + %2 = add nsw i64 %indvars.iv, 3 + %3 = trunc i64 %2 to i32 + %4 = sitofp i32 %3 to float + %5 = getelementptr inbounds float* %B, i64 %indvars.iv + store float %4, float* %5, align 4 + %indvars.iv.next = 
add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %n + br i1 %exitcond, label %._crit_edge, label %.lr.ph + +._crit_edge: ; preds = %.lr.ph, %0 + ret i32 undef +} |