diff options
-rw-r--r-- | include/llvm/Analysis/TargetTransformInfo.h | 3 | ||||
-rw-r--r-- | lib/Analysis/TargetTransformInfo.cpp | 8 | ||||
-rw-r--r-- | lib/CodeGen/BasicTargetTransformInfo.cpp | 5 | ||||
-rw-r--r-- | lib/Target/ARM/ARMTargetTransformInfo.cpp | 10 | ||||
-rw-r--r-- | lib/Target/X86/X86TargetTransformInfo.cpp | 17 | ||||
-rw-r--r-- | lib/Transforms/Vectorize/LoopVectorize.cpp | 51 | ||||
-rw-r--r-- | test/Transforms/LoopVectorize/ARM/gcc-examples.ll | 60 | ||||
-rw-r--r-- | test/Transforms/LoopVectorize/X86/avx1.ll | 2 | ||||
-rw-r--r-- | test/Transforms/LoopVectorize/X86/conversion-cost.ll | 2 |
9 files changed, 153 insertions, 5 deletions
diff --git a/include/llvm/Analysis/TargetTransformInfo.h b/include/llvm/Analysis/TargetTransformInfo.h index 1679c4f..4379425 100644 --- a/include/llvm/Analysis/TargetTransformInfo.h +++ b/include/llvm/Analysis/TargetTransformInfo.h @@ -148,6 +148,9 @@ public: /// set to false, it returns the number of scalar registers. virtual unsigned getNumberOfRegisters(bool Vector) const; + /// \return The width of the largest scalar or vector register type. + virtual unsigned getRegisterBitWidth(bool Vector) const; + /// \return The maximum unroll factor that the vectorizer should try to /// perform for this target. This number depends on the level of parallelism /// and the number of execution units in the CPU. diff --git a/lib/Analysis/TargetTransformInfo.cpp b/lib/Analysis/TargetTransformInfo.cpp index 02af2d3..3ef74eb 100644 --- a/lib/Analysis/TargetTransformInfo.cpp +++ b/lib/Analysis/TargetTransformInfo.cpp @@ -92,6 +92,10 @@ unsigned TargetTransformInfo::getNumberOfRegisters(bool Vector) const { return PrevTTI->getNumberOfRegisters(Vector); } +unsigned TargetTransformInfo::getRegisterBitWidth(bool Vector) const { + return PrevTTI->getRegisterBitWidth(Vector); +} + unsigned TargetTransformInfo::getMaximumUnrollFactor() const { return PrevTTI->getMaximumUnrollFactor(); } @@ -220,6 +224,10 @@ struct NoTTI : ImmutablePass, TargetTransformInfo { return 8; } + unsigned getRegisterBitWidth(bool Vector) const { + return 32; + } + unsigned getMaximumUnrollFactor() const { return 1; } diff --git a/lib/CodeGen/BasicTargetTransformInfo.cpp b/lib/CodeGen/BasicTargetTransformInfo.cpp index 2f3ac9a..3892cc4 100644 --- a/lib/CodeGen/BasicTargetTransformInfo.cpp +++ b/lib/CodeGen/BasicTargetTransformInfo.cpp @@ -84,6 +84,7 @@ public: virtual unsigned getNumberOfRegisters(bool Vector) const; virtual unsigned getMaximumUnrollFactor() const; + virtual unsigned getRegisterBitWidth(bool Vector) const; virtual unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty) const; virtual unsigned getShuffleCost(ShuffleKind Kind, Type *Tp, int Index, Type *SubTp) const; @@ -183,6 +184,10 @@ unsigned BasicTTI::getNumberOfRegisters(bool Vector) const { return 1; } +unsigned BasicTTI::getRegisterBitWidth(bool Vector) const { + return 32; +} + unsigned BasicTTI::getMaximumUnrollFactor() const { return 1; } diff --git a/lib/Target/ARM/ARMTargetTransformInfo.cpp b/lib/Target/ARM/ARMTargetTransformInfo.cpp index 634004a..404a6ff 100644 --- a/lib/Target/ARM/ARMTargetTransformInfo.cpp +++ b/lib/Target/ARM/ARMTargetTransformInfo.cpp @@ -94,6 +94,16 @@ public: return 16; } + unsigned getRegisterBitWidth(bool Vector) const { + if (Vector) { + if (ST->hasNEON()) + return 128; + return 0; + } + + return 32; + } + unsigned getMaximumUnrollFactor() const { // These are out of order CPUs: if (ST->isCortexA15() || ST->isSwift()) diff --git a/lib/Target/X86/X86TargetTransformInfo.cpp b/lib/Target/X86/X86TargetTransformInfo.cpp index 6ab08cb..675c896 100644 --- a/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/lib/Target/X86/X86TargetTransformInfo.cpp @@ -83,6 +83,7 @@ public: /// @{ virtual unsigned getNumberOfRegisters(bool Vector) const; + virtual unsigned getRegisterBitWidth(bool Vector) const; virtual unsigned getMaximumUnrollFactor() const; virtual unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty) const; virtual unsigned getShuffleCost(ShuffleKind Kind, Type *Tp, @@ -165,11 +166,27 @@ X86TTI::PopcntSupportKind X86TTI::getPopcntSupport(unsigned TyWidth) const { } unsigned X86TTI::getNumberOfRegisters(bool Vector) const { + if (Vector && !ST->hasSSE1()) + return 0; + if (ST->is64Bit()) return 16; return 8; } +unsigned X86TTI::getRegisterBitWidth(bool Vector) const { + if (Vector) { + if (ST->hasAVX()) return 256; + if (ST->hasSSE1()) return 128; + return 0; + } + + if (ST->is64Bit()) + return 64; + return 32; + +} + unsigned X86TTI::getMaximumUnrollFactor() const { if (ST->isAtom()) return 1; diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index c29f416..cde4bb8 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -113,9 +113,6 @@ static const unsigned MaxLoopSizeThreshold = 32; /// number of pointers. Notice that the check is quadratic! static const unsigned RuntimeMemoryCheckThreshold = 4; -/// This is the highest vector width that we try to generate. -static const unsigned MaxVectorSize = 8; - namespace { // Forward declarations. @@ -523,6 +520,10 @@ public: /// possible. unsigned selectVectorizationFactor(bool OptForSize, unsigned UserVF); + /// \returns The size (in bits) of the widest type in the code that + /// needs to be vectorized. We ignore values that remain scalar such as + /// 64 bit loop indices. + unsigned getWidestType(); /// \return The most profitable unroll factor. /// If UserUF is non-zero then this method finds the best unroll-factor @@ -2621,6 +2622,20 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize, unsigned TC = SE->getSmallConstantTripCount(TheLoop, TheLoop->getLoopLatch()); DEBUG(dbgs() << "LV: Found trip count:"<<TC<<"\n"); + unsigned WidestType = getWidestType(); + unsigned WidestRegister = TTI.getRegisterBitWidth(true); + unsigned MaxVectorSize = WidestRegister / WidestType; + DEBUG(dbgs() << "LV: The Widest type: " << WidestType << " bits.\n"); + DEBUG(dbgs() << "LV: The Widest register is:" << WidestRegister << "bits.\n"); + + if (MaxVectorSize == 0) { + DEBUG(dbgs() << "LV: The target has no vector registers.\n"); + return 1; + } + + assert(MaxVectorSize <= 32 && "Did not expect to pack so many elements" + " into one vector."); + unsigned VF = MaxVectorSize; // If we optimize the program for size, avoid creating the tail loop. @@ -2672,6 +2687,36 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize, return Width; } +unsigned LoopVectorizationCostModel::getWidestType() { + unsigned MaxWidth = 8; + + // For each block. + for (Loop::block_iterator bb = TheLoop->block_begin(), + be = TheLoop->block_end(); bb != be; ++bb) { + BasicBlock *BB = *bb; + + // For each instruction in the loop. + for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { + if (Legal->isUniformAfterVectorization(it)) + continue; + + Type *T = it->getType(); + + if (StoreInst *ST = dyn_cast<StoreInst>(it)) + T = ST->getValueOperand()->getType(); + + // PHINodes and pointers are difficult to analyze, but we catch all other + // uses of the types in other instructions. + if (isa<PHINode>(it) || T->isPointerTy() || T->isVoidTy()) + continue; + + MaxWidth = std::max(MaxWidth, T->getScalarSizeInBits()); + } + } + + return MaxWidth; +} + unsigned LoopVectorizationCostModel::selectUnrollFactor(bool OptForSize, unsigned UserUF) { diff --git a/test/Transforms/LoopVectorize/ARM/gcc-examples.ll b/test/Transforms/LoopVectorize/ARM/gcc-examples.ll new file mode 100644 index 0000000..6af4af6 --- /dev/null +++ b/test/Transforms/LoopVectorize/ARM/gcc-examples.ll @@ -0,0 +1,60 @@ +; RUN: opt < %s -loop-vectorize -mtriple=thumbv7-apple-ios3.0.0 -mcpu=swift -S -dce | FileCheck %s + +target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32" +target triple = "thumbv7-apple-ios3.0.0" + +@b = common global [2048 x i32] zeroinitializer, align 16 +@c = common global [2048 x i32] zeroinitializer, align 16 +@a = common global [2048 x i32] zeroinitializer, align 16 + +; Select VF = 8; +;CHECK: @example1 +;CHECK: load <4 x i32> +;CHECK: add nsw <4 x i32> +;CHECK: store <4 x i32> +;CHECK: ret void +define void @example1() nounwind uwtable ssp { + br label %1 + +; <label>:1 ; preds = %1, %0 + %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ] + %2 = getelementptr inbounds [2048 x i32]* @b, i64 0, i64 %indvars.iv + %3 = load i32* %2, align 4 + %4 = getelementptr inbounds [2048 x i32]* @c, i64 0, i64 %indvars.iv + %5 = load i32* %4, align 4 + %6 = add nsw i32 %5, %3 + %7 = getelementptr inbounds [2048 x i32]* @a, i64 0, i64 %indvars.iv + store i32 %6, i32* %7, align 4 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, 256 + br i1 %exitcond, label %8, label %1 + +; <label>:8 ; preds = %1 + ret void +} + +;CHECK: @example10b +;CHECK: load <2 x i16> +;CHECK: sext <2 x i16> +;CHECK: store <2 x i32> +;CHECK: ret void +define void @example10b(i16* noalias nocapture %sa, i16* noalias nocapture %sb, i16* noalias nocapture %sc, i32* noalias nocapture %ia, i32* noalias nocapture %ib, i32* noalias nocapture %ic) nounwind uwtable ssp { + br label %1 + +; <label>:1 ; preds = %1, %0 + %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ] + %2 = getelementptr inbounds i16* %sb, i64 %indvars.iv + %3 = load i16* %2, align 2 + %4 = sext i16 %3 to i32 + %5 = getelementptr inbounds i32* %ia, i64 %indvars.iv + store i32 %4, i32* %5, align 4 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, 1024 + br i1 %exitcond, label %6, label %1 + +; <label>:6 ; preds = %1 + ret void +} + diff --git a/test/Transforms/LoopVectorize/X86/avx1.ll b/test/Transforms/LoopVectorize/X86/avx1.ll index a2d176a..a85c6fe 100644 --- a/test/Transforms/LoopVectorize/X86/avx1.ll +++ b/test/Transforms/LoopVectorize/X86/avx1.ll @@ -27,7 +27,7 @@ define i32 @read_mod_write_single_ptr(float* nocapture %a, i32 %n) nounwind uwta ;CHECK: @read_mod_i64 -;CHECK: load <8 x i64> +;CHECK: load <4 x i64> ;CHECK: ret i32 define i32 @read_mod_i64(i64* nocapture %a, i32 %n) nounwind uwtable ssp { %1 = icmp sgt i32 %n, 0 diff --git a/test/Transforms/LoopVectorize/X86/conversion-cost.ll b/test/Transforms/LoopVectorize/X86/conversion-cost.ll index 60c742e..23d9233 100644 --- a/test/Transforms/LoopVectorize/X86/conversion-cost.ll +++ b/test/Transforms/LoopVectorize/X86/conversion-cost.ll @@ -4,7 +4,7 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3 target triple = "x86_64-apple-macosx10.8.0" ;CHECK: @conversion_cost1 -;CHECK: store <8 x i8> +;CHECK: store <32 x i8> ;CHECK: ret define i32 @conversion_cost1(i32 %n, i8* nocapture %A, float* nocapture %B) nounwind uwtable ssp { %1 = icmp sgt i32 %n, 3 |