diff options
author | Arnold Schwaighofer <aschwaighofer@apple.com> | 2013-11-01 03:05:07 +0000 |
---|---|---|
committer | Arnold Schwaighofer <aschwaighofer@apple.com> | 2013-11-01 03:05:07 +0000 |
commit | 0097e155025767c11790912dcf780f82dffaffb1 (patch) | |
tree | bd5e660c6fe3dcd9d182c532fe9ff3e506d1851d | |
parent | d272a1223314a69e4678816feeff2cfb3e740f8f (diff) | |
download | external_llvm-0097e155025767c11790912dcf780f82dffaffb1.zip external_llvm-0097e155025767c11790912dcf780f82dffaffb1.tar.gz external_llvm-0097e155025767c11790912dcf780f82dffaffb1.tar.bz2 |
LoopVectorizer: If dependency checks fail try runtime checks
When a dependence check fails we can still try to vectorize loops with runtime
array bounds checks.
This helps linpack to vectorize a loop in dgefa. And we are back to 2x of the
scalar performance on a corei7-avx.
radar://15339680
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@193853 91177308-0d34-0410-b5e6-96231b3b80d8
-rw-r--r-- | lib/Transforms/Vectorize/LoopVectorize.cpp | 52 | ||||
-rw-r--r-- | test/Transforms/LoopVectorize/runtime-check.ll | 28 |
2 files changed, 75 insertions, 5 deletions
diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index e972326..f18707c 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -3061,7 +3061,7 @@ public: /// non-intersection. bool canCheckPtrAtRT(LoopVectorizationLegality::RuntimePointerCheck &RtCheck, unsigned &NumComparisons, ScalarEvolution *SE, - Loop *TheLoop); + Loop *TheLoop, bool ShouldCheckStride = false); /// \brief Goes over all memory accesses, checks whether a RT check is needed /// and builds sets of dependent accesses. @@ -3075,6 +3075,7 @@ public: bool isRTCheckNeeded() { return IsRTCheckNeeded; } bool isDependencyCheckNeeded() { return !CheckDeps.empty(); } + void resetDepChecks() { CheckDeps.clear(); } MemAccessInfoSet &getDependenciesToCheck() { return CheckDeps; } @@ -3129,10 +3130,15 @@ static bool hasComputableBounds(ScalarEvolution *SE, Value *Ptr) { return AR->isAffine(); } +/// \brief Check the stride of the pointer and ensure that it does not wrap in +/// the address space. +static int isStridedPtr(ScalarEvolution *SE, DataLayout *DL, Value *Ptr, + const Loop *Lp); + bool AccessAnalysis::canCheckPtrAtRT( LoopVectorizationLegality::RuntimePointerCheck &RtCheck, unsigned &NumComparisons, ScalarEvolution *SE, - Loop *TheLoop) { + Loop *TheLoop, bool ShouldCheckStride) { // Find pointers with computable bounds. We are going to use this information // to place a runtime bound check. unsigned NumReadPtrChecks = 0; @@ -3160,7 +3166,10 @@ bool AccessAnalysis::canCheckPtrAtRT( else ++NumReadPtrChecks; - if (hasComputableBounds(SE, Ptr)) { + if (hasComputableBounds(SE, Ptr) && + // When we run after a failing dependency check we have to make sure we + // don't have wrapping pointers. + (!ShouldCheckStride || isStridedPtr(SE, DL, Ptr, TheLoop) == 1)) { // The id of the dependence set. unsigned DepId; @@ -3342,8 +3351,9 @@ public: typedef PointerIntPair<Value *, 1, bool> MemAccessInfo; typedef SmallPtrSet<MemAccessInfo, 8> MemAccessInfoSet; - MemoryDepChecker(ScalarEvolution *Se, DataLayout *Dl, const Loop *L) : - SE(Se), DL(Dl), InnermostLoop(L), AccessIdx(0) {} + MemoryDepChecker(ScalarEvolution *Se, DataLayout *Dl, const Loop *L) + : SE(Se), DL(Dl), InnermostLoop(L), AccessIdx(0), + ShouldRetryWithRuntimeCheck(false) {} /// \brief Register the location (instructions are given increasing numbers) /// of a write access. @@ -3373,6 +3383,10 @@ public: /// the accesses safely with. unsigned getMaxSafeDepDistBytes() { return MaxSafeDepDistBytes; } + /// \brief In same cases when the dependency check fails we can still + /// vectorize the loop with a dynamic array access check. + bool shouldRetryWithRuntimeCheck() { return ShouldRetryWithRuntimeCheck; } + private: ScalarEvolution *SE; DataLayout *DL; @@ -3390,6 +3404,10 @@ private: // We can access this many bytes in parallel safely. unsigned MaxSafeDepDistBytes; + /// \brief If we see a non constant dependence distance we can still try to + /// vectorize this loop with runtime checks. + bool ShouldRetryWithRuntimeCheck; + /// \brief Check whether there is a plausible dependence between the two /// accesses. /// @@ -3587,6 +3605,7 @@ bool MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx, const SCEVConstant *C = dyn_cast<SCEVConstant>(Dist); if (!C) { DEBUG(dbgs() << "LV: Dependence because of non constant distance\n"); + ShouldRetryWithRuntimeCheck = true; return true; } @@ -3876,6 +3895,29 @@ bool LoopVectorizationLegality::canVectorizeMemory() { CanVecMem = DepChecker.areDepsSafe(DependentAccesses, Accesses.getDependenciesToCheck()); MaxSafeDepDistBytes = DepChecker.getMaxSafeDepDistBytes(); + + if (!CanVecMem && DepChecker.shouldRetryWithRuntimeCheck()) { + DEBUG(dbgs() << "LV: Retrying with memory checks\n"); + NeedRTCheck = true; + + // Clear the dependency checks. We assume they are not needed. + Accesses.resetDepChecks(); + + PtrRtCheck.reset(); + PtrRtCheck.Need = true; + + CanDoRT = Accesses.canCheckPtrAtRT(PtrRtCheck, NumComparisons, SE, + TheLoop, true); + // Check that we did not collect too many pointers or found an unsizeable + // pointer. + if (!CanDoRT || NumComparisons > RuntimeMemoryCheckThreshold) { + DEBUG(dbgs() << "LV: Can't vectorize with memory checks\n"); + PtrRtCheck.reset(); + return false; + } + + CanVecMem = true; + } } DEBUG(dbgs() << "LV: We" << (NeedRTCheck ? "" : " don't") << diff --git a/test/Transforms/LoopVectorize/runtime-check.ll b/test/Transforms/LoopVectorize/runtime-check.ll index 4772256..d15479d 100644 --- a/test/Transforms/LoopVectorize/runtime-check.ll +++ b/test/Transforms/LoopVectorize/runtime-check.ll @@ -34,3 +34,31 @@ for.body: ; preds = %entry, %for.body for.end: ; preds = %for.body, %entry ret i32 undef } + +; Make sure that we try to vectorize loops with a runtime check if the +; dependency check fails. + +; CHECK-LABEL: test_runtime_check +; CHECK: <4 x float> +define void @test_runtime_check(float* %a, float %b, i64 %offset, i64 %offset2, i64 %n) { +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %ind.sum = add i64 %iv, %offset + %arr.idx = getelementptr inbounds float* %a, i64 %ind.sum + %l1 = load float* %arr.idx, align 4 + %ind.sum2 = add i64 %iv, %offset2 + %arr.idx2 = getelementptr inbounds float* %a, i64 %ind.sum2 + %l2 = load float* %arr.idx2, align 4 + %m = fmul fast float %b, %l2 + %ad = fadd fast float %l1, %m + store float %ad, float* %arr.idx, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, %n + br i1 %exitcond, label %loopexit, label %for.body + +loopexit: + ret void +} |