diff options
author | Chandler Carruth <chandlerc@gmail.com> | 2013-07-26 08:20:39 +0000 |
---|---|---|
committer | Chandler Carruth <chandlerc@gmail.com> | 2013-07-26 08:20:39 +0000 |
commit | 8d93d41027b6f71b33b8da82c69766498bb1519a (patch) | |
tree | dac86b502fd820855cc69d64b4133efe284209f8 /test/Transforms/Mem2Reg | |
parent | 6ee1464ba599f1afbed502fa1b3ac18c8577fd97 (diff) | |
download | external_llvm-8d93d41027b6f71b33b8da82c69766498bb1519a.zip external_llvm-8d93d41027b6f71b33b8da82c69766498bb1519a.tar.gz external_llvm-8d93d41027b6f71b33b8da82c69766498bb1519a.tar.bz2 |
Re-implement the analysis of uses in mem2reg to be significantly more
robust. It now uses an InstVisitor and worklist to actually walk the
uses of the Alloca transitively and detect the pattern which we can
directly promote: loads & stores of the whole alloca and instructions we
can completely ignore.
Also, with this new implementation teach both the predicate for testing
whether we can promote and the promotion engine itself to use the same
code so we no longer have strange divergence between the two code paths.
I've added some silly test cases to demonstrate that we can handle
slightly more degenerate code patterns now. See below for why this
is even interesting.
Performance impact: roughly 1% regression in the performance of SROA or
ScalarRepl on a large C++-ish test case where most of the allocas are
basically ready for promotion. The reason is silly redundant
work that I've left FIXMEs for and which I'll address in the next
commit. I wanted to separate this commit as it changes the behavior.
Once the redundant work in removing the dead uses of the alloca is
fixed, this code appears to be faster than the old version. =]
So why is this useful? Because the previous requirement for promotion
required a *specific* visit pattern of the uses of the alloca to verify:
we *had* to look for no more than 1 intervening use. The end goal is to
have SROA automatically detect when an alloca is already promotable and
directly hand it to the mem2reg machinery rather than trying to
partition and rewrite it. This is a 25% or more performance improvement
for SROA, and a significant chunk of the delta between it and
ScalarRepl. To get there, we need to make mem2reg actually capable of
promoting allocas which *look* promotable to SROA without having SROA do
tons of work to massage the code into just the right form.
This is actually the tip of the iceberg. There are tremendous potential
savings we can realize here by de-duplicating work between mem2reg and
SROA.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@187191 91177308-0d34-0410-b5e6-96231b3b80d8
Diffstat (limited to 'test/Transforms/Mem2Reg')
-rw-r--r-- | test/Transforms/Mem2Reg/ignore-lifetime.ll | 26 | ||||
-rw-r--r-- | test/Transforms/Mem2Reg/use-analysis.ll | 70 |
2 files changed, 70 insertions, 26 deletions
diff --git a/test/Transforms/Mem2Reg/ignore-lifetime.ll b/test/Transforms/Mem2Reg/ignore-lifetime.ll deleted file mode 100644 index 5e4f9bf..0000000 --- a/test/Transforms/Mem2Reg/ignore-lifetime.ll +++ /dev/null @@ -1,26 +0,0 @@ -; RUN: opt -mem2reg -S -o - < %s | FileCheck %s - -declare void @llvm.lifetime.start(i64 %size, i8* nocapture %ptr) -declare void @llvm.lifetime.end(i64 %size, i8* nocapture %ptr) - -define void @test1() { -; CHECK: test1 -; CHECK-NOT: alloca - %A = alloca i32 - %B = bitcast i32* %A to i8* - call void @llvm.lifetime.start(i64 2, i8* %B) - store i32 1, i32* %A - call void @llvm.lifetime.end(i64 2, i8* %B) - ret void -} - -define void @test2() { -; CHECK: test2 -; CHECK-NOT: alloca - %A = alloca {i8, i16} - %B = getelementptr {i8, i16}* %A, i32 0, i32 0 - call void @llvm.lifetime.start(i64 2, i8* %B) - store {i8, i16} zeroinitializer, {i8, i16}* %A - call void @llvm.lifetime.end(i64 2, i8* %B) - ret void -} diff --git a/test/Transforms/Mem2Reg/use-analysis.ll b/test/Transforms/Mem2Reg/use-analysis.ll new file mode 100644 index 0000000..b08b1f1 --- /dev/null +++ b/test/Transforms/Mem2Reg/use-analysis.ll @@ -0,0 +1,70 @@ +; RUN: opt -mem2reg -S -o - < %s | FileCheck %s + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n8:16:32:64" + +declare void @llvm.lifetime.start(i64 %size, i8* nocapture %ptr) +declare void @llvm.lifetime.end(i64 %size, i8* nocapture %ptr) + +define void @test1() { +; Ensure we can look through a bitcast to i8* and the addition of lifetime +; markers. +; +; CHECK-LABEL: @test1( +; CHECK-NOT: alloca +; CHECK: ret void + + %A = alloca i32 + %B = bitcast i32* %A to i8* + call void @llvm.lifetime.start(i64 2, i8* %B) + store i32 1, i32* %A + call void @llvm.lifetime.end(i64 2, i8* %B) + ret void +} + +define void @test2() { +; Ensure we can look through a GEP to i8* and the addition of lifetime +; markers. 
+; +; CHECK-LABEL: @test2( +; CHECK-NOT: alloca +; CHECK: ret void + + %A = alloca {i8, i16} + %B = getelementptr {i8, i16}* %A, i32 0, i32 0 + call void @llvm.lifetime.start(i64 2, i8* %B) + store {i8, i16} zeroinitializer, {i8, i16}* %A + call void @llvm.lifetime.end(i64 2, i8* %B) + ret void +} + +define i32 @test3(i32 %x) { +; CHECK-LABEL: @test3( +; +; Check that we recursively walk the uses of the alloca and thus can see +; through round trip bitcasts, dead bitcasts, GEPs, multiple GEPs, and lifetime +; markers. +entry: + %a = alloca i32 +; CHECK-NOT: alloca + + %b = bitcast i32* %a to i8* + %b2 = getelementptr inbounds i8* %b, i32 0 + %b3 = getelementptr inbounds i8* %b2, i32 0 + call void @llvm.lifetime.start(i64 -1, i8* %b3) +; CHECK-NOT: call void @llvm.lifetime.start + + store i32 %x, i32* %a +; CHECK-NOT: store + + %dead = bitcast i32* %a to i4096* + %dead1 = bitcast i4096* %dead to i42* + %dead2 = getelementptr inbounds i32* %a, i32 %x +; CHECK-NOT: bitcast +; CHECK-NOT: getelementptr + + %ret = load i32* %a +; CHECK-NOT: load + + ret i32 %ret +; CHECK: ret i32 %x +} |