Diffstat (limited to 'test/Transforms/EarlyCSE')
-rw-r--r-- | test/Transforms/EarlyCSE/AArch64/intrinsics.ll | 232
-rw-r--r-- | test/Transforms/EarlyCSE/AArch64/lit.local.cfg |   5
-rw-r--r-- | test/Transforms/EarlyCSE/basic.ll              |  12
3 files changed, 248 insertions, 1 deletion
diff --git a/test/Transforms/EarlyCSE/AArch64/intrinsics.ll b/test/Transforms/EarlyCSE/AArch64/intrinsics.ll
new file mode 100644
index 0000000..d166ff1
--- /dev/null
+++ b/test/Transforms/EarlyCSE/AArch64/intrinsics.ll
@@ -0,0 +1,232 @@
+; RUN: opt < %s -S -mtriple=aarch64-none-linux-gnu -mattr=+neon -early-cse | FileCheck %s
+; RUN: opt < %s -S -mtriple=aarch64-none-linux-gnu -mattr=+neon -passes=early-cse | FileCheck %s
+
+define <4 x i32> @test_cse(i32* %a, [2 x <4 x i32>] %s.coerce, i32 %n) {
+entry:
+; Check that @llvm.aarch64.neon.ld2 is optimized away by Early CSE.
+; CHECK-LABEL: @test_cse
+; CHECK-NOT: call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8
+  %s.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %s.coerce, 0
+  %s.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %s.coerce, 1
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.body, %entry
+  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %res.0 = phi <4 x i32> [ undef, %entry ], [ %call, %for.body ]
+  %cmp = icmp slt i32 %i.0, %n
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %0 = bitcast i32* %a to i8*
+  %1 = bitcast <4 x i32> %s.coerce.fca.0.extract to <16 x i8>
+  %2 = bitcast <4 x i32> %s.coerce.fca.1.extract to <16 x i8>
+  %3 = bitcast <16 x i8> %1 to <4 x i32>
+  %4 = bitcast <16 x i8> %2 to <4 x i32>
+  call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> %3, <4 x i32> %4, i8* %0)
+  %5 = bitcast i32* %a to i8*
+  %vld2 = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8(i8* %5)
+  %vld2.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 0
+  %vld2.fca.1.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 1
+  %call = call <4 x i32> @vaddq_s32(<4 x i32> %vld2.fca.0.extract, <4 x i32> %vld2.fca.0.extract)
+  %inc = add nsw i32 %i.0, 1
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  ret <4 x i32> %res.0
+}
+
+define <4 x i32> @test_cse2(i32* %a, [2 x <4 x i32>] %s.coerce, i32 %n) {
+entry:
+; Check that the first @llvm.aarch64.neon.st2 is optimized away by Early CSE.
+; CHECK-LABEL: @test_cse2
+; CHECK-NOT: call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> %3, <4 x i32> %3, i8* %0)
+; CHECK: call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> %3, <4 x i32> %4, i8* %0)
+  %s.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %s.coerce, 0
+  %s.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %s.coerce, 1
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.body, %entry
+  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %res.0 = phi <4 x i32> [ undef, %entry ], [ %call, %for.body ]
+  %cmp = icmp slt i32 %i.0, %n
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %0 = bitcast i32* %a to i8*
+  %1 = bitcast <4 x i32> %s.coerce.fca.0.extract to <16 x i8>
+  %2 = bitcast <4 x i32> %s.coerce.fca.1.extract to <16 x i8>
+  %3 = bitcast <16 x i8> %1 to <4 x i32>
+  %4 = bitcast <16 x i8> %2 to <4 x i32>
+  call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> %3, <4 x i32> %3, i8* %0)
+  call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> %3, <4 x i32> %4, i8* %0)
+  %5 = bitcast i32* %a to i8*
+  %vld2 = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8(i8* %5)
+  %vld2.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 0
+  %vld2.fca.1.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 1
+  %call = call <4 x i32> @vaddq_s32(<4 x i32> %vld2.fca.0.extract, <4 x i32> %vld2.fca.0.extract)
+  %inc = add nsw i32 %i.0, 1
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  ret <4 x i32> %res.0
+}
+
+define <4 x i32> @test_cse3(i32* %a, [2 x <4 x i32>] %s.coerce, i32 %n) #0 {
+entry:
+; Check that the first @llvm.aarch64.neon.ld2 is optimized away by Early CSE.
+; CHECK-LABEL: @test_cse3
+; CHECK: call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8
+; CHECK-NOT: call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8
+  %s.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %s.coerce, 0
+  %s.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %s.coerce, 1
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.body, %entry
+  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %res.0 = phi <4 x i32> [ undef, %entry ], [ %call, %for.body ]
+  %cmp = icmp slt i32 %i.0, %n
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %0 = bitcast i32* %a to i8*
+  %vld2 = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8(i8* %0)
+  %vld2.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 0
+  %vld2.fca.1.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 1
+  %1 = bitcast i32* %a to i8*
+  %vld22 = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8(i8* %1)
+  %vld22.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld22, 0
+  %vld22.fca.1.extract = extractvalue { <4 x i32>, <4 x i32> } %vld22, 1
+  %call = call <4 x i32> @vaddq_s32(<4 x i32> %vld2.fca.0.extract, <4 x i32> %vld22.fca.0.extract)
+  %inc = add nsw i32 %i.0, 1
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  ret <4 x i32> %res.0
+}
+
+
+define <4 x i32> @test_nocse(i32* %a, i32* %b, [2 x <4 x i32>] %s.coerce, i32 %n) {
+entry:
+; Check that the store prevents @llvm.aarch64.neon.ld2 from being optimized
+; away by Early CSE.
+; CHECK-LABEL: @test_nocse
+; CHECK: call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8
+  %s.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %s.coerce, 0
+  %s.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %s.coerce, 1
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.body, %entry
+  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %res.0 = phi <4 x i32> [ undef, %entry ], [ %call, %for.body ]
+  %cmp = icmp slt i32 %i.0, %n
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %0 = bitcast i32* %a to i8*
+  %1 = bitcast <4 x i32> %s.coerce.fca.0.extract to <16 x i8>
+  %2 = bitcast <4 x i32> %s.coerce.fca.1.extract to <16 x i8>
+  %3 = bitcast <16 x i8> %1 to <4 x i32>
+  %4 = bitcast <16 x i8> %2 to <4 x i32>
+  call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> %3, <4 x i32> %4, i8* %0)
+  store i32 0, i32* %b, align 4
+  %5 = bitcast i32* %a to i8*
+  %vld2 = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8(i8* %5)
+  %vld2.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 0
+  %vld2.fca.1.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 1
+  %call = call <4 x i32> @vaddq_s32(<4 x i32> %vld2.fca.0.extract, <4 x i32> %vld2.fca.0.extract)
+  %inc = add nsw i32 %i.0, 1
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  ret <4 x i32> %res.0
+}
+
+define <4 x i32> @test_nocse2(i32* %a, [2 x <4 x i32>] %s.coerce, i32 %n) {
+entry:
+; Check that @llvm.aarch64.neon.ld3 is not optimized away by Early CSE due
+; to mismatch between st2 and ld3.
+; CHECK-LABEL: @test_nocse2
+; CHECK: call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0i8
+  %s.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %s.coerce, 0
+  %s.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %s.coerce, 1
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.body, %entry
+  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %res.0 = phi <4 x i32> [ undef, %entry ], [ %call, %for.body ]
+  %cmp = icmp slt i32 %i.0, %n
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %0 = bitcast i32* %a to i8*
+  %1 = bitcast <4 x i32> %s.coerce.fca.0.extract to <16 x i8>
+  %2 = bitcast <4 x i32> %s.coerce.fca.1.extract to <16 x i8>
+  %3 = bitcast <16 x i8> %1 to <4 x i32>
+  %4 = bitcast <16 x i8> %2 to <4 x i32>
+  call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> %3, <4 x i32> %4, i8* %0)
+  %5 = bitcast i32* %a to i8*
+  %vld3 = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0i8(i8* %5)
+  %vld3.fca.0.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3, 0
+  %vld3.fca.2.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3, 2
+  %call = call <4 x i32> @vaddq_s32(<4 x i32> %vld3.fca.0.extract, <4 x i32> %vld3.fca.2.extract)
+  %inc = add nsw i32 %i.0, 1
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  ret <4 x i32> %res.0
+}
+
+define <4 x i32> @test_nocse3(i32* %a, [2 x <4 x i32>] %s.coerce, i32 %n) {
+entry:
+; Check that @llvm.aarch64.neon.st3 is not optimized away by Early CSE due to
+; mismatch between st2 and st3.
+; CHECK-LABEL: @test_nocse3
+; CHECK: call void @llvm.aarch64.neon.st3.v4i32.p0i8
+; CHECK: call void @llvm.aarch64.neon.st2.v4i32.p0i8
+  %s.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %s.coerce, 0
+  %s.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %s.coerce, 1
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.body, %entry
+  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %res.0 = phi <4 x i32> [ undef, %entry ], [ %call, %for.body ]
+  %cmp = icmp slt i32 %i.0, %n
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %0 = bitcast i32* %a to i8*
+  %1 = bitcast <4 x i32> %s.coerce.fca.0.extract to <16 x i8>
+  %2 = bitcast <4 x i32> %s.coerce.fca.1.extract to <16 x i8>
+  %3 = bitcast <16 x i8> %1 to <4 x i32>
+  %4 = bitcast <16 x i8> %2 to <4 x i32>
+  call void @llvm.aarch64.neon.st3.v4i32.p0i8(<4 x i32> %4, <4 x i32> %3, <4 x i32> %3, i8* %0)
+  call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> %3, <4 x i32> %3, i8* %0)
+  %5 = bitcast i32* %a to i8*
+  %vld3 = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0i8(i8* %5)
+  %vld3.fca.0.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3, 0
+  %vld3.fca.1.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3, 1
+  %call = call <4 x i32> @vaddq_s32(<4 x i32> %vld3.fca.0.extract, <4 x i32> %vld3.fca.0.extract)
+  %inc = add nsw i32 %i.0, 1
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  ret <4 x i32> %res.0
+}
+
+; Function Attrs: nounwind
+declare void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32>, <4 x i32>, i8* nocapture)
+
+; Function Attrs: nounwind
+declare void @llvm.aarch64.neon.st3.v4i32.p0i8(<4 x i32>, <4 x i32>, <4 x i32>, i8* nocapture)
+
+; Function Attrs: nounwind readonly
+declare { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8(i8*)
+
+; Function Attrs: nounwind readonly
+declare { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0i8(i8*)
+
+define internal fastcc <4 x i32> @vaddq_s32(<4 x i32> %__p0, <4 x i32> %__p1) {
+entry:
+  %add = add <4 x i32> %__p0, %__p1
+  ret <4 x i32> %add
+}
diff --git a/test/Transforms/EarlyCSE/AArch64/lit.local.cfg b/test/Transforms/EarlyCSE/AArch64/lit.local.cfg
new file mode 100644
index 0000000..6642d28
--- /dev/null
+++ b/test/Transforms/EarlyCSE/AArch64/lit.local.cfg
@@ -0,0 +1,5 @@
+config.suffixes = ['.ll']
+
+targets = set(config.root.targets_to_build.split())
+if not 'AArch64' in targets:
+    config.unsupported = True
diff --git a/test/Transforms/EarlyCSE/basic.ll b/test/Transforms/EarlyCSE/basic.ll
index 155d36f..a36a103 100644
--- a/test/Transforms/EarlyCSE/basic.ll
+++ b/test/Transforms/EarlyCSE/basic.ll
@@ -1,4 +1,5 @@
 ; RUN: opt < %s -S -early-cse | FileCheck %s
+; RUN: opt < %s -S -passes=early-cse | FileCheck %s
 
 declare void @llvm.assume(i1) nounwind
 
@@ -192,4 +193,13 @@ define void @test11(i32 *%P) {
 ; CHECK-NEXT: ret void
 }
 
-
+; CHECK-LABEL: @test12(
+define i32 @test12(i1 %B, i32* %P1, i32* %P2) {
+  %load0 = load i32* %P1
+  %1 = load atomic i32* %P2 seq_cst, align 4
+  %load1 = load i32* %P1
+  %sel = select i1 %B, i32 %load0, i32 %load1
+  ret i32 %sel
+  ; CHECK: load i32* %P1
+  ; CHECK: load i32* %P1
+}
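For reference, here is a minimal sketch (not part of the patch) of the forwarding pattern that @test_cse exercises: a matching st2/ld2 pair on the same pointer is treated like an ordinary store/load pair, so the ld2 result can be rewritten in terms of the values just stored. The function name @st2_ld2_forward is made up for illustration; the intrinsic declarations are the same ones used in intrinsics.ll above.

; Same RUN lines as intrinsics.ll would apply here.
declare void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32>, <4 x i32>, i8* nocapture)
declare { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8(i8*)

define <4 x i32> @st2_ld2_forward(i8* %p, <4 x i32> %a, <4 x i32> %b) {
entry:
  ; Interleave %a and %b into memory at %p.
  call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> %a, <4 x i32> %b, i8* %p)
  ; The ld2 reads back exactly what the st2 above wrote (no intervening write),
  ; so with this change -early-cse is expected to remove it and use %a directly,
  ; as the CHECK-NOT line in @test_cse verifies for the loop version.
  %ld = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8(i8* %p)
  %vec0 = extractvalue { <4 x i32>, <4 x i32> } %ld, 0
  ret <4 x i32> %vec0
}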