diff options
Diffstat (limited to 'test/CodeGen/X86/insertps-O0-bug.ll')
-rw-r--r-- | test/CodeGen/X86/insertps-O0-bug.ll | 52 |
1 files changed, 52 insertions, 0 deletions
diff --git a/test/CodeGen/X86/insertps-O0-bug.ll b/test/CodeGen/X86/insertps-O0-bug.ll new file mode 100644 index 0000000..e89ac26 --- /dev/null +++ b/test/CodeGen/X86/insertps-O0-bug.ll @@ -0,0 +1,52 @@ +; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 -O0 < %s | FileCheck %s + +; Check that at -O0, the backend doesn't attempt to canonicalize a vector load +; used by an INSERTPS into a scalar load plus scalar_to_vector. +; +; In order to fold a load into the memory operand of an INSERTPSrm, the backend +; tries to canonicalize a vector load in input to an INSERTPS node into a +; scalar load plus scalar_to_vector. This would allow ISel to match the +; INSERTPSrm variant rather than a load plus INSERTPSrr. +; +; However, ISel can only select an INSERTPSrm if folding a load into the operand +; of an insertps is considered to be profitable. +; +; In the example below: +; +; __m128 test(__m128 a, __m128 *b) { +; __m128 c = _mm_insert_ps(a, *b, 1 << 6); +; return c; +; } +; +; At -O0, the backend would attempt to canonicalize the load to 'b' into +; a scalar load in the hope of matching an INSERTPSrm. +; However, ISel would fail to recognize an INSERTPSrm since load folding is +; always considered unprofitable at -O0. This would leave the insertps mask +; in an invalid state. +; +; The problem with the canonicalization rule performed by the backend is that +; it assumes ISel to always be able to match an INSERTPSrm. This assumption is +; not always correct at -O0. In this example, FastISel fails to lower the +; arguments needed by the entry block. This is enough to enable the DAGCombiner +; and eventually trigger the canonicalization on the INSERTPS node. +; +; This test checks that the vector load in input to the insertps is not +; canonicalized into a scalar load plus scalar_to_vector (a movss). + +define <4 x float> @test(<4 x float> %a, <4 x float>* %b) { +; CHECK-LABEL: test: +; CHECK: movaps (%rdi), [[REG:%[a-z0-9]+]] +; CHECK-NOT: movss +; CHECK: insertps $64, [[REG]], +; CHECK: ret +entry: + %0 = load <4 x float>* %b, align 16 + %1 = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %0, i32 64) + %2 = alloca <4 x float>, align 16 + store <4 x float> %1, <4 x float>* %2, align 16 + %3 = load <4 x float>* %2, align 16 + ret <4 x float> %3 +} + + +declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i32) |