1 files changed, 52 insertions, 0 deletions
diff --git a/test/CodeGen/X86/insertps-O0-bug.ll b/test/CodeGen/X86/insertps-O0-bug.ll
new file mode 100644
index 0000000..e89ac26
--- /dev/null
+++ b/test/CodeGen/X86/insertps-O0-bug.ll
@@ -0,0 +1,52 @@
+; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 -O0 < %s | FileCheck %s
+
+; Check that at -O0, the backend doesn't attempt to canonicalize a vector load
+; used by an INSERTPS into a scalar load plus scalar_to_vector.
+;
+; In order to fold a load into the memory operand of an INSERTPSrm, the backend
+; tries to canonicalize a vector load in input to an INSERTPS node into a
+; scalar load plus scalar_to_vector. This would allow ISel to match the
+; INSERTPSrm variant rather than a load plus INSERTPSrr.
+;
+; However, ISel can only select an INSERTPSrm if folding a load into the operand
+; of an insertps is considered to be profitable.
+;
+; In the example below:
+;
+; __m128 test(__m128 a, __m128 *b) {
+;   __m128 c = _mm_insert_ps(a, *b, 1 << 6);
+;   return c;
+; }
+;
+; At -O0, the backend would attempt to canonicalize the load to 'b' into
+; a scalar load in the hope of matching an INSERTPSrm.
+; However, ISel would fail to recognize an INSERTPSrm since load folding is
+; always considered unprofitable at -O0. This would leave the insertps mask
+; in an invalid state.
+;
+; The problem with the canonicalization rule performed by the backend is that
+; it assumes ISel to always be able to match an INSERTPSrm. This assumption is
+; not always correct at -O0. In this example, FastISel fails to lower the
+; arguments needed by the entry block. This is enough to enable the DAGCombiner
+; and eventually trigger the canonicalization on the INSERTPS node.
+;
+; This test checks that the vector load in input to the insertps is not
+; canonicalized into a scalar load plus scalar_to_vector (a movss).
+
+define <4 x float> @test(<4 x float> %a, <4 x float>* %b) {
+; CHECK-LABEL: test:
+; CHECK: movaps (%rdi), [[REG:%[a-z0-9]+]]
+; CHECK-NOT: movss
+; CHECK: insertps $64, [[REG]],
+; CHECK: ret
+entry:
+  %0 = load <4 x float>* %b, align 16
+  %1 = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %0, i32 64)
+  %2 = alloca <4 x float>, align 16
+  store <4 x float> %1, <4 x float>* %2, align 16
+  %3 = load <4 x float>* %2, align 16
+  ret <4 x float> %3
+}
+
+
+declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i32)