diff options
-rw-r--r-- | lib/Target/X86/X86ISelLowering.cpp | 9 | ||||
-rw-r--r-- | lib/Target/X86/X86InstrSSE.td | 25 | ||||
-rw-r--r-- | test/CodeGen/X86/sse-align-12.ll | 50 |
3 files changed, 71 insertions, 13 deletions
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 91da4c0..a5eb00a 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -1578,7 +1578,7 @@ static bool isUndefOrEqual(SDOperand Op, unsigned Val) { bool X86::isPSHUFDMask(SDNode *N) { assert(N->getOpcode() == ISD::BUILD_VECTOR); - if (N->getNumOperands() != 4) + if (N->getNumOperands() != 2 && N->getNumOperands() != 4) return false; // Check if the value doesn't reference the second vector. @@ -1586,7 +1586,7 @@ bool X86::isPSHUFDMask(SDNode *N) { SDOperand Arg = N->getOperand(i); if (Arg.getOpcode() == ISD::UNDEF) continue; assert(isa<ConstantSDNode>(Arg) && "Invalid VECTOR_SHUFFLE mask!"); - if (cast<ConstantSDNode>(Arg)->getValue() >= 4) + if (cast<ConstantSDNode>(Arg)->getValue() >= e) return false; } @@ -2767,7 +2767,10 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDOperand Op, SelectionDAG &DAG) { // If VT is integer, try PSHUF* first, then SHUFP*. if (MVT::isInteger(VT)) { - if (X86::isPSHUFDMask(PermMask.Val) || + // MMX doesn't have PSHUFD; it does have PSHUFW. While it's theoretically + // possible to shuffle a v2i32 using PSHUFW, that's not yet implemented. + if (((MVT::getSizeInBits(VT) != 64 || NumElems == 4) && + X86::isPSHUFDMask(PermMask.Val)) || X86::isPSHUFHWMask(PermMask.Val) || X86::isPSHUFLWMask(PermMask.Val)) { if (V2.getOpcode() != ISD::UNDEF) diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td index 63065c9..7ed69ea 100644 --- a/lib/Target/X86/X86InstrSSE.td +++ b/lib/Target/X86/X86InstrSSE.td @@ -808,7 +808,7 @@ let isTwoAddress = 1 in { "shufps\t{$src3, $src2, $dst|$dst, $src2, $src3}", [(set VR128:$dst, (v4f32 (vector_shuffle - VR128:$src1, (load addr:$src2), + VR128:$src1, (memopv4f32 addr:$src2), SHUFP_shuffle_mask:$src3)))]>; let AddedComplexity = 10 in { @@ -824,7 +824,7 @@ let isTwoAddress = 1 in { "unpckhps\t{$src2, $dst|$dst, $src2}", [(set VR128:$dst, (v4f32 (vector_shuffle - VR128:$src1, (load addr:$src2), + VR128:$src1, (memopv4f32 addr:$src2), UNPCKH_shuffle_mask)))]>; def UNPCKLPSrr : PSI<0x14, MRMSrcReg, @@ -839,7 +839,7 @@ let isTwoAddress = 1 in { "unpcklps\t{$src2, $dst|$dst, $src2}", [(set VR128:$dst, (v4f32 (vector_shuffle - VR128:$src1, (load addr:$src2), + VR128:$src1, (memopv4f32 addr:$src2), UNPCKL_shuffle_mask)))]>; } // AddedComplexity } // isTwoAddress @@ -1561,7 +1561,7 @@ let isTwoAddress = 1 in { "shufpd\t{$src3, $src2, $dst|$dst, $src2, $src3}", [(set VR128:$dst, (v2f64 (vector_shuffle - VR128:$src1, (load addr:$src2), + VR128:$src1, (memopv2f64 addr:$src2), SHUFP_shuffle_mask:$src3)))]>; let AddedComplexity = 10 in { @@ -1577,7 +1577,7 @@ let isTwoAddress = 1 in { "unpckhpd\t{$src2, $dst|$dst, $src2}", [(set VR128:$dst, (v2f64 (vector_shuffle - VR128:$src1, (load addr:$src2), + VR128:$src1, (memopv2f64 addr:$src2), UNPCKH_shuffle_mask)))]>; def UNPCKLPDrr : PDI<0x14, MRMSrcReg, @@ -1592,7 +1592,7 @@ let isTwoAddress = 1 in { "unpcklpd\t{$src2, $dst|$dst, $src2}", [(set VR128:$dst, (v2f64 (vector_shuffle - VR128:$src1, (load addr:$src2), + VR128:$src1, (memopv2f64 addr:$src2), UNPCKL_shuffle_mask)))]>; } // AddedComplexity } // isTwoAddress @@ -1782,7 +1782,7 @@ let isTwoAddress = 1 in { (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2), "pandn\t{$src2, $dst|$dst, $src2}", [(set VR128:$dst, (v2i64 (and (vnot VR128:$src1), - (load addr:$src2))))]>; + (memopv2i64 addr:$src2))))]>; } // SSE2 Integer comparison @@ -2419,6 +2419,11 @@ def : Pat<(vector_shuffle (v4f32 VR128:$src1), (undef), SHUFP_unary_shuffle_mask:$sm), (SHUFPSrri VR128:$src1, VR128:$src1, SHUFP_unary_shuffle_mask:$sm)>, Requires<[HasSSE1]>; +// Special unary SHUFPDrri case. +def : Pat<(vector_shuffle (v2f64 VR128:$src1), (undef), + SHUFP_unary_shuffle_mask:$sm), + (SHUFPDrri VR128:$src1, VR128:$src1, SHUFP_unary_shuffle_mask:$sm)>, + Requires<[HasSSE2]>; // Unary v4f32 shuffle with PSHUF* in order to fold a load. def : Pat<(vector_shuffle (memopv4f32 addr:$src1), (undef), SHUFP_unary_shuffle_mask:$sm), @@ -2583,13 +2588,13 @@ def : Pat<(v2i64 (and (xor VR128:$src1, (bc_v2i64 (v16i8 immAllOnesV))), (PANDNrr VR128:$src1, VR128:$src2)>, Requires<[HasSSE2]>; def : Pat<(v2i64 (and (xor VR128:$src1, (bc_v2i64 (v4i32 immAllOnesV))), - (load addr:$src2))), + (memopv2i64 addr:$src2))), (PANDNrm VR128:$src1, addr:$src2)>, Requires<[HasSSE2]>; def : Pat<(v2i64 (and (xor VR128:$src1, (bc_v2i64 (v8i16 immAllOnesV))), - (load addr:$src2))), + (memopv2i64 addr:$src2))), (PANDNrm VR128:$src1, addr:$src2)>, Requires<[HasSSE2]>; def : Pat<(v2i64 (and (xor VR128:$src1, (bc_v2i64 (v16i8 immAllOnesV))), - (load addr:$src2))), + (memopv2i64 addr:$src2))), (PANDNrm VR128:$src1, addr:$src2)>, Requires<[HasSSE2]>; // Use movaps / movups for SSE integer load / store (one byte shorter). diff --git a/test/CodeGen/X86/sse-align-12.ll b/test/CodeGen/X86/sse-align-12.ll new file mode 100644 index 0000000..731d429 --- /dev/null +++ b/test/CodeGen/X86/sse-align-12.ll @@ -0,0 +1,50 @@ +; RUN: llvm-as < %s | llc -march=x86-64 | grep unpck | wc -l | grep 2 +; RUN: llvm-as < %s | llc -march=x86-64 | grep shuf | wc -l | grep 2 +; RUN: llvm-as < %s | llc -march=x86-64 | grep ps | wc -l | grep 4 +; RUN: llvm-as < %s | llc -march=x86-64 | grep pd | wc -l | grep 4 +; RUN: llvm-as < %s | llc -march=x86-64 | grep movup | wc -l | grep 4 + +define <4 x float> @a(<4 x float>* %y) +{ + %x = load <4 x float>* %y, align 4 + %a = extractelement <4 x float> %x, i32 0 + %b = extractelement <4 x float> %x, i32 1 + %c = extractelement <4 x float> %x, i32 2 + %d = extractelement <4 x float> %x, i32 3 + %p = insertelement <4 x float> undef, float %d, i32 0 + %q = insertelement <4 x float> %p, float %c, i32 1 + %r = insertelement <4 x float> %q, float %b, i32 2 + %s = insertelement <4 x float> %r, float %a, i32 3 + ret <4 x float> %s +} +define <4 x float> @b(<4 x float>* %y, <4 x float> %z) +{ + %x = load <4 x float>* %y, align 4 + %a = extractelement <4 x float> %x, i32 2 + %b = extractelement <4 x float> %x, i32 3 + %c = extractelement <4 x float> %z, i32 2 + %d = extractelement <4 x float> %z, i32 3 + %p = insertelement <4 x float> undef, float %c, i32 0 + %q = insertelement <4 x float> %p, float %a, i32 1 + %r = insertelement <4 x float> %q, float %d, i32 2 + %s = insertelement <4 x float> %r, float %b, i32 3 + ret <4 x float> %s +} +define <2 x double> @c(<2 x double>* %y) +{ + %x = load <2 x double>* %y, align 8 + %a = extractelement <2 x double> %x, i32 0 + %c = extractelement <2 x double> %x, i32 1 + %p = insertelement <2 x double> undef, double %c, i32 0 + %r = insertelement <2 x double> %p, double %a, i32 1 + ret <2 x double> %r +} +define <2 x double> @d(<2 x double>* %y, <2 x double> %z) +{ + %x = load <2 x double>* %y, align 8 + %a = extractelement <2 x double> %x, i32 1 + %c = extractelement <2 x double> %z, i32 1 + %p = insertelement <2 x double> undef, double %c, i32 0 + %r = insertelement <2 x double> %p, double %a, i32 1 + ret <2 x double> %r +} |