From 73c504af9d86a426532ee32c5d07a4b872794675 Mon Sep 17 00:00:00 2001 From: Elena Demikhovsky Date: Sun, 15 Apr 2012 11:18:59 +0000 Subject: Added VPERM optimization for AVX2 shuffles git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@154761 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/X86ISelLowering.cpp | 41 +++++++++++++++++++++++++++++++++ lib/Target/X86/X86ISelLowering.h | 4 ++++ lib/Target/X86/X86InstrFragmentsSIMD.td | 4 ++++ lib/Target/X86/X86InstrInfo.cpp | 4 ++-- lib/Target/X86/X86InstrSSE.td | 27 ++++++++++++++++++++-- test/CodeGen/X86/avx2-vperm.ll | 34 +++++++++++++++++++++++++++ 6 files changed, 110 insertions(+), 4 deletions(-) create mode 100755 test/CodeGen/X86/avx2-vperm.ll diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 1da9405..d618889 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -2935,6 +2935,8 @@ static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT, case X86ISD::PSHUFHW: case X86ISD::PSHUFLW: case X86ISD::VPERMILP: + case X86ISD::VPERMQ: + case X86ISD::VPERMPD: return DAG.getNode(Opc, dl, VT, V1, DAG.getConstant(TargetMask, MVT::i8)); } } @@ -3976,6 +3978,27 @@ unsigned X86::getInsertVINSERTF128Immediate(SDNode *N) { return Index / NumElemsPerChunk; } +/// getShuffleCLImmediate - Return the appropriate immediate to shuffle +/// the specified VECTOR_SHUFFLE mask with VPERMQ and VPERMPD instructions. +/// Handles 256-bit. +static unsigned getShuffleCLImmediate(ShuffleVectorSDNode *N) { + EVT VT = N->getValueType(0); + + assert((VT.is256BitVector() && VT.getVectorNumElements() == 4) && + "Unsupported vector type for VPERMQ/VPERMPD"); + + unsigned NumElts = VT.getVectorNumElements(); + + unsigned Mask = 0; + for (unsigned i = 0; i != NumElts; ++i) { + int Elt = N->getMaskElt(i); + if (Elt < 0) + continue; + Mask |= Elt << (i*2); + } + + return Mask; +} /// isZeroNode - Returns true if Elt is a constant zero or a floating point /// constant +0.0. bool X86::isZeroNode(SDValue Elt) { @@ -6627,6 +6650,20 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { SDValue BlendOp = LowerVECTOR_SHUFFLEtoBlend(Op, Subtarget, DAG); if (BlendOp.getNode()) return BlendOp; + if (V2IsUndef && HasAVX2 && (VT == MVT::v8i32 || VT == MVT::v8f32)) { + SmallVector permclMask; + for (unsigned i = 0; i != 8; ++i) { + permclMask.push_back(DAG.getConstant((M[i] >= 0)?M[i]:0x80, MVT::i32)); + } + return DAG.getNode(VT.isInteger()? X86ISD::VPERMD:X86ISD::VPERMPS, dl, VT, + DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, + &permclMask[0], 8), V1); + + } + if (V2IsUndef && HasAVX2 && (VT == MVT::v4i64 || VT == MVT::v4f64)) + return getTargetShuffleNode(VT.isInteger()? X86ISD::VPERMQ : X86ISD::VPERMPD, dl, VT, V1, + getShuffleCLImmediate(SVOp), DAG); + //===--------------------------------------------------------------------===// // Since no target specific shuffle was selected for this generic one, @@ -11141,6 +11178,10 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::VBROADCAST: return "X86ISD::VBROADCAST"; case X86ISD::VPERMILP: return "X86ISD::VPERMILP"; case X86ISD::VPERM2X128: return "X86ISD::VPERM2X128"; + case X86ISD::VPERMD: return "X86ISD::VPERMD"; + case X86ISD::VPERMQ: return "X86ISD::VPERMQ"; + case X86ISD::VPERMPS: return "X86ISD::VPERMPS"; + case X86ISD::VPERMPD: return "X86ISD::VPERMPD"; case X86ISD::PMULUDQ: return "X86ISD::PMULUDQ"; case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS"; case X86ISD::VAARG_64: return "X86ISD::VAARG_64"; diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h index a1511f4..aac7468 100644 --- a/lib/Target/X86/X86ISelLowering.h +++ b/lib/Target/X86/X86ISelLowering.h @@ -285,6 +285,10 @@ namespace llvm { UNPCKL, UNPCKH, VPERMILP, + VPERMD, + VPERMQ, + VPERMPS, + VPERMPD, VPERM2X128, VBROADCAST, diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td index 041a64f..c61ca46 100644 --- a/lib/Target/X86/X86InstrFragmentsSIMD.td +++ b/lib/Target/X86/X86InstrFragmentsSIMD.td @@ -155,6 +155,10 @@ def X86Unpckl : SDNode<"X86ISD::UNPCKL", SDTShuff2Op>; def X86Unpckh : SDNode<"X86ISD::UNPCKH", SDTShuff2Op>; def X86VPermilp : SDNode<"X86ISD::VPERMILP", SDTShuff2OpI>; +def X86VPermd : SDNode<"X86ISD::VPERMD", SDTShuff2Op>; +def X86VPermps : SDNode<"X86ISD::VPERMPS", SDTShuff2Op>; +def X86VPermq : SDNode<"X86ISD::VPERMQ", SDTShuff2OpI>; +def X86VPermpd : SDNode<"X86ISD::VPERMPD", SDTShuff2OpI>; def X86VPerm2x128 : SDNode<"X86ISD::VPERM2X128", SDTShuff3OpI>; diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp index 307c96b..b12c1db 100644 --- a/lib/Target/X86/X86InstrInfo.cpp +++ b/lib/Target/X86/X86InstrInfo.cpp @@ -1049,9 +1049,9 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm) { X86::VPCMPGTWYrr, X86::VPCMPGTWYrm, TB_ALIGN_32 }, { X86::VPERM2I128rr, X86::VPERM2I128rm, TB_ALIGN_32 }, { X86::VPERMDYrr, X86::VPERMDYrm, TB_ALIGN_32 }, - { X86::VPERMPDYrr, X86::VPERMPDYrm, TB_ALIGN_32 }, + { X86::VPERMPDYri, X86::VPERMPDYmi, TB_ALIGN_32 }, { X86::VPERMPSYrr, X86::VPERMPSYrm, TB_ALIGN_32 }, - { X86::VPERMQYrr, X86::VPERMQYrm, TB_ALIGN_32 }, + { X86::VPERMQYri, X86::VPERMQYmi, TB_ALIGN_32 }, { X86::VPHADDDYrr, X86::VPHADDDYrm, TB_ALIGN_32 }, { X86::VPHADDSWrr256, X86::VPHADDSWrm256, TB_ALIGN_32 }, { X86::VPHADDWYrr, X86::VPHADDWYrm, TB_ALIGN_32 }, diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td index 408ab16..8210965 100644 --- a/lib/Target/X86/X86InstrSSE.td +++ b/lib/Target/X86/X86InstrSSE.td @@ -7746,12 +7746,12 @@ defm VPERMPS : avx2_perm<0x16, "vpermps", memopv8f32, int_x86_avx2_permps>; multiclass avx2_perm_imm opc, string OpcodeStr, PatFrag mem_frag, Intrinsic Int> { - def Yrr : AVX2AIi8, VEX; - def Yrm : AVX2AIi8, VEX_W; +let Predicates = [HasAVX2] in { +def : Pat<(v8i32 (X86VPermd VR256:$src1, VR256:$src2)), + (VPERMDYrr VR256:$src1, VR256:$src2)>; +def : Pat<(v8f32 (X86VPermps VR256:$src1, VR256:$src2)), + (VPERMPSYrr VR256:$src1, VR256:$src2)>; + +def : Pat<(v4i64 (X86VPermq VR256:$src1, (i8 imm:$imm))), + (VPERMQYri VR256:$src1, imm:$imm)>; +def : Pat<(v4f64 (X86VPermpd VR256:$src1, (i8 imm:$imm))), + (VPERMPDYri VR256:$src1, imm:$imm)>; + +def : Pat<(v8i32 (X86VPermps VR256:$src1, (bc_v8i32 (memopv4i64 addr:$src2)))), + (VPERMDYrm VR256:$src1, addr:$src2)>; +def : Pat<(v8f32 (X86VPermps VR256:$src1, (memopv8f32 addr:$src2))), + (VPERMPSYrm VR256:$src1, addr:$src2)>; + +def : Pat<(v4i64 (X86VPermq (memopv4i64 addr:$src1), (i8 imm:$imm))), + (VPERMQYmi addr:$src1, imm:$imm)>; +def : Pat<(v4f64 (X86VPermpd (memopv4f64 addr:$src1), (i8 imm:$imm))), + (VPERMPDYmi addr:$src1, imm:$imm)>; + +} + //===----------------------------------------------------------------------===// // VPERM2I128 - Permute Floating-Point Values in 128-bit chunks // diff --git a/test/CodeGen/X86/avx2-vperm.ll b/test/CodeGen/X86/avx2-vperm.ll new file mode 100755 index 0000000..d576d0e --- /dev/null +++ b/test/CodeGen/X86/avx2-vperm.ll @@ -0,0 +1,34 @@ +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 -mattr=+avx2 | FileCheck %s + +define <8 x i32> @perm_cl_int_8x32(<8 x i32> %A) nounwind readnone { +entry: +; CHECK: perm_cl_int_8x32 +; CHECK: vpermd + %B = shufflevector <8 x i32> %A, <8 x i32> undef, <8 x i32> + ret <8 x i32> %B +} + + +define <8 x float> @perm_cl_fp_8x32(<8 x float> %A) nounwind readnone { +entry: +; CHECK: perm_cl_fp_8x32 +; CHECK: vpermps + %B = shufflevector <8 x float> %A, <8 x float> undef, <8 x i32> + ret <8 x float> %B +} + +define <4 x i64> @perm_cl_int_4x64(<4 x i64> %A) nounwind readnone { +entry: +; CHECK: perm_cl_int_4x64 +; CHECK: vpermq + %B = shufflevector <4 x i64> %A, <4 x i64> undef, <4 x i32> + ret <4 x i64> %B +} + +define <4 x double> @perm_cl_fp_4x64(<4 x double> %A) nounwind readnone { +entry: +; CHECK: perm_cl_fp_4x64 +; CHECK: vpermpd + %B = shufflevector <4 x double> %A, <4 x double> undef, <4 x i32> + ret <4 x double> %B +} -- cgit v1.1