diff options
| -rw-r--r-- | lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 2 | ||||
| -rw-r--r-- | lib/Target/X86/X86ISelLowering.cpp | 75 | ||||
| -rw-r--r-- | lib/Target/X86/X86InstrFragmentsSIMD.td | 7 | ||||
| -rw-r--r-- | lib/Target/X86/X86InstrSSE.td | 20 | ||||
| -rwxr-xr-x | test/CodeGen/X86/avx2-conversions.ll | 68 | 
5 files changed, 158 insertions, 14 deletions
| diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 1ed1ee7..e72c8d5 100644 --- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -4520,8 +4520,10 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) {      SDValue Op = N0.getOperand(0);      if (Op.getValueType().bitsLT(VT)) {        Op = DAG.getNode(ISD::ANY_EXTEND, N->getDebugLoc(), VT, Op); +      AddToWorkList(Op.getNode());      } else if (Op.getValueType().bitsGT(VT)) {        Op = DAG.getNode(ISD::TRUNCATE, N->getDebugLoc(), VT, Op); +      AddToWorkList(Op.getNode());      }      return DAG.getZeroExtendInReg(Op, N->getDebugLoc(),                                    N0.getValueType().getScalarType()); diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index a03b97f..5e52b84 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -1222,6 +1222,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)    setTargetDAGCombine(ISD::LOAD);    setTargetDAGCombine(ISD::STORE);    setTargetDAGCombine(ISD::ZERO_EXTEND); +  setTargetDAGCombine(ISD::ANY_EXTEND);    setTargetDAGCombine(ISD::SIGN_EXTEND);    setTargetDAGCombine(ISD::TRUNCATE);    setTargetDAGCombine(ISD::SINT_TO_FP); @@ -13033,6 +13034,20 @@ SDValue X86TargetLowering::PerformTruncateCombine(SDNode *N, SelectionDAG &DAG,    if ((VT == MVT::v4i32) && (OpVT == MVT::v4i64)) { +    if (Subtarget->hasAVX2()) { +      // AVX2: v4i64 -> v4i32 + +      // VPERMD +      static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1}; + +      Op = DAG.getNode(ISD::BITCAST, dl, MVT::v8i32, Op); +      Op = DAG.getVectorShuffle(MVT::v8i32, dl, Op, DAG.getUNDEF(MVT::v8i32), +                                ShufMask); + +      return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Op, DAG.getIntPtrConstant(0)); +    } + +    // AVX: v4i64 -> v4i32      SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i64, Op,                            DAG.getIntPtrConstant(0)); @@ -13057,6 +13072,40 @@ SDValue X86TargetLowering::PerformTruncateCombine(SDNode *N, SelectionDAG &DAG,    }    if ((VT == MVT::v8i16) && (OpVT == MVT::v8i32)) { +    if (Subtarget->hasAVX2()) { +      // AVX2: v8i32 -> v8i16 + +      Op = DAG.getNode(ISD::BITCAST, dl, MVT::v32i8, Op); +      // PSHUFB +      SmallVector<SDValue,32> pshufbMask; +      for (unsigned i = 0; i < 2; ++i) { +        pshufbMask.push_back(DAG.getConstant(0x0, MVT::i8)); +        pshufbMask.push_back(DAG.getConstant(0x1, MVT::i8)); +        pshufbMask.push_back(DAG.getConstant(0x4, MVT::i8)); +        pshufbMask.push_back(DAG.getConstant(0x5, MVT::i8)); +        pshufbMask.push_back(DAG.getConstant(0x8, MVT::i8)); +        pshufbMask.push_back(DAG.getConstant(0x9, MVT::i8)); +        pshufbMask.push_back(DAG.getConstant(0xc, MVT::i8)); +        pshufbMask.push_back(DAG.getConstant(0xd, MVT::i8)); +        for (unsigned j = 0; j < 8; ++j) +          pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); +      } +      SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v32i8, &pshufbMask[0],  +                               32); +      Op = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v32i8, Op, BV); + +      Op = DAG.getNode(ISD::BITCAST, dl, MVT::v4i64, Op); + +      static const int ShufMask[] = {0,  2,  -1,  -1}; +      Op = DAG.getVectorShuffle(MVT::v4i64, dl,  Op, DAG.getUNDEF(MVT::v4i64),  +                                &ShufMask[0]); + +      Op =  DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i64, Op, +                        DAG.getIntPtrConstant(0)); + +      return DAG.getNode(ISD::BITCAST, dl, VT, Op); +    } +      SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i32, Op,                            DAG.getIntPtrConstant(0)); @@ -14822,15 +14871,6 @@ static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG,    if (!Subtarget->hasAVX())       return SDValue(); -  // Optimize vectors in AVX mode -  // Sign extend  v8i16 to v8i32 and -  //              v4i32 to v4i64 -  // -  // Divide input vector into two parts -  // for v4i32 the shuffle mask will be { 0, 1, -1, -1} {2, 3, -1, -1} -  // use vpmovsx instruction to extend v4i32 -> v2i64; v8i16 -> v4i32 -  // concat the vectors to original VT -    EVT VT = N->getValueType(0);    SDValue Op = N->getOperand(0);    EVT OpVT = Op.getValueType(); @@ -14839,6 +14879,19 @@ static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG,    if ((VT == MVT::v4i64 && OpVT == MVT::v4i32) ||        (VT == MVT::v8i32 && OpVT == MVT::v8i16)) { +    if (Subtarget->hasAVX2()) { +      return DAG.getNode(X86ISD::VSEXT_MOVL, dl, VT, Op); +    } + +    // Optimize vectors in AVX mode +    // Sign extend  v8i16 to v8i32 and +    //              v4i32 to v4i64 +    // +    // Divide input vector into two parts +    // for v4i32 the shuffle mask will be { 0, 1, -1, -1} {2, 3, -1, -1} +    // use vpmovsx instruction to extend v4i32 -> v2i64; v8i16 -> v4i32 +    // concat the vectors to original VT +      unsigned NumElems = OpVT.getVectorNumElements();      SmallVector<int,8> ShufMask1(NumElems, -1);      for (unsigned i = 0; i < NumElems/2; i++) ShufMask1[i] = i; @@ -14906,6 +14959,9 @@ static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG,      if (((VT == MVT::v8i32) && (OpVT == MVT::v8i16)) ||          ((VT == MVT::v4i64) && (OpVT == MVT::v4i32)))  { +      if (Subtarget->hasAVX2()) +        return DAG.getNode(X86ISD::VZEXT_MOVL, dl, VT, N0); +        SDValue ZeroVec = getZeroVector(OpVT, Subtarget, DAG, dl);        SDValue OpLo = getTargetShuffleNode(X86ISD::UNPCKL, dl, OpVT, N0, ZeroVec,                                            DAG); @@ -15108,6 +15164,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,    case X86ISD::FAND:        return PerformFANDCombine(N, DAG);    case X86ISD::BT:          return PerformBTCombine(N, DAG, DCI);    case X86ISD::VZEXT_MOVL:  return PerformVZEXT_MOVLCombine(N, DAG); +  case ISD::ANY_EXTEND:    case ISD::ZERO_EXTEND:    return PerformZExtCombine(N, DAG, Subtarget);    case ISD::SIGN_EXTEND:    return PerformSExtCombine(N, DAG, DCI, Subtarget);    case ISD::TRUNCATE:       return PerformTruncateCombine(N, DAG, DCI); diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td index 35801e4..ffc6cbe 100644 --- a/lib/Target/X86/X86InstrFragmentsSIMD.td +++ b/lib/Target/X86/X86InstrFragmentsSIMD.td @@ -71,9 +71,14 @@ def X86insrtps : SDNode<"X86ISD::INSERTPS",                                        SDTCisVT<2, v4f32>, SDTCisPtrTy<3>]>>;  def X86vzmovl  : SDNode<"X86ISD::VZEXT_MOVL",                   SDTypeProfile<1, 1, [SDTCisSameAs<0,1>]>>; + +def X86vzmovly  : SDNode<"X86ISD::VZEXT_MOVL", +                 SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>,  +                                      SDTCisOpSmallerThanOp<1, 0> ]>>; +  def X86vsmovl  : SDNode<"X86ISD::VSEXT_MOVL",                   SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisInt<1>, SDTCisInt<0>]>>; -                  +  def X86vzload  : SDNode<"X86ISD::VZEXT_LOAD", SDTLoad,                          [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;  def X86vshldq  : SDNode<"X86ISD::VSHLDQ",    SDTIntShiftOp>; diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td index 65e3c1e..450d29a 100644 --- a/lib/Target/X86/X86InstrSSE.td +++ b/lib/Target/X86/X86InstrSSE.td @@ -5730,14 +5730,26 @@ let Predicates = [HasSSE41] in {              (PMOVZXDQrm addr:$src)>;  } +let Predicates = [HasAVX2] in { +  let AddedComplexity = 15 in { +    def : Pat<(v4i64 (X86vzmovly (v4i32 VR128:$src))), +              (VPMOVZXDQYrr VR128:$src)>; +    def : Pat<(v8i32 (X86vzmovly (v8i16 VR128:$src))), +              (VPMOVZXWDYrr VR128:$src)>; +  } + +  def : Pat<(v4i64 (X86vsmovl (v4i32 VR128:$src))), (VPMOVSXDQYrr VR128:$src)>; +  def : Pat<(v8i32 (X86vsmovl (v8i16 VR128:$src))), (VPMOVSXWDYrr VR128:$src)>; +} +  let Predicates = [HasAVX] in { -def : Pat<(v2i64 (X86vsmovl (v4i32 VR128:$src))), (VPMOVSXDQrr VR128:$src)>; -def : Pat<(v4i32 (X86vsmovl (v8i16 VR128:$src))), (VPMOVSXWDrr VR128:$src)>; +  def : Pat<(v2i64 (X86vsmovl (v4i32 VR128:$src))), (VPMOVSXDQrr VR128:$src)>; +  def : Pat<(v4i32 (X86vsmovl (v8i16 VR128:$src))), (VPMOVSXWDrr VR128:$src)>;  }  let Predicates = [HasSSE41] in { -def : Pat<(v2i64 (X86vsmovl (v4i32 VR128:$src))), (PMOVSXDQrr VR128:$src)>; -def : Pat<(v4i32 (X86vsmovl (v8i16 VR128:$src))), (PMOVSXWDrr VR128:$src)>; +  def : Pat<(v2i64 (X86vsmovl (v4i32 VR128:$src))), (PMOVSXDQrr VR128:$src)>; +  def : Pat<(v4i32 (X86vsmovl (v8i16 VR128:$src))), (PMOVSXWDrr VR128:$src)>;  } diff --git a/test/CodeGen/X86/avx2-conversions.ll b/test/CodeGen/X86/avx2-conversions.ll new file mode 100755 index 0000000..fe87de9 --- /dev/null +++ b/test/CodeGen/X86/avx2-conversions.ll @@ -0,0 +1,68 @@ +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 -mattr=+avx2 | FileCheck %s
 +
 +; CHECK: trunc4
 +; CHECK: vpermd
 +; CHECK-NOT: vinsert
 +; CHECK: ret
 +define <4 x i32> @trunc4(<4 x i64> %A) nounwind {
 +  %B = trunc <4 x i64> %A to <4 x i32>
 +  ret <4 x i32>%B
 +}
 +
 +; CHECK: trunc8
 +; CHECK: vpshufb
 +; CHECK-NOT: vinsert
 +; CHECK: ret
 +
 +define <8 x i16> @trunc8(<8 x i32> %A) nounwind {
 +  %B = trunc <8 x i32> %A to <8 x i16>
 +  ret <8 x i16>%B
 +}
 +
 +; CHECK: sext4
 +; CHECK: vpmovsxdq
 +; CHECK-NOT: vinsert
 +; CHECK: ret
 +define <4 x i64> @sext4(<4 x i32> %A) nounwind {
 +  %B = sext <4 x i32> %A to <4 x i64>
 +  ret <4 x i64>%B
 +}
 +
 +; CHECK: sext8
 +; CHECK: vpmovsxwd
 +; CHECK-NOT: vinsert
 +; CHECK: ret
 +define <8 x i32> @sext8(<8 x i16> %A) nounwind {
 +  %B = sext <8 x i16> %A to <8 x i32>
 +  ret <8 x i32>%B
 +}
 +
 +; CHECK: zext4
 +; CHECK: vpmovzxdq
 +; CHECK-NOT: vinsert
 +; CHECK: ret
 +define <4 x i64> @zext4(<4 x i32> %A) nounwind {
 +  %B = zext <4 x i32> %A to <4 x i64>
 +  ret <4 x i64>%B
 +}
 +
 +; CHECK: zext8
 +; CHECK: vpmovzxwd
 +; CHECK-NOT: vinsert
 +; CHECK: ret
 +define <8 x i32> @zext8(<8 x i16> %A) nounwind {
 +  %B = zext <8 x i16> %A to <8 x i32>
 +  ret <8 x i32>%B
 +}
 +; CHECK: zext_8i8_8i32
 +; CHECK: vpmovzxwd
 +; CHECK: vpand
 +; CHECK: ret
 +define <8 x i32> @zext_8i8_8i32(<8 x i8> %A) nounwind {
 +  %B = zext <8 x i8> %A to <8 x i32>  
 +  ret <8 x i32>%B
 +}
 +
 +
 +
 +
 | 
