Diffstat (limited to 'lib/Target/X86/X86ISelLowering.cpp')
 -rw-r--r--  lib/Target/X86/X86ISelLowering.cpp | 1800
 1 file changed, 1290 insertions(+), 510 deletions(-)
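The patch below threads AVX-512 support through the X86 lowering code: it legalizes 512-bit vector operations (v16i32, v8i64, v16f32), introduces the v8i1/v16i1 mask types that AVX-512 compares produce, and widens the shuffle-mask predicates (UNPCK*, SHUFP, VPERMILP, PALIGNR) to 512-bit vectors. As a rough orientation before the diff itself, the new getSetCCResultType hunk behaves like the following condensed, self-contained sketch; the enum and free function here are illustrative stand-ins, not the real LLVM API:

    // Condensed model of the setcc result-type rule added below: scalar
    // compares still yield i8, AVX-512 vector compares yield k-mask types,
    // and everything else falls back to a same-width integer vector.
    #include <cstdio>

    enum ResultVT { I8, V8I1, V16I1, SameWidthIntVector };

    ResultVT getSetCCResultType(bool IsVector, unsigned NumElts, bool HasAVX512) {
      if (!IsVector)
        return I8;
      if (HasAVX512) {           // mask registers hold one i1 per element
        switch (NumElts) {
        case 8:  return V8I1;
        case 16: return V16I1;
        }
      }
      return SameWidthIntVector; // e.g. v4i32 for a v4f32 compare
    }

    int main() {
      // A 16-element compare on an AVX-512 target selects the v16i1 mask type.
      std::printf("%d\n", getSetCCResultType(true, 16, true) == V16I1);
    }

The real hunk additionally guards on !TM.Options.UseSoftFloat, which this sketch omits.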
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 9ffe29f..081c558 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -16,6 +16,7 @@
 #include "X86ISelLowering.h"
 #include "Utils/X86ShuffleDecode.h"
 #include "X86.h"
+#include "X86CallingConv.h"
 #include "X86InstrBuilder.h"
 #include "X86TargetMachine.h"
 #include "X86TargetObjectFile.h"
@@ -91,7 +92,7 @@ static SDValue ExtractSubVector(SDValue Vec, unsigned IdxVal,
                      VecIdx);
   return Result;
-
+
 }
 /// Generate a DAG to grab 128-bits from a vector > 128 bits.  This
 /// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
@@ -631,7 +632,7 @@ void X86TargetLowering::resetOperationActions() {
   setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
   setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
-  if (Subtarget->isTargetCOFF() && !Subtarget->isTargetEnvMacho())
+  if (Subtarget->isOSWindows() && !Subtarget->isTargetEnvMacho())
     setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ?
                        MVT::i64 : MVT::i32, Custom);
   else if (TM.Options.EnableSegmentedStacks)
@@ -1150,9 +1151,6 @@ void X86TargetLowering::resetOperationActions() {
     setOperationAction(ISD::FNEG, MVT::v4f64, Custom);
     setOperationAction(ISD::FABS, MVT::v4f64, Custom);
-    setOperationAction(ISD::TRUNCATE, MVT::v8i16, Custom);
-    setOperationAction(ISD::TRUNCATE, MVT::v4i32, Custom);
-
     setOperationAction(ISD::FP_TO_SINT, MVT::v8i16, Custom);
     setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal);
@@ -1160,7 +1158,6 @@ void X86TargetLowering::resetOperationActions() {
     setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal);
     setOperationAction(ISD::FP_ROUND, MVT::v4f32, Legal);
-    setOperationAction(ISD::ZERO_EXTEND, MVT::v8i32, Custom);
     setOperationAction(ISD::UINT_TO_FP, MVT::v8i8, Custom);
     setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Custom);
@@ -1193,10 +1190,16 @@ void X86TargetLowering::resetOperationActions() {
     setOperationAction(ISD::SIGN_EXTEND, MVT::v4i64, Custom);
     setOperationAction(ISD::SIGN_EXTEND, MVT::v8i32, Custom);
+    setOperationAction(ISD::SIGN_EXTEND, MVT::v16i16, Custom);
     setOperationAction(ISD::ZERO_EXTEND, MVT::v4i64, Custom);
     setOperationAction(ISD::ZERO_EXTEND, MVT::v8i32, Custom);
+    setOperationAction(ISD::ZERO_EXTEND, MVT::v16i16, Custom);
     setOperationAction(ISD::ANY_EXTEND, MVT::v4i64, Custom);
     setOperationAction(ISD::ANY_EXTEND, MVT::v8i32, Custom);
+    setOperationAction(ISD::ANY_EXTEND, MVT::v16i16, Custom);
+    setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom);
+    setOperationAction(ISD::TRUNCATE, MVT::v8i16, Custom);
+    setOperationAction(ISD::TRUNCATE, MVT::v4i32, Custom);
     if (Subtarget->hasFMA() || Subtarget->hasFMA4()) {
       setOperationAction(ISD::FMA, MVT::v8f32, Legal);
@@ -1330,7 +1333,16 @@ void X86TargetLowering::resetOperationActions() {
     setOperationAction(ISD::FMA, MVT::v16f32, Legal);
     setOperationAction(ISD::SDIV, MVT::v16i32, Custom);
-
+    setOperationAction(ISD::FP_TO_SINT, MVT::i32, Legal);
+    setOperationAction(ISD::FP_TO_UINT, MVT::i32, Legal);
+    setOperationAction(ISD::SINT_TO_FP, MVT::i32, Legal);
+    setOperationAction(ISD::UINT_TO_FP, MVT::i32, Legal);
+    if (Subtarget->is64Bit()) {
+      setOperationAction(ISD::FP_TO_UINT, MVT::i64, Legal);
+      setOperationAction(ISD::FP_TO_SINT, MVT::i64, Legal);
+      setOperationAction(ISD::SINT_TO_FP, MVT::i64, Legal);
+      setOperationAction(ISD::UINT_TO_FP, MVT::i64, Legal);
+    }
     setOperationAction(ISD::FP_TO_SINT, MVT::v16i32, Legal);
     setOperationAction(ISD::FP_TO_UINT, MVT::v16i32, Legal);
     setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Legal);
@@ -1390,6 +1402,9 @@ void X86TargetLowering::resetOperationActions() {
     setOperationAction(ISD::AND, MVT::v8i64, Legal);
     setOperationAction(ISD::OR, MVT::v8i64, Legal);
     setOperationAction(ISD::XOR, MVT::v8i64, Legal);
+    setOperationAction(ISD::AND, MVT::v16i32, Legal);
+    setOperationAction(ISD::OR, MVT::v16i32, Legal);
+    setOperationAction(ISD::XOR, MVT::v16i32, Legal);
 
     // Custom lower several nodes.
     for (int i = MVT::FIRST_VECTOR_VALUETYPE;
@@ -1409,14 +1424,6 @@ void X86TargetLowering::resetOperationActions() {
       if (!VT.is512BitVector())
         continue;
-      if (VT != MVT::v8i64) {
-        setOperationAction(ISD::XOR, VT, Promote);
-        AddPromotedToType (ISD::XOR, VT, MVT::v8i64);
-        setOperationAction(ISD::OR, VT, Promote);
-        AddPromotedToType (ISD::OR, VT, MVT::v8i64);
-        setOperationAction(ISD::AND, VT, Promote);
-        AddPromotedToType (ISD::AND, VT, MVT::v8i64);
-      }
       if ( EltSize >= 32) {
         setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
         setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
@@ -1434,8 +1441,6 @@ void X86TargetLowering::resetOperationActions() {
       if (!VT.is512BitVector())
         continue;
-      setOperationAction(ISD::LOAD, VT, Promote);
-      AddPromotedToType (ISD::LOAD, VT, MVT::v8i64);
       setOperationAction(ISD::SELECT, VT, Promote);
       AddPromotedToType (ISD::SELECT, VT, MVT::v8i64);
     }
@@ -1452,6 +1457,7 @@ void X86TargetLowering::resetOperationActions() {
   // We want to custom lower some of our intrinsics.
   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
+  setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
 
   // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
   // handle type legalization for these operations here.
@@ -1541,7 +1547,16 @@ void X86TargetLowering::resetOperationActions() {
 }
 
 EVT X86TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
-  if (!VT.isVector()) return MVT::i8;
+  if (!VT.isVector())
+    return MVT::i8;
+
+  const TargetMachine &TM = getTargetMachine();
+  if (!TM.Options.UseSoftFloat && Subtarget->hasAVX512())
+    switch(VT.getVectorNumElements()) {
+    case 8: return MVT::v8i1;
+    case 16: return MVT::v16i1;
+  }
+
   return VT.changeVectorElementTypeToInteger();
 }
@@ -1750,6 +1765,13 @@ bool X86TargetLowering::getStackCookieLocation(unsigned &AddressSpace,
   return true;
 }
 
+bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
+                                            unsigned DestAS) const {
+  assert(SrcAS != DestAS && "Expected different address spaces!");
+
+  return SrcAS < 256 && DestAS < 256;
+}
+
 //===----------------------------------------------------------------------===//
 //               Return Value Calling Convention Implementation
 //===----------------------------------------------------------------------===//
@@ -1767,6 +1789,11 @@ X86TargetLowering::CanLowerReturn(CallingConv::ID CallConv,
   return CCInfo.CheckReturn(Outs, RetCC_X86);
 }
 
+const uint16_t *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
+  static const uint16_t ScratchRegs[] = { X86::R11, 0 };
+  return ScratchRegs;
+}
+
 SDValue
 X86TargetLowering::LowerReturn(SDValue Chain,
                                CallingConv::ID CallConv, bool isVarArg,
@@ -3532,7 +3559,7 @@ static bool isSequentialOrUndefInRange(ArrayRef<int> Mask,
 /// isPSHUFDMask - Return true if the node specifies a shuffle of elements that
 /// is suitable for input to PSHUFD or PSHUFW.  That is, it doesn't reference
 /// the second operand.
-static bool isPSHUFDMask(ArrayRef<int> Mask, EVT VT) {
+static bool isPSHUFDMask(ArrayRef<int> Mask, MVT VT) {
   if (VT == MVT::v4f32 || VT == MVT::v4i32 )
     return (Mask[0] < 4 && Mask[1] < 4 && Mask[2] < 4 && Mask[3] < 4);
   if (VT == MVT::v2f64 || VT == MVT::v2i64)
@@ -3542,7 +3569,7 @@ static bool isPSHUFDMask(ArrayRef<int> Mask, EVT VT) {
 
 /// isPSHUFHWMask - Return true if the node specifies a shuffle of elements that
 /// is suitable for input to PSHUFHW.
-static bool isPSHUFHWMask(ArrayRef<int> Mask, EVT VT, bool HasInt256) {
+static bool isPSHUFHWMask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
   if (VT != MVT::v8i16 && (!HasInt256 || VT != MVT::v16i16))
     return false;
 
@@ -3571,7 +3598,7 @@ static bool isPSHUFHWMask(ArrayRef<int> Mask, EVT VT, bool HasInt256) {
 
 /// isPSHUFLWMask - Return true if the node specifies a shuffle of elements that
 /// is suitable for input to PSHUFLW.
-static bool isPSHUFLWMask(ArrayRef<int> Mask, EVT VT, bool HasInt256) {
+static bool isPSHUFLWMask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
   if (VT != MVT::v8i16 && (!HasInt256 || VT != MVT::v16i16))
     return false;
 
@@ -3600,14 +3627,14 @@ static bool isPSHUFLWMask(ArrayRef<int> Mask, EVT VT, bool HasInt256) {
 
 /// isPALIGNRMask - Return true if the node specifies a shuffle of elements that
 /// is suitable for input to PALIGNR.
-static bool isPALIGNRMask(ArrayRef<int> Mask, EVT VT,
+static bool isPALIGNRMask(ArrayRef<int> Mask, MVT VT,
                           const X86Subtarget *Subtarget) {
   if ((VT.is128BitVector() && !Subtarget->hasSSSE3()) ||
       (VT.is256BitVector() && !Subtarget->hasInt256()))
     return false;
 
   unsigned NumElts = VT.getVectorNumElements();
-  unsigned NumLanes = VT.getSizeInBits()/128;
+  unsigned NumLanes = VT.is512BitVector() ? 1: VT.getSizeInBits()/128;
   unsigned NumLaneElts = NumElts/NumLanes;
 
   // Do not handle 64-bit element shuffles with palignr.
@@ -3690,10 +3717,7 @@ static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask,
 /// specifies a shuffle of elements that is suitable for input to 128/256-bit
 /// SHUFPS and SHUFPD. If Commuted is true, then it checks for sources to be
 /// reverse of what x86 shuffles want.
-static bool isSHUFPMask(ArrayRef<int> Mask, EVT VT, bool HasFp256,
-                        bool Commuted = false) {
-  if (!HasFp256 && VT.is256BitVector())
-    return false;
+static bool isSHUFPMask(ArrayRef<int> Mask, MVT VT, bool Commuted = false) {
   unsigned NumElems = VT.getVectorNumElements();
   unsigned NumLanes = VT.getSizeInBits()/128;
@@ -3702,6 +3726,10 @@ static bool isSHUFPMask(ArrayRef<int> Mask, EVT VT, bool HasFp256,
   if (NumLaneElems != 2 && NumLaneElems != 4)
     return false;
 
+  unsigned EltSize = VT.getVectorElementType().getSizeInBits();
+  bool symetricMaskRequired =
+    (VT.getSizeInBits() >= 256) && (EltSize == 32);
+
   // VSHUFPSY divides the resulting vector into 4 chunks.
   // The sources are also splitted into 4 chunks, and each destination
   // chunk must come from a different source chunk.
@@ -3721,6 +3749,7 @@ static bool isSHUFPMask(ArrayRef<int> Mask, EVT VT, bool HasFp256,
   //
   //  DST  =>  Y3..Y2,  X3..X2,  Y1..Y0,  X1..X0
   //
+  SmallVector<int, 4> MaskVal(NumLaneElems, -1);
   unsigned HalfLaneElems = NumLaneElems/2;
   for (unsigned l = 0; l != NumElems; l += NumLaneElems) {
     for (unsigned i = 0; i != NumLaneElems; ++i) {
@@ -3731,9 +3760,13 @@ static bool isSHUFPMask(ArrayRef<int> Mask, EVT VT, bool HasFp256,
       // For VSHUFPSY, the mask of the second half must be the same as the
       // first but with the appropriate offsets. This works in the same way as
       // VPERMILPS works with masks.
-      if (NumElems != 8 || l == 0 || Mask[i] < 0)
+      if (!symetricMaskRequired || Idx < 0)
         continue;
-      if (!isUndefOrEqual(Idx, Mask[i]+l))
+      if (MaskVal[i] < 0) {
+        MaskVal[i] = Idx - l;
+        continue;
+      }
+      if ((signed)(Idx - l) != MaskVal[i])
         return false;
     }
   }
@@ -3743,7 +3776,7 @@ static bool isSHUFPMask(ArrayRef<int> Mask, EVT VT, bool HasFp256,
 
 /// isMOVHLPSMask - Return true if the specified VECTOR_SHUFFLE operand
 /// specifies a shuffle of elements that is suitable for input to MOVHLPS.
-static bool isMOVHLPSMask(ArrayRef<int> Mask, EVT VT) {
+static bool isMOVHLPSMask(ArrayRef<int> Mask, MVT VT) {
   if (!VT.is128BitVector())
     return false;
 
@@ -3762,7 +3795,7 @@ static bool isMOVHLPSMask(ArrayRef<int> Mask, EVT VT) {
 /// isMOVHLPS_v_undef_Mask - Special case of isMOVHLPSMask for canonical form
 /// of vector_shuffle v, v, <2, 3, 2, 3>, i.e. vector_shuffle v, undef,
 /// <2, 3, 2, 3>
-static bool isMOVHLPS_v_undef_Mask(ArrayRef<int> Mask, EVT VT) {
+static bool isMOVHLPS_v_undef_Mask(ArrayRef<int> Mask, MVT VT) {
   if (!VT.is128BitVector())
     return false;
 
@@ -3779,7 +3812,7 @@ static bool isMOVHLPS_v_undef_Mask(ArrayRef<int> Mask, EVT VT) {
 
 /// isMOVLPMask - Return true if the specified VECTOR_SHUFFLE operand
 /// specifies a shuffle of elements that is suitable for input to MOVLP{S|D}.
-static bool isMOVLPMask(ArrayRef<int> Mask, EVT VT) {
+static bool isMOVLPMask(ArrayRef<int> Mask, MVT VT) {
   if (!VT.is128BitVector())
     return false;
 
@@ -3801,7 +3834,7 @@ static bool isMOVLPMask(ArrayRef<int> Mask, EVT VT) {
 
 /// isMOVLHPSMask - Return true if the specified VECTOR_SHUFFLE operand
 /// specifies a shuffle of elements that is suitable for input to MOVLHPS.
-static bool isMOVLHPSMask(ArrayRef<int> Mask, EVT VT) {
+static bool isMOVLHPSMask(ArrayRef<int> Mask, MVT VT) {
   if (!VT.is128BitVector())
     return false;
 
@@ -3827,7 +3860,7 @@ static bool isMOVLHPSMask(ArrayRef<int> Mask, EVT VT) {
 static
 SDValue Compact8x32ShuffleNode(ShuffleVectorSDNode *SVOp,
                                SelectionDAG &DAG) {
-  MVT VT = SVOp->getValueType(0).getSimpleVT();
+  MVT VT = SVOp->getSimpleValueType(0);
   SDLoc dl(SVOp);
 
   if (VT != MVT::v8i32 && VT != MVT::v8f32)
@@ -3870,70 +3903,92 @@ SDValue Compact8x32ShuffleNode(ShuffleVectorSDNode *SVOp,
 
 /// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand
 /// specifies a shuffle of elements that is suitable for input to UNPCKL.
-static bool isUNPCKLMask(ArrayRef<int> Mask, EVT VT,
+static bool isUNPCKLMask(ArrayRef<int> Mask, MVT VT,
                          bool HasInt256, bool V2IsSplat = false) {
-  unsigned NumElts = VT.getVectorNumElements();
-  assert((VT.is128BitVector() || VT.is256BitVector()) &&
-         "Unsupported vector type for unpckh");
+  assert(VT.getSizeInBits() >= 128 &&
+         "Unsupported vector type for unpckl");
 
-  if (VT.is256BitVector() && NumElts != 4 && NumElts != 8 &&
-      (!HasInt256 || (NumElts != 16 && NumElts != 32)))
+  // AVX defines UNPCK* to operate independently on 128-bit lanes.
+  unsigned NumLanes;
+  unsigned NumOf256BitLanes;
+  unsigned NumElts = VT.getVectorNumElements();
+  if (VT.is256BitVector()) {
+    if (NumElts != 4 && NumElts != 8 &&
+        (!HasInt256 || (NumElts != 16 && NumElts != 32)))
     return false;
+    NumLanes = 2;
+    NumOf256BitLanes = 1;
+  } else if (VT.is512BitVector()) {
+    assert(VT.getScalarType().getSizeInBits() >= 32 &&
+           "Unsupported vector type for unpckh");
+    NumLanes = 2;
+    NumOf256BitLanes = 2;
+  } else {
+    NumLanes = 1;
+    NumOf256BitLanes = 1;
+  }
 
-  // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
-  // independently on 128-bit lanes.
-  unsigned NumLanes = VT.getSizeInBits()/128;
-  unsigned NumLaneElts = NumElts/NumLanes;
+  unsigned NumEltsInStride = NumElts/NumOf256BitLanes;
+  unsigned NumLaneElts = NumEltsInStride/NumLanes;
 
-  for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
-    for (unsigned i = 0, j = l; i != NumLaneElts; i += 2, ++j) {
-      int BitI  = Mask[l+i];
-      int BitI1 = Mask[l+i+1];
-      if (!isUndefOrEqual(BitI, j))
-        return false;
-      if (V2IsSplat) {
-        if (!isUndefOrEqual(BitI1, NumElts))
+  for (unsigned l256 = 0; l256 < NumOf256BitLanes; l256 += 1) {
+    for (unsigned l = 0; l != NumEltsInStride; l += NumLaneElts) {
+      for (unsigned i = 0, j = l; i != NumLaneElts; i += 2, ++j) {
+        int BitI  = Mask[l256*NumEltsInStride+l+i];
+        int BitI1 = Mask[l256*NumEltsInStride+l+i+1];
+        if (!isUndefOrEqual(BitI, j+l256*NumElts))
           return false;
-      } else {
-        if (!isUndefOrEqual(BitI1, j + NumElts))
+        if (V2IsSplat && !isUndefOrEqual(BitI1, NumElts))
+          return false;
+        if (!isUndefOrEqual(BitI1, j+l256*NumElts+NumEltsInStride))
          return false;
       }
     }
  }
-
   return true;
 }
 
 /// isUNPCKHMask - Return true if the specified VECTOR_SHUFFLE operand
 /// specifies a shuffle of elements that is suitable for input to UNPCKH.
-static bool isUNPCKHMask(ArrayRef<int> Mask, EVT VT,
+static bool isUNPCKHMask(ArrayRef<int> Mask, MVT VT,
                          bool HasInt256, bool V2IsSplat = false) {
-  unsigned NumElts = VT.getVectorNumElements();
-
-  assert((VT.is128BitVector() || VT.is256BitVector()) &&
+  assert(VT.getSizeInBits() >= 128 &&
          "Unsupported vector type for unpckh");
 
-  if (VT.is256BitVector() && NumElts != 4 && NumElts != 8 &&
-      (!HasInt256 || (NumElts != 16 && NumElts != 32)))
+  // AVX defines UNPCK* to operate independently on 128-bit lanes.
+  unsigned NumLanes;
+  unsigned NumOf256BitLanes;
+  unsigned NumElts = VT.getVectorNumElements();
+  if (VT.is256BitVector()) {
+    if (NumElts != 4 && NumElts != 8 &&
+        (!HasInt256 || (NumElts != 16 && NumElts != 32)))
     return false;
+    NumLanes = 2;
+    NumOf256BitLanes = 1;
+  } else if (VT.is512BitVector()) {
+    assert(VT.getScalarType().getSizeInBits() >= 32 &&
+           "Unsupported vector type for unpckh");
+    NumLanes = 2;
+    NumOf256BitLanes = 2;
+  } else {
+    NumLanes = 1;
+    NumOf256BitLanes = 1;
+  }
 
-  // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
-  // independently on 128-bit lanes.
-  unsigned NumLanes = VT.getSizeInBits()/128;
-  unsigned NumLaneElts = NumElts/NumLanes;
+  unsigned NumEltsInStride = NumElts/NumOf256BitLanes;
+  unsigned NumLaneElts = NumEltsInStride/NumLanes;
 
-  for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
-    for (unsigned i = 0, j = l+NumLaneElts/2; i != NumLaneElts; i += 2, ++j) {
-      int BitI  = Mask[l+i];
-      int BitI1 = Mask[l+i+1];
-      if (!isUndefOrEqual(BitI, j))
-        return false;
-      if (V2IsSplat) {
-        if (isUndefOrEqual(BitI1, NumElts))
+  for (unsigned l256 = 0; l256 < NumOf256BitLanes; l256 += 1) {
+    for (unsigned l = 0; l != NumEltsInStride; l += NumLaneElts) {
+      for (unsigned i = 0, j = l+NumLaneElts/2; i != NumLaneElts; i += 2, ++j) {
+        int BitI  = Mask[l256*NumEltsInStride+l+i];
+        int BitI1 = Mask[l256*NumEltsInStride+l+i+1];
+        if (!isUndefOrEqual(BitI, j+l256*NumElts))
          return false;
-      } else {
-        if (!isUndefOrEqual(BitI1, j+NumElts))
+        if (V2IsSplat && !isUndefOrEqual(BitI1, NumElts))
+          return false;
+        if (!isUndefOrEqual(BitI1, j+l256*NumElts+NumEltsInStride))
          return false;
       }
     }
@@ -3944,10 +3999,12 @@ static bool isUNPCKHMask(ArrayRef<int> Mask, EVT VT,
 
 /// isUNPCKL_v_undef_Mask - Special case of isUNPCKLMask for canonical form
 /// of vector_shuffle v, v, <0, 4, 1, 5>, i.e. vector_shuffle v, undef,
 /// <0, 0, 1, 1>
-static bool isUNPCKL_v_undef_Mask(ArrayRef<int> Mask, EVT VT, bool HasInt256) {
+static bool isUNPCKL_v_undef_Mask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
   unsigned NumElts = VT.getVectorNumElements();
   bool Is256BitVec = VT.is256BitVector();
 
+  if (VT.is512BitVector())
+    return false;
   assert((VT.is128BitVector() || VT.is256BitVector()) &&
          "Unsupported vector type for unpckh");
 
@@ -3985,9 +4042,12 @@ static bool isUNPCKL_v_undef_Mask(ArrayRef<int> Mask, EVT VT, bool HasInt256) {
 /// isUNPCKH_v_undef_Mask - Special case of isUNPCKHMask for canonical form
 /// of vector_shuffle v, v, <2, 6, 3, 7>, i.e. vector_shuffle v, undef,
 /// <2, 2, 3, 3>
-static bool isUNPCKH_v_undef_Mask(ArrayRef<int> Mask, EVT VT, bool HasInt256) {
+static bool isUNPCKH_v_undef_Mask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
   unsigned NumElts = VT.getVectorNumElements();
 
+  if (VT.is512BitVector())
+    return false;
+
   assert((VT.is128BitVector() || VT.is256BitVector()) &&
          "Unsupported vector type for unpckh");
 
@@ -4040,7 +4100,7 @@ static bool isMOVLMask(ArrayRef<int> Mask, EVT VT) {
 /// vector_shuffle <4, 5, 6, 7, 12, 13, 14, 15>
 /// The first half comes from the second half of V1 and the second half from the
 /// the second half of V2.
-static bool isVPERM2X128Mask(ArrayRef<int> Mask, EVT VT, bool HasFp256) {
+static bool isVPERM2X128Mask(ArrayRef<int> Mask, MVT VT, bool HasFp256) {
   if (!HasFp256 || !VT.is256BitVector())
     return false;
 
@@ -4072,7 +4132,7 @@ static bool isVPERM2X128Mask(ArrayRef<int> Mask, EVT VT, bool HasFp256) {
 /// getShuffleVPERM2X128Immediate - Return the appropriate immediate to shuffle
 /// the specified VECTOR_MASK mask with VPERM2F128/VPERM2I128 instructions.
 static unsigned getShuffleVPERM2X128Immediate(ShuffleVectorSDNode *SVOp) {
-  MVT VT = SVOp->getValueType(0).getSimpleVT();
+  MVT VT = SVOp->getSimpleValueType(0);
 
   unsigned HalfSize = VT.getVectorNumElements()/2;
 
@@ -4093,6 +4153,44 @@ static unsigned getShuffleVPERM2X128Immediate(ShuffleVectorSDNode *SVOp) {
   return (FstHalf | (SndHalf << 4));
 }
 
+// Symetric in-lane mask. Each lane has 4 elements (for imm8)
+static bool isPermImmMask(ArrayRef<int> Mask, MVT VT, unsigned& Imm8) {
+  unsigned EltSize = VT.getVectorElementType().getSizeInBits();
+  if (EltSize < 32)
+    return false;
+
+  unsigned NumElts = VT.getVectorNumElements();
+  Imm8 = 0;
+  if (VT.is128BitVector() || (VT.is256BitVector() && EltSize == 64)) {
+    for (unsigned i = 0; i != NumElts; ++i) {
+      if (Mask[i] < 0)
+        continue;
+      Imm8 |= Mask[i] << (i*2);
+    }
+    return true;
+  }
+
+  unsigned LaneSize = 4;
+  SmallVector<int, 4> MaskVal(LaneSize, -1);
+
+  for (unsigned l = 0; l != NumElts; l += LaneSize) {
+    for (unsigned i = 0; i != LaneSize; ++i) {
+      if (!isUndefOrInRange(Mask[i+l], l, l+LaneSize))
+        return false;
+      if (Mask[i+l] < 0)
+        continue;
+      if (MaskVal[i] < 0) {
+        MaskVal[i] = Mask[i+l] - l;
+        Imm8 |= MaskVal[i] << (i*2);
+        continue;
+      }
+      if (Mask[i+l] != (signed)(MaskVal[i]+l))
+        return false;
+    }
+  }
+  return true;
+}
+
 /// isVPERMILPMask - Return true if the specified VECTOR_SHUFFLE operand
 /// specifies a shuffle of elements that is suitable for input to VPERMILPD*.
 /// Note that VPERMIL mask matching is different depending whether theunderlying
@@ -4100,38 +4198,39 @@ static unsigned getShuffleVPERM2X128Immediate(ShuffleVectorSDNode *SVOp) {
 /// to the same elements of the low, but to the higher half of the source.
 /// In VPERMILPD the two lanes could be shuffled independently of each other
 /// with the same restriction that lanes can't be crossed. Also handles PSHUFDY.
-static bool isVPERMILPMask(ArrayRef<int> Mask, EVT VT, bool HasFp256) {
-  if (!HasFp256)
+static bool isVPERMILPMask(ArrayRef<int> Mask, MVT VT) {
+  unsigned EltSize = VT.getVectorElementType().getSizeInBits();
+  if (VT.getSizeInBits() < 256 || EltSize < 32)
     return false;
-
+  bool symetricMaskRequired = (EltSize == 32);
   unsigned NumElts = VT.getVectorNumElements();
-  // Only match 256-bit with 32/64-bit types
-  if (!VT.is256BitVector() || (NumElts != 4 && NumElts != 8))
-    return false;
 
   unsigned NumLanes = VT.getSizeInBits()/128;
   unsigned LaneSize = NumElts/NumLanes;
+  // 2 or 4 elements in one lane
+
+  SmallVector<int, 4> ExpectedMaskVal(LaneSize, -1);
   for (unsigned l = 0; l != NumElts; l += LaneSize) {
     for (unsigned i = 0; i != LaneSize; ++i) {
       if (!isUndefOrInRange(Mask[i+l], l, l+LaneSize))
         return false;
-      if (NumElts != 8 || l == 0)
-        continue;
-      // VPERMILPS handling
-      if (Mask[i] < 0)
-        continue;
-      if (!isUndefOrEqual(Mask[i+l], Mask[i]+l))
-        return false;
+      if (symetricMaskRequired) {
+        if (ExpectedMaskVal[i] < 0 && Mask[i+l] >= 0) {
+          ExpectedMaskVal[i] = Mask[i+l] - l;
+          continue;
+        }
+        if (!isUndefOrEqual(Mask[i+l], ExpectedMaskVal[i]+l))
+          return false;
+      }
     }
   }
-
   return true;
 }
 
 /// isCommutedMOVLMask - Returns true if the shuffle mask is except the reverse
 /// of what x86 movss want. X86 movs requires the lowest  element to be lowest
 /// element of vector 2 and the other elements to come from vector 1 in order.
-static bool isCommutedMOVLMask(ArrayRef<int> Mask, EVT VT,
+static bool isCommutedMOVLMask(ArrayRef<int> Mask, MVT VT,
                                bool V2IsSplat = false, bool V2IsUndef = false) {
   if (!VT.is128BitVector())
     return false;
@@ -4155,7 +4254,7 @@ static bool isCommutedMOVLMask(ArrayRef<int> Mask, EVT VT,
 /// isMOVSHDUPMask - Return true if the specified VECTOR_SHUFFLE operand
 /// specifies a shuffle of elements that is suitable for input to MOVSHDUP.
 /// Masks to match: <1, 1, 3, 3> or <1, 1, 3, 3, 5, 5, 7, 7>
-static bool isMOVSHDUPMask(ArrayRef<int> Mask, EVT VT,
+static bool isMOVSHDUPMask(ArrayRef<int> Mask, MVT VT,
                            const X86Subtarget *Subtarget) {
   if (!Subtarget->hasSSE3())
     return false;
@@ -4163,7 +4262,8 @@ static bool isMOVSHDUPMask(ArrayRef<int> Mask, EVT VT,
   unsigned NumElems = VT.getVectorNumElements();
 
   if ((VT.is128BitVector() && NumElems != 4) ||
-      (VT.is256BitVector() && NumElems != 8))
+      (VT.is256BitVector() && NumElems != 8) ||
+      (VT.is512BitVector() && NumElems != 16))
     return false;
 
   // "i+1" is the value the indexed mask element must have
@@ -4178,7 +4278,7 @@ static bool isMOVSHDUPMask(ArrayRef<int> Mask, EVT VT,
 
 /// isMOVSLDUPMask - Return true if the specified VECTOR_SHUFFLE operand
 /// specifies a shuffle of elements that is suitable for input to MOVSLDUP.
 /// Masks to match: <0, 0, 2, 2> or <0, 0, 2, 2, 4, 4, 6, 6>
-static bool isMOVSLDUPMask(ArrayRef<int> Mask, EVT VT,
+static bool isMOVSLDUPMask(ArrayRef<int> Mask, MVT VT,
                            const X86Subtarget *Subtarget) {
   if (!Subtarget->hasSSE3())
     return false;
@@ -4186,7 +4286,8 @@ static bool isMOVSLDUPMask(ArrayRef<int> Mask, EVT VT,
   unsigned NumElems = VT.getVectorNumElements();
 
   if ((VT.is128BitVector() && NumElems != 4) ||
-      (VT.is256BitVector() && NumElems != 8))
+      (VT.is256BitVector() && NumElems != 8) ||
+      (VT.is512BitVector() && NumElems != 16))
     return false;
 
   // "i" is the value the indexed mask element must have
@@ -4201,7 +4302,7 @@ static bool isMOVSLDUPMask(ArrayRef<int> Mask, EVT VT,
 /// isMOVDDUPYMask - Return true if the specified VECTOR_SHUFFLE operand
 /// specifies a shuffle of elements that is suitable for input to 256-bit
 /// version of MOVDDUP.
-static bool isMOVDDUPYMask(ArrayRef<int> Mask, EVT VT, bool HasFp256) {
+static bool isMOVDDUPYMask(ArrayRef<int> Mask, MVT VT, bool HasFp256) {
   if (!HasFp256 || !VT.is256BitVector())
     return false;
 
@@ -4221,7 +4322,7 @@ static bool isMOVDDUPYMask(ArrayRef<int> Mask, EVT VT, bool HasFp256) {
 /// isMOVDDUPMask - Return true if the specified VECTOR_SHUFFLE operand
 /// specifies a shuffle of elements that is suitable for input to 128-bit
 /// version of MOVDDUP.
-static bool isMOVDDUPMask(ArrayRef<int> Mask, EVT VT) {
+static bool isMOVDDUPMask(ArrayRef<int> Mask, MVT VT) {
   if (!VT.is128BitVector())
     return false;
 
@@ -4247,7 +4348,7 @@ static bool isVEXTRACTIndex(SDNode *N, unsigned vecWidth) {
   uint64_t Index =
     cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
 
-  MVT VT = N->getValueType(0).getSimpleVT();
+  MVT VT = N->getSimpleValueType(0);
   unsigned ElSize = VT.getVectorElementType().getSizeInBits();
   bool Result = (Index * ElSize) % vecWidth == 0;
 
@@ -4265,7 +4366,7 @@ static bool isVINSERTIndex(SDNode *N, unsigned vecWidth) {
   uint64_t Index =
     cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
 
-  MVT VT = N->getValueType(0).getSimpleVT();
+  MVT VT = N->getSimpleValueType(0);
   unsigned ElSize = VT.getVectorElementType().getSizeInBits();
   bool Result = (Index * ElSize) % vecWidth == 0;
 
@@ -4292,9 +4393,9 @@ bool X86::isVEXTRACT256Index(SDNode *N) {
 /// the specified VECTOR_SHUFFLE mask with PSHUF* and SHUFP* instructions.
 /// Handles 128-bit and 256-bit.
 static unsigned getShuffleSHUFImmediate(ShuffleVectorSDNode *N) {
-  MVT VT = N->getValueType(0).getSimpleVT();
+  MVT VT = N->getSimpleValueType(0);
 
-  assert((VT.is128BitVector() || VT.is256BitVector()) &&
+  assert((VT.getSizeInBits() >= 128) &&
          "Unsupported vector type for PSHUF/SHUFP");
 
   // Handle 128 and 256-bit vector lengths. AVX defines PSHUF/SHUFP to operate
@@ -4303,10 +4404,10 @@ static unsigned getShuffleSHUFImmediate(ShuffleVectorSDNode *N) {
   unsigned NumLanes = VT.getSizeInBits()/128;
   unsigned NumLaneElts = NumElts/NumLanes;
 
-  assert((NumLaneElts == 2 || NumLaneElts == 4) &&
-         "Only supports 2 or 4 elements per lane");
+  assert((NumLaneElts == 2 || NumLaneElts == 4 || NumLaneElts == 8) &&
+         "Only supports 2, 4 or 8 elements per lane");
 
-  unsigned Shift = (NumLaneElts == 4) ? 1 : 0;
+  unsigned Shift = (NumLaneElts >= 4) ? 1 : 0;
   unsigned Mask = 0;
   for (unsigned i = 0; i != NumElts; ++i) {
     int Elt = N->getMaskElt(i);
@@ -4322,7 +4423,7 @@ static unsigned getShuffleSHUFImmediate(ShuffleVectorSDNode *N) {
 /// getShufflePSHUFHWImmediate - Return the appropriate immediate to shuffle
 /// the specified VECTOR_SHUFFLE mask with the PSHUFHW instruction.
 static unsigned getShufflePSHUFHWImmediate(ShuffleVectorSDNode *N) {
-  MVT VT = N->getValueType(0).getSimpleVT();
+  MVT VT = N->getSimpleValueType(0);
 
   assert((VT == MVT::v8i16 || VT == MVT::v16i16) &&
          "Unsupported vector type for PSHUFHW");
@@ -4346,7 +4447,7 @@ static unsigned getShufflePSHUFHWImmediate(ShuffleVectorSDNode *N) {
 /// getShufflePSHUFLWImmediate - Return the appropriate immediate to shuffle
 /// the specified VECTOR_SHUFFLE mask with the PSHUFLW instruction.
 static unsigned getShufflePSHUFLWImmediate(ShuffleVectorSDNode *N) {
-  MVT VT = N->getValueType(0).getSimpleVT();
+  MVT VT = N->getSimpleValueType(0);
 
   assert((VT == MVT::v8i16 || VT == MVT::v16i16) &&
          "Unsupported vector type for PSHUFHW");
@@ -4370,11 +4471,12 @@ static unsigned getShufflePSHUFLWImmediate(ShuffleVectorSDNode *N) {
 /// getShufflePALIGNRImmediate - Return the appropriate immediate to shuffle
 /// the specified VECTOR_SHUFFLE mask with the PALIGNR instruction.
 static unsigned getShufflePALIGNRImmediate(ShuffleVectorSDNode *SVOp) {
-  MVT VT = SVOp->getValueType(0).getSimpleVT();
-  unsigned EltSize = VT.getVectorElementType().getSizeInBits() >> 3;
+  MVT VT = SVOp->getSimpleValueType(0);
+  unsigned EltSize = VT.is512BitVector() ? 1 :
+    VT.getVectorElementType().getSizeInBits() >> 3;
 
   unsigned NumElts = VT.getVectorNumElements();
-  unsigned NumLanes = VT.getSizeInBits()/128;
+  unsigned NumLanes = VT.is512BitVector() ? 1 : VT.getSizeInBits()/128;
   unsigned NumLaneElts = NumElts/NumLanes;
 
   int Val = 0;
@@ -4399,7 +4501,7 @@ static unsigned getExtractVEXTRACTImmediate(SDNode *N, unsigned vecWidth) {
   uint64_t Index =
     cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
 
-  MVT VecVT = N->getOperand(0).getValueType().getSimpleVT();
+  MVT VecVT = N->getOperand(0).getSimpleValueType();
   MVT ElVT = VecVT.getVectorElementType();
 
   unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits();
@@ -4414,7 +4516,7 @@ static unsigned getInsertVINSERTImmediate(SDNode *N, unsigned vecWidth) {
   uint64_t Index =
     cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
 
-  MVT VecVT = N->getValueType(0).getSimpleVT();
+  MVT VecVT = N->getSimpleValueType(0);
   MVT ElVT = VecVT.getVectorElementType();
 
   unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits();
@@ -4449,27 +4551,6 @@ unsigned X86::getInsertVINSERT256Immediate(SDNode *N) {
   return getInsertVINSERTImmediate(N, 256);
 }
 
-/// getShuffleCLImmediate - Return the appropriate immediate to shuffle
-/// the specified VECTOR_SHUFFLE mask with VPERMQ and VPERMPD instructions.
-/// Handles 256-bit.
-static unsigned getShuffleCLImmediate(ShuffleVectorSDNode *N) {
-  MVT VT = N->getValueType(0).getSimpleVT();
-
-  unsigned NumElts = VT.getVectorNumElements();
-
-  assert((VT.is256BitVector() && NumElts == 4) &&
-         "Unsupported vector type for VPERMQ/VPERMPD");
-
-  unsigned Mask = 0;
-  for (unsigned i = 0; i != NumElts; ++i) {
-    int Elt = N->getMaskElt(i);
-    if (Elt < 0)
-      continue;
-    Mask |= Elt << (i*2);
-  }
-
-  return Mask;
-}
 /// isZeroNode - Returns true if Elt is a constant zero or a floating point
 /// constant +0.0.
 bool X86::isZeroNode(SDValue Elt) {
@@ -4484,7 +4565,7 @@ bool X86::isZeroNode(SDValue Elt) {
 /// their permute mask.
 static SDValue CommuteVectorShuffle(ShuffleVectorSDNode *SVOp,
                                     SelectionDAG &DAG) {
-  MVT VT = SVOp->getValueType(0).getSimpleVT();
+  MVT VT = SVOp->getSimpleValueType(0);
   unsigned NumElems = VT.getVectorNumElements();
   SmallVector<int, 8> MaskVec;
 
@@ -4506,7 +4587,7 @@ static SDValue CommuteVectorShuffle(ShuffleVectorSDNode *SVOp,
 /// match movhlps. The lower half elements should come from upper half of
 /// V1 (and in order), and the upper half elements should come from the upper
 /// half of V2 (and in order).
-static bool ShouldXformToMOVHLPS(ArrayRef<int> Mask, EVT VT) {
+static bool ShouldXformToMOVHLPS(ArrayRef<int> Mask, MVT VT) {
   if (!VT.is128BitVector())
     return false;
   if (VT.getVectorNumElements() != 4)
@@ -4563,7 +4644,7 @@ static bool WillBeConstantPoolLoad(SDNode *N) {
 /// half of V2 (and in order). And since V1 will become the source of the
 /// MOVLP, it must be either a vector load or a scalar load to vector.
 static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2,
-                               ArrayRef<int> Mask, EVT VT) {
+                               ArrayRef<int> Mask, MVT VT) {
   if (!VT.is128BitVector())
     return false;
 
@@ -4659,6 +4740,11 @@ static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget,
       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops,
                         array_lengthof(Ops));
     }
+  } else if (VT.is512BitVector()) { // AVX-512
+    SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
+    SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst,
+                      Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
+    Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i32, Ops, 16);
   } else
     llvm_unreachable("Unexpected vector type");
 
@@ -4715,7 +4801,7 @@ static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1,
 }
 
 /// getUnpackl - Returns a vector_shuffle node for an unpackl operation.
-static SDValue getUnpackl(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1,
+static SDValue getUnpackl(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1,
                           SDValue V2) {
   unsigned NumElems = VT.getVectorNumElements();
   SmallVector<int, 8> Mask;
@@ -4727,7 +4813,7 @@ static SDValue getUnpackl(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1,
 }
 
 /// getUnpackh - Returns a vector_shuffle node for an unpackh operation.
-static SDValue getUnpackh(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1,
+static SDValue getUnpackh(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1,
                           SDValue V2) {
   unsigned NumElems = VT.getVectorNumElements();
   SmallVector<int, 8> Mask;
@@ -4743,7 +4829,7 @@ static SDValue getUnpackh(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1,
 // Generate shuffles which repeat i16 and i8 several times until they can be
 // represented by v4f32 and then be manipulated by target suported shuffles.
 static SDValue PromoteSplati8i16(SDValue V, SelectionDAG &DAG, int &EltNo) {
-  EVT VT = V.getValueType();
+  MVT VT = V.getSimpleValueType();
   int NumElems = VT.getVectorNumElements();
   SDLoc dl(V);
 
@@ -4761,7 +4847,7 @@ static SDValue PromoteSplati8i16(SDValue V, SelectionDAG &DAG, int &EltNo) {
 
 /// getLegalSplat - Generate a legal splat with supported x86 shuffles
 static SDValue getLegalSplat(SelectionDAG &DAG, SDValue V, int EltNo) {
-  EVT VT = V.getValueType();
+  MVT VT = V.getSimpleValueType();
   SDLoc dl(V);
 
   if (VT.is128BitVector()) {
@@ -4787,7 +4873,7 @@ static SDValue getLegalSplat(SelectionDAG &DAG, SDValue V, int EltNo) {
 
 /// PromoteSplat - Splat is promoted to target supported vector shuffles.
 static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) {
-  EVT SrcVT = SV->getValueType(0);
+  MVT SrcVT = SV->getSimpleValueType(0);
   SDValue V1 = SV->getOperand(0);
   SDLoc dl(SV);
 
@@ -4810,7 +4896,7 @@ static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) {
   // instruction because the target has no such instruction. Generate shuffles
   // which repeat i16 and i8 several times until they fit in i32, and then can
   // be manipulated by target suported shuffles.
-  EVT EltVT = SrcVT.getVectorElementType();
+  MVT EltVT = SrcVT.getVectorElementType();
   if (EltVT == MVT::i8 || EltVT == MVT::i16)
     V1 = PromoteSplati8i16(V1, DAG, EltNo);
 
@@ -4832,7 +4918,7 @@ static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx,
                                            bool IsZero,
                                            const X86Subtarget *Subtarget,
                                            SelectionDAG &DAG) {
-  EVT VT = V2.getValueType();
+  MVT VT = V2.getSimpleValueType();
   SDValue V1 = IsZero
     ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
   unsigned NumElems = VT.getVectorNumElements();
@@ -4950,7 +5036,7 @@ static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG,
 
   // Recurse into target specific vector shuffles to find scalars.
   if (isTargetShuffle(Opcode)) {
-    MVT ShufVT = V.getValueType().getSimpleVT();
+    MVT ShufVT = V.getSimpleValueType();
    unsigned NumElems = ShufVT.getVectorNumElements();
     SmallVector<int, 16> ShuffleMask;
     bool IsUnary;
@@ -5048,7 +5134,8 @@ bool isShuffleMaskConsecutive(ShuffleVectorSDNode *SVOp,
 /// logical left shift of a vector.
 static bool isVectorShiftRight(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
                                bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
-  unsigned NumElems = SVOp->getValueType(0).getVectorNumElements();
+  unsigned NumElems =
+    SVOp->getSimpleValueType(0).getVectorNumElements();
   unsigned NumZeros = getNumOfConsecutiveZeros(
       SVOp, NumElems, false /* check zeros from right */, DAG,
       SVOp->getMaskElt(0));
@@ -5082,7 +5169,8 @@ static bool isVectorShiftRight(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
 /// logical left shift of a vector.
 static bool isVectorShiftLeft(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
                               bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
-  unsigned NumElems = SVOp->getValueType(0).getVectorNumElements();
+  unsigned NumElems =
+    SVOp->getSimpleValueType(0).getVectorNumElements();
   unsigned NumZeros = getNumOfConsecutiveZeros(
      SVOp, NumElems, true /* check zeros from left */, DAG,
      NumElems - SVOp->getMaskElt(NumElems - 1) - 1);
@@ -5118,7 +5206,7 @@ static bool isVectorShift(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
                           bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
   // Although the logic below support any bitwidth size, there are no
   // shift instructions which handle more than 128-bit vectors.
-  if (!SVOp->getValueType(0).is128BitVector())
+  if (!SVOp->getSimpleValueType(0).is128BitVector())
     return false;
 
   if (isVectorShiftLeft(SVOp, DAG, isLeft, ShVal, ShAmt) ||
@@ -5223,9 +5311,8 @@ static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp,
                      TLI.getScalarShiftAmountTy(SrcOp.getValueType()))));
 }
 
-SDValue
-X86TargetLowering::LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, SDLoc dl,
-                                          SelectionDAG &DAG) const {
+static SDValue
+LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, SDLoc dl, SelectionDAG &DAG) {
 
   // Check if the scalar load can be widened into a vector load. And if
   // the address is "base + cst" see if the cst can be "absorbed" into
@@ -5288,7 +5375,10 @@ X86TargetLowering::LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, SDLoc dl,
                              LD->getPointerInfo().getWithOffset(StartOffset),
                              false, false, false, 0);
 
-    SmallVector<int, 8> Mask(NumElems, EltNo);
+    SmallVector<int, 8> Mask;
+    for (unsigned i = 0; i != NumElems; ++i)
+      Mask.push_back(EltNo);
+
     return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), &Mask[0]);
   }
 
@@ -5305,7 +5395,8 @@ X86TargetLowering::LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, SDLoc dl,
 /// rather than undef via VZEXT_LOAD, but we do not detect that case today.
 /// There's even a handy isZeroNode for that purpose.
 static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts,
                                        SDLoc &DL, SelectionDAG &DAG,
+                                        bool isAfterLegalize) {
   EVT EltVT = VT.getVectorElementType();
   unsigned NumElems = Elts.size();
 
@@ -5341,7 +5432,13 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts,
   // load of the entire vector width starting at the base pointer.  If we found
   // consecutive loads for the low half, generate a vzext_load node.
   if (LastLoadedElt == NumElems - 1) {
+
+    if (isAfterLegalize &&
+        !DAG.getTargetLoweringInfo().isOperationLegal(ISD::LOAD, VT))
+      return SDValue();
+
     SDValue NewLd = SDValue();
+
     if (DAG.InferPtrAlignment(LDBase->getBasePtr()) >= 16)
       NewLd = DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
                           LDBase->getPointerInfo(),
@@ -5398,12 +5495,12 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts,
 /// a scalar load, or a constant.
 /// The VBROADCAST node is returned when a pattern is found,
 /// or SDValue() otherwise.
-SDValue
-X86TargetLowering::LowerVectorBroadcast(SDValue Op, SelectionDAG &DAG) const {
+static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget* Subtarget,
+                                    SelectionDAG &DAG) {
   if (!Subtarget->hasFp256())
     return SDValue();
 
-  MVT VT = Op.getValueType().getSimpleVT();
+  MVT VT = Op.getSimpleValueType();
   SDLoc dl(Op);
 
   assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
@@ -5450,7 +5547,7 @@ X86TargetLowering::LowerVectorBroadcast(SDValue Op, SelectionDAG &DAG) const {
         return SDValue();
 
       // Use the register form of the broadcast instruction available on AVX2.
-      if (VT.is256BitVector())
+      if (VT.getSizeInBits() >= 256)
         Sc = Extract128BitVector(Sc, 0, DAG, dl);
       return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Sc);
     }
@@ -5492,7 +5589,8 @@ X86TargetLowering::LowerVectorBroadcast(SDValue Op, SelectionDAG &DAG) const {
 
       assert(C && "Invalid constant type");
 
-      SDValue CP = DAG.getConstantPool(C, getPointerTy());
+      const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+      SDValue CP = DAG.getConstantPool(C, TLI.getPointerTy());
       unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
       Ld = DAG.getLoad(CVT, dl, DAG.getEntryNode(), CP,
                        MachinePointerInfo::getConstantPool(),
@@ -5528,12 +5626,12 @@ X86TargetLowering::LowerVectorBroadcast(SDValue Op, SelectionDAG &DAG) const {
   return SDValue();
 }
 
-SDValue
-X86TargetLowering::buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) const {
-  EVT VT = Op.getValueType();
+static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
+  MVT VT = Op.getSimpleValueType();
 
   // Skip if insert_vec_elt is not supported.
-  if (!isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
     return SDValue();
 
   SDLoc DL(Op);
@@ -5606,7 +5704,7 @@ X86TargetLowering::buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) const {
 SDValue
 X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const {
-  EVT VT = Op.getValueType();
+  MVT VT = Op.getSimpleValueType();
   assert((VT.getVectorElementType() == MVT::i1) && (VT.getSizeInBits() <= 16) &&
          "Unexpected type in LowerBUILD_VECTORvXi1!");
 
@@ -5645,13 +5743,16 @@ X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const {
     SDValue FullMask = DAG.getNode(ISD::BITCAST, dl, MVT::v16i1,
                                    DAG.getConstant(Immediate, MVT::i16));
     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, FullMask,
-                    DAG.getIntPtrConstant(0));
+                       DAG.getIntPtrConstant(0));
   }
 
-  if (!isSplatVector(Op.getNode()))
-    llvm_unreachable("Unsupported predicate operation");
-
+  // Splat vector (with undefs)
   SDValue In = Op.getOperand(0);
+  for (unsigned i = 1, e = Op.getNumOperands(); i != e; ++i) {
+    if (Op.getOperand(i) != In && Op.getOperand(i).getOpcode() != ISD::UNDEF)
+      llvm_unreachable("Unsupported predicate operation");
+  }
+
   SDValue EFLAGS, X86CC;
   if (In.getOpcode() == ISD::SETCC) {
     SDValue Op0 = In.getOperand(0);
@@ -5679,7 +5780,7 @@ X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const {
   //    res = allOnes ### CMOVNE -1, %res
   //  else
   //    res = allZero
-  MVT InVT = In.getValueType().getSimpleVT();
+  MVT InVT = In.getSimpleValueType();
   SDValue Bit1 = DAG.getNode(ISD::AND, dl, InVT, In, DAG.getConstant(1, InVT));
   EFLAGS = EmitTest(Bit1, X86::COND_NE, DAG);
   X86CC = DAG.getConstant(X86::COND_NE, MVT::i8);
@@ -5708,7 +5809,7 @@ SDValue
 X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
   SDLoc dl(Op);
 
-  MVT VT = Op.getValueType().getSimpleVT();
+  MVT VT = Op.getSimpleValueType();
   MVT ExtVT = VT.getVectorElementType();
   unsigned NumElems = Op.getNumOperands();
 
@@ -5720,7 +5821,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
   if (ISD::isBuildVectorAllZeros(Op.getNode())) {
     // Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd
     // and 2) ensure that i64 scalars are eliminated on x86-32 hosts.
-    if (VT == MVT::v4i32 || VT == MVT::v8i32)
+    if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
       return Op;
 
     return getZeroVector(VT, Subtarget, DAG, dl);
@@ -5733,10 +5834,11 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
     if (VT == MVT::v4i32 || (VT == MVT::v8i32 && Subtarget->hasInt256()))
       return Op;
 
-    return getOnesVector(VT, Subtarget->hasInt256(), DAG, dl);
+    if (!VT.is512BitVector())
+      return getOnesVector(VT, Subtarget->hasInt256(), DAG, dl);
  }
 
-  SDValue Broadcast = LowerVectorBroadcast(Op, DAG);
+  SDValue Broadcast = LowerVectorBroadcast(Op, Subtarget, DAG);
   if (Broadcast.getNode())
     return Broadcast;
 
@@ -5815,7 +5917,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
 
     if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 ||
         (ExtVT == MVT::i64 && Subtarget->is64Bit())) {
-      if (VT.is256BitVector()) {
+      if (VT.is256BitVector() || VT.is512BitVector()) {
        SDValue ZeroVec = getZeroVector(VT, Subtarget, DAG, dl);
         return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, ZeroVec,
                            Item, DAG.getIntPtrConstant(0));
@@ -5980,7 +6082,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
       V[i] = Op.getOperand(i);
 
     // Check for elements which are consecutive loads.
-    SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG);
+    SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG, false);
     if (LD.getNode())
       return LD;
 
@@ -6044,7 +6146,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
 // to create 256-bit vectors from two other 128-bit ones.
 static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
   SDLoc dl(Op);
-  MVT ResVT = Op.getValueType().getSimpleVT();
+  MVT ResVT = Op.getSimpleValueType();
 
   assert((ResVT.is256BitVector() ||
           ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide");
@@ -6073,10 +6175,14 @@ LowerVECTOR_SHUFFLEtoBlend(ShuffleVectorSDNode *SVOp,
   SDValue V1 = SVOp->getOperand(0);
   SDValue V2 = SVOp->getOperand(1);
   SDLoc dl(SVOp);
-  MVT VT = SVOp->getValueType(0).getSimpleVT();
+  MVT VT = SVOp->getSimpleValueType(0);
   MVT EltVT = VT.getVectorElementType();
   unsigned NumElems = VT.getVectorNumElements();
 
+  // There is no blend with immediate in AVX-512.
+  if (VT.is512BitVector())
+    return SDValue();
+
   if (!Subtarget->hasSSE41() || EltVT == MVT::i8)
     return SDValue();
   if (!Subtarget->hasInt256() && VT == MVT::v16i16)
@@ -6382,10 +6488,10 @@ LowerVECTOR_SHUFFLEv8i16(SDValue Op, const X86Subtarget *Subtarget,
 // 1. [ssse3] 1 x pshufb
 // 2. [ssse3] 2 x pshufb + 1 x por
 // 3. [all]   v8i16 shuffle + N x pextrw + rotate + pinsrw
-static
-SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp,
-                                 SelectionDAG &DAG,
-                                 const X86TargetLowering &TLI) {
+static SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp,
                                        const X86Subtarget* Subtarget,
+                                        SelectionDAG &DAG) {
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   SDValue V1 = SVOp->getOperand(0);
   SDValue V2 = SVOp->getOperand(1);
   SDLoc dl(SVOp);
@@ -6401,7 +6507,7 @@ SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp,
   // present, fall back to case 3.
 
   // If SSSE3, use 1 pshufb instruction per vector with elements in the result.
-  if (TLI.getSubtarget()->hasSSSE3()) {
+  if (Subtarget->hasSSSE3()) {
    SmallVector<SDValue,16> pshufbMask;
 
     // If all result elements are from one input vector, then only translate
@@ -6514,7 +6620,7 @@ static
 SDValue LowerVECTOR_SHUFFLEv32i8(ShuffleVectorSDNode *SVOp,
                                  const X86Subtarget *Subtarget,
                                  SelectionDAG &DAG) {
-  MVT VT = SVOp->getValueType(0).getSimpleVT();
+  MVT VT = SVOp->getSimpleValueType(0);
  SDValue V1 = SVOp->getOperand(0);
   SDValue V2 = SVOp->getOperand(1);
   SDLoc dl(SVOp);
@@ -6562,7 +6668,7 @@ SDValue LowerVECTOR_SHUFFLEv32i8(ShuffleVectorSDNode *SVOp,
 static
 SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp,
                                 SelectionDAG &DAG) {
-  MVT VT = SVOp->getValueType(0).getSimpleVT();
+  MVT VT = SVOp->getSimpleValueType(0);
   SDLoc dl(SVOp);
   unsigned NumElems = VT.getVectorNumElements();
   MVT NewVT;
@@ -6599,7 +6705,7 @@ SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp,
 
 /// getVZextMovL - Return a zero-extending vector move low node.
 ///
-static SDValue getVZextMovL(MVT VT, EVT OpVT,
+static SDValue getVZextMovL(MVT VT, MVT OpVT,
                             SDValue SrcOp, SelectionDAG &DAG,
                             const X86Subtarget *Subtarget, SDLoc dl) {
   if (VT == MVT::v2f64 || VT == MVT::v4f32) {
@@ -6641,7 +6747,7 @@ LowerVECTOR_SHUFFLE_256(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
   if (NewOp.getNode())
     return NewOp;
 
-  MVT VT = SVOp->getValueType(0).getSimpleVT();
+  MVT VT = SVOp->getSimpleValueType(0);
 
   unsigned NumElems = VT.getVectorNumElements();
   unsigned NumLaneElems = NumElems / 2;
@@ -6753,7 +6859,7 @@ LowerVECTOR_SHUFFLE_128v4(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
   SDValue V1 = SVOp->getOperand(0);
  SDValue V2 = SVOp->getOperand(1);
   SDLoc dl(SVOp);
-  MVT VT = SVOp->getValueType(0).getSimpleVT();
+  MVT VT = SVOp->getSimpleValueType(0);
 
   assert(VT.is128BitVector() && "Unsupported vector size");
 
@@ -6904,7 +7010,7 @@ static bool MayFoldVectorLoad(SDValue V) {
 
 static
 SDValue getMOVDDup(SDValue &Op, SDLoc &dl, SDValue V1, SelectionDAG &DAG) {
-  EVT VT = Op.getValueType();
+  MVT VT = Op.getSimpleValueType();
 
   // Canonizalize to v2f64.
   V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, V1);
@@ -6918,7 +7024,7 @@ SDValue getMOVLowToHigh(SDValue &Op, SDLoc &dl, SelectionDAG &DAG,
                         bool HasSSE2) {
   SDValue V1 = Op.getOperand(0);
   SDValue V2 = Op.getOperand(1);
-  EVT VT = Op.getValueType();
+  MVT VT = Op.getSimpleValueType();
 
   assert(VT != MVT::v2i64 && "unsupported shuffle type");
 
@@ -6936,7 +7042,7 @@ static
 SDValue getMOVHighToLow(SDValue &Op, SDLoc &dl, SelectionDAG &DAG) {
   SDValue V1 = Op.getOperand(0);
   SDValue V2 = Op.getOperand(1);
-  EVT VT = Op.getValueType();
+  MVT VT = Op.getSimpleValueType();
 
   assert((VT == MVT::v4i32 || VT == MVT::v4f32) &&
          "unsupported shuffle type");
@@ -6952,7 +7058,7 @@ static
 SDValue getMOVLP(SDValue &Op, SDLoc &dl, SelectionDAG &DAG, bool HasSSE2) {
   SDValue V1 = Op.getOperand(0);
   SDValue V2 = Op.getOperand(1);
-  EVT VT = Op.getValueType();
+  MVT VT = Op.getSimpleValueType();
   unsigned NumElems = VT.getVectorNumElements();
 
   // Use MOVLPS and MOVLPD in case V1 or V2 are loads. During isel, the second
@@ -7006,13 +7112,13 @@ SDValue getMOVLP(SDValue &Op, SDLoc &dl, SelectionDAG &DAG, bool HasSSE2) {
 }
 
 // Reduce a vector shuffle to zext.
-SDValue
-X86TargetLowering::LowerVectorIntExtend(SDValue Op, SelectionDAG &DAG) const {
+static SDValue LowerVectorIntExtend(SDValue Op, const X86Subtarget *Subtarget,
+                                    SelectionDAG &DAG) {
   // PMOVZX is only available from SSE41.
   if (!Subtarget->hasSSE41())
     return SDValue();
 
-  EVT VT = Op.getValueType();
+  MVT VT = Op.getSimpleValueType();
 
   // Only AVX2 support 256-bit vector integer extending.
   if (!Subtarget->hasInt256() && VT.is256BitVector())
@@ -7051,12 +7157,11 @@ X86TargetLowering::LowerVectorIntExtend(SDValue Op, SelectionDAG &DAG) const {
     return SDValue();
   }
 
-  LLVMContext *Context = DAG.getContext();
   unsigned NBits = VT.getVectorElementType().getSizeInBits() << Shift;
-  EVT NeVT = EVT::getIntegerVT(*Context, NBits);
-  EVT NVT = EVT::getVectorVT(*Context, NeVT, NumElems >> Shift);
+  MVT NeVT = MVT::getIntegerVT(NBits);
+  MVT NVT = MVT::getVectorVT(NeVT, NumElems >> Shift);
 
-  if (!isTypeLegal(NVT))
+  if (!DAG.getTargetLoweringInfo().isTypeLegal(NVT))
    return SDValue();
 
   // Simplify the operand as it's prepared to be fed into shuffle.
@@ -7064,8 +7169,8 @@ X86TargetLowering::LowerVectorIntExtend(SDValue Op, SelectionDAG &DAG) const {
   if (V1.getOpcode() == ISD::BITCAST &&
       V1.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
       V1.getOperand(0).getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
-      V1.getOperand(0)
-        .getOperand(0).getValueType().getSizeInBits() == SignificantBits) {
+      V1.getOperand(0).getOperand(0)
+        .getSimpleValueType().getSizeInBits() == SignificantBits) {
     // (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast x)
     SDValue V = V1.getOperand(0).getOperand(0).getOperand(0);
     ConstantSDNode *CIdx =
@@ -7074,19 +7179,19 @@ X86TargetLowering::LowerVectorIntExtend(SDValue Op, SelectionDAG &DAG) const {
     // selection to fold it. Otherwise, we will short the conversion sequence.
     if (CIdx && CIdx->getZExtValue() == 0 &&
         (!ISD::isNormalLoad(V.getNode()) || !V.hasOneUse())) {
-      if (V.getValueSizeInBits() > V1.getValueSizeInBits()) {
+      MVT FullVT = V.getSimpleValueType();
+      MVT V1VT = V1.getSimpleValueType();
+      if (FullVT.getSizeInBits() > V1VT.getSizeInBits()) {
         // The "ext_vec_elt" node is wider than the result node.
         // In this case we should extract subvector from V.
        // (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast (extract_subvector x)).
-        unsigned Ratio = V.getValueSizeInBits() / V1.getValueSizeInBits();
-        EVT FullVT = V.getValueType();
-        EVT SubVecVT = EVT::getVectorVT(*Context,
-                                        FullVT.getVectorElementType(),
+        unsigned Ratio = FullVT.getSizeInBits() / V1VT.getSizeInBits();
+        MVT SubVecVT = MVT::getVectorVT(FullVT.getVectorElementType(),
                                        FullVT.getVectorNumElements()/Ratio);
         V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVecVT, V,
                         DAG.getIntPtrConstant(0));
       }
-      V1 = DAG.getNode(ISD::BITCAST, DL, V1.getValueType(), V);
+      V1 = DAG.getNode(ISD::BITCAST, DL, V1VT, V);
     }
   }
 
@@ -7094,10 +7199,11 @@ X86TargetLowering::LowerVectorIntExtend(SDValue Op, SelectionDAG &DAG) const {
                      DAG.getNode(X86ISD::VZEXT, DL, NVT, V1));
 }
 
-SDValue
-X86TargetLowering::NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG) const {
+static SDValue
+NormalizeVectorShuffle(SDValue Op, const X86Subtarget *Subtarget,
+                       SelectionDAG &DAG) {
   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
-  MVT VT = Op.getValueType().getSimpleVT();
+  MVT VT = Op.getSimpleValueType();
   SDLoc dl(Op);
   SDValue V1 = Op.getOperand(0);
   SDValue V2 = Op.getOperand(1);
@@ -7108,13 +7214,13 @@ X86TargetLowering::NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG) const {
   // Handle splat operations
   if (SVOp->isSplat()) {
     // Use vbroadcast whenever the splat comes from a foldable load
-    SDValue Broadcast = LowerVectorBroadcast(Op, DAG);
+    SDValue Broadcast = LowerVectorBroadcast(Op, Subtarget, DAG);
     if (Broadcast.getNode())
       return Broadcast;
   }
 
   // Check integer expanding shuffles.
-  SDValue NewOp = LowerVectorIntExtend(Op, DAG);
+  SDValue NewOp = LowerVectorIntExtend(Op, Subtarget, DAG);
   if (NewOp.getNode())
     return NewOp;
 
@@ -7132,7 +7238,7 @@ X86TargetLowering::NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG) const {
     if (ISD::isBuildVectorAllZeros(V2.getNode())) {
       SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG);
       if (NewOp.getNode()) {
-        MVT NewVT = NewOp.getValueType().getSimpleVT();
+        MVT NewVT = NewOp.getSimpleValueType();
         if (isCommutedMOVLMask(cast<ShuffleVectorSDNode>(NewOp)->getMask(),
                               NewVT, true, false))
           return getVZextMovL(VT, NewVT, NewOp.getOperand(0),
@@ -7141,7 +7247,7 @@ X86TargetLowering::NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG) const {
     } else if (ISD::isBuildVectorAllZeros(V1.getNode())) {
       SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG);
       if (NewOp.getNode()) {
-        MVT NewVT = NewOp.getValueType().getSimpleVT();
+        MVT NewVT = NewOp.getSimpleValueType();
         if (isMOVLMask(cast<ShuffleVectorSDNode>(NewOp)->getMask(), NewVT))
           return getVZextMovL(VT, NewVT, NewOp.getOperand(1),
                               DAG, Subtarget, dl);
@@ -7156,7 +7262,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
   SDValue V1 = Op.getOperand(0);
   SDValue V2 = Op.getOperand(1);
-  MVT VT = Op.getValueType().getSimpleVT();
+  MVT VT = Op.getSimpleValueType();
   SDLoc dl(Op);
   unsigned NumElems = VT.getVectorNumElements();
   bool V1IsUndef = V1.getOpcode() == ISD::UNDEF;
@@ -7194,7 +7300,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
   // Normalize the input vectors. Here splats, zeroed vectors, profitable
   // narrowing and commutation of operands should be handled. The actual code
   // doesn't include all of those, work in progress...
-  SDValue NewOp = NormalizeVectorShuffle(Op, DAG);
+  SDValue NewOp = NormalizeVectorShuffle(Op, Subtarget, DAG);
   if (NewOp.getNode())
     return NewOp;
 
@@ -7354,7 +7460,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
   }
 
   // Normalize the node to match x86 shuffle ops if needed
-  if (!V2IsUndef && (isSHUFPMask(M, VT, HasFp256, /* Commuted */ true)))
+  if (!V2IsUndef && (isSHUFPMask(M, VT, /* Commuted */ true)))
     return CommuteVectorShuffle(SVOp, DAG);
 
   // The checks below are all present in isShuffleMaskLegal, but they are
@@ -7377,7 +7483,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
                                 getShufflePSHUFLWImmediate(SVOp),
                                 DAG);
 
-  if (isSHUFPMask(M, VT, HasFp256))
+  if (isSHUFPMask(M, VT))
     return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V1, V2,
                                getShuffleSHUFImmediate(SVOp), DAG);
 
@@ -7396,8 +7502,8 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
     return getTargetShuffleNode(X86ISD::MOVDDUP, dl, VT, V1, DAG);
 
   // Handle VPERMILPS/D* permutations
-  if (isVPERMILPMask(M, VT, HasFp256)) {
-    if (HasInt256 && VT == MVT::v8i32)
+  if (isVPERMILPMask(M, VT)) {
+    if ((HasInt256 && VT == MVT::v8i32) || VT == MVT::v16i32)
      return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1,
                                   getShuffleSHUFImmediate(SVOp), DAG);
     return getTargetShuffleNode(X86ISD::VPERMILP, dl, VT, V1,
@@ -7413,21 +7519,28 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
   if (BlendOp.getNode())
     return BlendOp;
 
-  if (V2IsUndef && HasInt256 && (VT == MVT::v8i32 || VT == MVT::v8f32)) {
-    SmallVector<SDValue, 8> permclMask;
-    for (unsigned i = 0; i != 8; ++i) {
-      permclMask.push_back(DAG.getConstant((M[i]>=0) ? M[i] : 0, MVT::i32));
+  unsigned Imm8;
+  if (V2IsUndef && HasInt256 && isPermImmMask(M, VT, Imm8))
+    return getTargetShuffleNode(X86ISD::VPERMI, dl, VT, V1, Imm8, DAG);
+
+  if ((V2IsUndef && HasInt256 && VT.is256BitVector() && NumElems == 8) ||
      VT.is512BitVector()) {
+    MVT MaskEltVT = MVT::getIntegerVT(VT.getVectorElementType().getSizeInBits());
+    MVT MaskVectorVT = MVT::getVectorVT(MaskEltVT, NumElems);
+    SmallVector<SDValue, 16> permclMask;
+    for (unsigned i = 0; i != NumElems; ++i) {
+      permclMask.push_back(DAG.getConstant((M[i]>=0) ? M[i] : 0, MaskEltVT));
    }
-    SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32,
-                               &permclMask[0], 8);
-    // Bitcast is for VPERMPS since mask is v8i32 but node takes v8f32
-    return DAG.getNode(X86ISD::VPERMV, dl, VT,
-                       DAG.getNode(ISD::BITCAST, dl, VT, Mask), V1);
-  }
 
-  if (V2IsUndef && HasInt256 && (VT == MVT::v4i64 || VT == MVT::v4f64))
-    return getTargetShuffleNode(X86ISD::VPERMI, dl, VT, V1,
-                                getShuffleCLImmediate(SVOp), DAG);
+    SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, MaskVectorVT,
+                               &permclMask[0], NumElems);
+    if (V2IsUndef)
+      // Bitcast is for VPERMPS since mask is v8i32 but node takes v8f32
+      return DAG.getNode(X86ISD::VPERMV, dl, VT,
+                         DAG.getNode(ISD::BITCAST, dl, VT, Mask), V1);
+    return DAG.getNode(X86ISD::VPERMV3, dl, VT,
+                       DAG.getNode(ISD::BITCAST, dl, VT, Mask), V1, V2);
+  }
 
   //===--------------------------------------------------------------------===//
   // Since no target specific shuffle was selected for this generic one,
@@ -7443,7 +7556,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
   }
 
   if (VT == MVT::v16i8) {
-    SDValue NewOp = LowerVECTOR_SHUFFLEv16i8(SVOp, DAG, *this);
+    SDValue NewOp = LowerVECTOR_SHUFFLEv16i8(SVOp, Subtarget, DAG);
    if (NewOp.getNode())
       return NewOp;
   }
@@ -7467,10 +7580,10 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
 }
 
 static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
-  MVT VT = Op.getValueType().getSimpleVT();
+  MVT VT = Op.getSimpleValueType();
   SDLoc dl(Op);
 
-  if (!Op.getOperand(0).getValueType().getSimpleVT().is128BitVector())
+  if (!Op.getOperand(0).getSimpleValueType().is128BitVector())
     return SDValue();
 
   if (VT.getSizeInBits() == 8) {
@@ -7532,21 +7645,38 @@ SDValue
 X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
                                            SelectionDAG &DAG) const {
   SDLoc dl(Op);
-  if (!isa<ConstantSDNode>(Op.getOperand(1)))
-    return SDValue();
-
   SDValue Vec = Op.getOperand(0);
-  MVT VecVT = Vec.getValueType().getSimpleVT();
+  MVT VecVT = Vec.getSimpleValueType();
+  SDValue Idx = Op.getOperand(1);
+  if (!isa<ConstantSDNode>(Idx)) {
+    if (VecVT.is512BitVector() ||
+        (VecVT.is256BitVector() && Subtarget->hasInt256() &&
+         VecVT.getVectorElementType().getSizeInBits() == 32)) {
+
+      MVT MaskEltVT =
+        MVT::getIntegerVT(VecVT.getVectorElementType().getSizeInBits());
+      MVT MaskVT = MVT::getVectorVT(MaskEltVT, VecVT.getSizeInBits() /
+                                    MaskEltVT.getSizeInBits());
+
+      Idx = DAG.getZExtOrTrunc(Idx, dl, MaskEltVT);
+      SDValue Mask = DAG.getNode(X86ISD::VINSERT, dl, MaskVT,
+                                getZeroVector(MaskVT, Subtarget, DAG, dl),
+                                Idx, DAG.getConstant(0, getPointerTy()));
+      SDValue Perm = DAG.getNode(X86ISD::VPERMV, dl, VecVT, Mask, Vec);
+      return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(),
+                        Perm, DAG.getConstant(0, getPointerTy()));
+    }
+    return SDValue();
+  }
 
   // If this is a 256-bit vector result, first extract the 128-bit vector and
   // then extract the element from the 128-bit vector.
   if (VecVT.is256BitVector() || VecVT.is512BitVector()) {
-    SDValue Idx = Op.getOperand(1);
-    unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
+    unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
 
     // Get the 128-bit vector.
     Vec = Extract128BitVector(Vec, IdxVal, DAG, dl);
-    EVT EltVT = VecVT.getVectorElementType();
+    MVT EltVT = VecVT.getVectorElementType();
 
     unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits();
 
@@ -7565,7 +7695,7 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
     return Res;
   }
 
-  MVT VT = Op.getValueType().getSimpleVT();
+  MVT VT = Op.getSimpleValueType();
   // TODO: handle v16i8.
   if (VT.getSizeInBits() == 16) {
     SDValue Vec = Op.getOperand(0);
@@ -7592,7 +7722,7 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
 
     // SHUFPS the element to the lowest double word, then movss.
     int Mask[4] = { static_cast<int>(Idx), -1, -1, -1 };
-    MVT VVT = Op.getOperand(0).getValueType().getSimpleVT();
+    MVT VVT = Op.getOperand(0).getSimpleValueType();
    SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0),
                                        DAG.getUNDEF(VVT), Mask);
     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
@@ -7611,7 +7741,7 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
    // Note if the lower 64 bits of the result of the UNPCKHPD is then stored
     // to a f64mem, the whole operation is folded into a single MOVHPDmr.
     int Mask[2] = { 1, -1 };
-    MVT VVT = Op.getOperand(0).getValueType().getSimpleVT();
+    MVT VVT = Op.getOperand(0).getSimpleValueType();
    SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0),
                                        DAG.getUNDEF(VVT), Mask);
     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
@@ -7622,7 +7752,7 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
 }
 
 static SDValue LowerINSERT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
-  MVT VT = Op.getValueType().getSimpleVT();
+  MVT VT = Op.getSimpleValueType();
   MVT EltVT = VT.getVectorElementType();
   SDLoc dl(Op);
 
@@ -7676,7 +7806,7 @@ static SDValue LowerINSERT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
 
 SDValue
 X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const {
-  MVT VT = Op.getValueType().getSimpleVT();
+  MVT VT = Op.getSimpleValueType();
   MVT EltVT = VT.getVectorElementType();
   SDLoc dl(Op);
 
@@ -7724,17 +7854,15 @@ X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const {
 }
 
 static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) {
-  LLVMContext *Context = DAG.getContext();
   SDLoc dl(Op);
-  MVT OpVT = Op.getValueType().getSimpleVT();
+  MVT OpVT = Op.getSimpleValueType();
 
   // If this is a 256-bit vector result, first insert into a 128-bit
   // vector and then insert into the 256-bit vector.
   if (!OpVT.is128BitVector()) {
     // Insert into a 128-bit vector.
unsigned SizeFactor = OpVT.getSizeInBits()/128; - EVT VT128 = EVT::getVectorVT(*Context, - OpVT.getVectorElementType(), + MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(), OpVT.getVectorNumElements() / SizeFactor); Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0)); @@ -7762,8 +7890,8 @@ static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget, SDValue In = Op.getOperand(0); SDValue Idx = Op.getOperand(1); unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); - EVT ResVT = Op.getValueType(); - EVT InVT = In.getValueType(); + MVT ResVT = Op.getSimpleValueType(); + MVT InVT = In.getSimpleValueType(); if (Subtarget->hasFp256()) { if (ResVT.is128BitVector() && @@ -7790,16 +7918,16 @@ static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget, SDValue SubVec = Op.getNode()->getOperand(1); SDValue Idx = Op.getNode()->getOperand(2); - if ((Op.getNode()->getValueType(0).is256BitVector() || - Op.getNode()->getValueType(0).is512BitVector()) && - SubVec.getNode()->getValueType(0).is128BitVector() && + if ((Op.getNode()->getSimpleValueType(0).is256BitVector() || + Op.getNode()->getSimpleValueType(0).is512BitVector()) && + SubVec.getNode()->getSimpleValueType(0).is128BitVector() && isa<ConstantSDNode>(Idx)) { unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); return Insert128BitVector(Vec, SubVec, IdxVal, DAG, dl); } - if (Op.getNode()->getValueType(0).is512BitVector() && - SubVec.getNode()->getValueType(0).is256BitVector() && + if (Op.getNode()->getSimpleValueType(0).is512BitVector() && + SubVec.getNode()->getSimpleValueType(0).is256BitVector() && isa<ConstantSDNode>(Idx)) { unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); return Insert256BitVector(Vec, SubVec, IdxVal, DAG, dl); @@ -8108,10 +8236,9 @@ static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG, Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(), is64Bit ? 257 : 256)); - SDValue ThreadPointer = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), - DAG.getIntPtrConstant(0), - MachinePointerInfo(Ptr), - false, false, false, 0); + SDValue ThreadPointer = + DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0), + MachinePointerInfo(Ptr), false, false, false, 0); unsigned char OperandFlags = 0; // Most TLS accesses are not RIP relative, even on x86-64. 
One exception is @@ -8133,21 +8260,20 @@ static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG, // emit "addl x@ntpoff,%eax" (local exec) // or "addl x@indntpoff,%eax" (initial exec) // or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic) - SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, - GA->getValueType(0), - GA->getOffset(), OperandFlags); + SDValue TGA = + DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0), + GA->getOffset(), OperandFlags); SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA); if (model == TLSModel::InitialExec) { if (isPIC && !is64Bit) { Offset = DAG.getNode(ISD::ADD, dl, PtrVT, - DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), + DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Offset); } Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset, - MachinePointerInfo::getGOT(), false, false, false, - 0); + MachinePointerInfo::getGOT(), false, false, false, 0); } // The address of the thread local variable is the add of the thread @@ -8305,6 +8431,11 @@ SDValue X86TargetLowering::LowerShiftParts(SDValue Op, SelectionDAG &DAG) const{ SDValue ShOpLo = Op.getOperand(0); SDValue ShOpHi = Op.getOperand(1); SDValue ShAmt = Op.getOperand(2); + // X86ISD::SHLD and X86ISD::SHRD have defined overflow behavior but the + // generic ISD nodes haven't. Insert an AND to be safe, it's optimized away + // during isel. + SDValue SafeShAmt = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt, + DAG.getConstant(VTBits - 1, MVT::i8)); SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi, DAG.getConstant(VTBits - 1, MVT::i8)) : DAG.getConstant(0, VT); @@ -8312,12 +8443,15 @@ SDValue X86TargetLowering::LowerShiftParts(SDValue Op, SelectionDAG &DAG) const{ SDValue Tmp2, Tmp3; if (Op.getOpcode() == ISD::SHL_PARTS) { Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt); - Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt); + Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, SafeShAmt); } else { Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt); - Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, ShAmt); + Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, SafeShAmt); } + // If the shift amount is larger or equal than the width of a part we can't + // rely on the results of shld/shrd. Insert a test and select the appropriate + // values for large shift amounts. SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt, DAG.getConstant(VTBits, MVT::i8)); SDValue Cond = DAG.getNode(X86ISD::CMP, dl, MVT::i32, @@ -8750,9 +8884,9 @@ X86TargetLowering:: FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG, const X86Subtarget *Subtarget) { - MVT VT = Op->getValueType(0).getSimpleVT(); + MVT VT = Op->getSimpleValueType(0); SDValue In = Op->getOperand(0); - MVT InVT = In.getValueType().getSimpleVT(); + MVT InVT = In.getSimpleValueType(); SDLoc dl(Op); // Optimize vectors in AVX mode: @@ -8768,7 +8902,8 @@ static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG, // Concat upper and lower parts. 
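A few hunks up, LowerShiftParts gains SafeShAmt plus a select on bit log2(VTBits) of the shift amount. A minimal scalar model of the resulting SHL_PARTS sequence, assuming 32-bit parts; uint32_t arithmetic stands in for the DAG nodes and CMOVs here, this is not the lowering itself:

    #include <cstdint>
    #include <cstdio>

    static void shl_parts(uint32_t Lo, uint32_t Hi, unsigned Amt,
                          uint32_t &OutLo, uint32_t &OutHi) {
      const unsigned VTBits = 32;
      unsigned Safe = Amt & (VTBits - 1);            // the SafeShAmt AND
      uint32_t Tmp2 = (Hi << Safe) |                 // shld: high part takes
          (Safe ? (Lo >> (VTBits - Safe)) : 0);      // bits shifted in from low
      uint32_t Tmp3 = Lo << Safe;                    // candidate low part
      bool Big = (Amt & VTBits) != 0;                // amount >= part width?
      OutHi = Big ? Tmp3 : Tmp2;                     // select, like the CMOVs
      OutLo = Big ? 0    : Tmp3;
    }

    int main() {
      uint32_t Lo, Hi;
      shl_parts(0x80000001u, 0, 33, Lo, Hi);         // i64 0x80000001 << 33
      std::printf("%08x:%08x\n", Hi, Lo);            // prints 00000002:00000000
    }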
// - if (((VT != MVT::v8i32) || (InVT != MVT::v8i16)) && + if (((VT != MVT::v16i16) || (InVT != MVT::v16i8)) && + ((VT != MVT::v8i32) || (InVT != MVT::v8i16)) && ((VT != MVT::v4i64) || (InVT != MVT::v4i32))) return SDValue(); @@ -8790,8 +8925,39 @@ static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG, return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi); } -SDValue X86TargetLowering::LowerANY_EXTEND(SDValue Op, - SelectionDAG &DAG) const { +static SDValue LowerZERO_EXTEND_AVX512(SDValue Op, + SelectionDAG &DAG) { + MVT VT = Op->getValueType(0).getSimpleVT(); + SDValue In = Op->getOperand(0); + MVT InVT = In.getValueType().getSimpleVT(); + SDLoc DL(Op); + unsigned int NumElts = VT.getVectorNumElements(); + if (NumElts != 8 && NumElts != 16) + return SDValue(); + + if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1) + return DAG.getNode(X86ISD::VZEXT, DL, VT, In); + + EVT ExtVT = (NumElts == 8)? MVT::v8i64 : MVT::v16i32; + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + // Now we have only mask extension + assert(InVT.getVectorElementType() == MVT::i1); + SDValue Cst = DAG.getTargetConstant(1, ExtVT.getScalarType()); + const Constant *C = (dyn_cast<ConstantSDNode>(Cst))->getConstantIntValue(); + SDValue CP = DAG.getConstantPool(C, TLI.getPointerTy()); + unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment(); + SDValue Ld = DAG.getLoad(Cst.getValueType(), DL, DAG.getEntryNode(), CP, + MachinePointerInfo::getConstantPool(), + false, false, false, Alignment); + + SDValue Brcst = DAG.getNode(X86ISD::VBROADCASTM, DL, ExtVT, In, Ld); + if (VT.is512BitVector()) + return Brcst; + return DAG.getNode(X86ISD::VTRUNC, DL, VT, Brcst); +} + +static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { if (Subtarget->hasFp256()) { SDValue Res = LowerAVXExtend(Op, DAG, Subtarget); if (Res.getNode()) @@ -8800,12 +8966,16 @@ SDValue X86TargetLowering::LowerANY_EXTEND(SDValue Op, return SDValue(); } -SDValue X86TargetLowering::LowerZERO_EXTEND(SDValue Op, - SelectionDAG &DAG) const { + +static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { SDLoc DL(Op); - MVT VT = Op.getValueType().getSimpleVT(); + MVT VT = Op.getSimpleValueType(); SDValue In = Op.getOperand(0); - MVT SVT = In.getValueType().getSimpleVT(); + MVT SVT = In.getSimpleValueType(); + + if (VT.is512BitVector() || SVT.getVectorElementType() == MVT::i1) + return LowerZERO_EXTEND_AVX512(Op, DAG); if (Subtarget->hasFp256()) { SDValue Res = LowerAVXExtend(Op, DAG, Subtarget); @@ -8813,33 +8983,44 @@ SDValue X86TargetLowering::LowerZERO_EXTEND(SDValue Op, return Res; } - if (!VT.is256BitVector() || !SVT.is128BitVector() || - VT.getVectorNumElements() != SVT.getVectorNumElements()) - return SDValue(); - - assert(Subtarget->hasFp256() && "256-bit vector is observed without AVX!"); - - // AVX2 has better support of integer extending. 
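LowerZERO_EXTEND_AVX512 above handles the mask case by broadcasting a constant-pool 1 under the k-register (X86ISD::VBROADCASTM): every set mask bit becomes a lane holding 1, which is exactly zero-extension of i1. A sketch of that lane-level effect, using std::bitset as a stand-in for the mask register; a model only, not the DAG path:

    #include <array>
    #include <bitset>
    #include <cstdio>

    static std::array<int, 16> zext_mask(std::bitset<16> K) {
      std::array<int, 16> V{};
      for (unsigned i = 0; i != 16; ++i)
        V[i] = K[i] ? 1 : 0;   // broadcast of 1, masked by K
      return V;
    }

    int main() {
      auto V = zext_mask(std::bitset<16>(0b101));
      std::printf("%d %d %d %d\n", V[0], V[1], V[2], V[3]); // prints 1 0 1 0
    }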
- if (Subtarget->hasInt256()) - return DAG.getNode(X86ISD::VZEXT, DL, VT, In); - - SDValue Lo = DAG.getNode(X86ISD::VZEXT, DL, MVT::v4i32, In); - static const int Mask[] = {4, 5, 6, 7, -1, -1, -1, -1}; - SDValue Hi = DAG.getNode(X86ISD::VZEXT, DL, MVT::v4i32, - DAG.getVectorShuffle(MVT::v8i16, DL, In, - DAG.getUNDEF(MVT::v8i16), - &Mask[0])); - - return DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i32, Lo, Hi); + assert(!VT.is256BitVector() || !SVT.is128BitVector() || + VT.getVectorNumElements() != SVT.getVectorNumElements()); + return SDValue(); } SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); - MVT VT = Op.getValueType().getSimpleVT(); + MVT VT = Op.getSimpleValueType(); SDValue In = Op.getOperand(0); - MVT SVT = In.getValueType().getSimpleVT(); - - if ((VT == MVT::v4i32) && (SVT == MVT::v4i64)) { + MVT InVT = In.getSimpleValueType(); + assert(VT.getVectorNumElements() == InVT.getVectorNumElements() && + "Invalid TRUNCATE operation"); + + if (InVT.is512BitVector() || VT.getVectorElementType() == MVT::i1) { + if (VT.getVectorElementType().getSizeInBits() >=8) + return DAG.getNode(X86ISD::VTRUNC, DL, VT, In); + + assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type"); + unsigned NumElts = InVT.getVectorNumElements(); + assert ((NumElts == 8 || NumElts == 16) && "Unexpected vector type"); + if (InVT.getSizeInBits() < 512) { + MVT ExtVT = (NumElts == 16)? MVT::v16i32 : MVT::v8i64; + In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In); + InVT = ExtVT; + } + SDValue Cst = DAG.getTargetConstant(1, InVT.getVectorElementType()); + const Constant *C = (dyn_cast<ConstantSDNode>(Cst))->getConstantIntValue(); + SDValue CP = DAG.getConstantPool(C, getPointerTy()); + unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment(); + SDValue Ld = DAG.getLoad(Cst.getValueType(), DL, DAG.getEntryNode(), CP, + MachinePointerInfo::getConstantPool(), + false, false, false, Alignment); + SDValue OneV = DAG.getNode(X86ISD::VBROADCAST, DL, InVT, Ld); + SDValue And = DAG.getNode(ISD::AND, DL, InVT, OneV, In); + return DAG.getNode(X86ISD::TESTM, DL, VT, And, And); + } + + if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) { // On AVX2, v4i64 -> v4i32 becomes VPERMD. if (Subtarget->hasInt256()) { static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1}; @@ -8870,7 +9051,7 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { return DAG.getVectorShuffle(VT, DL, OpLo, OpHi, ShufMask2); } - if ((VT == MVT::v8i16) && (SVT == MVT::v8i32)) { + if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) { // On AVX2, v8i32 -> v8i16 becomed PSHUFB. if (Subtarget->hasInt256()) { In = DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, In); @@ -8928,11 +9109,9 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { } // Handle truncation of V256 to V128 using shuffles. 
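The new i1 branch of LowerTRUNCATE goes the other way: AND each lane with a broadcast 1 and let X86ISD::TESTM set the mask bit wherever the result is non-zero, so the mask ends up holding bit 0 of every lane. The same logic as a scalar sketch; lane count and names are simplified:

    #include <bitset>
    #include <cstdint>
    #include <cstdio>

    static std::bitset<16> trunc_to_mask(const uint32_t (&V)[16]) {
      std::bitset<16> K;
      for (unsigned i = 0; i != 16; ++i)
        K[i] = (V[i] & 1u) != 0;   // AND with broadcast 1, then TESTM
      return K;
    }

    int main() {
      uint32_t V[16] = {3, 2, 1, 0};          // lanes 4..15 default to zero
      std::printf("%s\n", trunc_to_mask(V).to_string().c_str());
      // prints 0000000000000101 (bit set where the lane was odd)
    }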
- if (!VT.is128BitVector() || !SVT.is256BitVector()) + if (!VT.is128BitVector() || !InVT.is256BitVector()) return SDValue(); - assert(VT.getVectorNumElements() != SVT.getVectorNumElements() && - "Invalid op"); assert(Subtarget->hasFp256() && "256-bit vector without AVX!"); unsigned NumElems = VT.getVectorNumElements(); @@ -8952,7 +9131,7 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) const { - MVT VT = Op.getValueType().getSimpleVT(); + MVT VT = Op.getSimpleValueType(); if (VT.isVector()) { if (VT == MVT::v8i16) return DAG.getNode(ISD::TRUNCATE, SDLoc(Op), VT, @@ -8996,9 +9175,9 @@ SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op, static SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) { SDLoc DL(Op); - MVT VT = Op.getValueType().getSimpleVT(); + MVT VT = Op.getSimpleValueType(); SDValue In = Op.getOperand(0); - MVT SVT = In.getValueType().getSimpleVT(); + MVT SVT = In.getSimpleValueType(); assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!"); @@ -9010,7 +9189,7 @@ static SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) { SDValue X86TargetLowering::LowerFABS(SDValue Op, SelectionDAG &DAG) const { LLVMContext *Context = DAG.getContext(); SDLoc dl(Op); - MVT VT = Op.getValueType().getSimpleVT(); + MVT VT = Op.getSimpleValueType(); MVT EltVT = VT; unsigned NumElts = VT == MVT::f64 ? 2 : 4; if (VT.isVector()) { @@ -9044,7 +9223,7 @@ SDValue X86TargetLowering::LowerFABS(SDValue Op, SelectionDAG &DAG) const { SDValue X86TargetLowering::LowerFNEG(SDValue Op, SelectionDAG &DAG) const { LLVMContext *Context = DAG.getContext(); SDLoc dl(Op); - MVT VT = Op.getValueType().getSimpleVT(); + MVT VT = Op.getSimpleValueType(); MVT EltVT = VT; unsigned NumElts = VT == MVT::f64 ? 2 : 4; if (VT.isVector()) { @@ -9065,7 +9244,7 @@ SDValue X86TargetLowering::LowerFNEG(SDValue Op, SelectionDAG &DAG) const { MachinePointerInfo::getConstantPool(), false, false, false, Alignment); if (VT.isVector()) { - MVT XORVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64; + MVT XORVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits()/64); return DAG.getNode(ISD::BITCAST, dl, VT, DAG.getNode(ISD::XOR, dl, XORVT, DAG.getNode(ISD::BITCAST, dl, XORVT, @@ -9081,8 +9260,8 @@ SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const { SDValue Op0 = Op.getOperand(0); SDValue Op1 = Op.getOperand(1); SDLoc dl(Op); - MVT VT = Op.getValueType().getSimpleVT(); - MVT SrcVT = Op1.getValueType().getSimpleVT(); + MVT VT = Op.getSimpleValueType(); + MVT SrcVT = Op1.getSimpleValueType(); // If second operand is smaller, extend it first. if (SrcVT.bitsLT(VT)) { @@ -9158,7 +9337,7 @@ SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const { static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) { SDValue N0 = Op.getOperand(0); SDLoc dl(Op); - MVT VT = Op.getValueType().getSimpleVT(); + MVT VT = Op.getSimpleValueType(); // Lower ISD::FGETSIGN to (AND (X86ISD::FGETSIGNx86 ...) 1). SDValue xFGETSIGN = DAG.getNode(X86ISD::FGETSIGNx86, dl, VT, N0, @@ -9168,8 +9347,8 @@ static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) { // LowerVectorAllZeroTest - Check whether an OR'd tree is PTEST-able. 
// -SDValue X86TargetLowering::LowerVectorAllZeroTest(SDValue Op, - SelectionDAG &DAG) const { +static SDValue LowerVectorAllZeroTest(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { assert(Op.getOpcode() == ISD::OR && "Only check OR'd tree."); if (!Subtarget->hasSSE41()) @@ -9294,7 +9473,7 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, unsigned NumOperands = 0; // Truncate operations may prevent the merge of the SETCC instruction - // and the arithmetic intruction before it. Attempt to truncate the operands + // and the arithmetic instruction before it. Attempt to truncate the operands // of the arithmetic instruction and use a reduced bit-width instruction. bool NeedTruncation = false; SDValue ArithOp = Op; @@ -9402,7 +9581,7 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, case ISD::AND: Opcode = X86ISD::AND; break; case ISD::OR: { if (!NeedTruncation && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) { - SDValue EFLAGS = LowerVectorAllZeroTest(Op, DAG); + SDValue EFLAGS = LowerVectorAllZeroTest(Op, Subtarget, DAG); if (EFLAGS.getNode()) return EFLAGS; } @@ -9638,7 +9817,7 @@ static int translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0, // Lower256IntVSETCC - Break a VSETCC 256-bit integer VSETCC into two new 128 // ones, and then concatenate the result back. static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) { - MVT VT = Op.getValueType().getSimpleVT(); + MVT VT = Op.getSimpleValueType(); assert(VT.is256BitVector() && Op.getOpcode() == ISD::SETCC && "Unsupported value type for operation"); @@ -9665,25 +9844,62 @@ static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) { DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2, CC)); } +static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) { + SDValue Op0 = Op.getOperand(0); + SDValue Op1 = Op.getOperand(1); + SDValue CC = Op.getOperand(2); + MVT VT = Op.getSimpleValueType(); + + assert(Op0.getValueType().getVectorElementType().getSizeInBits() >= 32 && + Op.getValueType().getScalarType() == MVT::i1 && + "Cannot set masked compare for this operation"); + + ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get(); + SDLoc dl(Op); + + bool Unsigned = false; + unsigned SSECC; + switch (SetCCOpcode) { + default: llvm_unreachable("Unexpected SETCC condition"); + case ISD::SETNE: SSECC = 4; break; + case ISD::SETEQ: SSECC = 0; break; + case ISD::SETUGT: Unsigned = true; + case ISD::SETGT: SSECC = 6; break; // NLE + case ISD::SETULT: Unsigned = true; + case ISD::SETLT: SSECC = 1; break; + case ISD::SETUGE: Unsigned = true; + case ISD::SETGE: SSECC = 5; break; // NLT + case ISD::SETULE: Unsigned = true; + case ISD::SETLE: SSECC = 2; break; + } + unsigned Opc = Unsigned ? 
X86ISD::CMPMU: X86ISD::CMPM; + return DAG.getNode(Opc, dl, VT, Op0, Op1, + DAG.getConstant(SSECC, MVT::i8)); + +} + static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG) { - SDValue Cond; SDValue Op0 = Op.getOperand(0); SDValue Op1 = Op.getOperand(1); SDValue CC = Op.getOperand(2); - MVT VT = Op.getValueType().getSimpleVT(); + MVT VT = Op.getSimpleValueType(); ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get(); - bool isFP = Op.getOperand(1).getValueType().getSimpleVT().isFloatingPoint(); + bool isFP = Op.getOperand(1).getSimpleValueType().isFloatingPoint(); SDLoc dl(Op); if (isFP) { #ifndef NDEBUG - MVT EltVT = Op0.getValueType().getVectorElementType().getSimpleVT(); + MVT EltVT = Op0.getSimpleValueType().getVectorElementType(); assert(EltVT == MVT::f32 || EltVT == MVT::f64); #endif unsigned SSECC = translateX86FSETCC(SetCCOpcode, Op0, Op1); - + unsigned Opc = X86ISD::CMPP; + if (Subtarget->hasAVX512() && VT.getVectorElementType() == MVT::i1) { + assert(VT.getVectorNumElements() <= 16); + Opc = X86ISD::CMPM; + } // In the two special cases we can't handle, emit two comparisons. if (SSECC == 8) { unsigned CC0, CC1; @@ -9695,14 +9911,14 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget, CC0 = 7; CC1 = 4; CombineOpc = ISD::AND; } - SDValue Cmp0 = DAG.getNode(X86ISD::CMPP, dl, VT, Op0, Op1, + SDValue Cmp0 = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(CC0, MVT::i8)); - SDValue Cmp1 = DAG.getNode(X86ISD::CMPP, dl, VT, Op0, Op1, + SDValue Cmp1 = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(CC1, MVT::i8)); return DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1); } // Handle all other FP comparisons here. - return DAG.getNode(X86ISD::CMPP, dl, VT, Op0, Op1, + return DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(SSECC, MVT::i8)); } @@ -9710,6 +9926,24 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget, if (VT.is256BitVector() && !Subtarget->hasInt256()) return Lower256IntVSETCC(Op, DAG); + bool MaskResult = (VT.getVectorElementType() == MVT::i1); + EVT OpVT = Op1.getValueType(); + if (Subtarget->hasAVX512()) { + if (Op1.getValueType().is512BitVector() || + (MaskResult && OpVT.getVectorElementType().getSizeInBits() >= 32)) + return LowerIntVSETCC_AVX512(Op, DAG); + + // In AVX-512 architecture setcc returns mask with i1 elements, + // But there is no compare instruction for i8 and i16 elements. + // We are not talking about 512-bit operands in this case, these + // types are illegal. + if (MaskResult && + (OpVT.getVectorElementType().getSizeInBits() < 32 && + OpVT.getVectorElementType().getSizeInBits() >= 8)) + return DAG.getNode(ISD::TRUNCATE, dl, VT, + DAG.getNode(ISD::SETCC, dl, OpVT, Op0, Op1, CC)); + } + // We are handling one of the integer comparisons here. Since SSE only has // GT and EQ comparisons for integer, swapping operands and multiple // operations may be required for some comparisons. @@ -9719,15 +9953,18 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget, switch (SetCCOpcode) { default: llvm_unreachable("Unexpected SETCC condition"); case ISD::SETNE: Invert = true; - case ISD::SETEQ: Opc = X86ISD::PCMPEQ; break; + case ISD::SETEQ: Opc = MaskResult? X86ISD::PCMPEQM: X86ISD::PCMPEQ; break; case ISD::SETLT: Swap = true; - case ISD::SETGT: Opc = X86ISD::PCMPGT; break; + case ISD::SETGT: Opc = MaskResult? 
X86ISD::PCMPGTM: X86ISD::PCMPGT; break; case ISD::SETGE: Swap = true; - case ISD::SETLE: Opc = X86ISD::PCMPGT; Invert = true; break; + case ISD::SETLE: Opc = MaskResult? X86ISD::PCMPGTM: X86ISD::PCMPGT; + Invert = true; break; case ISD::SETULT: Swap = true; - case ISD::SETUGT: Opc = X86ISD::PCMPGT; FlipSigns = true; break; + case ISD::SETUGT: Opc = MaskResult? X86ISD::PCMPGTM: X86ISD::PCMPGT; + FlipSigns = true; break; case ISD::SETUGE: Swap = true; - case ISD::SETULE: Opc = X86ISD::PCMPGT; FlipSigns = true; Invert = true; break; + case ISD::SETULE: Opc = MaskResult? X86ISD::PCMPGTM: X86ISD::PCMPGT; + FlipSigns = true; Invert = true; break; } // Special case: Use min/max operations for SETULE/SETUGE @@ -9841,7 +10078,7 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget, SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { - MVT VT = Op.getValueType().getSimpleVT(); + MVT VT = Op.getSimpleValueType(); if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG); @@ -9885,7 +10122,7 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { } } - bool isFP = Op1.getValueType().getSimpleVT().isFloatingPoint(); + bool isFP = Op1.getSimpleValueType().isFloatingPoint(); unsigned X86CC = TranslateX86CC(CC, isFP, Op0, Op1, DAG); if (X86CC == X86::COND_INVALID) return SDValue(); @@ -10040,7 +10277,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { SDValue Cmp = Cond.getOperand(1); unsigned Opc = Cmp.getOpcode(); - MVT VT = Op.getValueType().getSimpleVT(); + MVT VT = Op.getSimpleValueType(); bool IllegalFPCMov = false; if (VT.isFloatingPoint() && !VT.isVector() && @@ -10149,15 +10386,50 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { return DAG.getNode(X86ISD::CMOV, DL, VTs, Ops, array_lengthof(Ops)); } -SDValue X86TargetLowering::LowerSIGN_EXTEND(SDValue Op, - SelectionDAG &DAG) const { - MVT VT = Op->getValueType(0).getSimpleVT(); +static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op, SelectionDAG &DAG) { + MVT VT = Op->getSimpleValueType(0); SDValue In = Op->getOperand(0); - MVT InVT = In.getValueType().getSimpleVT(); + MVT InVT = In.getSimpleValueType(); + SDLoc dl(Op); + + unsigned int NumElts = VT.getVectorNumElements(); + if (NumElts != 8 && NumElts != 16) + return SDValue(); + + if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1) + return DAG.getNode(X86ISD::VSEXT, dl, VT, In); + + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + assert (InVT.getVectorElementType() == MVT::i1 && "Unexpected vector type"); + + MVT ExtVT = (NumElts == 8) ? 
MVT::v8i64 : MVT::v16i32; + Constant *C = ConstantInt::get(*DAG.getContext(), + APInt::getAllOnesValue(ExtVT.getScalarType().getSizeInBits())); + + SDValue CP = DAG.getConstantPool(C, TLI.getPointerTy()); + unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment(); + SDValue Ld = DAG.getLoad(ExtVT.getScalarType(), dl, DAG.getEntryNode(), CP, + MachinePointerInfo::getConstantPool(), + false, false, false, Alignment); + SDValue Brcst = DAG.getNode(X86ISD::VBROADCASTM, dl, ExtVT, In, Ld); + if (VT.is512BitVector()) + return Brcst; + return DAG.getNode(X86ISD::VTRUNC, dl, VT, Brcst); +} + +static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + MVT VT = Op->getSimpleValueType(0); + SDValue In = Op->getOperand(0); + MVT InVT = In.getSimpleValueType(); SDLoc dl(Op); + if (VT.is512BitVector() || InVT.getVectorElementType() == MVT::i1) + return LowerSIGN_EXTEND_AVX512(Op, DAG); + if ((VT != MVT::v4i64 || InVT != MVT::v4i32) && - (VT != MVT::v8i32 || InVT != MVT::v8i16)) + (VT != MVT::v8i32 || InVT != MVT::v8i16) && + (VT != MVT::v16i16 || InVT != MVT::v16i8)) return SDValue(); if (Subtarget->hasInt256()) @@ -10502,7 +10774,8 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, // Get the inputs. SDValue Chain = Op.getOperand(0); SDValue Size = Op.getOperand(1); - // FIXME: Ensure alignment here + unsigned Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue(); + EVT VT = Op.getNode()->getValueType(0); bool Is64Bit = Subtarget->is64Bit(); EVT SPTy = Is64Bit ? MVT::i64 : MVT::i32; @@ -10540,14 +10813,20 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Flag); - Flag = Chain.getValue(1); const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo()); - Chain = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(), - SPTy).getValue(1); + unsigned SPReg = RegInfo->getStackRegister(); + SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy); + Chain = SP.getValue(1); - SDValue Ops1[2] = { Chain.getValue(0), Chain }; + if (Align) { + SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0), + DAG.getConstant(-(uint64_t)Align, VT)); + Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP); + } + + SDValue Ops1[2] = { SP, Chain }; return DAG.getMergeValues(Ops1, 2, dl); } } @@ -10698,6 +10977,26 @@ static SDValue LowerVACOPY(SDValue Op, const X86Subtarget *Subtarget, MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV)); } +// getTargetVShiftByConstNode - Handle vector element shifts where the shift +// amount is a constant. Takes immediate version of shift as input. +static SDValue getTargetVShiftByConstNode(unsigned Opc, SDLoc dl, EVT VT, + SDValue SrcOp, uint64_t ShiftAmt, + SelectionDAG &DAG) { + + // Check for ShiftAmt >= element width + if (ShiftAmt >= VT.getVectorElementType().getSizeInBits()) { + if (Opc == X86ISD::VSRAI) + ShiftAmt = VT.getVectorElementType().getSizeInBits() - 1; + else + return DAG.getConstant(0, VT); + } + + assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI) + && "Unknown target vector shift-by-constant node"); + + return DAG.getNode(Opc, dl, VT, SrcOp, DAG.getConstant(ShiftAmt, MVT::i8)); +} + // getTargetVShiftNode - Handle vector element shifts where the shift amount // may or may not be a constant. Takes immediate version of shift as input. 
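getTargetVShiftByConstNode, added just above, folds out-of-range immediates instead of emitting them: a logical shift by >= the element width becomes a zero vector, while an arithmetic shift saturates to width-1 so the sign bit keeps replicating. A scalar model of that rule on one int32_t lane (assuming the usual arithmetic right shift for signed values); a sketch, not the DAG helper:

    #include <cstdint>
    #include <cstdio>

    enum ShiftKind { SHL, SRL, SRA };

    static int32_t vshift_by_const(ShiftKind K, int32_t X, unsigned Amt) {
      const unsigned EltBits = 32;
      if (Amt >= EltBits) {
        if (K != SRA)
          return 0;             // shl/srl by >= width is all zeros
        Amt = EltBits - 1;      // sra saturates to width-1
      }
      switch (K) {
      case SHL: return X << Amt;
      case SRL: return (int32_t)((uint32_t)X >> Amt);
      case SRA: return X >> Amt;
      }
      return 0;
    }

    int main() {
      std::printf("%d %d\n", vshift_by_const(SRA, -8, 40),   // prints -1
                             vshift_by_const(SRL, -8, 40));  // and 0
    }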
static SDValue getTargetVShiftNode(unsigned Opc, SDLoc dl, EVT VT, @@ -10705,18 +11004,10 @@ static SDValue getTargetVShiftNode(unsigned Opc, SDLoc dl, EVT VT, SelectionDAG &DAG) { assert(ShAmt.getValueType() == MVT::i32 && "ShAmt is not i32"); - if (isa<ConstantSDNode>(ShAmt)) { - // Constant may be a TargetConstant. Use a regular constant. - uint32_t ShiftAmt = cast<ConstantSDNode>(ShAmt)->getZExtValue(); - switch (Opc) { - default: llvm_unreachable("Unknown target vector shift node"); - case X86ISD::VSHLI: - case X86ISD::VSRLI: - case X86ISD::VSRAI: - return DAG.getNode(Opc, dl, VT, SrcOp, - DAG.getConstant(ShiftAmt, MVT::i32)); - } - } + // Catch shift-by-constant. + if (ConstantSDNode *CShAmt = dyn_cast<ConstantSDNode>(ShAmt)) + return getTargetVShiftByConstNode(Opc, dl, VT, SrcOp, + CShAmt->getZExtValue(), DAG); // Change opcode to non-immediate version switch (Opc) { @@ -10919,24 +11210,32 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) { case Intrinsic::x86_avx2_pmaxu_b: case Intrinsic::x86_avx2_pmaxu_w: case Intrinsic::x86_avx2_pmaxu_d: + case Intrinsic::x86_avx512_pmaxu_d: + case Intrinsic::x86_avx512_pmaxu_q: case Intrinsic::x86_sse2_pminu_b: case Intrinsic::x86_sse41_pminuw: case Intrinsic::x86_sse41_pminud: case Intrinsic::x86_avx2_pminu_b: case Intrinsic::x86_avx2_pminu_w: case Intrinsic::x86_avx2_pminu_d: + case Intrinsic::x86_avx512_pminu_d: + case Intrinsic::x86_avx512_pminu_q: case Intrinsic::x86_sse41_pmaxsb: case Intrinsic::x86_sse2_pmaxs_w: case Intrinsic::x86_sse41_pmaxsd: case Intrinsic::x86_avx2_pmaxs_b: case Intrinsic::x86_avx2_pmaxs_w: case Intrinsic::x86_avx2_pmaxs_d: + case Intrinsic::x86_avx512_pmaxs_d: + case Intrinsic::x86_avx512_pmaxs_q: case Intrinsic::x86_sse41_pminsb: case Intrinsic::x86_sse2_pmins_w: case Intrinsic::x86_sse41_pminsd: case Intrinsic::x86_avx2_pmins_b: case Intrinsic::x86_avx2_pmins_w: - case Intrinsic::x86_avx2_pmins_d: { + case Intrinsic::x86_avx2_pmins_d: + case Intrinsic::x86_avx512_pmins_d: + case Intrinsic::x86_avx512_pmins_q: { unsigned Opcode; switch (IntNo) { default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. 
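The intrinsic hunks that follow all apply one pattern: each new AVX-512 min/max intrinsic is appended to an existing case list, so that many width-specific entry points funnel into a single generic opcode (X86ISD::UMAX and friends) emitted as a plain two-operand node. A toy model of that funneling; the string-keyed table below is made up for illustration, the real code switches on IntNo:

    #include <cstdio>
    #include <map>
    #include <string>

    enum class Node { UMAX, UMIN, SMAX, SMIN };

    static Node opcodeFor(const std::string &IntName) {
      // Hypothetical name-keyed table; sse/avx2/avx512 variants of the
      // same operation all map to the one target node.
      static const std::map<std::string, Node> T = {
          {"pmaxu_d", Node::UMAX}, {"pminu_d", Node::UMIN},
          {"pmaxs_d", Node::SMAX}, {"pmins_d", Node::SMIN}};
      return T.at(IntName);
    }

    int main() {
      std::printf("%d\n", (int)opcodeFor("pmaxu_d")); // prints 0 (UMAX)
    }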
@@ -10946,6 +11245,8 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) { case Intrinsic::x86_avx2_pmaxu_b: case Intrinsic::x86_avx2_pmaxu_w: case Intrinsic::x86_avx2_pmaxu_d: + case Intrinsic::x86_avx512_pmaxu_d: + case Intrinsic::x86_avx512_pmaxu_q: Opcode = X86ISD::UMAX; break; case Intrinsic::x86_sse2_pminu_b: @@ -10954,6 +11255,8 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) { case Intrinsic::x86_avx2_pminu_b: case Intrinsic::x86_avx2_pminu_w: case Intrinsic::x86_avx2_pminu_d: + case Intrinsic::x86_avx512_pminu_d: + case Intrinsic::x86_avx512_pminu_q: Opcode = X86ISD::UMIN; break; case Intrinsic::x86_sse41_pmaxsb: @@ -10962,6 +11265,8 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) { case Intrinsic::x86_avx2_pmaxs_b: case Intrinsic::x86_avx2_pmaxs_w: case Intrinsic::x86_avx2_pmaxs_d: + case Intrinsic::x86_avx512_pmaxs_d: + case Intrinsic::x86_avx512_pmaxs_q: Opcode = X86ISD::SMAX; break; case Intrinsic::x86_sse41_pminsb: @@ -10970,6 +11275,8 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) { case Intrinsic::x86_avx2_pmins_b: case Intrinsic::x86_avx2_pmins_w: case Intrinsic::x86_avx2_pmins_d: + case Intrinsic::x86_avx512_pmins_d: + case Intrinsic::x86_avx512_pmins_q: Opcode = X86ISD::SMIN; break; } @@ -10982,10 +11289,14 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) { case Intrinsic::x86_sse2_max_pd: case Intrinsic::x86_avx_max_ps_256: case Intrinsic::x86_avx_max_pd_256: + case Intrinsic::x86_avx512_max_ps_512: + case Intrinsic::x86_avx512_max_pd_512: case Intrinsic::x86_sse_min_ps: case Intrinsic::x86_sse2_min_pd: case Intrinsic::x86_avx_min_ps_256: - case Intrinsic::x86_avx_min_pd_256: { + case Intrinsic::x86_avx_min_pd_256: + case Intrinsic::x86_avx512_min_ps_512: + case Intrinsic::x86_avx512_min_pd_512: { unsigned Opcode; switch (IntNo) { default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. @@ -10993,12 +11304,16 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) { case Intrinsic::x86_sse2_max_pd: case Intrinsic::x86_avx_max_ps_256: case Intrinsic::x86_avx_max_pd_256: + case Intrinsic::x86_avx512_max_ps_512: + case Intrinsic::x86_avx512_max_pd_512: Opcode = X86ISD::FMAX; break; case Intrinsic::x86_sse_min_ps: case Intrinsic::x86_sse2_min_pd: case Intrinsic::x86_avx_min_ps_256: case Intrinsic::x86_avx_min_pd_256: + case Intrinsic::x86_avx512_min_ps_512: + case Intrinsic::x86_avx512_min_pd_512: Opcode = X86ISD::FMIN; break; } @@ -11069,7 +11384,7 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) { case Intrinsic::x86_avx2_permd: case Intrinsic::x86_avx2_permps: // Operands intentionally swapped. Mask is last operand to intrinsic, - // but second operand for node/intruction. + // but second operand for node/instruction. return DAG.getNode(X86ISD::VPERMV, dl, Op.getValueType(), Op.getOperand(2), Op.getOperand(1)); @@ -11144,6 +11459,16 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) { SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, CC, Test); return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); } + case Intrinsic::x86_avx512_kortestz: + case Intrinsic::x86_avx512_kortestc: { + unsigned X86CC = (IntNo == Intrinsic::x86_avx512_kortestz)? 
X86::COND_E: X86::COND_B; + SDValue LHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i1, Op.getOperand(1)); + SDValue RHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i1, Op.getOperand(2)); + SDValue CC = DAG.getConstant(X86CC, MVT::i8); + SDValue Test = DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS); + SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, CC, Test); + return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); + } // SSE/AVX shift intrinsics case Intrinsic::x86_sse2_psll_w: @@ -11338,7 +11663,19 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) { case Intrinsic::x86_fma_vfmaddsub_ps_256: case Intrinsic::x86_fma_vfmaddsub_pd_256: case Intrinsic::x86_fma_vfmsubadd_ps_256: - case Intrinsic::x86_fma_vfmsubadd_pd_256: { + case Intrinsic::x86_fma_vfmsubadd_pd_256: + case Intrinsic::x86_fma_vfmadd_ps_512: + case Intrinsic::x86_fma_vfmadd_pd_512: + case Intrinsic::x86_fma_vfmsub_ps_512: + case Intrinsic::x86_fma_vfmsub_pd_512: + case Intrinsic::x86_fma_vfnmadd_ps_512: + case Intrinsic::x86_fma_vfnmadd_pd_512: + case Intrinsic::x86_fma_vfnmsub_ps_512: + case Intrinsic::x86_fma_vfnmsub_pd_512: + case Intrinsic::x86_fma_vfmaddsub_ps_512: + case Intrinsic::x86_fma_vfmaddsub_pd_512: + case Intrinsic::x86_fma_vfmsubadd_ps_512: + case Intrinsic::x86_fma_vfmsubadd_pd_512: { unsigned Opc; switch (IntNo) { default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. @@ -11346,36 +11683,48 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) { case Intrinsic::x86_fma_vfmadd_pd: case Intrinsic::x86_fma_vfmadd_ps_256: case Intrinsic::x86_fma_vfmadd_pd_256: + case Intrinsic::x86_fma_vfmadd_ps_512: + case Intrinsic::x86_fma_vfmadd_pd_512: Opc = X86ISD::FMADD; break; case Intrinsic::x86_fma_vfmsub_ps: case Intrinsic::x86_fma_vfmsub_pd: case Intrinsic::x86_fma_vfmsub_ps_256: case Intrinsic::x86_fma_vfmsub_pd_256: + case Intrinsic::x86_fma_vfmsub_ps_512: + case Intrinsic::x86_fma_vfmsub_pd_512: Opc = X86ISD::FMSUB; break; case Intrinsic::x86_fma_vfnmadd_ps: case Intrinsic::x86_fma_vfnmadd_pd: case Intrinsic::x86_fma_vfnmadd_ps_256: case Intrinsic::x86_fma_vfnmadd_pd_256: + case Intrinsic::x86_fma_vfnmadd_ps_512: + case Intrinsic::x86_fma_vfnmadd_pd_512: Opc = X86ISD::FNMADD; break; case Intrinsic::x86_fma_vfnmsub_ps: case Intrinsic::x86_fma_vfnmsub_pd: case Intrinsic::x86_fma_vfnmsub_ps_256: case Intrinsic::x86_fma_vfnmsub_pd_256: + case Intrinsic::x86_fma_vfnmsub_ps_512: + case Intrinsic::x86_fma_vfnmsub_pd_512: Opc = X86ISD::FNMSUB; break; case Intrinsic::x86_fma_vfmaddsub_ps: case Intrinsic::x86_fma_vfmaddsub_pd: case Intrinsic::x86_fma_vfmaddsub_ps_256: case Intrinsic::x86_fma_vfmaddsub_pd_256: + case Intrinsic::x86_fma_vfmaddsub_ps_512: + case Intrinsic::x86_fma_vfmaddsub_pd_512: Opc = X86ISD::FMADDSUB; break; case Intrinsic::x86_fma_vfmsubadd_ps: case Intrinsic::x86_fma_vfmsubadd_pd: case Intrinsic::x86_fma_vfmsubadd_ps_256: case Intrinsic::x86_fma_vfmsubadd_pd_256: + case Intrinsic::x86_fma_vfmsubadd_ps_512: + case Intrinsic::x86_fma_vfmsubadd_pd_512: Opc = X86ISD::FMSUBADD; break; } @@ -11386,7 +11735,87 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) { } } -static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) { +static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, + SDValue Base, SDValue Index, + SDValue ScaleOp, SDValue Chain, + const X86Subtarget * Subtarget) { + SDLoc dl(Op); + ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp); + assert(C && "Invalid scale type"); + 
SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), MVT::i8); + SDValue Src = getZeroVector(Op.getValueType(), Subtarget, DAG, dl); + EVT MaskVT = MVT::getVectorVT(MVT::i1, + Index.getValueType().getVectorNumElements()); + SDValue MaskInReg = DAG.getConstant(~0, MaskVT); + SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other); + SDValue Disp = DAG.getTargetConstant(0, MVT::i32); + SDValue Segment = DAG.getRegister(0, MVT::i32); + SDValue Ops[] = {Src, MaskInReg, Base, Scale, Index, Disp, Segment, Chain}; + SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops); + SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) }; + return DAG.getMergeValues(RetOps, array_lengthof(RetOps), dl); +} + +static SDValue getMGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, + SDValue Src, SDValue Mask, SDValue Base, + SDValue Index, SDValue ScaleOp, SDValue Chain, + const X86Subtarget * Subtarget) { + SDLoc dl(Op); + ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp); + assert(C && "Invalid scale type"); + SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), MVT::i8); + EVT MaskVT = MVT::getVectorVT(MVT::i1, + Index.getValueType().getVectorNumElements()); + SDValue MaskInReg = DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask); + SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other); + SDValue Disp = DAG.getTargetConstant(0, MVT::i32); + SDValue Segment = DAG.getRegister(0, MVT::i32); + if (Src.getOpcode() == ISD::UNDEF) + Src = getZeroVector(Op.getValueType(), Subtarget, DAG, dl); + SDValue Ops[] = {Src, MaskInReg, Base, Scale, Index, Disp, Segment, Chain}; + SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops); + SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) }; + return DAG.getMergeValues(RetOps, array_lengthof(RetOps), dl); +} + +static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, + SDValue Src, SDValue Base, SDValue Index, + SDValue ScaleOp, SDValue Chain) { + SDLoc dl(Op); + ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp); + assert(C && "Invalid scale type"); + SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), MVT::i8); + SDValue Disp = DAG.getTargetConstant(0, MVT::i32); + SDValue Segment = DAG.getRegister(0, MVT::i32); + EVT MaskVT = MVT::getVectorVT(MVT::i1, + Index.getValueType().getVectorNumElements()); + SDValue MaskInReg = DAG.getConstant(~0, MaskVT); + SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other); + SDValue Ops[] = {Base, Scale, Index, Disp, Segment, MaskInReg, Src, Chain}; + SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops); + return SDValue(Res, 1); +} + +static SDValue getMScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, + SDValue Src, SDValue Mask, SDValue Base, + SDValue Index, SDValue ScaleOp, SDValue Chain) { + SDLoc dl(Op); + ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp); + assert(C && "Invalid scale type"); + SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), MVT::i8); + SDValue Disp = DAG.getTargetConstant(0, MVT::i32); + SDValue Segment = DAG.getRegister(0, MVT::i32); + EVT MaskVT = MVT::getVectorVT(MVT::i1, + Index.getValueType().getVectorNumElements()); + SDValue MaskInReg = DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask); + SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other); + SDValue Ops[] = {Base, Scale, Index, Disp, Segment, MaskInReg, Src, Chain}; + SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops); + return SDValue(Res, 1); +} + +static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { SDLoc dl(Op); unsigned IntNo = 
cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); switch (IntNo) { @@ -11421,7 +11850,144 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) { return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid, SDValue(Result.getNode(), 2)); } - + //int_gather(index, base, scale); + case Intrinsic::x86_avx512_gather_qpd_512: + case Intrinsic::x86_avx512_gather_qps_512: + case Intrinsic::x86_avx512_gather_dpd_512: + case Intrinsic::x86_avx512_gather_qpi_512: + case Intrinsic::x86_avx512_gather_qpq_512: + case Intrinsic::x86_avx512_gather_dpq_512: + case Intrinsic::x86_avx512_gather_dps_512: + case Intrinsic::x86_avx512_gather_dpi_512: { + unsigned Opc; + switch (IntNo) { + default: llvm_unreachable("Unexpected intrinsic!"); + case Intrinsic::x86_avx512_gather_qps_512: Opc = X86::VGATHERQPSZrm; break; + case Intrinsic::x86_avx512_gather_qpd_512: Opc = X86::VGATHERQPDZrm; break; + case Intrinsic::x86_avx512_gather_dpd_512: Opc = X86::VGATHERDPDZrm; break; + case Intrinsic::x86_avx512_gather_dps_512: Opc = X86::VGATHERDPSZrm; break; + case Intrinsic::x86_avx512_gather_qpi_512: Opc = X86::VPGATHERQDZrm; break; + case Intrinsic::x86_avx512_gather_qpq_512: Opc = X86::VPGATHERQQZrm; break; + case Intrinsic::x86_avx512_gather_dpi_512: Opc = X86::VPGATHERDDZrm; break; + case Intrinsic::x86_avx512_gather_dpq_512: Opc = X86::VPGATHERDQZrm; break; + } + SDValue Chain = Op.getOperand(0); + SDValue Index = Op.getOperand(2); + SDValue Base = Op.getOperand(3); + SDValue Scale = Op.getOperand(4); + return getGatherNode(Opc, Op, DAG, Base, Index, Scale, Chain, Subtarget); + } + //int_gather_mask(v1, mask, index, base, scale); + case Intrinsic::x86_avx512_gather_qps_mask_512: + case Intrinsic::x86_avx512_gather_qpd_mask_512: + case Intrinsic::x86_avx512_gather_dpd_mask_512: + case Intrinsic::x86_avx512_gather_dps_mask_512: + case Intrinsic::x86_avx512_gather_qpi_mask_512: + case Intrinsic::x86_avx512_gather_qpq_mask_512: + case Intrinsic::x86_avx512_gather_dpi_mask_512: + case Intrinsic::x86_avx512_gather_dpq_mask_512: { + unsigned Opc; + switch (IntNo) { + default: llvm_unreachable("Unexpected intrinsic!"); + case Intrinsic::x86_avx512_gather_qps_mask_512: + Opc = X86::VGATHERQPSZrm; break; + case Intrinsic::x86_avx512_gather_qpd_mask_512: + Opc = X86::VGATHERQPDZrm; break; + case Intrinsic::x86_avx512_gather_dpd_mask_512: + Opc = X86::VGATHERDPDZrm; break; + case Intrinsic::x86_avx512_gather_dps_mask_512: + Opc = X86::VGATHERDPSZrm; break; + case Intrinsic::x86_avx512_gather_qpi_mask_512: + Opc = X86::VPGATHERQDZrm; break; + case Intrinsic::x86_avx512_gather_qpq_mask_512: + Opc = X86::VPGATHERQQZrm; break; + case Intrinsic::x86_avx512_gather_dpi_mask_512: + Opc = X86::VPGATHERDDZrm; break; + case Intrinsic::x86_avx512_gather_dpq_mask_512: + Opc = X86::VPGATHERDQZrm; break; + } + SDValue Chain = Op.getOperand(0); + SDValue Src = Op.getOperand(2); + SDValue Mask = Op.getOperand(3); + SDValue Index = Op.getOperand(4); + SDValue Base = Op.getOperand(5); + SDValue Scale = Op.getOperand(6); + return getMGatherNode(Opc, Op, DAG, Src, Mask, Base, Index, Scale, Chain, + Subtarget); + } + //int_scatter(base, index, v1, scale); + case Intrinsic::x86_avx512_scatter_qpd_512: + case Intrinsic::x86_avx512_scatter_qps_512: + case Intrinsic::x86_avx512_scatter_dpd_512: + case Intrinsic::x86_avx512_scatter_qpi_512: + case Intrinsic::x86_avx512_scatter_qpq_512: + case Intrinsic::x86_avx512_scatter_dpq_512: + case Intrinsic::x86_avx512_scatter_dps_512: + case 
Intrinsic::x86_avx512_scatter_dpi_512: { + unsigned Opc; + switch (IntNo) { + default: llvm_unreachable("Unexpected intrinsic!"); + case Intrinsic::x86_avx512_scatter_qpd_512: + Opc = X86::VSCATTERQPDZmr; break; + case Intrinsic::x86_avx512_scatter_qps_512: + Opc = X86::VSCATTERQPSZmr; break; + case Intrinsic::x86_avx512_scatter_dpd_512: + Opc = X86::VSCATTERDPDZmr; break; + case Intrinsic::x86_avx512_scatter_dps_512: + Opc = X86::VSCATTERDPSZmr; break; + case Intrinsic::x86_avx512_scatter_qpi_512: + Opc = X86::VPSCATTERQDZmr; break; + case Intrinsic::x86_avx512_scatter_qpq_512: + Opc = X86::VPSCATTERQQZmr; break; + case Intrinsic::x86_avx512_scatter_dpq_512: + Opc = X86::VPSCATTERDQZmr; break; + case Intrinsic::x86_avx512_scatter_dpi_512: + Opc = X86::VPSCATTERDDZmr; break; + } + SDValue Chain = Op.getOperand(0); + SDValue Base = Op.getOperand(2); + SDValue Index = Op.getOperand(3); + SDValue Src = Op.getOperand(4); + SDValue Scale = Op.getOperand(5); + return getScatterNode(Opc, Op, DAG, Src, Base, Index, Scale, Chain); + } + //int_scatter_mask(base, mask, index, v1, scale); + case Intrinsic::x86_avx512_scatter_qps_mask_512: + case Intrinsic::x86_avx512_scatter_qpd_mask_512: + case Intrinsic::x86_avx512_scatter_dpd_mask_512: + case Intrinsic::x86_avx512_scatter_dps_mask_512: + case Intrinsic::x86_avx512_scatter_qpi_mask_512: + case Intrinsic::x86_avx512_scatter_qpq_mask_512: + case Intrinsic::x86_avx512_scatter_dpi_mask_512: + case Intrinsic::x86_avx512_scatter_dpq_mask_512: { + unsigned Opc; + switch (IntNo) { + default: llvm_unreachable("Unexpected intrinsic!"); + case Intrinsic::x86_avx512_scatter_qpd_mask_512: + Opc = X86::VSCATTERQPDZmr; break; + case Intrinsic::x86_avx512_scatter_qps_mask_512: + Opc = X86::VSCATTERQPSZmr; break; + case Intrinsic::x86_avx512_scatter_dpd_mask_512: + Opc = X86::VSCATTERDPDZmr; break; + case Intrinsic::x86_avx512_scatter_dps_mask_512: + Opc = X86::VSCATTERDPSZmr; break; + case Intrinsic::x86_avx512_scatter_qpi_mask_512: + Opc = X86::VPSCATTERQDZmr; break; + case Intrinsic::x86_avx512_scatter_qpq_mask_512: + Opc = X86::VPSCATTERQQZmr; break; + case Intrinsic::x86_avx512_scatter_dpq_mask_512: + Opc = X86::VPSCATTERDQZmr; break; + case Intrinsic::x86_avx512_scatter_dpi_mask_512: + Opc = X86::VPSCATTERDDZmr; break; + } + SDValue Chain = Op.getOperand(0); + SDValue Base = Op.getOperand(2); + SDValue Mask = Op.getOperand(3); + SDValue Index = Op.getOperand(4); + SDValue Src = Op.getOperand(5); + SDValue Scale = Op.getOperand(6); + return getMScatterNode(Opc, Op, DAG, Src, Mask, Base, Index, Scale, Chain); + } // XTEST intrinsics. 
case Intrinsic::x86_xtest: { SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other); @@ -11913,8 +12479,8 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget, return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask); } - assert((VT == MVT::v2i64 || VT == MVT::v4i64) && - "Only know how to lower V2I64/V4I64 multiply"); + assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) && + "Only know how to lower V2I64/V4I64/V8I64 multiply"); // Ahi = psrlqi(a, 32); // Bhi = psrlqi(b, 32); @@ -11927,13 +12493,12 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget, // AhiBlo = psllqi(AhiBlo, 32); // return AloBlo + AloBhi + AhiBlo; - SDValue ShAmt = DAG.getConstant(32, MVT::i32); - - SDValue Ahi = DAG.getNode(X86ISD::VSRLI, dl, VT, A, ShAmt); - SDValue Bhi = DAG.getNode(X86ISD::VSRLI, dl, VT, B, ShAmt); + SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG); + SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG); // Bit cast to 32-bit vectors for MULUDQ - EVT MulVT = (VT == MVT::v2i64) ? MVT::v4i32 : MVT::v8i32; + EVT MulVT = (VT == MVT::v2i64) ? MVT::v4i32 : + (VT == MVT::v4i64) ? MVT::v8i32 : MVT::v16i32; A = DAG.getNode(ISD::BITCAST, dl, MulVT, A); B = DAG.getNode(ISD::BITCAST, dl, MulVT, B); Ahi = DAG.getNode(ISD::BITCAST, dl, MulVT, Ahi); @@ -11943,14 +12508,14 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget, SDValue AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi); SDValue AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B); - AloBhi = DAG.getNode(X86ISD::VSHLI, dl, VT, AloBhi, ShAmt); - AhiBlo = DAG.getNode(X86ISD::VSHLI, dl, VT, AhiBlo, ShAmt); + AloBhi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, AloBhi, 32, DAG); + AhiBlo = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, AhiBlo, 32, DAG); SDValue Res = DAG.getNode(ISD::ADD, dl, VT, AloBlo, AloBhi); return DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo); } -SDValue X86TargetLowering::LowerSDIV(SDValue Op, SelectionDAG &DAG) const { +static SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG) { EVT VT = Op.getValueType(); EVT EltTy = VT.getVectorElementType(); unsigned NumElts = VT.getVectorNumElements(); @@ -11972,16 +12537,26 @@ SDValue X86TargetLowering::LowerSDIV(SDValue Op, SelectionDAG &DAG) const { if ((SplatValue != 0) && (SplatValue.isPowerOf2() || (-SplatValue).isPowerOf2())) { - unsigned lg2 = SplatValue.countTrailingZeros(); + unsigned Lg2 = SplatValue.countTrailingZeros(); // Splat the sign bit. - SDValue Sz = DAG.getConstant(EltTy.getSizeInBits()-1, MVT::i32); - SDValue SGN = getTargetVShiftNode(X86ISD::VSRAI, dl, VT, N0, Sz, DAG); + SmallVector<SDValue, 16> Sz(NumElts, + DAG.getConstant(EltTy.getSizeInBits() - 1, + EltTy)); + SDValue SGN = DAG.getNode(ISD::SRA, dl, VT, N0, + DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &Sz[0], + NumElts)); // Add (N0 < 0) ? 
abs2 - 1 : 0; - SDValue Amt = DAG.getConstant(EltTy.getSizeInBits() - lg2, MVT::i32); - SDValue SRL = getTargetVShiftNode(X86ISD::VSRLI, dl, VT, SGN, Amt, DAG); + SmallVector<SDValue, 16> Amt(NumElts, + DAG.getConstant(EltTy.getSizeInBits() - Lg2, + EltTy)); + SDValue SRL = DAG.getNode(ISD::SRL, dl, VT, SGN, + DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &Amt[0], + NumElts)); SDValue ADD = DAG.getNode(ISD::ADD, dl, VT, N0, SRL); - SDValue Lg2Amt = DAG.getConstant(lg2, MVT::i32); - SDValue SRA = getTargetVShiftNode(X86ISD::VSRAI, dl, VT, ADD, Lg2Amt, DAG); + SmallVector<SDValue, 16> Lg2Amt(NumElts, DAG.getConstant(Lg2, EltTy)); + SDValue SRA = DAG.getNode(ISD::SRA, dl, VT, ADD, + DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &Lg2Amt[0], + NumElts)); // If we're dividing by a positive value, we're done. Otherwise, we must // negate the result. @@ -12010,23 +12585,26 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG, if (VT == MVT::v2i64 || VT == MVT::v4i32 || VT == MVT::v8i16 || (Subtarget->hasInt256() && - (VT == MVT::v4i64 || VT == MVT::v8i32 || VT == MVT::v16i16))) { + (VT == MVT::v4i64 || VT == MVT::v8i32 || VT == MVT::v16i16)) || + (Subtarget->hasAVX512() && + (VT == MVT::v8i64 || VT == MVT::v16i32))) { if (Op.getOpcode() == ISD::SHL) - return DAG.getNode(X86ISD::VSHLI, dl, VT, R, - DAG.getConstant(ShiftAmt, MVT::i32)); + return getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, R, ShiftAmt, + DAG); if (Op.getOpcode() == ISD::SRL) - return DAG.getNode(X86ISD::VSRLI, dl, VT, R, - DAG.getConstant(ShiftAmt, MVT::i32)); + return getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt, + DAG); if (Op.getOpcode() == ISD::SRA && VT != MVT::v2i64 && VT != MVT::v4i64) - return DAG.getNode(X86ISD::VSRAI, dl, VT, R, - DAG.getConstant(ShiftAmt, MVT::i32)); + return getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, R, ShiftAmt, + DAG); } if (VT == MVT::v16i8) { if (Op.getOpcode() == ISD::SHL) { // Make a large shift. - SDValue SHL = DAG.getNode(X86ISD::VSHLI, dl, MVT::v8i16, R, - DAG.getConstant(ShiftAmt, MVT::i32)); + SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, + MVT::v8i16, R, ShiftAmt, + DAG); SHL = DAG.getNode(ISD::BITCAST, dl, VT, SHL); // Zero out the rightmost bits. SmallVector<SDValue, 16> V(16, @@ -12037,8 +12615,9 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG, } if (Op.getOpcode() == ISD::SRL) { // Make a large shift. - SDValue SRL = DAG.getNode(X86ISD::VSRLI, dl, MVT::v8i16, R, - DAG.getConstant(ShiftAmt, MVT::i32)); + SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, + MVT::v8i16, R, ShiftAmt, + DAG); SRL = DAG.getNode(ISD::BITCAST, dl, VT, SRL); // Zero out the leftmost bits. SmallVector<SDValue, 16> V(16, @@ -12069,8 +12648,9 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG, if (Subtarget->hasInt256() && VT == MVT::v32i8) { if (Op.getOpcode() == ISD::SHL) { // Make a large shift. - SDValue SHL = DAG.getNode(X86ISD::VSHLI, dl, MVT::v16i16, R, - DAG.getConstant(ShiftAmt, MVT::i32)); + SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, + MVT::v16i16, R, ShiftAmt, + DAG); SHL = DAG.getNode(ISD::BITCAST, dl, VT, SHL); // Zero out the rightmost bits. SmallVector<SDValue, 32> V(32, @@ -12081,8 +12661,9 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG, } if (Op.getOpcode() == ISD::SRL) { // Make a large shift. 
@@ -12010,23 +12585,26 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,

     if (VT == MVT::v2i64 || VT == MVT::v4i32 || VT == MVT::v8i16 ||
         (Subtarget->hasInt256() &&
-         (VT == MVT::v4i64 || VT == MVT::v8i32 || VT == MVT::v16i16))) {
+         (VT == MVT::v4i64 || VT == MVT::v8i32 || VT == MVT::v16i16)) ||
+        (Subtarget->hasAVX512() &&
+         (VT == MVT::v8i64 || VT == MVT::v16i32))) {
       if (Op.getOpcode() == ISD::SHL)
-        return DAG.getNode(X86ISD::VSHLI, dl, VT, R,
-                           DAG.getConstant(ShiftAmt, MVT::i32));
+        return getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, R, ShiftAmt,
+                                          DAG);
       if (Op.getOpcode() == ISD::SRL)
-        return DAG.getNode(X86ISD::VSRLI, dl, VT, R,
-                           DAG.getConstant(ShiftAmt, MVT::i32));
+        return getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt,
+                                          DAG);
       if (Op.getOpcode() == ISD::SRA && VT != MVT::v2i64 && VT != MVT::v4i64)
-        return DAG.getNode(X86ISD::VSRAI, dl, VT, R,
-                           DAG.getConstant(ShiftAmt, MVT::i32));
+        return getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, R, ShiftAmt,
+                                          DAG);
     }

     if (VT == MVT::v16i8) {
       if (Op.getOpcode() == ISD::SHL) {
         // Make a large shift.
-        SDValue SHL = DAG.getNode(X86ISD::VSHLI, dl, MVT::v8i16, R,
-                                  DAG.getConstant(ShiftAmt, MVT::i32));
+        SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl,
+                                                 MVT::v8i16, R, ShiftAmt,
+                                                 DAG);
         SHL = DAG.getNode(ISD::BITCAST, dl, VT, SHL);
         // Zero out the rightmost bits.
         SmallVector<SDValue, 16> V(16,
@@ -12037,8 +12615,9 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
       }
       if (Op.getOpcode() == ISD::SRL) {
         // Make a large shift.
-        SDValue SRL = DAG.getNode(X86ISD::VSRLI, dl, MVT::v8i16, R,
-                                  DAG.getConstant(ShiftAmt, MVT::i32));
+        SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl,
+                                                 MVT::v8i16, R, ShiftAmt,
+                                                 DAG);
         SRL = DAG.getNode(ISD::BITCAST, dl, VT, SRL);
         // Zero out the leftmost bits.
         SmallVector<SDValue, 16> V(16,
@@ -12069,8 +12648,9 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,

     if (Subtarget->hasInt256() && VT == MVT::v32i8) {
       if (Op.getOpcode() == ISD::SHL) {
         // Make a large shift.
-        SDValue SHL = DAG.getNode(X86ISD::VSHLI, dl, MVT::v16i16, R,
-                                  DAG.getConstant(ShiftAmt, MVT::i32));
+        SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl,
+                                                 MVT::v16i16, R, ShiftAmt,
+                                                 DAG);
         SHL = DAG.getNode(ISD::BITCAST, dl, VT, SHL);
         // Zero out the rightmost bits.
         SmallVector<SDValue, 32> V(32,
@@ -12081,8 +12661,9 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
       }
       if (Op.getOpcode() == ISD::SRL) {
         // Make a large shift.
-        SDValue SRL = DAG.getNode(X86ISD::VSRLI, dl, MVT::v16i16, R,
-                                  DAG.getConstant(ShiftAmt, MVT::i32));
+        SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl,
+                                                 MVT::v16i16, R, ShiftAmt,
+                                                 DAG);
         SRL = DAG.getNode(ISD::BITCAST, dl, VT, SRL);
         // Zero out the leftmost bits.
         SmallVector<SDValue, 32> V(32,
@@ -12147,14 +12728,14 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
     default:
       llvm_unreachable("Unknown shift opcode!");
     case ISD::SHL:
-      return DAG.getNode(X86ISD::VSHLI, dl, VT, R,
-                         DAG.getConstant(ShiftAmt, MVT::i32));
+      return getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, R, ShiftAmt,
+                                        DAG);
     case ISD::SRL:
-      return DAG.getNode(X86ISD::VSRLI, dl, VT, R,
-                         DAG.getConstant(ShiftAmt, MVT::i32));
+      return getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt,
+                                        DAG);
     case ISD::SRA:
-      return DAG.getNode(X86ISD::VSRAI, dl, VT, R,
-                         DAG.getConstant(ShiftAmt, MVT::i32));
+      return getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, R, ShiftAmt,
+                                        DAG);
     }
   }
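The v16i8 and v32i8 cases above share one trick: x86 has no per-byte shift instructions, so the register is shifted as wider word lanes and the bits that crossed a byte boundary are masked off afterwards. A two-byte scalar sketch (illustrative only, not part of the patch):

    #include <cassert>
    #include <cstdint>

    // Shift both bytes of a word left by ShiftAmt using one word shift.
    static uint16_t shl_bytes_via_word(uint16_t Word, unsigned ShiftAmt) {
      uint16_t SHL = Word << ShiftAmt;              // psllw: one v8i16 lane
      uint8_t Mask = (uint8_t)(0xFFu << ShiftAmt);  // zero the rightmost bits
      return SHL & (uint16_t)((Mask << 8) | Mask);  // per-byte cleanup mask
    }

    int main() {
      uint16_t W = 0xABCD;
      unsigned S = 3;
      uint16_t R = shl_bytes_via_word(W, S);
      assert((uint8_t)R == (uint8_t)((uint8_t)W << S));               // low byte
      assert((uint8_t)(R >> 8) == (uint8_t)((uint8_t)(W >> 8) << S)); // high byte
    }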
@@ -12172,7 +12753,8 @@ static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
       VT == MVT::v4i32 || VT == MVT::v8i16 ||
       (Subtarget->hasInt256() &&
        ((VT == MVT::v4i64 && Op.getOpcode() != ISD::SRA) ||
-        VT == MVT::v8i32 || VT == MVT::v16i16))) {
+        VT == MVT::v8i32 || VT == MVT::v16i16)) ||
+      (Subtarget->hasAVX512() && (VT == MVT::v8i64 || VT == MVT::v16i32))) {
     SDValue BaseShAmt;
     EVT EltVT = VT.getVectorElementType();
@@ -12240,6 +12822,8 @@ static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
       case MVT::v4i64:
       case MVT::v8i32:
       case MVT::v16i16:
+      case MVT::v16i32:
+      case MVT::v8i64:
         return getTargetVShiftNode(X86ISD::VSHLI, dl, VT, R, BaseShAmt, DAG);
       }
     case ISD::SRA:
@@ -12249,6 +12833,8 @@ static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
       case MVT::v8i16:
       case MVT::v8i32:
       case MVT::v16i16:
+      case MVT::v16i32:
+      case MVT::v8i64:
         return getTargetVShiftNode(X86ISD::VSRAI, dl, VT, R, BaseShAmt, DAG);
       }
     case ISD::SRL:
@@ -12260,6 +12846,8 @@ static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
       case MVT::v4i64:
       case MVT::v8i32:
       case MVT::v16i16:
+      case MVT::v16i32:
+      case MVT::v8i64:
         return getTargetVShiftNode(X86ISD::VSRLI, dl, VT, R, BaseShAmt, DAG);
       }
   }
@@ -12268,7 +12856,8 @@ static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
   // Special case in 32-bit mode, where i64 is expanded into high and low parts.
   if (!Subtarget->is64Bit() &&
-      (VT == MVT::v2i64 || (Subtarget->hasInt256() && VT == MVT::v4i64)) &&
+      (VT == MVT::v2i64 || (Subtarget->hasInt256() && VT == MVT::v4i64) ||
+       (Subtarget->hasAVX512() && VT == MVT::v8i64)) &&
       Amt.getOpcode() == ISD::BITCAST &&
       Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
     Amt = Amt.getOperand(0);
@@ -12297,7 +12886,8 @@ static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
   return SDValue();
 }

-SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const {
+static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget,
+                          SelectionDAG &DAG) {
   EVT VT = Op.getValueType();
   SDLoc dl(Op);
@@ -12316,6 +12906,8 @@ SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const {
   if (V.getNode())
     return V;

+  if (Subtarget->hasAVX512() && (VT == MVT::v16i32 || VT == MVT::v8i64))
+    return Op;
+
   // AVX2 has VPSLLV/VPSRAV/VPSRLV.
   if (Subtarget->hasInt256()) {
     if (Op.getOpcode() == ISD::SRL &&
@@ -12356,8 +12948,7 @@ SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const {
     // r = VSELECT(r, psllw(r & (char16)15, 4), a);
     SDValue M = DAG.getNode(ISD::AND, dl, VT, R, CM1);
-    M = getTargetVShiftNode(X86ISD::VSHLI, dl, MVT::v8i16, M,
-                            DAG.getConstant(4, MVT::i32), DAG);
+    M = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, MVT::v8i16, M, 4, DAG);
     M = DAG.getNode(ISD::BITCAST, dl, VT, M);
     R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel, M, R);
@@ -12368,8 +12959,7 @@ SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const {
     // r = VSELECT(r, psllw(r & (char16)63, 2), a);
     M = DAG.getNode(ISD::AND, dl, VT, R, CM2);
-    M = getTargetVShiftNode(X86ISD::VSHLI, dl, MVT::v8i16, M,
-                            DAG.getConstant(2, MVT::i32), DAG);
+    M = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, MVT::v8i16, M, 2, DAG);
     M = DAG.getNode(ISD::BITCAST, dl, VT, M);
     R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel, M, R);
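The VSELECT chain above implements a fully variable v16i8 shift by peeling the 3-bit amount one bit at a time: the amount is moved into each lane's sign bit and the blend conditionally applies a shift of 4, then 2, then 1. A single-lane scalar sketch (illustrative only):

    #include <cassert>
    #include <cstdint>

    static uint8_t shl_byte_stepwise(uint8_t R, uint8_t Amt) {
      uint8_t A = Amt << 5;               // amount bit 2 into the sign bit
      if (A & 0x80) R = (R << 4) & 0xF0;  // psllw by 4 + mask, via VSELECT
      A <<= 1;
      if (A & 0x80) R = (R << 2) & 0xFC;  // psllw by 2 + mask, via VSELECT
      A <<= 1;
      if (A & 0x80) R = R + R;            // shift by 1 is an add (paddb)
      return R;
    }

    int main() {
      for (unsigned Amt = 0; Amt < 8; ++Amt)
        assert(shl_byte_stepwise(0xB7, Amt) == (uint8_t)(0xB7 << Amt));
    }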
@@ -12512,7 +13102,6 @@ SDValue X86TargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,

   unsigned BitsDiff = VT.getScalarType().getSizeInBits() -
                       ExtraVT.getScalarType().getSizeInBits();
-  SDValue ShAmt = DAG.getConstant(BitsDiff, MVT::i32);

   switch (VT.getSimpleVT().SimpleTy) {
     default: return SDValue();
@@ -12546,24 +13135,34 @@ SDValue X86TargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
       // fall through
     case MVT::v4i32:
     case MVT::v8i16: {
-      // (sext (vzext x)) -> (vsext x)
       SDValue Op0 = Op.getOperand(0);
       SDValue Op00 = Op0.getOperand(0);
       SDValue Tmp1;
       // Hopefully, this VECTOR_SHUFFLE is just a VZEXT.
       if (Op0.getOpcode() == ISD::BITCAST &&
-          Op00.getOpcode() == ISD::VECTOR_SHUFFLE)
-        Tmp1 = LowerVectorIntExtend(Op00, DAG);
-      if (Tmp1.getNode()) {
-        SDValue Tmp1Op0 = Tmp1.getOperand(0);
-        assert(Tmp1Op0.getOpcode() == X86ISD::VZEXT &&
-               "This optimization is invalid without a VZEXT.");
-        return DAG.getNode(X86ISD::VSEXT, dl, VT, Tmp1Op0.getOperand(0));
+          Op00.getOpcode() == ISD::VECTOR_SHUFFLE) {
+        // (sext (vzext x)) -> (vsext x)
+        Tmp1 = LowerVectorIntExtend(Op00, Subtarget, DAG);
+        if (Tmp1.getNode()) {
+          EVT ExtraEltVT = ExtraVT.getVectorElementType();
+          // This folding is only valid when the in-reg type is a vector of i8,
+          // i16, or i32.
+          if (ExtraEltVT == MVT::i8 || ExtraEltVT == MVT::i16 ||
+              ExtraEltVT == MVT::i32) {
+            SDValue Tmp1Op0 = Tmp1.getOperand(0);
+            assert(Tmp1Op0.getOpcode() == X86ISD::VZEXT &&
+                   "This optimization is invalid without a VZEXT.");
+            return DAG.getNode(X86ISD::VSEXT, dl, VT, Tmp1Op0.getOperand(0));
+          }
+          Op0 = Tmp1;
+        }
       }

       // If the above didn't work, then just use Shift-Left + Shift-Right.
-      Tmp1 = getTargetVShiftNode(X86ISD::VSHLI, dl, VT, Op0, ShAmt, DAG);
-      return getTargetVShiftNode(X86ISD::VSRAI, dl, VT, Tmp1, ShAmt, DAG);
+      Tmp1 = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Op0, BitsDiff,
+                                        DAG);
+      return getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, Tmp1, BitsDiff,
+                                        DAG);
     }
   }
 }
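The Shift-Left + Shift-Right fallback is the usual in-register sign extension: push the significant bits to the top with VSHLI, then drag the sign bit back down with an arithmetic VSRAI. A scalar sketch (illustrative only; assumes two's complement and an arithmetic right shift for signed values):

    #include <cassert>
    #include <cstdint>

    // Sign-extend the low FromBits bits of X within a 32-bit lane.
    static int32_t sext_inreg(int32_t X, unsigned FromBits) {
      unsigned BitsDiff = 32 - FromBits;
      return (int32_t)((uint32_t)X << BitsDiff) >> BitsDiff;  // VSHLI, VSRAI
    }

    int main() {
      assert(sext_inreg(0x000000FF, 8) == -1);      // low byte 0xFF -> -1
      assert(sext_inreg(0x0000007F, 8) == 127);
      assert(sext_inreg(0xFFFF1234, 16) == 0x1234); // high bits discarded
    }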
@@ -12655,9 +13254,10 @@ static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget *Subtarget,
   return DAG.getMergeValues(Ops, array_lengthof(Ops), dl);
 }

-SDValue X86TargetLowering::LowerBITCAST(SDValue Op, SelectionDAG &DAG) const {
-  EVT SrcVT = Op.getOperand(0).getValueType();
-  EVT DstVT = Op.getValueType();
+static SDValue LowerBITCAST(SDValue Op, const X86Subtarget *Subtarget,
+                            SelectionDAG &DAG) {
+  MVT SrcVT = Op.getOperand(0).getSimpleValueType();
+  MVT DstVT = Op.getSimpleValueType();
   assert(Subtarget->is64Bit() && !Subtarget->hasSSE2() &&
          Subtarget->hasMMX() && "Unexpected custom BITCAST");
   assert((DstVT == MVT::i64 ||
@@ -12742,7 +13342,8 @@ static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
                      Op.getOperand(1), Op.getOperand(2));
 }

-SDValue X86TargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const {
+static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget *Subtarget,
+                            SelectionDAG &DAG) {
   assert(Subtarget->isTargetDarwin() && Subtarget->is64Bit());

   // For MacOSX, we want to call an alternative entry point: __sincos_stret,
@@ -12753,8 +13354,8 @@ SDValue X86TargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const {
   EVT ArgVT = Arg.getValueType();
   Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());

-  ArgListTy Args;
-  ArgListEntry Entry;
+  TargetLowering::ArgListTy Args;
+  TargetLowering::ArgListEntry Entry;

   Entry.Node = Arg;
   Entry.Ty = ArgTy;
@@ -12767,7 +13368,8 @@ SDValue X86TargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const {
   // the small struct {f32, f32} is returned in (eax, edx). For f64,
   // the results are returned via SRet in memory.
   const char *LibcallName = isF64 ? "__sincos_stret" : "__sincosf_stret";
-  SDValue Callee = DAG.getExternalSymbol(LibcallName, getPointerTy());
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  SDValue Callee = DAG.getExternalSymbol(LibcallName, TLI.getPointerTy());

   Type *RetTy = isF64
     ? (Type*)StructType::get(ArgTy, ArgTy, NULL)
@@ -12778,7 +13380,7 @@ SDValue X86TargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const {
     CallingConv::C, /*isTaillCall=*/false,
     /*doesNotRet=*/false, /*isReturnValueUsed*/true,
     Callee, Args, DAG, dl);
-  std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
+  std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);

   if (isF64)
     // Returned in xmm0 and xmm1.
@@ -12822,9 +13424,9 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   case ISD::SINT_TO_FP:         return LowerSINT_TO_FP(Op, DAG);
   case ISD::UINT_TO_FP:         return LowerUINT_TO_FP(Op, DAG);
   case ISD::TRUNCATE:           return LowerTRUNCATE(Op, DAG);
-  case ISD::ZERO_EXTEND:        return LowerZERO_EXTEND(Op, DAG);
-  case ISD::SIGN_EXTEND:        return LowerSIGN_EXTEND(Op, DAG);
-  case ISD::ANY_EXTEND:         return LowerANY_EXTEND(Op, DAG);
+  case ISD::ZERO_EXTEND:        return LowerZERO_EXTEND(Op, Subtarget, DAG);
+  case ISD::SIGN_EXTEND:        return LowerSIGN_EXTEND(Op, Subtarget, DAG);
+  case ISD::ANY_EXTEND:         return LowerANY_EXTEND(Op, Subtarget, DAG);
   case ISD::FP_TO_SINT:         return LowerFP_TO_SINT(Op, DAG);
   case ISD::FP_TO_UINT:         return LowerFP_TO_UINT(Op, DAG);
   case ISD::FP_EXTEND:          return LowerFP_EXTEND(Op, DAG);
@@ -12840,7 +13442,8 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   case ISD::VAARG:              return LowerVAARG(Op, DAG);
   case ISD::VACOPY:             return LowerVACOPY(Op, Subtarget, DAG);
   case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
-  case ISD::INTRINSIC_W_CHAIN:  return LowerINTRINSIC_W_CHAIN(Op, DAG);
+  case ISD::INTRINSIC_VOID:
+  case ISD::INTRINSIC_W_CHAIN:  return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG);
   case ISD::RETURNADDR:         return LowerRETURNADDR(Op, DAG);
   case ISD::FRAMEADDR:          return LowerFRAMEADDR(Op, DAG);
   case ISD::FRAME_TO_ARGS_OFFSET:
@@ -12858,7 +13461,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   case ISD::MUL:                return LowerMUL(Op, Subtarget, DAG);
   case ISD::SRA:
   case ISD::SRL:
-  case ISD::SHL:                return LowerShift(Op, DAG);
+  case ISD::SHL:                return LowerShift(Op, Subtarget, DAG);
   case ISD::SADDO:
   case ISD::UADDO:
   case ISD::SSUBO:
@@ -12866,7 +13469,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   case ISD::SMULO:
   case ISD::UMULO:              return LowerXALUO(Op, DAG);
   case ISD::READCYCLECOUNTER:   return LowerREADCYCLECOUNTER(Op, Subtarget,DAG);
-  case ISD::BITCAST:            return LowerBITCAST(Op, DAG);
+  case ISD::BITCAST:            return LowerBITCAST(Op, Subtarget, DAG);
   case ISD::ADDC:
   case ISD::ADDE:
   case ISD::SUBC:
@@ -12874,7 +13477,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   case ISD::ADD:                return LowerADD(Op, DAG);
   case ISD::SUB:                return LowerSUB(Op, DAG);
   case ISD::SDIV:               return LowerSDIV(Op, DAG);
-  case ISD::FSINCOS:            return LowerFSINCOS(Op, DAG);
+  case ISD::FSINCOS:            return LowerFSINCOS(Op, Subtarget, DAG);
   }
 }
@@ -13128,6 +13731,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
   case X86ISD::CMP:                return "X86ISD::CMP";
   case X86ISD::COMI:               return "X86ISD::COMI";
   case X86ISD::UCOMI:              return "X86ISD::UCOMI";
+  case X86ISD::CMPM:               return "X86ISD::CMPM";
+  case X86ISD::CMPMU:              return "X86ISD::CMPMU";
   case X86ISD::SETCC:              return "X86ISD::SETCC";
   case X86ISD::SETCC_CARRY:        return "X86ISD::SETCC_CARRY";
   case X86ISD::FSETCCsd:           return "X86ISD::FSETCCsd";
@@ -13187,6 +13792,9 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
   case X86ISD::VZEXT_LOAD:         return "X86ISD::VZEXT_LOAD";
   case X86ISD::VZEXT:              return "X86ISD::VZEXT";
   case X86ISD::VSEXT:              return "X86ISD::VSEXT";
+  case X86ISD::VTRUNC:             return "X86ISD::VTRUNC";
+  case X86ISD::VTRUNCM:            return "X86ISD::VTRUNCM";
+  case X86ISD::VINSERT:            return "X86ISD::VINSERT";
   case X86ISD::VFPEXT:             return "X86ISD::VFPEXT";
   case X86ISD::VFPROUND:           return "X86ISD::VFPROUND";
   case X86ISD::VSHLDQ:             return "X86ISD::VSHLDQ";
@@ -13200,6 +13808,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
   case X86ISD::CMPP:               return "X86ISD::CMPP";
   case X86ISD::PCMPEQ:             return "X86ISD::PCMPEQ";
   case X86ISD::PCMPGT:             return "X86ISD::PCMPGT";
+  case X86ISD::PCMPEQM:            return "X86ISD::PCMPEQM";
+  case X86ISD::PCMPGTM:            return "X86ISD::PCMPGTM";
   case X86ISD::ADD:                return "X86ISD::ADD";
   case X86ISD::SUB:                return "X86ISD::SUB";
   case X86ISD::ADC:                return "X86ISD::ADC";
@@ -13214,9 +13824,14 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
   case X86ISD::BLSI:               return "X86ISD::BLSI";
   case X86ISD::BLSMSK:             return "X86ISD::BLSMSK";
   case X86ISD::BLSR:               return "X86ISD::BLSR";
+  case X86ISD::BZHI:               return "X86ISD::BZHI";
+  case X86ISD::BEXTR:              return "X86ISD::BEXTR";
   case X86ISD::MUL_IMM:            return "X86ISD::MUL_IMM";
   case X86ISD::PTEST:              return "X86ISD::PTEST";
   case X86ISD::TESTP:              return "X86ISD::TESTP";
+  case X86ISD::TESTM:              return "X86ISD::TESTM";
+  case X86ISD::KORTEST:            return "X86ISD::KORTEST";
+  case X86ISD::KTEST:              return "X86ISD::KTEST";
   case X86ISD::PALIGNR:            return "X86ISD::PALIGNR";
   case X86ISD::PSHUFD:             return "X86ISD::PSHUFD";
   case X86ISD::PSHUFHW:            return "X86ISD::PSHUFHW";
@@ -13239,6 +13854,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
   case X86ISD::VPERMILP:           return "X86ISD::VPERMILP";
   case X86ISD::VPERM2X128:         return "X86ISD::VPERM2X128";
   case X86ISD::VPERMV:             return "X86ISD::VPERMV";
+  case X86ISD::VPERMV3:            return "X86ISD::VPERMV3";
   case X86ISD::VPERMI:             return "X86ISD::VPERMI";
   case X86ISD::PMULUDQ:            return "X86ISD::PMULUDQ";
   case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS";
@@ -13422,37 +14038,46 @@ bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const {
 bool
 X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
                                       EVT VT) const {
+  if (!VT.isSimple())
+    return false;
+
+  MVT SVT = VT.getSimpleVT();
+
   // Very little shuffling can be done for 64-bit vectors right now.
   if (VT.getSizeInBits() == 64)
     return false;

   // FIXME: pshufb, blends, shifts.
-  return (VT.getVectorNumElements() == 2 ||
+  return (SVT.getVectorNumElements() == 2 ||
           ShuffleVectorSDNode::isSplatMask(&M[0], VT) ||
-          isMOVLMask(M, VT) ||
-          isSHUFPMask(M, VT, Subtarget->hasFp256()) ||
-          isPSHUFDMask(M, VT) ||
-          isPSHUFHWMask(M, VT, Subtarget->hasInt256()) ||
-          isPSHUFLWMask(M, VT, Subtarget->hasInt256()) ||
-          isPALIGNRMask(M, VT, Subtarget) ||
-          isUNPCKLMask(M, VT, Subtarget->hasInt256()) ||
-          isUNPCKHMask(M, VT, Subtarget->hasInt256()) ||
-          isUNPCKL_v_undef_Mask(M, VT, Subtarget->hasInt256()) ||
-          isUNPCKH_v_undef_Mask(M, VT, Subtarget->hasInt256()));
+          isMOVLMask(M, SVT) ||
+          isSHUFPMask(M, SVT) ||
+          isPSHUFDMask(M, SVT) ||
+          isPSHUFHWMask(M, SVT, Subtarget->hasInt256()) ||
+          isPSHUFLWMask(M, SVT, Subtarget->hasInt256()) ||
+          isPALIGNRMask(M, SVT, Subtarget) ||
+          isUNPCKLMask(M, SVT, Subtarget->hasInt256()) ||
+          isUNPCKHMask(M, SVT, Subtarget->hasInt256()) ||
+          isUNPCKL_v_undef_Mask(M, SVT, Subtarget->hasInt256()) ||
+          isUNPCKH_v_undef_Mask(M, SVT, Subtarget->hasInt256()));
 }

 bool
 X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask,
                                           EVT VT) const {
-  unsigned NumElts = VT.getVectorNumElements();
+  if (!VT.isSimple())
+    return false;
+
+  MVT SVT = VT.getSimpleVT();
+  unsigned NumElts = SVT.getVectorNumElements();
+
   // FIXME: This collection of masks seems suspect.
   if (NumElts == 2)
     return true;
-  if (NumElts == 4 && VT.is128BitVector()) {
-    return (isMOVLMask(Mask, VT)  ||
-            isCommutedMOVLMask(Mask, VT, true) ||
-            isSHUFPMask(Mask, VT, Subtarget->hasFp256()) ||
-            isSHUFPMask(Mask, VT, Subtarget->hasFp256(), /* Commuted */ true));
+  if (NumElts == 4 && SVT.is128BitVector()) {
+    return (isMOVLMask(Mask, SVT)  ||
+            isCommutedMOVLMask(Mask, SVT, true) ||
+            isSHUFPMask(Mask, SVT) ||
+            isSHUFPMask(Mask, SVT, /* Commuted */ true));
   }
   return false;
 }
@@ -15194,6 +15819,9 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
   case X86::CMOV_V8F32:
   case X86::CMOV_V4F64:
   case X86::CMOV_V4I64:
+  case X86::CMOV_V16F32:
+  case X86::CMOV_V8F64:
+  case X86::CMOV_V8I64:
   case X86::CMOV_GR16:
   case X86::CMOV_GR32:
   case X86::CMOV_RFP32:
@@ -15642,7 +16270,7 @@ static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
   for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i)
     Elts.push_back(getShuffleScalarElt(N, i, DAG, 0));

-  return EltsFromConsecutiveLoads(VT, Elts, dl, DAG);
+  return EltsFromConsecutiveLoads(VT, Elts, dl, DAG, true);
 }

 /// PerformTruncateCombine - Converts truncate operation to
@@ -15749,6 +16377,44 @@ static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
                      EltNo);
 }

+/// Extract one bit from mask vector, like v16i1 or v8i1.
+/// AVX-512 feature.
+static SDValue ExtractBitFromMaskVector(SDNode *N, SelectionDAG &DAG) {
+  SDValue Vec = N->getOperand(0);
+  SDLoc dl(Vec);
+  MVT VecVT = Vec.getSimpleValueType();
+  SDValue Idx = N->getOperand(1);
+  MVT EltVT = N->getSimpleValueType(0);
+
+  assert((VecVT.getVectorElementType() == MVT::i1 && EltVT == MVT::i8) ||
+         "Unexpected operands in ExtractBitFromMaskVector");
+
+  // variable index
+  if (!isa<ConstantSDNode>(Idx)) {
+    MVT ExtVT = (VecVT == MVT::v8i1 ? MVT::v8i64 : MVT::v16i32);
+    SDValue Ext = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Vec);
+    SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
+                              ExtVT.getVectorElementType(), Ext);
+    return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
+  }
+
+  unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
+
+  MVT ScalarVT = MVT::getIntegerVT(VecVT.getSizeInBits());
+  unsigned MaxShift = VecVT.getSizeInBits() - 1;
+  Vec = DAG.getNode(ISD::BITCAST, dl, ScalarVT, Vec);
+  Vec = DAG.getNode(ISD::SHL, dl, ScalarVT, Vec,
+                    DAG.getConstant(MaxShift - IdxVal, ScalarVT));
+  Vec = DAG.getNode(ISD::SRL, dl, ScalarVT, Vec,
+                    DAG.getConstant(MaxShift, ScalarVT));
+
+  if (VecVT == MVT::v16i1) {
+    Vec = DAG.getNode(ISD::BITCAST, dl, MVT::i16, Vec);
+    return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Vec);
+  }
+  return DAG.getNode(ISD::BITCAST, dl, MVT::i8, Vec);
+}
+
 /// PerformEXTRACT_VECTOR_ELTCombine - Detect vector gather/scatter index
 /// generation and convert it from being a bunch of shuffles and extracts
 /// to a simple store and scalar loads to extract the elements.
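For a constant index, ExtractBitFromMaskVector views the whole i1 mask vector as a single integer and isolates bit IdxVal with a shift-up/shift-down pair. A scalar model for a v16i1 mask (illustrative only, not part of the patch):

    #include <cassert>
    #include <cstdint>

    static uint8_t extract_mask_bit(uint16_t Mask, unsigned IdxVal) {
      const unsigned MaxShift = 15;    // VecVT.getSizeInBits() - 1
      Mask <<= (MaxShift - IdxVal);    // move bit IdxVal to the top (SHL)
      Mask >>= MaxShift;               // and back down to bit 0 (SRL)
      return (uint8_t)Mask;
    }

    int main() {
      uint16_t K = 0x0204;             // bits 2 and 9 set
      assert(extract_mask_bit(K, 2) == 1);
      assert(extract_mask_bit(K, 9) == 1);
      assert(extract_mask_bit(K, 3) == 0);
    }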
@@ -15759,6 +16425,11 @@ static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
     return NewOp;

   SDValue InputVector = N->getOperand(0);
+
+  if (InputVector.getValueType().getVectorElementType() == MVT::i1 &&
+      !DCI.isBeforeLegalize())
+    return ExtractBitFromMaskVector(N, DAG);
+
   // Detect whether we are trying to convert from mmx to i32 and the bitcast
   // from mmx to v2i32 has a single usage.
   if (InputVector.getNode()->getOpcode() == llvm::ISD::BITCAST &&
@@ -15846,24 +16517,28 @@ static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
 }

 /// \brief Matches a VSELECT onto min/max or return 0 if the node doesn't match.
-static unsigned matchIntegerMINMAX(SDValue Cond, EVT VT, SDValue LHS,
-                                   SDValue RHS, SelectionDAG &DAG,
-                                   const X86Subtarget *Subtarget) {
+static std::pair<unsigned, bool>
+matchIntegerMINMAX(SDValue Cond, EVT VT, SDValue LHS, SDValue RHS,
+                   SelectionDAG &DAG, const X86Subtarget *Subtarget) {
   if (!VT.isVector())
-    return 0;
+    return std::make_pair(0, false);

+  bool NeedSplit = false;
   switch (VT.getSimpleVT().SimpleTy) {
-  default: return 0;
+  default: return std::make_pair(0, false);
   case MVT::v32i8:
   case MVT::v16i16:
   case MVT::v8i32:
     if (!Subtarget->hasAVX2())
-      return 0;
+      NeedSplit = true;
+    if (!Subtarget->hasAVX())
+      return std::make_pair(0, false);
+    break;
   case MVT::v16i8:
   case MVT::v8i16:
   case MVT::v4i32:
     if (!Subtarget->hasSSE2())
-      return 0;
+      return std::make_pair(0, false);
   }

   // SSE2 has only a small subset of the operations.
@@ -15874,6 +16549,7 @@ static unsigned matchIntegerMINMAX(SDValue Cond, EVT VT, SDValue LHS,

   ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();

+  unsigned Opc = 0;
   // Check for x CC y ? x : y.
   if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
       DAG.isEqualTo(RHS, Cond.getOperand(1))) {
@@ -15881,16 +16557,16 @@ static unsigned matchIntegerMINMAX(SDValue Cond, EVT VT, SDValue LHS,
     default: break;
     case ISD::SETULT:
     case ISD::SETULE:
-      return hasUnsigned ? X86ISD::UMIN : 0;
+      Opc = hasUnsigned ? X86ISD::UMIN : 0; break;
     case ISD::SETUGT:
     case ISD::SETUGE:
-      return hasUnsigned ? X86ISD::UMAX : 0;
+      Opc = hasUnsigned ? X86ISD::UMAX : 0; break;
     case ISD::SETLT:
     case ISD::SETLE:
-      return hasSigned ? X86ISD::SMIN : 0;
+      Opc = hasSigned ? X86ISD::SMIN : 0; break;
     case ISD::SETGT:
     case ISD::SETGE:
-      return hasSigned ? X86ISD::SMAX : 0;
+      Opc = hasSigned ? X86ISD::SMAX : 0; break;
     }
   // Check for x CC y ? y : x -- a min/max with reversed arms.
   } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
@@ -15899,20 +16575,20 @@ static unsigned matchIntegerMINMAX(SDValue Cond, EVT VT, SDValue LHS,
     default: break;
     case ISD::SETULT:
     case ISD::SETULE:
-      return hasUnsigned ? X86ISD::UMAX : 0;
+      Opc = hasUnsigned ? X86ISD::UMAX : 0; break;
     case ISD::SETUGT:
    case ISD::SETUGE:
-      return hasUnsigned ? X86ISD::UMIN : 0;
+      Opc = hasUnsigned ? X86ISD::UMIN : 0; break;
     case ISD::SETLT:
     case ISD::SETLE:
-      return hasSigned ? X86ISD::SMAX : 0;
+      Opc = hasSigned ? X86ISD::SMAX : 0; break;
     case ISD::SETGT:
     case ISD::SETGE:
-      return hasSigned ? X86ISD::SMIN : 0;
+      Opc = hasSigned ? X86ISD::SMIN : 0; break;
     }
   }

-  return 0;
+  return std::make_pair(Opc, NeedSplit);
 }

 /// PerformSELECTCombine - Do target-specific dag combines on SELECT and VSELECT
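The pattern matchIntegerMINMAX recognizes is a compare feeding a select of the same two operands; swapping the select arms turns a min into a max, which is why both orderings are checked. A scalar sketch for the unsigned case (illustrative only):

    #include <cassert>
    #include <cstdint>

    static uint32_t umin_idiom(uint32_t X, uint32_t Y) {
      return X < Y ? X : Y;   // vselect(setult(x, y), x, y) -> UMIN
    }

    static uint32_t umax_idiom(uint32_t X, uint32_t Y) {
      return X < Y ? Y : X;   // same compare, reversed arms -> UMAX
    }

    int main() {
      assert(umin_idiom(3, 7) == 3);
      assert(umax_idiom(3, 7) == 7);
    }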
@@ -15926,13 +16602,14 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
   SDValue LHS = N->getOperand(1);
   SDValue RHS = N->getOperand(2);
   EVT VT = LHS.getValueType();
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

   // If we have SSE[12] support, try to form min/max nodes. SSE min/max
   // instructions match the semantics of the common C idiom x<y?x:y but not
   // x<=y?x:y, because of how they handle negative zero (which can be
   // ignored in unsafe-math mode).
   if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() &&
-      VT != MVT::f80 && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
+      VT != MVT::f80 && TLI.isTypeLegal(VT) &&
       (Subtarget->hasSSE2() ||
        (Subtarget->hasSSE1() && VT.getScalarType() == MVT::f32))) {
     ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
@@ -16071,6 +16748,22 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
     return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
   }

+  EVT CondVT = Cond.getValueType();
+  if (Subtarget->hasAVX512() && VT.isVector() && CondVT.isVector() &&
+      CondVT.getVectorElementType() == MVT::i1) {
+    // v16i8 (select v16i1, v16i8, v16i8) does not have a proper
+    // lowering on AVX-512. In this case we convert it to
+    // v16i8 (select v16i8, v16i8, v16i8) and use AVX instruction.
+    // The same situation for all 128 and 256-bit vectors of i8 and i16
+    EVT OpVT = LHS.getValueType();
+    if ((OpVT.is128BitVector() || OpVT.is256BitVector()) &&
+        (OpVT.getVectorElementType() == MVT::i8 ||
+         OpVT.getVectorElementType() == MVT::i16)) {
+      Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, OpVT, Cond);
+      DCI.AddToWorklist(Cond.getNode());
+      return DAG.getNode(N->getOpcode(), DL, OpVT, Cond, LHS, RHS);
+    }
+  }
   // If this is a select between two integer constants, try to do some
   // optimizations.
   if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(LHS)) {
@@ -16195,9 +16888,12 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
     }
   }

+  // Early exit check
+  if (!TLI.isTypeLegal(VT))
+    return SDValue();
+
   // Match VSELECTs into subs with unsigned saturation.
-  if (!DCI.isBeforeLegalize() &&
-      N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
+  if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
       // psubus is available in SSE2 and AVX2 for i8 and i16 vectors.
       ((Subtarget->hasSSE2() && (VT == MVT::v16i8 || VT == MVT::v8i16)) ||
        (Subtarget->hasAVX2() && (VT == MVT::v32i8 || VT == MVT::v16i16)))) {
@@ -16251,14 +16947,35 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
   }

   // Try to match a min/max vector operation.
-  if (!DCI.isBeforeLegalize() &&
-      N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC)
-    if (unsigned Op = matchIntegerMINMAX(Cond, VT, LHS, RHS, DAG, Subtarget))
-      return DAG.getNode(Op, DL, N->getValueType(0), LHS, RHS);
+  if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC) {
+    std::pair<unsigned, bool> ret = matchIntegerMINMAX(Cond, VT, LHS, RHS, DAG, Subtarget);
+    unsigned Opc = ret.first;
+    bool NeedSplit = ret.second;
+
+    if (Opc && NeedSplit) {
+      unsigned NumElems = VT.getVectorNumElements();
+      // Extract the LHS vectors
+      SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, DL);
+      SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, DL);
+
+      // Extract the RHS vectors
+      SDValue RHS1 = Extract128BitVector(RHS, 0, DAG, DL);
+      SDValue RHS2 = Extract128BitVector(RHS, NumElems/2, DAG, DL);
+
+      // Create min/max for each subvector
+      LHS = DAG.getNode(Opc, DL, LHS1.getValueType(), LHS1, RHS1);
+      RHS = DAG.getNode(Opc, DL, LHS2.getValueType(), LHS2, RHS2);
+
+      // Merge the result
+      return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LHS, RHS);
+    } else if (Opc)
+      return DAG.getNode(Opc, DL, VT, LHS, RHS);
+  }

   // Simplify vector selection if the selector will be produced by CMPP*/PCMP*.
-  if (!DCI.isBeforeLegalize() && N->getOpcode() == ISD::VSELECT &&
-      Cond.getOpcode() == ISD::SETCC) {
+  if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
+      // Check if SETCC has already been promoted
+      TLI.getSetCCResultType(*DAG.getContext(), VT) == Cond.getValueType()) {

     assert(Cond.getValueType().isVector() &&
            "vector select expects a vector selector!");
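When NeedSplit is set, the wide operation is legalized as two 128-bit halves that are concatenated again, mirroring the Extract128BitVector/CONCAT_VECTORS calls above. A scalar sketch over eight lanes (illustrative only):

    #include <algorithm>
    #include <array>
    #include <cassert>

    using V8 = std::array<int, 8>;

    static V8 smin_split(const V8 &LHS, const V8 &RHS) {
      V8 Res;
      for (unsigned i = 0; i != 4; ++i)   // low half (LHS1/RHS1)
        Res[i] = std::min(LHS[i], RHS[i]);
      for (unsigned i = 4; i != 8; ++i)   // high half (LHS2/RHS2)
        Res[i] = std::min(LHS[i], RHS[i]);
      return Res;                          // CONCAT_VECTORS of the halves
    }

    int main() {
      V8 A = {1, 9, 2, 8, 3, 7, 4, 6};
      V8 B = {5, 5, 5, 5, 5, 5, 5, 5};
      V8 R = smin_split(A, B);
      assert(R[1] == 5 && R[4] == 3);
    }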
@@ -16305,7 +17022,6 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
   // matched by one of the SSE/AVX BLEND instructions. These instructions only
   // depend on the highest bit in each word. Try to use SimplifyDemandedBits
   // to simplify previous instructions.
-  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   if (N->getOpcode() == ISD::VSELECT && DCI.isBeforeLegalizeOps() &&
       !DCI.isBeforeLegalize() && TLI.isOperationLegal(ISD::VSELECT, VT)) {
     unsigned BitWidth = Cond.getValueType().getScalarType().getSizeInBits();
@@ -16314,6 +17030,15 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,

     if (BitWidth == 1)
       return SDValue();

+    // Check all uses of that condition operand to check whether it will be
+    // consumed by non-BLEND instructions, which may depend on all bits are set
+    // properly.
+    for (SDNode::use_iterator I = Cond->use_begin(),
+                              E = Cond->use_end(); I != E; ++I)
+      if (I->getOpcode() != ISD::VSELECT)
+        // TODO: Add other opcodes eventually lowered into BLEND.
+        return SDValue();
+
     assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size");
     APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 1);
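The use check added above matters because SSE/AVX variable blends such as PBLENDVB read only the top bit of each condition lane; SimplifyDemandedBits may therefore scramble the lower bits, which is only safe if every user is a blend-like VSELECT. A one-lane sketch of the blend semantics (illustrative only):

    #include <cassert>
    #include <cstdint>

    static uint8_t pblendvb_lane(uint8_t Cond, uint8_t A, uint8_t B) {
      return (Cond & 0x80) ? B : A;  // only the sign bit of Cond is observed
    }

    int main() {
      assert(pblendvb_lane(0x80, 1, 2) == 2);
      assert(pblendvb_lane(0x7F, 1, 2) == 1);  // low bits set, still not taken
    }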
@@ -16990,33 +17715,80 @@ static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG,
   if (R.getNode())
     return R;

-  // Create BLSI, and BLSR instructions
+  // Create BLSI, BLSR, and BZHI instructions
   // BLSI is X & (-X)
   // BLSR is X & (X-1)
-  if (Subtarget->hasBMI() && (VT == MVT::i32 || VT == MVT::i64)) {
+  // BZHI is X & ((1 << Y) - 1)
+  // BEXTR is ((X >> imm) & (2**size-1))
+  if (VT == MVT::i32 || VT == MVT::i64) {
     SDValue N0 = N->getOperand(0);
     SDValue N1 = N->getOperand(1);
     SDLoc DL(N);

-    // Check LHS for neg
-    if (N0.getOpcode() == ISD::SUB && N0.getOperand(1) == N1 &&
-        isZero(N0.getOperand(0)))
-      return DAG.getNode(X86ISD::BLSI, DL, VT, N1);
-
-    // Check RHS for neg
-    if (N1.getOpcode() == ISD::SUB && N1.getOperand(1) == N0 &&
-        isZero(N1.getOperand(0)))
-      return DAG.getNode(X86ISD::BLSI, DL, VT, N0);
+    if (Subtarget->hasBMI()) {
+      // Check LHS for neg
+      if (N0.getOpcode() == ISD::SUB && N0.getOperand(1) == N1 &&
+          isZero(N0.getOperand(0)))
+        return DAG.getNode(X86ISD::BLSI, DL, VT, N1);
+
+      // Check RHS for neg
+      if (N1.getOpcode() == ISD::SUB && N1.getOperand(1) == N0 &&
+          isZero(N1.getOperand(0)))
+        return DAG.getNode(X86ISD::BLSI, DL, VT, N0);
+
+      // Check LHS for X-1
+      if (N0.getOpcode() == ISD::ADD && N0.getOperand(0) == N1 &&
+          isAllOnes(N0.getOperand(1)))
+        return DAG.getNode(X86ISD::BLSR, DL, VT, N1);
+
+      // Check RHS for X-1
+      if (N1.getOpcode() == ISD::ADD && N1.getOperand(0) == N0 &&
+          isAllOnes(N1.getOperand(1)))
+        return DAG.getNode(X86ISD::BLSR, DL, VT, N0);
+    }
+
+    if (Subtarget->hasBMI2()) {
+      // Check for (and (add (shl 1, Y), -1), X)
+      if (N0.getOpcode() == ISD::ADD && isAllOnes(N0.getOperand(1))) {
+        SDValue N00 = N0.getOperand(0);
+        if (N00.getOpcode() == ISD::SHL) {
+          SDValue N001 = N00.getOperand(1);
+          assert(N001.getValueType() == MVT::i8 && "unexpected type");
+          ConstantSDNode *C = dyn_cast<ConstantSDNode>(N00.getOperand(0));
+          if (C && C->getZExtValue() == 1)
+            return DAG.getNode(X86ISD::BZHI, DL, VT, N1, N001);
+        }
+      }

-    // Check LHS for X-1
-    if (N0.getOpcode() == ISD::ADD && N0.getOperand(0) == N1 &&
-        isAllOnes(N0.getOperand(1)))
-      return DAG.getNode(X86ISD::BLSR, DL, VT, N1);
+      // Check for (and X, (add (shl 1, Y), -1))
+      if (N1.getOpcode() == ISD::ADD && isAllOnes(N1.getOperand(1))) {
+        SDValue N10 = N1.getOperand(0);
+        if (N10.getOpcode() == ISD::SHL) {
+          SDValue N101 = N10.getOperand(1);
+          assert(N101.getValueType() == MVT::i8 && "unexpected type");
+          ConstantSDNode *C = dyn_cast<ConstantSDNode>(N10.getOperand(0));
+          if (C && C->getZExtValue() == 1)
+            return DAG.getNode(X86ISD::BZHI, DL, VT, N0, N101);
+        }
+      }
+    }

-    // Check RHS for X-1
-    if (N1.getOpcode() == ISD::ADD && N1.getOperand(0) == N0 &&
-        isAllOnes(N1.getOperand(1)))
-      return DAG.getNode(X86ISD::BLSR, DL, VT, N0);
+    // Check for BEXTR.
+    if ((Subtarget->hasBMI() || Subtarget->hasTBM()) &&
+        (N0.getOpcode() == ISD::SRA || N0.getOpcode() == ISD::SRL)) {
+      ConstantSDNode *MaskNode = dyn_cast<ConstantSDNode>(N1);
+      ConstantSDNode *ShiftNode = dyn_cast<ConstantSDNode>(N0.getOperand(1));
+      if (MaskNode && ShiftNode) {
+        uint64_t Mask = MaskNode->getZExtValue();
+        uint64_t Shift = ShiftNode->getZExtValue();
+        if (isMask_64(Mask)) {
+          uint64_t MaskSize = CountPopulation_64(Mask);
+          if (Shift + MaskSize <= VT.getSizeInBits())
+            return DAG.getNode(X86ISD::BEXTR, DL, VT, N0.getOperand(0),
+                               DAG.getConstant(Shift | (MaskSize << 8), VT));
+        }
+      }
+    } // BEXTR

     return SDValue();
   }
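The control operand built for X86ISD::BEXTR packs the start bit in bits 7:0 and the field length in bits 15:8, matching the BMI BEXTR instruction's encoding. A scalar model (illustrative only, not part of the patch):

    #include <cassert>
    #include <cstdint>

    static uint64_t bextr(uint64_t Src, uint64_t Control) {
      unsigned Shift = Control & 0xFF;       // start bit
      unsigned Len = (Control >> 8) & 0xFF;  // field length (MaskSize)
      if (Len >= 64)
        return Src >> Shift;                 // avoid an out-of-range shift
      return (Src >> Shift) & ((1ULL << Len) - 1);
    }

    int main() {
      // (X >> 4) & 0xFF is matched as bextr(X, 4 | (8 << 8)).
      uint64_t X = 0xDEADBEEFULL;
      assert(bextr(X, 4 | (8 << 8)) == ((X >> 4) & 0xFF));
    }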
@@ -17732,7 +18504,7 @@ static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) {
       RHS.getOpcode() != ISD::VECTOR_SHUFFLE)
     return false;

-  MVT VT = LHS.getValueType().getSimpleVT();
+  MVT VT = LHS.getSimpleValueType();
   assert((VT.is128BitVector() || VT.is256BitVector()) &&
          "Unsupported vector type for horizontal add/sub");
@@ -18205,7 +18977,7 @@ static SDValue PerformSINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG,
   if (!Ld->isVolatile() && !N->getValueType(0).isVector() &&
       ISD::isNON_EXTLoad(Op0.getNode()) && Op0.hasOneUse() &&
       !XTLI->getSubtarget()->is64Bit() &&
-      !DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
+      VT == MVT::i64) {
     SDValue FILDChain = XTLI->BuildFILD(SDValue(N, 0), Ld->getValueType(0),
                                         Ld->getChain(), Op0, DAG);
     DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), FILDChain.getValue(1));
@@ -18531,6 +19303,22 @@ namespace {
   const VariadicFunction1<bool, StringRef, StringRef, matchAsmImpl> matchAsm={};
 }

+static bool clobbersFlagRegisters(const SmallVector<StringRef, 4> &AsmPieces) {
+
+  if (AsmPieces.size() == 3 || AsmPieces.size() == 4) {
+    if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{cc}") &&
+        std::count(AsmPieces.begin(), AsmPieces.end(), "~{flags}") &&
+        std::count(AsmPieces.begin(), AsmPieces.end(), "~{fpsr}")) {
+
+      if (AsmPieces.size() == 3)
+        return true;
+      else if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{dirflag}"))
+        return true;
+    }
+  }
+  return false;
+}
+
 bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
   InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());
@@ -18572,12 +19360,8 @@ bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
       const std::string &ConstraintsStr = IA->getConstraintString();
       SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
       array_pod_sort(AsmPieces.begin(), AsmPieces.end());
-      if (AsmPieces.size() == 4 &&
-          AsmPieces[0] == "~{cc}" &&
-          AsmPieces[1] == "~{dirflag}" &&
-          AsmPieces[2] == "~{flags}" &&
-          AsmPieces[3] == "~{fpsr}")
-        return IntrinsicLowering::LowerToByteSwap(CI);
+      if (clobbersFlagRegisters(AsmPieces))
+        return IntrinsicLowering::LowerToByteSwap(CI);
     }
     break;
   case 3:
@@ -18590,11 +19374,7 @@ bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
       const std::string &ConstraintsStr = IA->getConstraintString();
       SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
       array_pod_sort(AsmPieces.begin(), AsmPieces.end());
-      if (AsmPieces.size() == 4 &&
-          AsmPieces[0] == "~{cc}" &&
-          AsmPieces[1] == "~{dirflag}" &&
-          AsmPieces[2] == "~{flags}" &&
-          AsmPieces[3] == "~{fpsr}")
+      if (clobbersFlagRegisters(AsmPieces))
         return IntrinsicLowering::LowerToByteSwap(CI);
     }