Diffstat (limited to 'lib/Target/X86/X86ISelLowering.cpp')
-rw-r--r--  lib/Target/X86/X86ISelLowering.cpp | 466
1 file changed, 309 insertions(+), 157 deletions(-)
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 88f3829..3042386 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -49,6 +49,7 @@ #include "llvm/Support/MathExtras.h" #include "llvm/Target/TargetOptions.h" #include <bitset> +#include <cctype> using namespace llvm; STATISTIC(NumTailCalls, "Number of tail calls"); @@ -1578,18 +1579,20 @@ X86TargetLowering::LowerReturn(SDValue Chain, MVT::Other, &RetOps[0], RetOps.size()); } -bool X86TargetLowering::isUsedByReturnOnly(SDNode *N) const { +bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const { if (N->getNumValues() != 1) return false; if (!N->hasNUsesOfValue(1, 0)) return false; + SDValue TCChain = Chain; SDNode *Copy = *N->use_begin(); if (Copy->getOpcode() == ISD::CopyToReg) { // If the copy has a glue operand, we conservatively assume it isn't safe to // perform a tail call. if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue) return false; + TCChain = Copy->getOperand(0); } else if (Copy->getOpcode() != ISD::FP_EXTEND) return false; @@ -1601,7 +1604,11 @@ bool X86TargetLowering::isUsedByReturnOnly(SDNode *N) const { HasRet = true; } - return HasRet; + if (!HasRet) + return false; + + Chain = TCChain; + return true; } EVT @@ -2929,6 +2936,7 @@ static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT, case X86ISD::PSHUFHW: case X86ISD::PSHUFLW: case X86ISD::VPERMILP: + case X86ISD::VPERMI: return DAG.getNode(Opc, dl, VT, V1, DAG.getConstant(TargetMask, MVT::i8)); } } @@ -3970,6 +3978,27 @@ unsigned X86::getInsertVINSERTF128Immediate(SDNode *N) { return Index / NumElemsPerChunk; } +/// getShuffleCLImmediate - Return the appropriate immediate to shuffle +/// the specified VECTOR_SHUFFLE mask with VPERMQ and VPERMPD instructions. +/// Handles 256-bit. +static unsigned getShuffleCLImmediate(ShuffleVectorSDNode *N) { + EVT VT = N->getValueType(0); + + unsigned NumElts = VT.getVectorNumElements(); + + assert((VT.is256BitVector() && NumElts == 4) && + "Unsupported vector type for VPERMQ/VPERMPD"); + + unsigned Mask = 0; + for (unsigned i = 0; i != NumElts; ++i) { + int Elt = N->getMaskElt(i); + if (Elt < 0) + continue; + Mask |= Elt << (i*2); + } + + return Mask; +} /// isZeroNode - Returns true if Elt is a constant zero or a floating point /// constant +0.0. bool X86::isZeroNode(SDValue Elt) { @@ -4402,6 +4431,7 @@ static bool getTargetShuffleMask(SDNode *N, EVT VT, case X86ISD::VPERM2X128: ImmN = N->getOperand(N->getNumOperands()-1); DecodeVPERM2X128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); + if (Mask.empty()) return false; break; case X86ISD::MOVDDUP: case X86ISD::MOVLHPD: @@ -4852,41 +4882,42 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts, return SDValue(); } -/// isVectorBroadcast - Check if the node chain is suitable to be xformed to -/// a vbroadcast node. We support two patterns: -/// 1. A splat BUILD_VECTOR which uses a single scalar load. +/// LowerVectorBroadcast - Attempt to use the vbroadcast instruction +/// to generate a splat value for the following cases: +/// 1. A splat BUILD_VECTOR which uses a single scalar load, or a constant. /// 2. A splat shuffle which uses a scalar_to_vector node which comes from -/// a scalar load. -/// The scalar load node is returned when a pattern is found, +/// a scalar load, or a constant. +/// The VBROADCAST node is returned when a pattern is found, /// or SDValue() otherwise. 
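To make the two broadcast patterns listed in that comment concrete: a splat BUILD_VECTOR fed by a single scalar load is what the vectorizer typically produces for a loop that replicates one memory operand. The snippet below is a minimal sketch (not part of the patch; function name is hypothetical) of source code that, with -mavx, should now lower through the function below to a single vbroadcastss instead of a load plus shuffle:

    // Illustrative sketch (not from the patch): a splat of one loaded scalar,
    // i.e. pattern 1 above.  With AVX the splat of 's' can typically be
    // emitted as  vbroadcastss ymm, dword ptr [src].
    void splat_store(float *dst, const float *src) {
      float s = *src;                  // single scalar load
      for (int i = 0; i < 8; ++i)      // vectorized into an <8 x float> splat
        dst[i] = s;
    }
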
-static SDValue isVectorBroadcast(SDValue &Op, const X86Subtarget *Subtarget) { +SDValue +X86TargetLowering::LowerVectorBroadcast(SDValue &Op, SelectionDAG &DAG) const { if (!Subtarget->hasAVX()) return SDValue(); EVT VT = Op.getValueType(); - SDValue V = Op; - - if (V.hasOneUse() && V.getOpcode() == ISD::BITCAST) - V = V.getOperand(0); + DebugLoc dl = Op.getDebugLoc(); - //A suspected load to be broadcasted. SDValue Ld; + bool ConstSplatVal; - switch (V.getOpcode()) { + switch (Op.getOpcode()) { default: // Unknown pattern found. return SDValue(); case ISD::BUILD_VECTOR: { // The BUILD_VECTOR node must be a splat. - if (!isSplatVector(V.getNode())) + if (!isSplatVector(Op.getNode())) return SDValue(); - Ld = V.getOperand(0); + Ld = Op.getOperand(0); + ConstSplatVal = (Ld.getOpcode() == ISD::Constant || + Ld.getOpcode() == ISD::ConstantFP); // The suspected load node has several users. Make sure that all // of its users are from the BUILD_VECTOR node. - if (!Ld->hasNUsesOfValue(VT.getVectorNumElements(), 0)) + // Constants may have multiple users. + if (!ConstSplatVal && !Ld->hasNUsesOfValue(VT.getVectorNumElements(), 0)) return SDValue(); break; } @@ -4904,15 +4935,50 @@ static SDValue isVectorBroadcast(SDValue &Op, const X86Subtarget *Subtarget) { return SDValue(); Ld = Sc.getOperand(0); + ConstSplatVal = (Ld.getOpcode() == ISD::Constant || + Ld.getOpcode() == ISD::ConstantFP); // The scalar_to_vector node and the suspected // load node must have exactly one user. - if (!Sc.hasOneUse() || !Ld.hasOneUse()) + // Constants may have multiple users. + if (!ConstSplatVal && (!Sc.hasOneUse() || !Ld.hasOneUse())) return SDValue(); break; } } + bool Is256 = VT.getSizeInBits() == 256; + bool Is128 = VT.getSizeInBits() == 128; + + // Handle the broadcasting a single constant scalar from the constant pool + // into a vector. On Sandybridge it is still better to load a constant vector + // from the constant pool and not to broadcast it from a scalar. + if (ConstSplatVal && Subtarget->hasAVX2()) { + EVT CVT = Ld.getValueType(); + assert(!CVT.isVector() && "Must not broadcast a vector type"); + unsigned ScalarSize = CVT.getSizeInBits(); + + if ((Is256 && (ScalarSize == 32 || ScalarSize == 64)) || + (Is128 && (ScalarSize == 32))) { + + const Constant *C = 0; + if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld)) + C = CI->getConstantIntValue(); + else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld)) + C = CF->getConstantFPValue(); + + assert(C && "Invalid constant type"); + + SDValue CP = DAG.getConstantPool(C, getPointerTy()); + unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment(); + Ld = DAG.getLoad(CVT, dl, DAG.getEntryNode(), CP, + MachinePointerInfo::getConstantPool(), + false, false, false, Alignment); + + return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld); + } + } + // The scalar source must be a normal load. 
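The constant-pool branch added above deserves a concrete illustration (a sketch, not from the patch; the function name is hypothetical): on AVX2 a constant splat can be materialized from a 4- or 8-byte constant-pool entry plus a broadcast, instead of loading a full 32-byte constant vector, while on Sandy Bridge (AVX1) the full constant-vector load is still preferred, as the comment in the hunk notes.

    // Illustrative sketch (not from the patch): with -mavx2 this splat is
    // typically emitted as
    //   vbroadcastss ymm0, dword ptr [rip + constant]   ; 4-byte pool entry
    // rather than a 32-byte load of a full constant vector.
    #include <immintrin.h>
    __m256 splat_one() {
      return _mm256_set1_ps(1.0f);
    }
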
if (!ISD::isNormalLoad(Ld.getNode())) return SDValue(); @@ -4921,28 +4987,26 @@ static SDValue isVectorBroadcast(SDValue &Op, const X86Subtarget *Subtarget) { if (Ld->hasAnyUseOfValue(1)) return SDValue(); - bool Is256 = VT.getSizeInBits() == 256; - bool Is128 = VT.getSizeInBits() == 128; unsigned ScalarSize = Ld.getValueType().getSizeInBits(); // VBroadcast to YMM if (Is256 && (ScalarSize == 32 || ScalarSize == 64)) - return Ld; + return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld); // VBroadcast to XMM if (Is128 && (ScalarSize == 32)) - return Ld; + return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld); // The integer check is needed for the 64-bit into 128-bit so it doesn't match // double since there is vbroadcastsd xmm if (Subtarget->hasAVX2() && Ld.getValueType().isInteger()) { // VBroadcast to YMM if (Is256 && (ScalarSize == 8 || ScalarSize == 16)) - return Ld; + return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld); // VBroadcast to XMM if (Is128 && (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)) - return Ld; + return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld); } // Unsupported broadcast. @@ -4977,9 +5041,9 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { return getOnesVector(VT, Subtarget->hasAVX2(), DAG, dl); } - SDValue LD = isVectorBroadcast(Op, Subtarget); - if (LD.getNode()) - return DAG.getNode(X86ISD::VBROADCAST, dl, VT, LD); + SDValue Broadcast = LowerVectorBroadcast(Op, DAG); + if (Broadcast.getNode()) + return Broadcast; unsigned EVTBits = ExtVT.getSizeInBits(); @@ -5343,6 +5407,85 @@ X86TargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const { return LowerAVXCONCAT_VECTORS(Op, DAG); } +// Try to lower a shuffle node into a simple blend instruction. +static SDValue LowerVECTOR_SHUFFLEtoBlend(SDValue Op, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); + SDValue V1 = SVOp->getOperand(0); + SDValue V2 = SVOp->getOperand(1); + DebugLoc dl = SVOp->getDebugLoc(); + EVT VT = Op.getValueType(); + EVT InVT = V1.getValueType(); + int MaskSize = VT.getVectorNumElements(); + int InSize = InVT.getVectorNumElements(); + + if (!Subtarget->hasSSE41()) + return SDValue(); + + if (MaskSize != InSize) + return SDValue(); + + int ISDNo = 0; + MVT OpTy; + + switch (VT.getSimpleVT().SimpleTy) { + default: return SDValue(); + case MVT::v8i16: + ISDNo = X86ISD::BLENDPW; + OpTy = MVT::v8i16; + break; + case MVT::v4i32: + case MVT::v4f32: + ISDNo = X86ISD::BLENDPS; + OpTy = MVT::v4f32; + break; + case MVT::v2i64: + case MVT::v2f64: + ISDNo = X86ISD::BLENDPD; + OpTy = MVT::v2f64; + break; + case MVT::v8i32: + case MVT::v8f32: + if (!Subtarget->hasAVX()) + return SDValue(); + ISDNo = X86ISD::BLENDPS; + OpTy = MVT::v8f32; + break; + case MVT::v4i64: + case MVT::v4f64: + if (!Subtarget->hasAVX()) + return SDValue(); + ISDNo = X86ISD::BLENDPD; + OpTy = MVT::v4f64; + break; + case MVT::v16i16: + if (!Subtarget->hasAVX2()) + return SDValue(); + ISDNo = X86ISD::BLENDPW; + OpTy = MVT::v16i16; + break; + } + assert(ISDNo && "Invalid Op Number"); + + unsigned MaskVals = 0; + + for (int i = 0; i < MaskSize; ++i) { + int EltIdx = SVOp->getMaskElt(i); + if (EltIdx == i || EltIdx == -1) + MaskVals |= (1<<i); + else if (EltIdx == (i + MaskSize)) + continue; // Bit is set to zero; + else return SDValue(); + } + + V1 = DAG.getNode(ISD::BITCAST, dl, OpTy, V1); + V2 = DAG.getNode(ISD::BITCAST, dl, OpTy, V2); + SDValue Ret = DAG.getNode(ISDNo, dl, OpTy, V1, V2, + DAG.getConstant(MaskVals, 
MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, VT, Ret); +} + // v8i16 shuffles - Prefer shuffles in the following order: // 1. [all] pshuflw, pshufhw, optional move // 2. [ssse3] 1 x pshufb @@ -5836,96 +5979,79 @@ LowerVECTOR_SHUFFLE_256(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { unsigned NumElems = VT.getVectorNumElements(); unsigned NumLaneElems = NumElems / 2; - int MinRange[2][2] = { { static_cast<int>(NumElems), - static_cast<int>(NumElems) }, - { static_cast<int>(NumElems), - static_cast<int>(NumElems) } }; - int MaxRange[2][2] = { { -1, -1 }, { -1, -1 } }; + DebugLoc dl = SVOp->getDebugLoc(); + MVT EltVT = VT.getVectorElementType().getSimpleVT(); + EVT NVT = MVT::getVectorVT(EltVT, NumLaneElems); + SDValue Shufs[2]; - // Collect used ranges for each source in each lane + SmallVector<int, 16> Mask; for (unsigned l = 0; l < 2; ++l) { - unsigned LaneStart = l*NumLaneElems; + // Build a shuffle mask for the output, discovering on the fly which + // input vectors to use as shuffle operands (recorded in InputUsed). + // If building a suitable shuffle vector proves too hard, then bail + // out with useBuildVector set. + int InputUsed[2] = { -1, -1 }; // Not yet discovered. + unsigned LaneStart = l * NumLaneElems; for (unsigned i = 0; i != NumLaneElems; ++i) { + // The mask element. This indexes into the input. int Idx = SVOp->getMaskElt(i+LaneStart); - if (Idx < 0) + if (Idx < 0) { + // the mask element does not index into any input vector. + Mask.push_back(-1); continue; - - int Input = 0; - if (Idx >= (int)NumElems) { - Idx -= NumElems; - Input = 1; } - if (Idx > MaxRange[l][Input]) - MaxRange[l][Input] = Idx; - if (Idx < MinRange[l][Input]) - MinRange[l][Input] = Idx; - } - } + // The input vector this mask element indexes into. + int Input = Idx / NumLaneElems; - // Make sure each range is 128-bits - int ExtractIdx[2][2] = { { -1, -1 }, { -1, -1 } }; - for (unsigned l = 0; l < 2; ++l) { - for (unsigned Input = 0; Input < 2; ++Input) { - if (MinRange[l][Input] == (int)NumElems && MaxRange[l][Input] < 0) - continue; + // Turn the index into an offset from the start of the input vector. + Idx -= Input * NumLaneElems; - if (MinRange[l][Input] >= 0 && MaxRange[l][Input] < (int)NumLaneElems) - ExtractIdx[l][Input] = 0; - else if (MinRange[l][Input] >= (int)NumLaneElems && - MaxRange[l][Input] < (int)NumElems) - ExtractIdx[l][Input] = NumLaneElems; - else - return SDValue(); - } - } + // Find or create a shuffle vector operand to hold this input. + unsigned OpNo; + for (OpNo = 0; OpNo < array_lengthof(InputUsed); ++OpNo) { + if (InputUsed[OpNo] == Input) + // This input vector is already an operand. + break; + if (InputUsed[OpNo] < 0) { + // Create a new operand for this input vector. + InputUsed[OpNo] = Input; + break; + } + } - DebugLoc dl = SVOp->getDebugLoc(); - MVT EltVT = VT.getVectorElementType().getSimpleVT(); - EVT NVT = MVT::getVectorVT(EltVT, NumElems/2); + if (OpNo >= array_lengthof(InputUsed)) { + // More than two input vectors used! Give up. + return SDValue(); + } - SDValue Ops[2][2]; - for (unsigned l = 0; l < 2; ++l) { - for (unsigned Input = 0; Input < 2; ++Input) { - if (ExtractIdx[l][Input] >= 0) - Ops[l][Input] = Extract128BitVector(SVOp->getOperand(Input), - DAG.getConstant(ExtractIdx[l][Input], MVT::i32), - DAG, dl); - else - Ops[l][Input] = DAG.getUNDEF(NVT); + // Add the mask index for the new shuffle vector. 
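Stepping back to LowerVECTOR_SHUFFLEtoBlend above: the immediate it builds has one bit per output lane, set when the lane comes from V1 (or is undef) and clear when it comes from V2. A standalone sketch of that encoding follows (hypothetical helper name, same convention as the code above):

    // Sketch (not from the patch) of the blend-immediate encoding used by
    // LowerVECTOR_SHUFFLEtoBlend: bit i set   -> lane i taken from V1 (or undef),
    //                             bit i clear -> lane i taken from V2.
    #include <cassert>
    static unsigned blendImmediate(const int *Mask, int MaskSize) {
      unsigned MaskVals = 0;
      for (int i = 0; i < MaskSize; ++i) {
        int EltIdx = Mask[i];
        if (EltIdx == i || EltIdx == -1)
          MaskVals |= 1u << i;                  // lane i from V1
        else
          assert(EltIdx == i + MaskSize &&      // otherwise must come from V2
                 "not a blend mask");
      }
      return MaskVals;   // e.g. v4f32 mask <0,5,2,7>  ->  0b0101
    }
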
+ Mask.push_back(Idx + OpNo * NumLaneElems); } - } - // Generate 128-bit shuffles - SmallVector<int, 16> Mask1, Mask2; - for (unsigned i = 0; i != NumLaneElems; ++i) { - int Elt = SVOp->getMaskElt(i); - if (Elt >= (int)NumElems) { - Elt %= NumLaneElems; - Elt += NumLaneElems; - } else if (Elt >= 0) { - Elt %= NumLaneElems; - } - Mask1.push_back(Elt); - } - for (unsigned i = NumLaneElems; i != NumElems; ++i) { - int Elt = SVOp->getMaskElt(i); - if (Elt >= (int)NumElems) { - Elt %= NumLaneElems; - Elt += NumLaneElems; - } else if (Elt >= 0) { - Elt %= NumLaneElems; + if (InputUsed[0] < 0) { + // No input vectors were used! The result is undefined. + Shufs[l] = DAG.getUNDEF(NVT); + } else { + SDValue Op0 = Extract128BitVector(SVOp->getOperand(InputUsed[0] / 2), + DAG.getConstant((InputUsed[0] % 2) * NumLaneElems, MVT::i32), + DAG, dl); + // If only one input was used, use an undefined vector for the other. + SDValue Op1 = (InputUsed[1] < 0) ? DAG.getUNDEF(NVT) : + Extract128BitVector(SVOp->getOperand(InputUsed[1] / 2), + DAG.getConstant((InputUsed[1] % 2) * NumLaneElems, MVT::i32), + DAG, dl); + // At least one input vector was used. Create a new shuffle vector. + Shufs[l] = DAG.getVectorShuffle(NVT, dl, Op0, Op1, &Mask[0]); } - Mask2.push_back(Elt); - } - SDValue Shuf1 = DAG.getVectorShuffle(NVT, dl, Ops[0][0], Ops[0][1], &Mask1[0]); - SDValue Shuf2 = DAG.getVectorShuffle(NVT, dl, Ops[1][0], Ops[1][1], &Mask2[0]); + Mask.clear(); + } // Concatenate the result back - SDValue V = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, VT), Shuf1, + SDValue V = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, VT), Shufs[0], DAG.getConstant(0, MVT::i32), DAG, dl); - return Insert128BitVector(V, Shuf2, DAG.getConstant(NumElems/2, MVT::i32), + return Insert128BitVector(V, Shufs[1],DAG.getConstant(NumLaneElems, MVT::i32), DAG, dl); } @@ -6203,10 +6329,8 @@ SDValue getMOVLP(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG, bool HasSSE2) { getShuffleSHUFImmediate(SVOp), DAG); } -static -SDValue NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG, - const TargetLowering &TLI, - const X86Subtarget *Subtarget) { +SDValue +X86TargetLowering::NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG) const { ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); EVT VT = Op.getValueType(); DebugLoc dl = Op.getDebugLoc(); @@ -6222,9 +6346,9 @@ SDValue NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG, int Size = VT.getSizeInBits(); // Use vbroadcast whenever the splat comes from a foldable load - SDValue LD = isVectorBroadcast(Op, Subtarget); - if (LD.getNode()) - return DAG.getNode(X86ISD::VBROADCAST, dl, VT, LD); + SDValue Broadcast = LowerVectorBroadcast(Op, DAG); + if (Broadcast.getNode()) + return Broadcast; // Handle splats by matching through known shuffle masks if ((Size == 128 && NumElem <= 4) || @@ -6309,7 +6433,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { // Normalize the input vectors. Here splats, zeroed vectors, profitable // narrowing and commutation of operands should be handled. The actual code // doesn't include all of those, work in progress... 
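A worked example may help for the rewritten LowerVECTOR_SHUFFLE_256 above. Each 128-bit output lane is built from at most two of the four 128-bit input halves, and the per-lane mask is rebased onto those two halves. The sketch below (not from the patch, names hypothetical, simplified: it assumes a fully defined mask and at most two distinct halves per lane, where the real code bails out) prints the decomposition for one interleaving shuffle:

    // Sketch: splitting a v8i32 shuffle mask per 128-bit lane.  Input halves
    // are numbered 0..3: 0 = low half of V1, 1 = high half of V1,
    // 2 = low half of V2, 3 = high half of V2.
    #include <cstdio>
    int main() {
      const int NumLaneElems = 4;
      const int Mask[8] = {0, 1, 8, 9, 4, 5, 12, 13};  // interleave V1/V2 halves
      for (int l = 0; l < 2; ++l) {
        int InputUsed[2] = {-1, -1};
        int LaneMask[4];
        for (int i = 0; i < NumLaneElems; ++i) {
          int Idx   = Mask[l * NumLaneElems + i];
          int Input = Idx / NumLaneElems;              // which 128-bit half
          Idx      -= Input * NumLaneElems;            // offset inside that half
          int OpNo  = (InputUsed[0] < 0 || InputUsed[0] == Input) ? 0 : 1;
          InputUsed[OpNo] = Input;
          LaneMask[i]     = Idx + OpNo * NumLaneElems;
        }
        std::printf("lane %d: halves %d,%d  mask %d %d %d %d\n", l,
                    InputUsed[0], InputUsed[1],
                    LaneMask[0], LaneMask[1], LaneMask[2], LaneMask[3]);
        // lane 0: halves 0,2  mask 0 1 4 5  -> one 128-bit shuffle per lane
      }
      return 0;
    }
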
- SDValue NewOp = NormalizeVectorShuffle(Op, DAG, *this, Subtarget); + SDValue NewOp = NormalizeVectorShuffle(Op, DAG); if (NewOp.getNode()) return NewOp; @@ -6524,6 +6648,27 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { return getTargetShuffleNode(X86ISD::VPERM2X128, dl, VT, V1, V2, getShuffleVPERM2X128Immediate(SVOp), DAG); + SDValue BlendOp = LowerVECTOR_SHUFFLEtoBlend(Op, Subtarget, DAG); + if (BlendOp.getNode()) + return BlendOp; + + if (V2IsUndef && HasAVX2 && (VT == MVT::v8i32 || VT == MVT::v8f32)) { + SmallVector<SDValue, 8> permclMask; + for (unsigned i = 0; i != 8; ++i) { + permclMask.push_back(DAG.getConstant((M[i]>=0) ? M[i] : 0, MVT::i32)); + } + SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, + &permclMask[0], 8); + // Bitcast is for VPERMPS since mask is v8i32 but node takes v8f32 + return DAG.getNode(X86ISD::VPERMV, dl, VT, + DAG.getNode(ISD::BITCAST, dl, VT, Mask), V1); + } + + if (V2IsUndef && HasAVX2 && (VT == MVT::v4i64 || VT == MVT::v4f64)) + return getTargetShuffleNode(X86ISD::VPERMI, dl, VT, V1, + getShuffleCLImmediate(SVOp), DAG); + + //===--------------------------------------------------------------------===// // Since no target specific shuffle was selected for this generic one, // lower it into other known shuffles. FIXME: this isn't true yet, but @@ -7182,8 +7327,7 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV)) GV = GA->resolveAliasedGlobal(false); - TLSModel::Model model - = getTLSModel(GV, getTargetMachine().getRelocationModel()); + TLSModel::Model model = getTargetMachine().getTLSModel(GV); switch (model) { case TLSModel::GeneralDynamic: @@ -8099,8 +8243,8 @@ SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC, unsigned BitWidth = Op0.getValueSizeInBits(); unsigned AndBitWidth = And.getValueSizeInBits(); if (BitWidth > AndBitWidth) { - APInt Mask = APInt::getAllOnesValue(BitWidth), Zeros, Ones; - DAG.ComputeMaskedBits(Op0, Mask, Zeros, Ones); + APInt Zeros, Ones; + DAG.ComputeMaskedBits(Op0, Zeros, Ones); if (Zeros.countLeadingOnes() < BitWidth - AndBitWidth) return SDValue(); } @@ -9449,12 +9593,12 @@ X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const case Intrinsic::x86_avx2_vperm2i128: return DAG.getNode(X86ISD::VPERM2X128, dl, Op.getValueType(), Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); - case Intrinsic::x86_avx_vpermil_ps: - case Intrinsic::x86_avx_vpermil_pd: - case Intrinsic::x86_avx_vpermil_ps_256: - case Intrinsic::x86_avx_vpermil_pd_256: - return DAG.getNode(X86ISD::VPERMILP, dl, Op.getValueType(), - Op.getOperand(1), Op.getOperand(2)); + case Intrinsic::x86_avx2_permd: + case Intrinsic::x86_avx2_permps: + // Operands intentionally swapped. Mask is last operand to intrinsic, + // but second operand for node/intruction. + return DAG.getNode(X86ISD::VPERMV, dl, Op.getValueType(), + Op.getOperand(2), Op.getOperand(1)); // ptest and testp intrinsics. 
The intrinsic these come from are designed to // return an integer value, not just an instruction so lower it to the ptest @@ -10963,6 +11107,9 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::ANDNP: return "X86ISD::ANDNP"; case X86ISD::PSIGN: return "X86ISD::PSIGN"; case X86ISD::BLENDV: return "X86ISD::BLENDV"; + case X86ISD::BLENDPW: return "X86ISD::BLENDPW"; + case X86ISD::BLENDPS: return "X86ISD::BLENDPS"; + case X86ISD::BLENDPD: return "X86ISD::BLENDPD"; case X86ISD::HADD: return "X86ISD::HADD"; case X86ISD::HSUB: return "X86ISD::HSUB"; case X86ISD::FHADD: return "X86ISD::FHADD"; @@ -11035,6 +11182,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::VBROADCAST: return "X86ISD::VBROADCAST"; case X86ISD::VPERMILP: return "X86ISD::VPERMILP"; case X86ISD::VPERM2X128: return "X86ISD::VPERM2X128"; + case X86ISD::VPERMV: return "X86ISD::VPERMV"; + case X86ISD::VPERMI: return "X86ISD::VPERMI"; case X86ISD::PMULUDQ: return "X86ISD::PMULUDQ"; case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS"; case X86ISD::VAARG_64: return "X86ISD::VAARG_64"; @@ -11192,14 +11341,15 @@ X86TargetLowering::EmitAtomicBitwiseWithCustomInserter(MachineInstr *bInstr, unsigned notOpc, unsigned EAXreg, const TargetRegisterClass *RC, - bool invSrc) const { + bool Invert) const { // For the atomic bitwise operator, we generate // thisMBB: // newMBB: // ld t1 = [bitinstr.addr] // op t2 = t1, [bitinstr.val] + // not t3 = t2 (if Invert) // mov EAX = t1 - // lcs dest = [bitinstr.addr], t2 [EAX is implicit] + // lcs dest = [bitinstr.addr], t3 [EAX is implicit] // bz newMBB // fallthrough -->nextMBB const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); @@ -11247,13 +11397,6 @@ X86TargetLowering::EmitAtomicBitwiseWithCustomInserter(MachineInstr *bInstr, for (int i=0; i <= lastAddrIndx; ++i) (*MIB).addOperand(*argOpers[i]); - unsigned tt = F->getRegInfo().createVirtualRegister(RC); - if (invSrc) { - MIB = BuildMI(newMBB, dl, TII->get(notOpc), tt).addReg(t1); - } - else - tt = t1; - unsigned t2 = F->getRegInfo().createVirtualRegister(RC); assert((argOpers[valArgIndx]->isReg() || argOpers[valArgIndx]->isImm()) && @@ -11262,16 +11405,23 @@ X86TargetLowering::EmitAtomicBitwiseWithCustomInserter(MachineInstr *bInstr, MIB = BuildMI(newMBB, dl, TII->get(regOpc), t2); else MIB = BuildMI(newMBB, dl, TII->get(immOpc), t2); - MIB.addReg(tt); + MIB.addReg(t1); (*MIB).addOperand(*argOpers[valArgIndx]); + unsigned t3 = F->getRegInfo().createVirtualRegister(RC); + if (Invert) { + MIB = BuildMI(newMBB, dl, TII->get(notOpc), t3).addReg(t2); + } + else + t3 = t2; + MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), EAXreg); MIB.addReg(t1); MIB = BuildMI(newMBB, dl, TII->get(CXchgOpc)); for (int i=0; i <= lastAddrIndx; ++i) (*MIB).addOperand(*argOpers[i]); - MIB.addReg(t2); + MIB.addReg(t3); assert(bInstr->hasOneMemOperand() && "Unexpected number of memoperand"); (*MIB).setMemRefs(bInstr->memoperands_begin(), bInstr->memoperands_end()); @@ -11294,7 +11444,7 @@ X86TargetLowering::EmitAtomicBit6432WithCustomInserter(MachineInstr *bInstr, unsigned regOpcH, unsigned immOpcL, unsigned immOpcH, - bool invSrc) const { + bool Invert) const { // For the atomic bitwise operator, we generate // thisMBB (instructions are in pairs, except cmpxchg8b) // ld t1,t2 = [bitinstr.addr] @@ -11302,6 +11452,7 @@ X86TargetLowering::EmitAtomicBit6432WithCustomInserter(MachineInstr *bInstr, // out1, out2 = phi (thisMBB, t1/t2) (newMBB, t3/t4) // op t5, t6 
<- out1, out2, [bitinstr.val] // (for SWAP, substitute: mov t5, t6 <- [bitinstr.val]) + // neg t7, t8 < t5, t6 (if Invert) // mov ECX, EBX <- t5, t6 // mov EAX, EDX <- t1, t2 // cmpxchg8b [bitinstr.addr] [EAX, EDX, EBX, ECX implicit] @@ -11385,16 +11536,9 @@ X86TargetLowering::EmitAtomicBit6432WithCustomInserter(MachineInstr *bInstr, .addReg(t2).addMBB(thisMBB).addReg(t4).addMBB(newMBB); // The subsequent operations should be using the destination registers of - //the PHI instructions. - if (invSrc) { - t1 = F->getRegInfo().createVirtualRegister(RC); - t2 = F->getRegInfo().createVirtualRegister(RC); - MIB = BuildMI(newMBB, dl, TII->get(NotOpc), t1).addReg(dest1Oper.getReg()); - MIB = BuildMI(newMBB, dl, TII->get(NotOpc), t2).addReg(dest2Oper.getReg()); - } else { - t1 = dest1Oper.getReg(); - t2 = dest2Oper.getReg(); - } + // the PHI instructions. + t1 = dest1Oper.getReg(); + t2 = dest2Oper.getReg(); int valArgIndx = lastAddrIndx + 1; assert((argOpers[valArgIndx]->isReg() || @@ -11421,15 +11565,26 @@ X86TargetLowering::EmitAtomicBit6432WithCustomInserter(MachineInstr *bInstr, MIB.addReg(t2); (*MIB).addOperand(*argOpers[valArgIndx + 1]); + unsigned t7, t8; + if (Invert) { + t7 = F->getRegInfo().createVirtualRegister(RC); + t8 = F->getRegInfo().createVirtualRegister(RC); + MIB = BuildMI(newMBB, dl, TII->get(NotOpc), t7).addReg(t5); + MIB = BuildMI(newMBB, dl, TII->get(NotOpc), t8).addReg(t6); + } else { + t7 = t5; + t8 = t6; + } + MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EAX); MIB.addReg(t1); MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EDX); MIB.addReg(t2); MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EBX); - MIB.addReg(t5); + MIB.addReg(t7); MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::ECX); - MIB.addReg(t6); + MIB.addReg(t8); MIB = BuildMI(newMBB, dl, TII->get(X86::LCMPXCHG8B)); for (int i=0; i <= lastAddrIndx; ++i) @@ -12620,11 +12775,11 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, //===----------------------------------------------------------------------===// void X86TargetLowering::computeMaskedBitsForTargetNode(const SDValue Op, - const APInt &Mask, APInt &KnownZero, APInt &KnownOne, const SelectionDAG &DAG, unsigned Depth) const { + unsigned BitWidth = KnownZero.getBitWidth(); unsigned Opc = Op.getOpcode(); assert((Opc >= ISD::BUILTIN_OP_END || Opc == ISD::INTRINSIC_WO_CHAIN || @@ -12633,7 +12788,7 @@ void X86TargetLowering::computeMaskedBitsForTargetNode(const SDValue Op, "Should use MaskedValueIsZero if you don't know whether Op" " is a target node!"); - KnownZero = KnownOne = APInt(Mask.getBitWidth(), 0); // Don't know anything. + KnownZero = KnownOne = APInt(BitWidth, 0); // Don't know anything. 
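The Invert reordering in the two atomic expanders above is the functional point of that change: atomic NAND must store ~(old & val), so the complement has to be applied to the result of the AND rather than to the loaded operand. A small illustration using a compiler builtin (a sketch, not from the patch; function name hypothetical):

    // Illustration: __sync_fetch_and_nand stores ~(*p & v) and returns the
    // previous value, matching the op-then-not sequence now emitted:
    //   t1 = load [addr];  t2 = t1 & val;  t3 = ~t2;  lock cmpxchg [addr], t3
    #include <stdint.h>
    uint32_t fetch_nand(uint32_t *p, uint32_t v) {
      return __sync_fetch_and_nand(p, v);
    }
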
switch (Opc) { default: break; case X86ISD::ADD: @@ -12652,8 +12807,7 @@ void X86TargetLowering::computeMaskedBitsForTargetNode(const SDValue Op, break; // Fallthrough case X86ISD::SETCC: - KnownZero |= APInt::getHighBitsSet(Mask.getBitWidth(), - Mask.getBitWidth() - 1); + KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1); break; case ISD::INTRINSIC_WO_CHAIN: { unsigned IntId = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); @@ -12678,8 +12832,7 @@ void X86TargetLowering::computeMaskedBitsForTargetNode(const SDValue Op, case Intrinsic::x86_sse2_pmovmskb_128: NumLoBits = 16; break; case Intrinsic::x86_avx2_pmovmskb: NumLoBits = 32; break; } - KnownZero = APInt::getHighBitsSet(Mask.getBitWidth(), - Mask.getBitWidth() - NumLoBits); + KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - NumLoBits); break; } } @@ -14000,13 +14153,14 @@ static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG, return SDValue(); // Validate that X, Y, and Mask are BIT_CONVERTS, and see through them. - if (Mask.getOpcode() != ISD::BITCAST || - X.getOpcode() != ISD::BITCAST || - Y.getOpcode() != ISD::BITCAST) - return SDValue(); - // Look through mask bitcast. - Mask = Mask.getOperand(0); + if (Mask.getOpcode() == ISD::BITCAST) + Mask = Mask.getOperand(0); + if (X.getOpcode() == ISD::BITCAST) + X = X.getOperand(0); + if (Y.getOpcode() == ISD::BITCAST) + Y = Y.getOperand(0); + EVT MaskVT = Mask.getValueType(); // Validate that the Mask operand is a vector sra node. @@ -14027,8 +14181,6 @@ static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG, // Now we know we at least have a plendvb with the mask val. See if // we can form a psignb/w/d. // psign = x.type == y.type == mask.type && y = sub(0, x); - X = X.getOperand(0); - Y = Y.getOperand(0); if (Y.getOpcode() == ISD::SUB && Y.getOperand(1) == X && ISD::isBuildVectorAllZeros(Y.getOperand(0).getNode()) && X.getValueType() == MaskVT && Y.getValueType() == MaskVT) { @@ -15596,8 +15748,8 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, if (Res.second == 0) { // Map st(0) -> st(7) -> ST0 if (Constraint.size() == 7 && Constraint[0] == '{' && - tolower(Constraint[1]) == 's' && - tolower(Constraint[2]) == 't' && + std::tolower(Constraint[1]) == 's' && + std::tolower(Constraint[2]) == 't' && Constraint[3] == '(' && (Constraint[4] >= '0' && Constraint[4] <= '7') && Constraint[5] == ')' && |
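One more illustration for the movmsk known-bits entry above (a sketch, not from the patch; function name hypothetical): pmovmskb on a 128-bit vector can only set the low 16 bits of its i32 result, so recording the high bits as known zero lets later combines drop redundant masking.

    // The high 16 bits of _mm_movemask_epi8's result are always zero, so with
    // the known-bits entry above the explicit AND below can be folded away.
    #include <immintrin.h>
    int byte_sign_mask(__m128i v) {
      return _mm_movemask_epi8(v) & 0xFFFF;   // the mask is redundant
    }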