Diffstat (limited to 'lib/Target/X86/X86ISelLowering.cpp')
 lib/Target/X86/X86ISelLowering.cpp | 3551
 1 file changed, 1954 insertions(+), 1597 deletions(-)
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 03727a2..cae9aad 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -13,9 +13,9 @@ //===----------------------------------------------------------------------===// #define DEBUG_TYPE "x86-isel" +#include "X86ISelLowering.h" #include "X86.h" #include "X86InstrBuilder.h" -#include "X86ISelLowering.h" #include "X86TargetMachine.h" #include "X86TargetObjectFile.h" #include "Utils/X86ShuffleDecode.h" @@ -39,20 +39,17 @@ #include "llvm/MC/MCContext.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCSymbol.h" -#include "llvm/ADT/BitVector.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringExtras.h" -#include "llvm/ADT/VectorExtras.h" +#include "llvm/ADT/VariadicFunction.h" #include "llvm/Support/CallSite.h" #include "llvm/Support/Debug.h" -#include "llvm/Support/Dwarf.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" -#include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetOptions.h" +#include <bitset> using namespace llvm; -using namespace dwarf; STATISTIC(NumTailCalls, "Number of tail calls"); @@ -60,17 +57,6 @@ STATISTIC(NumTailCalls, "Number of tail calls"); static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1, SDValue V2); -static SDValue Insert128BitVector(SDValue Result, - SDValue Vec, - SDValue Idx, - SelectionDAG &DAG, - DebugLoc dl); - -static SDValue Extract128BitVector(SDValue Vec, - SDValue Idx, - SelectionDAG &DAG, - DebugLoc dl); - /// Generate a DAG to grab 128-bits from a vector > 128 bits. This /// sets things up to match to an AVX VEXTRACTF128 instruction or a /// simple subregister reference. Idx is an index in the 128 bits we @@ -168,8 +154,8 @@ static TargetLoweringObjectFile *createTLOF(X86TargetMachine &TM) { X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) : TargetLowering(TM, createTLOF(TM)) { Subtarget = &TM.getSubtarget<X86Subtarget>(); - X86ScalarSSEf64 = Subtarget->hasXMMInt(); - X86ScalarSSEf32 = Subtarget->hasXMM(); + X86ScalarSSEf64 = Subtarget->hasSSE2(); + X86ScalarSSEf32 = Subtarget->hasSSE1(); X86StackPtr = Subtarget->is64Bit() ? X86::RSP : X86::ESP; RegInfo = TM.getRegisterInfo(); @@ -185,8 +171,11 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) // For 64-bit since we have so many registers use the ILP scheduler, for // 32-bit code use the register pressure specific scheduling. + // For 32 bit Atom, use Hybrid (register pressure + latency) scheduling. 
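That scheduling choice reduces to a small decision ladder; a minimal sketch, with hypothetical stand-ins for llvm::Sched and the subtarget queries (the actual branch follows below):

    enum class Sched { ILP, Hybrid, RegPressure };   // stand-in for llvm::Sched
    struct SubtargetInfo { bool Is64Bit, IsAtom; };  // hypothetical flags

    // Many registers -> exploit ILP; in-order Atom -> balance register
    // pressure against latency; other 32-bit -> pure register pressure.
    static Sched pickSchedPref(const SubtargetInfo &ST) {
      if (ST.Is64Bit) return Sched::ILP;
      if (ST.IsAtom)  return Sched::Hybrid;
      return Sched::RegPressure;
    }
    // pickSchedPref({false, true}) == Sched::Hybrid  (32-bit Atom)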
if (Subtarget->is64Bit()) setSchedulingPreference(Sched::ILP); + else if (Subtarget->isAtom()) + setSchedulingPreference(Sched::Hybrid); else setSchedulingPreference(Sched::RegPressure); setStackPointerRegisterToSaveRestore(X86StackPtr); @@ -198,15 +187,18 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setLibcallName(RTLIB::SREM_I64, "_allrem"); setLibcallName(RTLIB::UREM_I64, "_aullrem"); setLibcallName(RTLIB::MUL_I64, "_allmul"); - setLibcallName(RTLIB::FPTOUINT_F64_I64, "_ftol2"); - setLibcallName(RTLIB::FPTOUINT_F32_I64, "_ftol2"); setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall); setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall); setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::X86_StdCall); setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::X86_StdCall); setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall); - setLibcallCallingConv(RTLIB::FPTOUINT_F64_I64, CallingConv::C); - setLibcallCallingConv(RTLIB::FPTOUINT_F32_I64, CallingConv::C); + + // The _ftol2 runtime function has an unusual calling conv, which + // is modeled by a special pseudo-instruction. + setLibcallName(RTLIB::FPTOUINT_F64_I64, 0); + setLibcallName(RTLIB::FPTOUINT_F32_I64, 0); + setLibcallName(RTLIB::FPTOUINT_F64_I32, 0); + setLibcallName(RTLIB::FPTOUINT_F32_I32, 0); } if (Subtarget->isTargetDarwin()) { @@ -255,7 +247,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) if (Subtarget->is64Bit()) { setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Promote); - setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Expand); + setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom); } else if (!TM.Options.UseSoftFloat) { // We have an algorithm for SSE2->double, and we turn this into a // 64-bit FILD followed by conditional FADD for other targets. @@ -326,6 +318,12 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Custom); } + if (isTargetFTOL()) { + // Use the _ftol2 runtime function, which has a pseudo-instruction + // to handle its weird calling convention. + setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Custom); + } + // TODO: when we have SSE, these could be more efficient, by using movd/movq. if (!X86ScalarSSEf64) { setOperationAction(ISD::BITCAST , MVT::f32 , Expand); @@ -378,32 +376,46 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::FREM , MVT::f80 , Expand); setOperationAction(ISD::FLT_ROUNDS_ , MVT::i32 , Custom); - setOperationAction(ISD::CTTZ_ZERO_UNDEF , MVT::i8 , Expand); - setOperationAction(ISD::CTTZ_ZERO_UNDEF , MVT::i16 , Expand); - setOperationAction(ISD::CTTZ_ZERO_UNDEF , MVT::i32 , Expand); - setOperationAction(ISD::CTTZ_ZERO_UNDEF , MVT::i64 , Expand); + // Promote the i8 variants and force them on up to i32 which has a shorter + // encoding. 
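Promoting i8 cttz to i32 is safe because the widened value can carry a sentinel bit; a sketch of the semantics, assuming the GCC/Clang builtin as a stand-in for the eventual BSF/TZCNT (the setOperationAction calls follow below):

    #include <cstdint>

    // Widen the byte and set bit 8, so a zero byte still stops the scan at 8;
    // then count trailing zeros with the 32-bit instruction.
    static unsigned cttz8ViaI32(uint8_t X) {
      uint32_t W = uint32_t(X) | 0x100u;      // sentinel: cttz8ViaI32(0) == 8
      return unsigned(__builtin_ctz(W));
    }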
+ setOperationAction(ISD::CTTZ , MVT::i8 , Promote); + AddPromotedToType (ISD::CTTZ , MVT::i8 , MVT::i32); + setOperationAction(ISD::CTTZ_ZERO_UNDEF , MVT::i8 , Promote); + AddPromotedToType (ISD::CTTZ_ZERO_UNDEF , MVT::i8 , MVT::i32); if (Subtarget->hasBMI()) { - setOperationAction(ISD::CTTZ , MVT::i8 , Promote); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16 , Expand); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32 , Expand); + if (Subtarget->is64Bit()) + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand); } else { - setOperationAction(ISD::CTTZ , MVT::i8 , Custom); setOperationAction(ISD::CTTZ , MVT::i16 , Custom); setOperationAction(ISD::CTTZ , MVT::i32 , Custom); if (Subtarget->is64Bit()) setOperationAction(ISD::CTTZ , MVT::i64 , Custom); } - setOperationAction(ISD::CTLZ_ZERO_UNDEF , MVT::i8 , Expand); - setOperationAction(ISD::CTLZ_ZERO_UNDEF , MVT::i16 , Expand); - setOperationAction(ISD::CTLZ_ZERO_UNDEF , MVT::i32 , Expand); - setOperationAction(ISD::CTLZ_ZERO_UNDEF , MVT::i64 , Expand); if (Subtarget->hasLZCNT()) { + // When promoting the i8 variants, force them to i32 for a shorter + // encoding. setOperationAction(ISD::CTLZ , MVT::i8 , Promote); + AddPromotedToType (ISD::CTLZ , MVT::i8 , MVT::i32); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , Promote); + AddPromotedToType (ISD::CTLZ_ZERO_UNDEF, MVT::i8 , MVT::i32); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16 , Expand); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32 , Expand); + if (Subtarget->is64Bit()) + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Expand); } else { setOperationAction(ISD::CTLZ , MVT::i8 , Custom); setOperationAction(ISD::CTLZ , MVT::i16 , Custom); setOperationAction(ISD::CTLZ , MVT::i32 , Custom); - if (Subtarget->is64Bit()) + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , Custom); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16 , Custom); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32 , Custom); + if (Subtarget->is64Bit()) { setOperationAction(ISD::CTLZ , MVT::i64 , Custom); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom); + } } if (Subtarget->hasPOPCNT()) { @@ -466,7 +478,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::SRL_PARTS , MVT::i64 , Custom); } - if (Subtarget->hasXMM()) + if (Subtarget->hasSSE1()) setOperationAction(ISD::PREFETCH , MVT::Other, Legal); setOperationAction(ISD::MEMBARRIER , MVT::Other, Custom); @@ -800,7 +812,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::BITCAST, MVT::v2i32, Expand); setOperationAction(ISD::BITCAST, MVT::v1i64, Expand); - if (!TM.Options.UseSoftFloat && Subtarget->hasXMM()) { + if (!TM.Options.UseSoftFloat && Subtarget->hasSSE1()) { addRegisterClass(MVT::v4f32, X86::VR128RegisterClass); setOperationAction(ISD::FADD, MVT::v4f32, Legal); @@ -817,7 +829,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::SETCC, MVT::v4f32, Custom); } - if (!TM.Options.UseSoftFloat && Subtarget->hasXMMInt()) { + if (!TM.Options.UseSoftFloat && Subtarget->hasSSE2()) { addRegisterClass(MVT::v2f64, X86::VR128RegisterClass); // FIXME: Unfortunately -soft-float and -no-implicit-float means XMM @@ -923,7 +935,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal); } - if (Subtarget->hasSSE41orAVX()) { + if (Subtarget->hasSSE41()) { setOperationAction(ISD::FFLOOR, MVT::f32, Legal); setOperationAction(ISD::FCEIL, MVT::f32, Legal); 
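The CTLZ promotion in the hunk above is the mirror image: count on the widened value, then discard the 24 leading zeros the widening introduced. A sketch under the same assumptions:

    #include <cstdint>

    static unsigned ctlz8ViaI32(uint8_t X) {
      if (X == 0) return 8;  // plain CTLZ is defined on zero; _ZERO_UNDEF isn't
      return unsigned(__builtin_clz(uint32_t(X))) - 24;  // drop widening bits
    }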
setOperationAction(ISD::FTRUNC, MVT::f32, Legal); @@ -959,14 +971,14 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom); // FIXME: these should be Legal but thats only for the case where - // the index is constant. For now custom expand to deal with that + // the index is constant. For now custom expand to deal with that. if (Subtarget->is64Bit()) { setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom); } } - if (Subtarget->hasXMMInt()) { + if (Subtarget->hasSSE2()) { setOperationAction(ISD::SRL, MVT::v8i16, Custom); setOperationAction(ISD::SRL, MVT::v16i8, Custom); @@ -995,7 +1007,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) } } - if (Subtarget->hasSSE42orAVX()) + if (Subtarget->hasSSE42()) setOperationAction(ISD::SETCC, MVT::v2i64, Custom); if (!TM.Options.UseSoftFloat && Subtarget->hasAVX()) { @@ -1157,7 +1169,8 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) // of this type with custom code. for (unsigned VT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE; VT != (unsigned)MVT::LAST_VECTOR_VALUETYPE; VT++) { - setOperationAction(ISD::SIGN_EXTEND_INREG, (MVT::SimpleValueType)VT, Custom); + setOperationAction(ISD::SIGN_EXTEND_INREG, (MVT::SimpleValueType)VT, + Custom); } // We want to custom lower some of our intrinsics. @@ -1195,7 +1208,6 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) // We have target-specific dag combine patterns for the following nodes: setTargetDAGCombine(ISD::VECTOR_SHUFFLE); setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT); - setTargetDAGCombine(ISD::BUILD_VECTOR); setTargetDAGCombine(ISD::VSELECT); setTargetDAGCombine(ISD::SELECT); setTargetDAGCombine(ISD::SHL); @@ -1210,6 +1222,8 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setTargetDAGCombine(ISD::LOAD); setTargetDAGCombine(ISD::STORE); setTargetDAGCombine(ISD::ZERO_EXTEND); + setTargetDAGCombine(ISD::SIGN_EXTEND); + setTargetDAGCombine(ISD::TRUNCATE); setTargetDAGCombine(ISD::SINT_TO_FP); if (Subtarget->is64Bit()) setTargetDAGCombine(ISD::MUL); @@ -1279,7 +1293,7 @@ unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty) const { } unsigned Align = 4; - if (Subtarget->hasXMM()) + if (Subtarget->hasSSE1()) getMaxByValAlign(Ty, Align); return Align; } @@ -1313,17 +1327,20 @@ X86TargetLowering::getOptimalMemOpType(uint64_t Size, ((DstAlign == 0 || DstAlign >= 16) && (SrcAlign == 0 || SrcAlign >= 16))) && Subtarget->getStackAlignment() >= 16) { - if (Subtarget->hasAVX() && - Subtarget->getStackAlignment() >= 32) - return MVT::v8f32; - if (Subtarget->hasXMMInt()) + if (Subtarget->getStackAlignment() >= 32) { + if (Subtarget->hasAVX2()) + return MVT::v8i32; + if (Subtarget->hasAVX()) + return MVT::v8f32; + } + if (Subtarget->hasSSE2()) return MVT::v4i32; - if (Subtarget->hasXMM()) + if (Subtarget->hasSSE1()) return MVT::v4f32; } else if (!MemcpyStrSrc && Size >= 8 && !Subtarget->is64Bit() && Subtarget->getStackAlignment() >= 8 && - Subtarget->hasXMMInt()) { + Subtarget->hasSSE2()) { // Do not use f64 to lower memcpy if source is string constant. It's // better to use i32 to avoid the loads. return MVT::f64; @@ -1488,14 +1505,14 @@ X86TargetLowering::LowerReturn(SDValue Chain, // or SSE or MMX vectors. 
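The getOptimalMemOpType changes above form a widest-first ladder keyed on ISA level and stack alignment; a condensed sketch that omits the other guards (size, soft-float, string sources) and uses hypothetical feature flags:

    struct Features { bool AVX2, AVX, SSE2, SSE1; unsigned StackAlign; };

    // Widest store type that is both supported and safely alignable.
    static const char *memopType(const Features &F) {
      if (F.StackAlign >= 32 && F.AVX2) return "v8i32";
      if (F.StackAlign >= 32 && F.AVX)  return "v8f32";
      if (F.StackAlign >= 16 && F.SSE2) return "v4i32";
      if (F.StackAlign >= 16 && F.SSE1) return "v4f32";
      return "i32";  // scalar fallback
    }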
if ((ValVT == MVT::f32 || ValVT == MVT::f64 || VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) && - (Subtarget->is64Bit() && !Subtarget->hasXMM())) { + (Subtarget->is64Bit() && !Subtarget->hasSSE1())) { report_fatal_error("SSE register return with SSE disabled"); } // Likewise we can't return F64 values with SSE1 only. gcc does so, but // llvm-gcc has never done it right and no one has noticed, so this // should be OK for now. if (ValVT == MVT::f64 && - (Subtarget->is64Bit() && !Subtarget->hasXMMInt())) + (Subtarget->is64Bit() && !Subtarget->hasSSE2())) report_fatal_error("SSE2 register return with SSE2 disabled"); // Returns in ST0/ST1 are handled specially: these are pushed as operands to @@ -1521,7 +1538,7 @@ X86TargetLowering::LowerReturn(SDValue Chain, ValToCopy); // If we don't have SSE2 available, convert to v4f32 so the generated // register is legal. - if (!Subtarget->hasXMMInt()) + if (!Subtarget->hasSSE2()) ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32,ValToCopy); } } @@ -1568,8 +1585,12 @@ bool X86TargetLowering::isUsedByReturnOnly(SDNode *N) const { return false; SDNode *Copy = *N->use_begin(); - if (Copy->getOpcode() != ISD::CopyToReg && - Copy->getOpcode() != ISD::FP_EXTEND) + if (Copy->getOpcode() == ISD::CopyToReg) { + // If the copy has a glue operand, we conservatively assume it isn't safe to + // perform a tail call. + if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue) + return false; + } else if (Copy->getOpcode() != ISD::FP_EXTEND) return false; bool HasRet = false; @@ -1621,7 +1642,7 @@ X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag, // If this is x86-64, and we disabled SSE, we can't return FP values if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) && - ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasXMM())) { + ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasSSE1())) { report_fatal_error("SSE register return with SSE disabled"); } @@ -1711,7 +1732,7 @@ static bool IsTailCallConvention(CallingConv::ID CC) { } bool X86TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const { - if (!CI->isTailCall()) + if (!CI->isTailCall() || getTargetMachine().Options.DisableTailCalls) return false; CallSite CS(CI); @@ -1790,6 +1811,7 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, MachineFrameInfo *MFI = MF.getFrameInfo(); bool Is64Bit = Subtarget->is64Bit(); + bool IsWindows = Subtarget->isTargetWindows(); bool IsWin64 = Subtarget->isTargetWin64(); assert(!(isVarArg && IsTailCallConvention(CallConv)) && @@ -1820,7 +1842,7 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, if (VA.isRegLoc()) { EVT RegVT = VA.getLocVT(); - TargetRegisterClass *RC = NULL; + const TargetRegisterClass *RC; if (RegVT == MVT::i32) RC = X86::GR32RegisterClass; else if (Is64Bit && RegVT == MVT::i64) @@ -1928,19 +1950,20 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, TotalNumIntRegs = 6; TotalNumXMMRegs = 8; GPR64ArgRegs = GPR64ArgRegs64Bit; - NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs64Bit, TotalNumXMMRegs); + NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs64Bit, + TotalNumXMMRegs); } unsigned NumIntRegs = CCInfo.getFirstUnallocated(GPR64ArgRegs, TotalNumIntRegs); bool NoImplicitFloatOps = Fn->hasFnAttr(Attribute::NoImplicitFloat); - assert(!(NumXMMRegs && !Subtarget->hasXMM()) && + assert(!(NumXMMRegs && !Subtarget->hasSSE1()) && "SSE register cannot be used when SSE is disabled!"); assert(!(NumXMMRegs && MF.getTarget().Options.UseSoftFloat && NoImplicitFloatOps) && "SSE register cannot be 
used when SSE is disabled!"); if (MF.getTarget().Options.UseSoftFloat || NoImplicitFloatOps || - !Subtarget->hasXMM()) + !Subtarget->hasSSE1()) // Kernel mode asks for SSE to be disabled, so don't push them // on the stack. TotalNumXMMRegs = 0; @@ -1957,8 +1980,8 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex()); } else { // For X86-64, if there are vararg parameters that are passed via - // registers, then we must store them to their spots on the stack so they - // may be loaded by deferencing the result of va_next. + // registers, then we must store them to their spots on the stack so + // they may be loaded by deferencing the result of va_next. FuncInfo->setVarArgsGPOffset(NumIntRegs * 8); FuncInfo->setVarArgsFPOffset(TotalNumIntRegs * 8 + NumXMMRegs * 16); FuncInfo->setRegSaveFrameIndex( @@ -2024,7 +2047,8 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, } else { FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing. // If this is an sret function, the return should pop the hidden pointer. - if (!Is64Bit && !IsTailCallConvention(CallConv) && ArgsAreStructReturn(Ins)) + if (!Is64Bit && !IsTailCallConvention(CallConv) && !IsWindows && + ArgsAreStructReturn(Ins)) FuncInfo->setBytesToPopOnReturn(4); } @@ -2099,7 +2123,7 @@ EmitTailCallStoreRetAddr(SelectionDAG & DAG, MachineFunction &MF, SDValue X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee, CallingConv::ID CallConv, bool isVarArg, - bool &isTailCall, + bool doesNotRet, bool &isTailCall, const SmallVectorImpl<ISD::OutputArg> &Outs, const SmallVectorImpl<SDValue> &OutVals, const SmallVectorImpl<ISD::InputArg> &Ins, @@ -2108,9 +2132,13 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee, MachineFunction &MF = DAG.getMachineFunction(); bool Is64Bit = Subtarget->is64Bit(); bool IsWin64 = Subtarget->isTargetWin64(); + bool IsWindows = Subtarget->isTargetWindows(); bool IsStructRet = CallIsStructReturn(Outs); bool IsSibcall = false; + if (MF.getTarget().Options.DisableTailCalls) + isTailCall = false; + if (isTailCall) { // Check if it's really possible to do a tail call. isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv, @@ -2303,7 +2331,7 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee, X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7 }; unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8); - assert((Subtarget->hasXMM() || !NumXMMRegs) + assert((Subtarget->hasSSE1() || !NumXMMRegs) && "SSE registers cannot be used when SSE is disabled"); Chain = DAG.getCopyToReg(Chain, dl, X86::AL, @@ -2488,6 +2516,12 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee, if (Is64Bit && isVarArg && !IsWin64) Ops.push_back(DAG.getRegister(X86::AL, MVT::i8)); + // Add a register mask operand representing the call-preserved registers. 
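Stepping back to the vararg frame setup above: the two offsets follow directly from the x86-64 register save area layout, six 8-byte GPR slots followed by eight 16-byte XMM slots. A sketch:

    struct RegSaveOffsets { unsigned GP, FP; };

    static RegSaveOffsets varargOffsets(unsigned IntRegsUsed,
                                        unsigned XMMRegsUsed) {
      const unsigned TotalIntRegs = 6;  // rdi, rsi, rdx, rcx, r8, r9
      return { IntRegsUsed * 8,                        // next free GPR slot
               TotalIntRegs * 8 + XMMRegsUsed * 16 };  // next free XMM slot
    }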
+ const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo(); + const uint32_t *Mask = TRI->getCallPreservedMask(CallConv); + assert(Mask && "Missing call preserved mask for calling convention"); + Ops.push_back(DAG.getRegisterMask(Mask)); + if (InFlag.getNode()) Ops.push_back(InFlag); @@ -2510,10 +2544,12 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee, if (X86::isCalleePop(CallConv, Is64Bit, isVarArg, getTargetMachine().Options.GuaranteedTailCallOpt)) NumBytesForCalleeToPush = NumBytes; // Callee pops everything - else if (!Is64Bit && !IsTailCallConvention(CallConv) && IsStructRet) + else if (!Is64Bit && !IsTailCallConvention(CallConv) && !IsWindows && + IsStructRet) // If this is a call to a struct-return function, the callee // pops the hidden struct pointer, so we have to push it back. // This is common for Darwin/X86, Linux & Mingw32 targets. + // For MSVC Win32 targets, the caller pops the hidden struct pointer. NumBytesForCalleeToPush = 4; else NumBytesForCalleeToPush = 0; // Callee pops nothing. @@ -2709,9 +2745,9 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, return false; } - // If the call result is in ST0 / ST1, it needs to be popped off the x87 stack. - // Therefore if it's not used by the call it is not safe to optimize this into - // a sibcall. + // If the call result is in ST0 / ST1, it needs to be popped off the x87 + // stack. Therefore, if it's not used by the call it is not safe to optimize + // this into a sibcall. bool Unused = false; for (unsigned i = 0, e = Ins.size(); i != e; ++i) { if (!Ins[i].Used) { @@ -2853,9 +2889,8 @@ static bool isTargetShuffle(unsigned Opcode) { case X86ISD::PSHUFD: case X86ISD::PSHUFHW: case X86ISD::PSHUFLW: - case X86ISD::SHUFPD: + case X86ISD::SHUFP: case X86ISD::PALIGN: - case X86ISD::SHUFPS: case X86ISD::MOVLHPS: case X86ISD::MOVLHPD: case X86ISD::MOVHLPS: @@ -2872,7 +2907,6 @@ static bool isTargetShuffle(unsigned Opcode) { case X86ISD::VPERM2X128: return true; } - return false; } static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT, @@ -2884,8 +2918,6 @@ static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT, case X86ISD::MOVDDUP: return DAG.getNode(Opc, dl, VT, V1); } - - return SDValue(); } static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT, @@ -2898,8 +2930,6 @@ static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT, case X86ISD::VPERMILP: return DAG.getNode(Opc, dl, VT, V1, DAG.getConstant(TargetMask, MVT::i8)); } - - return SDValue(); } static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT, @@ -2907,13 +2937,11 @@ static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT, switch(Opc) { default: llvm_unreachable("Unknown x86 shuffle node"); case X86ISD::PALIGN: - case X86ISD::SHUFPD: - case X86ISD::SHUFPS: + case X86ISD::SHUFP: case X86ISD::VPERM2X128: return DAG.getNode(Opc, dl, VT, V1, V2, DAG.getConstant(TargetMask, MVT::i8)); } - return SDValue(); } static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT, @@ -2931,7 +2959,6 @@ static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT, case X86ISD::UNPCKH: return DAG.getNode(Opc, dl, VT, V1, V2); } - return SDValue(); } SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const { @@ -3126,17 +3153,6 @@ static bool isUndefOrInRange(int Val, int Low, int Hi) { return (Val < 0) || (Val >= Low && Val < Hi); } -/// isUndefOrInRange - Return true if every element in Mask, begining -/// from 
position Pos and ending in Pos+Size, falls within the specified -/// range (L, L+Pos]. or is undef. -static bool isUndefOrInRange(const SmallVectorImpl<int> &Mask, - int Pos, int Size, int Low, int Hi) { - for (int i = Pos, e = Pos+Size; i != e; ++i) - if (!isUndefOrInRange(Mask[i], Low, Hi)) - return false; - return true; -} - /// isUndefOrEqual - Val is either less than zero (undef) or equal to the /// specified value. static bool isUndefOrEqual(int Val, int CmpVal) { @@ -3148,7 +3164,7 @@ static bool isUndefOrEqual(int Val, int CmpVal) { /// isSequentialOrUndefInRange - Return true if every element in Mask, begining /// from position Pos and ending in Pos+Size, falls within the specified /// sequential range (L, L+Pos]. or is undef. -static bool isSequentialOrUndefInRange(const SmallVectorImpl<int> &Mask, +static bool isSequentialOrUndefInRange(ArrayRef<int> Mask, int Pos, int Size, int Low) { for (int i = Pos, e = Pos+Size; i != e; ++i, ++Low) if (!isUndefOrEqual(Mask[i], Low)) @@ -3159,7 +3175,7 @@ static bool isSequentialOrUndefInRange(const SmallVectorImpl<int> &Mask, /// isPSHUFDMask - Return true if the node specifies a shuffle of elements that /// is suitable for input to PSHUFD or PSHUFW. That is, it doesn't reference /// the second operand. -static bool isPSHUFDMask(const SmallVectorImpl<int> &Mask, EVT VT) { +static bool isPSHUFDMask(ArrayRef<int> Mask, EVT VT) { if (VT == MVT::v4f32 || VT == MVT::v4i32 ) return (Mask[0] < 4 && Mask[1] < 4 && Mask[2] < 4 && Mask[3] < 4); if (VT == MVT::v2f64 || VT == MVT::v2i64) @@ -3167,180 +3183,113 @@ static bool isPSHUFDMask(const SmallVectorImpl<int> &Mask, EVT VT) { return false; } -bool X86::isPSHUFDMask(ShuffleVectorSDNode *N) { - SmallVector<int, 8> M; - N->getMask(M); - return ::isPSHUFDMask(M, N->getValueType(0)); -} - /// isPSHUFHWMask - Return true if the node specifies a shuffle of elements that /// is suitable for input to PSHUFHW. -static bool isPSHUFHWMask(const SmallVectorImpl<int> &Mask, EVT VT) { +static bool isPSHUFHWMask(ArrayRef<int> Mask, EVT VT) { if (VT != MVT::v8i16) return false; // Lower quadword copied in order or undef. - for (int i = 0; i != 4; ++i) - if (Mask[i] >= 0 && Mask[i] != i) - return false; + if (!isSequentialOrUndefInRange(Mask, 0, 4, 0)) + return false; // Upper quadword shuffled. - for (int i = 4; i != 8; ++i) + for (unsigned i = 4; i != 8; ++i) if (Mask[i] >= 0 && (Mask[i] < 4 || Mask[i] > 7)) return false; return true; } -bool X86::isPSHUFHWMask(ShuffleVectorSDNode *N) { - SmallVector<int, 8> M; - N->getMask(M); - return ::isPSHUFHWMask(M, N->getValueType(0)); -} - /// isPSHUFLWMask - Return true if the node specifies a shuffle of elements that /// is suitable for input to PSHUFLW. -static bool isPSHUFLWMask(const SmallVectorImpl<int> &Mask, EVT VT) { +static bool isPSHUFLWMask(ArrayRef<int> Mask, EVT VT) { if (VT != MVT::v8i16) return false; // Upper quadword copied in order. - for (int i = 4; i != 8; ++i) - if (Mask[i] >= 0 && Mask[i] != i) - return false; + if (!isSequentialOrUndefInRange(Mask, 4, 4, 4)) + return false; // Lower quadword shuffled. - for (int i = 0; i != 4; ++i) + for (unsigned i = 0; i != 4; ++i) if (Mask[i] >= 4) return false; return true; } -bool X86::isPSHUFLWMask(ShuffleVectorSDNode *N) { - SmallVector<int, 8> M; - N->getMask(M); - return ::isPSHUFLWMask(M, N->getValueType(0)); -} - /// isPALIGNRMask - Return true if the node specifies a shuffle of elements that /// is suitable for input to PALIGNR. 
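isSequentialOrUndefInRange, retyped to ArrayRef above, is the workhorse of several of these predicates; its contract in isolation (std::vector standing in for ArrayRef):

    #include <vector>

    // Mask[Pos..Pos+Size) must be undef (negative) or equal Low, Low+1, ...
    static bool seqOrUndef(const std::vector<int> &Mask,
                           int Pos, int Size, int Low) {
      for (int i = Pos, e = Pos + Size; i != e; ++i, ++Low)
        if (Mask[i] >= 0 && Mask[i] != Low)
          return false;
      return true;
    }
    // {-1, 1, 2, 3}, Pos=0, Size=4, Low=0 -> true (PSHUFHW's low quadword).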
-static bool isPALIGNRMask(const SmallVectorImpl<int> &Mask, EVT VT, - bool hasSSSE3OrAVX) { - int i, e = VT.getVectorNumElements(); - if (VT.getSizeInBits() != 128) +static bool isPALIGNRMask(ArrayRef<int> Mask, EVT VT, + const X86Subtarget *Subtarget) { + if ((VT.getSizeInBits() == 128 && !Subtarget->hasSSSE3()) || + (VT.getSizeInBits() == 256 && !Subtarget->hasAVX2())) return false; - // Do not handle v2i64 / v2f64 shuffles with palignr. - if (e < 4 || !hasSSSE3OrAVX) + unsigned NumElts = VT.getVectorNumElements(); + unsigned NumLanes = VT.getSizeInBits()/128; + unsigned NumLaneElts = NumElts/NumLanes; + + // Do not handle 64-bit element shuffles with palignr. + if (NumLaneElts == 2) return false; - for (i = 0; i != e; ++i) - if (Mask[i] >= 0) - break; + for (unsigned l = 0; l != NumElts; l+=NumLaneElts) { + unsigned i; + for (i = 0; i != NumLaneElts; ++i) { + if (Mask[i+l] >= 0) + break; + } - // All undef, not a palignr. - if (i == e) - return false; + // Lane is all undef, go to next lane + if (i == NumLaneElts) + continue; - // Make sure we're shifting in the right direction. - if (Mask[i] <= i) - return false; + int Start = Mask[i+l]; - int s = Mask[i] - i; + // Make sure its in this lane in one of the sources + if (!isUndefOrInRange(Start, l, l+NumLaneElts) && + !isUndefOrInRange(Start, l+NumElts, l+NumElts+NumLaneElts)) + return false; - // Check the rest of the elements to see if they are consecutive. - for (++i; i != e; ++i) { - int m = Mask[i]; - if (m >= 0 && m != s+i) + // If not lane 0, then we must match lane 0 + if (l != 0 && Mask[i] >= 0 && !isUndefOrEqual(Start, Mask[i]+l)) return false; - } - return true; -} -/// isVSHUFPYMask - Return true if the specified VECTOR_SHUFFLE operand -/// specifies a shuffle of elements that is suitable for input to 256-bit -/// VSHUFPSY. -static bool isVSHUFPYMask(const SmallVectorImpl<int> &Mask, EVT VT, - bool HasAVX, bool Commuted = false) { - int NumElems = VT.getVectorNumElements(); + // Correct second source to be contiguous with first source + if (Start >= (int)NumElts) + Start -= NumElts - NumLaneElts; - if (!HasAVX || VT.getSizeInBits() != 256) - return false; + // Make sure we're shifting in the right direction. + if (Start <= (int)(i+l)) + return false; - if (NumElems != 4 && NumElems != 8) - return false; + Start -= i; - // VSHUFPSY divides the resulting vector into 4 chunks. - // The sources are also splitted into 4 chunks, and each destination - // chunk must come from a different source chunk. - // - // SRC1 => X7 X6 X5 X4 X3 X2 X1 X0 - // SRC2 => Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y9 - // - // DST => Y7..Y4, Y7..Y4, X7..X4, X7..X4, - // Y3..Y0, Y3..Y0, X3..X0, X3..X0 - // - // VSHUFPDY divides the resulting vector into 4 chunks. - // The sources are also splitted into 4 chunks, and each destination - // chunk must come from a different source chunk. - // - // SRC1 => X3 X2 X1 X0 - // SRC2 => Y3 Y2 Y1 Y0 - // - // DST => Y3..Y2, X3..X2, Y1..Y0, X1..X0 - // - unsigned QuarterSize = NumElems/4; - unsigned HalfSize = QuarterSize*2; - for (unsigned l = 0; l != 2; ++l) { - unsigned LaneStart = l*HalfSize; - for (unsigned s = 0; s != 2; ++s) { - unsigned QuarterStart = s*QuarterSize; - unsigned Src = (Commuted) ? 
(1-s) : s; - unsigned SrcStart = Src*NumElems + LaneStart; - for (unsigned i = 0; i != QuarterSize; ++i) { - int Idx = Mask[i+QuarterStart+LaneStart]; - if (!isUndefOrInRange(Idx, SrcStart, SrcStart+HalfSize)) - return false; - // For VSHUFPSY, the mask of the second half must be the same as the first - // but with the appropriate offsets. This works in the same way as - // VPERMILPS works with masks. - if (NumElems == 4 || l == 0 || Mask[i+QuarterStart] < 0) - continue; - if (!isUndefOrEqual(Idx, Mask[i+QuarterStart]+HalfSize)) - return false; - } - } - } + // Check the rest of the elements to see if they are consecutive. + for (++i; i != NumLaneElts; ++i) { + int Idx = Mask[i+l]; - return true; -} + // Make sure its in this lane + if (!isUndefOrInRange(Idx, l, l+NumLaneElts) && + !isUndefOrInRange(Idx, l+NumElts, l+NumElts+NumLaneElts)) + return false; -/// getShuffleVSHUFPYImmediate - Return the appropriate immediate to shuffle -/// the specified VECTOR_MASK mask with VSHUFPSY/VSHUFPDY instructions. -static unsigned getShuffleVSHUFPYImmediate(SDNode *N) { - ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); - EVT VT = SVOp->getValueType(0); - int NumElems = VT.getVectorNumElements(); + // If not lane 0, then we must match lane 0 + if (l != 0 && Mask[i] >= 0 && !isUndefOrEqual(Idx, Mask[i]+l)) + return false; - assert(VT.getSizeInBits() == 256 && "Only supports 256-bit types"); - assert((NumElems == 4 || NumElems == 8) && "Only supports v4 and v8 types"); + if (Idx >= (int)NumElts) + Idx -= NumElts - NumLaneElts; - int HalfSize = NumElems/2; - unsigned Mul = (NumElems == 8) ? 2 : 1; - unsigned Mask = 0; - for (int i = 0; i != NumElems; ++i) { - int Elt = SVOp->getMaskElt(i); - if (Elt < 0) - continue; - Elt %= HalfSize; - unsigned Shamt = i; - // For VSHUFPSY, the mask of the first half must be equal to the second one. - if (NumElems == 8) Shamt %= HalfSize; - Mask |= Elt << (Shamt*Mul); + if (!isUndefOrEqual(Idx, Start+i)) + return false; + + } } - return Mask; + return true; } /// CommuteVectorShuffleMask - Change values in a shuffle permute mask assuming @@ -3359,42 +3308,63 @@ static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask, } /// isSHUFPMask - Return true if the specified VECTOR_SHUFFLE operand -/// specifies a shuffle of elements that is suitable for input to 128-bit +/// specifies a shuffle of elements that is suitable for input to 128/256-bit /// SHUFPS and SHUFPD. If Commuted is true, then it checks for sources to be /// reverse of what x86 shuffles want. -static bool isSHUFPMask(const SmallVectorImpl<int> &Mask, EVT VT, +static bool isSHUFPMask(ArrayRef<int> Mask, EVT VT, bool HasAVX, bool Commuted = false) { - unsigned NumElems = VT.getVectorNumElements(); - - if (VT.getSizeInBits() != 128) + if (!HasAVX && VT.getSizeInBits() == 256) return false; - if (NumElems != 2 && NumElems != 4) + unsigned NumElems = VT.getVectorNumElements(); + unsigned NumLanes = VT.getSizeInBits()/128; + unsigned NumLaneElems = NumElems/NumLanes; + + if (NumLaneElems != 2 && NumLaneElems != 4) return false; - unsigned Half = NumElems / 2; - unsigned SrcStart = Commuted ? NumElems : 0; - for (unsigned i = 0; i != Half; ++i) - if (!isUndefOrInRange(Mask[i], SrcStart, SrcStart+NumElems)) - return false; - SrcStart = Commuted ? 0 : NumElems; - for (unsigned i = Half; i != NumElems; ++i) - if (!isUndefOrInRange(Mask[i], SrcStart, SrcStart+NumElems)) - return false; + // VSHUFPSY divides the resulting vector into 4 chunks. 
+ // The sources are also splitted into 4 chunks, and each destination + // chunk must come from a different source chunk. + // + // SRC1 => X7 X6 X5 X4 X3 X2 X1 X0 + // SRC2 => Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y9 + // + // DST => Y7..Y4, Y7..Y4, X7..X4, X7..X4, + // Y3..Y0, Y3..Y0, X3..X0, X3..X0 + // + // VSHUFPDY divides the resulting vector into 4 chunks. + // The sources are also splitted into 4 chunks, and each destination + // chunk must come from a different source chunk. + // + // SRC1 => X3 X2 X1 X0 + // SRC2 => Y3 Y2 Y1 Y0 + // + // DST => Y3..Y2, X3..X2, Y1..Y0, X1..X0 + // + unsigned HalfLaneElems = NumLaneElems/2; + for (unsigned l = 0; l != NumElems; l += NumLaneElems) { + for (unsigned i = 0; i != NumLaneElems; ++i) { + int Idx = Mask[i+l]; + unsigned RngStart = l + ((Commuted == (i<HalfLaneElems)) ? NumElems : 0); + if (!isUndefOrInRange(Idx, RngStart, RngStart+NumLaneElems)) + return false; + // For VSHUFPSY, the mask of the second half must be the same as the + // first but with the appropriate offsets. This works in the same way as + // VPERMILPS works with masks. + if (NumElems != 8 || l == 0 || Mask[i] < 0) + continue; + if (!isUndefOrEqual(Idx, Mask[i]+l)) + return false; + } + } return true; } -bool X86::isSHUFPMask(ShuffleVectorSDNode *N) { - SmallVector<int, 8> M; - N->getMask(M); - return ::isSHUFPMask(M, N->getValueType(0)); -} - /// isMOVHLPSMask - Return true if the specified VECTOR_SHUFFLE operand /// specifies a shuffle of elements that is suitable for input to MOVHLPS. -bool X86::isMOVHLPSMask(ShuffleVectorSDNode *N) { - EVT VT = N->getValueType(0); +static bool isMOVHLPSMask(ArrayRef<int> Mask, EVT VT) { unsigned NumElems = VT.getVectorNumElements(); if (VT.getSizeInBits() != 128) @@ -3404,17 +3374,16 @@ bool X86::isMOVHLPSMask(ShuffleVectorSDNode *N) { return false; // Expect bit0 == 6, bit1 == 7, bit2 == 2, bit3 == 3 - return isUndefOrEqual(N->getMaskElt(0), 6) && - isUndefOrEqual(N->getMaskElt(1), 7) && - isUndefOrEqual(N->getMaskElt(2), 2) && - isUndefOrEqual(N->getMaskElt(3), 3); + return isUndefOrEqual(Mask[0], 6) && + isUndefOrEqual(Mask[1], 7) && + isUndefOrEqual(Mask[2], 2) && + isUndefOrEqual(Mask[3], 3); } /// isMOVHLPS_v_undef_Mask - Special case of isMOVHLPSMask for canonical form /// of vector_shuffle v, v, <2, 3, 2, 3>, i.e. vector_shuffle v, undef, /// <2, 3, 2, 3> -bool X86::isMOVHLPS_v_undef_Mask(ShuffleVectorSDNode *N) { - EVT VT = N->getValueType(0); +static bool isMOVHLPS_v_undef_Mask(ArrayRef<int> Mask, EVT VT) { unsigned NumElems = VT.getVectorNumElements(); if (VT.getSizeInBits() != 128) @@ -3423,26 +3392,29 @@ bool X86::isMOVHLPS_v_undef_Mask(ShuffleVectorSDNode *N) { if (NumElems != 4) return false; - return isUndefOrEqual(N->getMaskElt(0), 2) && - isUndefOrEqual(N->getMaskElt(1), 3) && - isUndefOrEqual(N->getMaskElt(2), 2) && - isUndefOrEqual(N->getMaskElt(3), 3); + return isUndefOrEqual(Mask[0], 2) && + isUndefOrEqual(Mask[1], 3) && + isUndefOrEqual(Mask[2], 2) && + isUndefOrEqual(Mask[3], 3); } /// isMOVLPMask - Return true if the specified VECTOR_SHUFFLE operand /// specifies a shuffle of elements that is suitable for input to MOVLP{S|D}. 
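The MOVHLPS-family predicates above all reduce to matching a fixed four-element pattern with per-element undef allowed; a sketch:

    // Negative mask entries mean undef and match anything.
    static bool matchesFixedMask(const int Mask[4], const int Want[4]) {
      for (int i = 0; i != 4; ++i)
        if (Mask[i] >= 0 && Mask[i] != Want[i])
          return false;
      return true;
    }
    // MOVHLPS:         {6, 7, 2, 3}  (low half from V2.hi, high from V1.hi)
    // MOVHLPS_v_undef: {2, 3, 2, 3}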
-bool X86::isMOVLPMask(ShuffleVectorSDNode *N) { - unsigned NumElems = N->getValueType(0).getVectorNumElements(); +static bool isMOVLPMask(ArrayRef<int> Mask, EVT VT) { + if (VT.getSizeInBits() != 128) + return false; + + unsigned NumElems = VT.getVectorNumElements(); if (NumElems != 2 && NumElems != 4) return false; - for (unsigned i = 0; i < NumElems/2; ++i) - if (!isUndefOrEqual(N->getMaskElt(i), i + NumElems)) + for (unsigned i = 0; i != NumElems/2; ++i) + if (!isUndefOrEqual(Mask[i], i + NumElems)) return false; - for (unsigned i = NumElems/2; i < NumElems; ++i) - if (!isUndefOrEqual(N->getMaskElt(i), i)) + for (unsigned i = NumElems/2; i != NumElems; ++i) + if (!isUndefOrEqual(Mask[i], i)) return false; return true; @@ -3450,19 +3422,19 @@ bool X86::isMOVLPMask(ShuffleVectorSDNode *N) { /// isMOVLHPSMask - Return true if the specified VECTOR_SHUFFLE operand /// specifies a shuffle of elements that is suitable for input to MOVLHPS. -bool X86::isMOVLHPSMask(ShuffleVectorSDNode *N) { - unsigned NumElems = N->getValueType(0).getVectorNumElements(); +static bool isMOVLHPSMask(ArrayRef<int> Mask, EVT VT) { + unsigned NumElems = VT.getVectorNumElements(); if ((NumElems != 2 && NumElems != 4) - || N->getValueType(0).getSizeInBits() > 128) + || VT.getSizeInBits() > 128) return false; - for (unsigned i = 0; i < NumElems/2; ++i) - if (!isUndefOrEqual(N->getMaskElt(i), i)) + for (unsigned i = 0; i != NumElems/2; ++i) + if (!isUndefOrEqual(Mask[i], i)) return false; - for (unsigned i = 0; i < NumElems/2; ++i) - if (!isUndefOrEqual(N->getMaskElt(i + NumElems/2), i + NumElems)) + for (unsigned i = 0; i != NumElems/2; ++i) + if (!isUndefOrEqual(Mask[i + NumElems/2], i + NumElems)) return false; return true; @@ -3470,9 +3442,9 @@ bool X86::isMOVLHPSMask(ShuffleVectorSDNode *N) { /// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand /// specifies a shuffle of elements that is suitable for input to UNPCKL. -static bool isUNPCKLMask(const SmallVectorImpl<int> &Mask, EVT VT, +static bool isUNPCKLMask(ArrayRef<int> Mask, EVT VT, bool HasAVX2, bool V2IsSplat = false) { - int NumElts = VT.getVectorNumElements(); + unsigned NumElts = VT.getVectorNumElements(); assert((VT.is128BitVector() || VT.is256BitVector()) && "Unsupported vector type for unpckh"); @@ -3486,11 +3458,9 @@ static bool isUNPCKLMask(const SmallVectorImpl<int> &Mask, EVT VT, unsigned NumLanes = VT.getSizeInBits()/128; unsigned NumLaneElts = NumElts/NumLanes; - unsigned Start = 0; - unsigned End = NumLaneElts; - for (unsigned s = 0; s < NumLanes; ++s) { - for (unsigned i = Start, j = s * NumLaneElts; - i != End; + for (unsigned l = 0; l != NumLanes; ++l) { + for (unsigned i = l*NumLaneElts, j = l*NumLaneElts; + i != (l+1)*NumLaneElts; i += 2, ++j) { int BitI = Mask[i]; int BitI1 = Mask[i+1]; @@ -3504,25 +3474,16 @@ static bool isUNPCKLMask(const SmallVectorImpl<int> &Mask, EVT VT, return false; } } - // Process the next 128 bits. - Start += NumLaneElts; - End += NumLaneElts; } return true; } -bool X86::isUNPCKLMask(ShuffleVectorSDNode *N, bool HasAVX2, bool V2IsSplat) { - SmallVector<int, 8> M; - N->getMask(M); - return ::isUNPCKLMask(M, N->getValueType(0), HasAVX2, V2IsSplat); -} - /// isUNPCKHMask - Return true if the specified VECTOR_SHUFFLE operand /// specifies a shuffle of elements that is suitable for input to UNPCKH. 
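The lane-aware UNPCKL check above accepts exactly the interleave-low pattern repeated per 128-bit lane; a sketch that builds the reference mask it compares against:

    #include <vector>

    // Within each lane: result[l+2k] = V1[l+k], result[l+2k+1] = V2[l+k],
    // with V2 indices offset by NumElts in the concatenated numbering.
    static std::vector<int> unpcklMask(unsigned NumElts, unsigned NumLaneElts) {
      std::vector<int> M(NumElts);
      for (unsigned l = 0; l != NumElts; l += NumLaneElts)
        for (unsigned i = 0, j = l; i != NumLaneElts; i += 2, ++j) {
          M[l + i]     = int(j);
          M[l + i + 1] = int(j + NumElts);
        }
      return M;
    }
    // v4i32: {0,4,1,5}   v8i32 (two lanes): {0,8,1,9, 4,12,5,13}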
-static bool isUNPCKHMask(const SmallVectorImpl<int> &Mask, EVT VT, +static bool isUNPCKHMask(ArrayRef<int> Mask, EVT VT, bool HasAVX2, bool V2IsSplat = false) { - int NumElts = VT.getVectorNumElements(); + unsigned NumElts = VT.getVectorNumElements(); assert((VT.is128BitVector() || VT.is256BitVector()) && "Unsupported vector type for unpckh"); @@ -3536,11 +3497,9 @@ static bool isUNPCKHMask(const SmallVectorImpl<int> &Mask, EVT VT, unsigned NumLanes = VT.getSizeInBits()/128; unsigned NumLaneElts = NumElts/NumLanes; - unsigned Start = 0; - unsigned End = NumLaneElts; for (unsigned l = 0; l != NumLanes; ++l) { - for (unsigned i = Start, j = (l*NumLaneElts)+NumLaneElts/2; - i != End; i += 2, ++j) { + for (unsigned i = l*NumLaneElts, j = (l*NumLaneElts)+NumLaneElts/2; + i != (l+1)*NumLaneElts; i += 2, ++j) { int BitI = Mask[i]; int BitI1 = Mask[i+1]; if (!isUndefOrEqual(BitI, j)) @@ -3553,42 +3512,39 @@ static bool isUNPCKHMask(const SmallVectorImpl<int> &Mask, EVT VT, return false; } } - // Process the next 128 bits. - Start += NumLaneElts; - End += NumLaneElts; } return true; } -bool X86::isUNPCKHMask(ShuffleVectorSDNode *N, bool HasAVX2, bool V2IsSplat) { - SmallVector<int, 8> M; - N->getMask(M); - return ::isUNPCKHMask(M, N->getValueType(0), HasAVX2, V2IsSplat); -} - /// isUNPCKL_v_undef_Mask - Special case of isUNPCKLMask for canonical form /// of vector_shuffle v, v, <0, 4, 1, 5>, i.e. vector_shuffle v, undef, /// <0, 0, 1, 1> -static bool isUNPCKL_v_undef_Mask(const SmallVectorImpl<int> &Mask, EVT VT) { - int NumElems = VT.getVectorNumElements(); - if (NumElems != 2 && NumElems != 4 && NumElems != 8 && NumElems != 16) +static bool isUNPCKL_v_undef_Mask(ArrayRef<int> Mask, EVT VT, + bool HasAVX2) { + unsigned NumElts = VT.getVectorNumElements(); + + assert((VT.is128BitVector() || VT.is256BitVector()) && + "Unsupported vector type for unpckh"); + + if (VT.getSizeInBits() == 256 && NumElts != 4 && NumElts != 8 && + (!HasAVX2 || (NumElts != 16 && NumElts != 32))) return false; // For 256-bit i64/f64, use MOVDDUPY instead, so reject the matching pattern // FIXME: Need a better way to get rid of this, there's no latency difference // between UNPCKLPD and MOVDDUP, the later should always be checked first and // the former later. We should also remove the "_undef" special mask. - if (NumElems == 4 && VT.getSizeInBits() == 256) + if (NumElts == 4 && VT.getSizeInBits() == 256) return false; // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate // independently on 128-bit lanes. - unsigned NumLanes = VT.getSizeInBits() / 128; - unsigned NumLaneElts = NumElems / NumLanes; + unsigned NumLanes = VT.getSizeInBits()/128; + unsigned NumLaneElts = NumElts/NumLanes; - for (unsigned s = 0; s < NumLanes; ++s) { - for (unsigned i = s * NumLaneElts, j = s * NumLaneElts; - i != NumLaneElts * (s + 1); + for (unsigned l = 0; l != NumLanes; ++l) { + for (unsigned i = l*NumLaneElts, j = l*NumLaneElts; + i != (l+1)*NumLaneElts; i += 2, ++j) { int BitI = Mask[i]; int BitI1 = Mask[i+1]; @@ -3603,81 +3559,77 @@ static bool isUNPCKL_v_undef_Mask(const SmallVectorImpl<int> &Mask, EVT VT) { return true; } -bool X86::isUNPCKL_v_undef_Mask(ShuffleVectorSDNode *N) { - SmallVector<int, 8> M; - N->getMask(M); - return ::isUNPCKL_v_undef_Mask(M, N->getValueType(0)); -} - /// isUNPCKH_v_undef_Mask - Special case of isUNPCKHMask for canonical form /// of vector_shuffle v, v, <2, 6, 3, 7>, i.e. 
vector_shuffle v, undef, /// <2, 2, 3, 3> -static bool isUNPCKH_v_undef_Mask(const SmallVectorImpl<int> &Mask, EVT VT) { - int NumElems = VT.getVectorNumElements(); - if (NumElems != 2 && NumElems != 4 && NumElems != 8 && NumElems != 16) +static bool isUNPCKH_v_undef_Mask(ArrayRef<int> Mask, EVT VT, bool HasAVX2) { + unsigned NumElts = VT.getVectorNumElements(); + + assert((VT.is128BitVector() || VT.is256BitVector()) && + "Unsupported vector type for unpckh"); + + if (VT.getSizeInBits() == 256 && NumElts != 4 && NumElts != 8 && + (!HasAVX2 || (NumElts != 16 && NumElts != 32))) return false; - for (int i = 0, j = NumElems / 2; i != NumElems; i += 2, ++j) { - int BitI = Mask[i]; - int BitI1 = Mask[i+1]; - if (!isUndefOrEqual(BitI, j)) - return false; - if (!isUndefOrEqual(BitI1, j)) - return false; + // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate + // independently on 128-bit lanes. + unsigned NumLanes = VT.getSizeInBits()/128; + unsigned NumLaneElts = NumElts/NumLanes; + + for (unsigned l = 0; l != NumLanes; ++l) { + for (unsigned i = l*NumLaneElts, j = (l*NumLaneElts)+NumLaneElts/2; + i != (l+1)*NumLaneElts; i += 2, ++j) { + int BitI = Mask[i]; + int BitI1 = Mask[i+1]; + if (!isUndefOrEqual(BitI, j)) + return false; + if (!isUndefOrEqual(BitI1, j)) + return false; + } } return true; } -bool X86::isUNPCKH_v_undef_Mask(ShuffleVectorSDNode *N) { - SmallVector<int, 8> M; - N->getMask(M); - return ::isUNPCKH_v_undef_Mask(M, N->getValueType(0)); -} - /// isMOVLMask - Return true if the specified VECTOR_SHUFFLE operand /// specifies a shuffle of elements that is suitable for input to MOVSS, /// MOVSD, and MOVD, i.e. setting the lowest element. -static bool isMOVLMask(const SmallVectorImpl<int> &Mask, EVT VT) { +static bool isMOVLMask(ArrayRef<int> Mask, EVT VT) { if (VT.getVectorElementType().getSizeInBits() < 32) return false; + if (VT.getSizeInBits() == 256) + return false; - int NumElts = VT.getVectorNumElements(); + unsigned NumElts = VT.getVectorNumElements(); if (!isUndefOrEqual(Mask[0], NumElts)) return false; - for (int i = 1; i < NumElts; ++i) + for (unsigned i = 1; i != NumElts; ++i) if (!isUndefOrEqual(Mask[i], i)) return false; return true; } -bool X86::isMOVLMask(ShuffleVectorSDNode *N) { - SmallVector<int, 8> M; - N->getMask(M); - return ::isMOVLMask(M, N->getValueType(0)); -} - /// isVPERM2X128Mask - Match 256-bit shuffles where the elements are considered /// as permutations between 128-bit chunks or halves. As an example: this /// shuffle bellow: /// vector_shuffle <4, 5, 6, 7, 12, 13, 14, 15> /// The first half comes from the second half of V1 and the second half from the /// the second half of V2. -static bool isVPERM2X128Mask(const SmallVectorImpl<int> &Mask, EVT VT, - bool HasAVX) { +static bool isVPERM2X128Mask(ArrayRef<int> Mask, EVT VT, bool HasAVX) { if (!HasAVX || VT.getSizeInBits() != 256) return false; // The shuffle result is divided into half A and half B. In total the two // sources have 4 halves, namely: C, D, E, F. The final values of A and // B must come from C, D, E or F. - int HalfSize = VT.getVectorNumElements()/2; + unsigned HalfSize = VT.getVectorNumElements()/2; bool MatchA = false, MatchB = false; // Check if A comes from one of C, D, E, F. 
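The VPERM2X128 matcher only needs to know which of the four source halves each destination half comes from; the immediate then encodes the two choices in two nibbles, as getShuffleVPERM2X128Immediate does below. A sketch of the encoding:

    // Halves are numbered 0..3: V1.lo, V1.hi, V2.lo, V2.hi.
    static unsigned vperm2x128Imm(unsigned FstHalf, unsigned SndHalf) {
      return (SndHalf << 4) | FstHalf;  // low nibble: dest.lo, high: dest.hi
    }
    // Mask <4,5,6,7, 12,13,14,15> -> FstHalf=1, SndHalf=3 -> imm 0x31.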
- for (int Half = 0; Half < 4; ++Half) { + for (unsigned Half = 0; Half != 4; ++Half) { if (isSequentialOrUndefInRange(Mask, 0, HalfSize, Half*HalfSize)) { MatchA = true; break; @@ -3685,7 +3637,7 @@ static bool isVPERM2X128Mask(const SmallVectorImpl<int> &Mask, EVT VT, } // Check if B comes from one of C, D, E, F. - for (int Half = 0; Half < 4; ++Half) { + for (unsigned Half = 0; Half != 4; ++Half) { if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, Half*HalfSize)) { MatchB = true; break; @@ -3700,16 +3652,16 @@ static bool isVPERM2X128Mask(const SmallVectorImpl<int> &Mask, EVT VT, static unsigned getShuffleVPERM2X128Immediate(ShuffleVectorSDNode *SVOp) { EVT VT = SVOp->getValueType(0); - int HalfSize = VT.getVectorNumElements()/2; + unsigned HalfSize = VT.getVectorNumElements()/2; - int FstHalf = 0, SndHalf = 0; - for (int i = 0; i < HalfSize; ++i) { + unsigned FstHalf = 0, SndHalf = 0; + for (unsigned i = 0; i < HalfSize; ++i) { if (SVOp->getMaskElt(i) > 0) { FstHalf = SVOp->getMaskElt(i)/HalfSize; break; } } - for (int i = HalfSize; i < HalfSize*2; ++i) { + for (unsigned i = HalfSize; i < HalfSize*2; ++i) { if (SVOp->getMaskElt(i) > 0) { SndHalf = SVOp->getMaskElt(i)/HalfSize; break; @@ -3725,31 +3677,28 @@ static unsigned getShuffleVPERM2X128Immediate(ShuffleVectorSDNode *SVOp) { /// type is 32 or 64. In the VPERMILPS the high half of the mask should point /// to the same elements of the low, but to the higher half of the source. /// In VPERMILPD the two lanes could be shuffled independently of each other -/// with the same restriction that lanes can't be crossed. -static bool isVPERMILPMask(const SmallVectorImpl<int> &Mask, EVT VT, - bool HasAVX) { - int NumElts = VT.getVectorNumElements(); - int NumLanes = VT.getSizeInBits()/128; - +/// with the same restriction that lanes can't be crossed. Also handles PSHUFDY. +static bool isVPERMILPMask(ArrayRef<int> Mask, EVT VT, bool HasAVX) { if (!HasAVX) return false; + unsigned NumElts = VT.getVectorNumElements(); // Only match 256-bit with 32/64-bit types if (VT.getSizeInBits() != 256 || (NumElts != 4 && NumElts != 8)) return false; - int LaneSize = NumElts/NumLanes; - for (int l = 0; l != NumLanes; ++l) { - int LaneStart = l*LaneSize; - for (int i = 0; i != LaneSize; ++i) { - if (!isUndefOrInRange(Mask[i+LaneStart], LaneStart, LaneStart+LaneSize)) + unsigned NumLanes = VT.getSizeInBits()/128; + unsigned LaneSize = NumElts/NumLanes; + for (unsigned l = 0; l != NumElts; l += LaneSize) { + for (unsigned i = 0; i != LaneSize; ++i) { + if (!isUndefOrInRange(Mask[i+l], l, l+LaneSize)) return false; - if (NumElts == 4 || l == 0) + if (NumElts != 8 || l == 0) continue; // VPERMILPS handling if (Mask[i] < 0) continue; - if (!isUndefOrEqual(Mask[i+LaneStart], Mask[i]+LaneSize)) + if (!isUndefOrEqual(Mask[i+l], Mask[i]+l)) return false; } } @@ -3757,48 +3706,19 @@ static bool isVPERMILPMask(const SmallVectorImpl<int> &Mask, EVT VT, return true; } -/// getShuffleVPERMILPImmediate - Return the appropriate immediate to shuffle -/// the specified VECTOR_MASK mask with VPERMILPS/D* instructions. -static unsigned getShuffleVPERMILPImmediate(ShuffleVectorSDNode *SVOp) { - EVT VT = SVOp->getValueType(0); - - int NumElts = VT.getVectorNumElements(); - int NumLanes = VT.getSizeInBits()/128; - int LaneSize = NumElts/NumLanes; - - // Although the mask is equal for both lanes do it twice to get the cases - // where a mask will match because the same mask element is undef on the - // first half but valid on the second. 
This would get pathological cases - // such as: shuffle <u, 0, 1, 2, 4, 4, 5, 6>, which is completely valid. - unsigned Shift = (LaneSize == 4) ? 2 : 1; - unsigned Mask = 0; - for (int i = 0; i != NumElts; ++i) { - int MaskElt = SVOp->getMaskElt(i); - if (MaskElt < 0) - continue; - MaskElt %= LaneSize; - unsigned Shamt = i; - // VPERMILPSY, the mask of the first half must be equal to the second one - if (NumElts == 8) Shamt %= LaneSize; - Mask |= MaskElt << (Shamt*Shift); - } - - return Mask; -} - -/// isCommutedMOVL - Returns true if the shuffle mask is except the reverse +/// isCommutedMOVLMask - Returns true if the shuffle mask is except the reverse /// of what x86 movss want. X86 movs requires the lowest element to be lowest /// element of vector 2 and the other elements to come from vector 1 in order. -static bool isCommutedMOVLMask(const SmallVectorImpl<int> &Mask, EVT VT, +static bool isCommutedMOVLMask(ArrayRef<int> Mask, EVT VT, bool V2IsSplat = false, bool V2IsUndef = false) { - int NumOps = VT.getVectorNumElements(); + unsigned NumOps = VT.getVectorNumElements(); if (NumOps != 2 && NumOps != 4 && NumOps != 8 && NumOps != 16) return false; if (!isUndefOrEqual(Mask[0], 0)) return false; - for (int i = 1; i < NumOps; ++i) + for (unsigned i = 1; i != NumOps; ++i) if (!(isUndefOrEqual(Mask[i], i+NumOps) || (V2IsUndef && isUndefOrInRange(Mask[i], NumOps, NumOps*2)) || (V2IsSplat && isUndefOrEqual(Mask[i], NumOps)))) @@ -3807,26 +3727,14 @@ static bool isCommutedMOVLMask(const SmallVectorImpl<int> &Mask, EVT VT, return true; } -static bool isCommutedMOVL(ShuffleVectorSDNode *N, bool V2IsSplat = false, - bool V2IsUndef = false) { - SmallVector<int, 8> M; - N->getMask(M); - return isCommutedMOVLMask(M, N->getValueType(0), V2IsSplat, V2IsUndef); -} - /// isMOVSHDUPMask - Return true if the specified VECTOR_SHUFFLE operand /// specifies a shuffle of elements that is suitable for input to MOVSHDUP. /// Masks to match: <1, 1, 3, 3> or <1, 1, 3, 3, 5, 5, 7, 7> -bool X86::isMOVSHDUPMask(ShuffleVectorSDNode *N, - const X86Subtarget *Subtarget) { - if (!Subtarget->hasSSE3orAVX()) - return false; - - // The second vector must be undef - if (N->getOperand(1).getOpcode() != ISD::UNDEF) +static bool isMOVSHDUPMask(ArrayRef<int> Mask, EVT VT, + const X86Subtarget *Subtarget) { + if (!Subtarget->hasSSE3()) return false; - EVT VT = N->getValueType(0); unsigned NumElems = VT.getVectorNumElements(); if ((VT.getSizeInBits() == 128 && NumElems != 4) || @@ -3834,9 +3742,9 @@ bool X86::isMOVSHDUPMask(ShuffleVectorSDNode *N, return false; // "i+1" is the value the indexed mask element must have - for (unsigned i = 0; i < NumElems; i += 2) - if (!isUndefOrEqual(N->getMaskElt(i), i+1) || - !isUndefOrEqual(N->getMaskElt(i+1), i+1)) + for (unsigned i = 0; i != NumElems; i += 2) + if (!isUndefOrEqual(Mask[i], i+1) || + !isUndefOrEqual(Mask[i+1], i+1)) return false; return true; @@ -3845,16 +3753,11 @@ bool X86::isMOVSHDUPMask(ShuffleVectorSDNode *N, /// isMOVSLDUPMask - Return true if the specified VECTOR_SHUFFLE operand /// specifies a shuffle of elements that is suitable for input to MOVSLDUP. 
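Both MOVS{H,L}DUP predicates (their masks-to-match lists appear here) share one shape, parameterized on whether the odd or even element of each pair is broadcast; a sketch:

    // High = true checks <1,1,3,3,...> (MOVSHDUP); false checks
    // <0,0,2,2,...> (MOVSLDUP). Negative entries are undef and match.
    static bool isDupPairMask(const int *Mask, unsigned NumElems, bool High) {
      for (unsigned i = 0; i != NumElems; i += 2) {
        int Want = int(High ? i + 1 : i);
        if ((Mask[i]     >= 0 && Mask[i]     != Want) ||
            (Mask[i + 1] >= 0 && Mask[i + 1] != Want))
          return false;
      }
      return true;
    }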
/// Masks to match: <0, 0, 2, 2> or <0, 0, 2, 2, 4, 4, 6, 6> -bool X86::isMOVSLDUPMask(ShuffleVectorSDNode *N, - const X86Subtarget *Subtarget) { - if (!Subtarget->hasSSE3orAVX()) - return false; - - // The second vector must be undef - if (N->getOperand(1).getOpcode() != ISD::UNDEF) +static bool isMOVSLDUPMask(ArrayRef<int> Mask, EVT VT, + const X86Subtarget *Subtarget) { + if (!Subtarget->hasSSE3()) return false; - EVT VT = N->getValueType(0); unsigned NumElems = VT.getVectorNumElements(); if ((VT.getSizeInBits() == 128 && NumElems != 4) || @@ -3862,9 +3765,9 @@ bool X86::isMOVSLDUPMask(ShuffleVectorSDNode *N, return false; // "i" is the value the indexed mask element must have - for (unsigned i = 0; i < NumElems; i += 2) - if (!isUndefOrEqual(N->getMaskElt(i), i) || - !isUndefOrEqual(N->getMaskElt(i+1), i)) + for (unsigned i = 0; i != NumElems; i += 2) + if (!isUndefOrEqual(Mask[i], i) || + !isUndefOrEqual(Mask[i+1], i)) return false; return true; @@ -3873,17 +3776,16 @@ bool X86::isMOVSLDUPMask(ShuffleVectorSDNode *N, /// isMOVDDUPYMask - Return true if the specified VECTOR_SHUFFLE operand /// specifies a shuffle of elements that is suitable for input to 256-bit /// version of MOVDDUP. -static bool isMOVDDUPYMask(const SmallVectorImpl<int> &Mask, EVT VT, - bool HasAVX) { - int NumElts = VT.getVectorNumElements(); +static bool isMOVDDUPYMask(ArrayRef<int> Mask, EVT VT, bool HasAVX) { + unsigned NumElts = VT.getVectorNumElements(); if (!HasAVX || VT.getSizeInBits() != 256 || NumElts != 4) return false; - for (int i = 0; i != NumElts/2; ++i) + for (unsigned i = 0; i != NumElts/2; ++i) if (!isUndefOrEqual(Mask[i], 0)) return false; - for (int i = NumElts/2; i != NumElts; ++i) + for (unsigned i = NumElts/2; i != NumElts; ++i) if (!isUndefOrEqual(Mask[i], NumElts/2)) return false; return true; @@ -3892,18 +3794,16 @@ static bool isMOVDDUPYMask(const SmallVectorImpl<int> &Mask, EVT VT, /// isMOVDDUPMask - Return true if the specified VECTOR_SHUFFLE operand /// specifies a shuffle of elements that is suitable for input to 128-bit /// version of MOVDDUP. -bool X86::isMOVDDUPMask(ShuffleVectorSDNode *N) { - EVT VT = N->getValueType(0); - +static bool isMOVDDUPMask(ArrayRef<int> Mask, EVT VT) { if (VT.getSizeInBits() != 128) return false; - int e = VT.getVectorNumElements() / 2; - for (int i = 0; i < e; ++i) - if (!isUndefOrEqual(N->getMaskElt(i), i)) + unsigned e = VT.getVectorNumElements() / 2; + for (unsigned i = 0; i != e; ++i) + if (!isUndefOrEqual(Mask[i], i)) return false; - for (int i = 0; i < e; ++i) - if (!isUndefOrEqual(N->getMaskElt(e+i), i)) + for (unsigned i = 0; i != e; ++i) + if (!isUndefOrEqual(Mask[e+i], i)) return false; return true; } @@ -3948,31 +3848,43 @@ bool X86::isVINSERTF128Index(SDNode *N) { /// getShuffleSHUFImmediate - Return the appropriate immediate to shuffle /// the specified VECTOR_SHUFFLE mask with PSHUF* and SHUFP* instructions. -unsigned X86::getShuffleSHUFImmediate(SDNode *N) { - ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); - int NumOperands = SVOp->getValueType(0).getVectorNumElements(); +/// Handles 128-bit and 256-bit. +static unsigned getShuffleSHUFImmediate(ShuffleVectorSDNode *N) { + EVT VT = N->getValueType(0); + + assert((VT.is128BitVector() || VT.is256BitVector()) && + "Unsupported vector type for PSHUF/SHUFP"); - unsigned Shift = (NumOperands == 4) ? 2 : 1; + // Handle 128 and 256-bit vector lengths. AVX defines PSHUF/SHUFP to operate + // independently on 128-bit lanes. 
+ unsigned NumElts = VT.getVectorNumElements(); + unsigned NumLanes = VT.getSizeInBits()/128; + unsigned NumLaneElts = NumElts/NumLanes; + + assert((NumLaneElts == 2 || NumLaneElts == 4) && + "Only supports 2 or 4 elements per lane"); + + unsigned Shift = (NumLaneElts == 4) ? 1 : 0; unsigned Mask = 0; - for (int i = 0; i < NumOperands; ++i) { - int Val = SVOp->getMaskElt(NumOperands-i-1); - if (Val < 0) Val = 0; - if (Val >= NumOperands) Val -= NumOperands; - Mask |= Val; - if (i != NumOperands - 1) - Mask <<= Shift; + for (unsigned i = 0; i != NumElts; ++i) { + int Elt = N->getMaskElt(i); + if (Elt < 0) continue; + Elt %= NumLaneElts; + unsigned ShAmt = i << Shift; + if (ShAmt >= 8) ShAmt -= 8; + Mask |= Elt << ShAmt; } + return Mask; } /// getShufflePSHUFHWImmediate - Return the appropriate immediate to shuffle /// the specified VECTOR_SHUFFLE mask with the PSHUFHW instruction. -unsigned X86::getShufflePSHUFHWImmediate(SDNode *N) { - ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); +static unsigned getShufflePSHUFHWImmediate(ShuffleVectorSDNode *N) { unsigned Mask = 0; // 8 nodes, but we only care about the last 4. for (unsigned i = 7; i >= 4; --i) { - int Val = SVOp->getMaskElt(i); + int Val = N->getMaskElt(i); if (Val >= 0) Mask |= (Val - 4); if (i != 4) @@ -3983,12 +3895,11 @@ unsigned X86::getShufflePSHUFHWImmediate(SDNode *N) { /// getShufflePSHUFLWImmediate - Return the appropriate immediate to shuffle /// the specified VECTOR_SHUFFLE mask with the PSHUFLW instruction. -unsigned X86::getShufflePSHUFLWImmediate(SDNode *N) { - ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); +static unsigned getShufflePSHUFLWImmediate(ShuffleVectorSDNode *N) { unsigned Mask = 0; // 8 nodes, but we only care about the first 4. for (int i = 3; i >= 0; --i) { - int Val = SVOp->getMaskElt(i); + int Val = N->getMaskElt(i); if (Val >= 0) Mask |= Val; if (i != 0) @@ -4002,14 +3913,21 @@ unsigned X86::getShufflePSHUFLWImmediate(SDNode *N) { static unsigned getShufflePALIGNRImmediate(ShuffleVectorSDNode *SVOp) { EVT VT = SVOp->getValueType(0); unsigned EltSize = VT.getVectorElementType().getSizeInBits() >> 3; - int Val = 0; - unsigned i, e; - for (i = 0, e = VT.getVectorNumElements(); i != e; ++i) { + unsigned NumElts = VT.getVectorNumElements(); + unsigned NumLanes = VT.getSizeInBits()/128; + unsigned NumLaneElts = NumElts/NumLanes; + + int Val = 0; + unsigned i; + for (i = 0; i != NumElts; ++i) { Val = SVOp->getMaskElt(i); if (Val >= 0) break; } + if (Val >= (int)NumElts) + Val -= NumElts - NumLaneElts; + assert(Val - i > 0 && "PALIGNR imm should be positive"); return (Val - i) * EltSize; } @@ -4082,17 +4000,16 @@ static SDValue CommuteVectorShuffle(ShuffleVectorSDNode *SVOp, /// match movhlps. The lower half elements should come from upper half of /// V1 (and in order), and the upper half elements should come from the upper /// half of V2 (and in order). -static bool ShouldXformToMOVHLPS(ShuffleVectorSDNode *Op) { - EVT VT = Op->getValueType(0); +static bool ShouldXformToMOVHLPS(ArrayRef<int> Mask, EVT VT) { if (VT.getSizeInBits() != 128) return false; if (VT.getVectorNumElements() != 4) return false; for (unsigned i = 0, e = 2; i != e; ++i) - if (!isUndefOrEqual(Op->getMaskElt(i), i+2)) + if (!isUndefOrEqual(Mask[i], i+2)) return false; for (unsigned i = 2; i != 4; ++i) - if (!isUndefOrEqual(Op->getMaskElt(i), i+4)) + if (!isUndefOrEqual(Mask[i], i+4)) return false; return true; } @@ -4140,8 +4057,7 @@ static bool WillBeConstantPoolLoad(SDNode *N) { /// half of V2 (and in order). 
And since V1 will become the source of the /// MOVLP, it must be either a vector load or a scalar load to vector. static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2, - ShuffleVectorSDNode *Op) { - EVT VT = Op->getValueType(0); + ArrayRef<int> Mask, EVT VT) { if (VT.getSizeInBits() != 128) return false; @@ -4157,10 +4073,10 @@ static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2, if (NumElems != 2 && NumElems != 4) return false; for (unsigned i = 0, e = NumElems/2; i != e; ++i) - if (!isUndefOrEqual(Op->getMaskElt(i), i)) + if (!isUndefOrEqual(Mask[i], i)) return false; for (unsigned i = NumElems/2; i != NumElems; ++i) - if (!isUndefOrEqual(Op->getMaskElt(i), i+NumElems)) + if (!isUndefOrEqual(Mask[i], i+NumElems)) return false; return true; } @@ -4208,15 +4124,15 @@ static bool isZeroShuffle(ShuffleVectorSDNode *N) { /// getZeroVector - Returns a vector of specified type with all zero elements. /// -static SDValue getZeroVector(EVT VT, bool HasXMMInt, SelectionDAG &DAG, - DebugLoc dl) { +static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget, + SelectionDAG &DAG, DebugLoc dl) { assert(VT.isVector() && "Expected a vector type"); // Always build SSE zero vectors as <4 x i32> bitcasted // to their dest type. This ensures they get CSE'd. SDValue Vec; if (VT.getSizeInBits() == 128) { // SSE - if (HasXMMInt) { // SSE2 + if (Subtarget->hasSSE2()) { // SSE2 SDValue Cst = DAG.getTargetConstant(0, MVT::i32); Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst); } else { // SSE1 @@ -4224,12 +4140,17 @@ static SDValue getZeroVector(EVT VT, bool HasXMMInt, SelectionDAG &DAG, Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst); } } else if (VT.getSizeInBits() == 256) { // AVX - // 256-bit logic and arithmetic instructions in AVX are - // all floating-point, no support for integer ops. Default - // to emitting fp zeroed vectors then. - SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32); - SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; - Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops, 8); + if (Subtarget->hasAVX2()) { // AVX2 + SDValue Cst = DAG.getTargetConstant(0, MVT::i32); + SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; + Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops, 8); + } else { + // 256-bit logic and arithmetic instructions in AVX are all + // floating-point, no support for integer ops. Emit fp zeroed vectors. + SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32); + SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; + Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops, 8); + } } return DAG.getNode(ISD::BITCAST, dl, VT, Vec); } @@ -4266,24 +4187,12 @@ static SDValue getOnesVector(EVT VT, bool HasAVX2, SelectionDAG &DAG, /// NormalizeMask - V2 is a splat, modify the mask (if needed) so all elements /// that point to V2 points to its first element. 
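A side note on getZeroVector above: canonicalizing every zero vector to the same <4 x i32> or <8 x i32> build lets CSE collapse repeated zeros, and the instruction the patterns ultimately match is just a register xor'd with itself. In intrinsic form (a sketch, not tied to this patch):

#include <immintrin.h>

// 128-bit zero: pxor/xorps of a register with itself (any SSE level).
__m128i zero128(void) { return _mm_setzero_si128(); }

// 256-bit zero: with plain AVX this must live in the fp domain (vxorps),
// which is why the pre-AVX2 branch above builds a v8f32; with AVX2 it can
// stay in the integer domain (vpxor ymm).
__m256i zero256(void) { return _mm256_setzero_si256(); }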
-static SDValue NormalizeMask(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { - EVT VT = SVOp->getValueType(0); - unsigned NumElems = VT.getVectorNumElements(); - - bool Changed = false; - SmallVector<int, 8> MaskVec; - SVOp->getMask(MaskVec); - +static void NormalizeMask(SmallVectorImpl<int> &Mask, unsigned NumElems) { for (unsigned i = 0; i != NumElems; ++i) { - if (MaskVec[i] > (int)NumElems) { - MaskVec[i] = NumElems; - Changed = true; + if (Mask[i] > (int)NumElems) { + Mask[i] = NumElems; } } - if (Changed) - return DAG.getVectorShuffle(VT, SVOp->getDebugLoc(), SVOp->getOperand(0), - SVOp->getOperand(1), &MaskVec[0]); - return SDValue(SVOp, 0); } /// getMOVLMask - Returns a vector_shuffle mask for an movs{s|d}, movd @@ -4387,7 +4296,7 @@ static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) { // Extract the 128-bit part containing the splat element and update // the splat element index when it refers to the higher register. if (Size == 256) { - unsigned Idx = (EltNo > NumElems/2) ? NumElems/2 : 0; + unsigned Idx = (EltNo >= NumElems/2) ? NumElems/2 : 0; V1 = Extract128BitVector(V1, DAG.getConstant(Idx, MVT::i32), DAG, dl); if (Idx > 0) EltNo -= NumElems/2; @@ -4419,11 +4328,12 @@ static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) { /// element of V2 is swizzled into the zero/undef vector, landing at element /// Idx. This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3). static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx, - bool isZero, bool HasXMMInt, + bool IsZero, + const X86Subtarget *Subtarget, SelectionDAG &DAG) { EVT VT = V2.getValueType(); - SDValue V1 = isZero - ? getZeroVector(VT, HasXMMInt, DAG, V2.getDebugLoc()) : DAG.getUNDEF(VT); + SDValue V1 = IsZero + ? getZeroVector(VT, Subtarget, DAG, V2.getDebugLoc()) : DAG.getUNDEF(VT); unsigned NumElems = VT.getVectorNumElements(); SmallVector<int, 16> MaskVec; for (unsigned i = 0; i != NumElems; ++i) @@ -4450,20 +4360,20 @@ static SDValue getShuffleScalarElt(SDNode *N, int Index, SelectionDAG &DAG, if (Index < 0) return DAG.getUNDEF(VT.getVectorElementType()); - int NumElems = VT.getVectorNumElements(); - SDValue NewV = (Index < NumElems) ? SV->getOperand(0) : SV->getOperand(1); + unsigned NumElems = VT.getVectorNumElements(); + SDValue NewV = (Index < (int)NumElems) ? SV->getOperand(0) + : SV->getOperand(1); return getShuffleScalarElt(NewV.getNode(), Index % NumElems, DAG, Depth+1); } // Recurse into target specific vector shuffles to find scalars. 
if (isTargetShuffle(Opcode)) { - int NumElems = VT.getVectorNumElements(); + unsigned NumElems = VT.getVectorNumElements(); SmallVector<unsigned, 16> ShuffleMask; SDValue ImmN; switch(Opcode) { - case X86ISD::SHUFPS: - case X86ISD::SHUFPD: + case X86ISD::SHUFP: ImmN = N->getOperand(N->getNumOperands()-1); DecodeSHUFPMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), ShuffleMask); @@ -4481,9 +4391,9 @@ static SDValue getShuffleScalarElt(SDNode *N, int Index, SelectionDAG &DAG, DecodeMOVLHPSMask(NumElems, ShuffleMask); break; case X86ISD::PSHUFD: + case X86ISD::VPERMILP: ImmN = N->getOperand(N->getNumOperands()-1); - DecodePSHUFMask(NumElems, - cast<ConstantSDNode>(ImmN)->getZExtValue(), + DecodePSHUFMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), ShuffleMask); break; case X86ISD::PSHUFHW: @@ -4505,14 +4415,9 @@ static SDValue getShuffleScalarElt(SDNode *N, int Index, SelectionDAG &DAG, return getShuffleScalarElt(V.getOperand(OpNum).getNode(), Index, DAG, Depth+1); } - case X86ISD::VPERMILP: - ImmN = N->getOperand(N->getNumOperands()-1); - DecodeVPERMILPMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), - ShuffleMask); - break; case X86ISD::VPERM2X128: ImmN = N->getOperand(N->getNumOperands()-1); - DecodeVPERM2F128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), + DecodeVPERM2X128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), ShuffleMask); break; case X86ISD::MOVDDUP: @@ -4523,16 +4428,15 @@ static SDValue getShuffleScalarElt(SDNode *N, int Index, SelectionDAG &DAG, case X86ISD::MOVSLDUP: case X86ISD::PALIGN: return SDValue(); // Not yet implemented. - default: - assert(0 && "unknown target shuffle node"); - return SDValue(); + default: llvm_unreachable("unknown target shuffle node"); } Index = ShuffleMask[Index]; if (Index < 0) return DAG.getUNDEF(VT.getVectorElementType()); - SDValue NewV = (Index < NumElems) ? N->getOperand(0) : N->getOperand(1); + SDValue NewV = (Index < (int)NumElems) ? N->getOperand(0) + : N->getOperand(1); return getShuffleScalarElt(NewV.getNode(), Index % NumElems, DAG, Depth+1); } @@ -4693,6 +4597,7 @@ static bool isVectorShift(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG, static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros, unsigned NumNonZero, unsigned NumZero, SelectionDAG &DAG, + const X86Subtarget* Subtarget, const TargetLowering &TLI) { if (NumNonZero > 8) return SDValue(); @@ -4704,7 +4609,7 @@ static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros, bool ThisIsNonZero = (NonZeros & (1 << i)) != 0; if (ThisIsNonZero && First) { if (NumZero) - V = getZeroVector(MVT::v8i16, true, DAG, dl); + V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl); else V = DAG.getUNDEF(MVT::v8i16); First = false; @@ -4740,6 +4645,7 @@ static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros, static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros, unsigned NumNonZero, unsigned NumZero, SelectionDAG &DAG, + const X86Subtarget* Subtarget, const TargetLowering &TLI) { if (NumNonZero > 4) return SDValue(); @@ -4752,7 +4658,7 @@ static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros, if (isNonZero) { if (First) { if (NumZero) - V = getZeroVector(MVT::v8i16, true, DAG, dl); + V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl); else V = DAG.getUNDEF(MVT::v8i16); First = false; @@ -4773,7 +4679,7 @@ static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, const TargetLowering &TLI, DebugLoc dl) { assert(VT.getSizeInBits() == 128 && "Unknown type for VShift"); EVT ShVT = MVT::v2i64; - unsigned Opc = isLeft ? 
X86ISD::VSHL : X86ISD::VSRL; + unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ; SrcOp = DAG.getNode(ISD::BITCAST, dl, ShVT, SrcOp); return DAG.getNode(ISD::BITCAST, dl, VT, DAG.getNode(Opc, dl, ShVT, SrcOp, @@ -4841,21 +4747,16 @@ X86TargetLowering::LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, DebugLoc dl, int EltNo = (Offset - StartOffset) >> 2; int NumElems = VT.getVectorNumElements(); - EVT CanonVT = VT.getSizeInBits() == 128 ? MVT::v4i32 : MVT::v8i32; EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems); SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr, LD->getPointerInfo().getWithOffset(StartOffset), false, false, false, 0); - // Canonicalize it to a v4i32 or v8i32 shuffle. SmallVector<int, 8> Mask; for (int i = 0; i < NumElems; ++i) Mask.push_back(EltNo); - V1 = DAG.getNode(ISD::BITCAST, dl, CanonVT, V1); - return DAG.getNode(ISD::BITCAST, dl, NVT, - DAG.getVectorShuffle(CanonVT, dl, V1, - DAG.getUNDEF(CanonVT),&Mask[0])); + return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), &Mask[0]); } return SDValue(); @@ -4938,7 +4839,10 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts, /// a scalar load. /// The scalar load node is returned when a pattern is found, /// or SDValue() otherwise. -static SDValue isVectorBroadcast(SDValue &Op, bool hasAVX2) { +static SDValue isVectorBroadcast(SDValue &Op, const X86Subtarget *Subtarget) { + if (!Subtarget->hasAVX()) + return SDValue(); + EVT VT = Op.getValueType(); SDValue V = Op; @@ -4993,22 +4897,14 @@ static SDValue isVectorBroadcast(SDValue &Op, bool hasAVX2) { if (!ISD::isNormalLoad(Ld.getNode())) return SDValue(); + // Reject loads that have uses of the chain result + if (Ld->hasAnyUseOfValue(1)) + return SDValue(); + bool Is256 = VT.getSizeInBits() == 256; bool Is128 = VT.getSizeInBits() == 128; unsigned ScalarSize = Ld.getValueType().getSizeInBits(); - if (hasAVX2) { - // VBroadcast to YMM - if (Is256 && (ScalarSize == 8 || ScalarSize == 16 || - ScalarSize == 32 || ScalarSize == 64 )) - return Ld; - - // VBroadcast to XMM - if (Is128 && (ScalarSize == 8 || ScalarSize == 32 || - ScalarSize == 16 || ScalarSize == 64 )) - return Ld; - } - // VBroadcast to YMM if (Is256 && (ScalarSize == 32 || ScalarSize == 64)) return Ld; @@ -5017,6 +4913,17 @@ static SDValue isVectorBroadcast(SDValue &Op, bool hasAVX2) { if (Is128 && (ScalarSize == 32)) return Ld; + // The integer check is needed for the 64-bit into 128-bit so it doesn't match + // double since there is vbroadcastsd xmm + if (Subtarget->hasAVX2() && Ld.getValueType().isInteger()) { + // VBroadcast to YMM + if (Is256 && (ScalarSize == 8 || ScalarSize == 16)) + return Ld; + + // VBroadcast to XMM + if (Is128 && (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)) + return Ld; + } // Unsupported broadcast. return SDValue(); @@ -5034,27 +4941,25 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { if (ISD::isBuildVectorAllZeros(Op.getNode())) { // Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd // and 2) ensure that i64 scalars are eliminated on x86-32 hosts. - if (Op.getValueType() == MVT::v4i32 || - Op.getValueType() == MVT::v8i32) + if (VT == MVT::v4i32 || VT == MVT::v8i32) return Op; - return getZeroVector(Op.getValueType(), Subtarget->hasXMMInt(), DAG, dl); + return getZeroVector(VT, Subtarget, DAG, dl); } // Vectors containing all ones can be matched by pcmpeqd on 128-bit width // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use // vpcmpeqd on 256-bit vectors. 
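The shape isVectorBroadcast is looking for, written at source level (a hedged sketch; whether the broadcast actually folds depends on the checks above: a normal, non-volatile load whose chain result has no other users, and a supported scalar width):

#include <immintrin.h>

// A scalar loaded from memory and splatted across all lanes. With AVX this
// can become a single vbroadcastss from memory instead of load + shuffle;
// AVX2 extends the trick to 8/16-bit and 64-bit integer elements.
__m256 splat_from_load(const float *p) {
  return _mm256_set1_ps(*p);
}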
if (ISD::isBuildVectorAllOnes(Op.getNode())) { - if (Op.getValueType() == MVT::v4i32 || - (Op.getValueType() == MVT::v8i32 && Subtarget->hasAVX2())) + if (VT == MVT::v4i32 || (VT == MVT::v8i32 && Subtarget->hasAVX2())) return Op; - return getOnesVector(Op.getValueType(), Subtarget->hasAVX2(), DAG, dl); + return getOnesVector(VT, Subtarget->hasAVX2(), DAG, dl); } - SDValue LD = isVectorBroadcast(Op, Subtarget->hasAVX2()); - if (Subtarget->hasAVX() && LD.getNode()) - return DAG.getNode(X86ISD::VBROADCAST, dl, VT, LD); + SDValue LD = isVectorBroadcast(Op, Subtarget); + if (LD.getNode()) + return DAG.getNode(X86ISD::VBROADCAST, dl, VT, LD); unsigned EVTBits = ExtVT.getSizeInBits(); @@ -5105,8 +5010,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { // convert it to a vector with movd (S2V+shuffle to zero extend). Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item); Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item); - Item = getShuffleVectorZeroOrUndef(Item, 0, true, - Subtarget->hasXMMInt(), DAG); + Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG); // Now we have our 32-bit value zero extended in the low element of // a vector. If Idx != 0, swizzle it into place. @@ -5119,7 +5023,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { DAG.getUNDEF(Item.getValueType()), &Mask[0]); } - return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Item); + return DAG.getNode(ISD::BITCAST, dl, VT, Item); } } @@ -5128,23 +5032,33 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { // the rest of the elements. This will be matched as movd/movq/movss/movsd // depending on what the source datatype is. if (Idx == 0) { - if (NumZero == 0) { + if (NumZero == 0) return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); - } else if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 || + + if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 || (ExtVT == MVT::i64 && Subtarget->is64Bit())) { + if (VT.getSizeInBits() == 256) { + SDValue ZeroVec = getZeroVector(VT, Subtarget, DAG, dl); + return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, ZeroVec, + Item, DAG.getIntPtrConstant(0)); + } + assert(VT.getSizeInBits() == 128 && "Expected an SSE value type!"); Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector. - return getShuffleVectorZeroOrUndef(Item, 0, true,Subtarget->hasXMMInt(), - DAG); - } else if (ExtVT == MVT::i16 || ExtVT == MVT::i8) { + return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG); + } + + if (ExtVT == MVT::i16 || ExtVT == MVT::i8) { Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item); - unsigned NumBits = VT.getSizeInBits(); - assert((NumBits == 128 || NumBits == 256) && - "Expected an SSE or AVX value type!"); - EVT MiddleVT = NumBits == 128 ? 
MVT::v4i32 : MVT::v8i32; - Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MiddleVT, Item); - Item = getShuffleVectorZeroOrUndef(Item, 0, true, - Subtarget->hasXMMInt(), DAG); + Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item); + if (VT.getSizeInBits() == 256) { + SDValue ZeroVec = getZeroVector(MVT::v8i32, Subtarget, DAG, dl); + Item = Insert128BitVector(ZeroVec, Item, DAG.getConstant(0, MVT::i32), + DAG, dl); + } else { + assert(VT.getSizeInBits() == 128 && "Expected an SSE value type!"); + Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG); + } return DAG.getNode(ISD::BITCAST, dl, VT, Item); } } @@ -5172,8 +5086,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); // Turn it into a shuffle of zero and zero-extended scalar to vector. - Item = getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0, - Subtarget->hasXMMInt(), DAG); + Item = getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0, Subtarget, DAG); SmallVector<int, 8> MaskVec; for (unsigned i = 0; i < NumElems; i++) MaskVec.push_back(i == Idx ? 0 : 1); @@ -5203,9 +5116,9 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { // For AVX-length vectors, build the individual 128-bit pieces and use // shuffles to put them in place. - if (VT.getSizeInBits() == 256 && !ISD::isBuildVectorAllZeros(Op.getNode())) { + if (VT.getSizeInBits() == 256) { SmallVector<SDValue, 32> V; - for (unsigned i = 0; i < NumElems; ++i) + for (unsigned i = 0; i != NumElems; ++i) V.push_back(Op.getOperand(i)); EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElems/2); @@ -5229,8 +5142,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { unsigned Idx = CountTrailingZeros_32(NonZeros); SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(Idx)); - return getShuffleVectorZeroOrUndef(V2, Idx, true, - Subtarget->hasXMMInt(), DAG); + return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG); } return SDValue(); } @@ -5238,24 +5150,23 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { // If element VT is < 32 bits, convert it to inserts into a zero vector. if (EVTBits == 8 && NumElems == 16) { SDValue V = LowerBuildVectorv16i8(Op, NonZeros,NumNonZero,NumZero, DAG, - *this); + Subtarget, *this); if (V.getNode()) return V; } if (EVTBits == 16 && NumElems == 8) { SDValue V = LowerBuildVectorv8i16(Op, NonZeros,NumNonZero,NumZero, DAG, - *this); + Subtarget, *this); if (V.getNode()) return V; } // If element VT is == 32 bits, turn it into a number of shuffles. - SmallVector<SDValue, 8> V; - V.resize(NumElems); + SmallVector<SDValue, 8> V(NumElems); if (NumElems == 4 && NumZero > 0) { for (unsigned i = 0; i < 4; ++i) { bool isZero = !(NonZeros & (1 << i)); if (isZero) - V[i] = getZeroVector(VT, Subtarget->hasXMMInt(), DAG, dl); + V[i] = getZeroVector(VT, Subtarget, DAG, dl); else V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i)); } @@ -5278,13 +5189,14 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { } } - SmallVector<int, 8> MaskVec; - bool Reverse = (NonZeros & 0x3) == 2; - for (unsigned i = 0; i < 2; ++i) - MaskVec.push_back(Reverse ? 1-i : i); - Reverse = ((NonZeros & (0x3 << 2)) >> 2) == 2; - for (unsigned i = 0; i < 2; ++i) - MaskVec.push_back(Reverse ? 1-i+NumElems : i+NumElems); + bool Reverse1 = (NonZeros & 0x3) == 2; + bool Reverse2 = ((NonZeros & (0x3 << 2)) >> 2) == 2; + int MaskVec[] = { + Reverse1 ? 
1 : 0, + Reverse1 ? 0 : 1, + static_cast<int>(Reverse2 ? NumElems+1 : NumElems), + static_cast<int>(Reverse2 ? NumElems : NumElems+1) + }; return DAG.getVectorShuffle(VT, dl, V[0], V[1], &MaskVec[0]); } @@ -5299,7 +5211,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { return LD; // For SSE 4.1, use insertps to put the high elements into the low element. - if (getSubtarget()->hasSSE41orAVX()) { + if (getSubtarget()->hasSSE41()) { SDValue Result; if (Op.getOperand(0).getOpcode() != ISD::UNDEF) Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0)); @@ -5430,7 +5342,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLEv8i16(SDValue Op, // mask values count as coming from any quadword, for better codegen. unsigned LoQuad[] = { 0, 0, 0, 0 }; unsigned HiQuad[] = { 0, 0, 0, 0 }; - BitVector InputQuads(4); + std::bitset<4> InputQuads; for (unsigned i = 0; i < 8; ++i) { unsigned *Quad = i < 4 ? LoQuad : HiQuad; int EltIdx = SVOp->getMaskElt(i); @@ -5470,10 +5382,10 @@ X86TargetLowering::LowerVECTOR_SHUFFLEv8i16(SDValue Op, // quads, disable the next transformation since it does not help SSSE3. bool V1Used = InputQuads[0] || InputQuads[1]; bool V2Used = InputQuads[2] || InputQuads[3]; - if (Subtarget->hasSSSE3orAVX()) { + if (Subtarget->hasSSSE3()) { if (InputQuads.count() == 2 && V1Used && V2Used) { - BestLoQuad = InputQuads.find_first(); - BestHiQuad = InputQuads.find_next(BestLoQuad); + BestLoQuad = InputQuads[0] ? 0 : 1; + BestHiQuad = InputQuads[2] ? 2 : 3; } if (InputQuads.count() > 2) { BestLoQuad = -1; @@ -5486,9 +5398,10 @@ X86TargetLowering::LowerVECTOR_SHUFFLEv8i16(SDValue Op, // words from all 4 input quadwords. SDValue NewV; if (BestLoQuad >= 0 || BestHiQuad >= 0) { - SmallVector<int, 8> MaskV; - MaskV.push_back(BestLoQuad < 0 ? 0 : BestLoQuad); - MaskV.push_back(BestHiQuad < 0 ? 1 : BestHiQuad); + int MaskV[] = { + BestLoQuad < 0 ? 0 : BestLoQuad, + BestHiQuad < 0 ? 1 : BestHiQuad + }; NewV = DAG.getVectorShuffle(MVT::v2i64, dl, DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V1), DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V2), &MaskV[0]); @@ -5533,8 +5446,9 @@ X86TargetLowering::LowerVECTOR_SHUFFLEv8i16(SDValue Op, unsigned TargetMask = 0; NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16), &MaskVals[0]); - TargetMask = pshufhw ? X86::getShufflePSHUFHWImmediate(NewV.getNode()): - X86::getShufflePSHUFLWImmediate(NewV.getNode()); + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode()); + TargetMask = pshufhw ? getShufflePSHUFHWImmediate(SVOp): + getShufflePSHUFLWImmediate(SVOp); V1 = NewV.getOperand(0); return getTargetShuffleNode(Opc, dl, MVT::v8i16, V1, TargetMask, DAG); } @@ -5543,7 +5457,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLEv8i16(SDValue Op, // If we have SSSE3, and all words of the result are from 1 input vector, // case 2 is generated, otherwise case 3 is generated. If no SSSE3 // is present, fall back to case 4. - if (Subtarget->hasSSSE3orAVX()) { + if (Subtarget->hasSSSE3()) { SmallVector<SDValue,16> pshufbMask; // If we have elements from both input vectors, set the high bit of the @@ -5591,59 +5505,51 @@ X86TargetLowering::LowerVECTOR_SHUFFLEv8i16(SDValue Op, // If BestLoQuad >= 0, generate a pshuflw to put the low elements in order, // and update MaskVals with new element order. 
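For orientation on the pshuflw/pshufhw choice above: each instruction reorders only one half of the eight words and passes the other half through unchanged, which is why a single 8-bit immediate can describe it. In intrinsic form (illustrative):

#include <emmintrin.h>

// PSHUFLW: permute the low four 16-bit words, keep the high four as-is.
__m128i pshuflw_reverse_low(__m128i v) {
  return _mm_shufflelo_epi16(v, _MM_SHUFFLE(0, 1, 2, 3));
}

// PSHUFHW: the mirror image, permuting only the high four words.
__m128i pshufhw_reverse_high(__m128i v) {
  return _mm_shufflehi_epi16(v, _MM_SHUFFLE(0, 1, 2, 3));
}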
- BitVector InOrder(8); + std::bitset<8> InOrder; if (BestLoQuad >= 0) { - SmallVector<int, 8> MaskV; + int MaskV[] = { -1, -1, -1, -1, 4, 5, 6, 7 }; for (int i = 0; i != 4; ++i) { int idx = MaskVals[i]; if (idx < 0) { - MaskV.push_back(-1); InOrder.set(i); } else if ((idx / 4) == BestLoQuad) { - MaskV.push_back(idx & 3); + MaskV[i] = idx & 3; InOrder.set(i); - } else { - MaskV.push_back(-1); } } - for (unsigned i = 4; i != 8; ++i) - MaskV.push_back(i); NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16), &MaskV[0]); - if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSSE3orAVX()) + if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSSE3()) { + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode()); NewV = getTargetShuffleNode(X86ISD::PSHUFLW, dl, MVT::v8i16, - NewV.getOperand(0), - X86::getShufflePSHUFLWImmediate(NewV.getNode()), - DAG); + NewV.getOperand(0), + getShufflePSHUFLWImmediate(SVOp), DAG); + } } // If BestHi >= 0, generate a pshufhw to put the high elements in order, // and update MaskVals with the new element order. if (BestHiQuad >= 0) { - SmallVector<int, 8> MaskV; - for (unsigned i = 0; i != 4; ++i) - MaskV.push_back(i); + int MaskV[] = { 0, 1, 2, 3, -1, -1, -1, -1 }; for (unsigned i = 4; i != 8; ++i) { int idx = MaskVals[i]; if (idx < 0) { - MaskV.push_back(-1); InOrder.set(i); } else if ((idx / 4) == BestHiQuad) { - MaskV.push_back((idx & 3) + 4); + MaskV[i] = (idx & 3) + 4; InOrder.set(i); - } else { - MaskV.push_back(-1); } } NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16), &MaskV[0]); - if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSSE3orAVX()) + if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSSE3()) { + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode()); NewV = getTargetShuffleNode(X86ISD::PSHUFHW, dl, MVT::v8i16, - NewV.getOperand(0), - X86::getShufflePSHUFHWImmediate(NewV.getNode()), - DAG); + NewV.getOperand(0), + getShufflePSHUFHWImmediate(SVOp), DAG); + } } // In case BestHi & BestLo were both -1, which means each quadword has a word @@ -5685,8 +5591,7 @@ SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp, SDValue V1 = SVOp->getOperand(0); SDValue V2 = SVOp->getOperand(1); DebugLoc dl = SVOp->getDebugLoc(); - SmallVector<int, 16> MaskVals; - SVOp->getMask(MaskVals); + ArrayRef<int> MaskVals = SVOp->getMask(); // If we have SSSE3, case 1 is generated when all result bytes come from // one of the inputs. Otherwise, case 2 is generated. If no SSSE3 is @@ -5705,7 +5610,7 @@ SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp, } // If SSSE3, use 1 pshufb instruction per vector with elements in the result. - if (TLI.getSubtarget()->hasSSSE3orAVX()) { + if (TLI.getSubtarget()->hasSSSE3()) { SmallVector<SDValue,16> pshufbMask; // If all result elements are from one input vector, then only translate @@ -5836,7 +5741,7 @@ SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp, unsigned NewWidth = (NumElems == 4) ? 2 : 4; EVT NewVT; switch (VT.getSimpleVT().SimpleTy) { - default: assert(false && "Unexpected!"); + default: llvm_unreachable("Unexpected!"); case MVT::v4f32: NewVT = MVT::v2f64; break; case MVT::v4i32: NewVT = MVT::v2i64; break; case MVT::v8i16: NewVT = MVT::v4i32; break; @@ -5902,96 +5807,106 @@ static SDValue getVZextMovL(EVT VT, EVT OpVT, OpVT, SrcOp))); } -/// areShuffleHalvesWithinDisjointLanes - Check whether each half of a vector -/// shuffle node referes to only one lane in the sources. 
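The single-PSHUFB trick used for the byte shuffles above, sketched with intrinsics: each control byte selects a source byte by its low four bits, and a set high bit (0x80) writes a zero instead, which is how bytes belonging to the other input get masked out before the two per-input results are OR'd together.

#include <tmmintrin.h>

// One pshufb per input vector. Result byte i is v[ctl[i] & 15], or zero when
// bit 7 of ctl[i] is set.
__m128i pshufb_select(__m128i v) {
  const __m128i ctl = _mm_setr_epi8(0, 0, 0, 0, 4, 4, 4, 4,
                                    (char)0x80, (char)0x80,
                                    (char)0x80, (char)0x80,
                                    12, 12, 12, 12);
  return _mm_shuffle_epi8(v, ctl); // bytes 8..11 of the result become zero
}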
-static bool areShuffleHalvesWithinDisjointLanes(ShuffleVectorSDNode *SVOp) { - EVT VT = SVOp->getValueType(0); - int NumElems = VT.getVectorNumElements(); - int HalfSize = NumElems/2; - SmallVector<int, 16> M; - SVOp->getMask(M); - bool MatchA = false, MatchB = false; - - for (int l = 0; l < NumElems*2; l += HalfSize) { - if (isUndefOrInRange(M, 0, HalfSize, l, l+HalfSize)) { - MatchA = true; - break; - } - } - - for (int l = 0; l < NumElems*2; l += HalfSize) { - if (isUndefOrInRange(M, HalfSize, HalfSize, l, l+HalfSize)) { - MatchB = true; - break; - } - } - - return MatchA && MatchB; -} - /// LowerVECTOR_SHUFFLE_256 - Handle all 256-bit wide vectors shuffles /// which could not be matched by any known target speficic shuffle static SDValue LowerVECTOR_SHUFFLE_256(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { - if (areShuffleHalvesWithinDisjointLanes(SVOp)) { - // If each half of a vector shuffle node referes to only one lane in the - // source vectors, extract each used 128-bit lane and shuffle them using - // 128-bit shuffles. Then, concatenate the results. Otherwise leave - // the work to the legalizer. - DebugLoc dl = SVOp->getDebugLoc(); - EVT VT = SVOp->getValueType(0); - int NumElems = VT.getVectorNumElements(); - int HalfSize = NumElems/2; - - // Extract the reference for each half - int FstVecExtractIdx = 0, SndVecExtractIdx = 0; - int FstVecOpNum = 0, SndVecOpNum = 0; - for (int i = 0; i < HalfSize; ++i) { - int Elt = SVOp->getMaskElt(i); - if (SVOp->getMaskElt(i) < 0) + EVT VT = SVOp->getValueType(0); + + unsigned NumElems = VT.getVectorNumElements(); + unsigned NumLaneElems = NumElems / 2; + + int MinRange[2][2] = { { static_cast<int>(NumElems), + static_cast<int>(NumElems) }, + { static_cast<int>(NumElems), + static_cast<int>(NumElems) } }; + int MaxRange[2][2] = { { -1, -1 }, { -1, -1 } }; + + // Collect used ranges for each source in each lane + for (unsigned l = 0; l < 2; ++l) { + unsigned LaneStart = l*NumLaneElems; + for (unsigned i = 0; i != NumLaneElems; ++i) { + int Idx = SVOp->getMaskElt(i+LaneStart); + if (Idx < 0) continue; - FstVecOpNum = Elt/NumElems; - FstVecExtractIdx = Elt % NumElems < HalfSize ? 0 : HalfSize; - break; + + int Input = 0; + if (Idx >= (int)NumElems) { + Idx -= NumElems; + Input = 1; + } + + if (Idx > MaxRange[l][Input]) + MaxRange[l][Input] = Idx; + if (Idx < MinRange[l][Input]) + MinRange[l][Input] = Idx; } - for (int i = HalfSize; i < NumElems; ++i) { - int Elt = SVOp->getMaskElt(i); - if (SVOp->getMaskElt(i) < 0) + } + + // Make sure each range is 128-bits + int ExtractIdx[2][2] = { { -1, -1 }, { -1, -1 } }; + for (unsigned l = 0; l < 2; ++l) { + for (unsigned Input = 0; Input < 2; ++Input) { + if (MinRange[l][Input] == (int)NumElems && MaxRange[l][Input] < 0) continue; - SndVecOpNum = Elt/NumElems; - SndVecExtractIdx = Elt % NumElems < HalfSize ? 
0 : HalfSize; - break; + + if (MinRange[l][Input] >= 0 && MaxRange[l][Input] < (int)NumLaneElems) + ExtractIdx[l][Input] = 0; + else if (MinRange[l][Input] >= (int)NumLaneElems && + MaxRange[l][Input] < (int)NumElems) + ExtractIdx[l][Input] = NumLaneElems; + else + return SDValue(); } + } - // Extract the subvectors - SDValue V1 = Extract128BitVector(SVOp->getOperand(FstVecOpNum), - DAG.getConstant(FstVecExtractIdx, MVT::i32), DAG, dl); - SDValue V2 = Extract128BitVector(SVOp->getOperand(SndVecOpNum), - DAG.getConstant(SndVecExtractIdx, MVT::i32), DAG, dl); + DebugLoc dl = SVOp->getDebugLoc(); + MVT EltVT = VT.getVectorElementType().getSimpleVT(); + EVT NVT = MVT::getVectorVT(EltVT, NumElems/2); + + SDValue Ops[2][2]; + for (unsigned l = 0; l < 2; ++l) { + for (unsigned Input = 0; Input < 2; ++Input) { + if (ExtractIdx[l][Input] >= 0) + Ops[l][Input] = Extract128BitVector(SVOp->getOperand(Input), + DAG.getConstant(ExtractIdx[l][Input], MVT::i32), + DAG, dl); + else + Ops[l][Input] = DAG.getUNDEF(NVT); + } + } - // Generate 128-bit shuffles - SmallVector<int, 16> MaskV1, MaskV2; - for (int i = 0; i < HalfSize; ++i) { - int Elt = SVOp->getMaskElt(i); - MaskV1.push_back(Elt < 0 ? Elt : Elt % HalfSize); + // Generate 128-bit shuffles + SmallVector<int, 16> Mask1, Mask2; + for (unsigned i = 0; i != NumLaneElems; ++i) { + int Elt = SVOp->getMaskElt(i); + if (Elt >= (int)NumElems) { + Elt %= NumLaneElems; + Elt += NumLaneElems; + } else if (Elt >= 0) { + Elt %= NumLaneElems; } - for (int i = HalfSize; i < NumElems; ++i) { - int Elt = SVOp->getMaskElt(i); - MaskV2.push_back(Elt < 0 ? Elt : Elt % HalfSize); + Mask1.push_back(Elt); + } + for (unsigned i = NumLaneElems; i != NumElems; ++i) { + int Elt = SVOp->getMaskElt(i); + if (Elt >= (int)NumElems) { + Elt %= NumLaneElems; + Elt += NumLaneElems; + } else if (Elt >= 0) { + Elt %= NumLaneElems; } - - EVT NVT = V1.getValueType(); - V1 = DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), &MaskV1[0]); - V2 = DAG.getVectorShuffle(NVT, dl, V2, DAG.getUNDEF(NVT), &MaskV2[0]); - - // Concatenate the result back - SDValue V = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, VT), V1, - DAG.getConstant(0, MVT::i32), DAG, dl); - return Insert128BitVector(V, V2, DAG.getConstant(NumElems/2, MVT::i32), - DAG, dl); + Mask2.push_back(Elt); } - return SDValue(); + SDValue Shuf1 = DAG.getVectorShuffle(NVT, dl, Ops[0][0], Ops[0][1], &Mask1[0]); + SDValue Shuf2 = DAG.getVectorShuffle(NVT, dl, Ops[1][0], Ops[1][1], &Mask2[0]); + + // Concatenate the result back + SDValue V = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, VT), Shuf1, + DAG.getConstant(0, MVT::i32), DAG, dl); + return Insert128BitVector(V, Shuf2, DAG.getConstant(NumElems/2, MVT::i32), + DAG, dl); } /// LowerVECTOR_SHUFFLE_128v4 - Handle all 128-bit wide vectors with @@ -6005,11 +5920,9 @@ LowerVECTOR_SHUFFLE_128v4(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { assert(VT.getSizeInBits() == 128 && "Unsupported vector size"); - SmallVector<std::pair<int, int>, 8> Locs; - Locs.resize(4); - SmallVector<int, 8> Mask1(4U, -1); - SmallVector<int, 8> PermMask; - SVOp->getMask(PermMask); + std::pair<int, int> Locs[4]; + int Mask1[] = { -1, -1, -1, -1 }; + SmallVector<int, 8> PermMask(SVOp->getMask().begin(), SVOp->getMask().end()); unsigned NumHi = 0; unsigned NumLo = 0; @@ -6039,17 +5952,14 @@ LowerVECTOR_SHUFFLE_128v4(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { // vector operands, put the elements into the right order. 
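In miniature, the rewritten 256-bit path above asks, per output lane and per input: do all referenced elements fall inside one 128-bit half? If so it records that half's extract offset; if any input's uses straddle both halves it gives up and returns SDValue(). A standalone model of that classification (extractIdxModel is a hypothetical helper over plain ints):

#include <cassert>

// Classify one (lane, input) pair from the min/max indices it uses.
// Returns the extract offset (0 or NumLaneElems), -1 for an unused input,
// or -2 when the uses span both halves (the lowering bails out).
static int extractIdxModel(int MinIdx, int MaxIdx, int NumElems) {
  const int NumLaneElems = NumElems / 2;
  if (MinIdx == NumElems && MaxIdx < 0)
    return -1;
  if (MinIdx >= 0 && MaxIdx < NumLaneElems)
    return 0;
  if (MinIdx >= NumLaneElems && MaxIdx < NumElems)
    return NumLaneElems;
  return -2;
}

int main() {
  // v8f32 mask <0,1,2,3, 12,13,14,15>: lane 0 reads V1[0..3] (low half),
  // lane 1 reads V2[4..7] (high half, after subtracting NumElems from 12..15).
  assert(extractIdxModel(0, 3, 8) == 0);
  assert(extractIdxModel(4, 7, 8) == 4);
  assert(extractIdxModel(8, -1, 8) == -1); // input never referenced
  assert(extractIdxModel(2, 5, 8) == -2);  // straddles: leave to the legalizer
  return 0;
}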
V1 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]); - SmallVector<int, 8> Mask2(4U, -1); + int Mask2[] = { -1, -1, -1, -1 }; - for (unsigned i = 0; i != 4; ++i) { - if (Locs[i].first == -1) - continue; - else { + for (unsigned i = 0; i != 4; ++i) + if (Locs[i].first != -1) { unsigned Idx = (i < 2) ? 0 : 4; Idx += Locs[i].first * 2 + Locs[i].second; Mask2[i] = Idx; } - } return DAG.getVectorShuffle(VT, dl, V1, V1, &Mask2[0]); } else if (NumLo == 3 || NumHi == 3) { @@ -6102,18 +6012,16 @@ LowerVECTOR_SHUFFLE_128v4(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { } // Break it into (shuffle shuffle_hi, shuffle_lo). - Locs.clear(); - Locs.resize(4); - SmallVector<int,8> LoMask(4U, -1); - SmallVector<int,8> HiMask(4U, -1); + int LoMask[] = { -1, -1, -1, -1 }; + int HiMask[] = { -1, -1, -1, -1 }; - SmallVector<int,8> *MaskPtr = &LoMask; + int *MaskPtr = LoMask; unsigned MaskIdx = 0; unsigned LoIdx = 0; unsigned HiIdx = 2; for (unsigned i = 0; i != 4; ++i) { if (i == 2) { - MaskPtr = &HiMask; + MaskPtr = HiMask; MaskIdx = 1; LoIdx = 0; HiIdx = 2; @@ -6123,26 +6031,21 @@ LowerVECTOR_SHUFFLE_128v4(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { Locs[i] = std::make_pair(-1, -1); } else if (Idx < 4) { Locs[i] = std::make_pair(MaskIdx, LoIdx); - (*MaskPtr)[LoIdx] = Idx; + MaskPtr[LoIdx] = Idx; LoIdx++; } else { Locs[i] = std::make_pair(MaskIdx, HiIdx); - (*MaskPtr)[HiIdx] = Idx; + MaskPtr[HiIdx] = Idx; HiIdx++; } } SDValue LoShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &LoMask[0]); SDValue HiShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &HiMask[0]); - SmallVector<int, 8> MaskOps; - for (unsigned i = 0; i != 4; ++i) { - if (Locs[i].first == -1) { - MaskOps.push_back(-1); - } else { - unsigned Idx = Locs[i].first * 4 + Locs[i].second; - MaskOps.push_back(Idx); - } - } + int MaskOps[] = { -1, -1, -1, -1 }; + for (unsigned i = 0; i != 4; ++i) + if (Locs[i].first != -1) + MaskOps[i] = Locs[i].first * 4 + Locs[i].second; return DAG.getVectorShuffle(VT, dl, LoShuffle, HiShuffle, &MaskOps[0]); } @@ -6220,35 +6123,41 @@ bool CanXFormVExtractWithShuffleIntoLoad(SDValue V, SelectionDAG &DAG, int Idx = (Elt > NumElems) ? -1 : SVOp->getMaskElt(Elt); V = (Idx < (int)NumElems) ? V.getOperand(0) : V.getOperand(1); + // If we are accessing the upper part of a YMM register + // then the EXTRACT_VECTOR_ELT is likely to be legalized to a sequence of + // EXTRACT_SUBVECTOR + EXTRACT_VECTOR_ELT, which are not detected at this point + // because the legalization of N did not happen yet. + if (Idx >= (int)NumElems/2 && VT.getSizeInBits() == 256) + return false; + // Skip one more bit_convert if necessary - if (V.getOpcode() == ISD::BITCAST) + if (V.getOpcode() == ISD::BITCAST) { + if (!V.hasOneUse()) + return false; V = V.getOperand(0); + } - if (ISD::isNormalLoad(V.getNode())) { - // Is the original load suitable? - LoadSDNode *LN0 = cast<LoadSDNode>(V); + if (!ISD::isNormalLoad(V.getNode())) + return false; - // FIXME: avoid the multi-use bug that is preventing lots of - // of foldings to be detected, this is still wrong of course, but - // give the temporary desired behavior, and if it happens that - // the load has real more uses, during isel it will not fold, and - // will generate poor code. - if (!LN0 || LN0->isVolatile()) // || !LN0->hasOneUse() - return false; + // Is the original load suitable? 
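On the load-suitability test being tightened here: besides requiring a normal, non-volatile load with a single use, folding through a bitcast means re-reading memory at a wider vector type, which is only valid if the pointer is aligned for that type. Reduced to its essentials (illustrative only):

#include <cassert>

// Mirror of the `NewAlign > Align` rejection below: the refolded vector load
// needs the ABI alignment of the vector type, which the original scalar
// load's pointer may not guarantee.
static bool loadFoldAlignmentOK(unsigned LoadAlign, unsigned VecABIAlign) {
  return VecABIAlign <= LoadAlign;
}

int main() {
  assert(!loadFoldAlignmentOK(4, 16)); // 4-byte-aligned f32 vs 16-byte v4f32
  assert(loadFoldAlignmentOK(16, 16)); // already vector-aligned: fine
  return 0;
}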
+ LoadSDNode *LN0 = cast<LoadSDNode>(V); - if (!HasShuffleIntoBitcast) - return true; + if (!LN0 || !LN0->hasNUsesOfValue(1,0) || LN0->isVolatile()) + return false; - // If there's a bitcast before the shuffle, check if the load type and - // alignment is valid. - unsigned Align = LN0->getAlignment(); - unsigned NewAlign = - TLI.getTargetData()->getABITypeAlignment( - VT.getTypeForEVT(*DAG.getContext())); + if (!HasShuffleIntoBitcast) + return true; - if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, VT)) - return false; - } + // If there's a bitcast before the shuffle, check if the load type and + // alignment is valid. + unsigned Align = LN0->getAlignment(); + unsigned NewAlign = + TLI.getTargetData()->getABITypeAlignment( + VT.getTypeForEVT(*DAG.getContext())); + + if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, VT)) + return false; return true; } @@ -6266,14 +6175,14 @@ SDValue getMOVDDup(SDValue &Op, DebugLoc &dl, SDValue V1, SelectionDAG &DAG) { static SDValue getMOVLowToHigh(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG, - bool HasXMMInt) { + bool HasSSE2) { SDValue V1 = Op.getOperand(0); SDValue V2 = Op.getOperand(1); EVT VT = Op.getValueType(); assert(VT != MVT::v2i64 && "unsupported shuffle type"); - if (HasXMMInt && VT == MVT::v2f64) + if (HasSSE2 && VT == MVT::v2f64) return getTargetShuffleNode(X86ISD::MOVLHPD, dl, VT, V1, V2, DAG); // v4f32 or v4i32: canonizalized to v4f32 (which is legal for SSE1) @@ -6299,24 +6208,8 @@ SDValue getMOVHighToLow(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG) { return getTargetShuffleNode(X86ISD::MOVHLPS, dl, VT, V1, V2, DAG); } -static inline unsigned getSHUFPOpcode(EVT VT) { - switch(VT.getSimpleVT().SimpleTy) { - case MVT::v8i32: // Use fp unit for int unpack. - case MVT::v8f32: - case MVT::v4i32: // Use fp unit for int unpack. - case MVT::v4f32: return X86ISD::SHUFPS; - case MVT::v4i64: // Use fp unit for int unpack. - case MVT::v4f64: - case MVT::v2i64: // Use fp unit for int unpack. - case MVT::v2f64: return X86ISD::SHUFPD; - default: - llvm_unreachable("Unknown type for shufp*"); - } - return 0; -} - static -SDValue getMOVLP(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG, bool HasXMMInt) { +SDValue getMOVLP(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG, bool HasSSE2) { SDValue V1 = Op.getOperand(0); SDValue V2 = Op.getOperand(1); EVT VT = Op.getValueType(); @@ -6342,7 +6235,7 @@ SDValue getMOVLP(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG, bool HasXMMInt) { ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); if (CanFoldLoad) { - if (HasXMMInt && NumElems == 2) + if (HasSSE2 && NumElems == 2) return getTargetShuffleNode(X86ISD::MOVLPD, dl, VT, V1, V2, DAG); if (NumElems == 4) @@ -6357,10 +6250,10 @@ SDValue getMOVLP(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG, bool HasXMMInt) { // this is horrible, but will stay like this until we move all shuffle // matching to x86 specific nodes. Note that for the 1st condition all // types are matched with movsd. 
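Since MOVSD/MOVSS come up repeatedly in this stretch: viewed as a shuffle, movsd takes its low double from the second operand and preserves the high double of the first, i.e. mask <2,1> on v2f64. In intrinsic form (illustrative):

#include <emmintrin.h>

// movsd as a blend: { b[0], a[1] }. movss is the analogous <4,1,2,3> case
// for v4f32.
__m128d movsd_merge(__m128d a, __m128d b) {
  return _mm_move_sd(a, b);
}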
- if (HasXMMInt) { + if (HasSSE2) { // FIXME: isMOVLMask should be checked and matched before getMOVLP, // as to remove this logic from here, as much as possible - if (NumElems == 2 || !X86::isMOVLMask(SVOp)) + if (NumElems == 2 || !isMOVLMask(SVOp->getMask(), VT)) return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG); return getTargetShuffleNode(X86ISD::MOVSS, dl, VT, V1, V2, DAG); } @@ -6368,8 +6261,8 @@ SDValue getMOVLP(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG, bool HasXMMInt) { assert(VT != MVT::v4i32 && "unsupported shuffle type"); // Invert the operand order and use SHUFPS to match it. - return getTargetShuffleNode(getSHUFPOpcode(VT), dl, VT, V2, V1, - X86::getShuffleSHUFImmediate(SVOp), DAG); + return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V2, V1, + getShuffleSHUFImmediate(SVOp), DAG); } static @@ -6383,7 +6276,7 @@ SDValue NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG, SDValue V2 = Op.getOperand(1); if (isZeroShuffle(SVOp)) - return getZeroVector(VT, Subtarget->hasXMMInt(), DAG, dl); + return getZeroVector(VT, Subtarget, DAG, dl); // Handle splat operations if (SVOp->isSplat()) { @@ -6397,8 +6290,8 @@ SDValue NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG, return Op; // Use vbroadcast whenever the splat comes from a foldable load - SDValue LD = isVectorBroadcast(Op, Subtarget->hasAVX2()); - if (Subtarget->hasAVX() && LD.getNode()) + SDValue LD = isVectorBroadcast(Op, Subtarget); + if (LD.getNode()) return DAG.getNode(X86ISD::VBROADCAST, dl, VT, LD); // Handle splats by matching through known shuffle masks @@ -6417,21 +6310,26 @@ SDValue NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG, if (NewOp.getNode()) return DAG.getNode(ISD::BITCAST, dl, VT, NewOp); } else if ((VT == MVT::v4i32 || - (VT == MVT::v4f32 && Subtarget->hasXMMInt()))) { + (VT == MVT::v4f32 && Subtarget->hasSSE2()))) { // FIXME: Figure out a cleaner way to do this. // Try to make use of movq to zero out the top part. 
if (ISD::isBuildVectorAllZeros(V2.getNode())) { SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, dl); if (NewOp.getNode()) { - if (isCommutedMOVL(cast<ShuffleVectorSDNode>(NewOp), true, false)) - return getVZextMovL(VT, NewOp.getValueType(), NewOp.getOperand(0), + EVT NewVT = NewOp.getValueType(); + if (isCommutedMOVLMask(cast<ShuffleVectorSDNode>(NewOp)->getMask(), + NewVT, true, false)) + return getVZextMovL(VT, NewVT, NewOp.getOperand(0), DAG, Subtarget, dl); } } else if (ISD::isBuildVectorAllZeros(V1.getNode())) { SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, dl); - if (NewOp.getNode() && X86::isMOVLMask(cast<ShuffleVectorSDNode>(NewOp))) - return getVZextMovL(VT, NewOp.getValueType(), NewOp.getOperand(1), - DAG, Subtarget, dl); + if (NewOp.getNode()) { + EVT NewVT = NewOp.getValueType(); + if (isMOVLMask(cast<ShuffleVectorSDNode>(NewOp)->getMask(), NewVT)) + return getVZextMovL(VT, NewVT, NewOp.getOperand(1), + DAG, Subtarget, dl); + } } } return SDValue(); @@ -6445,10 +6343,11 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); DebugLoc dl = Op.getDebugLoc(); unsigned NumElems = VT.getVectorNumElements(); + bool V1IsUndef = V1.getOpcode() == ISD::UNDEF; bool V2IsUndef = V2.getOpcode() == ISD::UNDEF; bool V1IsSplat = false; bool V2IsSplat = false; - bool HasXMMInt = Subtarget->hasXMMInt(); + bool HasSSE2 = Subtarget->hasSSE2(); bool HasAVX = Subtarget->hasAVX(); bool HasAVX2 = Subtarget->hasAVX2(); MachineFunction &MF = DAG.getMachineFunction(); @@ -6456,7 +6355,10 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { assert(VT.getSizeInBits() != 64 && "Can't lower MMX shuffles"); - assert(V1.getOpcode() != ISD::UNDEF && "Op 1 of shuffle should not be undef"); + if (V1IsUndef && V2IsUndef) + return DAG.getUNDEF(VT); + + assert(!V1IsUndef && "Op 1 of shuffle should not be undef"); // Vector shuffle lowering takes 3 steps: // @@ -6479,38 +6381,43 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { if (NewOp.getNode()) return NewOp; + SmallVector<int, 8> M(SVOp->getMask().begin(), SVOp->getMask().end()); + // NOTE: isPSHUFDMask can also match both masks below (unpckl_undef and // unpckh_undef). Only use pshufd if speed is more important than size. - if (OptForSize && X86::isUNPCKL_v_undef_Mask(SVOp)) + if (OptForSize && isUNPCKL_v_undef_Mask(M, VT, HasAVX2)) return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG); - if (OptForSize && X86::isUNPCKH_v_undef_Mask(SVOp)) + if (OptForSize && isUNPCKH_v_undef_Mask(M, VT, HasAVX2)) return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG); - if (X86::isMOVDDUPMask(SVOp) && Subtarget->hasSSE3orAVX() && + if (isMOVDDUPMask(M, VT) && Subtarget->hasSSE3() && V2IsUndef && RelaxedMayFoldVectorLoad(V1)) return getMOVDDup(Op, dl, V1, DAG); - if (X86::isMOVHLPS_v_undef_Mask(SVOp)) + if (isMOVHLPS_v_undef_Mask(M, VT)) return getMOVHighToLow(Op, dl, DAG); // Use to match splats - if (HasXMMInt && X86::isUNPCKHMask(SVOp, HasAVX2) && V2IsUndef && + if (HasSSE2 && isUNPCKHMask(M, VT, HasAVX2) && V2IsUndef && (VT == MVT::v2f64 || VT == MVT::v2i64)) return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG); - if (X86::isPSHUFDMask(SVOp)) { + if (isPSHUFDMask(M, VT)) { // The actual implementation will match the mask in the if above and then // during isel it can match several different instructions, not only pshufd // as its name says, sad but true, emulate the behavior for now... 
- if (X86::isMOVDDUPMask(SVOp) && ((VT == MVT::v4f32 || VT == MVT::v2i64))) - return getTargetShuffleNode(X86ISD::MOVLHPS, dl, VT, V1, V1, DAG); + if (isMOVDDUPMask(M, VT) && ((VT == MVT::v4f32 || VT == MVT::v2i64))) + return getTargetShuffleNode(X86ISD::MOVLHPS, dl, VT, V1, V1, DAG); + + unsigned TargetMask = getShuffleSHUFImmediate(SVOp); - unsigned TargetMask = X86::getShuffleSHUFImmediate(SVOp); + if (HasAVX && (VT == MVT::v4f32 || VT == MVT::v2f64)) + return getTargetShuffleNode(X86ISD::VPERMILP, dl, VT, V1, TargetMask, DAG); - if (HasXMMInt && (VT == MVT::v4f32 || VT == MVT::v4i32)) + if (HasSSE2 && (VT == MVT::v4f32 || VT == MVT::v4i32)) return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1, TargetMask, DAG); - return getTargetShuffleNode(getSHUFPOpcode(VT), dl, VT, V1, V1, + return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V1, V1, TargetMask, DAG); } @@ -6518,7 +6425,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { bool isLeft = false; unsigned ShAmt = 0; SDValue ShVal; - bool isShift = HasXMMInt && isVectorShift(SVOp, DAG, isLeft, ShVal, ShAmt); + bool isShift = HasSSE2 && isVectorShift(SVOp, DAG, isLeft, ShVal, ShAmt); if (isShift && ShVal.hasOneUse()) { // If the shifted value has multiple uses, it may be cheaper to use // v_set0 + movlhps or movhlps, etc. @@ -6527,11 +6434,11 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl); } - if (X86::isMOVLMask(SVOp)) { + if (isMOVLMask(M, VT)) { if (ISD::isBuildVectorAllZeros(V1.getNode())) return getVZextMovL(VT, VT, V2, DAG, Subtarget, dl); - if (!X86::isMOVLPMask(SVOp)) { - if (HasXMMInt && (VT == MVT::v2i64 || VT == MVT::v2f64)) + if (!isMOVLPMask(M, VT)) { + if (HasSSE2 && (VT == MVT::v2i64 || VT == MVT::v2f64)) return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG); if (VT == MVT::v4i32 || VT == MVT::v4f32) @@ -6540,27 +6447,27 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { } // FIXME: fold these into legal mask. - if (X86::isMOVLHPSMask(SVOp) && !X86::isUNPCKLMask(SVOp, HasAVX2)) - return getMOVLowToHigh(Op, dl, DAG, HasXMMInt); + if (isMOVLHPSMask(M, VT) && !isUNPCKLMask(M, VT, HasAVX2)) + return getMOVLowToHigh(Op, dl, DAG, HasSSE2); - if (X86::isMOVHLPSMask(SVOp)) + if (isMOVHLPSMask(M, VT)) return getMOVHighToLow(Op, dl, DAG); - if (X86::isMOVSHDUPMask(SVOp, Subtarget)) + if (V2IsUndef && isMOVSHDUPMask(M, VT, Subtarget)) return getTargetShuffleNode(X86ISD::MOVSHDUP, dl, VT, V1, DAG); - if (X86::isMOVSLDUPMask(SVOp, Subtarget)) + if (V2IsUndef && isMOVSLDUPMask(M, VT, Subtarget)) return getTargetShuffleNode(X86ISD::MOVSLDUP, dl, VT, V1, DAG); - if (X86::isMOVLPMask(SVOp)) - return getMOVLP(Op, dl, DAG, HasXMMInt); + if (isMOVLPMask(M, VT)) + return getMOVLP(Op, dl, DAG, HasSSE2); - if (ShouldXformToMOVHLPS(SVOp) || - ShouldXformToMOVLP(V1.getNode(), V2.getNode(), SVOp)) + if (ShouldXformToMOVHLPS(M, VT) || + ShouldXformToMOVLP(V1.getNode(), V2.getNode(), M, VT)) return CommuteVectorShuffle(SVOp, DAG); if (isShift) { - // No better options. Use a vshl / vsrl. + // No better options. Use a vshldq / vsrldq. EVT EltVT = VT.getVectorElementType(); ShAmt *= EltVT.getSizeInBits(); return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl); @@ -6573,18 +6480,13 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { V2IsSplat = isSplatVector(V2.getNode()); // Canonicalize the splat or undef, if present, to be on the RHS. 
-  if (V1IsSplat && !V2IsSplat) {
-    Op = CommuteVectorShuffle(SVOp, DAG);
-    SVOp = cast<ShuffleVectorSDNode>(Op);
-    V1 = SVOp->getOperand(0);
-    V2 = SVOp->getOperand(1);
+  if (!V2IsUndef && V1IsSplat && !V2IsSplat) {
+    CommuteVectorShuffleMask(M, NumElems);
+    std::swap(V1, V2);
     std::swap(V1IsSplat, V2IsSplat);
     Commuted = true;
   }
 
-  SmallVector<int, 32> M;
-  SVOp->getMask(M);
-
   if (isCommutedMOVLMask(M, VT, V2IsSplat, V2IsUndef)) {
     // Shuffling low element of v1 into undef, just return v1.
     if (V2IsUndef)
@@ -6604,41 +6506,40 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
   if (V2IsSplat) {
     // Normalize mask so all entries that point to V2 points to its first
     // element then try to match unpck{h|l} again. If match, return a
-    // new vector_shuffle with the corrected mask.
-    SDValue NewMask = NormalizeMask(SVOp, DAG);
-    ShuffleVectorSDNode *NSVOp = cast<ShuffleVectorSDNode>(NewMask);
-    if (NSVOp != SVOp) {
-      if (X86::isUNPCKLMask(NSVOp, HasAVX2, true)) {
-        return NewMask;
-      } else if (X86::isUNPCKHMask(NSVOp, HasAVX2, true)) {
-        return NewMask;
-      }
+    // new vector_shuffle with the corrected mask.
+    SmallVector<int, 8> NewMask(M.begin(), M.end());
+    NormalizeMask(NewMask, NumElems);
+    if (isUNPCKLMask(NewMask, VT, HasAVX2, true)) {
+      return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG);
+    } else if (isUNPCKHMask(NewMask, VT, HasAVX2, true)) {
+      return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG);
     }
   }
 
   if (Commuted) {
     // Commute is back and try unpck* again.
     // FIXME: this seems wrong.
-    SDValue NewOp = CommuteVectorShuffle(SVOp, DAG);
-    ShuffleVectorSDNode *NewSVOp = cast<ShuffleVectorSDNode>(NewOp);
+    CommuteVectorShuffleMask(M, NumElems);
+    std::swap(V1, V2);
+    std::swap(V1IsSplat, V2IsSplat);
+    Commuted = false;
 
-    if (X86::isUNPCKLMask(NewSVOp, HasAVX2))
-      return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V2, V1, DAG);
+    if (isUNPCKLMask(M, VT, HasAVX2))
+      return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG);
 
-    if (X86::isUNPCKHMask(NewSVOp, HasAVX2))
-      return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V2, V1, DAG);
+    if (isUNPCKHMask(M, VT, HasAVX2))
+      return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG);
   }
 
   // Normalize the node to match x86 shuffle ops if needed
-  if (!V2IsUndef && (isSHUFPMask(M, VT, /* Commuted */ true) ||
-                     isVSHUFPYMask(M, VT, HasAVX, /* Commuted */ true)))
+  if (!V2IsUndef && (isSHUFPMask(M, VT, HasAVX, /* Commuted */ true)))
     return CommuteVectorShuffle(SVOp, DAG);
 
   // The checks below are all present in isShuffleMaskLegal, but they are
   // inlined here right now to enable us to directly emit target specific
   // nodes, and remove one by one until they don't return Op anymore.
- if (isPALIGNRMask(M, VT, Subtarget->hasSSSE3orAVX())) + if (isPALIGNRMask(M, VT, Subtarget)) return getTargetShuffleNode(X86ISD::PALIGN, dl, VT, V1, V2, getShufflePALIGNRImmediate(SVOp), DAG); @@ -6651,21 +6552,21 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { if (isPSHUFHWMask(M, VT)) return getTargetShuffleNode(X86ISD::PSHUFHW, dl, VT, V1, - X86::getShufflePSHUFHWImmediate(SVOp), + getShufflePSHUFHWImmediate(SVOp), DAG); if (isPSHUFLWMask(M, VT)) return getTargetShuffleNode(X86ISD::PSHUFLW, dl, VT, V1, - X86::getShufflePSHUFLWImmediate(SVOp), + getShufflePSHUFLWImmediate(SVOp), DAG); - if (isSHUFPMask(M, VT)) - return getTargetShuffleNode(getSHUFPOpcode(VT), dl, VT, V1, V2, - X86::getShuffleSHUFImmediate(SVOp), DAG); + if (isSHUFPMask(M, VT, HasAVX)) + return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V1, V2, + getShuffleSHUFImmediate(SVOp), DAG); - if (isUNPCKL_v_undef_Mask(M, VT)) + if (isUNPCKL_v_undef_Mask(M, VT, HasAVX2)) return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG); - if (isUNPCKH_v_undef_Mask(M, VT)) + if (isUNPCKH_v_undef_Mask(M, VT, HasAVX2)) return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG); //===--------------------------------------------------------------------===// @@ -6678,20 +6579,19 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { return getTargetShuffleNode(X86ISD::MOVDDUP, dl, VT, V1, DAG); // Handle VPERMILPS/D* permutations - if (isVPERMILPMask(M, VT, HasAVX)) + if (isVPERMILPMask(M, VT, HasAVX)) { + if (HasAVX2 && VT == MVT::v8i32) + return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1, + getShuffleSHUFImmediate(SVOp), DAG); return getTargetShuffleNode(X86ISD::VPERMILP, dl, VT, V1, - getShuffleVPERMILPImmediate(SVOp), DAG); + getShuffleSHUFImmediate(SVOp), DAG); + } // Handle VPERM2F128/VPERM2I128 permutations if (isVPERM2X128Mask(M, VT, HasAVX)) return getTargetShuffleNode(X86ISD::VPERM2X128, dl, VT, V1, V2, getShuffleVPERM2X128Immediate(SVOp), DAG); - // Handle VSHUFPS/DY permutations - if (isVSHUFPYMask(M, VT, HasAVX)) - return getTargetShuffleNode(getSHUFPOpcode(VT), dl, VT, V1, V2, - getShuffleVSHUFPYImmediate(SVOp), DAG); - //===--------------------------------------------------------------------===// // Since no target specific shuffle was selected for this generic one, // lower it into other known shuffles. FIXME: this isn't true yet, but @@ -6810,7 +6710,7 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, assert(Vec.getValueSizeInBits() <= 128 && "Unexpected vector length"); - if (Subtarget->hasSSE41orAVX()) { + if (Subtarget->hasSSE41()) { SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG); if (Res.getNode()) return Res; @@ -6952,7 +6852,7 @@ X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const { return Insert128BitVector(N0, V, Ins128Idx, DAG, dl); } - if (Subtarget->hasSSE41orAVX()) + if (Subtarget->hasSSE41()) return LowerINSERT_VECTOR_ELT_SSE4(Op, DAG); if (EltVT == MVT::i8) @@ -7408,19 +7308,77 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { unsigned Reg = Subtarget->is64Bit() ? 
X86::RAX : X86::EAX;
     return DAG.getCopyFromReg(Chain, DL, Reg, getPointerTy(),
                               Chain.getValue(1));
-  }
+  } else if (Subtarget->isTargetWindows()) {
+    // Just use the implicit TLS architecture
+    // Need to generate something similar to:
+    //   mov rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
+    //                                  ; from TEB
+    //   mov ecx, dword [rel _tls_index]; Load index (from C runtime)
+    //   mov rcx, qword [rdx+rcx*8]
+    //   mov eax, .tls$:tlsvar
+    //   [rax+rcx] contains the address
+    // Windows 64bit: gs:0x58
+    // Windows 32bit: fs:__tls_array
 
-  assert(false &&
-         "TLS not implemented for this target.");
+    // If GV is an alias then use the aliasee for determining
+    // thread-localness.
+    if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV))
+      GV = GA->resolveAliasedGlobal(false);
+    DebugLoc dl = GA->getDebugLoc();
+    SDValue Chain = DAG.getEntryNode();
 
-  llvm_unreachable("Unreachable");
-  return SDValue();
+    // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
+    // %gs:0x58 (64-bit).
+    Value *Ptr = Constant::getNullValue(Subtarget->is64Bit()
+                                        ? Type::getInt8PtrTy(*DAG.getContext(),
+                                                             256)
+                                        : Type::getInt32PtrTy(*DAG.getContext(),
+                                                              257));
+
+    SDValue ThreadPointer = DAG.getLoad(getPointerTy(), dl, Chain,
+                                        Subtarget->is64Bit()
+                                          ? DAG.getIntPtrConstant(0x58)
+                                          : DAG.getExternalSymbol("_tls_array",
+                                                                  getPointerTy()),
+                                        MachinePointerInfo(Ptr),
+                                        false, false, false, 0);
+
+    // Load the _tls_index variable
+    SDValue IDX = DAG.getExternalSymbol("_tls_index", getPointerTy());
+    if (Subtarget->is64Bit())
+      IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, getPointerTy(), Chain,
+                           IDX, MachinePointerInfo(), MVT::i32,
+                           false, false, 0);
+    else
+      IDX = DAG.getLoad(getPointerTy(), dl, Chain, IDX, MachinePointerInfo(),
+                        false, false, false, 0);
+
+    SDValue Scale = DAG.getConstant(Log2_64_Ceil(TD->getPointerSize()),
+                                    getPointerTy());
+    IDX = DAG.getNode(ISD::SHL, dl, getPointerTy(), IDX, Scale);
+
+    SDValue res = DAG.getNode(ISD::ADD, dl, getPointerTy(), ThreadPointer, IDX);
+    res = DAG.getLoad(getPointerTy(), dl, Chain, res, MachinePointerInfo(),
+                      false, false, false, 0);
+
+    // Get the offset of start of .tls section
+    SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
+                                             GA->getValueType(0),
+                                             GA->getOffset(), X86II::MO_SECREL);
+    SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), TGA);
+
+    // The address of the thread local variable is the add of the thread
+    // pointer with the offset of the variable.
+    return DAG.getNode(ISD::ADD, dl, getPointerTy(), res, Offset);
+  }
+
+  llvm_unreachable("TLS not implemented for this target.");
 }
 
-/// LowerShiftParts - Lower SRA_PARTS and friends, which return two i32 values and
-/// take a 2 x i32 value to shift plus a shift amount.
-SDValue X86TargetLowering::LowerShiftParts(SDValue Op, SelectionDAG &DAG) const {
+/// LowerShiftParts - Lower SRA_PARTS and friends, which return two i32 values
+/// and take a 2 x i32 value to shift plus a shift amount.
+SDValue X86TargetLowering::LowerShiftParts(SDValue Op, SelectionDAG &DAG) const{
   assert(Op.getNumOperands() == 3 && "Not a double-shift!");
   EVT VT = Op.getValueType();
   unsigned VTBits = VT.getSizeInBits();
@@ -7561,85 +7519,65 @@ SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
 
 // LowerUINT_TO_FP_i64 - 64-bit unsigned integer to double expansion.
 SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG) const {
-  // This algorithm is not obvious. Here it is in C code, more or less:
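Stepping back to the Windows TLS lowering added earlier in this hunk: the mov sequence in its comment computes an address you can also write down in a few lines of C. A hedged sketch of the 64-bit case (MSVC-style intrinsic; SecRelOffset is a stand-in for the variable's SECREL offset, which the linker resolves in the real sequence):

#include <intrin.h>
#include <stdint.h>

extern "C" uint32_t _tls_index; // defined by the MSVC C runtime

// Win64 implicit TLS: gs:[0x58] is the TEB's ThreadLocalStoragePointer,
// _tls_index selects this module's TLS block, and the variable lives at a
// section-relative offset inside that block. (32-bit uses fs:__tls_array.)
static void *implicitTLSAddress(uint32_t SecRelOffset) {
  char **TlsArray = (char **)__readgsqword(0x58);
  return TlsArray[_tls_index] + SecRelOffset;
}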
-/// LowerShiftParts - Lower SRA_PARTS and friends, which return two i32 values and -/// take a 2 x i32 value to shift plus a shift amount. -SDValue X86TargetLowering::LowerShiftParts(SDValue Op, SelectionDAG &DAG) const { +/// LowerShiftParts - Lower SRA_PARTS and friends, which return two i32 values +/// and take a 2 x i32 value to shift plus a shift amount. +SDValue X86TargetLowering::LowerShiftParts(SDValue Op, SelectionDAG &DAG) const{ assert(Op.getNumOperands() == 3 && "Not a double-shift!"); EVT VT = Op.getValueType(); unsigned VTBits = VT.getSizeInBits(); @@ -7561,85 +7519,65 @@ SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain, // LowerUINT_TO_FP_i64 - 64-bit unsigned integer to double expansion. SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG) const { - // This algorithm is not obvious. Here it is in C code, more or less: + // This algorithm is not obvious. Here is what we're trying to output: /* - double uint64_to_double( uint32_t hi, uint32_t lo ) { - static const __m128i exp = { 0x4330000045300000ULL, 0 }; - static const __m128d bias = { 0x1.0p84, 0x1.0p52 }; - - // Copy ints to xmm registers. - __m128i xh = _mm_cvtsi32_si128( hi ); - __m128i xl = _mm_cvtsi32_si128( lo ); - - // Combine into low half of a single xmm register. - __m128i x = _mm_unpacklo_epi32( xh, xl ); - __m128d d; - double sd; - - // Merge in appropriate exponents to give the integer bits the right - // magnitude. - x = _mm_unpacklo_epi32( x, exp ); - - // Subtract away the biases to deal with the IEEE-754 double precision - // implicit 1. - d = _mm_sub_pd( (__m128d) x, bias ); - - // All conversions up to here are exact. The correctly rounded result is - // calculated using the current rounding mode using the following - // horizontal add. - d = _mm_add_sd( d, _mm_unpackhi_pd( d, d ) ); - _mm_store_sd( &sd, d ); // Because we are returning doubles in XMM, this - // store doesn't really need to be here (except - // maybe to zero the other double) - return sd; - } + movq %rax, %xmm0 + punpckldq (c0), %xmm0 // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U } + subpd (c1), %xmm0 // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 } + #ifdef __SSE3__ + haddpd %xmm0, %xmm0 + #else + pshufd $0x4e, %xmm0, %xmm1 + addpd %xmm1, %xmm0 + #endif */ DebugLoc dl = Op.getDebugLoc(); LLVMContext *Context = DAG.getContext(); // Build some magic constants. - SmallVector<Constant*,4> CV0; - CV0.push_back(ConstantInt::get(*Context, APInt(32, 0x45300000))); - CV0.push_back(ConstantInt::get(*Context, APInt(32, 0x43300000))); - CV0.push_back(ConstantInt::get(*Context, APInt(32, 0))); - CV0.push_back(ConstantInt::get(*Context, APInt(32, 0))); - Constant *C0 = ConstantVector::get(CV0); + const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 }; + Constant *C0 = ConstantDataVector::get(*Context, CV0); SDValue CPIdx0 = DAG.getConstantPool(C0, getPointerTy(), 16); SmallVector<Constant*,2> CV1; CV1.push_back( - ConstantFP::get(*Context, APFloat(APInt(64, 0x4530000000000000ULL)))); + ConstantFP::get(*Context, APFloat(APInt(64, 0x4330000000000000ULL)))); CV1.push_back( - ConstantFP::get(*Context, APFloat(APInt(64, 0x4330000000000000ULL)))); + ConstantFP::get(*Context, APFloat(APInt(64, 0x4530000000000000ULL)))); Constant *C1 = ConstantVector::get(CV1); SDValue CPIdx1 = DAG.getConstantPool(C1, getPointerTy(), 16); - SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, - DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, - Op.getOperand(0), - DAG.getIntPtrConstant(1))); - SDValue XR2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, - DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, - Op.getOperand(0), - DAG.getIntPtrConstant(0))); - SDValue Unpck1 = getUnpackl(DAG, dl, MVT::v4i32, XR1, XR2); + // Load the 64-bit value into an XMM register.
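The DAG nodes built below implement the assembly sketch just shown. As a cross-check, here is the same algorithm written with SSE2 intrinsics (a sketch, not LLVM code; it assumes x86-64 and hex-float literal support, and uint64_to_double is my name for it):

  #include <emmintrin.h>

  double uint64_to_double(unsigned long long u) {
    // c0: pair the two 32-bit halves with the exponents 2^52 and 2^84.
    const __m128i exp  = _mm_set_epi32(0, 0, 0x45300000, 0x43300000);
    // c1: the biases to subtract, { 0x1.0p52, 0x1.0p84 }.
    const __m128d bias = _mm_set_pd(0x1.0p84, 0x1.0p52);

    __m128i x = _mm_cvtsi64_si128((long long)u);        // movq
    x = _mm_unpacklo_epi32(x, exp);                     // punpckldq (c0)
    __m128d d = _mm_sub_pd(_mm_castsi128_pd(x), bias);  // subpd (c1)
    d = _mm_add_sd(d, _mm_unpackhi_pd(d, d));           // horizontal add
    return _mm_cvtsd_f64(d);  // lo + hi*2^32, rounded exactly once
  }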
+ SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, + Op.getOperand(0)); SDValue CLod0 = DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0, MachinePointerInfo::getConstantPool(), false, false, false, 16); - SDValue Unpck2 = getUnpackl(DAG, dl, MVT::v4i32, Unpck1, CLod0); - SDValue XR2F = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Unpck2); + SDValue Unpck1 = getUnpackl(DAG, dl, MVT::v4i32, + DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, XR1), + CLod0); + SDValue CLod1 = DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1, MachinePointerInfo::getConstantPool(), false, false, false, 16); + SDValue XR2F = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Unpck1); SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1); + SDValue Result; - // Add the halves; easiest way is to swap them into another reg first. - int ShufMask[2] = { 1, -1 }; - SDValue Shuf = DAG.getVectorShuffle(MVT::v2f64, dl, Sub, - DAG.getUNDEF(MVT::v2f64), ShufMask); - SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuf, Sub); - return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Add, + if (Subtarget->hasSSE3()) { + // FIXME: The 'haddpd' instruction may be slower than 'movhlps + addsd'. + Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub); + } else { + SDValue S2F = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Sub); + SDValue Shuffle = getTargetShuffleNode(X86ISD::PSHUFD, dl, MVT::v4i32, + S2F, 0x4E, DAG); + Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64, + DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Shuffle), + Sub); + } + + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result, DAG.getIntPtrConstant(0)); } @@ -7656,8 +7594,7 @@ SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op, Op.getOperand(0)); // Zero out the upper parts of the register. - Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget->hasXMMInt(), - DAG); + Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG); Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Load), @@ -7709,6 +7646,9 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, return LowerUINT_TO_FP_i64(Op, DAG); else if (SrcVT == MVT::i32 && X86ScalarSSEf64) return LowerUINT_TO_FP_i32(Op, DAG); + else if (Subtarget->is64Bit() && + SrcVT == MVT::i64 && DstVT == MVT::f32) + return SDValue(); // Make a 64-bit buffer, and use it to build an FILD. SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64); @@ -7728,7 +7668,7 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP"); SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), - StackSlot, MachinePointerInfo(), + StackSlot, MachinePointerInfo(), false, false, 0); // For i64 source, we need to add the appropriate power of 2 if the input // was negative. This is the same as the optimization in @@ -7776,19 +7716,19 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, } std::pair<SDValue,SDValue> X86TargetLowering:: -FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, bool IsSigned) const { +FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, bool IsSigned, bool IsReplace) const { DebugLoc DL = Op.getDebugLoc(); EVT DstTy = Op.getValueType(); - if (!IsSigned) { + if (!IsSigned && !isIntegerTypeFTOL(DstTy)) { assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT"); DstTy = MVT::i64; } assert(DstTy.getSimpleVT() <= MVT::i64 && DstTy.getSimpleVT() >= MVT::i16 && - "Unknown FP_TO_SINT to lower!"); + "Unknown FP_TO_INT to lower!"); // These are really Legal. 
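For the scalar SSE cases that the checks below recognize, no expansion is needed at all: the truncating convert is a single instruction, so a plain cast already compiles to it (sketch):

  int f64_to_i32(double d) { return (int)d; }             // cvttsd2si %xmm0, %eax
  long long f32_to_i64(float f) { return (long long)f; }  // cvttss2si %xmm0, %rax (64-bit only)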
if (DstTy == MVT::i32 && @@ -7799,26 +7739,29 @@ FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, bool IsSigned) const { isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType())) return std::make_pair(SDValue(), SDValue()); - // We lower FP->sint64 into FISTP64, followed by a load, all to a temporary - // stack slot. + // We lower FP->int64 either into FISTP64 followed by a load from a temporary + // stack slot, or into the FTOL runtime function. MachineFunction &MF = DAG.getMachineFunction(); unsigned MemSize = DstTy.getSizeInBits()/8; int SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false); SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); - - unsigned Opc; - switch (DstTy.getSimpleVT().SimpleTy) { - default: llvm_unreachable("Invalid FP_TO_SINT to lower!"); - case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break; - case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break; - case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break; - } + unsigned Opc; + if (!IsSigned && isIntegerTypeFTOL(DstTy)) + Opc = X86ISD::WIN_FTOL; + else + switch (DstTy.getSimpleVT().SimpleTy) { + default: llvm_unreachable("Invalid FP_TO_SINT to lower!"); + case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break; + case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break; + case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break; + } SDValue Chain = DAG.getEntryNode(); SDValue Value = Op.getOperand(0); EVT TheVT = Op.getOperand(0).getValueType(); + // FIXME: This causes a redundant load/store if the SSE-class value is + // already in memory, such as if it is on the call stack. if (isScalarFPTypeInSSEReg(TheVT)) { assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!"); Chain = DAG.getStore(Chain, DL, Value, StackSlot, @@ -7843,12 +7786,26 @@ FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, bool IsSigned) const { MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), MachineMemOperand::MOStore, MemSize, MemSize); - // Build the FP_TO_INT*_IN_MEM - SDValue Ops[] = { Chain, Value, StackSlot }; - SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other), - Ops, 3, DstTy, MMO); - - return std::make_pair(FIST, StackSlot); + if (Opc != X86ISD::WIN_FTOL) { + // Build the FP_TO_INT*_IN_MEM + SDValue Ops[] = { Chain, Value, StackSlot }; + SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other), + Ops, 3, DstTy, MMO); + return std::make_pair(FIST, StackSlot); + } else { + SDValue ftol = DAG.getNode(X86ISD::WIN_FTOL, DL, + DAG.getVTList(MVT::Other, MVT::Glue), + Chain, Value); + SDValue eax = DAG.getCopyFromReg(ftol, DL, X86::EAX, + MVT::i32, ftol.getValue(1)); + SDValue edx = DAG.getCopyFromReg(eax.getValue(1), DL, X86::EDX, + MVT::i32, eax.getValue(2)); + SDValue Ops[] = { eax, edx }; + SDValue pair = IsReplace + ? DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops, 2) + : DAG.getMergeValues(Ops, 2, DL); + return std::make_pair(pair, SDValue()); + } } SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op, @@ -7856,27 +7813,37 @@ SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op, if (Op.getValueType().isVector()) return SDValue(); - std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG, true); + std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG, + /*IsSigned=*/ true, /*IsReplace=*/ false); SDValue FIST = Vals.first, StackSlot = Vals.second; // If FP_TO_INTHelper failed, the node is actually supposed to be Legal. if (FIST.getNode() == 0) return Op; - // Load the result.
- return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(), - FIST, StackSlot, MachinePointerInfo(), - false, false, false, 0); + if (StackSlot.getNode()) + // Load the result. + return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(), + FIST, StackSlot, MachinePointerInfo(), + false, false, false, 0); + else + // The node is the result. + return FIST; } SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op, SelectionDAG &DAG) const { - std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG, false); + std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG, + /*IsSigned=*/ false, /*IsReplace=*/ false); SDValue FIST = Vals.first, StackSlot = Vals.second; assert(FIST.getNode() && "Unexpected failure"); - // Load the result. - return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(), - FIST, StackSlot, MachinePointerInfo(), - false, false, false, 0); + if (StackSlot.getNode()) + // Load the result. + return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(), + FIST, StackSlot, MachinePointerInfo(), + false, false, false, 0); + else + // The node is the result. + return FIST; } SDValue X86TargetLowering::LowerFABS(SDValue Op, @@ -7887,15 +7854,14 @@ SDValue X86TargetLowering::LowerFABS(SDValue Op, EVT EltVT = VT; if (VT.isVector()) EltVT = VT.getVectorElementType(); - SmallVector<Constant*,4> CV; + Constant *C; if (EltVT == MVT::f64) { - Constant *C = ConstantFP::get(*Context, APFloat(APInt(64, ~(1ULL << 63)))); - CV.assign(2, C); + C = ConstantVector::getSplat(2, + ConstantFP::get(*Context, APFloat(APInt(64, ~(1ULL << 63))))); } else { - Constant *C = ConstantFP::get(*Context, APFloat(APInt(32, ~(1U << 31)))); - CV.assign(4, C); + C = ConstantVector::getSplat(4, + ConstantFP::get(*Context, APFloat(APInt(32, ~(1U << 31))))); } - Constant *C = ConstantVector::get(CV); SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, MachinePointerInfo::getConstantPool(), @@ -7913,15 +7879,12 @@ SDValue X86TargetLowering::LowerFNEG(SDValue Op, SelectionDAG &DAG) const { EltVT = VT.getVectorElementType(); NumElts = VT.getVectorNumElements(); } - SmallVector<Constant*,8> CV; - if (EltVT == MVT::f64) { - Constant *C = ConstantFP::get(*Context, APFloat(APInt(64, 1ULL << 63))); - CV.assign(NumElts, C); - } else { - Constant *C = ConstantFP::get(*Context, APFloat(APInt(32, 1U << 31))); - CV.assign(NumElts, C); - } - Constant *C = ConstantVector::get(CV); + Constant *C; + if (EltVT == MVT::f64) + C = ConstantFP::get(*Context, APFloat(APInt(64, 1ULL << 63))); + else + C = ConstantFP::get(*Context, APFloat(APInt(32, 1U << 31))); + C = ConstantVector::getSplat(NumElts, C); SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, MachinePointerInfo::getConstantPool(), @@ -8353,9 +8316,8 @@ SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const { if (isFP) { unsigned SSECC = 8; EVT EltVT = Op0.getValueType().getVectorElementType(); - assert(EltVT == MVT::f32 || EltVT == MVT::f64); + assert(EltVT == MVT::f32 || EltVT == MVT::f64); (void)EltVT; - unsigned Opc = EltVT == MVT::f32 ? 
X86ISD::CMPPS : X86ISD::CMPPD; bool Swap = false; // SSE Condition code mapping: @@ -8395,19 +8357,24 @@ SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const { if (SSECC == 8) { if (SetCCOpcode == ISD::SETUEQ) { SDValue UNORD, EQ; - UNORD = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(3, MVT::i8)); - EQ = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(0, MVT::i8)); + UNORD = DAG.getNode(X86ISD::CMPP, dl, VT, Op0, Op1, + DAG.getConstant(3, MVT::i8)); + EQ = DAG.getNode(X86ISD::CMPP, dl, VT, Op0, Op1, + DAG.getConstant(0, MVT::i8)); return DAG.getNode(ISD::OR, dl, VT, UNORD, EQ); } else if (SetCCOpcode == ISD::SETONE) { SDValue ORD, NEQ; - ORD = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(7, MVT::i8)); - NEQ = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(4, MVT::i8)); + ORD = DAG.getNode(X86ISD::CMPP, dl, VT, Op0, Op1, + DAG.getConstant(7, MVT::i8)); + NEQ = DAG.getNode(X86ISD::CMPP, dl, VT, Op0, Op1, + DAG.getConstant(4, MVT::i8)); return DAG.getNode(ISD::AND, dl, VT, ORD, NEQ); } llvm_unreachable("Illegal FP comparison"); } // Handle all other FP comparisons here. - return DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(SSECC, MVT::i8)); + return DAG.getNode(X86ISD::CMPP, dl, VT, Op0, Op1, + DAG.getConstant(SSECC, MVT::i8)); } // Break 256-bit integer vector compare into smaller ones. @@ -8417,38 +8384,30 @@ SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const { // We are handling one of the integer comparisons here. Since SSE only has // GT and EQ comparisons for integer, swapping operands and multiple // operations may be required for some comparisons. - unsigned Opc = 0, EQOpc = 0, GTOpc = 0; + unsigned Opc = 0; bool Swap = false, Invert = false, FlipSigns = false; - switch (VT.getVectorElementType().getSimpleVT().SimpleTy) { - default: break; - case MVT::i8: EQOpc = X86ISD::PCMPEQB; GTOpc = X86ISD::PCMPGTB; break; - case MVT::i16: EQOpc = X86ISD::PCMPEQW; GTOpc = X86ISD::PCMPGTW; break; - case MVT::i32: EQOpc = X86ISD::PCMPEQD; GTOpc = X86ISD::PCMPGTD; break; - case MVT::i64: EQOpc = X86ISD::PCMPEQQ; GTOpc = X86ISD::PCMPGTQ; break; - } - switch (SetCCOpcode) { default: break; case ISD::SETNE: Invert = true; - case ISD::SETEQ: Opc = EQOpc; break; + case ISD::SETEQ: Opc = X86ISD::PCMPEQ; break; case ISD::SETLT: Swap = true; - case ISD::SETGT: Opc = GTOpc; break; + case ISD::SETGT: Opc = X86ISD::PCMPGT; break; case ISD::SETGE: Swap = true; - case ISD::SETLE: Opc = GTOpc; Invert = true; break; + case ISD::SETLE: Opc = X86ISD::PCMPGT; Invert = true; break; case ISD::SETULT: Swap = true; - case ISD::SETUGT: Opc = GTOpc; FlipSigns = true; break; + case ISD::SETUGT: Opc = X86ISD::PCMPGT; FlipSigns = true; break; case ISD::SETUGE: Swap = true; - case ISD::SETULE: Opc = GTOpc; FlipSigns = true; Invert = true; break; + case ISD::SETULE: Opc = X86ISD::PCMPGT; FlipSigns = true; Invert = true; break; } if (Swap) std::swap(Op0, Op1); // Check that the operation in question is available (most are plain SSE2, // but PCMPGTQ and PCMPEQQ have different requirements). 
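SSE has signed integer compares only, so the FlipSigns path below relies on the standard trick that XOR-ing both operands with the sign bit turns an unsigned comparison into a signed one. A scalar sketch of the 32-bit case:

  #include <cstdint>

  // a >u b  <=>  (a ^ 0x80000000) >s (b ^ 0x80000000)
  bool ugt_via_sgt(uint32_t a, uint32_t b) {
    int32_t sa = (int32_t)(a ^ 0x80000000u);
    int32_t sb = (int32_t)(b ^ 0x80000000u);
    return sa > sb;   // what PCMPGT computes after the sign flips
  }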
- if (Opc == X86ISD::PCMPGTQ && !Subtarget->hasSSE42orAVX()) + if (Opc == X86ISD::PCMPGT && VT == MVT::v2i64 && !Subtarget->hasSSE42()) return SDValue(); - if (Opc == X86ISD::PCMPEQQ && !Subtarget->hasSSE41orAVX()) + if (Opc == X86ISD::PCMPEQ && VT == MVT::v2i64 && !Subtarget->hasSSE41()) return SDValue(); // Since SSE has no unsigned integer comparisons, we need to flip the sign @@ -9113,7 +9072,7 @@ SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { assert(!getTargetMachine().Options.UseSoftFloat && !(DAG.getMachineFunction() .getFunction()->hasFnAttr(Attribute::NoImplicitFloat)) && - Subtarget->hasXMM()); + Subtarget->hasSSE1()); } // Insert VAARG_64 node into the DAG @@ -9159,6 +9118,43 @@ SDValue X86TargetLowering::LowerVACOPY(SDValue Op, SelectionDAG &DAG) const { MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV)); } +// getTargetVShiftNode - Handle vector element shifts where the shift amount +// may or may not be a constant. Takes immediate version of shift as input. +static SDValue getTargetVShiftNode(unsigned Opc, DebugLoc dl, EVT VT, + SDValue SrcOp, SDValue ShAmt, + SelectionDAG &DAG) { + assert(ShAmt.getValueType() == MVT::i32 && "ShAmt is not i32"); + + if (isa<ConstantSDNode>(ShAmt)) { + switch (Opc) { + default: llvm_unreachable("Unknown target vector shift node"); + case X86ISD::VSHLI: + case X86ISD::VSRLI: + case X86ISD::VSRAI: + return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt); + } + } + + // Change opcode to non-immediate version + switch (Opc) { + default: llvm_unreachable("Unknown target vector shift node"); + case X86ISD::VSHLI: Opc = X86ISD::VSHL; break; + case X86ISD::VSRLI: Opc = X86ISD::VSRL; break; + case X86ISD::VSRAI: Opc = X86ISD::VSRA; break; + } + + // Need to build a vector containing the shift amount. + // The shift amount is 32 bits, but the SSE instructions read 64 bits, + // so fill with 0. + SDValue ShOps[4]; + ShOps[0] = ShAmt; + ShOps[1] = DAG.getConstant(0, MVT::i32); + ShOps[2] = DAG.getUNDEF(MVT::i32); + ShOps[3] = DAG.getUNDEF(MVT::i32); + ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, &ShOps[0], 4); + ShAmt = DAG.getNode(ISD::BITCAST, dl, VT, ShAmt); + return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt); +} + SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const { DebugLoc dl = Op.getDebugLoc(); @@ -9193,7 +9189,7 @@ X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const unsigned Opc = 0; ISD::CondCode CC = ISD::SETCC_INVALID; switch (IntNo) { - default: break; + default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
case Intrinsic::x86_sse_comieq_ss: case Intrinsic::x86_sse2_comieq_sd: Opc = X86ISD::COMI; @@ -9265,7 +9261,201 @@ X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const DAG.getConstant(X86CC, MVT::i8), Cond); return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); } + // XOP comparison intrinsics + case Intrinsic::x86_xop_vpcomltb: + case Intrinsic::x86_xop_vpcomltw: + case Intrinsic::x86_xop_vpcomltd: + case Intrinsic::x86_xop_vpcomltq: + case Intrinsic::x86_xop_vpcomltub: + case Intrinsic::x86_xop_vpcomltuw: + case Intrinsic::x86_xop_vpcomltud: + case Intrinsic::x86_xop_vpcomltuq: + case Intrinsic::x86_xop_vpcomleb: + case Intrinsic::x86_xop_vpcomlew: + case Intrinsic::x86_xop_vpcomled: + case Intrinsic::x86_xop_vpcomleq: + case Intrinsic::x86_xop_vpcomleub: + case Intrinsic::x86_xop_vpcomleuw: + case Intrinsic::x86_xop_vpcomleud: + case Intrinsic::x86_xop_vpcomleuq: + case Intrinsic::x86_xop_vpcomgtb: + case Intrinsic::x86_xop_vpcomgtw: + case Intrinsic::x86_xop_vpcomgtd: + case Intrinsic::x86_xop_vpcomgtq: + case Intrinsic::x86_xop_vpcomgtub: + case Intrinsic::x86_xop_vpcomgtuw: + case Intrinsic::x86_xop_vpcomgtud: + case Intrinsic::x86_xop_vpcomgtuq: + case Intrinsic::x86_xop_vpcomgeb: + case Intrinsic::x86_xop_vpcomgew: + case Intrinsic::x86_xop_vpcomged: + case Intrinsic::x86_xop_vpcomgeq: + case Intrinsic::x86_xop_vpcomgeub: + case Intrinsic::x86_xop_vpcomgeuw: + case Intrinsic::x86_xop_vpcomgeud: + case Intrinsic::x86_xop_vpcomgeuq: + case Intrinsic::x86_xop_vpcomeqb: + case Intrinsic::x86_xop_vpcomeqw: + case Intrinsic::x86_xop_vpcomeqd: + case Intrinsic::x86_xop_vpcomeqq: + case Intrinsic::x86_xop_vpcomequb: + case Intrinsic::x86_xop_vpcomequw: + case Intrinsic::x86_xop_vpcomequd: + case Intrinsic::x86_xop_vpcomequq: + case Intrinsic::x86_xop_vpcomneb: + case Intrinsic::x86_xop_vpcomnew: + case Intrinsic::x86_xop_vpcomned: + case Intrinsic::x86_xop_vpcomneq: + case Intrinsic::x86_xop_vpcomneub: + case Intrinsic::x86_xop_vpcomneuw: + case Intrinsic::x86_xop_vpcomneud: + case Intrinsic::x86_xop_vpcomneuq: + case Intrinsic::x86_xop_vpcomfalseb: + case Intrinsic::x86_xop_vpcomfalsew: + case Intrinsic::x86_xop_vpcomfalsed: + case Intrinsic::x86_xop_vpcomfalseq: + case Intrinsic::x86_xop_vpcomfalseub: + case Intrinsic::x86_xop_vpcomfalseuw: + case Intrinsic::x86_xop_vpcomfalseud: + case Intrinsic::x86_xop_vpcomfalseuq: + case Intrinsic::x86_xop_vpcomtrueb: + case Intrinsic::x86_xop_vpcomtruew: + case Intrinsic::x86_xop_vpcomtrued: + case Intrinsic::x86_xop_vpcomtrueq: + case Intrinsic::x86_xop_vpcomtrueub: + case Intrinsic::x86_xop_vpcomtrueuw: + case Intrinsic::x86_xop_vpcomtrueud: + case Intrinsic::x86_xop_vpcomtrueuq: { + unsigned CC = 0; + unsigned Opc = 0; + + switch (IntNo) { + default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. 
+ case Intrinsic::x86_xop_vpcomltb: + case Intrinsic::x86_xop_vpcomltw: + case Intrinsic::x86_xop_vpcomltd: + case Intrinsic::x86_xop_vpcomltq: + CC = 0; + Opc = X86ISD::VPCOM; + break; + case Intrinsic::x86_xop_vpcomltub: + case Intrinsic::x86_xop_vpcomltuw: + case Intrinsic::x86_xop_vpcomltud: + case Intrinsic::x86_xop_vpcomltuq: + CC = 0; + Opc = X86ISD::VPCOMU; + break; + case Intrinsic::x86_xop_vpcomleb: + case Intrinsic::x86_xop_vpcomlew: + case Intrinsic::x86_xop_vpcomled: + case Intrinsic::x86_xop_vpcomleq: + CC = 1; + Opc = X86ISD::VPCOM; + break; + case Intrinsic::x86_xop_vpcomleub: + case Intrinsic::x86_xop_vpcomleuw: + case Intrinsic::x86_xop_vpcomleud: + case Intrinsic::x86_xop_vpcomleuq: + CC = 1; + Opc = X86ISD::VPCOMU; + break; + case Intrinsic::x86_xop_vpcomgtb: + case Intrinsic::x86_xop_vpcomgtw: + case Intrinsic::x86_xop_vpcomgtd: + case Intrinsic::x86_xop_vpcomgtq: + CC = 2; + Opc = X86ISD::VPCOM; + break; + case Intrinsic::x86_xop_vpcomgtub: + case Intrinsic::x86_xop_vpcomgtuw: + case Intrinsic::x86_xop_vpcomgtud: + case Intrinsic::x86_xop_vpcomgtuq: + CC = 2; + Opc = X86ISD::VPCOMU; + break; + case Intrinsic::x86_xop_vpcomgeb: + case Intrinsic::x86_xop_vpcomgew: + case Intrinsic::x86_xop_vpcomged: + case Intrinsic::x86_xop_vpcomgeq: + CC = 3; + Opc = X86ISD::VPCOM; + break; + case Intrinsic::x86_xop_vpcomgeub: + case Intrinsic::x86_xop_vpcomgeuw: + case Intrinsic::x86_xop_vpcomgeud: + case Intrinsic::x86_xop_vpcomgeuq: + CC = 3; + Opc = X86ISD::VPCOMU; + break; + case Intrinsic::x86_xop_vpcomeqb: + case Intrinsic::x86_xop_vpcomeqw: + case Intrinsic::x86_xop_vpcomeqd: + case Intrinsic::x86_xop_vpcomeqq: + CC = 4; + Opc = X86ISD::VPCOM; + break; + case Intrinsic::x86_xop_vpcomequb: + case Intrinsic::x86_xop_vpcomequw: + case Intrinsic::x86_xop_vpcomequd: + case Intrinsic::x86_xop_vpcomequq: + CC = 4; + Opc = X86ISD::VPCOMU; + break; + case Intrinsic::x86_xop_vpcomneb: + case Intrinsic::x86_xop_vpcomnew: + case Intrinsic::x86_xop_vpcomned: + case Intrinsic::x86_xop_vpcomneq: + CC = 5; + Opc = X86ISD::VPCOM; + break; + case Intrinsic::x86_xop_vpcomneub: + case Intrinsic::x86_xop_vpcomneuw: + case Intrinsic::x86_xop_vpcomneud: + case Intrinsic::x86_xop_vpcomneuq: + CC = 5; + Opc = X86ISD::VPCOMU; + break; + case Intrinsic::x86_xop_vpcomfalseb: + case Intrinsic::x86_xop_vpcomfalsew: + case Intrinsic::x86_xop_vpcomfalsed: + case Intrinsic::x86_xop_vpcomfalseq: + CC = 6; + Opc = X86ISD::VPCOM; + break; + case Intrinsic::x86_xop_vpcomfalseub: + case Intrinsic::x86_xop_vpcomfalseuw: + case Intrinsic::x86_xop_vpcomfalseud: + case Intrinsic::x86_xop_vpcomfalseuq: + CC = 6; + Opc = X86ISD::VPCOMU; + break; + case Intrinsic::x86_xop_vpcomtrueb: + case Intrinsic::x86_xop_vpcomtruew: + case Intrinsic::x86_xop_vpcomtrued: + case Intrinsic::x86_xop_vpcomtrueq: + CC = 7; + Opc = X86ISD::VPCOM; + break; + case Intrinsic::x86_xop_vpcomtrueub: + case Intrinsic::x86_xop_vpcomtrueuw: + case Intrinsic::x86_xop_vpcomtrueud: + case Intrinsic::x86_xop_vpcomtrueuq: + CC = 7; + Opc = X86ISD::VPCOMU; + break; + } + + SDValue LHS = Op.getOperand(1); + SDValue RHS = Op.getOperand(2); + return DAG.getNode(Opc, dl, Op.getValueType(), LHS, RHS, + DAG.getConstant(CC, MVT::i8)); + } + // Arithmetic intrinsics. 
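The first arithmetic cases below map pmulu.dq straight onto the PMULUDQ node, which multiplies the low 32 bits of each 64-bit lane into a full 64-bit product. That same primitive is what LowerMUL, further down, uses to rebuild a complete 64-bit multiply; a scalar model of the decomposition (a sketch, with my own names):

  #include <cstdint>

  uint64_t mul64_from_pmuludq(uint64_t a, uint64_t b) {
    uint64_t Ahi = a >> 32, Bhi = b >> 32;                    // psrlqi(x, 32)
    uint64_t AloBlo = (uint64_t)(uint32_t)a * (uint32_t)b;    // pmuludq(a, b)
    uint64_t AloBhi = (uint64_t)(uint32_t)a * (uint32_t)Bhi;  // pmuludq(a, Bhi)
    uint64_t AhiBlo = (uint64_t)(uint32_t)Ahi * (uint32_t)b;  // pmuludq(Ahi, b)
    // The Ahi*Bhi term would land at bit 64 and vanishes mod 2^64.
    return AloBlo + (AloBhi << 32) + (AhiBlo << 32);          // psllqi + add
  }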
+ case Intrinsic::x86_sse2_pmulu_dq: + case Intrinsic::x86_avx2_pmulu_dq: + return DAG.getNode(X86ISD::PMULUDQ, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); case Intrinsic::x86_sse3_hadd_ps: case Intrinsic::x86_sse3_hadd_pd: case Intrinsic::x86_avx_hadd_ps_256: @@ -9278,6 +9468,18 @@ X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const case Intrinsic::x86_avx_hsub_pd_256: return DAG.getNode(X86ISD::FHSUB, dl, Op.getValueType(), Op.getOperand(1), Op.getOperand(2)); + case Intrinsic::x86_ssse3_phadd_w_128: + case Intrinsic::x86_ssse3_phadd_d_128: + case Intrinsic::x86_avx2_phadd_w: + case Intrinsic::x86_avx2_phadd_d: + return DAG.getNode(X86ISD::HADD, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); + case Intrinsic::x86_ssse3_phsub_w_128: + case Intrinsic::x86_ssse3_phsub_d_128: + case Intrinsic::x86_avx2_phsub_w: + case Intrinsic::x86_avx2_phsub_d: + return DAG.getNode(X86ISD::HSUB, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); case Intrinsic::x86_avx2_psllv_d: case Intrinsic::x86_avx2_psllv_q: case Intrinsic::x86_avx2_psllv_d_256: @@ -9294,6 +9496,33 @@ X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const case Intrinsic::x86_avx2_psrav_d_256: return DAG.getNode(ISD::SRA, dl, Op.getValueType(), Op.getOperand(1), Op.getOperand(2)); + case Intrinsic::x86_ssse3_pshuf_b_128: + case Intrinsic::x86_avx2_pshuf_b: + return DAG.getNode(X86ISD::PSHUFB, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); + case Intrinsic::x86_ssse3_psign_b_128: + case Intrinsic::x86_ssse3_psign_w_128: + case Intrinsic::x86_ssse3_psign_d_128: + case Intrinsic::x86_avx2_psign_b: + case Intrinsic::x86_avx2_psign_w: + case Intrinsic::x86_avx2_psign_d: + return DAG.getNode(X86ISD::PSIGN, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); + case Intrinsic::x86_sse41_insertps: + return DAG.getNode(X86ISD::INSERTPS, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); + case Intrinsic::x86_avx_vperm2f128_ps_256: + case Intrinsic::x86_avx_vperm2f128_pd_256: + case Intrinsic::x86_avx_vperm2f128_si_256: + case Intrinsic::x86_avx2_vperm2i128: + return DAG.getNode(X86ISD::VPERM2X128, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); + case Intrinsic::x86_avx_vpermil_ps: + case Intrinsic::x86_avx_vpermil_pd: + case Intrinsic::x86_avx_vpermil_ps_256: + case Intrinsic::x86_avx_vpermil_pd_256: + return DAG.getNode(X86ISD::VPERMILP, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); // ptest and testp intrinsics. The intrinsic these come from are designed to // return an integer value, not just an instruction so lower it to the ptest @@ -9361,24 +9590,53 @@ X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); } - // Fix vector shift instructions where the last operand is a non-immediate - // i32 value. 
- case Intrinsic::x86_avx2_pslli_w: - case Intrinsic::x86_avx2_pslli_d: - case Intrinsic::x86_avx2_pslli_q: - case Intrinsic::x86_avx2_psrli_w: - case Intrinsic::x86_avx2_psrli_d: - case Intrinsic::x86_avx2_psrli_q: - case Intrinsic::x86_avx2_psrai_w: - case Intrinsic::x86_avx2_psrai_d: + // SSE/AVX shift intrinsics + case Intrinsic::x86_sse2_psll_w: + case Intrinsic::x86_sse2_psll_d: + case Intrinsic::x86_sse2_psll_q: + case Intrinsic::x86_avx2_psll_w: + case Intrinsic::x86_avx2_psll_d: + case Intrinsic::x86_avx2_psll_q: + return DAG.getNode(X86ISD::VSHL, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); + case Intrinsic::x86_sse2_psrl_w: + case Intrinsic::x86_sse2_psrl_d: + case Intrinsic::x86_sse2_psrl_q: + case Intrinsic::x86_avx2_psrl_w: + case Intrinsic::x86_avx2_psrl_d: + case Intrinsic::x86_avx2_psrl_q: + return DAG.getNode(X86ISD::VSRL, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); + case Intrinsic::x86_sse2_psra_w: + case Intrinsic::x86_sse2_psra_d: + case Intrinsic::x86_avx2_psra_w: + case Intrinsic::x86_avx2_psra_d: + return DAG.getNode(X86ISD::VSRA, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); case Intrinsic::x86_sse2_pslli_w: case Intrinsic::x86_sse2_pslli_d: case Intrinsic::x86_sse2_pslli_q: + case Intrinsic::x86_avx2_pslli_w: + case Intrinsic::x86_avx2_pslli_d: + case Intrinsic::x86_avx2_pslli_q: + return getTargetVShiftNode(X86ISD::VSHLI, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2), DAG); case Intrinsic::x86_sse2_psrli_w: case Intrinsic::x86_sse2_psrli_d: case Intrinsic::x86_sse2_psrli_q: + case Intrinsic::x86_avx2_psrli_w: + case Intrinsic::x86_avx2_psrli_d: + case Intrinsic::x86_avx2_psrli_q: + return getTargetVShiftNode(X86ISD::VSRLI, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2), DAG); case Intrinsic::x86_sse2_psrai_w: case Intrinsic::x86_sse2_psrai_d: + case Intrinsic::x86_avx2_psrai_w: + case Intrinsic::x86_avx2_psrai_d: + return getTargetVShiftNode(X86ISD::VSRAI, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2), DAG); + // Fix vector shift instructions where the last operand is a non-immediate + // i32 value. 
case Intrinsic::x86_mmx_pslli_w: case Intrinsic::x86_mmx_pslli_d: case Intrinsic::x86_mmx_pslli_q: @@ -9392,103 +9650,40 @@ X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const return SDValue(); unsigned NewIntNo = 0; - EVT ShAmtVT = MVT::v4i32; switch (IntNo) { - case Intrinsic::x86_sse2_pslli_w: - NewIntNo = Intrinsic::x86_sse2_psll_w; - break; - case Intrinsic::x86_sse2_pslli_d: - NewIntNo = Intrinsic::x86_sse2_psll_d; - break; - case Intrinsic::x86_sse2_pslli_q: - NewIntNo = Intrinsic::x86_sse2_psll_q; - break; - case Intrinsic::x86_sse2_psrli_w: - NewIntNo = Intrinsic::x86_sse2_psrl_w; - break; - case Intrinsic::x86_sse2_psrli_d: - NewIntNo = Intrinsic::x86_sse2_psrl_d; - break; - case Intrinsic::x86_sse2_psrli_q: - NewIntNo = Intrinsic::x86_sse2_psrl_q; - break; - case Intrinsic::x86_sse2_psrai_w: - NewIntNo = Intrinsic::x86_sse2_psra_w; + case Intrinsic::x86_mmx_pslli_w: + NewIntNo = Intrinsic::x86_mmx_psll_w; break; - case Intrinsic::x86_sse2_psrai_d: - NewIntNo = Intrinsic::x86_sse2_psra_d; + case Intrinsic::x86_mmx_pslli_d: + NewIntNo = Intrinsic::x86_mmx_psll_d; break; - case Intrinsic::x86_avx2_pslli_w: - NewIntNo = Intrinsic::x86_avx2_psll_w; + case Intrinsic::x86_mmx_pslli_q: + NewIntNo = Intrinsic::x86_mmx_psll_q; break; - case Intrinsic::x86_avx2_pslli_d: - NewIntNo = Intrinsic::x86_avx2_psll_d; + case Intrinsic::x86_mmx_psrli_w: + NewIntNo = Intrinsic::x86_mmx_psrl_w; break; - case Intrinsic::x86_avx2_pslli_q: - NewIntNo = Intrinsic::x86_avx2_psll_q; + case Intrinsic::x86_mmx_psrli_d: + NewIntNo = Intrinsic::x86_mmx_psrl_d; break; - case Intrinsic::x86_avx2_psrli_w: - NewIntNo = Intrinsic::x86_avx2_psrl_w; + case Intrinsic::x86_mmx_psrli_q: + NewIntNo = Intrinsic::x86_mmx_psrl_q; break; - case Intrinsic::x86_avx2_psrli_d: - NewIntNo = Intrinsic::x86_avx2_psrl_d; + case Intrinsic::x86_mmx_psrai_w: + NewIntNo = Intrinsic::x86_mmx_psra_w; break; - case Intrinsic::x86_avx2_psrli_q: - NewIntNo = Intrinsic::x86_avx2_psrl_q; - break; - case Intrinsic::x86_avx2_psrai_w: - NewIntNo = Intrinsic::x86_avx2_psra_w; - break; - case Intrinsic::x86_avx2_psrai_d: - NewIntNo = Intrinsic::x86_avx2_psra_d; - break; - default: { - ShAmtVT = MVT::v2i32; - switch (IntNo) { - case Intrinsic::x86_mmx_pslli_w: - NewIntNo = Intrinsic::x86_mmx_psll_w; - break; - case Intrinsic::x86_mmx_pslli_d: - NewIntNo = Intrinsic::x86_mmx_psll_d; - break; - case Intrinsic::x86_mmx_pslli_q: - NewIntNo = Intrinsic::x86_mmx_psll_q; - break; - case Intrinsic::x86_mmx_psrli_w: - NewIntNo = Intrinsic::x86_mmx_psrl_w; - break; - case Intrinsic::x86_mmx_psrli_d: - NewIntNo = Intrinsic::x86_mmx_psrl_d; - break; - case Intrinsic::x86_mmx_psrli_q: - NewIntNo = Intrinsic::x86_mmx_psrl_q; - break; - case Intrinsic::x86_mmx_psrai_w: - NewIntNo = Intrinsic::x86_mmx_psra_w; - break; - case Intrinsic::x86_mmx_psrai_d: - NewIntNo = Intrinsic::x86_mmx_psra_d; - break; - default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. - } + case Intrinsic::x86_mmx_psrai_d: + NewIntNo = Intrinsic::x86_mmx_psra_d; break; - } + default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. } // The vector shift intrinsics with scalars use 32-bit shift amounts, but // the sse2/mmx shift instructions read 64 bits. Set the upper 32 bits // to be zero.
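The BUILD_VECTOR constructed below exists because the scalar intrinsics carry an i32 count while the hardware instruction consumes a full 64-bit count from the low quadword of the vector register, so the upper half must be zeros rather than garbage. The SSE2 equivalent, as a sketch (the MMX path is the same idea at 64 bits):

  #include <emmintrin.h>

  __m128i shl16_by_var(__m128i v, int count) {
    // _mm_cvtsi32_si128 zero-extends: { count, 0, 0, 0 }
    __m128i amt = _mm_cvtsi32_si128(count);
    return _mm_sll_epi16(v, amt);   // psllw reads 64 bits of count
  }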
- SDValue ShOps[4]; - ShOps[0] = ShAmt; - ShOps[1] = DAG.getConstant(0, MVT::i32); - if (ShAmtVT == MVT::v4i32) { - ShOps[2] = DAG.getUNDEF(MVT::i32); - ShOps[3] = DAG.getUNDEF(MVT::i32); - ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, ShAmtVT, &ShOps[0], 4); - } else { - ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, ShAmtVT, &ShOps[0], 2); + ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i32, ShAmt, + DAG.getConstant(0, MVT::i32)); // FIXME this must be lowered to get rid of the invalid type. - } EVT VT = Op.getValueType(); ShAmt = DAG.getNode(ISD::BITCAST, dl, VT, ShAmt); @@ -9828,7 +10023,8 @@ SDValue X86TargetLowering::LowerCTLZ(SDValue Op, SelectionDAG &DAG) const { return Op; } -SDValue X86TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) const { +SDValue X86TargetLowering::LowerCTLZ_ZERO_UNDEF(SDValue Op, + SelectionDAG &DAG) const { EVT VT = Op.getValueType(); EVT OpVT = VT; unsigned NumBits = VT.getSizeInBits(); @@ -9836,26 +10032,41 @@ SDValue X86TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) const { Op = Op.getOperand(0); if (VT == MVT::i8) { + // Zero extend to i32 since there is not an i8 bsr. OpVT = MVT::i32; Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op); } - // Issue a bsf (scan bits forward) which also sets EFLAGS. + // Issue a bsr (scan bits in reverse). SDVTList VTs = DAG.getVTList(OpVT, MVT::i32); + Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op); + + // And xor with NumBits-1. + Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, DAG.getConstant(NumBits-1, OpVT)); + + if (VT == MVT::i8) + Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op); + return Op; +} + +SDValue X86TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) const { + EVT VT = Op.getValueType(); + unsigned NumBits = VT.getSizeInBits(); + DebugLoc dl = Op.getDebugLoc(); + Op = Op.getOperand(0); + + // Issue a bsf (scan bits forward) which also sets EFLAGS. + SDVTList VTs = DAG.getVTList(VT, MVT::i32); Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op); // If src is zero (i.e. bsf sets ZF), returns NumBits. 
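Taken together, the BSR/BSF lowerings here and the CMOV assembled below amount to this scalar logic (a sketch; __builtin_clz/__builtin_ctz stand in for the bsr/bsf results, which the hardware leaves undefined for a zero input):

  #include <cstdint>

  unsigned ctlz32_zero_undef(uint32_t x) {  // caller guarantees x != 0
    unsigned bsr = 31 - __builtin_clz(x);   // index of the highest set bit
    return bsr ^ 31;                        // n ^ 31 == 31 - n for 0 <= n <= 31
  }

  unsigned cttz32(uint32_t x) {
    if (x == 0) return 32;                  // the CMOV on ZF
    return __builtin_ctz(x);                // bsf
  }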
SDValue Ops[] = { Op, - DAG.getConstant(NumBits, OpVT), + DAG.getConstant(NumBits, VT), DAG.getConstant(X86::COND_E, MVT::i8), Op.getValue(1) }; - Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops, array_lengthof(Ops)); - - if (VT == MVT::i8) - Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op); - return Op; + return DAG.getNode(X86ISD::CMOV, dl, VT, Ops, array_lengthof(Ops)); } // Lower256IntArith - Break a 256-bit integer operation into two new 128-bit @@ -9910,86 +10121,46 @@ SDValue X86TargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const { if (VT.getSizeInBits() == 256 && !Subtarget->hasAVX2()) return Lower256IntArith(Op, DAG); + assert((VT == MVT::v2i64 || VT == MVT::v4i64) && + "Only know how to lower V2I64/V4I64 multiply"); + DebugLoc dl = Op.getDebugLoc(); + // Ahi = psrlqi(a, 32); + // Bhi = psrlqi(b, 32); + // + // AloBlo = pmuludq(a, b); + // AloBhi = pmuludq(a, Bhi); + // AhiBlo = pmuludq(Ahi, b); + + // AloBhi = psllqi(AloBhi, 32); + // AhiBlo = psllqi(AhiBlo, 32); + // return AloBlo + AloBhi + AhiBlo; + SDValue A = Op.getOperand(0); SDValue B = Op.getOperand(1); - if (VT == MVT::v4i64) { - assert(Subtarget->hasAVX2() && "Lowering v4i64 multiply requires AVX2"); + SDValue ShAmt = DAG.getConstant(32, MVT::i32); - // ulong2 Ahi = __builtin_ia32_psrlqi256( a, 32); - // ulong2 Bhi = __builtin_ia32_psrlqi256( b, 32); - // ulong2 AloBlo = __builtin_ia32_pmuludq256( a, b ); - // ulong2 AloBhi = __builtin_ia32_pmuludq256( a, Bhi ); - // ulong2 AhiBlo = __builtin_ia32_pmuludq256( Ahi, b ); - // - // AloBhi = __builtin_ia32_psllqi256( AloBhi, 32 ); - // AhiBlo = __builtin_ia32_psllqi256( AhiBlo, 32 ); - // return AloBlo + AloBhi + AhiBlo; - - SDValue Ahi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, - DAG.getConstant(Intrinsic::x86_avx2_psrli_q, MVT::i32), - A, DAG.getConstant(32, MVT::i32)); - SDValue Bhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, - DAG.getConstant(Intrinsic::x86_avx2_psrli_q, MVT::i32), - B, DAG.getConstant(32, MVT::i32)); - SDValue AloBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, - DAG.getConstant(Intrinsic::x86_avx2_pmulu_dq, MVT::i32), - A, B); - SDValue AloBhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, - DAG.getConstant(Intrinsic::x86_avx2_pmulu_dq, MVT::i32), - A, Bhi); - SDValue AhiBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, - DAG.getConstant(Intrinsic::x86_avx2_pmulu_dq, MVT::i32), - Ahi, B); - AloBhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, - DAG.getConstant(Intrinsic::x86_avx2_pslli_q, MVT::i32), - AloBhi, DAG.getConstant(32, MVT::i32)); - AhiBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, - DAG.getConstant(Intrinsic::x86_avx2_pslli_q, MVT::i32), - AhiBlo, DAG.getConstant(32, MVT::i32)); - SDValue Res = DAG.getNode(ISD::ADD, dl, VT, AloBlo, AloBhi); - Res = DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo); - return Res; - } + SDValue Ahi = DAG.getNode(X86ISD::VSRLI, dl, VT, A, ShAmt); + SDValue Bhi = DAG.getNode(X86ISD::VSRLI, dl, VT, B, ShAmt); - assert(VT == MVT::v2i64 && "Only know how to lower V2I64 multiply"); + // Bit cast to 32-bit vectors for MULUDQ + EVT MulVT = (VT == MVT::v2i64) ? 
MVT::v4i32 : MVT::v8i32; + A = DAG.getNode(ISD::BITCAST, dl, MulVT, A); + B = DAG.getNode(ISD::BITCAST, dl, MulVT, B); + Ahi = DAG.getNode(ISD::BITCAST, dl, MulVT, Ahi); + Bhi = DAG.getNode(ISD::BITCAST, dl, MulVT, Bhi); - // ulong2 Ahi = __builtin_ia32_psrlqi128( a, 32); - // ulong2 Bhi = __builtin_ia32_psrlqi128( b, 32); - // ulong2 AloBlo = __builtin_ia32_pmuludq128( a, b ); - // ulong2 AloBhi = __builtin_ia32_pmuludq128( a, Bhi ); - // ulong2 AhiBlo = __builtin_ia32_pmuludq128( Ahi, b ); - // - // AloBhi = __builtin_ia32_psllqi128( AloBhi, 32 ); - // AhiBlo = __builtin_ia32_psllqi128( AhiBlo, 32 ); - // return AloBlo + AloBhi + AhiBlo; + SDValue AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, B); + SDValue AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi); + SDValue AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B); + + AloBhi = DAG.getNode(X86ISD::VSHLI, dl, VT, AloBhi, ShAmt); + AhiBlo = DAG.getNode(X86ISD::VSHLI, dl, VT, AhiBlo, ShAmt); - SDValue Ahi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, - DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32), - A, DAG.getConstant(32, MVT::i32)); - SDValue Bhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, - DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32), - B, DAG.getConstant(32, MVT::i32)); - SDValue AloBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, - DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32), - A, B); - SDValue AloBhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, - DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32), - A, Bhi); - SDValue AhiBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, - DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32), - Ahi, B); - AloBhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, - DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32), - AloBhi, DAG.getConstant(32, MVT::i32)); - AhiBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, - DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32), - AhiBlo, DAG.getConstant(32, MVT::i32)); SDValue Res = DAG.getNode(ISD::ADD, dl, VT, AloBlo, AloBhi); - Res = DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo); - return Res; + return DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo); } SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const { @@ -10000,7 +10171,7 @@ SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const { SDValue Amt = Op.getOperand(1); LLVMContext *Context = DAG.getContext(); - if (!Subtarget->hasXMMInt()) + if (!Subtarget->hasSSE2()) return SDValue(); // Optimize shl/srl/sra with constant shift amount. @@ -10009,119 +10180,93 @@ SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const { if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(SclrAmt)) { uint64_t ShiftAmt = C->getZExtValue(); - if (VT == MVT::v16i8 && Op.getOpcode() == ISD::SHL) { - // Make a large shift. - SDValue SHL = - DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, - DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), - R, DAG.getConstant(ShiftAmt, MVT::i32)); - // Zero out the rightmost bits. 
- SmallVector<SDValue, 16> V(16, DAG.getConstant(uint8_t(-1U << ShiftAmt), - MVT::i8)); - return DAG.getNode(ISD::AND, dl, VT, SHL, - DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 16)); - } - - if (VT == MVT::v2i64 && Op.getOpcode() == ISD::SHL) - return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, - DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32), - R, DAG.getConstant(ShiftAmt, MVT::i32)); - - if (VT == MVT::v4i32 && Op.getOpcode() == ISD::SHL) - return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, - DAG.getConstant(Intrinsic::x86_sse2_pslli_d, MVT::i32), - R, DAG.getConstant(ShiftAmt, MVT::i32)); - - if (VT == MVT::v8i16 && Op.getOpcode() == ISD::SHL) - return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, - DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), - R, DAG.getConstant(ShiftAmt, MVT::i32)); - - if (VT == MVT::v16i8 && Op.getOpcode() == ISD::SRL) { - // Make a large shift. - SDValue SRL = - DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, - DAG.getConstant(Intrinsic::x86_sse2_psrli_w, MVT::i32), - R, DAG.getConstant(ShiftAmt, MVT::i32)); - // Zero out the leftmost bits. - SmallVector<SDValue, 16> V(16, DAG.getConstant(uint8_t(-1U) >> ShiftAmt, - MVT::i8)); - return DAG.getNode(ISD::AND, dl, VT, SRL, - DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 16)); + if (VT == MVT::v2i64 || VT == MVT::v4i32 || VT == MVT::v8i16 || + (Subtarget->hasAVX2() && + (VT == MVT::v4i64 || VT == MVT::v8i32 || VT == MVT::v16i16))) { + if (Op.getOpcode() == ISD::SHL) + return DAG.getNode(X86ISD::VSHLI, dl, VT, R, + DAG.getConstant(ShiftAmt, MVT::i32)); + if (Op.getOpcode() == ISD::SRL) + return DAG.getNode(X86ISD::VSRLI, dl, VT, R, + DAG.getConstant(ShiftAmt, MVT::i32)); + if (Op.getOpcode() == ISD::SRA && VT != MVT::v2i64 && VT != MVT::v4i64) + return DAG.getNode(X86ISD::VSRAI, dl, VT, R, + DAG.getConstant(ShiftAmt, MVT::i32)); } - if (VT == MVT::v2i64 && Op.getOpcode() == ISD::SRL) - return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, - DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32), - R, DAG.getConstant(ShiftAmt, MVT::i32)); - - if (VT == MVT::v4i32 && Op.getOpcode() == ISD::SRL) - return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, - DAG.getConstant(Intrinsic::x86_sse2_psrli_d, MVT::i32), - R, DAG.getConstant(ShiftAmt, MVT::i32)); - - if (VT == MVT::v8i16 && Op.getOpcode() == ISD::SRL) - return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, - DAG.getConstant(Intrinsic::x86_sse2_psrli_w, MVT::i32), - R, DAG.getConstant(ShiftAmt, MVT::i32)); - - if (VT == MVT::v4i32 && Op.getOpcode() == ISD::SRA) - return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, - DAG.getConstant(Intrinsic::x86_sse2_psrai_d, MVT::i32), - R, DAG.getConstant(ShiftAmt, MVT::i32)); - - if (VT == MVT::v8i16 && Op.getOpcode() == ISD::SRA) - return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, - DAG.getConstant(Intrinsic::x86_sse2_psrai_w, MVT::i32), - R, DAG.getConstant(ShiftAmt, MVT::i32)); - - if (VT == MVT::v16i8 && Op.getOpcode() == ISD::SRA) { - if (ShiftAmt == 7) { - // R s>> 7 === R s< 0 - SDValue Zeros = getZeroVector(VT, true /* HasXMMInt */, DAG, dl); - return DAG.getNode(X86ISD::PCMPGTB, dl, VT, Zeros, R); + if (VT == MVT::v16i8) { + if (Op.getOpcode() == ISD::SHL) { + // Make a large shift. + SDValue SHL = DAG.getNode(X86ISD::VSHLI, dl, MVT::v8i16, R, + DAG.getConstant(ShiftAmt, MVT::i32)); + SHL = DAG.getNode(ISD::BITCAST, dl, VT, SHL); + // Zero out the rightmost bits. 
+ SmallVector<SDValue, 16> V(16, + DAG.getConstant(uint8_t(-1U << ShiftAmt), + MVT::i8)); + return DAG.getNode(ISD::AND, dl, VT, SHL, + DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 16)); + } + if (Op.getOpcode() == ISD::SRL) { + // Make a large shift. + SDValue SRL = DAG.getNode(X86ISD::VSRLI, dl, MVT::v8i16, R, + DAG.getConstant(ShiftAmt, MVT::i32)); + SRL = DAG.getNode(ISD::BITCAST, dl, VT, SRL); + // Zero out the leftmost bits. + SmallVector<SDValue, 16> V(16, + DAG.getConstant(uint8_t(-1U) >> ShiftAmt, + MVT::i8)); + return DAG.getNode(ISD::AND, dl, VT, SRL, + DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 16)); } + if (Op.getOpcode() == ISD::SRA) { + if (ShiftAmt == 7) { + // R s>> 7 === R s< 0 + SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl); + return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R); + } - // R s>> a === ((R u>> a) ^ m) - m - SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt); - SmallVector<SDValue, 16> V(16, DAG.getConstant(128 >> ShiftAmt, - MVT::i8)); - SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 16); - Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask); - Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask); - return Res; + // R s>> a === ((R u>> a) ^ m) - m + SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt); + SmallVector<SDValue, 16> V(16, DAG.getConstant(128 >> ShiftAmt, + MVT::i8)); + SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 16); + Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask); + Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask); + return Res; + } } if (Subtarget->hasAVX2() && VT == MVT::v32i8) { if (Op.getOpcode() == ISD::SHL) { // Make a large shift. - SDValue SHL = - DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, - DAG.getConstant(Intrinsic::x86_avx2_pslli_w, MVT::i32), - R, DAG.getConstant(ShiftAmt, MVT::i32)); + SDValue SHL = DAG.getNode(X86ISD::VSHLI, dl, MVT::v16i16, R, + DAG.getConstant(ShiftAmt, MVT::i32)); + SHL = DAG.getNode(ISD::BITCAST, dl, VT, SHL); // Zero out the rightmost bits. - SmallVector<SDValue, 32> V(32, DAG.getConstant(uint8_t(-1U << ShiftAmt), - MVT::i8)); + SmallVector<SDValue, 32> V(32, + DAG.getConstant(uint8_t(-1U << ShiftAmt), + MVT::i8)); return DAG.getNode(ISD::AND, dl, VT, SHL, DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 32)); } if (Op.getOpcode() == ISD::SRL) { // Make a large shift. - SDValue SRL = - DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, - DAG.getConstant(Intrinsic::x86_avx2_psrli_w, MVT::i32), - R, DAG.getConstant(ShiftAmt, MVT::i32)); + SDValue SRL = DAG.getNode(X86ISD::VSRLI, dl, MVT::v16i16, R, + DAG.getConstant(ShiftAmt, MVT::i32)); + SRL = DAG.getNode(ISD::BITCAST, dl, VT, SRL); // Zero out the leftmost bits. - SmallVector<SDValue, 32> V(32, DAG.getConstant(uint8_t(-1U) >> ShiftAmt, - MVT::i8)); + SmallVector<SDValue, 32> V(32, + DAG.getConstant(uint8_t(-1U) >> ShiftAmt, + MVT::i8)); return DAG.getNode(ISD::AND, dl, VT, SRL, DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 32)); } if (Op.getOpcode() == ISD::SRA) { if (ShiftAmt == 7) { // R s>> 7 === R s< 0 - SDValue Zeros = getZeroVector(VT, true /* HasXMMInt */, DAG, dl); - return DAG.getNode(X86ISD::PCMPGTB, dl, VT, Zeros, R); + SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl); + return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R); } // R s>> a === ((R u>> a) ^ m) - m @@ -10139,14 +10284,11 @@ SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const { // Lower SHL with variable shift amount. 
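The v4i32 path below synthesizes 2^amt per lane by forging an IEEE single: the shift amount is moved into the exponent field (shift left by 23) and the bias 0x3f800000 (1.0f) is added, so the bit pattern reads as the float 2^amt; a truncating convert recovers it as an integer, and a multiply finishes the shift. A scalar sketch of one lane:

  #include <cstdint>
  #include <cstring>

  uint32_t shl_via_fp_exponent(uint32_t x, uint32_t amt) {  // amt in [0, 31]
    uint32_t bits = (amt << 23) + 0x3f800000u;  // exponent becomes 127 + amt
    float p2;
    std::memcpy(&p2, &bits, sizeof p2);         // bitcast: p2 == 2.0f^amt
    return x * (uint32_t)p2;                    // x << amt, mod 2^32
  }

The DAG version uses a signed FP_TO_SINT; for amt == 31, cvttps2dq yields the integer-indefinite value 0x80000000, which happens to be exactly the 2^31 bit pattern the multiply needs, so the trick holds across the full range.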
if (VT == MVT::v4i32 && Op->getOpcode() == ISD::SHL) { - Op = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, - DAG.getConstant(Intrinsic::x86_sse2_pslli_d, MVT::i32), - Op.getOperand(1), DAG.getConstant(23, MVT::i32)); + Op = DAG.getNode(X86ISD::VSHLI, dl, VT, Op.getOperand(1), + DAG.getConstant(23, MVT::i32)); - ConstantInt *CI = ConstantInt::get(*Context, APInt(32, 0x3f800000U)); - - std::vector<Constant*> CV(4, CI); - Constant *C = ConstantVector::get(CV); + const uint32_t CV[] = { 0x3f800000U, 0x3f800000U, 0x3f800000U, 0x3f800000U}; + Constant *C = ConstantDataVector::get(*Context, CV); SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); SDValue Addend = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, MachinePointerInfo::getConstantPool(), @@ -10158,55 +10300,54 @@ SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const { return DAG.getNode(ISD::MUL, dl, VT, Op, R); } if (VT == MVT::v16i8 && Op->getOpcode() == ISD::SHL) { + assert(Subtarget->hasSSE2() && "Need SSE2 for pslli/pcmpeq."); + // a = a << 5; - Op = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, - DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), - Op.getOperand(1), DAG.getConstant(5, MVT::i32)); + Op = DAG.getNode(X86ISD::VSHLI, dl, MVT::v8i16, Op.getOperand(1), + DAG.getConstant(5, MVT::i32)); + Op = DAG.getNode(ISD::BITCAST, dl, VT, Op); - ConstantInt *CM1 = ConstantInt::get(*Context, APInt(8, 15)); - ConstantInt *CM2 = ConstantInt::get(*Context, APInt(8, 63)); + // Turn 'a' into a mask suitable for VSELECT + SDValue VSelM = DAG.getConstant(0x80, VT); + SDValue OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op); + OpVSel = DAG.getNode(X86ISD::PCMPEQ, dl, VT, OpVSel, VSelM); + + SDValue CM1 = DAG.getConstant(0x0f, VT); + SDValue CM2 = DAG.getConstant(0x3f, VT); + + // r = VSELECT(r, psllw(r & (char16)15, 4), a); + SDValue M = DAG.getNode(ISD::AND, dl, VT, R, CM1); + M = getTargetVShiftNode(X86ISD::VSHLI, dl, MVT::v8i16, M, + DAG.getConstant(4, MVT::i32), DAG); + M = DAG.getNode(ISD::BITCAST, dl, VT, M); + R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel, M, R); - std::vector<Constant*> CVM1(16, CM1); - std::vector<Constant*> CVM2(16, CM2); - Constant *C = ConstantVector::get(CVM1); - SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); - SDValue M = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, - MachinePointerInfo::getConstantPool(), - false, false, false, 16); - - // r = pblendv(r, psllw(r & (char16)15, 4), a); - M = DAG.getNode(ISD::AND, dl, VT, R, M); - M = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, - DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), M, - DAG.getConstant(4, MVT::i32)); - R = DAG.getNode(ISD::VSELECT, dl, VT, Op, M, R); // a += a Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op); + OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op); + OpVSel = DAG.getNode(X86ISD::PCMPEQ, dl, VT, OpVSel, VSelM); + + // r = VSELECT(r, psllw(r & (char16)63, 2), a); + M = DAG.getNode(ISD::AND, dl, VT, R, CM2); + M = getTargetVShiftNode(X86ISD::VSHLI, dl, MVT::v8i16, M, + DAG.getConstant(2, MVT::i32), DAG); + M = DAG.getNode(ISD::BITCAST, dl, VT, M); + R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel, M, R); - C = ConstantVector::get(CVM2); - CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); - M = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, - MachinePointerInfo::getConstantPool(), - false, false, false, 16); - - // r = pblendv(r, psllw(r & (char16)63, 2), a); - M = DAG.getNode(ISD::AND, dl, VT, R, M); - M = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, - 
DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), M, - DAG.getConstant(2, MVT::i32)); - R = DAG.getNode(ISD::VSELECT, dl, VT, Op, M, R); // a += a Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op); + OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op); + OpVSel = DAG.getNode(X86ISD::PCMPEQ, dl, VT, OpVSel, VSelM); - // return pblendv(r, r+r, a); - R = DAG.getNode(ISD::VSELECT, dl, VT, Op, + // return VSELECT(r, r+r, a); + R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel, DAG.getNode(ISD::ADD, dl, VT, R, R), R); return R; } // Decompose 256-bit shifts into smaller 128-bit shifts. if (VT.getSizeInBits() == 256) { - int NumElems = VT.getVectorNumElements(); + unsigned NumElems = VT.getVectorNumElements(); MVT EltVT = VT.getVectorElementType().getSimpleVT(); EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2); @@ -10221,9 +10362,9 @@ SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const { // Constant shift amount SmallVector<SDValue, 4> Amt1Csts; SmallVector<SDValue, 4> Amt2Csts; - for (int i = 0; i < NumElems/2; ++i) + for (unsigned i = 0; i != NumElems/2; ++i) Amt1Csts.push_back(Amt->getOperand(i)); - for (int i = NumElems/2; i < NumElems; ++i) + for (unsigned i = NumElems/2; i != NumElems; ++i) Amt2Csts.push_back(Amt->getOperand(i)); Amt1 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, @@ -10323,77 +10464,58 @@ SDValue X86TargetLowering::LowerXALUO(SDValue Op, SelectionDAG &DAG) const { return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC); } -SDValue X86TargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const{ +SDValue X86TargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op, + SelectionDAG &DAG) const { DebugLoc dl = Op.getDebugLoc(); EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT(); EVT VT = Op.getValueType(); - if (Subtarget->hasXMMInt() && VT.isVector()) { - unsigned BitsDiff = VT.getScalarType().getSizeInBits() - - ExtraVT.getScalarType().getSizeInBits(); - SDValue ShAmt = DAG.getConstant(BitsDiff, MVT::i32); + if (!Subtarget->hasSSE2() || !VT.isVector()) + return SDValue(); - unsigned SHLIntrinsicsID = 0; - unsigned SRAIntrinsicsID = 0; - switch (VT.getSimpleVT().SimpleTy) { - default: + unsigned BitsDiff = VT.getScalarType().getSizeInBits() - + ExtraVT.getScalarType().getSizeInBits(); + SDValue ShAmt = DAG.getConstant(BitsDiff, MVT::i32); + + switch (VT.getSimpleVT().SimpleTy) { + default: return SDValue(); + case MVT::v8i32: + case MVT::v16i16: + if (!Subtarget->hasAVX()) return SDValue(); - case MVT::v4i32: - SHLIntrinsicsID = Intrinsic::x86_sse2_pslli_d; - SRAIntrinsicsID = Intrinsic::x86_sse2_psrai_d; - break; - case MVT::v8i16: - SHLIntrinsicsID = Intrinsic::x86_sse2_pslli_w; - SRAIntrinsicsID = Intrinsic::x86_sse2_psrai_w; - break; - case MVT::v8i32: - case MVT::v16i16: - if (!Subtarget->hasAVX()) - return SDValue(); - if (!Subtarget->hasAVX2()) { - // needs to be split - int NumElems = VT.getVectorNumElements(); - SDValue Idx0 = DAG.getConstant(0, MVT::i32); - SDValue Idx1 = DAG.getConstant(NumElems/2, MVT::i32); - - // Extract the LHS vectors - SDValue LHS = Op.getOperand(0); - SDValue LHS1 = Extract128BitVector(LHS, Idx0, DAG, dl); - SDValue LHS2 = Extract128BitVector(LHS, Idx1, DAG, dl); - - MVT EltVT = VT.getVectorElementType().getSimpleVT(); - EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2); - - EVT ExtraEltVT = ExtraVT.getVectorElementType(); - int ExtraNumElems = ExtraVT.getVectorNumElements(); - ExtraVT = EVT::getVectorVT(*DAG.getContext(), ExtraEltVT, - ExtraNumElems/2); - SDValue Extra = DAG.getValueType(ExtraVT); - - LHS1 
= DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, Extra); - LHS2 = DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, Extra); - - return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, LHS1, LHS2);; - } - if (VT == MVT::v8i32) { - SHLIntrinsicsID = Intrinsic::x86_avx2_pslli_d; - SRAIntrinsicsID = Intrinsic::x86_avx2_psrai_d; - } else { - SHLIntrinsicsID = Intrinsic::x86_avx2_pslli_w; - SRAIntrinsicsID = Intrinsic::x86_avx2_psrai_w; - } + if (!Subtarget->hasAVX2()) { + // needs to be split + int NumElems = VT.getVectorNumElements(); + SDValue Idx0 = DAG.getConstant(0, MVT::i32); + SDValue Idx1 = DAG.getConstant(NumElems/2, MVT::i32); + + // Extract the LHS vectors + SDValue LHS = Op.getOperand(0); + SDValue LHS1 = Extract128BitVector(LHS, Idx0, DAG, dl); + SDValue LHS2 = Extract128BitVector(LHS, Idx1, DAG, dl); + + MVT EltVT = VT.getVectorElementType().getSimpleVT(); + EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2); + + EVT ExtraEltVT = ExtraVT.getVectorElementType(); + int ExtraNumElems = ExtraVT.getVectorNumElements(); + ExtraVT = EVT::getVectorVT(*DAG.getContext(), ExtraEltVT, + ExtraNumElems/2); + SDValue Extra = DAG.getValueType(ExtraVT); + + LHS1 = DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, Extra); + LHS2 = DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, Extra); + + return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, LHS1, LHS2); + } + // fall through + case MVT::v4i32: + case MVT::v8i16: { + SDValue Tmp1 = getTargetVShiftNode(X86ISD::VSHLI, dl, VT, + Op.getOperand(0), ShAmt, DAG); + return getTargetVShiftNode(X86ISD::VSRAI, dl, VT, Tmp1, ShAmt, DAG); + } } - - return SDValue(); } @@ -10402,7 +10524,7 @@ SDValue X86TargetLowering::LowerMEMBARRIER(SDValue Op, SelectionDAG &DAG) const{ // Go ahead and emit the fence on x86-64 even if we asked for no-sse2. // There isn't any reason to disable it if the target processor supports it. - if (!Subtarget->hasXMMInt() && !Subtarget->is64Bit()) { + if (!Subtarget->hasSSE2() && !Subtarget->is64Bit()) { SDValue Chain = Op.getOperand(0); SDValue Zero = DAG.getConstant(0, MVT::i32); SDValue Ops[] = { @@ -10456,7 +10578,7 @@ SDValue X86TargetLowering::LowerATOMIC_FENCE(SDValue Op, // Use mfence if we have SSE2 or we're on x86-64 (even if we asked for // no-sse2). There isn't any reason to disable it if the target processor // supports it.
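When mfence is unavailable (32-bit, pre-SSE2), the fallback that LowerMEMBARRIER builds above is an idempotent locked read-modify-write on the stack, which is a full memory barrier on any i386. As an inline-asm sketch (GCC syntax):

  static inline void fence_without_sse2(void) {
    // lock or $0, (%esp): orders memory like mfence, no SSE2 required
    __asm__ __volatile__("lock; orl $0, (%%esp)" ::: "memory", "cc");
  }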
-  if (Subtarget->hasSSE2() || Subtarget->is64Bit())
+  if (Subtarget->hasSSE2() || Subtarget->is64Bit())
     return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));

   SDValue Chain = Op.getOperand(0);
@@ -10487,8 +10609,7 @@ SDValue X86TargetLowering::LowerCMP_SWAP(SDValue Op, SelectionDAG &DAG) const {
   unsigned Reg = 0;
   unsigned size = 0;
   switch(T.getSimpleVT().SimpleTy) {
-  default:
-    assert(false && "Invalid value type!");
+  default: llvm_unreachable("Invalid value type!");
   case MVT::i8:  Reg = X86::AL;  size = 1; break;
   case MVT::i16: Reg = X86::AX;  size = 2; break;
   case MVT::i32: Reg = X86::EAX; size = 4; break;
@@ -10536,7 +10657,7 @@ SDValue X86TargetLowering::LowerBITCAST(SDValue Op, SelectionDAG &DAG) const {
   EVT SrcVT = Op.getOperand(0).getValueType();
   EVT DstVT = Op.getValueType();
-  assert(Subtarget->is64Bit() && !Subtarget->hasXMMInt() &&
+  assert(Subtarget->is64Bit() && !Subtarget->hasSSE2() &&
          Subtarget->hasMMX() && "Unexpected custom BITCAST");
   assert((DstVT == MVT::i64 ||
           (DstVT.isVector() && DstVT.getSizeInBits()==64)) &&
@@ -10606,7 +10727,7 @@ static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
   unsigned Opc;
   bool ExtraOp = false;
   switch (Op.getOpcode()) {
-  default: assert(0 && "Invalid code");
+  default: llvm_unreachable("Invalid code");
   case ISD::ADDC: Opc = X86ISD::ADD; break;
   case ISD::ADDE: Opc = X86ISD::ADC; ExtraOp = true; break;
   case ISD::SUBC: Opc = X86ISD::SUB; break;
@@ -10673,6 +10794,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG);
   case ISD::FLT_ROUNDS_:       return LowerFLT_ROUNDS_(Op, DAG);
   case ISD::CTLZ:              return LowerCTLZ(Op, DAG);
+  case ISD::CTLZ_ZERO_UNDEF:   return LowerCTLZ_ZERO_UNDEF(Op, DAG);
   case ISD::CTTZ:              return LowerCTTZ(Op, DAG);
   case ISD::MUL:               return LowerMUL(Op, DAG);
   case ISD::SRA:
@@ -10747,8 +10869,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
   DebugLoc dl = N->getDebugLoc();
   switch (N->getOpcode()) {
   default:
-    assert(false && "Do not know how to custom type legalize this operation!");
-    return;
+    llvm_unreachable("Do not know how to custom type legalize this operation!");
   case ISD::SIGN_EXTEND_INREG:
   case ISD::ADDC:
   case ISD::ADDE:
   case ISD::SUBC:
   case ISD::SUBE:
     // We don't want to expand or promote these.
     return;
-  case ISD::FP_TO_SINT: {
+  case ISD::FP_TO_SINT:
+  case ISD::FP_TO_UINT: {
+    bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT;
+
+    if (!IsSigned && !isIntegerTypeFTOL(SDValue(N, 0).getValueType()))
+      return;
+
     std::pair<SDValue,SDValue> Vals =
-        FP_TO_INTHelper(SDValue(N, 0), DAG, true);
+        FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, /*IsReplace=*/ true);
     SDValue FIST = Vals.first, StackSlot = Vals.second;
     if (FIST.getNode() != 0) {
       EVT VT = N->getValueType(0);
       // Return a load from the stack slot.
-      Results.push_back(DAG.getLoad(VT, dl, FIST, StackSlot,
-                                    MachinePointerInfo(),
-                                    false, false, false, 0));
+      if (StackSlot.getNode() != 0)
+        Results.push_back(DAG.getLoad(VT, dl, FIST, StackSlot,
+                                      MachinePointerInfo(),
+                                      false, false, false, 0));
+      else
+        Results.push_back(FIST);
     }
     return;
   }
@@ -10924,18 +11054,17 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
   case X86ISD::ATOMNAND64_DAG: return "X86ISD::ATOMNAND64_DAG";
   case X86ISD::VZEXT_MOVL: return "X86ISD::VZEXT_MOVL";
   case X86ISD::VZEXT_LOAD: return "X86ISD::VZEXT_LOAD";
+  case X86ISD::VSHLDQ: return "X86ISD::VSHLDQ";
+  case X86ISD::VSRLDQ: return "X86ISD::VSRLDQ";
   case X86ISD::VSHL: return "X86ISD::VSHL";
   case X86ISD::VSRL: return "X86ISD::VSRL";
-  case X86ISD::CMPPD: return "X86ISD::CMPPD";
-  case X86ISD::CMPPS: return "X86ISD::CMPPS";
-  case X86ISD::PCMPEQB: return "X86ISD::PCMPEQB";
-  case X86ISD::PCMPEQW: return "X86ISD::PCMPEQW";
-  case X86ISD::PCMPEQD: return "X86ISD::PCMPEQD";
-  case X86ISD::PCMPEQQ: return "X86ISD::PCMPEQQ";
-  case X86ISD::PCMPGTB: return "X86ISD::PCMPGTB";
-  case X86ISD::PCMPGTW: return "X86ISD::PCMPGTW";
-  case X86ISD::PCMPGTD: return "X86ISD::PCMPGTD";
-  case X86ISD::PCMPGTQ: return "X86ISD::PCMPGTQ";
+  case X86ISD::VSRA: return "X86ISD::VSRA";
+  case X86ISD::VSHLI: return "X86ISD::VSHLI";
+  case X86ISD::VSRLI: return "X86ISD::VSRLI";
+  case X86ISD::VSRAI: return "X86ISD::VSRAI";
+  case X86ISD::CMPP: return "X86ISD::CMPP";
+  case X86ISD::PCMPEQ: return "X86ISD::PCMPEQ";
+  case X86ISD::PCMPGT: return "X86ISD::PCMPGT";
   case X86ISD::ADD: return "X86ISD::ADD";
   case X86ISD::SUB: return "X86ISD::SUB";
   case X86ISD::ADC: return "X86ISD::ADC";
@@ -10957,22 +11086,16 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
   case X86ISD::PALIGN: return "X86ISD::PALIGN";
   case X86ISD::PSHUFD: return "X86ISD::PSHUFD";
   case X86ISD::PSHUFHW: return "X86ISD::PSHUFHW";
-  case X86ISD::PSHUFHW_LD: return "X86ISD::PSHUFHW_LD";
   case X86ISD::PSHUFLW: return "X86ISD::PSHUFLW";
-  case X86ISD::PSHUFLW_LD: return "X86ISD::PSHUFLW_LD";
-  case X86ISD::SHUFPS: return "X86ISD::SHUFPS";
-  case X86ISD::SHUFPD: return "X86ISD::SHUFPD";
+  case X86ISD::SHUFP: return "X86ISD::SHUFP";
   case X86ISD::MOVLHPS: return "X86ISD::MOVLHPS";
   case X86ISD::MOVLHPD: return "X86ISD::MOVLHPD";
   case X86ISD::MOVHLPS: return "X86ISD::MOVHLPS";
-  case X86ISD::MOVHLPD: return "X86ISD::MOVHLPD";
   case X86ISD::MOVLPS: return "X86ISD::MOVLPS";
   case X86ISD::MOVLPD: return "X86ISD::MOVLPD";
   case X86ISD::MOVDDUP: return "X86ISD::MOVDDUP";
   case X86ISD::MOVSHDUP: return "X86ISD::MOVSHDUP";
   case X86ISD::MOVSLDUP: return "X86ISD::MOVSLDUP";
-  case X86ISD::MOVSHDUP_LD: return "X86ISD::MOVSHDUP_LD";
-  case X86ISD::MOVSLDUP_LD: return "X86ISD::MOVSLDUP_LD";
   case X86ISD::MOVSD: return "X86ISD::MOVSD";
   case X86ISD::MOVSS: return "X86ISD::MOVSS";
   case X86ISD::UNPCKL: return "X86ISD::UNPCKL";
@@ -10980,11 +11103,13 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
   case X86ISD::VBROADCAST: return "X86ISD::VBROADCAST";
   case X86ISD::VPERMILP: return "X86ISD::VPERMILP";
   case X86ISD::VPERM2X128: return "X86ISD::VPERM2X128";
+  case X86ISD::PMULUDQ: return "X86ISD::PMULUDQ";
   case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS";
   case X86ISD::VAARG_64: return "X86ISD::VAARG_64";
   case X86ISD::WIN_ALLOCA: return "X86ISD::WIN_ALLOCA";
   case X86ISD::MEMBARRIER: return "X86ISD::MEMBARRIER";
   case X86ISD::SEG_ALLOCA: return "X86ISD::SEG_ALLOCA";
+  case X86ISD::WIN_FTOL: return "X86ISD::WIN_FTOL";
   }
 }

@@ -11093,15 +11218,15 @@ X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
   return (VT.getVectorNumElements() == 2 ||
           ShuffleVectorSDNode::isSplatMask(&M[0], VT) ||
           isMOVLMask(M, VT) ||
-          isSHUFPMask(M, VT) ||
+          isSHUFPMask(M, VT, Subtarget->hasAVX()) ||
          isPSHUFDMask(M, VT) ||
           isPSHUFHWMask(M, VT) ||
           isPSHUFLWMask(M, VT) ||
-          isPALIGNRMask(M, VT, Subtarget->hasSSSE3orAVX()) ||
+          isPALIGNRMask(M, VT, Subtarget) ||
           isUNPCKLMask(M, VT, Subtarget->hasAVX2()) ||
           isUNPCKHMask(M, VT, Subtarget->hasAVX2()) ||
-          isUNPCKL_v_undef_Mask(M, VT) ||
-          isUNPCKH_v_undef_Mask(M, VT));
+          isUNPCKL_v_undef_Mask(M, VT, Subtarget->hasAVX2()) ||
+          isUNPCKH_v_undef_Mask(M, VT, Subtarget->hasAVX2()));
 }

 bool
@@ -11114,8 +11239,8 @@ X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask,
   if (NumElts == 4 && VT.getSizeInBits() == 128) {
     return (isMOVLMask(Mask, VT) ||
             isCommutedMOVLMask(Mask, VT, true) ||
-            isSHUFPMask(Mask, VT) ||
-            isSHUFPMask(Mask, VT, /* Commuted */ true));
+            isSHUFPMask(Mask, VT, Subtarget->hasAVX()) ||
+            isSHUFPMask(Mask, VT, Subtarget->hasAVX(), /* Commuted */ true));
   }
   return false;
 }
@@ -11134,7 +11259,7 @@ X86TargetLowering::EmitAtomicBitwiseWithCustomInserter(MachineInstr *bInstr,
                                                        unsigned CXchgOpc,
                                                        unsigned notOpc,
                                                        unsigned EAXreg,
-                                                       TargetRegisterClass *RC,
+                                                       const TargetRegisterClass *RC,
                                                        bool invSrc) const {
   // For the atomic bitwise operator, we generate
   //   thisMBB:
@@ -11506,7 +11631,7 @@ X86TargetLowering::EmitAtomicMinMaxWithCustomInserter(MachineInstr *mInstr,
 MachineBasicBlock *
 X86TargetLowering::EmitPCMP(MachineInstr *MI, MachineBasicBlock *BB,
                             unsigned numArgs, bool memArg) const {
-  assert(Subtarget->hasSSE42orAVX() &&
+  assert(Subtarget->hasSSE42() &&
          "Target must have SSE4.2 or AVX features enabled");

   DebugLoc dl = MI->getDebugLoc();
@@ -11911,6 +12036,42 @@ X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
   return EndMBB;
 }

+// The EFLAGS operand of SelectItr might be missing a kill marker
+// because there were multiple uses of EFLAGS, and ISel didn't know
+// which to mark. Figure out whether SelectItr should have had a
+// kill marker, and set it if it should. Returns the correct kill
+// marker value.
+static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
+                                     MachineBasicBlock* BB,
+                                     const TargetRegisterInfo* TRI) {
+  // Scan forward through BB for a use/def of EFLAGS.
+  MachineBasicBlock::iterator miI(llvm::next(SelectItr));
+  for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) {
+    const MachineInstr& mi = *miI;
+    if (mi.readsRegister(X86::EFLAGS))
+      return false;
+    if (mi.definesRegister(X86::EFLAGS))
+      break; // Should have kill-flag - update below.
+  }
+
+  // If we hit the end of the block, check whether EFLAGS is live into a
+  // successor.
+  if (miI == BB->end()) {
+    for (MachineBasicBlock::succ_iterator sItr = BB->succ_begin(),
+                                          sEnd = BB->succ_end();
+         sItr != sEnd; ++sItr) {
+      MachineBasicBlock* succ = *sItr;
+      if (succ->isLiveIn(X86::EFLAGS))
+        return false;
+    }
+  }
+
+  // We found a def, or hit the end of the basic block and EFLAGS wasn't live
+  // out. SelectMI should have a kill flag on EFLAGS.
+  SelectItr->addRegisterKilled(X86::EFLAGS, TRI);
+  return true;
+}
+
 MachineBasicBlock *
 X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,
                                      MachineBasicBlock *BB) const {
@@ -11940,7 +12101,9 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,

   // If the EFLAGS register isn't dead in the terminator, then claim that it's
   // live into the sink and copy blocks.
-  if (!MI->killsRegister(X86::EFLAGS)) {
+  const TargetRegisterInfo* TRI = getTargetMachine().getRegisterInfo();
+  if (!MI->killsRegister(X86::EFLAGS) &&
+      !checkAndUpdateEFLAGSKill(MI, BB, TRI)) {
     copy0MBB->addLiveIn(X86::EFLAGS);
     sinkMBB->addLiveIn(X86::EFLAGS);
   }
@@ -12038,7 +12201,7 @@ X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI, MachineBasicBlock *BB,
   BuildMI(BB, DL, TII->get(Is64Bit ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg)
     .addReg(tmpSPVReg).addReg(sizeVReg);
   BuildMI(BB, DL, TII->get(Is64Bit ? X86::CMP64mr:X86::CMP32mr))
-    .addReg(0).addImm(0).addReg(0).addImm(TlsOffset).addReg(TlsReg)
+    .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg)
     .addReg(SPLimitVReg);
   BuildMI(BB, DL, TII->get(X86::JG_4)).addMBB(mallocMBB);

@@ -12051,17 +12214,23 @@ X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI, MachineBasicBlock *BB,
     BuildMI(bumpMBB, DL, TII->get(X86::JMP_4)).addMBB(continueMBB);

   // Calls into a routine in libgcc to allocate more space from the heap.
+  const uint32_t *RegMask =
+    getTargetMachine().getRegisterInfo()->getCallPreservedMask(CallingConv::C);
   if (Is64Bit) {
     BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI)
       .addReg(sizeVReg);
     BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
-      .addExternalSymbol("__morestack_allocate_stack_space").addReg(X86::RDI);
+      .addExternalSymbol("__morestack_allocate_stack_space").addReg(X86::RDI)
+      .addRegMask(RegMask)
+      .addReg(X86::RAX, RegState::ImplicitDefine);
   } else {
     BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
       .addImm(12);
     BuildMI(mallocMBB, DL, TII->get(X86::PUSH32r)).addReg(sizeVReg);
     BuildMI(mallocMBB, DL, TII->get(X86::CALLpcrel32))
-      .addExternalSymbol("__morestack_allocate_stack_space");
+      .addExternalSymbol("__morestack_allocate_stack_space")
+      .addRegMask(RegMask)
+      .addReg(X86::EAX, RegState::ImplicitDefine);
   }

   if (!Is64Bit)
@@ -12159,6 +12328,11 @@ X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI,
   assert(Subtarget->isTargetDarwin() && "Darwin only instr emitted?");
   assert(MI->getOperand(3).isGlobal() && "This should be a global");

+  // Get a register mask for the lowered call.
+  // FIXME: The 32-bit calls have non-standard calling conventions. Use a
+  // proper register mask.
+  const uint32_t *RegMask =
+    getTargetMachine().getRegisterInfo()->getCallPreservedMask(CallingConv::C);
   if (Subtarget->is64Bit()) {
     MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
                                       TII->get(X86::MOV64rm), X86::RDI)
@@ -12169,6 +12343,7 @@ X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI,
       .addReg(0);
     MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m));
     addDirectMem(MIB, X86::RDI);
+    MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask);
   } else if (getTargetMachine().getRelocationModel() != Reloc::PIC_) {
     MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
                                       TII->get(X86::MOV32rm), X86::EAX)
@@ -12179,6 +12354,7 @@ X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI,
       .addReg(0);
     MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
     addDirectMem(MIB, X86::EAX);
+    MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
   } else {
     MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
                                       TII->get(X86::MOV32rm), X86::EAX)
@@ -12189,6 +12365,7 @@ X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI,
       .addReg(0);
     MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
     addDirectMem(MIB, X86::EAX);
+    MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
   }

   MI->eraseFromParent(); // The pseudo instruction is gone now.
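The kill-flag scan that checkAndUpdateEFLAGSKill adds above is easy to check in isolation. Below is a minimal standalone sketch of the same forward-scan logic over a toy instruction model; Inst, shouldKillEFLAGS, and liveIntoSuccessor are illustrative names for this sketch only, not LLVM API.

#include <cstddef>
#include <vector>

struct Inst {
  bool readsEFLAGS;
  bool definesEFLAGS;
};

// Mirrors the scan in checkAndUpdateEFLAGSKill: walk forward from the
// select; a later reader means the select's use is not a kill, a later
// def with no reader before it means it is. Falling off the end of the
// block, the answer depends on whether EFLAGS is live into a successor.
static bool shouldKillEFLAGS(const std::vector<Inst> &block,
                             std::size_t selectIdx, bool liveIntoSuccessor) {
  for (std::size_t i = selectIdx + 1; i < block.size(); ++i) {
    if (block[i].readsEFLAGS)
      return false;          // another use follows: no kill flag
    if (block[i].definesEFLAGS)
      return true;           // clobbered before any use: kill flag
  }
  return !liveIntoSuccessor; // end of block: kill unless live out
}

int main() {
  // cmp (defines), select (reads), add (defines): the select kills EFLAGS.
  std::vector<Inst> bb = {{false, true}, {true, false}, {false, true}};
  return shouldKillEFLAGS(bb, 1, /*liveIntoSuccessor=*/false) ? 0 : 1;
}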
@@ -12199,30 +12376,14 @@ MachineBasicBlock *
 X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
                                                MachineBasicBlock *BB) const {
   switch (MI->getOpcode()) {
-  default: assert(0 && "Unexpected instr type to insert");
+  default: llvm_unreachable("Unexpected instr type to insert");
   case X86::TAILJMPd64:
   case X86::TAILJMPr64:
   case X86::TAILJMPm64:
-    assert(0 && "TAILJMP64 would not be touched here.");
+    llvm_unreachable("TAILJMP64 would not be touched here.");
   case X86::TCRETURNdi64:
   case X86::TCRETURNri64:
   case X86::TCRETURNmi64:
-    // Defs of TCRETURNxx64 has Win64's callee-saved registers, as subset.
-    // On AMD64, additional defs should be added before register allocation.
-    if (!Subtarget->isTargetWin64()) {
-      MI->addRegisterDefined(X86::RSI);
-      MI->addRegisterDefined(X86::RDI);
-      MI->addRegisterDefined(X86::XMM6);
-      MI->addRegisterDefined(X86::XMM7);
-      MI->addRegisterDefined(X86::XMM8);
-      MI->addRegisterDefined(X86::XMM9);
-      MI->addRegisterDefined(X86::XMM10);
-      MI->addRegisterDefined(X86::XMM11);
-      MI->addRegisterDefined(X86::XMM12);
-      MI->addRegisterDefined(X86::XMM13);
-      MI->addRegisterDefined(X86::XMM14);
-      MI->addRegisterDefined(X86::XMM15);
-    }
     return BB;
   case X86::WIN_ALLOCA:
     return EmitLoweredWinAlloca(MI, BB);
@@ -12572,15 +12733,18 @@ void X86TargetLowering::computeMaskedBitsForTargetNode(const SDValue Op,
     case Intrinsic::x86_sse2_movmsk_pd:
     case Intrinsic::x86_avx_movmsk_pd_256:
     case Intrinsic::x86_mmx_pmovmskb:
-    case Intrinsic::x86_sse2_pmovmskb_128: {
+    case Intrinsic::x86_sse2_pmovmskb_128:
+    case Intrinsic::x86_avx2_pmovmskb: {
       // High bits of movmskp{s|d}, pmovmskb are known zero.
       switch (IntId) {
+        default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
         case Intrinsic::x86_sse_movmsk_ps:      NumLoBits = 4; break;
         case Intrinsic::x86_avx_movmsk_ps_256:  NumLoBits = 8; break;
         case Intrinsic::x86_sse2_movmsk_pd:     NumLoBits = 2; break;
         case Intrinsic::x86_avx_movmsk_pd_256:  NumLoBits = 4; break;
         case Intrinsic::x86_mmx_pmovmskb:       NumLoBits = 8; break;
         case Intrinsic::x86_sse2_pmovmskb_128:  NumLoBits = 16; break;
+        case Intrinsic::x86_avx2_pmovmskb:      NumLoBits = 32; break;
       }
       KnownZero = APInt::getHighBitsSet(Mask.getBitWidth(),
                                         Mask.getBitWidth() - NumLoBits);
@@ -12651,7 +12815,8 @@ static bool isShuffleLow128VectorInsertHigh(ShuffleVectorSDNode *SVOp) {

 /// PerformShuffleCombine256 - Performs shuffle combines for 256-bit vectors.
 static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG,
-                                        TargetLowering::DAGCombinerInfo &DCI) {
+                                        TargetLowering::DAGCombinerInfo &DCI,
+                                        const X86Subtarget* Subtarget) {
   DebugLoc dl = N->getDebugLoc();
   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
   SDValue V1 = SVOp->getOperand(0);
@@ -12687,9 +12852,23 @@ static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG,
         !isUndefOrEqual(SVOp->getMaskElt(i+NumElems/2), NumElems))
       return SDValue();

+    // If V1 is coming from a vector load then just fold to a VZEXT_LOAD.
+    if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(V1.getOperand(0))) {
+      SDVTList Tys = DAG.getVTList(MVT::v4i64, MVT::Other);
+      SDValue Ops[] = { Ld->getChain(), Ld->getBasePtr() };
+      SDValue ResNode =
+        DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, 2,
+                                Ld->getMemoryVT(),
+                                Ld->getPointerInfo(),
+                                Ld->getAlignment(),
+                                false/*isVolatile*/, true/*ReadMem*/,
+                                false/*WriteMem*/);
+      return DAG.getNode(ISD::BITCAST, dl, VT, ResNode);
+    }
+
     // Emit a zeroed vector and insert the desired subvector on its
     // first half.
-    SDValue Zeros = getZeroVector(VT, true /* HasXMMInt */, DAG, dl);
+    SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
     SDValue InsV = Insert128BitVector(Zeros, V1.getOperand(0),
                          DAG.getConstant(0, MVT::i32), DAG, dl);
     return DCI.CombineTo(N, InsV);
@@ -12734,7 +12913,7 @@ static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
   // Combine 256-bit vector shuffles. This is only profitable when in AVX mode
   if (Subtarget->hasAVX() && VT.getSizeInBits() == 256 &&
       N->getOpcode() == ISD::VECTOR_SHUFFLE)
-    return PerformShuffleCombine256(N, DAG, DCI);
+    return PerformShuffleCombine256(N, DAG, DCI, Subtarget);

   // Only handle 128 wide vector from here on.
   if (VT.getSizeInBits() != 128)
@@ -12750,6 +12929,82 @@ static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
   return EltsFromConsecutiveLoads(VT, Elts, dl, DAG);
 }

+
+/// PerformTruncateCombine - Converts a truncate operation into
+/// a sequence of vector shuffle operations.
+/// This is possible when we truncate a 256-bit vector to a 128-bit vector.
+
+SDValue X86TargetLowering::PerformTruncateCombine(SDNode *N, SelectionDAG &DAG,
+                                                  DAGCombinerInfo &DCI) const {
+  if (!DCI.isBeforeLegalizeOps())
+    return SDValue();
+
+  if (!Subtarget->hasAVX()) return SDValue();
+
+  EVT VT = N->getValueType(0);
+  SDValue Op = N->getOperand(0);
+  EVT OpVT = Op.getValueType();
+  DebugLoc dl = N->getDebugLoc();
+
+  if ((VT == MVT::v4i32) && (OpVT == MVT::v4i64)) {
+
+    SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i64, Op,
+                               DAG.getIntPtrConstant(0));
+
+    SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i64, Op,
+                               DAG.getIntPtrConstant(2));
+
+    OpLo = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, OpLo);
+    OpHi = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, OpHi);
+
+    // PSHUFD
+    int ShufMask1[] = {0, 2, 0, 0};
+
+    OpLo = DAG.getVectorShuffle(VT, dl, OpLo, DAG.getUNDEF(VT),
+                                ShufMask1);
+    OpHi = DAG.getVectorShuffle(VT, dl, OpHi, DAG.getUNDEF(VT),
+                                ShufMask1);
+
+    // MOVLHPS
+    int ShufMask2[] = {0, 1, 4, 5};
+
+    return DAG.getVectorShuffle(VT, dl, OpLo, OpHi, ShufMask2);
+  }
+  if ((VT == MVT::v8i16) && (OpVT == MVT::v8i32)) {
+
+    SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i32, Op,
+                               DAG.getIntPtrConstant(0));
+
+    SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i32, Op,
+                               DAG.getIntPtrConstant(4));
+
+    OpLo = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, OpLo);
+    OpHi = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, OpHi);
+
+    // PSHUFB
+    int ShufMask1[] = {0, 1, 4, 5, 8, 9, 12, 13,
+                       -1, -1, -1, -1, -1, -1, -1, -1};
+
+    OpLo = DAG.getVectorShuffle(MVT::v16i8, dl, OpLo,
+                                DAG.getUNDEF(MVT::v16i8),
+                                ShufMask1);
+    OpHi = DAG.getVectorShuffle(MVT::v16i8, dl, OpHi,
+                                DAG.getUNDEF(MVT::v16i8),
+                                ShufMask1);
+
+    OpLo = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, OpLo);
+    OpHi = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, OpHi);
+
+    // MOVLHPS
+    int ShufMask2[] = {0, 1, 4, 5};
+
+    SDValue res = DAG.getVectorShuffle(MVT::v4i32, dl, OpLo, OpHi, ShufMask2);
+    return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, res);
+  }
+
+  return SDValue();
+}
+
 /// PerformEXTRACT_VECTOR_ELTCombine - Detect vector gather/scatter index
 /// generation and convert it from being a bunch of shuffles and extracts
 /// to a simple store and scalar loads to extract the elements.
@@ -12836,6 +13091,7 @@ static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,

 /// PerformSELECTCombine - Do target-specific dag combines on SELECT and VSELECT
 /// nodes.
 static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
+                                    TargetLowering::DAGCombinerInfo &DCI,
                                     const X86Subtarget *Subtarget) {
   DebugLoc DL = N->getDebugLoc();
   SDValue Cond = N->getOperand(0);
@@ -12850,7 +13106,7 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
   // ignored in unsafe-math mode).
   if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() &&
       VT != MVT::f80 && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
-      (Subtarget->hasXMMInt() ||
+      (Subtarget->hasSSE2() ||
        (Subtarget->hasSSE1() && VT.getScalarType() == MVT::f32))) {
     ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();

@@ -13081,6 +13337,57 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
     }
   }

+  // Canonicalize max and min:
+  // (x > y) ? x : y -> (x >= y) ? x : y
+  // (x < y) ? x : y -> (x <= y) ? x : y
+  // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates
+  // the need for an extra compare against zero. e.g.
+  // (x - y) > 0 ? (x - y) : 0 -> (x - y) >= 0 ? (x - y) : 0
+  //   subl   %esi, %edi
+  //   testl  %edi, %edi
+  //   movl   $0, %eax
+  //   cmovgl %edi, %eax
+  // =>
+  //   xorl   %eax, %eax
+  //   subl   %esi, %edi
+  //   cmovsl %eax, %edi
+  if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC &&
+      DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
+      DAG.isEqualTo(RHS, Cond.getOperand(1))) {
+    ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
+    switch (CC) {
+    default: break;
+    case ISD::SETLT:
+    case ISD::SETGT: {
+      ISD::CondCode NewCC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGE;
+      Cond = DAG.getSetCC(Cond.getDebugLoc(), Cond.getValueType(),
+                          Cond.getOperand(0), Cond.getOperand(1), NewCC);
+      return DAG.getNode(ISD::SELECT, DL, VT, Cond, LHS, RHS);
+    }
+    }
+  }
+
+  // If we know that this node is legal then we know that it is going to be
+  // matched by one of the SSE/AVX BLEND instructions. These instructions only
+  // depend on the highest bit in each word. Try to use SimplifyDemandedBits
+  // to simplify previous instructions.
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  if (N->getOpcode() == ISD::VSELECT && DCI.isBeforeLegalizeOps() &&
+      !DCI.isBeforeLegalize() &&
+      TLI.isOperationLegal(ISD::VSELECT, VT)) {
+    unsigned BitWidth = Cond.getValueType().getScalarType().getSizeInBits();
+    assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size");
+    APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 1);
+
+    APInt KnownZero, KnownOne;
+    TargetLowering::TargetLoweringOpt TLO(DAG, DCI.isBeforeLegalize(),
+                                          DCI.isBeforeLegalizeOps());
+    if (TLO.ShrinkDemandedConstant(Cond, DemandedMask) ||
+        TLI.SimplifyDemandedBits(Cond, DemandedMask, KnownZero, KnownOne, TLO))
+      DCI.CommitTargetLoweringOpt(TLO);
+  }
+
   return SDValue();
 }

@@ -13314,6 +13621,7 @@ static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) {
 /// PerformShiftCombine - Transforms vector shift nodes to use vector shifts
 /// when possible.
 static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG,
+                                   TargetLowering::DAGCombinerInfo &DCI,
                                    const X86Subtarget *Subtarget) {
   EVT VT = N->getValueType(0);
   if (N->getOpcode() == ISD::SHL) {
@@ -13325,7 +13633,7 @@ static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG,
   // all elements are shifted by the same amount. We can't do this in legalize
   // because a constant vector is typically transformed to a constant pool
   // so we have no knowledge of the shift amount.
-  if (!Subtarget->hasXMMInt())
+  if (!Subtarget->hasSSE2())
     return SDValue();

   if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16 &&
@@ -13346,6 +13654,11 @@ static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG,
       BaseShAmt = Arg;
       break;
     }
+    // Handle the case where the build_vector is all undef
+    // FIXME: Should DAG allow this?
+    if (i == NumElts)
+      return SDValue();
+
     for (; i != NumElts; ++i) {
       SDValue Arg = ShAmtOp.getOperand(i);
       if (Arg.getOpcode() == ISD::UNDEF) continue;
@@ -13372,9 +13685,16 @@ static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG,
         BaseShAmt = InVec.getOperand(1);
       }
     }
-    if (BaseShAmt.getNode() == 0)
+    if (BaseShAmt.getNode() == 0) {
+      // Don't create instructions with illegal types after legalize
+      // types has run.
+      if (!DAG.getTargetLoweringInfo().isTypeLegal(EltVT) &&
+          !DCI.isBeforeLegalize())
+        return SDValue();
+
       BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, ShAmtOp,
                               DAG.getIntPtrConstant(0));
+    }
   } else
     return SDValue();

@@ -13389,79 +13709,38 @@ static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG,
   switch (N->getOpcode()) {
   default: llvm_unreachable("Unknown shift opcode!");
-    break;
   case ISD::SHL:
-    if (VT == MVT::v2i64)
-      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
-                         DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32),
-                         ValOp, BaseShAmt);
-    if (VT == MVT::v4i32)
-      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
-                         DAG.getConstant(Intrinsic::x86_sse2_pslli_d, MVT::i32),
-                         ValOp, BaseShAmt);
-    if (VT == MVT::v8i16)
-      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
-                         DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32),
-                         ValOp, BaseShAmt);
-    if (VT == MVT::v4i64)
-      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
-                         DAG.getConstant(Intrinsic::x86_avx2_pslli_q, MVT::i32),
-                         ValOp, BaseShAmt);
-    if (VT == MVT::v8i32)
-      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
-                         DAG.getConstant(Intrinsic::x86_avx2_pslli_d, MVT::i32),
-                         ValOp, BaseShAmt);
-    if (VT == MVT::v16i16)
-      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
-                         DAG.getConstant(Intrinsic::x86_avx2_pslli_w, MVT::i32),
-                         ValOp, BaseShAmt);
-    break;
+    switch (VT.getSimpleVT().SimpleTy) {
+    default: return SDValue();
+    case MVT::v2i64:
+    case MVT::v4i32:
+    case MVT::v8i16:
+    case MVT::v4i64:
+    case MVT::v8i32:
+    case MVT::v16i16:
+      return getTargetVShiftNode(X86ISD::VSHLI, DL, VT, ValOp, BaseShAmt, DAG);
+    }
   case ISD::SRA:
-    if (VT == MVT::v4i32)
-      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
-                         DAG.getConstant(Intrinsic::x86_sse2_psrai_d, MVT::i32),
-                         ValOp, BaseShAmt);
-    if (VT == MVT::v8i16)
-      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
-                         DAG.getConstant(Intrinsic::x86_sse2_psrai_w, MVT::i32),
-                         ValOp, BaseShAmt);
-    if (VT == MVT::v8i32)
-      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
-                         DAG.getConstant(Intrinsic::x86_avx2_psrai_d, MVT::i32),
-                         ValOp, BaseShAmt);
-    if (VT == MVT::v16i16)
-      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
-                         DAG.getConstant(Intrinsic::x86_avx2_psrai_w, MVT::i32),
-                         ValOp, BaseShAmt);
-    break;
+    switch (VT.getSimpleVT().SimpleTy) {
+    default: return SDValue();
+    case MVT::v4i32:
+    case MVT::v8i16:
+    case MVT::v8i32:
+    case MVT::v16i16:
+      return getTargetVShiftNode(X86ISD::VSRAI, DL, VT, ValOp, BaseShAmt, DAG);
+    }
   case ISD::SRL:
-    if (VT == MVT::v2i64)
-      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
-                         DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32),
-                         ValOp, BaseShAmt);
-    if (VT == MVT::v4i32)
-      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
-                         DAG.getConstant(Intrinsic::x86_sse2_psrli_d, MVT::i32),
-                         ValOp, BaseShAmt);
-    if (VT == MVT::v8i16)
-      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
-                         DAG.getConstant(Intrinsic::x86_sse2_psrli_w, MVT::i32),
-                         ValOp, BaseShAmt);
-    if (VT == MVT::v4i64)
-      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
-                         DAG.getConstant(Intrinsic::x86_avx2_psrli_q, MVT::i32),
-                         ValOp, BaseShAmt);
-    if (VT == MVT::v8i32)
-      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
-                         DAG.getConstant(Intrinsic::x86_avx2_psrli_d, MVT::i32),
-                         ValOp, BaseShAmt);
-    if (VT == MVT::v16i16)
-      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
-                         DAG.getConstant(Intrinsic::x86_avx2_psrli_w, MVT::i32),
-                         ValOp, BaseShAmt);
-    break;
+    switch (VT.getSimpleVT().SimpleTy) {
+    default: return SDValue();
+    case MVT::v2i64:
+    case MVT::v4i32:
+    case MVT::v8i16:
+    case MVT::v4i64:
+    case MVT::v8i32:
+    case MVT::v16i16:
+      return getTargetVShiftNode(X86ISD::VSRLI, DL, VT, ValOp, BaseShAmt, DAG);
+    }
   }
-
-  return SDValue();
 }

@@ -13475,7 +13754,7 @@ static SDValue CMPEQCombine(SDNode *N, SelectionDAG &DAG,
   // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but
   // we're requiring SSE2 for both.
-  if (Subtarget->hasXMMInt() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
+  if (Subtarget->hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
     SDValue N0 = N->getOperand(0);
     SDValue N1 = N->getOperand(1);
     SDValue CMP0 = N0->getOperand(1);
@@ -13666,14 +13945,14 @@ static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG,

   // look for psign/blend
   if (VT == MVT::v2i64 || VT == MVT::v4i64) {
-    if (!Subtarget->hasSSSE3orAVX() ||
+    if (!Subtarget->hasSSSE3() ||
         (VT == MVT::v4i64 && !Subtarget->hasAVX2()))
       return SDValue();

     // Canonicalize pandn to RHS
     if (N0.getOpcode() == X86ISD::ANDNP)
       std::swap(N0, N1);
-    // or (and (m, x), (pandn m, y))
+    // or (and (m, y), (pandn m, x))
     if (N0.getOpcode() == ISD::AND && N1.getOpcode() == X86ISD::ANDNP) {
       SDValue Mask = N1.getOperand(0);
       SDValue X    = N1.getOperand(1);
@@ -13697,24 +13976,14 @@ static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG,
         Mask = Mask.getOperand(0);
       EVT MaskVT = Mask.getValueType();

-      // Validate that the Mask operand is a vector sra node. The sra node
-      // will be an intrinsic.
-      if (Mask.getOpcode() != ISD::INTRINSIC_WO_CHAIN)
-        return SDValue();
-
+      // Validate that the Mask operand is a vector sra node.
       // FIXME: what to do for bytes, since there is a psignb/pblendvb, but
       // there is no psrai.b
-      switch (cast<ConstantSDNode>(Mask.getOperand(0))->getZExtValue()) {
-      case Intrinsic::x86_sse2_psrai_w:
-      case Intrinsic::x86_sse2_psrai_d:
-      case Intrinsic::x86_avx2_psrai_w:
-      case Intrinsic::x86_avx2_psrai_d:
-        break;
-      default: return SDValue();
-      }
+      if (Mask.getOpcode() != X86ISD::VSRAI)
+        return SDValue();

       // Check that the SRA is all signbits.
-      SDValue SraC = Mask.getOperand(2);
+      SDValue SraC = Mask.getOperand(1);
       unsigned SraAmt  = cast<ConstantSDNode>(SraC)->getZExtValue();
       unsigned EltBits = MaskVT.getVectorElementType().getSizeInBits();
       if ((SraAmt + 1) != EltBits)
@@ -13729,14 +13998,14 @@ static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG,
         Y = Y.getOperand(0);
       if (Y.getOpcode() == ISD::SUB && Y.getOperand(1) == X &&
           ISD::isBuildVectorAllZeros(Y.getOperand(0).getNode()) &&
-          X.getValueType() == MaskVT && X.getValueType() == Y.getValueType() &&
-          (EltBits == 8 || EltBits == 16 || EltBits == 32)) {
-        SDValue Sign = DAG.getNode(X86ISD::PSIGN, DL, MaskVT, X,
-                                   Mask.getOperand(1));
-        return DAG.getNode(ISD::BITCAST, DL, VT, Sign);
+          X.getValueType() == MaskVT && Y.getValueType() == MaskVT) {
+        assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
+               "Unsupported VT for PSIGN");
+        Mask = DAG.getNode(X86ISD::PSIGN, DL, MaskVT, X, Mask.getOperand(0));
+        return DAG.getNode(ISD::BITCAST, DL, VT, Mask);
       }
       // PBLENDVB only available on SSE 4.1
-      if (!Subtarget->hasSSE41orAVX())
+      if (!Subtarget->hasSSE41())
         return SDValue();

       EVT BlendVT = (VT == MVT::v4i64) ? MVT::v32i8 : MVT::v16i8;
@@ -13807,6 +14076,7 @@ static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG,
   return SDValue();
 }

+// PerformXorCombine - Attempts to turn XOR nodes into BLSMSK nodes
 static SDValue PerformXorCombine(SDNode *N, SelectionDAG &DAG,
                                  TargetLowering::DAGCombinerInfo &DCI,
                                  const X86Subtarget *Subtarget) {
@@ -13818,6 +14088,8 @@ static SDValue PerformXorCombine(SDNode *N, SelectionDAG &DAG,
   if (VT != MVT::i32 && VT != MVT::i64)
     return SDValue();

+  assert(Subtarget->hasBMI() && "Creating BLSMSK requires BMI instructions");
+
   // Create BLSMSK instructions by finding X ^ (X-1)
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
@@ -13849,7 +14121,8 @@ static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG,
   // shuffle. We need SSE4 for the shuffles.
   // TODO: It is possible to support ZExt by zeroing the undef values
   // during the shuffle phase or after the shuffle.
-  if (RegVT.isVector() && Ext == ISD::EXTLOAD && Subtarget->hasSSE41()) {
+  if (RegVT.isVector() && RegVT.isInteger() &&
+      Ext == ISD::EXTLOAD && Subtarget->hasSSE41()) {
     assert(MemVT != RegVT && "Cannot extend to the same type");
     assert(MemVT.isVector() && "Must load a vector from memory");

@@ -13896,7 +14169,8 @@ static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG,

   // Bitcast the loaded value to a vector of the original element type, in
   // the size of the target vector type.
-  SDValue SlicedVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, ScalarInVector);
+  SDValue SlicedVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT,
                                  ScalarInVector);
   unsigned SizeRatio = RegSz/MemSz;

   // Redistribute the loaded elements into the different locations.
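The X ^ (X-1) pattern that PerformXorCombine matches above is exactly the semantics of the BMI BLSMSK instruction: it yields a mask of all bits up to and including the lowest set bit of X. A minimal sketch of the identity (blsmsk here is a plain helper written for illustration, not a compiler intrinsic):

#include <cassert>
#include <cstdint>

// BLSMSK semantics: set every bit up to and including the lowest set bit.
static uint32_t blsmsk(uint32_t x) { return x ^ (x - 1); }

int main() {
  assert(blsmsk(0x00000008) == 0x0000000f); // lowest set bit is bit 3
  assert(blsmsk(0x00a0b000) == 0x00001fff); // lowest set bit is bit 12
  return 0;
}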
@@ -14039,7 +14313,7 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
   const Function *F = DAG.getMachineFunction().getFunction();
   bool NoImplicitFloatOps = F->hasFnAttr(Attribute::NoImplicitFloat);
   bool F64IsLegal = !DAG.getTarget().Options.UseSoftFloat && !NoImplicitFloatOps
-                     && Subtarget->hasXMMInt();
+                     && Subtarget->hasSSE2();
   if ((VT.isVector() ||
        (VT == MVT::i64 && F64IsLegal && !Subtarget->is64Bit())) &&
       isa<LoadSDNode>(St->getValue()) &&
@@ -14057,7 +14331,7 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
       Ld = cast<LoadSDNode>(St->getChain());
     else if (St->getValue().hasOneUse() &&
              ChainVal->getOpcode() == ISD::TokenFactor) {
-      for (unsigned i=0, e = ChainVal->getNumOperands(); i != e; ++i) {
+      for (unsigned i = 0, e = ChainVal->getNumOperands(); i != e; ++i) {
         if (ChainVal->getOperand(i).getNode() == LdVal) {
           TokenFactorIndex = i;
           Ld = cast<LoadSDNode>(St->getValue());
@@ -14196,7 +14470,8 @@ static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) {
       A = LHS.getOperand(0);
     if (LHS.getOperand(1).getOpcode() != ISD::UNDEF)
       B = LHS.getOperand(1);
-    cast<ShuffleVectorSDNode>(LHS.getNode())->getMask(LMask);
+    ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(LHS.getNode())->getMask();
+    std::copy(Mask.begin(), Mask.end(), LMask.begin());
   } else {
     if (LHS.getOpcode() != ISD::UNDEF)
       A = LHS;
@@ -14213,7 +14488,8 @@ static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) {
       C = RHS.getOperand(0);
     if (RHS.getOperand(1).getOpcode() != ISD::UNDEF)
       D = RHS.getOperand(1);
-    cast<ShuffleVectorSDNode>(RHS.getNode())->getMask(RMask);
+    ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(RHS.getNode())->getMask();
+    std::copy(Mask.begin(), Mask.end(), RMask.begin());
   } else {
     if (RHS.getOpcode() != ISD::UNDEF)
       C = RHS;
@@ -14270,7 +14546,7 @@ static SDValue PerformFADDCombine(SDNode *N, SelectionDAG &DAG,
   SDValue RHS = N->getOperand(1);

   // Try to synthesize horizontal adds from adds of shuffles.
-  if (((Subtarget->hasSSE3orAVX() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
+  if (((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
       (Subtarget->hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
       isHorizontalBinOp(LHS, RHS, true))
     return DAG.getNode(X86ISD::FHADD, N->getDebugLoc(), VT, LHS, RHS);
@@ -14285,7 +14561,7 @@ static SDValue PerformFSUBCombine(SDNode *N, SelectionDAG &DAG,
   SDValue RHS = N->getOperand(1);

   // Try to synthesize horizontal subs from subs of shuffles.
-  if (((Subtarget->hasSSE3orAVX() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
+  if (((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
       (Subtarget->hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
      isHorizontalBinOp(LHS, RHS, false))
     return DAG.getNode(X86ISD::FHSUB, N->getDebugLoc(), VT, LHS, RHS);
@@ -14352,7 +14628,58 @@ static SDValue PerformVZEXT_MOVLCombine(SDNode *N, SelectionDAG &DAG) {
   return SDValue();
 }

-static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG) {
+static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG,
+                                  TargetLowering::DAGCombinerInfo &DCI,
+                                  const X86Subtarget *Subtarget) {
+  if (!DCI.isBeforeLegalizeOps())
+    return SDValue();
+
+  if (!Subtarget->hasAVX())
+    return SDValue();
+
+  // Optimize vectors in AVX mode
+  // Sign extend v8i16 to v8i32 and
+  //             v4i32 to v4i64
+  //
+  // Divide input vector into two parts
+  // for v4i32 the shuffle mask will be { 0, 1, -1, -1} {2, 3, -1, -1}
+  // use vpmovsx instruction to extend v4i32 -> v2i64; v8i16 -> v4i32
+  // concat the vectors to original VT

+  EVT VT = N->getValueType(0);
+  SDValue Op = N->getOperand(0);
+  EVT OpVT = Op.getValueType();
+  DebugLoc dl = N->getDebugLoc();
+
+  if ((VT == MVT::v4i64 && OpVT == MVT::v4i32) ||
+      (VT == MVT::v8i32 && OpVT == MVT::v8i16)) {
+
+    unsigned NumElems = OpVT.getVectorNumElements();
+    SmallVector<int,8> ShufMask1(NumElems, -1);
+    for (unsigned i = 0; i < NumElems/2; i++) ShufMask1[i] = i;
+
+    SDValue OpLo = DAG.getVectorShuffle(OpVT, dl, Op, DAG.getUNDEF(OpVT),
+                                        ShufMask1.data());
+
+    SmallVector<int,8> ShufMask2(NumElems, -1);
+    for (unsigned i = 0; i < NumElems/2; i++) ShufMask2[i] = i + NumElems/2;
+
+    SDValue OpHi = DAG.getVectorShuffle(OpVT, dl, Op, DAG.getUNDEF(OpVT),
+                                        ShufMask2.data());
+
+    EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(),
+                                  VT.getVectorNumElements()/2);
+
+    OpLo = DAG.getNode(X86ISD::VSEXT_MOVL, dl, HalfVT, OpLo);
+    OpHi = DAG.getNode(X86ISD::VSEXT_MOVL, dl, HalfVT, OpHi);
+
+    return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
+  }
+  return SDValue();
+}
+
+static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG,
+                                  const X86Subtarget *Subtarget) {
   // (i32 zext (and (i8  x86isd::setcc_carry), 1)) ->
   //           (and (i32 x86isd::setcc_carry), 1)
   // This eliminates the zext. This transformation is necessary because
@@ -14360,6 +14687,8 @@ static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG,
   DebugLoc dl = N->getDebugLoc();
   SDValue N0 = N->getOperand(0);
   EVT VT = N->getValueType(0);
+  EVT OpVT = N0.getValueType();
+
   if (N0.getOpcode() == ISD::AND &&
       N0.hasOneUse() &&
       N0.getOperand(0).hasOneUse()) {
@@ -14374,6 +14703,37 @@ static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG,
                                    N00.getOperand(0), N00.getOperand(1)),
                        DAG.getConstant(1, VT));
   }
+  // Optimize vectors in AVX mode:
+  //
+  // v8i16 -> v8i32
+  // Use vpunpcklwd for 4 lower elements  v8i16 -> v4i32.
+  // Use vpunpckhwd for 4 upper elements  v8i16 -> v4i32.
+  // Concat upper and lower parts.
+  //
+  // v4i32 -> v4i64
+  // Use vpunpckldq for 4 lower elements  v4i32 -> v2i64.
+  // Use vpunpckhdq for 4 upper elements  v4i32 -> v2i64.
+  // Concat upper and lower parts.
+  //
+  if (Subtarget->hasAVX()) {
+
+    if (((VT == MVT::v8i32) && (OpVT == MVT::v8i16)) ||
+        ((VT == MVT::v4i64) && (OpVT == MVT::v4i32))) {
+
+      SDValue ZeroVec = getZeroVector(OpVT, Subtarget, DAG, dl);
+      SDValue OpLo = getTargetShuffleNode(X86ISD::UNPCKL, dl, OpVT, N0,
+                                          ZeroVec, DAG);
+      SDValue OpHi = getTargetShuffleNode(X86ISD::UNPCKH, dl, OpVT, N0,
+                                          ZeroVec, DAG);
+
+      EVT HVT = EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(),
+                                 VT.getVectorNumElements()/2);
+
+      OpLo = DAG.getNode(ISD::BITCAST, dl, HVT, OpLo);
+      OpHi = DAG.getNode(ISD::BITCAST, dl, HVT, OpHi);
+
+      return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
+    }
+  }
+
   return SDValue();
 }

@@ -14490,8 +14850,8 @@ static SDValue PerformAddCombine(SDNode *N, SelectionDAG &DAG,
   SDValue Op1 = N->getOperand(1);

   // Try to synthesize horizontal adds from adds of shuffles.
-  if (((Subtarget->hasSSSE3orAVX() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
-      (Subtarget->hasAVX2() && (VT == MVT::v16i16 || MVT::v8i32))) &&
+  if (((Subtarget->hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
+      (Subtarget->hasAVX2() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
      isHorizontalBinOp(Op0, Op1, true))
     return DAG.getNode(X86ISD::HADD, N->getDebugLoc(), VT, Op0, Op1);

@@ -14523,7 +14883,7 @@ static SDValue PerformSubCombine(SDNode *N, SelectionDAG &DAG,

   // Try to synthesize horizontal subs from subs of shuffles.
   EVT VT = N->getValueType(0);
-  if (((Subtarget->hasSSSE3orAVX() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
+  if (((Subtarget->hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
      (Subtarget->hasAVX2() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
      isHorizontalBinOp(Op0, Op1, true))
     return DAG.getNode(X86ISD::HSUB, N->getDebugLoc(), VT, Op0, Op1);
@@ -14539,7 +14899,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
   case ISD::EXTRACT_VECTOR_ELT:
     return PerformEXTRACT_VECTOR_ELTCombine(N, DAG, *this);
   case ISD::VSELECT:
-  case ISD::SELECT:         return PerformSELECTCombine(N, DAG, Subtarget);
+  case ISD::SELECT:         return PerformSELECTCombine(N, DAG, DCI, Subtarget);
   case X86ISD::CMOV:        return PerformCMOVCombine(N, DAG, DCI);
   case ISD::ADD:            return PerformAddCombine(N, DAG, Subtarget);
   case ISD::SUB:            return PerformSubCombine(N, DAG, Subtarget);
@@ -14547,7 +14907,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
   case ISD::MUL:            return PerformMulCombine(N, DAG, DCI);
   case ISD::SHL:
   case ISD::SRA:
-  case ISD::SRL:            return PerformShiftCombine(N, DAG, Subtarget);
+  case ISD::SRL:            return PerformShiftCombine(N, DAG, DCI, Subtarget);
   case ISD::AND:            return PerformAndCombine(N, DAG, DCI, Subtarget);
   case ISD::OR:             return PerformOrCombine(N, DAG, DCI, Subtarget);
   case ISD::XOR:            return PerformXorCombine(N, DAG, DCI, Subtarget);
@@ -14561,10 +14921,11 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
   case X86ISD::FAND:        return PerformFANDCombine(N, DAG);
   case X86ISD::BT:          return PerformBTCombine(N, DAG, DCI);
   case X86ISD::VZEXT_MOVL:  return PerformVZEXT_MOVLCombine(N, DAG);
-  case ISD::ZERO_EXTEND:    return PerformZExtCombine(N, DAG);
+  case ISD::ZERO_EXTEND:    return PerformZExtCombine(N, DAG, Subtarget);
+  case ISD::SIGN_EXTEND:    return PerformSExtCombine(N, DAG, DCI, Subtarget);
+  case ISD::TRUNCATE:       return PerformTruncateCombine(N, DAG, DCI);
   case X86ISD::SETCC:       return PerformSETCCCombine(N, DAG);
-  case X86ISD::SHUFPS:      // Handle all target specific shuffles
-  case X86ISD::SHUFPD:
+  case X86ISD::SHUFP:       // Handle all target specific shuffles
   case X86ISD::PALIGN:
   case X86ISD::UNPCKH:
   case X86ISD::UNPCKL:
@@ -14684,11 +15045,38 @@ bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
 // X86 Inline Assembly Support
 //===----------------------------------------------------------------------===//

+namespace {
+  // Helper to match a string against a sequence of whitespace-separated
+  // pieces.
+  bool matchAsmImpl(StringRef s, ArrayRef<const StringRef *> args) {
+    s = s.substr(s.find_first_not_of(" \t")); // Skip leading whitespace.
+
+    for (unsigned i = 0, e = args.size(); i != e; ++i) {
+      StringRef piece(*args[i]);
+      if (!s.startswith(piece)) // Check if the piece matches.
+        return false;
+
+      s = s.substr(piece.size());
+      StringRef::size_type pos = s.find_first_not_of(" \t");
+      if (pos == 0) // We matched a prefix.
+        return false;
+
+      s = s.substr(pos);
+    }
+
+    return s.empty();
+  }
+  const VariadicFunction1<bool, StringRef, StringRef, matchAsmImpl> matchAsm={};
+}
+
 bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
   InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());

   std::string AsmStr = IA->getAsmString();

+  IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
+  if (!Ty || Ty->getBitWidth() % 16 != 0)
+    return false;
+
   // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
   SmallVector<StringRef, 4> AsmPieces;
   SplitString(AsmStr, AsmPieces, ";\n");
@@ -14696,35 +15084,27 @@ bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
   switch (AsmPieces.size()) {
   default: return false;
   case 1:
-    AsmStr = AsmPieces[0];
-    AsmPieces.clear();
-    SplitString(AsmStr, AsmPieces, " \t");  // Split with whitespace.
-
     // FIXME: this should verify that we are targeting a 486 or better.  If not,
-    // we will turn this bswap into something that will be lowered to logical ops
-    // instead of emitting the bswap asm.  For now, we don't support 486 or lower
-    // so don't worry about this.
+    // we will turn this bswap into something that will be lowered to logical
+    // ops instead of emitting the bswap asm.  For now, we don't support 486 or
+    // lower so don't worry about this.
     // bswap $0
-    if (AsmPieces.size() == 2 &&
-        (AsmPieces[0] == "bswap" ||
-         AsmPieces[0] == "bswapq" ||
-         AsmPieces[0] == "bswapl") &&
-        (AsmPieces[1] == "$0" ||
-         AsmPieces[1] == "${0:q}")) {
+    if (matchAsm(AsmPieces[0], "bswap", "$0") ||
+        matchAsm(AsmPieces[0], "bswapl", "$0") ||
+        matchAsm(AsmPieces[0], "bswapq", "$0") ||
+        matchAsm(AsmPieces[0], "bswap", "${0:q}") ||
+        matchAsm(AsmPieces[0], "bswapl", "${0:q}") ||
+        matchAsm(AsmPieces[0], "bswapq", "${0:q}")) {
       // No need to check constraints, nothing other than the equivalent of
       // "=r,0" would be valid here.
-      IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
-      if (!Ty || Ty->getBitWidth() % 16 != 0)
-        return false;
       return IntrinsicLowering::LowerToByteSwap(CI);
     }
+
     // rorw $$8, ${0:w}  -->  llvm.bswap.i16
     if (CI->getType()->isIntegerTy(16) &&
-        AsmPieces.size() == 3 &&
-        (AsmPieces[0] == "rorw" || AsmPieces[0] == "rolw") &&
-        AsmPieces[1] == "$$8," &&
-        AsmPieces[2] == "${0:w}" &&
-        IA->getConstraintString().compare(0, 5, "=r,0,") == 0) {
+        IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
+        (matchAsm(AsmPieces[0], "rorw", "$$8,", "${0:w}") ||
+         matchAsm(AsmPieces[0], "rolw", "$$8,", "${0:w}"))) {
       AsmPieces.clear();
       const std::string &ConstraintsStr = IA->getConstraintString();
       SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
       std::sort(AsmPieces.begin(), AsmPieces.end());
       if (AsmPieces.size() == 4 &&
           AsmPieces[0] == "~{cc}" &&
           AsmPieces[1] == "~{dirflag}" &&
           AsmPieces[2] == "~{flags}" &&
-          AsmPieces[3] == "~{fpsr}") {
-        IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
-        if (!Ty || Ty->getBitWidth() % 16 != 0)
-          return false;
-        return IntrinsicLowering::LowerToByteSwap(CI);
-      }
+          AsmPieces[3] == "~{fpsr}")
+        return IntrinsicLowering::LowerToByteSwap(CI);
     }
     break;
   case 3:
     if (CI->getType()->isIntegerTy(32) &&
-        IA->getConstraintString().compare(0, 5, "=r,0,") == 0) {
-      SmallVector<StringRef, 4> Words;
-      SplitString(AsmPieces[0], Words, " \t,");
-      if (Words.size() == 3 && Words[0] == "rorw" && Words[1] == "$$8" &&
-          Words[2] == "${0:w}") {
-        Words.clear();
-        SplitString(AsmPieces[1], Words, " \t,");
-        if (Words.size() == 3 && Words[0] == "rorl" && Words[1] == "$$16" &&
-            Words[2] == "$0") {
-          Words.clear();
-          SplitString(AsmPieces[2], Words, " \t,");
-          if (Words.size() == 3 && Words[0] == "rorw" && Words[1] == "$$8" &&
-              Words[2] == "${0:w}") {
-            AsmPieces.clear();
-            const std::string &ConstraintsStr = IA->getConstraintString();
-            SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
-            std::sort(AsmPieces.begin(), AsmPieces.end());
-            if (AsmPieces.size() == 4 &&
-                AsmPieces[0] == "~{cc}" &&
-                AsmPieces[1] == "~{dirflag}" &&
-                AsmPieces[2] == "~{flags}" &&
-                AsmPieces[3] == "~{fpsr}") {
-              IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
-              if (!Ty || Ty->getBitWidth() % 16 != 0)
-                return false;
-              return IntrinsicLowering::LowerToByteSwap(CI);
-            }
-          }
-        }
-      }
+        IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
+        matchAsm(AsmPieces[0], "rorw", "$$8,", "${0:w}") &&
+        matchAsm(AsmPieces[1], "rorl", "$$16,", "$0") &&
+        matchAsm(AsmPieces[2], "rorw", "$$8,", "${0:w}")) {
+      AsmPieces.clear();
+      const std::string &ConstraintsStr = IA->getConstraintString();
+      SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
+      std::sort(AsmPieces.begin(), AsmPieces.end());
+      if (AsmPieces.size() == 4 &&
+          AsmPieces[0] == "~{cc}" &&
+          AsmPieces[1] == "~{dirflag}" &&
+          AsmPieces[2] == "~{flags}" &&
+          AsmPieces[3] == "~{fpsr}")
+        return IntrinsicLowering::LowerToByteSwap(CI);
     }

     if (CI->getType()->isIntegerTy(64)) {
@@ -14781,23 +15141,10 @@ bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
          Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
           Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
         // bswap %eax / bswap %edx / xchgl %eax, %edx  -> llvm.bswap.i64
-        SmallVector<StringRef, 4> Words;
-        SplitString(AsmPieces[0], Words, " \t");
-        if (Words.size() == 2 && Words[0] == "bswap" && Words[1] == "%eax") {
-          Words.clear();
-          SplitString(AsmPieces[1], Words, " \t");
-          if (Words.size() == 2 && Words[0] == "bswap" && Words[1] == "%edx") {
-            Words.clear();
-            SplitString(AsmPieces[2], Words, " \t,");
-            if (Words.size() == 3 && Words[0] == "xchgl" && Words[1] == "%eax" &&
-                Words[2] == "%edx") {
-              IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
-              if (!Ty || Ty->getBitWidth() % 16 != 0)
-                return false;
-              return IntrinsicLowering::LowerToByteSwap(CI);
-            }
-          }
-        }
+        if (matchAsm(AsmPieces[0], "bswap", "%eax") &&
+            matchAsm(AsmPieces[1], "bswap", "%edx") &&
+            matchAsm(AsmPieces[2], "xchgl", "%eax,", "%edx"))
+          return IntrinsicLowering::LowerToByteSwap(CI);
     }
   }
   break;
@@ -14892,7 +15239,8 @@ TargetLowering::ConstraintWeight
     break;
   case 'x':
   case 'Y':
-    if ((type->getPrimitiveSizeInBits() == 128) && Subtarget->hasXMM())
+    if (((type->getPrimitiveSizeInBits() == 128) && Subtarget->hasSSE1()) ||
+        ((type->getPrimitiveSizeInBits() == 256) && Subtarget->hasAVX()))
       weight = CW_Register;
     break;
   case 'I':
@@ -14962,9 +15310,9 @@ LowerXConstraint(EVT ConstraintVT) const {
   // FP X constraints get lowered to SSE1/2 registers if available, otherwise
   // 'f' like normal targets.
   if (ConstraintVT.isFloatingPoint()) {
-    if (Subtarget->hasXMMInt())
+    if (Subtarget->hasSSE2())
       return "Y";
-    if (Subtarget->hasXMM())
+    if (Subtarget->hasSSE1())
       return "x";
   }

@@ -15170,10 +15518,10 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
      if (!Subtarget->hasMMX()) break;
       return std::make_pair(0U, X86::VR64RegisterClass);
     case 'Y':   // SSE_REGS if SSE2 allowed
-      if (!Subtarget->hasXMMInt()) break;
+      if (!Subtarget->hasSSE2()) break;
       // FALL THROUGH.
-    case 'x':   // SSE_REGS if SSE1 allowed
-      if (!Subtarget->hasXMM()) break;
+    case 'x':   // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
+      if (!Subtarget->hasSSE1()) break;

       switch (VT.getSimpleVT().SimpleTy) {
       default: break;
@@ -15192,6 +15540,15 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
       case MVT::v4f32:
       case MVT::v2f64:
         return std::make_pair(0U, X86::VR128RegisterClass);
+      // AVX types.
+      case MVT::v32i8:
+      case MVT::v16i16:
+      case MVT::v8i32:
+      case MVT::v4i64:
+      case MVT::v8f32:
+      case MVT::v4f64:
+        return std::make_pair(0U, X86::VR256RegisterClass);
+
       }
       break;
     }