Diffstat (limited to 'lib/Target/X86/X86ISelLowering.cpp')
-rw-r--r--  lib/Target/X86/X86ISelLowering.cpp  2227
1 file changed, 1291 insertions(+), 936 deletions(-)
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 410cc95..03727a2 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -35,7 +35,6 @@ #include "llvm/CodeGen/MachineJumpTableInfo.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/PseudoSourceValue.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCExpr.h" @@ -257,7 +256,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) if (Subtarget->is64Bit()) { setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Promote); setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Expand); - } else if (!UseSoftFloat) { + } else if (!TM.Options.UseSoftFloat) { // We have an algorithm for SSE2->double, and we turn this into a // 64-bit FILD followed by conditional FADD for other targets. setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom); @@ -271,7 +270,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::SINT_TO_FP , MVT::i1 , Promote); setOperationAction(ISD::SINT_TO_FP , MVT::i8 , Promote); - if (!UseSoftFloat) { + if (!TM.Options.UseSoftFloat) { // SSE has no i16 to fp conversion, only i32 if (X86ScalarSSEf32) { setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote); @@ -314,7 +313,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) if (Subtarget->is64Bit()) { setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Expand); setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Promote); - } else if (!UseSoftFloat) { + } else if (!TM.Options.UseSoftFloat) { // Since AVX is a superset of SSE3, only check for SSE here. if (Subtarget->hasSSE1() && !Subtarget->hasSSE3()) // Expand FP_TO_UINT into a select. @@ -379,6 +378,10 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::FREM , MVT::f80 , Expand); setOperationAction(ISD::FLT_ROUNDS_ , MVT::i32 , Custom); + setOperationAction(ISD::CTTZ_ZERO_UNDEF , MVT::i8 , Expand); + setOperationAction(ISD::CTTZ_ZERO_UNDEF , MVT::i16 , Expand); + setOperationAction(ISD::CTTZ_ZERO_UNDEF , MVT::i32 , Expand); + setOperationAction(ISD::CTTZ_ZERO_UNDEF , MVT::i64 , Expand); if (Subtarget->hasBMI()) { setOperationAction(ISD::CTTZ , MVT::i8 , Promote); } else { @@ -389,6 +392,10 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::CTTZ , MVT::i64 , Custom); } + setOperationAction(ISD::CTLZ_ZERO_UNDEF , MVT::i8 , Expand); + setOperationAction(ISD::CTLZ_ZERO_UNDEF , MVT::i16 , Expand); + setOperationAction(ISD::CTLZ_ZERO_UNDEF , MVT::i32 , Expand); + setOperationAction(ISD::CTLZ_ZERO_UNDEF , MVT::i64 , Expand); if (Subtarget->hasLZCNT()) { setOperationAction(ISD::CTLZ , MVT::i8 , Promote); } else { @@ -538,14 +545,14 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) if (Subtarget->isTargetCOFF() && !Subtarget->isTargetEnvMacho()) setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ? MVT::i64 : MVT::i32, Custom); - else if (EnableSegmentedStacks) + else if (TM.Options.EnableSegmentedStacks) setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ? MVT::i64 : MVT::i32, Custom); else setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ? MVT::i64 : MVT::i32, Expand); - if (!UseSoftFloat && X86ScalarSSEf64) { + if (!TM.Options.UseSoftFloat && X86ScalarSSEf64) { // f32 and f64 use SSE. // Set up the FP register classes. 
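As an editorial aside on the CTTZ_ZERO_UNDEF/CTLZ_ZERO_UNDEF actions introduced in the hunk above, here is a minimal C++ sketch (not LLVM code; the helper names are made up) of the semantic difference the *_ZERO_UNDEF opcodes capture. Marking them Expand is always safe, because the fully defined CTTZ/CTLZ lowering is a conservative superset of the undefined-at-zero form, while BMI's TZCNT (and LZCNT) give the defined-at-zero behaviour directly.

    #include <cstdint>

    // Defined for every input, like ISD::CTTZ after lowering (TZCNT-style).
    inline unsigned cttz_defined(uint32_t x) {
      return x ? __builtin_ctz(x) : 32;   // GCC/Clang builtin
    }

    // Result unspecified for x == 0, like ISD::CTTZ_ZERO_UNDEF (BSF-style);
    // expanding it to the defined form above is always a correct fallback.
    inline unsigned cttz_zero_undef(uint32_t x) {
      return __builtin_ctz(x);            // caller must guarantee x != 0
    }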
addRegisterClass(MVT::f32, X86::FR32RegisterClass); @@ -577,7 +584,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) // cases we handle. addLegalFPImmediate(APFloat(+0.0)); // xorpd addLegalFPImmediate(APFloat(+0.0f)); // xorps - } else if (!UseSoftFloat && X86ScalarSSEf32) { + } else if (!TM.Options.UseSoftFloat && X86ScalarSSEf32) { // Use SSE for f32, x87 for f64. // Set up the FP register classes. addRegisterClass(MVT::f32, X86::FR32RegisterClass); @@ -606,11 +613,11 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS - if (!UnsafeFPMath) { + if (!TM.Options.UnsafeFPMath) { setOperationAction(ISD::FSIN , MVT::f64 , Expand); setOperationAction(ISD::FCOS , MVT::f64 , Expand); } - } else if (!UseSoftFloat) { + } else if (!TM.Options.UseSoftFloat) { // f32 and f64 in x87. // Set up the FP register classes. addRegisterClass(MVT::f64, X86::RFP64RegisterClass); @@ -621,7 +628,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand); setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand); - if (!UnsafeFPMath) { + if (!TM.Options.UnsafeFPMath) { setOperationAction(ISD::FSIN , MVT::f64 , Expand); setOperationAction(ISD::FCOS , MVT::f64 , Expand); } @@ -640,7 +647,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::FMA, MVT::f32, Expand); // Long double always uses X87. - if (!UseSoftFloat) { + if (!TM.Options.UseSoftFloat) { addRegisterClass(MVT::f80, X86::RFP80RegisterClass); setOperationAction(ISD::UNDEF, MVT::f80, Expand); setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand); @@ -659,11 +666,16 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) addLegalFPImmediate(TmpFlt2); // FLD1/FCHS } - if (!UnsafeFPMath) { + if (!TM.Options.UnsafeFPMath) { setOperationAction(ISD::FSIN , MVT::f80 , Expand); setOperationAction(ISD::FCOS , MVT::f80 , Expand); } + setOperationAction(ISD::FFLOOR, MVT::f80, Expand); + setOperationAction(ISD::FCEIL, MVT::f80, Expand); + setOperationAction(ISD::FTRUNC, MVT::f80, Expand); + setOperationAction(ISD::FRINT, MVT::f80, Expand); + setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand); setOperationAction(ISD::FMA, MVT::f80, Expand); } @@ -715,7 +727,9 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::FPOW, (MVT::SimpleValueType)VT, Expand); setOperationAction(ISD::CTPOP, (MVT::SimpleValueType)VT, Expand); setOperationAction(ISD::CTTZ, (MVT::SimpleValueType)VT, Expand); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, (MVT::SimpleValueType)VT, Expand); setOperationAction(ISD::CTLZ, (MVT::SimpleValueType)VT, Expand); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, (MVT::SimpleValueType)VT, Expand); setOperationAction(ISD::SHL, (MVT::SimpleValueType)VT, Expand); setOperationAction(ISD::SRA, (MVT::SimpleValueType)VT, Expand); setOperationAction(ISD::SRL, (MVT::SimpleValueType)VT, Expand); @@ -749,7 +763,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) // FIXME: In order to prevent SSE instructions being expanded to MMX ones // with -msoft-float, disable use of MMX as well. - if (!UseSoftFloat && Subtarget->hasMMX()) { + if (!TM.Options.UseSoftFloat && Subtarget->hasMMX()) { addRegisterClass(MVT::x86mmx, X86::VR64RegisterClass); // No operations on x86mmx supported, everything uses intrinsics. 
} @@ -786,7 +800,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::BITCAST, MVT::v2i32, Expand); setOperationAction(ISD::BITCAST, MVT::v1i64, Expand); - if (!UseSoftFloat && Subtarget->hasXMM()) { + if (!TM.Options.UseSoftFloat && Subtarget->hasXMM()) { addRegisterClass(MVT::v4f32, X86::VR128RegisterClass); setOperationAction(ISD::FADD, MVT::v4f32, Legal); @@ -803,7 +817,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::SETCC, MVT::v4f32, Custom); } - if (!UseSoftFloat && Subtarget->hasXMMInt()) { + if (!TM.Options.UseSoftFloat && Subtarget->hasXMMInt()) { addRegisterClass(MVT::v2f64, X86::VR128RegisterClass); // FIXME: Unfortunately -soft-float and -no-implicit-float means XMM @@ -909,7 +923,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal); } - if (Subtarget->hasSSE41() || Subtarget->hasAVX()) { + if (Subtarget->hasSSE41orAVX()) { setOperationAction(ISD::FFLOOR, MVT::f32, Legal); setOperationAction(ISD::FCEIL, MVT::f32, Legal); setOperationAction(ISD::FTRUNC, MVT::f32, Legal); @@ -924,10 +938,6 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) // FIXME: Do we need to handle scalar-to-vector here? setOperationAction(ISD::MUL, MVT::v4i32, Legal); - // Can turn SHL into an integer multiply. - setOperationAction(ISD::SHL, MVT::v4i32, Custom); - setOperationAction(ISD::SHL, MVT::v16i8, Custom); - setOperationAction(ISD::VSELECT, MVT::v2f64, Legal); setOperationAction(ISD::VSELECT, MVT::v2i64, Legal); setOperationAction(ISD::VSELECT, MVT::v16i8, Legal); @@ -948,30 +958,47 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom); + // FIXME: these should be Legal but thats only for the case where + // the index is constant. 
For now custom expand to deal with that if (Subtarget->is64Bit()) { - setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Legal); - setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Legal); + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom); } } if (Subtarget->hasXMMInt()) { - setOperationAction(ISD::SRL, MVT::v2i64, Custom); - setOperationAction(ISD::SRL, MVT::v4i32, Custom); - setOperationAction(ISD::SRL, MVT::v16i8, Custom); setOperationAction(ISD::SRL, MVT::v8i16, Custom); + setOperationAction(ISD::SRL, MVT::v16i8, Custom); - setOperationAction(ISD::SHL, MVT::v2i64, Custom); - setOperationAction(ISD::SHL, MVT::v4i32, Custom); setOperationAction(ISD::SHL, MVT::v8i16, Custom); + setOperationAction(ISD::SHL, MVT::v16i8, Custom); - setOperationAction(ISD::SRA, MVT::v4i32, Custom); setOperationAction(ISD::SRA, MVT::v8i16, Custom); + setOperationAction(ISD::SRA, MVT::v16i8, Custom); + + if (Subtarget->hasAVX2()) { + setOperationAction(ISD::SRL, MVT::v2i64, Legal); + setOperationAction(ISD::SRL, MVT::v4i32, Legal); + + setOperationAction(ISD::SHL, MVT::v2i64, Legal); + setOperationAction(ISD::SHL, MVT::v4i32, Legal); + + setOperationAction(ISD::SRA, MVT::v4i32, Legal); + } else { + setOperationAction(ISD::SRL, MVT::v2i64, Custom); + setOperationAction(ISD::SRL, MVT::v4i32, Custom); + + setOperationAction(ISD::SHL, MVT::v2i64, Custom); + setOperationAction(ISD::SHL, MVT::v4i32, Custom); + + setOperationAction(ISD::SRA, MVT::v4i32, Custom); + } } - if (Subtarget->hasSSE42() || Subtarget->hasAVX()) + if (Subtarget->hasSSE42orAVX()) setOperationAction(ISD::SETCC, MVT::v2i64, Custom); - if (!UseSoftFloat && Subtarget->hasAVX()) { + if (!TM.Options.UseSoftFloat && Subtarget->hasAVX()) { addRegisterClass(MVT::v32i8, X86::VR256RegisterClass); addRegisterClass(MVT::v16i16, X86::VR256RegisterClass); addRegisterClass(MVT::v8i32, X86::VR256RegisterClass); @@ -1008,18 +1035,14 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i8, Custom); setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i16, Custom); - setOperationAction(ISD::SRL, MVT::v4i64, Custom); - setOperationAction(ISD::SRL, MVT::v8i32, Custom); setOperationAction(ISD::SRL, MVT::v16i16, Custom); setOperationAction(ISD::SRL, MVT::v32i8, Custom); - setOperationAction(ISD::SHL, MVT::v4i64, Custom); - setOperationAction(ISD::SHL, MVT::v8i32, Custom); setOperationAction(ISD::SHL, MVT::v16i16, Custom); setOperationAction(ISD::SHL, MVT::v32i8, Custom); - setOperationAction(ISD::SRA, MVT::v8i32, Custom); setOperationAction(ISD::SRA, MVT::v16i16, Custom); + setOperationAction(ISD::SRA, MVT::v32i8, Custom); setOperationAction(ISD::SETCC, MVT::v32i8, Custom); setOperationAction(ISD::SETCC, MVT::v16i16, Custom); @@ -1030,25 +1053,60 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::SELECT, MVT::v4i64, Custom); setOperationAction(ISD::SELECT, MVT::v8f32, Custom); - setOperationAction(ISD::VSELECT, MVT::v4f64, Legal); - setOperationAction(ISD::VSELECT, MVT::v4i64, Legal); - setOperationAction(ISD::VSELECT, MVT::v8i32, Legal); - setOperationAction(ISD::VSELECT, MVT::v8f32, Legal); + setOperationAction(ISD::VSELECT, MVT::v4f64, Legal); + setOperationAction(ISD::VSELECT, MVT::v4i64, Legal); + setOperationAction(ISD::VSELECT, MVT::v8i32, Legal); + setOperationAction(ISD::VSELECT, MVT::v8f32, Legal); + + if (Subtarget->hasAVX2()) { + setOperationAction(ISD::ADD, MVT::v4i64, Legal); + 
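To illustrate why the v2i64/v4i32 shifts above flip from Custom to Legal when AVX2 is present (a hedged sketch, not the lowering itself): AVX2 adds per-element variable shifts such as VPSRLVD, while earlier SSE/AVX shift every lane by the same count, so non-uniform shift amounts must be custom-lowered there. Roughly, the AVX2 instruction behaves like:

    #include <array>
    #include <cstdint>

    // VPSRLVD-like behaviour: each lane is shifted by its own amount,
    // and amounts of 32 or more yield zero.
    std::array<uint32_t, 4> srl_v4i32(const std::array<uint32_t, 4> &v,
                                      const std::array<uint32_t, 4> &amt) {
      std::array<uint32_t, 4> r{};
      for (int i = 0; i != 4; ++i)
        r[i] = amt[i] < 32 ? v[i] >> amt[i] : 0;
      return r;
    }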
setOperationAction(ISD::ADD, MVT::v8i32, Legal); + setOperationAction(ISD::ADD, MVT::v16i16, Legal); + setOperationAction(ISD::ADD, MVT::v32i8, Legal); + + setOperationAction(ISD::SUB, MVT::v4i64, Legal); + setOperationAction(ISD::SUB, MVT::v8i32, Legal); + setOperationAction(ISD::SUB, MVT::v16i16, Legal); + setOperationAction(ISD::SUB, MVT::v32i8, Legal); + + setOperationAction(ISD::MUL, MVT::v4i64, Custom); + setOperationAction(ISD::MUL, MVT::v8i32, Legal); + setOperationAction(ISD::MUL, MVT::v16i16, Legal); + // Don't lower v32i8 because there is no 128-bit byte mul + + setOperationAction(ISD::VSELECT, MVT::v32i8, Legal); + + setOperationAction(ISD::SRL, MVT::v4i64, Legal); + setOperationAction(ISD::SRL, MVT::v8i32, Legal); + + setOperationAction(ISD::SHL, MVT::v4i64, Legal); + setOperationAction(ISD::SHL, MVT::v8i32, Legal); - setOperationAction(ISD::ADD, MVT::v4i64, Custom); - setOperationAction(ISD::ADD, MVT::v8i32, Custom); - setOperationAction(ISD::ADD, MVT::v16i16, Custom); - setOperationAction(ISD::ADD, MVT::v32i8, Custom); + setOperationAction(ISD::SRA, MVT::v8i32, Legal); + } else { + setOperationAction(ISD::ADD, MVT::v4i64, Custom); + setOperationAction(ISD::ADD, MVT::v8i32, Custom); + setOperationAction(ISD::ADD, MVT::v16i16, Custom); + setOperationAction(ISD::ADD, MVT::v32i8, Custom); + + setOperationAction(ISD::SUB, MVT::v4i64, Custom); + setOperationAction(ISD::SUB, MVT::v8i32, Custom); + setOperationAction(ISD::SUB, MVT::v16i16, Custom); + setOperationAction(ISD::SUB, MVT::v32i8, Custom); + + setOperationAction(ISD::MUL, MVT::v4i64, Custom); + setOperationAction(ISD::MUL, MVT::v8i32, Custom); + setOperationAction(ISD::MUL, MVT::v16i16, Custom); + // Don't lower v32i8 because there is no 128-bit byte mul - setOperationAction(ISD::SUB, MVT::v4i64, Custom); - setOperationAction(ISD::SUB, MVT::v8i32, Custom); - setOperationAction(ISD::SUB, MVT::v16i16, Custom); - setOperationAction(ISD::SUB, MVT::v32i8, Custom); + setOperationAction(ISD::SRL, MVT::v4i64, Custom); + setOperationAction(ISD::SRL, MVT::v8i32, Custom); - setOperationAction(ISD::MUL, MVT::v4i64, Custom); - setOperationAction(ISD::MUL, MVT::v8i32, Custom); - setOperationAction(ISD::MUL, MVT::v16i16, Custom); - // Don't lower v32i8 because there is no 128-bit byte mul + setOperationAction(ISD::SHL, MVT::v4i64, Custom); + setOperationAction(ISD::SHL, MVT::v8i32, Custom); + + setOperationAction(ISD::SRA, MVT::v8i32, Custom); + } // Custom lower several nodes for 256-bit types. for (unsigned i = (unsigned)MVT::FIRST_VECTOR_VALUETYPE; @@ -1155,6 +1213,8 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setTargetDAGCombine(ISD::SINT_TO_FP); if (Subtarget->is64Bit()) setTargetDAGCombine(ISD::MUL); + if (Subtarget->hasBMI()) + setTargetDAGCombine(ISD::XOR); computeRegisterProperties(); @@ -1166,10 +1226,10 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) maxStoresPerMemcpyOptSize = Subtarget->isTargetDarwin() ? 8 : 4; maxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores maxStoresPerMemmoveOptSize = Subtarget->isTargetDarwin() ? 8 : 4; - setPrefLoopAlignment(16); + setPrefLoopAlignment(4); // 2^4 bytes. benefitFromCodePlacementOpt = true; - setPrefFunctionAlignment(4); + setPrefFunctionAlignment(4); // 2^4 bytes. } @@ -1230,7 +1290,7 @@ unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty) const { /// alignment can satisfy any constraint. 
Similarly if SrcAlign is zero it /// means there isn't a need to check it against alignment requirement, /// probably because the source does not need to be loaded. If -/// 'NonScalarIntSafe' is true, that means it's safe to return a +/// 'IsZeroVal' is true, that means it's safe to return a /// non-scalar-integer type, e.g. empty string source, constant, or loaded /// from memory. 'MemcpyStrSrc' indicates whether the memcpy source is /// constant so it does not need to be loaded. @@ -1239,14 +1299,14 @@ unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty) const { EVT X86TargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign, unsigned SrcAlign, - bool NonScalarIntSafe, + bool IsZeroVal, bool MemcpyStrSrc, MachineFunction &MF) const { // FIXME: This turns off use of xmm stores for memset/memcpy on targets like // linux. This is because the stack realignment code can't handle certain // cases like PR2962. This should be removed when PR2962 is fixed. const Function *F = MF.getFunction(); - if (NonScalarIntSafe && + if (IsZeroVal && !F->hasFnAttr(Attribute::NoImplicitFloat)) { if (Size >= 16 && (Subtarget->isUnalignedMemAccessFast() || @@ -1664,7 +1724,8 @@ bool X86TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const { /// FuncIsMadeTailCallSafe - Return true if the function is being made into /// a tailcall target by changing its ABI. -static bool FuncIsMadeTailCallSafe(CallingConv::ID CC) { +static bool FuncIsMadeTailCallSafe(CallingConv::ID CC, + bool GuaranteedTailCallOpt) { return GuaranteedTailCallOpt && IsTailCallConvention(CC); } @@ -1678,7 +1739,8 @@ X86TargetLowering::LowerMemArgument(SDValue Chain, unsigned i) const { // Create the nodes corresponding to a load from this parameter slot. ISD::ArgFlagsTy Flags = Ins[i].Flags; - bool AlwaysUseMutable = FuncIsMadeTailCallSafe(CallConv); + bool AlwaysUseMutable = FuncIsMadeTailCallSafe(CallConv, + getTargetMachine().Options.GuaranteedTailCallOpt); bool isImmutable = !AlwaysUseMutable && !Flags.isByVal(); EVT ValVT; @@ -1704,7 +1766,7 @@ X86TargetLowering::LowerMemArgument(SDValue Chain, SDValue FIN = DAG.getFrameIndex(FI, getPointerTy()); return DAG.getLoad(ValVT, dl, Chain, FIN, MachinePointerInfo::getFixedStack(FI), - false, false, 0); + false, false, false, 0); } } @@ -1807,7 +1869,7 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, // If value is passed via pointer - do a load. if (VA.getLocInfo() == CCValAssign::Indirect) ArgValue = DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, - MachinePointerInfo(), false, false, 0); + MachinePointerInfo(), false, false, false, 0); InVals.push_back(ArgValue); } @@ -1828,7 +1890,8 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, unsigned StackSize = CCInfo.getNextStackOffset(); // Align stack specially for tail calls. 
- if (FuncIsMadeTailCallSafe(CallConv)) + if (FuncIsMadeTailCallSafe(CallConv, + MF.getTarget().Options.GuaranteedTailCallOpt)) StackSize = GetAlignedArgumentStackSize(StackSize, DAG); // If the function takes variable number of arguments, make a frame index for @@ -1873,9 +1936,11 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, bool NoImplicitFloatOps = Fn->hasFnAttr(Attribute::NoImplicitFloat); assert(!(NumXMMRegs && !Subtarget->hasXMM()) && "SSE register cannot be used when SSE is disabled!"); - assert(!(NumXMMRegs && UseSoftFloat && NoImplicitFloatOps) && + assert(!(NumXMMRegs && MF.getTarget().Options.UseSoftFloat && + NoImplicitFloatOps) && "SSE register cannot be used when SSE is disabled!"); - if (UseSoftFloat || NoImplicitFloatOps || !Subtarget->hasXMM()) + if (MF.getTarget().Options.UseSoftFloat || NoImplicitFloatOps || + !Subtarget->hasXMM()) // Kernel mode asks for SSE to be disabled, so don't push them // on the stack. TotalNumXMMRegs = 0; @@ -1953,7 +2018,8 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, } // Some CCs need callee pop. - if (X86::isCalleePop(CallConv, Is64Bit, isVarArg, GuaranteedTailCallOpt)) { + if (X86::isCalleePop(CallConv, Is64Bit, isVarArg, + MF.getTarget().Options.GuaranteedTailCallOpt)) { FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything. } else { FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing. @@ -2006,7 +2072,7 @@ X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG, // Load the "old" Return address. OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo(), - false, false, 0); + false, false, false, 0); return SDValue(OutRetAddr.getNode(), 1); } @@ -2053,7 +2119,7 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee, // Sibcalls are automatically detected tailcalls which do not require // ABI changes. - if (!GuaranteedTailCallOpt && isTailCall) + if (!MF.getTarget().Options.GuaranteedTailCallOpt && isTailCall) IsSibcall = true; if (isTailCall) @@ -2081,7 +2147,8 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee, // This is a sibcall. The memory operands are available in caller's // own caller's stack. NumBytes = 0; - else if (GuaranteedTailCallOpt && IsTailCallConvention(CallConv)) + else if (getTargetMachine().Options.GuaranteedTailCallOpt && + IsTailCallConvention(CallConv)) NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG); int FPDiff = 0; @@ -2260,7 +2327,7 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee, int FI = 0; // Do not flag preceding copytoreg stuff together with the following stuff. InFlag = SDValue(); - if (GuaranteedTailCallOpt) { + if (getTargetMachine().Options.GuaranteedTailCallOpt) { for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { CCValAssign &VA = ArgLocs[i]; if (VA.isRegLoc()) @@ -2368,7 +2435,7 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee, if (ExtraLoad) Callee = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Callee, MachinePointerInfo::getGOT(), - false, false, 0); + false, false, false, 0); } } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) { unsigned char OpFlags = 0; @@ -2440,7 +2507,8 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee, // Create the CALLSEQ_END node. 
unsigned NumBytesForCalleeToPush; - if (X86::isCalleePop(CallConv, Is64Bit, isVarArg, GuaranteedTailCallOpt)) + if (X86::isCalleePop(CallConv, Is64Bit, isVarArg, + getTargetMachine().Options.GuaranteedTailCallOpt)) NumBytesForCalleeToPush = NumBytes; // Callee pops everything else if (!Is64Bit && !IsTailCallConvention(CallConv) && IsStructRet) // If this is a call to a struct-return function, the callee @@ -2598,7 +2666,7 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, CallingConv::ID CallerCC = CallerF->getCallingConv(); bool CCMatch = CallerCC == CalleeCC; - if (GuaranteedTailCallOpt) { + if (getTargetMachine().Options.GuaranteedTailCallOpt) { if (IsTailCallConvention(CalleeCC) && CCMatch) return true; return false; @@ -2798,27 +2866,10 @@ static bool isTargetShuffle(unsigned Opcode) { case X86ISD::MOVDDUP: case X86ISD::MOVSS: case X86ISD::MOVSD: - case X86ISD::UNPCKLPS: - case X86ISD::UNPCKLPD: - case X86ISD::VUNPCKLPSY: - case X86ISD::VUNPCKLPDY: - case X86ISD::PUNPCKLWD: - case X86ISD::PUNPCKLBW: - case X86ISD::PUNPCKLDQ: - case X86ISD::PUNPCKLQDQ: - case X86ISD::UNPCKHPS: - case X86ISD::UNPCKHPD: - case X86ISD::VUNPCKHPSY: - case X86ISD::VUNPCKHPDY: - case X86ISD::PUNPCKHWD: - case X86ISD::PUNPCKHBW: - case X86ISD::PUNPCKHDQ: - case X86ISD::PUNPCKHQDQ: - case X86ISD::VPERMILPS: - case X86ISD::VPERMILPSY: - case X86ISD::VPERMILPD: - case X86ISD::VPERMILPDY: - case X86ISD::VPERM2F128: + case X86ISD::UNPCKL: + case X86ISD::UNPCKH: + case X86ISD::VPERMILP: + case X86ISD::VPERM2X128: return true; } return false; @@ -2844,10 +2895,7 @@ static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT, case X86ISD::PSHUFD: case X86ISD::PSHUFHW: case X86ISD::PSHUFLW: - case X86ISD::VPERMILPS: - case X86ISD::VPERMILPSY: - case X86ISD::VPERMILPD: - case X86ISD::VPERMILPDY: + case X86ISD::VPERMILP: return DAG.getNode(Opc, dl, VT, V1, DAG.getConstant(TargetMask, MVT::i8)); } @@ -2861,7 +2909,7 @@ static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT, case X86ISD::PALIGN: case X86ISD::SHUFPD: case X86ISD::SHUFPS: - case X86ISD::VPERM2F128: + case X86ISD::VPERM2X128: return DAG.getNode(Opc, dl, VT, V1, V2, DAG.getConstant(TargetMask, MVT::i8)); } @@ -2879,22 +2927,8 @@ static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT, case X86ISD::MOVLPD: case X86ISD::MOVSS: case X86ISD::MOVSD: - case X86ISD::UNPCKLPS: - case X86ISD::UNPCKLPD: - case X86ISD::VUNPCKLPSY: - case X86ISD::VUNPCKLPDY: - case X86ISD::PUNPCKLWD: - case X86ISD::PUNPCKLBW: - case X86ISD::PUNPCKLDQ: - case X86ISD::PUNPCKLQDQ: - case X86ISD::UNPCKHPS: - case X86ISD::UNPCKHPD: - case X86ISD::VUNPCKHPSY: - case X86ISD::VUNPCKHPDY: - case X86ISD::PUNPCKHWD: - case X86ISD::PUNPCKHBW: - case X86ISD::PUNPCKHDQ: - case X86ISD::PUNPCKHQDQ: + case X86ISD::UNPCKL: + case X86ISD::UNPCKH: return DAG.getNode(Opc, dl, VT, V1, V2); } return SDValue(); @@ -3194,7 +3228,7 @@ bool X86::isPSHUFLWMask(ShuffleVectorSDNode *N) { static bool isPALIGNRMask(const SmallVectorImpl<int> &Mask, EVT VT, bool hasSSSE3OrAVX) { int i, e = VT.getVectorNumElements(); - if (VT.getSizeInBits() != 128 && VT.getSizeInBits() != 64) + if (VT.getSizeInBits() != 128) return false; // Do not handle v2i64 / v2f64 shuffles with palignr. 
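Since the hunk above tightens isPALIGNRMask to 128-bit vectors only, a rough model of the PALIGNR operation being matched may help (an illustrative sketch with made-up names; which input supplies which half depends on the operand order the lowering finally picks): the two 16-byte sources are concatenated and a byte-aligned 16-byte window is extracted at the immediate's offset, shifting in zeros past the end.

    #include <array>
    #include <cstdint>

    std::array<uint8_t, 16> palignr_model(const std::array<uint8_t, 16> &lo,
                                          const std::array<uint8_t, 16> &hi,
                                          unsigned imm) {
      std::array<uint8_t, 32> cat{};
      for (int i = 0; i != 16; ++i) { cat[i] = lo[i]; cat[i + 16] = hi[i]; }
      std::array<uint8_t, 16> r{};
      for (int i = 0; i != 16; ++i)
        r[i] = (imm + i < 32) ? cat[imm + i] : 0;
      return r;
    }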
@@ -3224,17 +3258,17 @@ static bool isPALIGNRMask(const SmallVectorImpl<int> &Mask, EVT VT, return true; } -/// isVSHUFPSYMask - Return true if the specified VECTOR_SHUFFLE operand +/// isVSHUFPYMask - Return true if the specified VECTOR_SHUFFLE operand /// specifies a shuffle of elements that is suitable for input to 256-bit /// VSHUFPSY. -static bool isVSHUFPSYMask(const SmallVectorImpl<int> &Mask, EVT VT, - const X86Subtarget *Subtarget) { +static bool isVSHUFPYMask(const SmallVectorImpl<int> &Mask, EVT VT, + bool HasAVX, bool Commuted = false) { int NumElems = VT.getVectorNumElements(); - if (!Subtarget->hasAVX() || VT.getSizeInBits() != 256) + if (!HasAVX || VT.getSizeInBits() != 256) return false; - if (NumElems != 8) + if (NumElems != 4 && NumElems != 8) return false; // VSHUFPSY divides the resulting vector into 4 chunks. @@ -3247,134 +3281,90 @@ static bool isVSHUFPSYMask(const SmallVectorImpl<int> &Mask, EVT VT, // DST => Y7..Y4, Y7..Y4, X7..X4, X7..X4, // Y3..Y0, Y3..Y0, X3..X0, X3..X0 // - int QuarterSize = NumElems/4; - int HalfSize = QuarterSize*2; - for (int i = 0; i < QuarterSize; ++i) - if (!isUndefOrInRange(Mask[i], 0, HalfSize)) - return false; - for (int i = QuarterSize; i < QuarterSize*2; ++i) - if (!isUndefOrInRange(Mask[i], NumElems, NumElems+HalfSize)) - return false; - - // The mask of the second half must be the same as the first but with - // the appropriate offsets. This works in the same way as VPERMILPS - // works with masks. - for (int i = QuarterSize*2; i < QuarterSize*3; ++i) { - if (!isUndefOrInRange(Mask[i], HalfSize, NumElems)) - return false; - int FstHalfIdx = i-HalfSize; - if (Mask[FstHalfIdx] < 0) - continue; - if (!isUndefOrEqual(Mask[i], Mask[FstHalfIdx]+HalfSize)) - return false; - } - for (int i = QuarterSize*3; i < NumElems; ++i) { - if (!isUndefOrInRange(Mask[i], NumElems+HalfSize, NumElems*2)) - return false; - int FstHalfIdx = i-HalfSize; - if (Mask[FstHalfIdx] < 0) - continue; - if (!isUndefOrEqual(Mask[i], Mask[FstHalfIdx]+HalfSize)) - return false; - - } - - return true; -} - -/// getShuffleVSHUFPSYImmediate - Return the appropriate immediate to shuffle -/// the specified VECTOR_MASK mask with VSHUFPSY instruction. -static unsigned getShuffleVSHUFPSYImmediate(SDNode *N) { - ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); - EVT VT = SVOp->getValueType(0); - int NumElems = VT.getVectorNumElements(); - - assert(NumElems == 8 && VT.getSizeInBits() == 256 && - "Only supports v8i32 and v8f32 types"); - - int HalfSize = NumElems/2; - unsigned Mask = 0; - for (int i = 0; i != NumElems ; ++i) { - if (SVOp->getMaskElt(i) < 0) - continue; - // The mask of the first half must be equal to the second one. - unsigned Shamt = (i%HalfSize)*2; - unsigned Elt = SVOp->getMaskElt(i) % HalfSize; - Mask |= Elt << Shamt; - } - - return Mask; -} - -/// isVSHUFPDYMask - Return true if the specified VECTOR_SHUFFLE operand -/// specifies a shuffle of elements that is suitable for input to 256-bit -/// VSHUFPDY. This shuffle doesn't have the same restriction as the PS -/// version and the mask of the second half isn't binded with the first -/// one. -static bool isVSHUFPDYMask(const SmallVectorImpl<int> &Mask, EVT VT, - const X86Subtarget *Subtarget) { - int NumElems = VT.getVectorNumElements(); - - if (!Subtarget->hasAVX() || VT.getSizeInBits() != 256) - return false; - - if (NumElems != 4) - return false; - - // VSHUFPSY divides the resulting vector into 4 chunks. + // VSHUFPDY divides the resulting vector into 4 chunks. 
// The sources are also splitted into 4 chunks, and each destination // chunk must come from a different source chunk. // // SRC1 => X3 X2 X1 X0 // SRC2 => Y3 Y2 Y1 Y0 // - // DST => Y2..Y3, X2..X3, Y1..Y0, X1..X0 + // DST => Y3..Y2, X3..X2, Y1..Y0, X1..X0 // - int QuarterSize = NumElems/4; - int HalfSize = QuarterSize*2; - for (int i = 0; i < QuarterSize; ++i) - if (!isUndefOrInRange(Mask[i], 0, HalfSize)) - return false; - for (int i = QuarterSize; i < QuarterSize*2; ++i) - if (!isUndefOrInRange(Mask[i], NumElems, NumElems+HalfSize)) - return false; - for (int i = QuarterSize*2; i < QuarterSize*3; ++i) - if (!isUndefOrInRange(Mask[i], HalfSize, NumElems)) - return false; - for (int i = QuarterSize*3; i < NumElems; ++i) - if (!isUndefOrInRange(Mask[i], NumElems+HalfSize, NumElems*2)) - return false; + unsigned QuarterSize = NumElems/4; + unsigned HalfSize = QuarterSize*2; + for (unsigned l = 0; l != 2; ++l) { + unsigned LaneStart = l*HalfSize; + for (unsigned s = 0; s != 2; ++s) { + unsigned QuarterStart = s*QuarterSize; + unsigned Src = (Commuted) ? (1-s) : s; + unsigned SrcStart = Src*NumElems + LaneStart; + for (unsigned i = 0; i != QuarterSize; ++i) { + int Idx = Mask[i+QuarterStart+LaneStart]; + if (!isUndefOrInRange(Idx, SrcStart, SrcStart+HalfSize)) + return false; + // For VSHUFPSY, the mask of the second half must be the same as the first + // but with the appropriate offsets. This works in the same way as + // VPERMILPS works with masks. + if (NumElems == 4 || l == 0 || Mask[i+QuarterStart] < 0) + continue; + if (!isUndefOrEqual(Idx, Mask[i+QuarterStart]+HalfSize)) + return false; + } + } + } return true; } -/// getShuffleVSHUFPDYImmediate - Return the appropriate immediate to shuffle -/// the specified VECTOR_MASK mask with VSHUFPDY instruction. -static unsigned getShuffleVSHUFPDYImmediate(SDNode *N) { +/// getShuffleVSHUFPYImmediate - Return the appropriate immediate to shuffle +/// the specified VECTOR_MASK mask with VSHUFPSY/VSHUFPDY instructions. +static unsigned getShuffleVSHUFPYImmediate(SDNode *N) { ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); EVT VT = SVOp->getValueType(0); int NumElems = VT.getVectorNumElements(); - assert(NumElems == 4 && VT.getSizeInBits() == 256 && - "Only supports v4i64 and v4f64 types"); + assert(VT.getSizeInBits() == 256 && "Only supports 256-bit types"); + assert((NumElems == 4 || NumElems == 8) && "Only supports v4 and v8 types"); int HalfSize = NumElems/2; + unsigned Mul = (NumElems == 8) ? 2 : 1; unsigned Mask = 0; - for (int i = 0; i != NumElems ; ++i) { - if (SVOp->getMaskElt(i) < 0) + for (int i = 0; i != NumElems; ++i) { + int Elt = SVOp->getMaskElt(i); + if (Elt < 0) continue; - int Elt = SVOp->getMaskElt(i) % HalfSize; - Mask |= Elt << i; + Elt %= HalfSize; + unsigned Shamt = i; + // For VSHUFPSY, the mask of the first half must be equal to the second one. + if (NumElems == 8) Shamt %= HalfSize; + Mask |= Elt << (Shamt*Mul); } return Mask; } +/// CommuteVectorShuffleMask - Change values in a shuffle permute mask assuming +/// the two vector operands have swapped position. +static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask, + unsigned NumElems) { + for (unsigned i = 0; i != NumElems; ++i) { + int idx = Mask[i]; + if (idx < 0) + continue; + else if (idx < (int)NumElems) + Mask[i] = idx + NumElems; + else + Mask[i] = idx - NumElems; + } +} + /// isSHUFPMask - Return true if the specified VECTOR_SHUFFLE operand /// specifies a shuffle of elements that is suitable for input to 128-bit -/// SHUFPS and SHUFPD. 
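A minimal sketch of the mask commutation that the new Commuted parameter relies on (it mirrors the CommuteVectorShuffleMask helper added just above; the names here are illustrative): once the two shuffle inputs swap places, indices into the first operand are rebased into the second and vice versa, while undef (negative) entries stay untouched.

    #include <vector>

    void commuteShuffleMask(std::vector<int> &Mask, unsigned NumElems) {
      for (int &Idx : Mask) {
        if (Idx < 0)
          continue;                              // undef lane stays undef
        Idx = Idx < (int)NumElems ? Idx + (int)NumElems
                                  : Idx - (int)NumElems;
      }
    }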
-static bool isSHUFPMask(const SmallVectorImpl<int> &Mask, EVT VT) { - int NumElems = VT.getVectorNumElements(); +/// SHUFPS and SHUFPD. If Commuted is true, then it checks for sources to be +/// reverse of what x86 shuffles want. +static bool isSHUFPMask(const SmallVectorImpl<int> &Mask, EVT VT, + bool Commuted = false) { + unsigned NumElems = VT.getVectorNumElements(); if (VT.getSizeInBits() != 128) return false; @@ -3382,12 +3372,14 @@ static bool isSHUFPMask(const SmallVectorImpl<int> &Mask, EVT VT) { if (NumElems != 2 && NumElems != 4) return false; - int Half = NumElems / 2; - for (int i = 0; i < Half; ++i) - if (!isUndefOrInRange(Mask[i], 0, NumElems)) + unsigned Half = NumElems / 2; + unsigned SrcStart = Commuted ? NumElems : 0; + for (unsigned i = 0; i != Half; ++i) + if (!isUndefOrInRange(Mask[i], SrcStart, SrcStart+NumElems)) return false; - for (int i = Half; i < NumElems; ++i) - if (!isUndefOrInRange(Mask[i], NumElems, NumElems*2)) + SrcStart = Commuted ? 0 : NumElems; + for (unsigned i = Half; i != NumElems; ++i) + if (!isUndefOrInRange(Mask[i], SrcStart, SrcStart+NumElems)) return false; return true; @@ -3399,32 +3391,6 @@ bool X86::isSHUFPMask(ShuffleVectorSDNode *N) { return ::isSHUFPMask(M, N->getValueType(0)); } -/// isCommutedSHUFP - Returns true if the shuffle mask is exactly -/// the reverse of what x86 shuffles want. x86 shuffles requires the lower -/// half elements to come from vector 1 (which would equal the dest.) and -/// the upper half to come from vector 2. -static bool isCommutedSHUFPMask(const SmallVectorImpl<int> &Mask, EVT VT) { - int NumElems = VT.getVectorNumElements(); - - if (NumElems != 2 && NumElems != 4) - return false; - - int Half = NumElems / 2; - for (int i = 0; i < Half; ++i) - if (!isUndefOrInRange(Mask[i], NumElems, NumElems*2)) - return false; - for (int i = Half; i < NumElems; ++i) - if (!isUndefOrInRange(Mask[i], 0, NumElems)) - return false; - return true; -} - -static bool isCommutedSHUFP(ShuffleVectorSDNode *N) { - SmallVector<int, 8> M; - N->getMask(M); - return isCommutedSHUFPMask(M, N->getValueType(0)); -} - /// isMOVHLPSMask - Return true if the specified VECTOR_SHUFFLE operand /// specifies a shuffle of elements that is suitable for input to MOVHLPS. bool X86::isMOVHLPSMask(ShuffleVectorSDNode *N) { @@ -3505,13 +3471,14 @@ bool X86::isMOVLHPSMask(ShuffleVectorSDNode *N) { /// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand /// specifies a shuffle of elements that is suitable for input to UNPCKL. static bool isUNPCKLMask(const SmallVectorImpl<int> &Mask, EVT VT, - bool V2IsSplat = false) { + bool HasAVX2, bool V2IsSplat = false) { int NumElts = VT.getVectorNumElements(); assert((VT.is128BitVector() || VT.is256BitVector()) && "Unsupported vector type for unpckh"); - if (VT.getSizeInBits() == 256 && NumElts != 4 && NumElts != 8) + if (VT.getSizeInBits() == 256 && NumElts != 4 && NumElts != 8 && + (!HasAVX2 || (NumElts != 16 && NumElts != 32))) return false; // Handle 128 and 256-bit vector lengths. 
AVX defines UNPCK* to operate @@ -3545,22 +3512,23 @@ static bool isUNPCKLMask(const SmallVectorImpl<int> &Mask, EVT VT, return true; } -bool X86::isUNPCKLMask(ShuffleVectorSDNode *N, bool V2IsSplat) { +bool X86::isUNPCKLMask(ShuffleVectorSDNode *N, bool HasAVX2, bool V2IsSplat) { SmallVector<int, 8> M; N->getMask(M); - return ::isUNPCKLMask(M, N->getValueType(0), V2IsSplat); + return ::isUNPCKLMask(M, N->getValueType(0), HasAVX2, V2IsSplat); } /// isUNPCKHMask - Return true if the specified VECTOR_SHUFFLE operand /// specifies a shuffle of elements that is suitable for input to UNPCKH. static bool isUNPCKHMask(const SmallVectorImpl<int> &Mask, EVT VT, - bool V2IsSplat = false) { + bool HasAVX2, bool V2IsSplat = false) { int NumElts = VT.getVectorNumElements(); assert((VT.is128BitVector() || VT.is256BitVector()) && "Unsupported vector type for unpckh"); - if (VT.getSizeInBits() == 256 && NumElts != 4 && NumElts != 8) + if (VT.getSizeInBits() == 256 && NumElts != 4 && NumElts != 8 && + (!HasAVX2 || (NumElts != 16 && NumElts != 32))) return false; // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate @@ -3592,10 +3560,10 @@ static bool isUNPCKHMask(const SmallVectorImpl<int> &Mask, EVT VT, return true; } -bool X86::isUNPCKHMask(ShuffleVectorSDNode *N, bool V2IsSplat) { +bool X86::isUNPCKHMask(ShuffleVectorSDNode *N, bool HasAVX2, bool V2IsSplat) { SmallVector<int, 8> M; N->getMask(M); - return ::isUNPCKHMask(M, N->getValueType(0), V2IsSplat); + return ::isUNPCKHMask(M, N->getValueType(0), HasAVX2, V2IsSplat); } /// isUNPCKL_v_undef_Mask - Special case of isUNPCKLMask for canonical form @@ -3691,15 +3659,15 @@ bool X86::isMOVLMask(ShuffleVectorSDNode *N) { return ::isMOVLMask(M, N->getValueType(0)); } -/// isVPERM2F128Mask - Match 256-bit shuffles where the elements are considered +/// isVPERM2X128Mask - Match 256-bit shuffles where the elements are considered /// as permutations between 128-bit chunks or halves. As an example: this /// shuffle bellow: /// vector_shuffle <4, 5, 6, 7, 12, 13, 14, 15> /// The first half comes from the second half of V1 and the second half from the /// the second half of V2. -static bool isVPERM2F128Mask(const SmallVectorImpl<int> &Mask, EVT VT, - const X86Subtarget *Subtarget) { - if (!Subtarget->hasAVX() || VT.getSizeInBits() != 256) +static bool isVPERM2X128Mask(const SmallVectorImpl<int> &Mask, EVT VT, + bool HasAVX) { + if (!HasAVX || VT.getSizeInBits() != 256) return false; // The shuffle result is divided into half A and half B. In total the two @@ -3727,10 +3695,9 @@ static bool isVPERM2F128Mask(const SmallVectorImpl<int> &Mask, EVT VT, return MatchA && MatchB; } -/// getShuffleVPERM2F128Immediate - Return the appropriate immediate to shuffle -/// the specified VECTOR_MASK mask with VPERM2F128 instructions. -static unsigned getShuffleVPERM2F128Immediate(SDNode *N) { - ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); +/// getShuffleVPERM2X128Immediate - Return the appropriate immediate to shuffle +/// the specified VECTOR_MASK mask with VPERM2F128/VPERM2I128 instructions. 
+static unsigned getShuffleVPERM2X128Immediate(ShuffleVectorSDNode *SVOp) { EVT VT = SVOp->getValueType(0); int HalfSize = VT.getVectorNumElements()/2; @@ -3752,81 +3719,47 @@ static unsigned getShuffleVPERM2F128Immediate(SDNode *N) { return (FstHalf | (SndHalf << 4)); } -/// isVPERMILPDMask - Return true if the specified VECTOR_SHUFFLE operand +/// isVPERMILPMask - Return true if the specified VECTOR_SHUFFLE operand /// specifies a shuffle of elements that is suitable for input to VPERMILPD*. /// Note that VPERMIL mask matching is different depending whether theunderlying /// type is 32 or 64. In the VPERMILPS the high half of the mask should point /// to the same elements of the low, but to the higher half of the source. /// In VPERMILPD the two lanes could be shuffled independently of each other /// with the same restriction that lanes can't be crossed. -static bool isVPERMILPDMask(const SmallVectorImpl<int> &Mask, EVT VT, - const X86Subtarget *Subtarget) { +static bool isVPERMILPMask(const SmallVectorImpl<int> &Mask, EVT VT, + bool HasAVX) { int NumElts = VT.getVectorNumElements(); int NumLanes = VT.getSizeInBits()/128; - if (!Subtarget->hasAVX()) + if (!HasAVX) return false; - // Only match 256-bit with 64-bit types - if (VT.getSizeInBits() != 256 || NumElts != 4) + // Only match 256-bit with 32/64-bit types + if (VT.getSizeInBits() != 256 || (NumElts != 4 && NumElts != 8)) return false; - // The mask on the high lane is independent of the low. Both can match - // any element in inside its own lane, but can't cross. int LaneSize = NumElts/NumLanes; - for (int l = 0; l < NumLanes; ++l) - for (int i = l*LaneSize; i < LaneSize*(l+1); ++i) { - int LaneStart = l*LaneSize; - if (!isUndefOrInRange(Mask[i], LaneStart, LaneStart+LaneSize)) + for (int l = 0; l != NumLanes; ++l) { + int LaneStart = l*LaneSize; + for (int i = 0; i != LaneSize; ++i) { + if (!isUndefOrInRange(Mask[i+LaneStart], LaneStart, LaneStart+LaneSize)) + return false; + if (NumElts == 4 || l == 0) + continue; + // VPERMILPS handling + if (Mask[i] < 0) + continue; + if (!isUndefOrEqual(Mask[i+LaneStart], Mask[i]+LaneSize)) return false; } - - return true; -} - -/// isVPERMILPSMask - Return true if the specified VECTOR_SHUFFLE operand -/// specifies a shuffle of elements that is suitable for input to VPERMILPS*. -/// Note that VPERMIL mask matching is different depending whether theunderlying -/// type is 32 or 64. In the VPERMILPS the high half of the mask should point -/// to the same elements of the low, but to the higher half of the source. -/// In VPERMILPD the two lanes could be shuffled independently of each other -/// with the same restriction that lanes can't be crossed. -static bool isVPERMILPSMask(const SmallVectorImpl<int> &Mask, EVT VT, - const X86Subtarget *Subtarget) { - unsigned NumElts = VT.getVectorNumElements(); - unsigned NumLanes = VT.getSizeInBits()/128; - - if (!Subtarget->hasAVX()) - return false; - - // Only match 256-bit with 32-bit types - if (VT.getSizeInBits() != 256 || NumElts != 8) - return false; - - // The mask on the high lane should be the same as the low. Actually, - // they can differ if any of the corresponding index in a lane is undef - // and the other stays in range. 
- int LaneSize = NumElts/NumLanes; - for (int i = 0; i < LaneSize; ++i) { - int HighElt = i+LaneSize; - bool HighValid = isUndefOrInRange(Mask[HighElt], LaneSize, NumElts); - bool LowValid = isUndefOrInRange(Mask[i], 0, LaneSize); - - if (!HighValid || !LowValid) - return false; - if (Mask[i] < 0 || Mask[HighElt] < 0) - continue; - if (Mask[HighElt]-Mask[i] != LaneSize) - return false; } return true; } -/// getShuffleVPERMILPSImmediate - Return the appropriate immediate to shuffle -/// the specified VECTOR_MASK mask with VPERMILPS* instructions. -static unsigned getShuffleVPERMILPSImmediate(SDNode *N) { - ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); +/// getShuffleVPERMILPImmediate - Return the appropriate immediate to shuffle +/// the specified VECTOR_MASK mask with VPERMILPS/D* instructions. +static unsigned getShuffleVPERMILPImmediate(ShuffleVectorSDNode *SVOp) { EVT VT = SVOp->getValueType(0); int NumElts = VT.getVectorNumElements(); @@ -3837,43 +3770,22 @@ static unsigned getShuffleVPERMILPSImmediate(SDNode *N) { // where a mask will match because the same mask element is undef on the // first half but valid on the second. This would get pathological cases // such as: shuffle <u, 0, 1, 2, 4, 4, 5, 6>, which is completely valid. + unsigned Shift = (LaneSize == 4) ? 2 : 1; unsigned Mask = 0; - for (int l = 0; l < NumLanes; ++l) { - for (int i = 0; i < LaneSize; ++i) { - int MaskElt = SVOp->getMaskElt(i+(l*LaneSize)); - if (MaskElt < 0) - continue; - if (MaskElt >= LaneSize) - MaskElt -= LaneSize; - Mask |= MaskElt << (i*2); - } + for (int i = 0; i != NumElts; ++i) { + int MaskElt = SVOp->getMaskElt(i); + if (MaskElt < 0) + continue; + MaskElt %= LaneSize; + unsigned Shamt = i; + // VPERMILPSY, the mask of the first half must be equal to the second one + if (NumElts == 8) Shamt %= LaneSize; + Mask |= MaskElt << (Shamt*Shift); } return Mask; } -/// getShuffleVPERMILPDImmediate - Return the appropriate immediate to shuffle -/// the specified VECTOR_MASK mask with VPERMILPD* instructions. -static unsigned getShuffleVPERMILPDImmediate(SDNode *N) { - ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); - EVT VT = SVOp->getValueType(0); - - int NumElts = VT.getVectorNumElements(); - int NumLanes = VT.getSizeInBits()/128; - - unsigned Mask = 0; - int LaneSize = NumElts/NumLanes; - for (int l = 0; l < NumLanes; ++l) - for (int i = l*LaneSize; i < LaneSize*(l+1); ++i) { - int MaskElt = SVOp->getMaskElt(i); - if (MaskElt < 0) - continue; - Mask |= (MaskElt-l*LaneSize) << i; - } - - return Mask; -} - /// isCommutedMOVL - Returns true if the shuffle mask is except the reverse /// of what x86 movss want. X86 movs requires the lowest element to be lowest /// element of vector 2 and the other elements to come from vector 1 in order. 
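To make the unified VPERMILP handling above concrete, here is a rough scalar model (an assumption-laden sketch, not the actual lowering) of a VPERMILPS ymm, ymm, imm8: each destination element picks, via a 2-bit field of the immediate, an element from within its own 128-bit lane, and both lanes reuse the same selectors, which is exactly the cross-lane restriction isVPERMILPMask enforces for the 8-element case.

    #include <array>

    std::array<float, 8> vpermilps_model(const std::array<float, 8> &src,
                                         unsigned imm8) {
      std::array<float, 8> dst{};
      for (int lane = 0; lane != 2; ++lane)
        for (int i = 0; i != 4; ++i) {
          unsigned sel = (imm8 >> (2 * i)) & 0x3;  // same selector in each lane
          dst[lane * 4 + i] = src[lane * 4 + sel];
        }
      return dst;
    }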
@@ -3907,7 +3819,7 @@ static bool isCommutedMOVL(ShuffleVectorSDNode *N, bool V2IsSplat = false, /// Masks to match: <1, 1, 3, 3> or <1, 1, 3, 3, 5, 5, 7, 7> bool X86::isMOVSHDUPMask(ShuffleVectorSDNode *N, const X86Subtarget *Subtarget) { - if (!Subtarget->hasSSE3() && !Subtarget->hasAVX()) + if (!Subtarget->hasSSE3orAVX()) return false; // The second vector must be undef @@ -3935,7 +3847,7 @@ bool X86::isMOVSHDUPMask(ShuffleVectorSDNode *N, /// Masks to match: <0, 0, 2, 2> or <0, 0, 2, 2, 4, 4, 6, 6> bool X86::isMOVSLDUPMask(ShuffleVectorSDNode *N, const X86Subtarget *Subtarget) { - if (!Subtarget->hasSSE3() && !Subtarget->hasAVX()) + if (!Subtarget->hasSSE3orAVX()) return false; // The second vector must be undef @@ -3961,21 +3873,18 @@ bool X86::isMOVSLDUPMask(ShuffleVectorSDNode *N, /// isMOVDDUPYMask - Return true if the specified VECTOR_SHUFFLE operand /// specifies a shuffle of elements that is suitable for input to 256-bit /// version of MOVDDUP. -static bool isMOVDDUPYMask(ShuffleVectorSDNode *N, - const X86Subtarget *Subtarget) { - EVT VT = N->getValueType(0); +static bool isMOVDDUPYMask(const SmallVectorImpl<int> &Mask, EVT VT, + bool HasAVX) { int NumElts = VT.getVectorNumElements(); - bool V2IsUndef = N->getOperand(1).getOpcode() == ISD::UNDEF; - if (!Subtarget->hasAVX() || VT.getSizeInBits() != 256 || - !V2IsUndef || NumElts != 4) + if (!HasAVX || VT.getSizeInBits() != 256 || NumElts != 4) return false; for (int i = 0; i != NumElts/2; ++i) - if (!isUndefOrEqual(N->getMaskElt(i), 0)) + if (!isUndefOrEqual(Mask[i], 0)) return false; for (int i = NumElts/2; i != NumElts; ++i) - if (!isUndefOrEqual(N->getMaskElt(i), NumElts/2)) + if (!isUndefOrEqual(Mask[i], NumElts/2)) return false; return true; } @@ -4090,14 +3999,13 @@ unsigned X86::getShufflePSHUFLWImmediate(SDNode *N) { /// getShufflePALIGNRImmediate - Return the appropriate immediate to shuffle /// the specified VECTOR_SHUFFLE mask with the PALIGNR instruction. -unsigned X86::getShufflePALIGNRImmediate(SDNode *N) { - ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); - EVT VVT = N->getValueType(0); - unsigned EltSize = VVT.getVectorElementType().getSizeInBits() >> 3; +static unsigned getShufflePALIGNRImmediate(ShuffleVectorSDNode *SVOp) { + EVT VT = SVOp->getValueType(0); + unsigned EltSize = VT.getVectorElementType().getSizeInBits() >> 3; int Val = 0; unsigned i, e; - for (i = 0, e = VVT.getVectorNumElements(); i != e; ++i) { + for (i = 0, e = VT.getVectorNumElements(); i != e; ++i) { Val = SVOp->getMaskElt(i); if (Val >= 0) break; @@ -4170,21 +4078,6 @@ static SDValue CommuteVectorShuffle(ShuffleVectorSDNode *SVOp, SVOp->getOperand(0), &MaskVec[0]); } -/// CommuteVectorShuffleMask - Change values in a shuffle permute mask assuming -/// the two vector operands have swapped position. -static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask, EVT VT) { - unsigned NumElems = VT.getVectorNumElements(); - for (unsigned i = 0; i != NumElems; ++i) { - int idx = Mask[i]; - if (idx < 0) - continue; - else if (idx < (int)NumElems) - Mask[i] = idx + NumElems; - else - Mask[i] = idx - NumElems; - } -} - /// ShouldXformToMOVHLPS - Return true if the node should be transformed to /// match movhlps. 
The lower half elements should come from upper half of /// V1 (and in order), and the upper half elements should come from the upper @@ -4218,6 +4111,29 @@ static bool isScalarLoadToVector(SDNode *N, LoadSDNode **LD = NULL) { return true; } +// Test whether the given value is a vector value which will be legalized +// into a load. +static bool WillBeConstantPoolLoad(SDNode *N) { + if (N->getOpcode() != ISD::BUILD_VECTOR) + return false; + + // Check for any non-constant elements. + for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) + switch (N->getOperand(i).getNode()->getOpcode()) { + case ISD::UNDEF: + case ISD::ConstantFP: + case ISD::Constant: + break; + default: + return false; + } + + // Vectors of all-zeros and all-ones are materialized with special + // instructions rather than being loaded. + return !ISD::isBuildVectorAllZeros(N) && + !ISD::isBuildVectorAllOnes(N); +} + /// ShouldXformToMOVLP{S|D} - Return true if the node should be transformed to /// match movlp{s|d}. The lower half elements should come from lower half of /// V1 (and in order), and the upper half elements should come from the upper @@ -4233,7 +4149,7 @@ static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2, return false; // Is V2 is a vector load, don't do this transformation. We will try to use // load folding shufps op. - if (ISD::isNON_EXTLoad(V2)) + if (ISD::isNON_EXTLoad(V2) || WillBeConstantPoolLoad(V2)) return false; unsigned NumElems = VT.getVectorNumElements(); @@ -4319,23 +4235,30 @@ static SDValue getZeroVector(EVT VT, bool HasXMMInt, SelectionDAG &DAG, } /// getOnesVector - Returns a vector of specified type with all bits set. -/// Always build ones vectors as <4 x i32>. For 256-bit types, use two -/// <4 x i32> inserted in a <8 x i32> appropriately. Then bitcast to their -/// original type, ensuring they get CSE'd. -static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, DebugLoc dl) { +/// Always build ones vectors as <4 x i32> or <8 x i32>. For 256-bit types with +/// no AVX2 supprt, use two <4 x i32> inserted in a <8 x i32> appropriately. +/// Then bitcast to their original type, ensuring they get CSE'd. 
+static SDValue getOnesVector(EVT VT, bool HasAVX2, SelectionDAG &DAG, + DebugLoc dl) { assert(VT.isVector() && "Expected a vector type"); assert((VT.is128BitVector() || VT.is256BitVector()) && "Expected a 128-bit or 256-bit vector type"); SDValue Cst = DAG.getTargetConstant(~0U, MVT::i32); - SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, - Cst, Cst, Cst, Cst); - - if (VT.is256BitVector()) { - SDValue InsV = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, MVT::v8i32), - Vec, DAG.getConstant(0, MVT::i32), DAG, dl); - Vec = Insert128BitVector(InsV, Vec, - DAG.getConstant(4 /* NumElems/2 */, MVT::i32), DAG, dl); + SDValue Vec; + if (VT.getSizeInBits() == 256) { + if (HasAVX2) { // AVX2 + SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; + Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops, 8); + } else { // AVX + Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst); + SDValue InsV = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, MVT::v8i32), + Vec, DAG.getConstant(0, MVT::i32), DAG, dl); + Vec = Insert128BitVector(InsV, Vec, + DAG.getConstant(4 /* NumElems/2 */, MVT::i32), DAG, dl); + } + } else { + Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst); } return DAG.getNode(ISD::BITCAST, dl, VT, Vec); @@ -4542,33 +4465,14 @@ static SDValue getShuffleScalarElt(SDNode *N, int Index, SelectionDAG &DAG, case X86ISD::SHUFPS: case X86ISD::SHUFPD: ImmN = N->getOperand(N->getNumOperands()-1); - DecodeSHUFPSMask(NumElems, - cast<ConstantSDNode>(ImmN)->getZExtValue(), - ShuffleMask); - break; - case X86ISD::PUNPCKHBW: - case X86ISD::PUNPCKHWD: - case X86ISD::PUNPCKHDQ: - case X86ISD::PUNPCKHQDQ: - DecodePUNPCKHMask(NumElems, ShuffleMask); - break; - case X86ISD::UNPCKHPS: - case X86ISD::UNPCKHPD: - case X86ISD::VUNPCKHPSY: - case X86ISD::VUNPCKHPDY: - DecodeUNPCKHPMask(NumElems, ShuffleMask); + DecodeSHUFPMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), + ShuffleMask); break; - case X86ISD::PUNPCKLBW: - case X86ISD::PUNPCKLWD: - case X86ISD::PUNPCKLDQ: - case X86ISD::PUNPCKLQDQ: - DecodePUNPCKLMask(VT, ShuffleMask); + case X86ISD::UNPCKH: + DecodeUNPCKHMask(VT, ShuffleMask); break; - case X86ISD::UNPCKLPS: - case X86ISD::UNPCKLPD: - case X86ISD::VUNPCKLPSY: - case X86ISD::VUNPCKLPDY: - DecodeUNPCKLPMask(VT, ShuffleMask); + case X86ISD::UNPCKL: + DecodeUNPCKLMask(VT, ShuffleMask); break; case X86ISD::MOVHLPS: DecodeMOVHLPSMask(NumElems, ShuffleMask); @@ -4601,27 +4505,12 @@ static SDValue getShuffleScalarElt(SDNode *N, int Index, SelectionDAG &DAG, return getShuffleScalarElt(V.getOperand(OpNum).getNode(), Index, DAG, Depth+1); } - case X86ISD::VPERMILPS: - ImmN = N->getOperand(N->getNumOperands()-1); - DecodeVPERMILPSMask(4, cast<ConstantSDNode>(ImmN)->getZExtValue(), - ShuffleMask); - break; - case X86ISD::VPERMILPSY: + case X86ISD::VPERMILP: ImmN = N->getOperand(N->getNumOperands()-1); - DecodeVPERMILPSMask(8, cast<ConstantSDNode>(ImmN)->getZExtValue(), + DecodeVPERMILPMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), ShuffleMask); break; - case X86ISD::VPERMILPD: - ImmN = N->getOperand(N->getNumOperands()-1); - DecodeVPERMILPDMask(2, cast<ConstantSDNode>(ImmN)->getZExtValue(), - ShuffleMask); - break; - case X86ISD::VPERMILPDY: - ImmN = N->getOperand(N->getNumOperands()-1); - DecodeVPERMILPDMask(4, cast<ConstantSDNode>(ImmN)->getZExtValue(), - ShuffleMask); - break; - case X86ISD::VPERM2F128: + case X86ISD::VPERM2X128: ImmN = N->getOperand(N->getNumOperands()-1); DecodeVPERM2F128Mask(VT, 
cast<ConstantSDNode>(ImmN)->getZExtValue(), ShuffleMask); @@ -4956,7 +4845,7 @@ X86TargetLowering::LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, DebugLoc dl, EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems); SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr, LD->getPointerInfo().getWithOffset(StartOffset), - false, false, 0); + false, false, false, 0); // Canonicalize it to a v4i32 or v8i32 shuffle. SmallVector<int, 8> Mask; @@ -5021,11 +4910,12 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts, if (DAG.InferPtrAlignment(LDBase->getBasePtr()) >= 16) return DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(), LDBase->getPointerInfo(), - LDBase->isVolatile(), LDBase->isNonTemporal(), 0); + LDBase->isVolatile(), LDBase->isNonTemporal(), + LDBase->isInvariant(), 0); return DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(), LDBase->getPointerInfo(), LDBase->isVolatile(), LDBase->isNonTemporal(), - LDBase->getAlignment()); + LDBase->isInvariant(), LDBase->getAlignment()); } else if (NumElems == 4 && LastLoadedElt == 1 && DAG.getTargetLoweringInfo().isTypeLegal(MVT::v2i64)) { SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other); @@ -5041,6 +4931,97 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts, return SDValue(); } +/// isVectorBroadcast - Check if the node chain is suitable to be xformed to +/// a vbroadcast node. We support two patterns: +/// 1. A splat BUILD_VECTOR which uses a single scalar load. +/// 2. A splat shuffle which uses a scalar_to_vector node which comes from +/// a scalar load. +/// The scalar load node is returned when a pattern is found, +/// or SDValue() otherwise. +static SDValue isVectorBroadcast(SDValue &Op, bool hasAVX2) { + EVT VT = Op.getValueType(); + SDValue V = Op; + + if (V.hasOneUse() && V.getOpcode() == ISD::BITCAST) + V = V.getOperand(0); + + //A suspected load to be broadcasted. + SDValue Ld; + + switch (V.getOpcode()) { + default: + // Unknown pattern found. + return SDValue(); + + case ISD::BUILD_VECTOR: { + // The BUILD_VECTOR node must be a splat. + if (!isSplatVector(V.getNode())) + return SDValue(); + + Ld = V.getOperand(0); + + // The suspected load node has several users. Make sure that all + // of its users are from the BUILD_VECTOR node. + if (!Ld->hasNUsesOfValue(VT.getVectorNumElements(), 0)) + return SDValue(); + break; + } + + case ISD::VECTOR_SHUFFLE: { + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); + + // Shuffles must have a splat mask where the first element is + // broadcasted. + if ((!SVOp->isSplat()) || SVOp->getMaskElt(0) != 0) + return SDValue(); + + SDValue Sc = Op.getOperand(0); + if (Sc.getOpcode() != ISD::SCALAR_TO_VECTOR) + return SDValue(); + + Ld = Sc.getOperand(0); + + // The scalar_to_vector node and the suspected + // load node must have exactly one user. + if (!Sc.hasOneUse() || !Ld.hasOneUse()) + return SDValue(); + break; + } + } + + // The scalar source must be a normal load. 
+ if (!ISD::isNormalLoad(Ld.getNode())) + return SDValue(); + + bool Is256 = VT.getSizeInBits() == 256; + bool Is128 = VT.getSizeInBits() == 128; + unsigned ScalarSize = Ld.getValueType().getSizeInBits(); + + if (hasAVX2) { + // VBroadcast to YMM + if (Is256 && (ScalarSize == 8 || ScalarSize == 16 || + ScalarSize == 32 || ScalarSize == 64 )) + return Ld; + + // VBroadcast to XMM + if (Is128 && (ScalarSize == 8 || ScalarSize == 32 || + ScalarSize == 16 || ScalarSize == 64 )) + return Ld; + } + + // VBroadcast to YMM + if (Is256 && (ScalarSize == 32 || ScalarSize == 64)) + return Ld; + + // VBroadcast to XMM + if (Is128 && (ScalarSize == 32)) + return Ld; + + + // Unsupported broadcast. + return SDValue(); +} + SDValue X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { DebugLoc dl = Op.getDebugLoc(); @@ -5061,14 +5042,20 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { } // Vectors containing all ones can be matched by pcmpeqd on 128-bit width - // vectors or broken into v4i32 operations on 256-bit vectors. + // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use + // vpcmpeqd on 256-bit vectors. if (ISD::isBuildVectorAllOnes(Op.getNode())) { - if (Op.getValueType() == MVT::v4i32) + if (Op.getValueType() == MVT::v4i32 || + (Op.getValueType() == MVT::v8i32 && Subtarget->hasAVX2())) return Op; - return getOnesVector(Op.getValueType(), DAG, dl); + return getOnesVector(Op.getValueType(), Subtarget->hasAVX2(), DAG, dl); } + SDValue LD = isVectorBroadcast(Op, Subtarget->hasAVX2()); + if (Subtarget->hasAVX() && LD.getNode()) + return DAG.getNode(X86ISD::VBROADCAST, dl, VT, LD); + unsigned EVTBits = ExtVT.getSizeInBits(); unsigned NumZero = 0; @@ -5151,8 +5138,10 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { DAG); } else if (ExtVT == MVT::i16 || ExtVT == MVT::i8) { Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item); - assert(VT.getSizeInBits() == 128 && "Expected an SSE value type!"); - EVT MiddleVT = MVT::v4i32; + unsigned NumBits = VT.getSizeInBits(); + assert((NumBits == 128 || NumBits == 256) && + "Expected an SSE or AVX value type!"); + EVT MiddleVT = NumBits == 128 ? MVT::v4i32 : MVT::v8i32; Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MiddleVT, Item); Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget->hasXMMInt(), DAG); @@ -5310,7 +5299,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { return LD; // For SSE 4.1, use insertps to put the high elements into the low element. - if (getSubtarget()->hasSSE41() || getSubtarget()->hasAVX()) { + if (getSubtarget()->hasSSE41orAVX()) { SDValue Result; if (Op.getOperand(0).getOpcode() != ISD::UNDEF) Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0)); @@ -5481,7 +5470,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLEv8i16(SDValue Op, // quads, disable the next transformation since it does not help SSSE3. bool V1Used = InputQuads[0] || InputQuads[1]; bool V2Used = InputQuads[2] || InputQuads[3]; - if (Subtarget->hasSSSE3() || Subtarget->hasAVX()) { + if (Subtarget->hasSSSE3orAVX()) { if (InputQuads.count() == 2 && V1Used && V2Used) { BestLoQuad = InputQuads.find_first(); BestHiQuad = InputQuads.find_next(BestLoQuad); @@ -5554,7 +5543,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLEv8i16(SDValue Op, // If we have SSSE3, and all words of the result are from 1 input vector, // case 2 is generated, otherwise case 3 is generated. If no SSSE3 // is present, fall back to case 4. 
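The size checks above decide when the new isVectorBroadcast pattern applies; as a hedged illustration of what it ultimately buys (not the lowering itself), a splat of a loaded scalar can become a single AVX broadcast instead of a load followed by a shuffle:

    #include <immintrin.h>

    // One VBROADCASTSS: load a single float and replicate it into all 8 lanes.
    static inline __m256 splat_load(const float *p) {
      return _mm256_broadcast_ss(p);
    }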
- if (Subtarget->hasSSSE3() || Subtarget->hasAVX()) { + if (Subtarget->hasSSSE3orAVX()) { SmallVector<SDValue,16> pshufbMask; // If we have elements from both input vectors, set the high bit of the @@ -5622,8 +5611,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLEv8i16(SDValue Op, NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16), &MaskV[0]); - if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && - (Subtarget->hasSSSE3() || Subtarget->hasAVX())) + if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSSE3orAVX()) NewV = getTargetShuffleNode(X86ISD::PSHUFLW, dl, MVT::v8i16, NewV.getOperand(0), X86::getShufflePSHUFLWImmediate(NewV.getNode()), @@ -5651,8 +5639,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLEv8i16(SDValue Op, NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16), &MaskV[0]); - if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && - (Subtarget->hasSSSE3() || Subtarget->hasAVX())) + if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSSE3orAVX()) NewV = getTargetShuffleNode(X86ISD::PSHUFHW, dl, MVT::v8i16, NewV.getOperand(0), X86::getShufflePSHUFHWImmediate(NewV.getNode()), @@ -5718,7 +5705,7 @@ SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp, } // If SSSE3, use 1 pshufb instruction per vector with elements in the result. - if (TLI.getSubtarget()->hasSSSE3() || TLI.getSubtarget()->hasAVX()) { + if (TLI.getSubtarget()->hasSSSE3orAVX()) { SmallVector<SDValue,16> pshufbMask; // If all result elements are from one input vector, then only translate @@ -6075,7 +6062,7 @@ LowerVECTOR_SHUFFLE_128v4(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { // from X. if (NumHi == 3) { // Normalize it so the 3 elements come from V1. - CommuteVectorShuffleMask(PermMask, VT); + CommuteVectorShuffleMask(PermMask, 4); std::swap(V1, V2); } @@ -6164,6 +6151,10 @@ static bool MayFoldVectorLoad(SDValue V) { V = V.getOperand(0); if (V.hasOneUse() && V.getOpcode() == ISD::SCALAR_TO_VECTOR) V = V.getOperand(0); + if (V.hasOneUse() && V.getOpcode() == ISD::BUILD_VECTOR && + V.getNumOperands() == 2 && V.getOperand(1).getOpcode() == ISD::UNDEF) + // BUILD_VECTOR (load), undef + V = V.getOperand(0); if (MayFoldLoad(V)) return true; return false; @@ -6346,22 +6337,20 @@ SDValue getMOVLP(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG, bool HasXMMInt) { // turns into: // (MOVLPSmr addr:$src1, VR128:$src2) // So, recognize this potential and also use MOVLPS or MOVLPD - if (MayFoldVectorLoad(V1) && MayFoldIntoStore(Op)) + else if (MayFoldVectorLoad(V1) && MayFoldIntoStore(Op)) CanFoldLoad = true; - // Both of them can't be memory operations though. - if (MayFoldVectorLoad(V1) && MayFoldVectorLoad(V2)) - CanFoldLoad = false; - + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); if (CanFoldLoad) { if (HasXMMInt && NumElems == 2) return getTargetShuffleNode(X86ISD::MOVLPD, dl, VT, V1, V2, DAG); if (NumElems == 4) - return getTargetShuffleNode(X86ISD::MOVLPS, dl, VT, V1, V2, DAG); + // If we don't care about the second element, procede to use movss. + if (SVOp->getMaskElt(1) != -1) + return getTargetShuffleNode(X86ISD::MOVLPS, dl, VT, V1, V2, DAG); } - ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); // movl and movlp will both match v2i64, but v2i64 is never matched by // movl earlier because we make it strict to avoid messing with the movlp load // folding logic (see the code above getMOVLP call). 
Match it here then, @@ -6383,104 +6372,6 @@ SDValue getMOVLP(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG, bool HasXMMInt) { X86::getShuffleSHUFImmediate(SVOp), DAG); } -static inline unsigned getUNPCKLOpcode(EVT VT) { - switch(VT.getSimpleVT().SimpleTy) { - case MVT::v4i32: return X86ISD::PUNPCKLDQ; - case MVT::v2i64: return X86ISD::PUNPCKLQDQ; - case MVT::v4f32: return X86ISD::UNPCKLPS; - case MVT::v2f64: return X86ISD::UNPCKLPD; - case MVT::v8i32: // Use fp unit for int unpack. - case MVT::v8f32: return X86ISD::VUNPCKLPSY; - case MVT::v4i64: // Use fp unit for int unpack. - case MVT::v4f64: return X86ISD::VUNPCKLPDY; - case MVT::v16i8: return X86ISD::PUNPCKLBW; - case MVT::v8i16: return X86ISD::PUNPCKLWD; - default: - llvm_unreachable("Unknown type for unpckl"); - } - return 0; -} - -static inline unsigned getUNPCKHOpcode(EVT VT) { - switch(VT.getSimpleVT().SimpleTy) { - case MVT::v4i32: return X86ISD::PUNPCKHDQ; - case MVT::v2i64: return X86ISD::PUNPCKHQDQ; - case MVT::v4f32: return X86ISD::UNPCKHPS; - case MVT::v2f64: return X86ISD::UNPCKHPD; - case MVT::v8i32: // Use fp unit for int unpack. - case MVT::v8f32: return X86ISD::VUNPCKHPSY; - case MVT::v4i64: // Use fp unit for int unpack. - case MVT::v4f64: return X86ISD::VUNPCKHPDY; - case MVT::v16i8: return X86ISD::PUNPCKHBW; - case MVT::v8i16: return X86ISD::PUNPCKHWD; - default: - llvm_unreachable("Unknown type for unpckh"); - } - return 0; -} - -static inline unsigned getVPERMILOpcode(EVT VT) { - switch(VT.getSimpleVT().SimpleTy) { - case MVT::v4i32: - case MVT::v4f32: return X86ISD::VPERMILPS; - case MVT::v2i64: - case MVT::v2f64: return X86ISD::VPERMILPD; - case MVT::v8i32: - case MVT::v8f32: return X86ISD::VPERMILPSY; - case MVT::v4i64: - case MVT::v4f64: return X86ISD::VPERMILPDY; - default: - llvm_unreachable("Unknown type for vpermil"); - } - return 0; -} - -/// isVectorBroadcast - Check if the node chain is suitable to be xformed to -/// a vbroadcast node. The nodes are suitable whenever we can fold a load coming -/// from a 32 or 64 bit scalar. Update Op to the desired load to be folded. -static bool isVectorBroadcast(SDValue &Op) { - EVT VT = Op.getValueType(); - bool Is256 = VT.getSizeInBits() == 256; - - assert((VT.getSizeInBits() == 128 || Is256) && - "Unsupported type for vbroadcast node"); - - SDValue V = Op; - if (V.hasOneUse() && V.getOpcode() == ISD::BITCAST) - V = V.getOperand(0); - - if (Is256 && !(V.hasOneUse() && - V.getOpcode() == ISD::INSERT_SUBVECTOR && - V.getOperand(0).getOpcode() == ISD::UNDEF)) - return false; - - if (Is256) - V = V.getOperand(1); - - if (!V.hasOneUse()) - return false; - - // Check the source scalar_to_vector type. 256-bit broadcasts are - // supported for 32/64-bit sizes, while 128-bit ones are only supported - // for 32-bit scalars. 
- if (V.getOpcode() != ISD::SCALAR_TO_VECTOR) - return false; - - unsigned ScalarSize = V.getOperand(0).getValueType().getSizeInBits(); - if (ScalarSize != 32 && ScalarSize != 64) - return false; - if (!Is256 && ScalarSize == 64) - return false; - - V = V.getOperand(0); - if (!MayFoldLoad(V)) - return false; - - // Return the load node - Op = V; - return true; -} - static SDValue NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG, const TargetLowering &TLI, @@ -6506,8 +6397,9 @@ SDValue NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG, return Op; // Use vbroadcast whenever the splat comes from a foldable load - if (Subtarget->hasAVX() && isVectorBroadcast(V1)) - return DAG.getNode(X86ISD::VBROADCAST, dl, VT, V1); + SDValue LD = isVectorBroadcast(Op, Subtarget->hasAVX2()); + if (Subtarget->hasAVX() && LD.getNode()) + return DAG.getNode(X86ISD::VBROADCAST, dl, VT, LD); // Handle splats by matching through known shuffle masks if ((Size == 128 && NumElem <= 4) || @@ -6553,18 +6445,18 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); DebugLoc dl = Op.getDebugLoc(); unsigned NumElems = VT.getVectorNumElements(); - bool isMMX = VT.getSizeInBits() == 64; - bool V1IsUndef = V1.getOpcode() == ISD::UNDEF; bool V2IsUndef = V2.getOpcode() == ISD::UNDEF; bool V1IsSplat = false; bool V2IsSplat = false; bool HasXMMInt = Subtarget->hasXMMInt(); + bool HasAVX = Subtarget->hasAVX(); + bool HasAVX2 = Subtarget->hasAVX2(); MachineFunction &MF = DAG.getMachineFunction(); bool OptForSize = MF.getFunction()->hasFnAttr(Attribute::OptimizeForSize); - // Shuffle operations on MMX not supported. - if (isMMX) - return Op; + assert(VT.getSizeInBits() != 64 && "Can't lower MMX shuffles"); + + assert(V1.getOpcode() != ISD::UNDEF && "Op 1 of shuffle should not be undef"); // Vector shuffle lowering takes 3 steps: // @@ -6576,7 +6468,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { // so the shuffle can be broken into other shuffles and the legalizer can // try the lowering again. // - // The general ideia is that no vector_shuffle operation should be left to + // The general idea is that no vector_shuffle operation should be left to // be matched during isel, all of them must be converted to a target specific // node here. @@ -6590,12 +6482,11 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { // NOTE: isPSHUFDMask can also match both masks below (unpckl_undef and // unpckh_undef). Only use pshufd if speed is more important than size. 
if (OptForSize && X86::isUNPCKL_v_undef_Mask(SVOp)) - return getTargetShuffleNode(getUNPCKLOpcode(VT), dl, VT, V1, V1, DAG); + return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG); if (OptForSize && X86::isUNPCKH_v_undef_Mask(SVOp)) - return getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V1, V1, DAG); + return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG); - if (X86::isMOVDDUPMask(SVOp) && - (Subtarget->hasSSE3() || Subtarget->hasAVX()) && + if (X86::isMOVDDUPMask(SVOp) && Subtarget->hasSSE3orAVX() && V2IsUndef && RelaxedMayFoldVectorLoad(V1)) return getMOVDDup(Op, dl, V1, DAG); @@ -6603,9 +6494,9 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { return getMOVHighToLow(Op, dl, DAG); // Use to match splats - if (HasXMMInt && X86::isUNPCKHMask(SVOp) && V2IsUndef && + if (HasXMMInt && X86::isUNPCKHMask(SVOp, HasAVX2) && V2IsUndef && (VT == MVT::v2f64 || VT == MVT::v2i64)) - return getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V1, V1, DAG); + return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG); if (X86::isPSHUFDMask(SVOp)) { // The actual implementation will match the mask in the if above and then @@ -6627,8 +6518,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { bool isLeft = false; unsigned ShAmt = 0; SDValue ShVal; - bool isShift = getSubtarget()->hasXMMInt() && - isVectorShift(SVOp, DAG, isLeft, ShVal, ShAmt); + bool isShift = HasXMMInt && isVectorShift(SVOp, DAG, isLeft, ShVal, ShAmt); if (isShift && ShVal.hasOneUse()) { // If the shifted value has multiple uses, it may be cheaper to use // v_set0 + movlhps or movhlps, etc. @@ -6638,8 +6528,6 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { } if (X86::isMOVLMask(SVOp)) { - if (V1IsUndef) - return V2; if (ISD::isBuildVectorAllZeros(V1.getNode())) return getVZextMovL(VT, VT, V2, DAG, Subtarget, dl); if (!X86::isMOVLPMask(SVOp)) { @@ -6652,7 +6540,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { } // FIXME: fold these into legal mask. - if (X86::isMOVLHPSMask(SVOp) && !X86::isUNPCKLMask(SVOp)) + if (X86::isMOVLHPSMask(SVOp) && !X86::isUNPCKLMask(SVOp, HasAVX2)) return getMOVLowToHigh(Op, dl, DAG, HasXMMInt); if (X86::isMOVHLPSMask(SVOp)) @@ -6685,17 +6573,19 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { V2IsSplat = isSplatVector(V2.getNode()); // Canonicalize the splat or undef, if present, to be on the RHS. - if ((V1IsSplat || V1IsUndef) && !(V2IsSplat || V2IsUndef)) { + if (V1IsSplat && !V2IsSplat) { Op = CommuteVectorShuffle(SVOp, DAG); SVOp = cast<ShuffleVectorSDNode>(Op); V1 = SVOp->getOperand(0); V2 = SVOp->getOperand(1); std::swap(V1IsSplat, V2IsSplat); - std::swap(V1IsUndef, V2IsUndef); Commuted = true; } - if (isCommutedMOVL(SVOp, V2IsSplat, V2IsUndef)) { + SmallVector<int, 32> M; + SVOp->getMask(M); + + if (isCommutedMOVLMask(M, VT, V2IsSplat, V2IsUndef)) { // Shuffling low element of v1 into undef, just return v1. 
if (V2IsUndef) return V1; @@ -6705,11 +6595,11 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { return getMOVL(DAG, dl, VT, V2, V1); } - if (X86::isUNPCKLMask(SVOp)) - return getTargetShuffleNode(getUNPCKLOpcode(VT), dl, VT, V1, V2, DAG); + if (isUNPCKLMask(M, VT, HasAVX2)) + return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG); - if (X86::isUNPCKHMask(SVOp)) - return getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V1, V2, DAG); + if (isUNPCKHMask(M, VT, HasAVX2)) + return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG); if (V2IsSplat) { // Normalize mask so all entries that point to V2 points to its first @@ -6718,9 +6608,9 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { SDValue NewMask = NormalizeMask(SVOp, DAG); ShuffleVectorSDNode *NSVOp = cast<ShuffleVectorSDNode>(NewMask); if (NSVOp != SVOp) { - if (X86::isUNPCKLMask(NSVOp, true)) { + if (X86::isUNPCKLMask(NSVOp, HasAVX2, true)) { return NewMask; - } else if (X86::isUNPCKHMask(NSVOp, true)) { + } else if (X86::isUNPCKHMask(NSVOp, HasAVX2, true)) { return NewMask; } } @@ -6732,34 +6622,31 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { SDValue NewOp = CommuteVectorShuffle(SVOp, DAG); ShuffleVectorSDNode *NewSVOp = cast<ShuffleVectorSDNode>(NewOp); - if (X86::isUNPCKLMask(NewSVOp)) - return getTargetShuffleNode(getUNPCKLOpcode(VT), dl, VT, V2, V1, DAG); + if (X86::isUNPCKLMask(NewSVOp, HasAVX2)) + return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V2, V1, DAG); - if (X86::isUNPCKHMask(NewSVOp)) - return getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V2, V1, DAG); + if (X86::isUNPCKHMask(NewSVOp, HasAVX2)) + return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V2, V1, DAG); } // Normalize the node to match x86 shuffle ops if needed - if (V2.getOpcode() != ISD::UNDEF && isCommutedSHUFP(SVOp)) + if (!V2IsUndef && (isSHUFPMask(M, VT, /* Commuted */ true) || + isVSHUFPYMask(M, VT, HasAVX, /* Commuted */ true))) return CommuteVectorShuffle(SVOp, DAG); // The checks below are all present in isShuffleMaskLegal, but they are // inlined here right now to enable us to directly emit target specific // nodes, and remove one by one until they don't return Op anymore. 
- SmallVector<int, 16> M; - SVOp->getMask(M); - if (isPALIGNRMask(M, VT, Subtarget->hasSSSE3() || Subtarget->hasAVX())) + if (isPALIGNRMask(M, VT, Subtarget->hasSSSE3orAVX())) return getTargetShuffleNode(X86ISD::PALIGN, dl, VT, V1, V2, - X86::getShufflePALIGNRImmediate(SVOp), + getShufflePALIGNRImmediate(SVOp), DAG); if (ShuffleVectorSDNode::isSplatMask(&M[0], VT) && SVOp->getSplatIndex() == 0 && V2IsUndef) { - if (VT == MVT::v2f64) - return getTargetShuffleNode(X86ISD::UNPCKLPD, dl, VT, V1, V1, DAG); - if (VT == MVT::v2i64) - return getTargetShuffleNode(X86ISD::PUNPCKLQDQ, dl, VT, V1, V1, DAG); + if (VT == MVT::v2f64 || VT == MVT::v2i64) + return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG); } if (isPSHUFHWMask(M, VT)) @@ -6776,10 +6663,10 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { return getTargetShuffleNode(getSHUFPOpcode(VT), dl, VT, V1, V2, X86::getShuffleSHUFImmediate(SVOp), DAG); - if (X86::isUNPCKL_v_undef_Mask(SVOp)) - return getTargetShuffleNode(getUNPCKLOpcode(VT), dl, VT, V1, V1, DAG); - if (X86::isUNPCKH_v_undef_Mask(SVOp)) - return getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V1, V1, DAG); + if (isUNPCKL_v_undef_Mask(M, VT)) + return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG); + if (isUNPCKH_v_undef_Mask(M, VT)) + return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG); //===--------------------------------------------------------------------===// // Generate target specific nodes for 128 or 256-bit shuffles only @@ -6787,33 +6674,23 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { // // Handle VMOVDDUPY permutations - if (isMOVDDUPYMask(SVOp, Subtarget)) + if (V2IsUndef && isMOVDDUPYMask(M, VT, HasAVX)) return getTargetShuffleNode(X86ISD::MOVDDUP, dl, VT, V1, DAG); - // Handle VPERMILPS* permutations - if (isVPERMILPSMask(M, VT, Subtarget)) - return getTargetShuffleNode(getVPERMILOpcode(VT), dl, VT, V1, - getShuffleVPERMILPSImmediate(SVOp), DAG); - - // Handle VPERMILPD* permutations - if (isVPERMILPDMask(M, VT, Subtarget)) - return getTargetShuffleNode(getVPERMILOpcode(VT), dl, VT, V1, - getShuffleVPERMILPDImmediate(SVOp), DAG); + // Handle VPERMILPS/D* permutations + if (isVPERMILPMask(M, VT, HasAVX)) + return getTargetShuffleNode(X86ISD::VPERMILP, dl, VT, V1, + getShuffleVPERMILPImmediate(SVOp), DAG); - // Handle VPERM2F128 permutations - if (isVPERM2F128Mask(M, VT, Subtarget)) - return getTargetShuffleNode(X86ISD::VPERM2F128, dl, VT, V1, V2, - getShuffleVPERM2F128Immediate(SVOp), DAG); + // Handle VPERM2F128/VPERM2I128 permutations + if (isVPERM2X128Mask(M, VT, HasAVX)) + return getTargetShuffleNode(X86ISD::VPERM2X128, dl, VT, V1, + V2, getShuffleVPERM2X128Immediate(SVOp), DAG); - // Handle VSHUFPSY permutations - if (isVSHUFPSYMask(M, VT, Subtarget)) + // Handle VSHUFPS/DY permutations + if (isVSHUFPYMask(M, VT, HasAVX)) return getTargetShuffleNode(getSHUFPOpcode(VT), dl, VT, V1, V2, - getShuffleVSHUFPSYImmediate(SVOp), DAG); - - // Handle VSHUFPDY permutations - if (isVSHUFPDYMask(M, VT, Subtarget)) - return getTargetShuffleNode(getSHUFPOpcode(VT), dl, VT, V1, V2, - getShuffleVSHUFPDYImmediate(SVOp), DAG); + getShuffleVSHUFPYImmediate(SVOp), DAG); //===--------------------------------------------------------------------===// // Since no target specific shuffle was selected for this generic one, @@ -6896,8 +6773,8 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, Op.getOperand(0)), Op.getOperand(1)); return DAG.getNode(ISD::BITCAST, dl, MVT::f32, 
Extract); - } else if (VT == MVT::i32) { - // ExtractPS works with constant index. + } else if (VT == MVT::i32 || VT == MVT::i64) { + // ExtractPS/pextrq works with constant index. if (isa<ConstantSDNode>(Op.getOperand(1))) return Op; } @@ -6933,7 +6810,7 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, assert(Vec.getValueSizeInBits() <= 128 && "Unexpected vector length"); - if (Subtarget->hasSSE41() || Subtarget->hasAVX()) { + if (Subtarget->hasSSE41orAVX()) { SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG); if (Res.getNode()) return Res; @@ -7036,7 +6913,8 @@ X86TargetLowering::LowerINSERT_VECTOR_ELT_SSE4(SDValue Op, // Create this as a scalar to vector.. N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1); return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2); - } else if (EltVT == MVT::i32 && isa<ConstantSDNode>(N2)) { + } else if ((EltVT == MVT::i32 || EltVT == MVT::i64) && + isa<ConstantSDNode>(N2)) { // PINSR* works with constant index. return Op; } @@ -7074,7 +6952,7 @@ X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const { return Insert128BitVector(N0, V, Ins128Idx, DAG, dl); } - if (Subtarget->hasSSE41() || Subtarget->hasAVX()) + if (Subtarget->hasSSE41orAVX()) return LowerINSERT_VECTOR_ELT_SSE4(Op, DAG); if (EltVT == MVT::i8) @@ -7276,7 +7154,7 @@ X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const { // load. if (isGlobalStubReference(OpFlag)) Result = DAG.getLoad(getPointerTy(), DL, DAG.getEntryNode(), Result, - MachinePointerInfo::getGOT(), false, false, 0); + MachinePointerInfo::getGOT(), false, false, false, 0); return Result; } @@ -7344,7 +7222,7 @@ X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, DebugLoc dl, // load. if (isGlobalStubReference(OpFlags)) Result = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Result, - MachinePointerInfo::getGOT(), false, false, 0); + MachinePointerInfo::getGOT(), false, false, false, 0); // If there was a non-zero offset that we didn't fold, create an explicit // addition for it. @@ -7423,7 +7301,8 @@ static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG, SDValue ThreadPointer = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0), - MachinePointerInfo(Ptr), false, false, 0); + MachinePointerInfo(Ptr), + false, false, false, 0); unsigned char OperandFlags = 0; // Most TLS accesses are not RIP relative, even on x86-64. One exception is @@ -7449,7 +7328,7 @@ static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG, if (model == TLSModel::InitialExec) Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset, - MachinePointerInfo::getGOT(), false, false, 0); + MachinePointerInfo::getGOT(), false, false, false, 0); // The address of the thread local variable is the add of the thread // pointer with the offset of the variable. @@ -7527,7 +7406,8 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { // And our return value (tls address) is in the standard call return value // location. unsigned Reg = Subtarget->is64Bit() ? 
X86::RAX : X86::EAX; - return DAG.getCopyFromReg(Chain, DL, Reg, getPointerTy()); + return DAG.getCopyFromReg(Chain, DL, Reg, getPointerTy(), + Chain.getValue(1)); } assert(false && @@ -7672,7 +7552,7 @@ SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain, Op.getValueType(), MMO); Result = DAG.getLoad(Op.getValueType(), DL, Chain, StackSlot, MachinePointerInfo::getFixedStack(SSFI), - false, false, 0); + false, false, false, 0); } return Result; @@ -7719,7 +7599,7 @@ SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op, LLVMContext *Context = DAG.getContext(); // Build some magic constants. - std::vector<Constant*> CV0; + SmallVector<Constant*,4> CV0; CV0.push_back(ConstantInt::get(*Context, APInt(32, 0x45300000))); CV0.push_back(ConstantInt::get(*Context, APInt(32, 0x43300000))); CV0.push_back(ConstantInt::get(*Context, APInt(32, 0))); @@ -7727,7 +7607,7 @@ SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op, Constant *C0 = ConstantVector::get(CV0); SDValue CPIdx0 = DAG.getConstantPool(C0, getPointerTy(), 16); - std::vector<Constant*> CV1; + SmallVector<Constant*,2> CV1; CV1.push_back( ConstantFP::get(*Context, APFloat(APInt(64, 0x4530000000000000ULL)))); CV1.push_back( @@ -7746,12 +7626,12 @@ SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op, SDValue Unpck1 = getUnpackl(DAG, dl, MVT::v4i32, XR1, XR2); SDValue CLod0 = DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0, MachinePointerInfo::getConstantPool(), - false, false, 16); + false, false, false, 16); SDValue Unpck2 = getUnpackl(DAG, dl, MVT::v4i32, Unpck1, CLod0); SDValue XR2F = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Unpck2); SDValue CLod1 = DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1, MachinePointerInfo::getConstantPool(), - false, false, 16); + false, false, false, 16); SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1); // Add the halves; easiest way is to swap them into another reg first. @@ -7983,7 +7863,8 @@ SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op, // Load the result. return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(), - FIST, StackSlot, MachinePointerInfo(), false, false, 0); + FIST, StackSlot, MachinePointerInfo(), + false, false, false, 0); } SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op, @@ -7994,7 +7875,8 @@ SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op, // Load the result. 
return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(), - FIST, StackSlot, MachinePointerInfo(), false, false, 0); + FIST, StackSlot, MachinePointerInfo(), + false, false, false, 0); } SDValue X86TargetLowering::LowerFABS(SDValue Op, @@ -8005,23 +7887,19 @@ SDValue X86TargetLowering::LowerFABS(SDValue Op, EVT EltVT = VT; if (VT.isVector()) EltVT = VT.getVectorElementType(); - std::vector<Constant*> CV; + SmallVector<Constant*,4> CV; if (EltVT == MVT::f64) { Constant *C = ConstantFP::get(*Context, APFloat(APInt(64, ~(1ULL << 63)))); - CV.push_back(C); - CV.push_back(C); + CV.assign(2, C); } else { Constant *C = ConstantFP::get(*Context, APFloat(APInt(32, ~(1U << 31)))); - CV.push_back(C); - CV.push_back(C); - CV.push_back(C); - CV.push_back(C); + CV.assign(4, C); } Constant *C = ConstantVector::get(CV); SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, MachinePointerInfo::getConstantPool(), - false, false, 16); + false, false, false, 16); return DAG.getNode(X86ISD::FAND, dl, VT, Op.getOperand(0), Mask); } @@ -8030,31 +7908,31 @@ SDValue X86TargetLowering::LowerFNEG(SDValue Op, SelectionDAG &DAG) const { DebugLoc dl = Op.getDebugLoc(); EVT VT = Op.getValueType(); EVT EltVT = VT; - if (VT.isVector()) + unsigned NumElts = VT == MVT::f64 ? 2 : 4; + if (VT.isVector()) { EltVT = VT.getVectorElementType(); - std::vector<Constant*> CV; + NumElts = VT.getVectorNumElements(); + } + SmallVector<Constant*,8> CV; if (EltVT == MVT::f64) { Constant *C = ConstantFP::get(*Context, APFloat(APInt(64, 1ULL << 63))); - CV.push_back(C); - CV.push_back(C); + CV.assign(NumElts, C); } else { Constant *C = ConstantFP::get(*Context, APFloat(APInt(32, 1U << 31))); - CV.push_back(C); - CV.push_back(C); - CV.push_back(C); - CV.push_back(C); + CV.assign(NumElts, C); } Constant *C = ConstantVector::get(CV); SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, MachinePointerInfo::getConstantPool(), - false, false, 16); + false, false, false, 16); if (VT.isVector()) { + MVT XORVT = VT.getSizeInBits() == 128 ? MVT::v2i64 : MVT::v4i64; return DAG.getNode(ISD::BITCAST, dl, VT, - DAG.getNode(ISD::XOR, dl, MVT::v2i64, - DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, + DAG.getNode(ISD::XOR, dl, XORVT, + DAG.getNode(ISD::BITCAST, dl, XORVT, Op.getOperand(0)), - DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Mask))); + DAG.getNode(ISD::BITCAST, dl, XORVT, Mask))); } else { return DAG.getNode(X86ISD::FXOR, dl, VT, Op.getOperand(0), Mask); } @@ -8083,7 +7961,7 @@ SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const { // type, and that won't be f80 since that is not custom lowered. // First get the sign bit of second operand. - std::vector<Constant*> CV; + SmallVector<Constant*,4> CV; if (SrcVT == MVT::f64) { CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 1ULL << 63)))); CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 0)))); @@ -8097,7 +7975,7 @@ SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const { SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); SDValue Mask1 = DAG.getLoad(SrcVT, dl, DAG.getEntryNode(), CPIdx, MachinePointerInfo::getConstantPool(), - false, false, 16); + false, false, false, 16); SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, SrcVT, Op1, Mask1); // Shift sign bit right or left if the two operands have different types. 
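The FABS, FNEG and FCOPYSIGN lowerings above all follow the same recipe: load a sign-bit mask from the constant pool and apply an integer-style AND, XOR or OR to the floating-point value (X86ISD::FAND/FXOR/FOR). A stand-alone sketch of those bit tricks on scalar IEEE-754 doubles, written as plain host C++ rather than DAG-building code (the helper names are purely illustrative):

// Scalar model of the constant-pool masks: FABS clears the sign bit,
// FNEG flips it, FCOPYSIGN pairs one value's magnitude with another's sign.
#include <cstdint>
#include <cstdio>
#include <cstring>

static uint64_t toBits(double D)     { uint64_t U; std::memcpy(&U, &D, sizeof U); return U; }
static double   toDouble(uint64_t U) { double D; std::memcpy(&D, &U, sizeof D); return D; }

int main() {
  const uint64_t SignBit = 1ULL << 63;  // mask used for FNEG and for sign extraction
  const uint64_t AbsMask = ~SignBit;    // mask used for FABS / magnitude

  double X = -3.5, Y = 2.0;
  double Fabs = toDouble(toBits(X) & AbsMask);              // 3.5   (FAND)
  double Fneg = toDouble(toBits(X) ^ SignBit);              // 3.5   (XOR / FXOR)
  double CopySign = toDouble((toBits(Y) & AbsMask) |        // magnitude of Y
                             (toBits(X) & SignBit));        // sign of X -> -2.0 (FOR)
  std::printf("%g %g %g\n", Fabs, Fneg, CopySign);
  return 0;
}

No floating-point arithmetic is involved, which is why the lowering only needs the bitwise X86 nodes plus a 16-byte-aligned constant-pool load for the mask.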
@@ -8126,7 +8004,7 @@ SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const { CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); SDValue Mask2 = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, MachinePointerInfo::getConstantPool(), - false, false, 16); + false, false, false, 16); SDValue Val = DAG.getNode(X86ISD::FAND, dl, VT, Op0, Mask2); // Or the value with the sign bit. @@ -8190,8 +8068,10 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, // climbing the DAG back to the root, and it doesn't seem to be worth the // effort. for (SDNode::use_iterator UI = Op.getNode()->use_begin(), - UE = Op.getNode()->use_end(); UI != UE; ++UI) - if (UI->getOpcode() != ISD::CopyToReg && UI->getOpcode() != ISD::SETCC) + UE = Op.getNode()->use_end(); UI != UE; ++UI) + if (UI->getOpcode() != ISD::CopyToReg && + UI->getOpcode() != ISD::SETCC && + UI->getOpcode() != ISD::STORE) goto default_case; if (ConstantSDNode *C = @@ -8334,11 +8214,19 @@ SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC, } } else if (Op1.getOpcode() == ISD::Constant) { ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1); + uint64_t AndRHSVal = AndRHS->getZExtValue(); SDValue AndLHS = Op0; - if (AndRHS->getZExtValue() == 1 && AndLHS.getOpcode() == ISD::SRL) { + + if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) { LHS = AndLHS.getOperand(0); RHS = AndLHS.getOperand(1); } + + // Use BT if the immediate can't be encoded in a TEST instruction. + if (!isUInt<32>(AndRHSVal) && isPowerOf2_64(AndRHSVal)) { + LHS = AndLHS; + RHS = DAG.getConstant(Log2_64_Ceil(AndRHSVal), LHS.getValueType()); + } } if (LHS.getNode()) { @@ -8510,8 +8398,7 @@ SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const { UNORD = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(3, MVT::i8)); EQ = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(0, MVT::i8)); return DAG.getNode(ISD::OR, dl, VT, UNORD, EQ); - } - else if (SetCCOpcode == ISD::SETONE) { + } else if (SetCCOpcode == ISD::SETONE) { SDValue ORD, NEQ; ORD = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(7, MVT::i8)); NEQ = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(4, MVT::i8)); @@ -8524,7 +8411,7 @@ SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const { } // Break 256-bit integer vector compare into smaller ones. - if (!isFP && VT.getSizeInBits() == 256) + if (VT.getSizeInBits() == 256 && !Subtarget->hasAVX2()) return Lower256IntVSETCC(Op, DAG); // We are handling one of the integer comparisons here. 
Since SSE only has @@ -8533,12 +8420,12 @@ SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const { unsigned Opc = 0, EQOpc = 0, GTOpc = 0; bool Swap = false, Invert = false, FlipSigns = false; - switch (VT.getSimpleVT().SimpleTy) { + switch (VT.getVectorElementType().getSimpleVT().SimpleTy) { default: break; - case MVT::v16i8: EQOpc = X86ISD::PCMPEQB; GTOpc = X86ISD::PCMPGTB; break; - case MVT::v8i16: EQOpc = X86ISD::PCMPEQW; GTOpc = X86ISD::PCMPGTW; break; - case MVT::v4i32: EQOpc = X86ISD::PCMPEQD; GTOpc = X86ISD::PCMPGTD; break; - case MVT::v2i64: EQOpc = X86ISD::PCMPEQQ; GTOpc = X86ISD::PCMPGTQ; break; + case MVT::i8: EQOpc = X86ISD::PCMPEQB; GTOpc = X86ISD::PCMPGTB; break; + case MVT::i16: EQOpc = X86ISD::PCMPEQW; GTOpc = X86ISD::PCMPGTW; break; + case MVT::i32: EQOpc = X86ISD::PCMPEQD; GTOpc = X86ISD::PCMPGTD; break; + case MVT::i64: EQOpc = X86ISD::PCMPEQQ; GTOpc = X86ISD::PCMPGTQ; break; } switch (SetCCOpcode) { @@ -8559,9 +8446,9 @@ SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const { // Check that the operation in question is available (most are plain SSE2, // but PCMPGTQ and PCMPEQQ have different requirements). - if (Opc == X86ISD::PCMPGTQ && !Subtarget->hasSSE42() && !Subtarget->hasAVX()) + if (Opc == X86ISD::PCMPGTQ && !Subtarget->hasSSE42orAVX()) return SDValue(); - if (Opc == X86ISD::PCMPEQQ && !Subtarget->hasSSE41() && !Subtarget->hasAVX()) + if (Opc == X86ISD::PCMPEQQ && !Subtarget->hasSSE41orAVX()) return SDValue(); // Since SSE has no unsigned integer comparisons, we need to flip the sign @@ -8678,8 +8565,9 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { // If condition flag is set by a X86ISD::CMP, then use it as the condition // setting operand in place of the X86ISD::SETCC. 
- if (Cond.getOpcode() == X86ISD::SETCC || - Cond.getOpcode() == X86ISD::SETCC_CARRY) { + unsigned CondOpcode = Cond.getOpcode(); + if (CondOpcode == X86ISD::SETCC || + CondOpcode == X86ISD::SETCC_CARRY) { CC = Cond.getOperand(0); SDValue Cmp = Cond.getOperand(1); @@ -8696,6 +8584,39 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { Cond = Cmp; addTest = false; } + } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO || + CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO || + ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) && + Cond.getOperand(0).getValueType() != MVT::i8)) { + SDValue LHS = Cond.getOperand(0); + SDValue RHS = Cond.getOperand(1); + unsigned X86Opcode; + unsigned X86Cond; + SDVTList VTs; + switch (CondOpcode) { + case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break; + case ISD::SADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break; + case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break; + case ISD::SSUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break; + case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break; + case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break; + default: llvm_unreachable("unexpected overflowing operator"); + } + if (CondOpcode == ISD::UMULO) + VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(), + MVT::i32); + else + VTs = DAG.getVTList(LHS.getValueType(), MVT::i32); + + SDValue X86Op = DAG.getNode(X86Opcode, DL, VTs, LHS, RHS); + + if (CondOpcode == ISD::UMULO) + Cond = X86Op.getValue(2); + else + Cond = X86Op.getValue(1); + + CC = DAG.getConstant(X86Cond, MVT::i8); + addTest = false; } if (addTest) { @@ -8777,11 +8698,27 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { SDValue Dest = Op.getOperand(2); DebugLoc dl = Op.getDebugLoc(); SDValue CC; + bool Inverted = false; if (Cond.getOpcode() == ISD::SETCC) { - SDValue NewCond = LowerSETCC(Cond, DAG); - if (NewCond.getNode()) - Cond = NewCond; + // Check for setcc([su]{add,sub,mul}o == 0). + if (cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ && + isa<ConstantSDNode>(Cond.getOperand(1)) && + cast<ConstantSDNode>(Cond.getOperand(1))->isNullValue() && + Cond.getOperand(0).getResNo() == 1 && + (Cond.getOperand(0).getOpcode() == ISD::SADDO || + Cond.getOperand(0).getOpcode() == ISD::UADDO || + Cond.getOperand(0).getOpcode() == ISD::SSUBO || + Cond.getOperand(0).getOpcode() == ISD::USUBO || + Cond.getOperand(0).getOpcode() == ISD::SMULO || + Cond.getOperand(0).getOpcode() == ISD::UMULO)) { + Inverted = true; + Cond = Cond.getOperand(0); + } else { + SDValue NewCond = LowerSETCC(Cond, DAG); + if (NewCond.getNode()) + Cond = NewCond; + } } #if 0 // FIXME: LowerXALUO doesn't handle these!! @@ -8802,8 +8739,9 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { // If condition flag is set by a X86ISD::CMP, then use it as the condition // setting operand in place of the X86ISD::SETCC. 
- if (Cond.getOpcode() == X86ISD::SETCC || - Cond.getOpcode() == X86ISD::SETCC_CARRY) { + unsigned CondOpcode = Cond.getOpcode(); + if (CondOpcode == X86ISD::SETCC || + CondOpcode == X86ISD::SETCC_CARRY) { CC = Cond.getOperand(0); SDValue Cmp = Cond.getOperand(1); @@ -8824,6 +8762,43 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { break; } } + } + CondOpcode = Cond.getOpcode(); + if (CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO || + CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO || + ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) && + Cond.getOperand(0).getValueType() != MVT::i8)) { + SDValue LHS = Cond.getOperand(0); + SDValue RHS = Cond.getOperand(1); + unsigned X86Opcode; + unsigned X86Cond; + SDVTList VTs; + switch (CondOpcode) { + case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break; + case ISD::SADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break; + case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break; + case ISD::SSUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break; + case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break; + case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break; + default: llvm_unreachable("unexpected overflowing operator"); + } + if (Inverted) + X86Cond = X86::GetOppositeBranchCondition((X86::CondCode)X86Cond); + if (CondOpcode == ISD::UMULO) + VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(), + MVT::i32); + else + VTs = DAG.getVTList(LHS.getValueType(), MVT::i32); + + SDValue X86Op = DAG.getNode(X86Opcode, dl, VTs, LHS, RHS); + + if (CondOpcode == ISD::UMULO) + Cond = X86Op.getValue(2); + else + Cond = X86Op.getValue(1); + + CC = DAG.getConstant(X86Cond, MVT::i8); + addTest = false; } else { unsigned CondOpc; if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) { @@ -8887,6 +8862,66 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { CC = DAG.getConstant(CCode, MVT::i8); Cond = Cond.getOperand(0).getOperand(1); addTest = false; + } else if (Cond.getOpcode() == ISD::SETCC && + cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETOEQ) { + // For FCMP_OEQ, we can emit + // two branches instead of an explicit AND instruction with a + // separate test. However, we only do this if this block doesn't + // have a fall-through edge, because this requires an explicit + // jmp when the condition is false. + if (Op.getNode()->hasOneUse()) { + SDNode *User = *Op.getNode()->use_begin(); + // Look for an unconditional branch following this conditional branch. + // We need this because we need to reverse the successors in order + // to implement FCMP_OEQ. + if (User->getOpcode() == ISD::BR) { + SDValue FalseBB = User->getOperand(1); + SDNode *NewBR = + DAG.UpdateNodeOperands(User, User->getOperand(0), Dest); + assert(NewBR == User); + (void)NewBR; + Dest = FalseBB; + + SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32, + Cond.getOperand(0), Cond.getOperand(1)); + CC = DAG.getConstant(X86::COND_NE, MVT::i8); + Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), + Chain, Dest, CC, Cmp); + CC = DAG.getConstant(X86::COND_P, MVT::i8); + Cond = Cmp; + addTest = false; + } + } + } else if (Cond.getOpcode() == ISD::SETCC && + cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETUNE) { + // For FCMP_UNE, we can emit + // two branches instead of an explicit AND instruction with a + // separate test. 
However, we only do this if this block doesn't + // have a fall-through edge, because this requires an explicit + // jmp when the condition is false. + if (Op.getNode()->hasOneUse()) { + SDNode *User = *Op.getNode()->use_begin(); + // Look for an unconditional branch following this conditional branch. + // We need this because we need to reverse the successors in order + // to implement FCMP_UNE. + if (User->getOpcode() == ISD::BR) { + SDValue FalseBB = User->getOperand(1); + SDNode *NewBR = + DAG.UpdateNodeOperands(User, User->getOperand(0), Dest); + assert(NewBR == User); + (void)NewBR; + + SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32, + Cond.getOperand(0), Cond.getOperand(1)); + CC = DAG.getConstant(X86::COND_NE, MVT::i8); + Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), + Chain, Dest, CC, Cmp); + CC = DAG.getConstant(X86::COND_NP, MVT::i8); + Cond = Cmp; + addTest = false; + Dest = FalseBB; + } + } } } @@ -8925,7 +8960,7 @@ SDValue X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const { assert((Subtarget->isTargetCygMing() || Subtarget->isTargetWindows() || - EnableSegmentedStacks) && + getTargetMachine().Options.EnableSegmentedStacks) && "This should be used only on Windows targets or when segmented stacks " "are being used"); assert(!Subtarget->isTargetEnvMacho() && "Not implemented"); @@ -8939,7 +8974,7 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, bool Is64Bit = Subtarget->is64Bit(); EVT SPTy = Is64Bit ? MVT::i64 : MVT::i32; - if (EnableSegmentedStacks) { + if (getTargetMachine().Options.EnableSegmentedStacks) { MachineFunction &MF = DAG.getMachineFunction(); MachineRegisterInfo &MRI = MF.getRegInfo(); @@ -9075,7 +9110,7 @@ SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { if (ArgMode == 2) { // Sanity Check: Make sure using fp_offset makes sense. - assert(!UseSoftFloat && + assert(!getTargetMachine().Options.UseSoftFloat && !(DAG.getMachineFunction() .getFunction()->hasFnAttr(Attribute::NoImplicitFloat)) && Subtarget->hasXMM()); @@ -9105,7 +9140,7 @@ SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { Chain, VAARG, MachinePointerInfo(), - false, false, 0); + false, false, false, 0); } SDValue X86TargetLowering::LowerVACOPY(SDValue Op, SelectionDAG &DAG) const { @@ -9243,6 +9278,23 @@ X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const case Intrinsic::x86_avx_hsub_pd_256: return DAG.getNode(X86ISD::FHSUB, dl, Op.getValueType(), Op.getOperand(1), Op.getOperand(2)); + case Intrinsic::x86_avx2_psllv_d: + case Intrinsic::x86_avx2_psllv_q: + case Intrinsic::x86_avx2_psllv_d_256: + case Intrinsic::x86_avx2_psllv_q_256: + return DAG.getNode(ISD::SHL, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); + case Intrinsic::x86_avx2_psrlv_d: + case Intrinsic::x86_avx2_psrlv_q: + case Intrinsic::x86_avx2_psrlv_d_256: + case Intrinsic::x86_avx2_psrlv_q_256: + return DAG.getNode(ISD::SRL, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); + case Intrinsic::x86_avx2_psrav_d: + case Intrinsic::x86_avx2_psrav_d_256: + return DAG.getNode(ISD::SRA, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); + // ptest and testp intrinsics. The intrinsic these come from are designed to // return an integer value, not just an instruction so lower it to the ptest // or testp pattern and a setcc for the result. 
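The x86_avx2_psllv/psrlv/psrav cases added in the hunk above map the AVX2 per-lane variable-shift intrinsics directly onto the generic ISD::SHL/SRL/SRA vector nodes. A minimal host-side model of the per-lane semantics being relied on, as illustrative C++ only and assuming in-range shift counts (scalar shifts by 32 or more are undefined in C++, so they are not modelled here):

// Each lane is shifted by the count held in the matching lane of the second
// operand; the 8 x i32 layout matches the *_d_256 intrinsic forms.
#include <array>
#include <cstdint>
#include <cstdio>

int main() {
  std::array<uint32_t, 8> A{1, 2, 4, 8, 16, 32, 64, 128};
  std::array<uint32_t, 8> Cnt{0, 1, 2, 3, 4, 5, 6, 7};   // one shift amount per lane
  std::array<uint32_t, 8> Shl, Srl;
  std::array<int32_t, 8>  Sra;
  for (unsigned i = 0; i != 8; ++i) {
    Shl[i] = A[i] << Cnt[i];            // ISD::SHL  <- x86_avx2_psllv_d_256
    Srl[i] = A[i] >> Cnt[i];            // ISD::SRL  <- x86_avx2_psrlv_d_256
    Sra[i] = int32_t(A[i]) >> Cnt[i];   // ISD::SRA  <- x86_avx2_psrav_d_256
  }
  std::printf("lane 3: shl=%u srl=%u sra=%d\n", Shl[3], Srl[3], Sra[3]);  // 64 1 1
  return 0;
}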
@@ -9311,6 +9363,14 @@ X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const // Fix vector shift instructions where the last operand is a non-immediate // i32 value. + case Intrinsic::x86_avx2_pslli_w: + case Intrinsic::x86_avx2_pslli_d: + case Intrinsic::x86_avx2_pslli_q: + case Intrinsic::x86_avx2_psrli_w: + case Intrinsic::x86_avx2_psrli_d: + case Intrinsic::x86_avx2_psrli_q: + case Intrinsic::x86_avx2_psrai_w: + case Intrinsic::x86_avx2_psrai_d: case Intrinsic::x86_sse2_pslli_w: case Intrinsic::x86_sse2_pslli_d: case Intrinsic::x86_sse2_pslli_q: @@ -9358,6 +9418,30 @@ X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const case Intrinsic::x86_sse2_psrai_d: NewIntNo = Intrinsic::x86_sse2_psra_d; break; + case Intrinsic::x86_avx2_pslli_w: + NewIntNo = Intrinsic::x86_avx2_psll_w; + break; + case Intrinsic::x86_avx2_pslli_d: + NewIntNo = Intrinsic::x86_avx2_psll_d; + break; + case Intrinsic::x86_avx2_pslli_q: + NewIntNo = Intrinsic::x86_avx2_psll_q; + break; + case Intrinsic::x86_avx2_psrli_w: + NewIntNo = Intrinsic::x86_avx2_psrl_w; + break; + case Intrinsic::x86_avx2_psrli_d: + NewIntNo = Intrinsic::x86_avx2_psrl_d; + break; + case Intrinsic::x86_avx2_psrli_q: + NewIntNo = Intrinsic::x86_avx2_psrl_q; + break; + case Intrinsic::x86_avx2_psrai_w: + NewIntNo = Intrinsic::x86_avx2_psra_w; + break; + case Intrinsic::x86_avx2_psrai_d: + NewIntNo = Intrinsic::x86_avx2_psra_d; + break; default: { ShAmtVT = MVT::v2i32; switch (IntNo) { @@ -9431,13 +9515,13 @@ SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op, return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), DAG.getNode(ISD::ADD, dl, getPointerTy(), FrameAddr, Offset), - MachinePointerInfo(), false, false, 0); + MachinePointerInfo(), false, false, false, 0); } // Just load the return address. SDValue RetAddrFI = getReturnAddressFrameIndex(DAG); return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), - RetAddrFI, MachinePointerInfo(), false, false, 0); + RetAddrFI, MachinePointerInfo(), false, false, false, 0); } SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { @@ -9452,7 +9536,7 @@ SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { while (Depth--) FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr, MachinePointerInfo(), - false, false, 0); + false, false, false, 0); return FrameAddr; } @@ -9684,7 +9768,7 @@ SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op, // Load FP Control Word from stack slot SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, - MachinePointerInfo(), false, false, 0); + MachinePointerInfo(), false, false, false, 0); // Transform as necessary SDValue CWD1 = @@ -9823,12 +9907,55 @@ SDValue X86TargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); // Decompose 256-bit ops into smaller 128-bit ops. 
- if (VT.getSizeInBits() == 256) + if (VT.getSizeInBits() == 256 && !Subtarget->hasAVX2()) return Lower256IntArith(Op, DAG); - assert(VT == MVT::v2i64 && "Only know how to lower V2I64 multiply"); DebugLoc dl = Op.getDebugLoc(); + SDValue A = Op.getOperand(0); + SDValue B = Op.getOperand(1); + + if (VT == MVT::v4i64) { + assert(Subtarget->hasAVX2() && "Lowering v4i64 multiply requires AVX2"); + + // ulong2 Ahi = __builtin_ia32_psrlqi256( a, 32); + // ulong2 Bhi = __builtin_ia32_psrlqi256( b, 32); + // ulong2 AloBlo = __builtin_ia32_pmuludq256( a, b ); + // ulong2 AloBhi = __builtin_ia32_pmuludq256( a, Bhi ); + // ulong2 AhiBlo = __builtin_ia32_pmuludq256( Ahi, b ); + // + // AloBhi = __builtin_ia32_psllqi256( AloBhi, 32 ); + // AhiBlo = __builtin_ia32_psllqi256( AhiBlo, 32 ); + // return AloBlo + AloBhi + AhiBlo; + + SDValue Ahi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, + DAG.getConstant(Intrinsic::x86_avx2_psrli_q, MVT::i32), + A, DAG.getConstant(32, MVT::i32)); + SDValue Bhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, + DAG.getConstant(Intrinsic::x86_avx2_psrli_q, MVT::i32), + B, DAG.getConstant(32, MVT::i32)); + SDValue AloBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, + DAG.getConstant(Intrinsic::x86_avx2_pmulu_dq, MVT::i32), + A, B); + SDValue AloBhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, + DAG.getConstant(Intrinsic::x86_avx2_pmulu_dq, MVT::i32), + A, Bhi); + SDValue AhiBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, + DAG.getConstant(Intrinsic::x86_avx2_pmulu_dq, MVT::i32), + Ahi, B); + AloBhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, + DAG.getConstant(Intrinsic::x86_avx2_pslli_q, MVT::i32), + AloBhi, DAG.getConstant(32, MVT::i32)); + AhiBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, + DAG.getConstant(Intrinsic::x86_avx2_pslli_q, MVT::i32), + AhiBlo, DAG.getConstant(32, MVT::i32)); + SDValue Res = DAG.getNode(ISD::ADD, dl, VT, AloBlo, AloBhi); + Res = DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo); + return Res; + } + + assert(VT == MVT::v2i64 && "Only know how to lower V2I64 multiply"); + // ulong2 Ahi = __builtin_ia32_psrlqi128( a, 32); // ulong2 Bhi = __builtin_ia32_psrlqi128( b, 32); // ulong2 AloBlo = __builtin_ia32_pmuludq128( a, b ); @@ -9839,9 +9966,6 @@ SDValue X86TargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const { // AhiBlo = __builtin_ia32_psllqi128( AhiBlo, 32 ); // return AloBlo + AloBhi + AhiBlo; - SDValue A = Op.getOperand(0); - SDValue B = Op.getOperand(1); - SDValue Ahi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32), A, DAG.getConstant(32, MVT::i32)); @@ -9879,53 +10003,25 @@ SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const { if (!Subtarget->hasXMMInt()) return SDValue(); - // Decompose 256-bit shifts into smaller 128-bit shifts. 
- if (VT.getSizeInBits() == 256) { - int NumElems = VT.getVectorNumElements(); - MVT EltVT = VT.getVectorElementType().getSimpleVT(); - EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2); - - // Extract the two vectors - SDValue V1 = Extract128BitVector(R, DAG.getConstant(0, MVT::i32), DAG, dl); - SDValue V2 = Extract128BitVector(R, DAG.getConstant(NumElems/2, MVT::i32), - DAG, dl); - - // Recreate the shift amount vectors - SDValue Amt1, Amt2; - if (Amt.getOpcode() == ISD::BUILD_VECTOR) { - // Constant shift amount - SmallVector<SDValue, 4> Amt1Csts; - SmallVector<SDValue, 4> Amt2Csts; - for (int i = 0; i < NumElems/2; ++i) - Amt1Csts.push_back(Amt->getOperand(i)); - for (int i = NumElems/2; i < NumElems; ++i) - Amt2Csts.push_back(Amt->getOperand(i)); - - Amt1 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, - &Amt1Csts[0], NumElems/2); - Amt2 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, - &Amt2Csts[0], NumElems/2); - } else { - // Variable shift amount - Amt1 = Extract128BitVector(Amt, DAG.getConstant(0, MVT::i32), DAG, dl); - Amt2 = Extract128BitVector(Amt, DAG.getConstant(NumElems/2, MVT::i32), - DAG, dl); - } - - // Issue new vector shifts for the smaller types - V1 = DAG.getNode(Op.getOpcode(), dl, NewVT, V1, Amt1); - V2 = DAG.getNode(Op.getOpcode(), dl, NewVT, V2, Amt2); - - // Concatenate the result back - return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, V1, V2); - } - // Optimize shl/srl/sra with constant shift amount. if (isSplatVector(Amt.getNode())) { SDValue SclrAmt = Amt->getOperand(0); if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(SclrAmt)) { uint64_t ShiftAmt = C->getZExtValue(); + if (VT == MVT::v16i8 && Op.getOpcode() == ISD::SHL) { + // Make a large shift. + SDValue SHL = + DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, + DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), + R, DAG.getConstant(ShiftAmt, MVT::i32)); + // Zero out the rightmost bits. + SmallVector<SDValue, 16> V(16, DAG.getConstant(uint8_t(-1U << ShiftAmt), + MVT::i8)); + return DAG.getNode(ISD::AND, dl, VT, SHL, + DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 16)); + } + if (VT == MVT::v2i64 && Op.getOpcode() == ISD::SHL) return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32), @@ -9941,6 +10037,19 @@ SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const { DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), R, DAG.getConstant(ShiftAmt, MVT::i32)); + if (VT == MVT::v16i8 && Op.getOpcode() == ISD::SRL) { + // Make a large shift. + SDValue SRL = + DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, + DAG.getConstant(Intrinsic::x86_sse2_psrli_w, MVT::i32), + R, DAG.getConstant(ShiftAmt, MVT::i32)); + // Zero out the leftmost bits. 
+ SmallVector<SDValue, 16> V(16, DAG.getConstant(uint8_t(-1U) >> ShiftAmt, + MVT::i8)); + return DAG.getNode(ISD::AND, dl, VT, SRL, + DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 16)); + } + if (VT == MVT::v2i64 && Op.getOpcode() == ISD::SRL) return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32), @@ -9965,6 +10074,66 @@ SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const { return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, DAG.getConstant(Intrinsic::x86_sse2_psrai_w, MVT::i32), R, DAG.getConstant(ShiftAmt, MVT::i32)); + + if (VT == MVT::v16i8 && Op.getOpcode() == ISD::SRA) { + if (ShiftAmt == 7) { + // R s>> 7 === R s< 0 + SDValue Zeros = getZeroVector(VT, true /* HasXMMInt */, DAG, dl); + return DAG.getNode(X86ISD::PCMPGTB, dl, VT, Zeros, R); + } + + // R s>> a === ((R u>> a) ^ m) - m + SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt); + SmallVector<SDValue, 16> V(16, DAG.getConstant(128 >> ShiftAmt, + MVT::i8)); + SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 16); + Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask); + Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask); + return Res; + } + + if (Subtarget->hasAVX2() && VT == MVT::v32i8) { + if (Op.getOpcode() == ISD::SHL) { + // Make a large shift. + SDValue SHL = + DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, + DAG.getConstant(Intrinsic::x86_avx2_pslli_w, MVT::i32), + R, DAG.getConstant(ShiftAmt, MVT::i32)); + // Zero out the rightmost bits. + SmallVector<SDValue, 32> V(32, DAG.getConstant(uint8_t(-1U << ShiftAmt), + MVT::i8)); + return DAG.getNode(ISD::AND, dl, VT, SHL, + DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 32)); + } + if (Op.getOpcode() == ISD::SRL) { + // Make a large shift. + SDValue SRL = + DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, + DAG.getConstant(Intrinsic::x86_avx2_psrli_w, MVT::i32), + R, DAG.getConstant(ShiftAmt, MVT::i32)); + // Zero out the leftmost bits. 
+ SmallVector<SDValue, 32> V(32, DAG.getConstant(uint8_t(-1U) >> ShiftAmt, + MVT::i8)); + return DAG.getNode(ISD::AND, dl, VT, SRL, + DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 32)); + } + if (Op.getOpcode() == ISD::SRA) { + if (ShiftAmt == 7) { + // R s>> 7 === R s< 0 + SDValue Zeros = getZeroVector(VT, true /* HasXMMInt */, DAG, dl); + return DAG.getNode(X86ISD::PCMPGTB, dl, VT, Zeros, R); + } + + // R s>> a === ((R u>> a) ^ m) - m + SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt); + SmallVector<SDValue, 32> V(32, DAG.getConstant(128 >> ShiftAmt, + MVT::i8)); + SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 32); + Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask); + Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask); + return Res; + } + } } } @@ -9981,7 +10150,7 @@ SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const { SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); SDValue Addend = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, MachinePointerInfo::getConstantPool(), - false, false, 16); + false, false, false, 16); Op = DAG.getNode(ISD::ADD, dl, VT, Op, Addend); Op = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, Op); @@ -10003,14 +10172,14 @@ SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const { SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); SDValue M = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, MachinePointerInfo::getConstantPool(), - false, false, 16); + false, false, false, 16); // r = pblendv(r, psllw(r & (char16)15, 4), a); M = DAG.getNode(ISD::AND, dl, VT, R, M); M = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), M, DAG.getConstant(4, MVT::i32)); - R = DAG.getNode(ISD::VSELECT, dl, VT, Op, R, M); + R = DAG.getNode(ISD::VSELECT, dl, VT, Op, M, R); // a += a Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op); @@ -10018,22 +10187,64 @@ SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const { CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); M = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, MachinePointerInfo::getConstantPool(), - false, false, 16); + false, false, false, 16); // r = pblendv(r, psllw(r & (char16)63, 2), a); M = DAG.getNode(ISD::AND, dl, VT, R, M); M = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), M, DAG.getConstant(2, MVT::i32)); - R = DAG.getNode(ISD::VSELECT, dl, VT, Op, R, M); + R = DAG.getNode(ISD::VSELECT, dl, VT, Op, M, R); // a += a Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op); // return pblendv(r, r+r, a); R = DAG.getNode(ISD::VSELECT, dl, VT, Op, - R, DAG.getNode(ISD::ADD, dl, VT, R, R)); + DAG.getNode(ISD::ADD, dl, VT, R, R), R); return R; } + + // Decompose 256-bit shifts into smaller 128-bit shifts. 
+ if (VT.getSizeInBits() == 256) { + int NumElems = VT.getVectorNumElements(); + MVT EltVT = VT.getVectorElementType().getSimpleVT(); + EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2); + + // Extract the two vectors + SDValue V1 = Extract128BitVector(R, DAG.getConstant(0, MVT::i32), DAG, dl); + SDValue V2 = Extract128BitVector(R, DAG.getConstant(NumElems/2, MVT::i32), + DAG, dl); + + // Recreate the shift amount vectors + SDValue Amt1, Amt2; + if (Amt.getOpcode() == ISD::BUILD_VECTOR) { + // Constant shift amount + SmallVector<SDValue, 4> Amt1Csts; + SmallVector<SDValue, 4> Amt2Csts; + for (int i = 0; i < NumElems/2; ++i) + Amt1Csts.push_back(Amt->getOperand(i)); + for (int i = NumElems/2; i < NumElems; ++i) + Amt2Csts.push_back(Amt->getOperand(i)); + + Amt1 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, + &Amt1Csts[0], NumElems/2); + Amt2 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, + &Amt2Csts[0], NumElems/2); + } else { + // Variable shift amount + Amt1 = Extract128BitVector(Amt, DAG.getConstant(0, MVT::i32), DAG, dl); + Amt2 = Extract128BitVector(Amt, DAG.getConstant(NumElems/2, MVT::i32), + DAG, dl); + } + + // Issue new vector shifts for the smaller types + V1 = DAG.getNode(Op.getOpcode(), dl, NewVT, V1, Amt1); + V2 = DAG.getNode(Op.getOpcode(), dl, NewVT, V2, Amt2); + + // Concatenate the result back + return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, V1, V2); + } + return SDValue(); } @@ -10114,9 +10325,9 @@ SDValue X86TargetLowering::LowerXALUO(SDValue Op, SelectionDAG &DAG) const { SDValue X86TargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const{ DebugLoc dl = Op.getDebugLoc(); - SDNode* Node = Op.getNode(); - EVT ExtraVT = cast<VTSDNode>(Node->getOperand(1))->getVT(); - EVT VT = Node->getValueType(0); + EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT(); + EVT VT = Op.getValueType(); + if (Subtarget->hasXMMInt() && VT.isVector()) { unsigned BitsDiff = VT.getScalarType().getSizeInBits() - ExtraVT.getScalarType().getSizeInBits(); @@ -10127,24 +10338,55 @@ SDValue X86TargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) switch (VT.getSimpleVT().SimpleTy) { default: return SDValue(); - case MVT::v4i32: { + case MVT::v4i32: SHLIntrinsicsID = Intrinsic::x86_sse2_pslli_d; SRAIntrinsicsID = Intrinsic::x86_sse2_psrai_d; break; - } - case MVT::v8i16: { + case MVT::v8i16: SHLIntrinsicsID = Intrinsic::x86_sse2_pslli_w; SRAIntrinsicsID = Intrinsic::x86_sse2_psrai_w; break; - } + case MVT::v8i32: + case MVT::v16i16: + if (!Subtarget->hasAVX()) + return SDValue(); + if (!Subtarget->hasAVX2()) { + // needs to be split + int NumElems = VT.getVectorNumElements(); + SDValue Idx0 = DAG.getConstant(0, MVT::i32); + SDValue Idx1 = DAG.getConstant(NumElems/2, MVT::i32); + + // Extract the LHS vectors + SDValue LHS = Op.getOperand(0); + SDValue LHS1 = Extract128BitVector(LHS, Idx0, DAG, dl); + SDValue LHS2 = Extract128BitVector(LHS, Idx1, DAG, dl); + + MVT EltVT = VT.getVectorElementType().getSimpleVT(); + EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2); + + EVT ExtraEltVT = ExtraVT.getVectorElementType(); + int ExtraNumElems = ExtraVT.getVectorNumElements(); + ExtraVT = EVT::getVectorVT(*DAG.getContext(), ExtraEltVT, + ExtraNumElems/2); + SDValue Extra = DAG.getValueType(ExtraVT); + + LHS1 = DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, Extra); + LHS2 = DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, Extra); + + return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, LHS1, LHS2);; + } + if (VT == MVT::v8i32) { + SHLIntrinsicsID = Intrinsic::x86_avx2_pslli_d; + SRAIntrinsicsID = 
Intrinsic::x86_avx2_psrai_d; + } else { + SHLIntrinsicsID = Intrinsic::x86_avx2_pslli_w; + SRAIntrinsicsID = Intrinsic::x86_avx2_psrai_w; + } } SDValue Tmp1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, DAG.getConstant(SHLIntrinsicsID, MVT::i32), - Node->getOperand(0), ShAmt); - - // In case of 1 bit sext, no need to shr - if (ExtraVT.getScalarType().getSizeInBits() == 1) return Tmp1; + Op.getOperand(0), ShAmt); return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, DAG.getConstant(SRAIntrinsicsID, MVT::i32), @@ -10522,7 +10764,8 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, EVT VT = N->getValueType(0); // Return a load from the stack slot. Results.push_back(DAG.getLoad(VT, dl, FIST, StackSlot, - MachinePointerInfo(), false, false, 0)); + MachinePointerInfo(), + false, false, false, 0)); } return; } @@ -10656,15 +10899,16 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::PINSRW: return "X86ISD::PINSRW"; case X86ISD::PSHUFB: return "X86ISD::PSHUFB"; case X86ISD::ANDNP: return "X86ISD::ANDNP"; - case X86ISD::PSIGNB: return "X86ISD::PSIGNB"; - case X86ISD::PSIGNW: return "X86ISD::PSIGNW"; - case X86ISD::PSIGND: return "X86ISD::PSIGND"; + case X86ISD::PSIGN: return "X86ISD::PSIGN"; + case X86ISD::BLENDV: return "X86ISD::BLENDV"; + case X86ISD::HADD: return "X86ISD::HADD"; + case X86ISD::HSUB: return "X86ISD::HSUB"; + case X86ISD::FHADD: return "X86ISD::FHADD"; + case X86ISD::FHSUB: return "X86ISD::FHSUB"; case X86ISD::FMAX: return "X86ISD::FMAX"; case X86ISD::FMIN: return "X86ISD::FMIN"; case X86ISD::FRSQRT: return "X86ISD::FRSQRT"; case X86ISD::FRCP: return "X86ISD::FRCP"; - case X86ISD::FHADD: return "X86ISD::FHADD"; - case X86ISD::FHSUB: return "X86ISD::FHSUB"; case X86ISD::TLSADDR: return "X86ISD::TLSADDR"; case X86ISD::TLSCALL: return "X86ISD::TLSCALL"; case X86ISD::EH_RETURN: return "X86ISD::EH_RETURN"; @@ -10704,6 +10948,9 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::XOR: return "X86ISD::XOR"; case X86ISD::AND: return "X86ISD::AND"; case X86ISD::ANDN: return "X86ISD::ANDN"; + case X86ISD::BLSI: return "X86ISD::BLSI"; + case X86ISD::BLSMSK: return "X86ISD::BLSMSK"; + case X86ISD::BLSR: return "X86ISD::BLSR"; case X86ISD::MUL_IMM: return "X86ISD::MUL_IMM"; case X86ISD::PTEST: return "X86ISD::PTEST"; case X86ISD::TESTP: return "X86ISD::TESTP"; @@ -10728,25 +10975,11 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::MOVSLDUP_LD: return "X86ISD::MOVSLDUP_LD"; case X86ISD::MOVSD: return "X86ISD::MOVSD"; case X86ISD::MOVSS: return "X86ISD::MOVSS"; - case X86ISD::UNPCKLPS: return "X86ISD::UNPCKLPS"; - case X86ISD::UNPCKLPD: return "X86ISD::UNPCKLPD"; - case X86ISD::VUNPCKLPDY: return "X86ISD::VUNPCKLPDY"; - case X86ISD::UNPCKHPS: return "X86ISD::UNPCKHPS"; - case X86ISD::UNPCKHPD: return "X86ISD::UNPCKHPD"; - case X86ISD::PUNPCKLBW: return "X86ISD::PUNPCKLBW"; - case X86ISD::PUNPCKLWD: return "X86ISD::PUNPCKLWD"; - case X86ISD::PUNPCKLDQ: return "X86ISD::PUNPCKLDQ"; - case X86ISD::PUNPCKLQDQ: return "X86ISD::PUNPCKLQDQ"; - case X86ISD::PUNPCKHBW: return "X86ISD::PUNPCKHBW"; - case X86ISD::PUNPCKHWD: return "X86ISD::PUNPCKHWD"; - case X86ISD::PUNPCKHDQ: return "X86ISD::PUNPCKHDQ"; - case X86ISD::PUNPCKHQDQ: return "X86ISD::PUNPCKHQDQ"; + case X86ISD::UNPCKL: return "X86ISD::UNPCKL"; + case X86ISD::UNPCKH: return "X86ISD::UNPCKH"; case X86ISD::VBROADCAST: return "X86ISD::VBROADCAST"; - case X86ISD::VPERMILPS: return "X86ISD::VPERMILPS"; - case X86ISD::VPERMILPSY: 
return "X86ISD::VPERMILPSY"; - case X86ISD::VPERMILPD: return "X86ISD::VPERMILPD"; - case X86ISD::VPERMILPDY: return "X86ISD::VPERMILPDY"; - case X86ISD::VPERM2F128: return "X86ISD::VPERM2F128"; + case X86ISD::VPERMILP: return "X86ISD::VPERMILP"; + case X86ISD::VPERM2X128: return "X86ISD::VPERM2X128"; case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS"; case X86ISD::VAARG_64: return "X86ISD::VAARG_64"; case X86ISD::WIN_ALLOCA: return "X86ISD::WIN_ALLOCA"; @@ -10854,7 +11087,7 @@ X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M, EVT VT) const { // Very little shuffling can be done for 64-bit vectors right now. if (VT.getSizeInBits() == 64) - return isPALIGNRMask(M, VT, Subtarget->hasSSSE3() || Subtarget->hasAVX()); + return false; // FIXME: pshufb, blends, shifts. return (VT.getVectorNumElements() == 2 || @@ -10864,9 +11097,9 @@ X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M, isPSHUFDMask(M, VT) || isPSHUFHWMask(M, VT) || isPSHUFLWMask(M, VT) || - isPALIGNRMask(M, VT, Subtarget->hasSSSE3() || Subtarget->hasAVX()) || - isUNPCKLMask(M, VT) || - isUNPCKHMask(M, VT) || + isPALIGNRMask(M, VT, Subtarget->hasSSSE3orAVX()) || + isUNPCKLMask(M, VT, Subtarget->hasAVX2()) || + isUNPCKHMask(M, VT, Subtarget->hasAVX2()) || isUNPCKL_v_undef_Mask(M, VT) || isUNPCKH_v_undef_Mask(M, VT)); } @@ -10882,7 +11115,7 @@ X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask, return (isMOVLMask(Mask, VT) || isCommutedMOVLMask(Mask, VT, true) || isSHUFPMask(Mask, VT) || - isCommutedSHUFPMask(Mask, VT)); + isSHUFPMask(Mask, VT, /* Commuted */ true)); } return false; } @@ -11273,7 +11506,7 @@ X86TargetLowering::EmitAtomicMinMaxWithCustomInserter(MachineInstr *mInstr, MachineBasicBlock * X86TargetLowering::EmitPCMP(MachineInstr *MI, MachineBasicBlock *BB, unsigned numArgs, bool memArg) const { - assert((Subtarget->hasSSE42() || Subtarget->hasAVX()) && + assert(Subtarget->hasSSE42orAVX() && "Target must have SSE4.2 or AVX features enabled"); DebugLoc dl = MI->getDebugLoc(); @@ -11752,7 +11985,7 @@ X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI, MachineBasicBlock *BB, MachineFunction *MF = BB->getParent(); const BasicBlock *LLVM_BB = BB->getBasicBlock(); - assert(EnableSegmentedStacks); + assert(getTargetMachine().Options.EnableSegmentedStacks); unsigned TlsReg = Is64Bit ? X86::FS : X86::GS; unsigned TlsOffset = Is64Bit ? 0x70 : 0x30; @@ -11784,6 +12017,7 @@ X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI, MachineBasicBlock *BB, unsigned mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass), bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass), tmpSPVReg = MRI.createVirtualRegister(AddrRegClass), + SPLimitVReg = MRI.createVirtualRegister(AddrRegClass), sizeVReg = MI->getOperand(1).getReg(), physSPReg = Is64Bit ? X86::RSP : X86::ESP; @@ -11801,19 +12035,19 @@ X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI, MachineBasicBlock *BB, // Add code to the main basic block to check if the stack limit has been hit, // and if so, jump to mallocMBB otherwise to bumpMBB. BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg); - BuildMI(BB, DL, TII->get(Is64Bit ? X86::SUB64rr:X86::SUB32rr), tmpSPVReg) + BuildMI(BB, DL, TII->get(Is64Bit ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg) .addReg(tmpSPVReg).addReg(sizeVReg); BuildMI(BB, DL, TII->get(Is64Bit ? 
X86::CMP64mr:X86::CMP32mr)) .addReg(0).addImm(0).addReg(0).addImm(TlsOffset).addReg(TlsReg) - .addReg(tmpSPVReg); + .addReg(SPLimitVReg); BuildMI(BB, DL, TII->get(X86::JG_4)).addMBB(mallocMBB); // bumpMBB simply decreases the stack pointer, since we know the current // stacklet has enough space. BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), physSPReg) - .addReg(tmpSPVReg); + .addReg(SPLimitVReg); BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg) - .addReg(tmpSPVReg); + .addReg(SPLimitVReg); BuildMI(bumpMBB, DL, TII->get(X86::JMP_4)).addMBB(continueMBB); // Calls into a routine in libgcc to allocate more space from the heap. @@ -12589,7 +12823,7 @@ static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG, // Load the scalar. SDValue LoadScalar = DAG.getLoad(Extract->getValueType(0), dl, Ch, ScalarAddr, MachinePointerInfo(), - false, false, 0); + false, false, false, 0); // Replace the exact with the load. DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), LoadScalar); @@ -12631,7 +12865,7 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, // the operands would cause it to handle comparisons between positive // and negative zero incorrectly. if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) { - if (!UnsafeFPMath && + if (!DAG.getTarget().Options.UnsafeFPMath && !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) break; std::swap(LHS, RHS); @@ -12641,7 +12875,7 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, case ISD::SETOLE: // Converting this to a min would handle comparisons between positive // and negative zero incorrectly. - if (!UnsafeFPMath && + if (!DAG.getTarget().Options.UnsafeFPMath && !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) break; Opcode = X86ISD::FMIN; @@ -12659,7 +12893,7 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, case ISD::SETOGE: // Converting this to a max would handle comparisons between positive // and negative zero incorrectly. - if (!UnsafeFPMath && + if (!DAG.getTarget().Options.UnsafeFPMath && !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) break; Opcode = X86ISD::FMAX; @@ -12669,7 +12903,7 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, // the operands would cause it to handle comparisons between positive // and negative zero incorrectly. if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) { - if (!UnsafeFPMath && + if (!DAG.getTarget().Options.UnsafeFPMath && !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) break; std::swap(LHS, RHS); @@ -12695,7 +12929,7 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, // Converting this to a min would handle comparisons between positive // and negative zero incorrectly, and swapping the operands would // cause it to handle NaNs incorrectly. - if (!UnsafeFPMath && + if (!DAG.getTarget().Options.UnsafeFPMath && !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) { if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) break; @@ -12705,7 +12939,7 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, break; case ISD::SETUGT: // Converting this to a min would handle NaNs incorrectly. 
- if (!UnsafeFPMath && + if (!DAG.getTarget().Options.UnsafeFPMath && (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))) break; Opcode = X86ISD::FMIN; @@ -12730,7 +12964,7 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, // Converting this to a max would handle comparisons between positive // and negative zero incorrectly, and swapping the operands would // cause it to handle NaNs incorrectly. - if (!UnsafeFPMath && + if (!DAG.getTarget().Options.UnsafeFPMath && !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) { if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) break; @@ -13041,7 +13275,8 @@ static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) { // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2)) // since the result of setcc_c is all zero's or all ones. - if (N1C && N0.getOpcode() == ISD::AND && + if (VT.isInteger() && !VT.isVector() && + N1C && N0.getOpcode() == ISD::AND && N0.getOperand(1).getOpcode() == ISD::Constant) { SDValue N00 = N0.getOperand(0); if (N00.getOpcode() == X86ISD::SETCC_CARRY || @@ -13057,6 +13292,22 @@ static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) { } } + + // Hardware support for vector shifts is sparse which makes us scalarize the + // vector operations in many cases. Also, on sandybridge ADD is faster than + // shl. + // (shl V, 1) -> add V,V + if (isSplatVector(N1.getNode())) { + assert(N0.getValueType().isVector() && "Invalid vector shift type"); + ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1->getOperand(0)); + // We shift all of the values by one. In many cases we do not have + // hardware support for this operation. This is better expressed as an ADD + // of two values. + if (N1C && (1 == N1C->getZExtValue())) { + return DAG.getNode(ISD::ADD, N->getDebugLoc(), VT, N0, N0); + } + } + return SDValue(); } @@ -13065,9 +13316,10 @@ static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) { static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG, const X86Subtarget *Subtarget) { EVT VT = N->getValueType(0); - if (!VT.isVector() && VT.isInteger() && - N->getOpcode() == ISD::SHL) - return PerformSHLCombine(N, DAG); + if (N->getOpcode() == ISD::SHL) { + SDValue V = PerformSHLCombine(N, DAG); + if (V.getNode()) return V; + } // On X86 with SSE2 support, we can transform this to a vector shift if // all elements are shifted by the same amount. 
We can't do this in legalize @@ -13076,7 +13328,9 @@ static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG, if (!Subtarget->hasXMMInt()) return SDValue(); - if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16) + if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16 && + (!Subtarget->hasAVX2() || + (VT != MVT::v4i64 && VT != MVT::v8i32 && VT != MVT::v16i16))) return SDValue(); SDValue ShAmtOp = N->getOperand(1); @@ -13149,6 +13403,18 @@ static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG, return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), ValOp, BaseShAmt); + if (VT == MVT::v4i64) + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, + DAG.getConstant(Intrinsic::x86_avx2_pslli_q, MVT::i32), + ValOp, BaseShAmt); + if (VT == MVT::v8i32) + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, + DAG.getConstant(Intrinsic::x86_avx2_pslli_d, MVT::i32), + ValOp, BaseShAmt); + if (VT == MVT::v16i16) + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, + DAG.getConstant(Intrinsic::x86_avx2_pslli_w, MVT::i32), + ValOp, BaseShAmt); break; case ISD::SRA: if (VT == MVT::v4i32) @@ -13159,6 +13425,14 @@ static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG, return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, DAG.getConstant(Intrinsic::x86_sse2_psrai_w, MVT::i32), ValOp, BaseShAmt); + if (VT == MVT::v8i32) + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, + DAG.getConstant(Intrinsic::x86_avx2_psrai_d, MVT::i32), + ValOp, BaseShAmt); + if (VT == MVT::v16i16) + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, + DAG.getConstant(Intrinsic::x86_avx2_psrai_w, MVT::i32), + ValOp, BaseShAmt); break; case ISD::SRL: if (VT == MVT::v2i64) @@ -13173,6 +13447,18 @@ static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG, return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, DAG.getConstant(Intrinsic::x86_sse2_psrli_w, MVT::i32), ValOp, BaseShAmt); + if (VT == MVT::v4i64) + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, + DAG.getConstant(Intrinsic::x86_avx2_psrli_q, MVT::i32), + ValOp, BaseShAmt); + if (VT == MVT::v8i32) + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, + DAG.getConstant(Intrinsic::x86_avx2_psrli_d, MVT::i32), + ValOp, BaseShAmt); + if (VT == MVT::v16i16) + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, + DAG.getConstant(Intrinsic::x86_avx2_psrli_w, MVT::i32), + ValOp, BaseShAmt); break; } return SDValue(); @@ -13299,7 +13585,9 @@ static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG, EVT VT = N->getValueType(0); - // Create ANDN instructions + // Create ANDN, BLSI, and BLSR instructions + // BLSI is X & (-X) + // BLSR is X & (X-1) if (Subtarget->hasBMI() && (VT == MVT::i32 || VT == MVT::i64)) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); @@ -13312,6 +13600,26 @@ static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG, if (N1.getOpcode() == ISD::XOR && isAllOnes(N1.getOperand(1))) return DAG.getNode(X86ISD::ANDN, DL, VT, N1.getOperand(0), N0); + // Check LHS for neg + if (N0.getOpcode() == ISD::SUB && N0.getOperand(1) == N1 && + isZero(N0.getOperand(0))) + return DAG.getNode(X86ISD::BLSI, DL, VT, N1); + + // Check RHS for neg + if (N1.getOpcode() == ISD::SUB && N1.getOperand(1) == N0 && + isZero(N1.getOperand(0))) + return DAG.getNode(X86ISD::BLSI, DL, VT, N0); + + // Check LHS for X-1 + if (N0.getOpcode() == ISD::ADD && N0.getOperand(0) == N1 && + isAllOnes(N0.getOperand(1))) + return DAG.getNode(X86ISD::BLSR, DL, VT, N1); + + // Check RHS 
for X-1 + if (N1.getOpcode() == ISD::ADD && N1.getOperand(0) == N0 && + isAllOnes(N1.getOperand(1))) + return DAG.getNode(X86ISD::BLSR, DL, VT, N0); + return SDValue(); } @@ -13352,98 +13660,98 @@ static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG, return R; EVT VT = N->getValueType(0); - if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64 && VT != MVT::v2i64) - return SDValue(); SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); // look for psign/blend - if (Subtarget->hasSSSE3() || Subtarget->hasAVX()) { - if (VT == MVT::v2i64) { - // Canonicalize pandn to RHS - if (N0.getOpcode() == X86ISD::ANDNP) - std::swap(N0, N1); - // or (and (m, x), (pandn m, y)) - if (N0.getOpcode() == ISD::AND && N1.getOpcode() == X86ISD::ANDNP) { - SDValue Mask = N1.getOperand(0); - SDValue X = N1.getOperand(1); - SDValue Y; - if (N0.getOperand(0) == Mask) - Y = N0.getOperand(1); - if (N0.getOperand(1) == Mask) - Y = N0.getOperand(0); - - // Check to see if the mask appeared in both the AND and ANDNP and - if (!Y.getNode()) - return SDValue(); + if (VT == MVT::v2i64 || VT == MVT::v4i64) { + if (!Subtarget->hasSSSE3orAVX() || + (VT == MVT::v4i64 && !Subtarget->hasAVX2())) + return SDValue(); - // Validate that X, Y, and Mask are BIT_CONVERTS, and see through them. - if (Mask.getOpcode() != ISD::BITCAST || - X.getOpcode() != ISD::BITCAST || - Y.getOpcode() != ISD::BITCAST) - return SDValue(); + // Canonicalize pandn to RHS + if (N0.getOpcode() == X86ISD::ANDNP) + std::swap(N0, N1); + // or (and (m, x), (pandn m, y)) + if (N0.getOpcode() == ISD::AND && N1.getOpcode() == X86ISD::ANDNP) { + SDValue Mask = N1.getOperand(0); + SDValue X = N1.getOperand(1); + SDValue Y; + if (N0.getOperand(0) == Mask) + Y = N0.getOperand(1); + if (N0.getOperand(1) == Mask) + Y = N0.getOperand(0); + + // Check to see if the mask appeared in both the AND and ANDNP and + if (!Y.getNode()) + return SDValue(); - // Look through mask bitcast. - Mask = Mask.getOperand(0); - EVT MaskVT = Mask.getValueType(); + // Validate that X, Y, and Mask are BIT_CONVERTS, and see through them. + if (Mask.getOpcode() != ISD::BITCAST || + X.getOpcode() != ISD::BITCAST || + Y.getOpcode() != ISD::BITCAST) + return SDValue(); - // Validate that the Mask operand is a vector sra node. The sra node - // will be an intrinsic. - if (Mask.getOpcode() != ISD::INTRINSIC_WO_CHAIN) - return SDValue(); + // Look through mask bitcast. + Mask = Mask.getOperand(0); + EVT MaskVT = Mask.getValueType(); - // FIXME: what to do for bytes, since there is a psignb/pblendvb, but - // there is no psrai.b - switch (cast<ConstantSDNode>(Mask.getOperand(0))->getZExtValue()) { - case Intrinsic::x86_sse2_psrai_w: - case Intrinsic::x86_sse2_psrai_d: - break; - default: return SDValue(); - } + // Validate that the Mask operand is a vector sra node. The sra node + // will be an intrinsic. + if (Mask.getOpcode() != ISD::INTRINSIC_WO_CHAIN) + return SDValue(); - // Check that the SRA is all signbits. 
- SDValue SraC = Mask.getOperand(2); - unsigned SraAmt = cast<ConstantSDNode>(SraC)->getZExtValue(); - unsigned EltBits = MaskVT.getVectorElementType().getSizeInBits(); - if ((SraAmt + 1) != EltBits) - return SDValue(); + // FIXME: what to do for bytes, since there is a psignb/pblendvb, but + // there is no psrai.b + switch (cast<ConstantSDNode>(Mask.getOperand(0))->getZExtValue()) { + case Intrinsic::x86_sse2_psrai_w: + case Intrinsic::x86_sse2_psrai_d: + case Intrinsic::x86_avx2_psrai_w: + case Intrinsic::x86_avx2_psrai_d: + break; + default: return SDValue(); + } - DebugLoc DL = N->getDebugLoc(); - - // Now we know we at least have a plendvb with the mask val. See if - // we can form a psignb/w/d. - // psign = x.type == y.type == mask.type && y = sub(0, x); - X = X.getOperand(0); - Y = Y.getOperand(0); - if (Y.getOpcode() == ISD::SUB && Y.getOperand(1) == X && - ISD::isBuildVectorAllZeros(Y.getOperand(0).getNode()) && - X.getValueType() == MaskVT && X.getValueType() == Y.getValueType()){ - unsigned Opc = 0; - switch (EltBits) { - case 8: Opc = X86ISD::PSIGNB; break; - case 16: Opc = X86ISD::PSIGNW; break; - case 32: Opc = X86ISD::PSIGND; break; - default: break; - } - if (Opc) { - SDValue Sign = DAG.getNode(Opc, DL, MaskVT, X, Mask.getOperand(1)); - return DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Sign); - } - } - // PBLENDVB only available on SSE 4.1 - if (!(Subtarget->hasSSE41() || Subtarget->hasAVX())) - return SDValue(); + // Check that the SRA is all signbits. + SDValue SraC = Mask.getOperand(2); + unsigned SraAmt = cast<ConstantSDNode>(SraC)->getZExtValue(); + unsigned EltBits = MaskVT.getVectorElementType().getSizeInBits(); + if ((SraAmt + 1) != EltBits) + return SDValue(); - X = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, X); - Y = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Y); - Mask = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Mask); - Mask = DAG.getNode(ISD::VSELECT, DL, MVT::v16i8, Mask, X, Y); - return DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Mask); + DebugLoc DL = N->getDebugLoc(); + + // Now we know we at least have a plendvb with the mask val. See if + // we can form a psignb/w/d. + // psign = x.type == y.type == mask.type && y = sub(0, x); + X = X.getOperand(0); + Y = Y.getOperand(0); + if (Y.getOpcode() == ISD::SUB && Y.getOperand(1) == X && + ISD::isBuildVectorAllZeros(Y.getOperand(0).getNode()) && + X.getValueType() == MaskVT && X.getValueType() == Y.getValueType() && + (EltBits == 8 || EltBits == 16 || EltBits == 32)) { + SDValue Sign = DAG.getNode(X86ISD::PSIGN, DL, MaskVT, X, + Mask.getOperand(1)); + return DAG.getNode(ISD::BITCAST, DL, VT, Sign); } + // PBLENDVB only available on SSE 4.1 + if (!Subtarget->hasSSE41orAVX()) + return SDValue(); + + EVT BlendVT = (VT == MVT::v4i64) ? 
MVT::v32i8 : MVT::v16i8; + + X = DAG.getNode(ISD::BITCAST, DL, BlendVT, X); + Y = DAG.getNode(ISD::BITCAST, DL, BlendVT, Y); + Mask = DAG.getNode(ISD::BITCAST, DL, BlendVT, Mask); + Mask = DAG.getNode(ISD::VSELECT, DL, BlendVT, Mask, Y, X); + return DAG.getNode(ISD::BITCAST, DL, VT, Mask); } } + if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64) + return SDValue(); + // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c) if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL) std::swap(N0, N1); @@ -13499,6 +13807,33 @@ static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG, return SDValue(); } +static SDValue PerformXorCombine(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget *Subtarget) { + if (DCI.isBeforeLegalizeOps()) + return SDValue(); + + EVT VT = N->getValueType(0); + + if (VT != MVT::i32 && VT != MVT::i64) + return SDValue(); + + // Create BLSMSK instructions by finding X ^ (X-1) + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + DebugLoc DL = N->getDebugLoc(); + + if (N0.getOpcode() == ISD::ADD && N0.getOperand(0) == N1 && + isAllOnes(N0.getOperand(1))) + return DAG.getNode(X86ISD::BLSMSK, DL, VT, N1); + + if (N1.getOpcode() == ISD::ADD && N1.getOperand(0) == N0 && + isAllOnes(N1.getOperand(1))) + return DAG.getNode(X86ISD::BLSMSK, DL, VT, N0); + + return SDValue(); +} + /// PerformLOADCombine - Do target-specific dag combines on LOAD nodes. static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG, const X86Subtarget *Subtarget) { @@ -13552,7 +13887,8 @@ static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG, SDValue ScalarLoad = DAG.getLoad(SclrLoadTy, dl, Ld->getChain(), Ld->getBasePtr(), Ld->getPointerInfo(), Ld->isVolatile(), - Ld->isNonTemporal(), Ld->getAlignment()); + Ld->isNonTemporal(), Ld->isInvariant(), + Ld->getAlignment()); // Insert the word loaded into a vector. SDValue ScalarInVector = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, @@ -13592,7 +13928,7 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, SDValue StoredVal = St->getOperand(1); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - // If we are saving a concatination of two XMM registers, perform two stores. + // If we are saving a concatenation of two XMM registers, perform two stores. // This is better in Sandy Bridge cause one 256-bit mem op is done via two // 128-bit ones. If in the future the cost becomes only one memory access the // first version would be better. @@ -13702,7 +14038,7 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, const Function *F = DAG.getMachineFunction().getFunction(); bool NoImplicitFloatOps = F->hasFnAttr(Attribute::NoImplicitFloat); - bool F64IsLegal = !UseSoftFloat && !NoImplicitFloatOps + bool F64IsLegal = !DAG.getTarget().Options.UseSoftFloat && !NoImplicitFloatOps && Subtarget->hasXMMInt(); if ((VT.isVector() || (VT == MVT::i64 && F64IsLegal && !Subtarget->is64Bit())) && @@ -13748,7 +14084,8 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, EVT LdVT = Subtarget->is64Bit() ? 
MVT::i64 : MVT::f64; SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), Ld->getBasePtr(), Ld->getPointerInfo(), Ld->isVolatile(), - Ld->isNonTemporal(), Ld->getAlignment()); + Ld->isNonTemporal(), Ld->isInvariant(), + Ld->getAlignment()); SDValue NewChain = NewLd.getValue(1); if (TokenFactorIndex != -1) { Ops.push_back(NewChain); @@ -13769,10 +14106,11 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr, Ld->getPointerInfo(), Ld->isVolatile(), Ld->isNonTemporal(), - Ld->getAlignment()); + Ld->isInvariant(), Ld->getAlignment()); SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr, Ld->getPointerInfo().getWithOffset(4), Ld->isVolatile(), Ld->isNonTemporal(), + Ld->isInvariant(), MinAlign(Ld->getAlignment(), 4)); SDValue NewChain = LoLd.getValue(1); @@ -13816,7 +14154,7 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, /// set to A, RHS to B, and the routine returns 'true'. /// Note that the binary operation should have the property that if one of the /// operands is UNDEF then the result is UNDEF. -static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool isCommutative) { +static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) { // Look for the following pattern: if // A = < float a0, float a1, float a2, float a3 > // B = < float b0, float b1, float b2, float b3 > @@ -13832,7 +14170,18 @@ static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool isCommutative) { return false; EVT VT = LHS.getValueType(); - unsigned N = VT.getVectorNumElements(); + + assert((VT.is128BitVector() || VT.is256BitVector()) && + "Unsupported vector type for horizontal add/sub"); + + // Handle 128 and 256-bit vector lengths. AVX defines horizontal add/sub to + // operate independently on 128-bit lanes. + unsigned NumElts = VT.getVectorNumElements(); + unsigned NumLanes = VT.getSizeInBits()/128; + unsigned NumLaneElts = NumElts / NumLanes; + assert((NumLaneElts % 2 == 0) && + "Vector type should have an even number of elements in each lane"); + unsigned HalfLaneElts = NumLaneElts/2; // View LHS in the form // LHS = VECTOR_SHUFFLE A, B, LMask @@ -13841,7 +14190,7 @@ static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool isCommutative) { // NOTE: in what follows a default initialized SDValue represents an UNDEF of // type VT. 
SDValue A, B; - SmallVector<int, 8> LMask(N); + SmallVector<int, 16> LMask(NumElts); if (LHS.getOpcode() == ISD::VECTOR_SHUFFLE) { if (LHS.getOperand(0).getOpcode() != ISD::UNDEF) A = LHS.getOperand(0); @@ -13851,14 +14200,14 @@ static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool isCommutative) { } else { if (LHS.getOpcode() != ISD::UNDEF) A = LHS; - for (unsigned i = 0; i != N; ++i) + for (unsigned i = 0; i != NumElts; ++i) LMask[i] = i; } // Likewise, view RHS in the form // RHS = VECTOR_SHUFFLE C, D, RMask SDValue C, D; - SmallVector<int, 8> RMask(N); + SmallVector<int, 16> RMask(NumElts); if (RHS.getOpcode() == ISD::VECTOR_SHUFFLE) { if (RHS.getOperand(0).getOpcode() != ISD::UNDEF) C = RHS.getOperand(0); @@ -13868,7 +14217,7 @@ static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool isCommutative) { } else { if (RHS.getOpcode() != ISD::UNDEF) C = RHS; - for (unsigned i = 0; i != N; ++i) + for (unsigned i = 0; i != NumElts; ++i) RMask[i] = i; } @@ -13883,30 +14232,28 @@ static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool isCommutative) { // If A and B occur in reverse order in RHS, then "swap" them (which means // rewriting the mask). if (A != C) - for (unsigned i = 0; i != N; ++i) { - unsigned Idx = RMask[i]; - if (Idx < N) - RMask[i] += N; - else if (Idx < 2*N) - RMask[i] -= N; - } + CommuteVectorShuffleMask(RMask, NumElts); // At this point LHS and RHS are equivalent to // LHS = VECTOR_SHUFFLE A, B, LMask // RHS = VECTOR_SHUFFLE A, B, RMask // Check that the masks correspond to performing a horizontal operation. - for (unsigned i = 0; i != N; ++i) { - unsigned LIdx = LMask[i], RIdx = RMask[i]; + for (unsigned i = 0; i != NumElts; ++i) { + int LIdx = LMask[i], RIdx = RMask[i]; // Ignore any UNDEF components. - if (LIdx >= 2*N || RIdx >= 2*N || (!A.getNode() && (LIdx < N || RIdx < N)) - || (!B.getNode() && (LIdx >= N || RIdx >= N))) + if (LIdx < 0 || RIdx < 0 || + (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) || + (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts))) continue; // Check that successive elements are being operated on. If not, this is // not a horizontal operation. - if (!(LIdx == 2*i && RIdx == 2*i + 1) && - !(isCommutative && LIdx == 2*i + 1 && RIdx == 2*i)) + unsigned Src = (i/HalfLaneElts) % 2; // each lane is split between srcs + unsigned LaneStart = (i/NumLaneElts) * NumLaneElts; + int Index = 2*(i%HalfLaneElts) + NumElts*Src + LaneStart; + if (!(LIdx == Index && RIdx == Index + 1) && + !(IsCommutative && LIdx == Index + 1 && RIdx == Index)) return false; } @@ -13923,8 +14270,8 @@ static SDValue PerformFADDCombine(SDNode *N, SelectionDAG &DAG, SDValue RHS = N->getOperand(1); // Try to synthesize horizontal adds from adds of shuffles. - if ((Subtarget->hasSSE3() || Subtarget->hasAVX()) && - (VT == MVT::v4f32 || VT == MVT::v2f64) && + if (((Subtarget->hasSSE3orAVX() && (VT == MVT::v4f32 || VT == MVT::v2f64)) || + (Subtarget->hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) && isHorizontalBinOp(LHS, RHS, true)) return DAG.getNode(X86ISD::FHADD, N->getDebugLoc(), VT, LHS, RHS); return SDValue(); @@ -13938,8 +14285,8 @@ static SDValue PerformFSUBCombine(SDNode *N, SelectionDAG &DAG, SDValue RHS = N->getOperand(1); // Try to synthesize horizontal subs from subs of shuffles. 
- if ((Subtarget->hasSSE3() || Subtarget->hasAVX()) && - (VT == MVT::v4f32 || VT == MVT::v2f64) && + if (((Subtarget->hasSSE3orAVX() && (VT == MVT::v4f32 || VT == MVT::v2f64)) || + (Subtarget->hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) && isHorizontalBinOp(LHS, RHS, false)) return DAG.getNode(X86ISD::FHSUB, N->getDebugLoc(), VT, LHS, RHS); return SDValue(); @@ -14135,7 +14482,24 @@ static SDValue OptimizeConditionalInDecrement(SDNode *N, SelectionDAG &DAG) { DAG.getConstant(0, OtherVal.getValueType()), NewCmp); } -static SDValue PerformSubCombine(SDNode *N, SelectionDAG &DAG) { +/// PerformADDCombine - Do target-specific dag combines on integer adds. +static SDValue PerformAddCombine(SDNode *N, SelectionDAG &DAG, + const X86Subtarget *Subtarget) { + EVT VT = N->getValueType(0); + SDValue Op0 = N->getOperand(0); + SDValue Op1 = N->getOperand(1); + + // Try to synthesize horizontal adds from adds of shuffles. + if (((Subtarget->hasSSSE3orAVX() && (VT == MVT::v8i16 || VT == MVT::v4i32)) || + (Subtarget->hasAVX2() && (VT == MVT::v16i16 || MVT::v8i32))) && + isHorizontalBinOp(Op0, Op1, true)) + return DAG.getNode(X86ISD::HADD, N->getDebugLoc(), VT, Op0, Op1); + + return OptimizeConditionalInDecrement(N, DAG); +} + +static SDValue PerformSubCombine(SDNode *N, SelectionDAG &DAG, + const X86Subtarget *Subtarget) { SDValue Op0 = N->getOperand(0); SDValue Op1 = N->getOperand(1); @@ -14157,6 +14521,13 @@ static SDValue PerformSubCombine(SDNode *N, SelectionDAG &DAG) { } } + // Try to synthesize horizontal adds from adds of shuffles. + EVT VT = N->getValueType(0); + if (((Subtarget->hasSSSE3orAVX() && (VT == MVT::v8i16 || VT == MVT::v4i32)) || + (Subtarget->hasAVX2() && (VT == MVT::v16i16 || VT == MVT::v8i32))) && + isHorizontalBinOp(Op0, Op1, true)) + return DAG.getNode(X86ISD::HSUB, N->getDebugLoc(), VT, Op0, Op1); + return OptimizeConditionalInDecrement(N, DAG); } @@ -14170,8 +14541,8 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case ISD::VSELECT: case ISD::SELECT: return PerformSELECTCombine(N, DAG, Subtarget); case X86ISD::CMOV: return PerformCMOVCombine(N, DAG, DCI); - case ISD::ADD: return OptimizeConditionalInDecrement(N, DAG); - case ISD::SUB: return PerformSubCombine(N, DAG); + case ISD::ADD: return PerformAddCombine(N, DAG, Subtarget); + case ISD::SUB: return PerformSubCombine(N, DAG, Subtarget); case X86ISD::ADC: return PerformADCCombine(N, DAG, DCI); case ISD::MUL: return PerformMulCombine(N, DAG, DCI); case ISD::SHL: @@ -14179,6 +14550,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case ISD::SRL: return PerformShiftCombine(N, DAG, Subtarget); case ISD::AND: return PerformAndCombine(N, DAG, DCI, Subtarget); case ISD::OR: return PerformOrCombine(N, DAG, DCI, Subtarget); + case ISD::XOR: return PerformXorCombine(N, DAG, DCI, Subtarget); case ISD::LOAD: return PerformLOADCombine(N, DAG, Subtarget); case ISD::STORE: return PerformSTORECombine(N, DAG, Subtarget); case ISD::SINT_TO_FP: return PerformSINT_TO_FPCombine(N, DAG, this); @@ -14194,22 +14566,8 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case X86ISD::SHUFPS: // Handle all target specific shuffles case X86ISD::SHUFPD: case X86ISD::PALIGN: - case X86ISD::PUNPCKHBW: - case X86ISD::PUNPCKHWD: - case X86ISD::PUNPCKHDQ: - case X86ISD::PUNPCKHQDQ: - case X86ISD::UNPCKHPS: - case X86ISD::UNPCKHPD: - case X86ISD::VUNPCKHPSY: - case X86ISD::VUNPCKHPDY: - case X86ISD::PUNPCKLBW: - case X86ISD::PUNPCKLWD: - case X86ISD::PUNPCKLDQ: - case X86ISD::PUNPCKLQDQ: - case X86ISD::UNPCKLPS: - case 
X86ISD::UNPCKLPD: - case X86ISD::VUNPCKLPSY: - case X86ISD::VUNPCKLPDY: + case X86ISD::UNPCKH: + case X86ISD::UNPCKL: case X86ISD::MOVHLPS: case X86ISD::MOVLHPS: case X86ISD::PSHUFD: @@ -14217,11 +14575,8 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case X86ISD::PSHUFLW: case X86ISD::MOVSS: case X86ISD::MOVSD: - case X86ISD::VPERMILPS: - case X86ISD::VPERMILPSY: - case X86ISD::VPERMILPD: - case X86ISD::VPERMILPDY: - case X86ISD::VPERM2F128: + case X86ISD::VPERMILP: + case X86ISD::VPERM2X128: case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, DCI,Subtarget); }
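
Side note on the new scalar BMI combines (X86ISD::BLSI and X86ISD::BLSR in PerformAndCombine, X86ISD::BLSMSK in PerformXorCombine): they match the usual lowest-set-bit identities called out in the comments above. A minimal scalar sketch in plain C++, purely illustrative and not part of this commit (the helper names are made up):

#include <cstdint>

// BLSI: isolate the lowest set bit, x & (0 - x).
uint64_t blsi(uint64_t x)   { return x & (0 - x); }
// BLSR: clear the lowest set bit, x & (x - 1).
uint64_t blsr(uint64_t x)   { return x & (x - 1); }
// BLSMSK: set all bits up to and including the lowest set bit, x ^ (x - 1).
uint64_t blsmsk(uint64_t x) { return x ^ (x - 1); }

A BMI-enabled build can lower each of these to a single blsi/blsr/blsmsk instruction, which is the shape the AND/XOR combines above look for: the negation appears as a SUB from zero, and x - 1 appears as an ADD of all-ones.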

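The horizontal add/sub changes (the generalized isHorizontalBinOp plus the new X86ISD::HADD/HSUB combines in PerformAddCombine and PerformSubCombine) recognize a shuffle-plus-add pattern whose result, for 256-bit types, is computed independently in each 128-bit lane. A rough scalar model of the 8 x float case, assuming two 128-bit lanes (illustrative only, not part of the commit):

#include <array>

// Scalar model of a lane-wise 256-bit horizontal add: in each 128-bit lane,
// the low half of the result holds pairwise sums from A and the high half
// holds pairwise sums from B.
std::array<float, 8> hadd_v8f32(const std::array<float, 8> &A,
                                const std::array<float, 8> &B) {
  std::array<float, 8> R{};
  const unsigned NumLaneElts = 4, HalfLaneElts = NumLaneElts / 2;
  for (unsigned Lane = 0; Lane != 2; ++Lane) {
    unsigned Base = Lane * NumLaneElts;
    for (unsigned i = 0; i != HalfLaneElts; ++i) {
      R[Base + i]                = A[Base + 2*i] + A[Base + 2*i + 1];
      R[Base + HalfLaneElts + i] = B[Base + 2*i] + B[Base + 2*i + 1];
    }
  }
  return R;
}

This mirrors the index math in the rewritten mask check (Src selects between A and B per half-lane, LaneStart keeps each match inside one lane), so a matching pair of shuffles feeding an add or sub collapses into a single horizontal-op node.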