diff options
Diffstat (limited to 'lib/Target/AArch64/AArch64ISelLowering.cpp')
-rw-r--r-- | lib/Target/AArch64/AArch64ISelLowering.cpp | 292 |
1 files changed, 164 insertions, 128 deletions
diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp index 7c94d83..a1b324e 100644 --- a/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -12,6 +12,7 @@ //===----------------------------------------------------------------------===// #include "AArch64ISelLowering.h" +#include "AArch64CallingConvention.h" #include "AArch64MachineFunctionInfo.h" #include "AArch64PerfectShuffle.h" #include "AArch64Subtarget.h" @@ -66,10 +67,9 @@ EnableAArch64SlrGeneration("aarch64-shift-insert-generation", cl::Hidden, cl::desc("Allow AArch64 SLI/SRI formation"), cl::init(false)); - -AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM) - : TargetLowering(TM) { - Subtarget = &TM.getSubtarget<AArch64Subtarget>(); +AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, + const AArch64Subtarget &STI) + : TargetLowering(TM), Subtarget(&STI) { // AArch64 doesn't have comparisons which set GPRs or setcc instructions, so // we have to make something up. Arbitrarily, choose ZeroOrOne. @@ -111,7 +111,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM) } // Compute derived properties from the register classes - computeRegisterProperties(); + computeRegisterProperties(Subtarget->getRegisterInfo()); // Provide all sorts of operation actions setOperationAction(ISD::GlobalAddress, MVT::i64, Custom); @@ -386,13 +386,24 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM) setOperationAction(ISD::FSINCOS, MVT::f32, Expand); } + // Make floating-point constants legal for the large code model, so they don't + // become loads from the constant pool. + if (Subtarget->isTargetMachO() && TM.getCodeModel() == CodeModel::Large) { + setOperationAction(ISD::ConstantFP, MVT::f32, Legal); + setOperationAction(ISD::ConstantFP, MVT::f64, Legal); + } + // AArch64 does not have floating-point extending loads, i1 sign-extending // load, floating-point truncating stores, or v2i32->v2i16 truncating store. - setLoadExtAction(ISD::EXTLOAD, MVT::f16, Expand); - setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand); - setLoadExtAction(ISD::EXTLOAD, MVT::f64, Expand); - setLoadExtAction(ISD::EXTLOAD, MVT::f80, Expand); - setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Expand); + for (MVT VT : MVT::fp_valuetypes()) { + setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand); + setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand); + setLoadExtAction(ISD::EXTLOAD, VT, MVT::f64, Expand); + setLoadExtAction(ISD::EXTLOAD, VT, MVT::f80, Expand); + } + for (MVT VT : MVT::integer_valuetypes()) + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Expand); + setTruncStoreAction(MVT::f32, MVT::f16, Expand); setTruncStoreAction(MVT::f64, MVT::f32, Expand); setTruncStoreAction(MVT::f64, MVT::f16, Expand); @@ -531,26 +542,22 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM) setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand); // Likewise, narrowing and extending vector loads/stores aren't handled // directly. - for (unsigned VT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE; - VT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++VT) { - - setOperationAction(ISD::SIGN_EXTEND_INREG, (MVT::SimpleValueType)VT, - Expand); - - setOperationAction(ISD::MULHS, (MVT::SimpleValueType)VT, Expand); - setOperationAction(ISD::SMUL_LOHI, (MVT::SimpleValueType)VT, Expand); - setOperationAction(ISD::MULHU, (MVT::SimpleValueType)VT, Expand); - setOperationAction(ISD::UMUL_LOHI, (MVT::SimpleValueType)VT, Expand); - - setOperationAction(ISD::BSWAP, (MVT::SimpleValueType)VT, Expand); - - for (unsigned InnerVT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE; - InnerVT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++InnerVT) - setTruncStoreAction((MVT::SimpleValueType)VT, - (MVT::SimpleValueType)InnerVT, Expand); - setLoadExtAction(ISD::SEXTLOAD, (MVT::SimpleValueType)VT, Expand); - setLoadExtAction(ISD::ZEXTLOAD, (MVT::SimpleValueType)VT, Expand); - setLoadExtAction(ISD::EXTLOAD, (MVT::SimpleValueType)VT, Expand); + for (MVT VT : MVT::vector_valuetypes()) { + setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand); + + setOperationAction(ISD::MULHS, VT, Expand); + setOperationAction(ISD::SMUL_LOHI, VT, Expand); + setOperationAction(ISD::MULHU, VT, Expand); + setOperationAction(ISD::UMUL_LOHI, VT, Expand); + + setOperationAction(ISD::BSWAP, VT, Expand); + + for (MVT InnerVT : MVT::vector_valuetypes()) { + setTruncStoreAction(VT, InnerVT, Expand); + setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand); + setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand); + setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand); + } } // AArch64 has implementations of a lot of rounding-like FP operations. @@ -615,7 +622,8 @@ void AArch64TargetLowering::addTypeForNEON(EVT VT, EVT PromotedBitwiseVT) { setOperationAction(ISD::SELECT, VT.getSimpleVT(), Expand); setOperationAction(ISD::SELECT_CC, VT.getSimpleVT(), Expand); setOperationAction(ISD::VSELECT, VT.getSimpleVT(), Expand); - setLoadExtAction(ISD::EXTLOAD, VT.getSimpleVT(), Expand); + for (MVT InnerVT : MVT::all_valuetypes()) + setLoadExtAction(ISD::EXTLOAD, InnerVT, VT.getSimpleVT(), Expand); // CNT supports only B element sizes. if (VT != MVT::v8i8 && VT != MVT::v16i8) @@ -722,13 +730,6 @@ MVT AArch64TargetLowering::getScalarShiftAmountTy(EVT LHSTy) const { return MVT::i64; } -unsigned AArch64TargetLowering::getMaximalGlobalOffset() const { - // FIXME: On AArch64, this depends on the type. - // Basically, the addressable offsets are up to 4095 * Ty.getSizeInBytes(). - // and the offset has to be a multiple of the related size in bytes. - return 4095; -} - FastISel * AArch64TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo) const { @@ -869,9 +870,8 @@ AArch64TargetLowering::EmitF128CSEL(MachineInstr *MI, // EndBB: // Dest = PHI [IfTrue, TrueBB], [IfFalse, OrigBB] - const TargetInstrInfo *TII = - getTargetMachine().getSubtargetImpl()->getInstrInfo(); MachineFunction *MF = MBB->getParent(); + const TargetInstrInfo *TII = Subtarget->getInstrInfo(); const BasicBlock *LLVM_BB = MBB->getBasicBlock(); DebugLoc DL = MI->getDebugLoc(); MachineFunction::iterator It = MBB; @@ -1330,10 +1330,7 @@ getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) { SDValue AArch64TargetLowering::LowerF128Call(SDValue Op, SelectionDAG &DAG, RTLIB::Libcall Call) const { - SmallVector<SDValue, 2> Ops; - for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) - Ops.push_back(Op.getOperand(i)); - + SmallVector<SDValue, 2> Ops(Op->op_begin(), Op->op_end()); return makeLibCall(DAG, Call, MVT::f128, &Ops[0], Ops.size(), false, SDLoc(Op)).first; } @@ -1561,10 +1558,7 @@ SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op, else LC = RTLIB::getFPTOUINT(Op.getOperand(0).getValueType(), Op.getValueType()); - SmallVector<SDValue, 2> Ops; - for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) - Ops.push_back(Op.getOperand(i)); - + SmallVector<SDValue, 2> Ops(Op->op_begin(), Op->op_end()); return makeLibCall(DAG, LC, Op.getValueType(), &Ops[0], Ops.size(), false, SDLoc(Op)).first; } @@ -1981,6 +1975,8 @@ CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC, llvm_unreachable("Unsupported calling convention."); case CallingConv::WebKit_JS: return CC_AArch64_WebKit_JS; + case CallingConv::GHC: + return CC_AArch64_GHC; case CallingConv::C: case CallingConv::Fast: if (!Subtarget->isTargetDarwin()) @@ -2012,18 +2008,19 @@ SDValue AArch64TargetLowering::LowerFormalArguments( unsigned CurArgIdx = 0; for (unsigned i = 0; i != NumArgs; ++i) { MVT ValVT = Ins[i].VT; - std::advance(CurOrigArg, Ins[i].OrigArgIndex - CurArgIdx); - CurArgIdx = Ins[i].OrigArgIndex; - - // Get type of the original argument. - EVT ActualVT = getValueType(CurOrigArg->getType(), /*AllowUnknown*/ true); - MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : MVT::Other; - // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16. - if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8) - ValVT = MVT::i8; - else if (ActualMVT == MVT::i16) - ValVT = MVT::i16; + if (Ins[i].isOrigArg()) { + std::advance(CurOrigArg, Ins[i].getOrigArgIndex() - CurArgIdx); + CurArgIdx = Ins[i].getOrigArgIndex(); + // Get type of the original argument. + EVT ActualVT = getValueType(CurOrigArg->getType(), /*AllowUnknown*/ true); + MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : MVT::Other; + // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16. + if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8) + ValVT = MVT::i8; + else if (ActualMVT == MVT::i16) + ValVT = MVT::i16; + } CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, /*IsVarArg=*/false); bool Res = AssignFn(i, ValVT, ValVT, CCValAssign::Full, Ins[i].Flags, CCInfo); @@ -2106,7 +2103,8 @@ SDValue AArch64TargetLowering::LowerFormalArguments( unsigned ArgSize = VA.getValVT().getSizeInBits() / 8; uint32_t BEAlign = 0; - if (ArgSize < 8 && !Subtarget->isLittleEndian()) + if (!Subtarget->isLittleEndian() && ArgSize < 8 && + !Ins[i].Flags.isInConsecutiveRegs()) BEAlign = 8 - ArgSize; int FI = MFI->CreateFixedObject(ArgSize, ArgOffset + BEAlign, true); @@ -2198,8 +2196,7 @@ void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo, AArch64::X3, AArch64::X4, AArch64::X5, AArch64::X6, AArch64::X7 }; static const unsigned NumGPRArgRegs = array_lengthof(GPRArgRegs); - unsigned FirstVariadicGPR = - CCInfo.getFirstUnallocated(GPRArgRegs, NumGPRArgRegs); + unsigned FirstVariadicGPR = CCInfo.getFirstUnallocated(GPRArgRegs); unsigned GPRSaveSize = 8 * (NumGPRArgRegs - FirstVariadicGPR); int GPRIdx = 0; @@ -2227,8 +2224,7 @@ void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo, AArch64::Q0, AArch64::Q1, AArch64::Q2, AArch64::Q3, AArch64::Q4, AArch64::Q5, AArch64::Q6, AArch64::Q7}; static const unsigned NumFPRArgRegs = array_lengthof(FPRArgRegs); - unsigned FirstVariadicFPR = - CCInfo.getFirstUnallocated(FPRArgRegs, NumFPRArgRegs); + unsigned FirstVariadicFPR = CCInfo.getFirstUnallocated(FPRArgRegs); unsigned FPRSaveSize = 16 * (NumFPRArgRegs - FirstVariadicFPR); int FPRIdx = 0; @@ -2349,7 +2345,9 @@ bool AArch64TargetLowering::isEligibleForTailCallOptimization( // cannot rely on the linker replacing the tail call with a return. if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) { const GlobalValue *GV = G->getGlobal(); - if (GV->hasExternalWeakLinkage()) + const Triple TT(getTargetMachine().getTargetTriple()); + if (GV->hasExternalWeakLinkage() && + (!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO())) return false; } @@ -2660,7 +2658,8 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, unsigned OpSize = Flags.isByVal() ? Flags.getByValSize() * 8 : VA.getValVT().getSizeInBits(); OpSize = (OpSize + 7) / 8; - if (!Subtarget->isLittleEndian() && !Flags.isByVal()) { + if (!Subtarget->isLittleEndian() && !Flags.isByVal() && + !Flags.isInConsecutiveRegs()) { if (OpSize < 8) BEAlign = 8 - OpSize; } @@ -2782,19 +2781,16 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, // Add a register mask operand representing the call-preserved registers. const uint32_t *Mask; - const TargetRegisterInfo *TRI = - getTargetMachine().getSubtargetImpl()->getRegisterInfo(); - const AArch64RegisterInfo *ARI = - static_cast<const AArch64RegisterInfo *>(TRI); + const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo(); if (IsThisReturn) { // For 'this' returns, use the X0-preserving mask if applicable - Mask = ARI->getThisReturnPreservedMask(CallConv); + Mask = TRI->getThisReturnPreservedMask(CallConv); if (!Mask) { IsThisReturn = false; - Mask = ARI->getCallPreservedMask(CallConv); + Mask = TRI->getCallPreservedMask(CallConv); } } else - Mask = ARI->getCallPreservedMask(CallConv); + Mask = TRI->getCallPreservedMask(CallConv); assert(Mask && "Missing call preserved mask for calling convention"); Ops.push_back(DAG.getRegisterMask(Mask)); @@ -3014,11 +3010,8 @@ AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op, // TLS calls preserve all registers except those that absolutely must be // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be // silly). - const TargetRegisterInfo *TRI = - getTargetMachine().getSubtargetImpl()->getRegisterInfo(); - const AArch64RegisterInfo *ARI = - static_cast<const AArch64RegisterInfo *>(TRI); - const uint32_t *Mask = ARI->getTLSCallPreservedMask(); + const uint32_t *Mask = + Subtarget->getRegisterInfo()->getTLSCallPreservedMask(); // Finally, we can make the call. This is just a degenerate version of a // normal AArch64 call node: x0 takes the address of the descriptor, and @@ -3065,11 +3058,8 @@ SDValue AArch64TargetLowering::LowerELFTLSDescCall(SDValue SymAddr, // TLS calls preserve all registers except those that absolutely must be // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be // silly). - const TargetRegisterInfo *TRI = - getTargetMachine().getSubtargetImpl()->getRegisterInfo(); - const AArch64RegisterInfo *ARI = - static_cast<const AArch64RegisterInfo *>(TRI); - const uint32_t *Mask = ARI->getTLSCallPreservedMask(); + const uint32_t *Mask = + Subtarget->getRegisterInfo()->getTLSCallPreservedMask(); // The function takes only one argument: the address of the descriptor itself // in X0. @@ -3259,8 +3249,8 @@ SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const { OFCC = getInvertedCondCode(OFCC); SDValue CCVal = DAG.getConstant(OFCC, MVT::i32); - return DAG.getNode(AArch64ISD::BRCOND, SDLoc(LHS), MVT::Other, Chain, Dest, - CCVal, Overflow); + return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal, + Overflow); } if (LHS.getValueType().isInteger()) { @@ -3429,8 +3419,8 @@ SDValue AArch64TargetLowering::LowerFCOPYSIGN(SDValue Op, } SDValue AArch64TargetLowering::LowerCTPOP(SDValue Op, SelectionDAG &DAG) const { - if (DAG.getMachineFunction().getFunction()->getAttributes().hasAttribute( - AttributeSet::FunctionIndex, Attribute::NoImplicitFloat)) + if (DAG.getMachineFunction().getFunction()->hasFnAttribute( + Attribute::NoImplicitFloat)) return SDValue(); if (!Subtarget->hasNEON()) @@ -3447,18 +3437,12 @@ SDValue AArch64TargetLowering::LowerCTPOP(SDValue Op, SelectionDAG &DAG) const { SDValue Val = Op.getOperand(0); SDLoc DL(Op); EVT VT = Op.getValueType(); - SDValue ZeroVec = DAG.getUNDEF(MVT::v8i8); - SDValue VecVal; - if (VT == MVT::i32) { - VecVal = DAG.getNode(ISD::BITCAST, DL, MVT::f32, Val); - VecVal = DAG.getTargetInsertSubreg(AArch64::ssub, DL, MVT::v8i8, ZeroVec, - VecVal); - } else { - VecVal = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Val); - } + if (VT == MVT::i32) + Val = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Val); + Val = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Val); - SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v8i8, VecVal); + SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v8i8, Val); SDValue UaddLV = DAG.getNode( ISD::INTRINSIC_WO_CHAIN, DL, MVT::i32, DAG.getConstant(Intrinsic::aarch64_neon_uaddlv, MVT::i32), CtPop); @@ -4279,7 +4263,8 @@ AArch64TargetLowering::getSingleConstraintMatchWeight( std::pair<unsigned, const TargetRegisterClass *> AArch64TargetLowering::getRegForInlineAsmConstraint( - const std::string &Constraint, MVT VT) const { + const TargetRegisterInfo *TRI, const std::string &Constraint, + MVT VT) const { if (Constraint.size() == 1) { switch (Constraint[0]) { case 'r': @@ -4308,7 +4293,7 @@ AArch64TargetLowering::getRegForInlineAsmConstraint( // Use the default implementation in TargetLowering to convert the register // constraint into a member of a register class. std::pair<unsigned, const TargetRegisterClass *> Res; - Res = TargetLowering::getRegForInlineAsmConstraint(Constraint, VT); + Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); // Not found as a standard register? if (!Res.second) { @@ -4615,19 +4600,21 @@ SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op, // The extraction can just take the second half Src.ShuffleVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, - DAG.getIntPtrConstant(NumSrcElts)); + DAG.getConstant(NumSrcElts, MVT::i64)); Src.WindowBase = -NumSrcElts; } else if (Src.MaxElt < NumSrcElts) { // The extraction can just take the first half - Src.ShuffleVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, - Src.ShuffleVec, DAG.getIntPtrConstant(0)); + Src.ShuffleVec = + DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, + DAG.getConstant(0, MVT::i64)); } else { // An actual VEXT is needed - SDValue VEXTSrc1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, - Src.ShuffleVec, DAG.getIntPtrConstant(0)); + SDValue VEXTSrc1 = + DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, + DAG.getConstant(0, MVT::i64)); SDValue VEXTSrc2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, - DAG.getIntPtrConstant(NumSrcElts)); + DAG.getConstant(NumSrcElts, MVT::i64)); unsigned Imm = Src.MinElt * getExtFactor(VEXTSrc1); Src.ShuffleVec = DAG.getNode(AArch64ISD::EXT, dl, DestVT, VEXTSrc1, @@ -6270,6 +6257,8 @@ static SDValue EmitVectorComparison(SDValue LHS, SDValue RHS, AArch64CC::CondCode CC, bool NoNans, EVT VT, SDLoc dl, SelectionDAG &DAG) { EVT SrcVT = LHS.getValueType(); + assert(VT.getSizeInBits() == SrcVT.getSizeInBits() && + "function only supposed to emit natural comparisons"); BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(RHS.getNode()); APInt CnstBits(VT.getSizeInBits(), 0); @@ -6364,13 +6353,15 @@ SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op, ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get(); SDValue LHS = Op.getOperand(0); SDValue RHS = Op.getOperand(1); + EVT CmpVT = LHS.getValueType().changeVectorElementTypeToInteger(); SDLoc dl(Op); if (LHS.getValueType().getVectorElementType().isInteger()) { assert(LHS.getValueType() == RHS.getValueType()); AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC); - return EmitVectorComparison(LHS, RHS, AArch64CC, false, Op.getValueType(), - dl, DAG); + SDValue Cmp = + EmitVectorComparison(LHS, RHS, AArch64CC, false, CmpVT, dl, DAG); + return DAG.getSExtOrTrunc(Cmp, dl, Op.getValueType()); } assert(LHS.getValueType().getVectorElementType() == MVT::f32 || @@ -6384,19 +6375,21 @@ SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op, bool NoNaNs = getTargetMachine().Options.NoNaNsFPMath; SDValue Cmp = - EmitVectorComparison(LHS, RHS, CC1, NoNaNs, Op.getValueType(), dl, DAG); + EmitVectorComparison(LHS, RHS, CC1, NoNaNs, CmpVT, dl, DAG); if (!Cmp.getNode()) return SDValue(); if (CC2 != AArch64CC::AL) { SDValue Cmp2 = - EmitVectorComparison(LHS, RHS, CC2, NoNaNs, Op.getValueType(), dl, DAG); + EmitVectorComparison(LHS, RHS, CC2, NoNaNs, CmpVT, dl, DAG); if (!Cmp2.getNode()) return SDValue(); - Cmp = DAG.getNode(ISD::OR, dl, Cmp.getValueType(), Cmp, Cmp2); + Cmp = DAG.getNode(ISD::OR, dl, CmpVT, Cmp, Cmp2); } + Cmp = DAG.getSExtOrTrunc(Cmp, dl, Op.getValueType()); + if (ShouldInvert) return Cmp = DAG.getNOT(dl, Cmp, Cmp.getValueType()); @@ -6534,6 +6527,34 @@ bool AArch64TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const { return NumBits1 > NumBits2; } +/// Check if it is profitable to hoist instruction in then/else to if. +/// Not profitable if I and it's user can form a FMA instruction +/// because we prefer FMSUB/FMADD. +bool AArch64TargetLowering::isProfitableToHoist(Instruction *I) const { + if (I->getOpcode() != Instruction::FMul) + return true; + + if (I->getNumUses() != 1) + return true; + + Instruction *User = I->user_back(); + + if (User && + !(User->getOpcode() == Instruction::FSub || + User->getOpcode() == Instruction::FAdd)) + return true; + + const TargetOptions &Options = getTargetMachine().Options; + EVT VT = getValueType(User->getOperand(0)->getType()); + + if (isFMAFasterThanFMulAndFAdd(VT) && + isOperationLegalOrCustom(ISD::FMA, VT) && + (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath)) + return false; + + return true; +} + // All 32-bit GPR operations implicitly zero the high-half of the corresponding // 64-bit GPR. bool AArch64TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const { @@ -6604,8 +6625,7 @@ EVT AArch64TargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign, bool Fast; const Function *F = MF.getFunction(); if (Subtarget->hasFPARMv8() && !IsMemset && Size >= 16 && - !F->getAttributes().hasAttribute(AttributeSet::FunctionIndex, - Attribute::NoImplicitFloat) && + !F->hasFnAttribute(Attribute::NoImplicitFloat) && (memOpAlign(SrcAlign, DstAlign, 16) || (allowsMisalignedMemoryAccesses(MVT::f128, 0, 1, &Fast) && Fast))) return MVT::f128; @@ -6948,7 +6968,8 @@ static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N, return SDValue(); } -static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG) { +static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG, + const AArch64Subtarget *Subtarget) { // First try to optimize away the conversion when it's conditionally from // a constant. Vectors only. SDValue Res = performVectorCompareAndMaskUnaryOpCombine(N, DAG); @@ -6967,7 +6988,7 @@ static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG) { // conversion, use a fp load instead and a AdvSIMD scalar {S|U}CVTF instead. // This eliminates an "integer-to-vector-move UOP and improve throughput. SDValue N0 = N->getOperand(0); - if (ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() && + if (Subtarget->hasNEON() && ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() && // Do not change the width of a volatile load. !cast<LoadSDNode>(N0)->isVolatile()) { LoadSDNode *LN0 = cast<LoadSDNode>(N0); @@ -7756,9 +7777,9 @@ static SDValue performExtendCombine(SDNode *N, EVT InNVT = EVT::getVectorVT(*DAG.getContext(), SrcVT.getVectorElementType(), LoVT.getVectorNumElements()); Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InNVT, Src, - DAG.getIntPtrConstant(0)); + DAG.getConstant(0, MVT::i64)); Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InNVT, Src, - DAG.getIntPtrConstant(InNVT.getVectorNumElements())); + DAG.getConstant(InNVT.getVectorNumElements(), MVT::i64)); Lo = DAG.getNode(N->getOpcode(), DL, LoVT, Lo); Hi = DAG.getNode(N->getOpcode(), DL, HiVT, Hi); @@ -7839,14 +7860,13 @@ static SDValue performSTORECombine(SDNode *N, return SDValue(); // Cyclone has bad performance on unaligned 16B stores when crossing line and - // page boundries. We want to split such stores. + // page boundaries. We want to split such stores. if (!Subtarget->isCyclone()) return SDValue(); // Don't split at Oz. MachineFunction &MF = DAG.getMachineFunction(); - bool IsMinSize = MF.getFunction()->getAttributes().hasAttribute( - AttributeSet::FunctionIndex, Attribute::MinSize); + bool IsMinSize = MF.getFunction()->hasFnAttribute(Attribute::MinSize); if (IsMinSize) return SDValue(); @@ -7880,9 +7900,9 @@ static SDValue performSTORECombine(SDNode *N, EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), NumElts); SDValue SubVector0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal, - DAG.getIntPtrConstant(0)); + DAG.getConstant(0, MVT::i64)); SDValue SubVector1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal, - DAG.getIntPtrConstant(NumElts)); + DAG.getConstant(NumElts, MVT::i64)); SDValue BasePtr = S->getBasePtr(); SDValue NewST1 = DAG.getStore(S->getChain(), DL, SubVector0, BasePtr, S->getPointerInfo(), @@ -7973,7 +7993,7 @@ static SDValue performPostLD1Combine(SDNode *N, LoadSDN->getMemOperand()); // Update the uses. - std::vector<SDValue> NewResults; + SmallVector<SDValue, 2> NewResults; NewResults.push_back(SDValue(LD, 0)); // The result of load NewResults.push_back(SDValue(UpdN.getNode(), 2)); // Chain DCI.CombineTo(LD, NewResults); @@ -8478,6 +8498,12 @@ static SDValue performSelectCombine(SDNode *N, SelectionDAG &DAG) { // largest real NEON comparison is 64-bits per lane, which means the result is // at most 32-bits and an illegal vector. Just bail out for now. EVT SrcVT = N0.getOperand(0).getValueType(); + + // Don't try to do this optimization when the setcc itself has i1 operands. + // There are no legal vectors of i1, so this would be pointless. + if (SrcVT == MVT::i1) + return SDValue(); + int NumMaskElts = ResVT.getSizeInBits() / SrcVT.getSizeInBits(); if (!ResVT.isVector() || NumMaskElts == 0) return SDValue(); @@ -8518,7 +8544,7 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, return performMulCombine(N, DAG, DCI, Subtarget); case ISD::SINT_TO_FP: case ISD::UINT_TO_FP: - return performIntToFpCombine(N, DAG); + return performIntToFpCombine(N, DAG, Subtarget); case ISD::OR: return performORCombine(N, DCI, Subtarget); case ISD::INTRINSIC_WO_CHAIN: @@ -8696,13 +8722,12 @@ bool AArch64TargetLowering::getPostIndexedAddressParts( static void ReplaceBITCASTResults(SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) { - if (N->getValueType(0) != MVT::i16) - return; - SDLoc DL(N); SDValue Op = N->getOperand(0); - assert(Op.getValueType() == MVT::f16 && - "Inconsistent bitcast? Only 16-bit types should be i16 or f16"); + + if (N->getValueType(0) != MVT::i16 || Op.getValueType() != MVT::f16) + return; + Op = SDValue( DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, DL, MVT::f32, DAG.getUNDEF(MVT::i32), Op, @@ -8732,6 +8757,12 @@ bool AArch64TargetLowering::useLoadStackGuardNode() const { return true; } +bool AArch64TargetLowering::combineRepeatedFPDivisors(unsigned NumUsers) const { + // Combine multiple FDIVs with the same divisor into multiple FMULs by the + // reciprocal if there are three or more FDIVs. + return NumUsers > 2; +} + TargetLoweringBase::LegalizeTypeAction AArch64TargetLowering::getPreferredVectorAction(EVT VT) const { MVT SVT = VT.getSimpleVT(); @@ -8836,3 +8867,8 @@ Value *AArch64TargetLowering::emitStoreConditional(IRBuilder<> &Builder, Val, Stxr->getFunctionType()->getParamType(0)), Addr); } + +bool AArch64TargetLowering::functionArgumentNeedsConsecutiveRegisters( + Type *Ty, CallingConv::ID CallConv, bool isVarArg) const { + return Ty->isArrayTy(); +} |