Diffstat (limited to 'lib/Target/ARM/ARMISelLowering.cpp')
-rw-r--r-- | lib/Target/ARM/ARMISelLowering.cpp | 711 |
1 file changed, 403 insertions, 308 deletions
diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp index 0d0d81f..56290aa 100644 --- a/lib/Target/ARM/ARMISelLowering.cpp +++ b/lib/Target/ARM/ARMISelLowering.cpp @@ -156,11 +156,11 @@ void ARMTargetLowering::addQRTypeForNEON(MVT VT) { addTypeForNEON(VT, MVT::v2f64, MVT::v4i32); } -ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM) - : TargetLowering(TM) { - Subtarget = &TM.getSubtarget<ARMSubtarget>(); - RegInfo = TM.getSubtargetImpl()->getRegisterInfo(); - Itins = TM.getSubtargetImpl()->getInstrItineraryData(); +ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, + const ARMSubtarget &STI) + : TargetLowering(TM), Subtarget(&STI) { + RegInfo = Subtarget->getRegisterInfo(); + Itins = Subtarget->getInstrItineraryData(); setBooleanVectorContents(ZeroOrNegativeOneBooleanContent); @@ -404,22 +404,20 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM) addRegisterClass(MVT::f64, &ARM::DPRRegClass); } - for (unsigned VT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE; - VT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++VT) { - for (unsigned InnerVT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE; - InnerVT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++InnerVT) - setTruncStoreAction((MVT::SimpleValueType)VT, - (MVT::SimpleValueType)InnerVT, Expand); - setLoadExtAction(ISD::SEXTLOAD, (MVT::SimpleValueType)VT, Expand); - setLoadExtAction(ISD::ZEXTLOAD, (MVT::SimpleValueType)VT, Expand); - setLoadExtAction(ISD::EXTLOAD, (MVT::SimpleValueType)VT, Expand); + for (MVT VT : MVT::vector_valuetypes()) { + for (MVT InnerVT : MVT::vector_valuetypes()) { + setTruncStoreAction(VT, InnerVT, Expand); + setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand); + setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand); + setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand); + } - setOperationAction(ISD::MULHS, (MVT::SimpleValueType)VT, Expand); - setOperationAction(ISD::SMUL_LOHI, (MVT::SimpleValueType)VT, Expand); - setOperationAction(ISD::MULHU, (MVT::SimpleValueType)VT, Expand); - setOperationAction(ISD::UMUL_LOHI, (MVT::SimpleValueType)VT, Expand); + setOperationAction(ISD::MULHS, VT, Expand); + setOperationAction(ISD::SMUL_LOHI, VT, Expand); + setOperationAction(ISD::MULHU, VT, Expand); + setOperationAction(ISD::UMUL_LOHI, VT, Expand); - setOperationAction(ISD::BSWAP, (MVT::SimpleValueType)VT, Expand); + setOperationAction(ISD::BSWAP, VT, Expand); } setOperationAction(ISD::ConstantFP, MVT::f32, Custom); @@ -567,15 +565,18 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM) setTargetDAGCombine(ISD::FP_TO_SINT); setTargetDAGCombine(ISD::FP_TO_UINT); setTargetDAGCombine(ISD::FDIV); + setTargetDAGCombine(ISD::LOAD); // It is legal to extload from v4i8 to v4i16 or v4i32. MVT Tys[6] = {MVT::v8i8, MVT::v4i8, MVT::v2i8, MVT::v4i16, MVT::v2i16, MVT::v2i32}; for (unsigned i = 0; i < 6; ++i) { - setLoadExtAction(ISD::EXTLOAD, Tys[i], Legal); - setLoadExtAction(ISD::ZEXTLOAD, Tys[i], Legal); - setLoadExtAction(ISD::SEXTLOAD, Tys[i], Legal); + for (MVT VT : MVT::integer_vector_valuetypes()) { + setLoadExtAction(ISD::EXTLOAD, VT, Tys[i], Legal); + setLoadExtAction(ISD::ZEXTLOAD, VT, Tys[i], Legal); + setLoadExtAction(ISD::SEXTLOAD, VT, Tys[i], Legal); + } } } @@ -617,11 +618,13 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM) setOperationAction(ISD::FP_EXTEND, MVT::f64, Custom); } - computeRegisterProperties(); + computeRegisterProperties(Subtarget->getRegisterInfo()); // ARM does not have floating-point extending loads. 
- setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand); - setLoadExtAction(ISD::EXTLOAD, MVT::f16, Expand); + for (MVT VT : MVT::fp_valuetypes()) { + setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand); + setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand); + } // ... or truncating stores setTruncStoreAction(MVT::f64, MVT::f32, Expand); @@ -629,7 +632,8 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM) setTruncStoreAction(MVT::f64, MVT::f16, Expand); // ARM does not have i1 sign extending load. - setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote); + for (MVT VT : MVT::integer_valuetypes()) + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote); // ARM supports all 4 flavors of integer indexed load / store. if (!Subtarget->isThumb1Only()) { @@ -963,13 +967,14 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM) // of the difficulty prior to coalescing of modeling operand register classes // due to the common occurrence of cross class copies and subregister insertions // and extractions. -std::pair<const TargetRegisterClass*, uint8_t> -ARMTargetLowering::findRepresentativeClass(MVT VT) const{ +std::pair<const TargetRegisterClass *, uint8_t> +ARMTargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI, + MVT VT) const { const TargetRegisterClass *RRC = nullptr; uint8_t Cost = 1; switch (VT.SimpleTy) { default: - return TargetLowering::findRepresentativeClass(VT); + return TargetLowering::findRepresentativeClass(TRI, VT); // Use DPR as representative register class for all floating point // and vector types. Since there are 32 SPR registers and 32 DPR registers so // the cost is 1 for both f32 and f64. @@ -1166,12 +1171,6 @@ ARMTargetLowering::createFastISel(FunctionLoweringInfo &funcInfo, return ARM::createFastISel(funcInfo, libInfo); } -/// getMaximalGlobalOffset - Returns the maximal possible offset which can -/// be used for loads / stores from the global. -unsigned ARMTargetLowering::getMaximalGlobalOffset() const { - return (Subtarget->isThumb1Only() ? 127 : 4095); -} - Sched::Preference ARMTargetLowering::getSchedulingPreference(SDNode *N) const { unsigned NumVals = N->getNumValues(); if (!NumVals) @@ -1190,8 +1189,7 @@ Sched::Preference ARMTargetLowering::getSchedulingPreference(SDNode *N) const { // Load are scheduled for latency even if there instruction itinerary // is not available. - const TargetInstrInfo *TII = - getTargetMachine().getSubtargetImpl()->getInstrInfo(); + const TargetInstrInfo *TII = Subtarget->getInstrInfo(); const MCInstrDesc &MCID = TII->get(N->getMachineOpcode()); if (MCID.getNumDefs() == 0) @@ -1783,8 +1781,7 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // FIXME: handle tail calls differently. unsigned CallOpc; - bool HasMinSizeAttr = MF.getFunction()->getAttributes().hasAttribute( - AttributeSet::FunctionIndex, Attribute::MinSize); + bool HasMinSizeAttr = MF.getFunction()->hasFnAttribute(Attribute::MinSize); if (Subtarget->isThumb()) { if ((!isDirect || isARMFunc) && !Subtarget->hasV5TOps()) CallOpc = ARMISD::CALL_NOLINK; @@ -1815,9 +1812,7 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // Add a register mask operand representing the call-preserved registers. 
if (!isTailCall) { const uint32_t *Mask; - const TargetRegisterInfo *TRI = - getTargetMachine().getSubtargetImpl()->getRegisterInfo(); - const ARMBaseRegisterInfo *ARI = static_cast<const ARMBaseRegisterInfo*>(TRI); + const ARMBaseRegisterInfo *ARI = Subtarget->getRegisterInfo(); if (isThisReturn) { // For 'this' returns, use the R0-preserving mask if applicable Mask = ARI->getThisReturnPreservedMask(CallConv); @@ -1865,7 +1860,7 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, void ARMTargetLowering::HandleByVal( CCState *State, unsigned &size, unsigned Align) const { - unsigned reg = State->AllocateReg(GPRArgRegs, 4); + unsigned reg = State->AllocateReg(GPRArgRegs); assert((State->getCallOrPrologue() == Prologue || State->getCallOrPrologue() == Call) && "unhandled ParmContext"); @@ -1875,7 +1870,7 @@ ARMTargetLowering::HandleByVal( unsigned AlignInRegs = Align / 4; unsigned Waste = (ARM::R4 - reg) % AlignInRegs; for (unsigned i = 0; i < Waste; ++i) - reg = State->AllocateReg(GPRArgRegs, 4); + reg = State->AllocateReg(GPRArgRegs); } if (reg != 0) { unsigned excess = 4 * (ARM::R4 - reg); @@ -1886,7 +1881,7 @@ ARMTargetLowering::HandleByVal( // remained registers. const unsigned NSAAOffset = State->getNextStackOffset(); if (Subtarget->isAAPCS_ABI() && NSAAOffset != 0 && size > excess) { - while (State->AllocateReg(GPRArgRegs, 4)) + while (State->AllocateReg(GPRArgRegs)) ; return; } @@ -1903,7 +1898,7 @@ ARMTargetLowering::HandleByVal( // Note, first register is allocated in the beginning of function already, // allocate remained amount of registers we need. for (unsigned i = reg+1; i != ByValRegEnd; ++i) - State->AllocateReg(GPRArgRegs, 4); + State->AllocateReg(GPRArgRegs); // A byval parameter that is split between registers and memory needs its // size truncated here. // In the case where the entire structure fits in registers, we set the @@ -2025,7 +2020,9 @@ ARMTargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, // cannot rely on the linker replacing the tail call with a return. if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) { const GlobalValue *GV = G->getGlobal(); - if (GV->hasExternalWeakLinkage()) + const Triple TT(getTargetMachine().getTargetTriple()); + if (GV->hasExternalWeakLinkage() && + (!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO())) return false; } @@ -2084,8 +2081,7 @@ ARMTargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, // the caller's fixed stack objects. MachineFrameInfo *MFI = MF.getFrameInfo(); const MachineRegisterInfo *MRI = &MF.getRegInfo(); - const TargetInstrInfo *TII = - getTargetMachine().getSubtargetImpl()->getInstrInfo(); + const TargetInstrInfo *TII = Subtarget->getInstrInfo(); for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); i != e; ++i, ++realArgIdx) { @@ -2837,16 +2833,11 @@ ARMTargetLowering::computeRegArea(CCState &CCInfo, MachineFunction &MF, NumGPRs = REnd - RBegin; } else { unsigned int firstUnalloced; - firstUnalloced = CCInfo.getFirstUnallocated(GPRArgRegs, - sizeof(GPRArgRegs) / - sizeof(GPRArgRegs[0])); + firstUnalloced = CCInfo.getFirstUnallocated(GPRArgRegs); NumGPRs = (firstUnalloced <= 3) ? (4 - firstUnalloced) : 0; } - unsigned Align = MF.getTarget() - .getSubtargetImpl() - ->getFrameLowering() - ->getStackAlignment(); + unsigned Align = Subtarget->getFrameLowering()->getStackAlignment(); ArgRegsSize = NumGPRs * 4; // If parameter is split between stack and GPRs... 
@@ -2913,8 +2904,7 @@ ARMTargetLowering::StoreByValRegs(CCState &CCInfo, SelectionDAG &DAG, firstRegToSaveIndex = RBegin - ARM::R0; lastRegToSaveIndex = REnd - ARM::R0; } else { - firstRegToSaveIndex = CCInfo.getFirstUnallocated - (GPRArgRegs, array_lengthof(GPRArgRegs)); + firstRegToSaveIndex = CCInfo.getFirstUnallocated(GPRArgRegs); lastRegToSaveIndex = 4; } @@ -3087,8 +3077,11 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain, for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { CCValAssign &VA = ArgLocs[i]; - std::advance(CurOrigArg, Ins[VA.getValNo()].OrigArgIndex - CurArgIdx); - CurArgIdx = Ins[VA.getValNo()].OrigArgIndex; + if (Ins[VA.getValNo()].isOrigArg()) { + std::advance(CurOrigArg, + Ins[VA.getValNo()].getOrigArgIndex() - CurArgIdx); + CurArgIdx = Ins[VA.getValNo()].getOrigArgIndex(); + } // Arguments stored in registers. if (VA.isRegLoc()) { EVT RegVT = VA.getLocVT(); @@ -3129,9 +3122,8 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain, else if (RegVT == MVT::v2f64) RC = &ARM::QPRRegClass; else if (RegVT == MVT::i32) - RC = AFI->isThumb1OnlyFunction() ? - (const TargetRegisterClass*)&ARM::tGPRRegClass : - (const TargetRegisterClass*)&ARM::GPRRegClass; + RC = AFI->isThumb1OnlyFunction() ? &ARM::tGPRRegClass + : &ARM::GPRRegClass; else llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering"); @@ -3169,7 +3161,7 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain, assert(VA.isMemLoc()); assert(VA.getValVT() != MVT::i64 && "i64 should already be lowered"); - int index = ArgLocs[i].getValNo(); + int index = VA.getValNo(); // Some Ins[] entries become multiple ArgLoc[] entries. // Process them only once. @@ -3182,6 +3174,8 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain, // Since they could be overwritten by lowering of arguments in case of // a tail call. if (Flags.isByVal()) { + assert(Ins[index].isOrigArg() && + "Byval arguments cannot be implicit"); unsigned CurByValIndex = CCInfo.getInRegsParamsProcessed(); ByValStoreOffset = RoundUpToAlignment(ByValStoreOffset, Flags.getByValAlign()); @@ -3596,8 +3590,8 @@ SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { // inverting the compare condition, swapping 'less' and 'greater') and // sometimes need to swap the operands to the VSEL (which inverts the // condition in the sense of firing whenever the previous condition didn't) - if (getSubtarget()->hasFPARMv8() && (TrueVal.getValueType() == MVT::f32 || - TrueVal.getValueType() == MVT::f64)) { + if (Subtarget->hasFPARMv8() && (TrueVal.getValueType() == MVT::f32 || + TrueVal.getValueType() == MVT::f64)) { ARMCC::CondCodes CondCode = IntCCToARMCC(CC); if (CondCode == ARMCC::LT || CondCode == ARMCC::LE || CondCode == ARMCC::VC || CondCode == ARMCC::NE) { @@ -3616,8 +3610,8 @@ SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { FPCCToARMCC(CC, CondCode, CondCode2); // Try to generate VSEL on ARMv8. 
- if (getSubtarget()->hasFPARMv8() && (TrueVal.getValueType() == MVT::f32 || - TrueVal.getValueType() == MVT::f64)) { + if (Subtarget->hasFPARMv8() && (TrueVal.getValueType() == MVT::f32 || + TrueVal.getValueType() == MVT::f64)) { // We can select VMAXNM/VMINNM from a compare followed by a select with the // same operands, as follows: // c = fcmp [ogt, olt, ugt, ult] a, b @@ -4483,6 +4477,7 @@ static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) { SDValue Op0 = Op.getOperand(0); SDValue Op1 = Op.getOperand(1); SDValue CC = Op.getOperand(2); + EVT CmpVT = Op0.getValueType().changeVectorElementTypeToInteger(); EVT VT = Op.getValueType(); ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get(); SDLoc dl(Op); @@ -4512,8 +4507,8 @@ static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) { TmpOp0 = Op0; TmpOp1 = Op1; Opc = ISD::OR; - Op0 = DAG.getNode(ARMISD::VCGT, dl, VT, TmpOp1, TmpOp0); - Op1 = DAG.getNode(ARMISD::VCGT, dl, VT, TmpOp0, TmpOp1); + Op0 = DAG.getNode(ARMISD::VCGT, dl, CmpVT, TmpOp1, TmpOp0); + Op1 = DAG.getNode(ARMISD::VCGT, dl, CmpVT, TmpOp0, TmpOp1); break; case ISD::SETUO: Invert = true; // Fallthrough case ISD::SETO: @@ -4521,8 +4516,8 @@ static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) { TmpOp0 = Op0; TmpOp1 = Op1; Opc = ISD::OR; - Op0 = DAG.getNode(ARMISD::VCGT, dl, VT, TmpOp1, TmpOp0); - Op1 = DAG.getNode(ARMISD::VCGE, dl, VT, TmpOp0, TmpOp1); + Op0 = DAG.getNode(ARMISD::VCGT, dl, CmpVT, TmpOp1, TmpOp0); + Op1 = DAG.getNode(ARMISD::VCGE, dl, CmpVT, TmpOp0, TmpOp1); break; } } else { @@ -4556,8 +4551,8 @@ static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) { if (AndOp.getNode() && AndOp.getOpcode() == ISD::AND) { Opc = ARMISD::VTST; - Op0 = DAG.getNode(ISD::BITCAST, dl, VT, AndOp.getOperand(0)); - Op1 = DAG.getNode(ISD::BITCAST, dl, VT, AndOp.getOperand(1)); + Op0 = DAG.getNode(ISD::BITCAST, dl, CmpVT, AndOp.getOperand(0)); + Op1 = DAG.getNode(ISD::BITCAST, dl, CmpVT, AndOp.getOperand(1)); Invert = !Invert; } } @@ -4583,22 +4578,24 @@ static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) { if (SingleOp.getNode()) { switch (Opc) { case ARMISD::VCEQ: - Result = DAG.getNode(ARMISD::VCEQZ, dl, VT, SingleOp); break; + Result = DAG.getNode(ARMISD::VCEQZ, dl, CmpVT, SingleOp); break; case ARMISD::VCGE: - Result = DAG.getNode(ARMISD::VCGEZ, dl, VT, SingleOp); break; + Result = DAG.getNode(ARMISD::VCGEZ, dl, CmpVT, SingleOp); break; case ARMISD::VCLEZ: - Result = DAG.getNode(ARMISD::VCLEZ, dl, VT, SingleOp); break; + Result = DAG.getNode(ARMISD::VCLEZ, dl, CmpVT, SingleOp); break; case ARMISD::VCGT: - Result = DAG.getNode(ARMISD::VCGTZ, dl, VT, SingleOp); break; + Result = DAG.getNode(ARMISD::VCGTZ, dl, CmpVT, SingleOp); break; case ARMISD::VCLTZ: - Result = DAG.getNode(ARMISD::VCLTZ, dl, VT, SingleOp); break; + Result = DAG.getNode(ARMISD::VCLTZ, dl, CmpVT, SingleOp); break; default: - Result = DAG.getNode(Opc, dl, VT, Op0, Op1); + Result = DAG.getNode(Opc, dl, CmpVT, Op0, Op1); } } else { - Result = DAG.getNode(Opc, dl, VT, Op0, Op1); + Result = DAG.getNode(Opc, dl, CmpVT, Op0, Op1); } + Result = DAG.getSExtOrTrunc(Result, dl, VT); + if (Invert) Result = DAG.getNOT(dl, Result, VT); @@ -6497,8 +6494,7 @@ void ARMTargetLowering::ReplaceNodeResults(SDNode *N, void ARMTargetLowering:: SetupEntryBlockForSjLj(MachineInstr *MI, MachineBasicBlock *MBB, MachineBasicBlock *DispatchBB, int FI) const { - const TargetInstrInfo *TII = - getTargetMachine().getSubtargetImpl()->getInstrInfo(); + const TargetInstrInfo *TII = Subtarget->getInstrInfo(); DebugLoc dl = 
MI->getDebugLoc(); MachineFunction *MF = MBB->getParent(); MachineRegisterInfo *MRI = &MF->getRegInfo(); @@ -6515,9 +6511,8 @@ SetupEntryBlockForSjLj(MachineInstr *MI, MachineBasicBlock *MBB, ARMConstantPoolMBB::Create(F->getContext(), DispatchBB, PCLabelId, PCAdj); unsigned CPI = MCP->getConstantPoolIndex(CPV, 4); - const TargetRegisterClass *TRC = isThumb ? - (const TargetRegisterClass*)&ARM::tGPRRegClass : - (const TargetRegisterClass*)&ARM::GPRRegClass; + const TargetRegisterClass *TRC = isThumb ? &ARM::tGPRRegClass + : &ARM::GPRRegClass; // Grab constant pool and fixed stack memory operands. MachineMemOperand *CPMMO = @@ -6613,8 +6608,7 @@ SetupEntryBlockForSjLj(MachineInstr *MI, MachineBasicBlock *MBB, MachineBasicBlock *ARMTargetLowering:: EmitSjLjDispatchBlock(MachineInstr *MI, MachineBasicBlock *MBB) const { - const TargetInstrInfo *TII = - getTargetMachine().getSubtargetImpl()->getInstrInfo(); + const TargetInstrInfo *TII = Subtarget->getInstrInfo(); DebugLoc dl = MI->getDebugLoc(); MachineFunction *MF = MBB->getParent(); MachineRegisterInfo *MRI = &MF->getRegInfo(); @@ -6622,9 +6616,8 @@ EmitSjLjDispatchBlock(MachineInstr *MI, MachineBasicBlock *MBB) const { MachineFrameInfo *MFI = MF->getFrameInfo(); int FI = MFI->getFunctionContextIndex(); - const TargetRegisterClass *TRC = Subtarget->isThumb() ? - (const TargetRegisterClass*)&ARM::tGPRRegClass : - (const TargetRegisterClass*)&ARM::GPRnopcRegClass; + const TargetRegisterClass *TRC = Subtarget->isThumb() ? &ARM::tGPRRegClass + : &ARM::GPRnopcRegClass; // Get a mapping of the call site numbers to all of the landing pads they're // associated with. @@ -7129,8 +7122,7 @@ ARMTargetLowering::EmitStructByval(MachineInstr *MI, // This pseudo instruction has 3 operands: dst, src, size // We expand it to a loop if size > Subtarget->getMaxInlineSizeThreshold(). // Otherwise, we will generate unrolled scalar copies. - const TargetInstrInfo *TII = - getTargetMachine().getSubtargetImpl()->getInstrInfo(); + const TargetInstrInfo *TII = Subtarget->getInstrInfo(); const BasicBlock *LLVM_BB = BB->getBasicBlock(); MachineFunction::iterator It = BB; ++It; @@ -7156,9 +7148,7 @@ ARMTargetLowering::EmitStructByval(MachineInstr *MI, UnitSize = 2; } else { // Check whether we can use NEON instructions. - if (!MF->getFunction()->getAttributes(). - hasAttribute(AttributeSet::FunctionIndex, - Attribute::NoImplicitFloat) && + if (!MF->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat) && Subtarget->hasNEON()) { if ((Align % 16 == 0) && SizeVal >= 16) UnitSize = 16; @@ -7172,14 +7162,11 @@ ARMTargetLowering::EmitStructByval(MachineInstr *MI, // Select the correct opcode and register class for unit size load/store bool IsNeon = UnitSize >= 8; - TRC = (IsThumb1 || IsThumb2) ? (const TargetRegisterClass *)&ARM::tGPRRegClass - : (const TargetRegisterClass *)&ARM::GPRRegClass; + TRC = (IsThumb1 || IsThumb2) ? &ARM::tGPRRegClass : &ARM::GPRRegClass; if (IsNeon) - VecTRC = UnitSize == 16 - ? (const TargetRegisterClass *)&ARM::DPairRegClass - : UnitSize == 8 - ? (const TargetRegisterClass *)&ARM::DPRRegClass - : nullptr; + VecTRC = UnitSize == 16 ? &ARM::DPairRegClass + : UnitSize == 8 ? 
&ARM::DPRRegClass + : nullptr; unsigned BytesLeft = SizeVal % UnitSize; unsigned LoopSize = SizeVal - BytesLeft; @@ -7364,7 +7351,7 @@ MachineBasicBlock * ARMTargetLowering::EmitLowered__chkstk(MachineInstr *MI, MachineBasicBlock *MBB) const { const TargetMachine &TM = getTargetMachine(); - const TargetInstrInfo &TII = *TM.getSubtargetImpl()->getInstrInfo(); + const TargetInstrInfo &TII = *Subtarget->getInstrInfo(); DebugLoc DL = MI->getDebugLoc(); assert(Subtarget->isTargetWindows() && @@ -7429,8 +7416,7 @@ ARMTargetLowering::EmitLowered__chkstk(MachineInstr *MI, MachineBasicBlock * ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, MachineBasicBlock *BB) const { - const TargetInstrInfo *TII = - getTargetMachine().getSubtargetImpl()->getInstrInfo(); + const TargetInstrInfo *TII = Subtarget->getInstrInfo(); DebugLoc dl = MI->getDebugLoc(); bool isThumb2 = Subtarget->isThumb2(); switch (MI->getOpcode()) { @@ -7627,9 +7613,8 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, MachineRegisterInfo &MRI = Fn->getRegInfo(); // In Thumb mode S must not be specified if source register is the SP or // PC and if destination register is the SP, so restrict register class - unsigned NewRsbDstReg = MRI.createVirtualRegister(isThumb2 ? - (const TargetRegisterClass*)&ARM::rGPRRegClass : - (const TargetRegisterClass*)&ARM::GPRRegClass); + unsigned NewRsbDstReg = + MRI.createVirtualRegister(isThumb2 ? &ARM::rGPRRegClass : &ARM::GPRRegClass); // Transfer the remainder of BB and its successor edges to sinkMBB. SinkBB->splice(SinkBB->begin(), BB, @@ -7694,8 +7679,7 @@ void ARMTargetLowering::AdjustInstrPostInstrSelection(MachineInstr *MI, // Rename pseudo opcodes. unsigned NewOpc = convertAddSubFlagsOpcode(MI->getOpcode()); if (NewOpc) { - const ARMBaseInstrInfo *TII = static_cast<const ARMBaseInstrInfo *>( - getTargetMachine().getSubtargetImpl()->getInstrInfo()); + const ARMBaseInstrInfo *TII = Subtarget->getInstrInfo(); MCID = &TII->get(NewOpc); assert(MCID->getNumOperands() == MI->getDesc().getNumOperands() + 1 && @@ -8059,29 +8043,35 @@ static SDValue AddCombineTo64bitMLAL(SDNode *AddcNode, else IsLeftOperandMUL = true; if (MULOp == SDValue()) - return SDValue(); + return SDValue(); // Figure out the right opcode. unsigned Opc = MULOp->getOpcode(); unsigned FinalOpc = (Opc == ISD::SMUL_LOHI) ? ARMISD::SMLAL : ARMISD::UMLAL; // Figure out the high and low input values to the MLAL node. - SDValue* HiMul = &MULOp; SDValue* HiAdd = nullptr; SDValue* LoMul = nullptr; SDValue* LowAdd = nullptr; + // Ensure that ADDE is from high result of ISD::SMUL_LOHI. + if ((AddeOp0 != MULOp.getValue(1)) && (AddeOp1 != MULOp.getValue(1))) + return SDValue(); + if (IsLeftOperandMUL) HiAdd = &AddeOp1; else HiAdd = &AddeOp0; - if (AddcOp0->getOpcode() == Opc) { + // Ensure that LoMul and LowAdd are taken from correct ISD::SMUL_LOHI node + // whose low result is fed to the ADDC we are checking. + + if (AddcOp0 == MULOp.getValue(0)) { LoMul = &AddcOp0; LowAdd = &AddcOp1; } - if (AddcOp1->getOpcode() == Opc) { + if (AddcOp1 == MULOp.getValue(0)) { LoMul = &AddcOp1; LowAdd = &AddcOp0; } @@ -8089,9 +8079,6 @@ static SDValue AddCombineTo64bitMLAL(SDNode *AddcNode, if (!LoMul) return SDValue(); - if (LoMul->getNode() != HiMul->getNode()) - return SDValue(); - // Create the merged node. 
SelectionDAG &DAG = DCI.DAG; @@ -8583,7 +8570,10 @@ static SDValue PerformBFICombine(SDNode *N, unsigned InvMask = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue(); unsigned LSB = countTrailingZeros(~InvMask); unsigned Width = (32 - countLeadingZeros(~InvMask)) - LSB; - unsigned Mask = (1 << Width)-1; + assert(Width < + static_cast<unsigned>(std::numeric_limits<unsigned>::digits) && + "undefined behavior"); + unsigned Mask = (1u << Width) - 1; unsigned Mask2 = N11C->getZExtValue(); if ((Mask & (~Mask2)) == 0) return DCI.DAG.getNode(ARMISD::BFI, SDLoc(N), N->getValueType(0), @@ -8655,147 +8645,6 @@ static SDValue PerformVMOVDRRCombine(SDNode *N, SelectionDAG &DAG) { return SDValue(); } -/// PerformSTORECombine - Target-specific dag combine xforms for -/// ISD::STORE. -static SDValue PerformSTORECombine(SDNode *N, - TargetLowering::DAGCombinerInfo &DCI) { - StoreSDNode *St = cast<StoreSDNode>(N); - if (St->isVolatile()) - return SDValue(); - - // Optimize trunc store (of multiple scalars) to shuffle and store. First, - // pack all of the elements in one place. Next, store to memory in fewer - // chunks. - SDValue StVal = St->getValue(); - EVT VT = StVal.getValueType(); - if (St->isTruncatingStore() && VT.isVector()) { - SelectionDAG &DAG = DCI.DAG; - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - EVT StVT = St->getMemoryVT(); - unsigned NumElems = VT.getVectorNumElements(); - assert(StVT != VT && "Cannot truncate to the same type"); - unsigned FromEltSz = VT.getVectorElementType().getSizeInBits(); - unsigned ToEltSz = StVT.getVectorElementType().getSizeInBits(); - - // From, To sizes and ElemCount must be pow of two - if (!isPowerOf2_32(NumElems * FromEltSz * ToEltSz)) return SDValue(); - - // We are going to use the original vector elt for storing. - // Accumulated smaller vector elements must be a multiple of the store size. - if (0 != (NumElems * FromEltSz) % ToEltSz) return SDValue(); - - unsigned SizeRatio = FromEltSz / ToEltSz; - assert(SizeRatio * NumElems * ToEltSz == VT.getSizeInBits()); - - // Create a type on which we perform the shuffle. - EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), StVT.getScalarType(), - NumElems*SizeRatio); - assert(WideVecVT.getSizeInBits() == VT.getSizeInBits()); - - SDLoc DL(St); - SDValue WideVec = DAG.getNode(ISD::BITCAST, DL, WideVecVT, StVal); - SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1); - for (unsigned i = 0; i < NumElems; ++i) - ShuffleVec[i] = TLI.isBigEndian() ? (i+1) * SizeRatio - 1 : i * SizeRatio; - - // Can't shuffle using an illegal type. - if (!TLI.isTypeLegal(WideVecVT)) return SDValue(); - - SDValue Shuff = DAG.getVectorShuffle(WideVecVT, DL, WideVec, - DAG.getUNDEF(WideVec.getValueType()), - ShuffleVec.data()); - // At this point all of the data is stored at the bottom of the - // register. We now need to save it to mem. - - // Find the largest store unit - MVT StoreType = MVT::i8; - for (unsigned tp = MVT::FIRST_INTEGER_VALUETYPE; - tp < MVT::LAST_INTEGER_VALUETYPE; ++tp) { - MVT Tp = (MVT::SimpleValueType)tp; - if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToEltSz) - StoreType = Tp; - } - // Didn't find a legal store type. 
- if (!TLI.isTypeLegal(StoreType)) - return SDValue(); - - // Bitcast the original vector into a vector of store-size units - EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(), - StoreType, VT.getSizeInBits()/EVT(StoreType).getSizeInBits()); - assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits()); - SDValue ShuffWide = DAG.getNode(ISD::BITCAST, DL, StoreVecVT, Shuff); - SmallVector<SDValue, 8> Chains; - SDValue Increment = DAG.getConstant(StoreType.getSizeInBits()/8, - TLI.getPointerTy()); - SDValue BasePtr = St->getBasePtr(); - - // Perform one or more big stores into memory. - unsigned E = (ToEltSz*NumElems)/StoreType.getSizeInBits(); - for (unsigned I = 0; I < E; I++) { - SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, - StoreType, ShuffWide, - DAG.getIntPtrConstant(I)); - SDValue Ch = DAG.getStore(St->getChain(), DL, SubVec, BasePtr, - St->getPointerInfo(), St->isVolatile(), - St->isNonTemporal(), St->getAlignment()); - BasePtr = DAG.getNode(ISD::ADD, DL, BasePtr.getValueType(), BasePtr, - Increment); - Chains.push_back(Ch); - } - return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains); - } - - if (!ISD::isNormalStore(St)) - return SDValue(); - - // Split a store of a VMOVDRR into two integer stores to avoid mixing NEON and - // ARM stores of arguments in the same cache line. - if (StVal.getNode()->getOpcode() == ARMISD::VMOVDRR && - StVal.getNode()->hasOneUse()) { - SelectionDAG &DAG = DCI.DAG; - bool isBigEndian = DAG.getTargetLoweringInfo().isBigEndian(); - SDLoc DL(St); - SDValue BasePtr = St->getBasePtr(); - SDValue NewST1 = DAG.getStore(St->getChain(), DL, - StVal.getNode()->getOperand(isBigEndian ? 1 : 0 ), - BasePtr, St->getPointerInfo(), St->isVolatile(), - St->isNonTemporal(), St->getAlignment()); - - SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr, - DAG.getConstant(4, MVT::i32)); - return DAG.getStore(NewST1.getValue(0), DL, - StVal.getNode()->getOperand(isBigEndian ? 0 : 1), - OffsetPtr, St->getPointerInfo(), St->isVolatile(), - St->isNonTemporal(), - std::min(4U, St->getAlignment() / 2)); - } - - if (StVal.getValueType() != MVT::i64 || - StVal.getNode()->getOpcode() != ISD::EXTRACT_VECTOR_ELT) - return SDValue(); - - // Bitcast an i64 store extracted from a vector to f64. - // Otherwise, the i64 value will be legalized to a pair of i32 values. - SelectionDAG &DAG = DCI.DAG; - SDLoc dl(StVal); - SDValue IntVec = StVal.getOperand(0); - EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, - IntVec.getValueType().getVectorNumElements()); - SDValue Vec = DAG.getNode(ISD::BITCAST, dl, FloatVT, IntVec); - SDValue ExtElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, - Vec, StVal.getOperand(1)); - dl = SDLoc(N); - SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ExtElt); - // Make the DAGCombiner fold the bitcasts. - DCI.AddToWorklist(Vec.getNode()); - DCI.AddToWorklist(ExtElt.getNode()); - DCI.AddToWorklist(V.getNode()); - return DAG.getStore(St->getChain(), dl, V, St->getBasePtr(), - St->getPointerInfo(), St->isVolatile(), - St->isNonTemporal(), St->getAlignment(), - St->getAAInfo()); -} - /// hasNormalLoadOperand - Check if any of the operands of a BUILD_VECTOR node /// are normal, non-volatile loads. 
If so, it is profitable to bitcast an /// i64 vector to have f64 elements, since the value can then be loaded @@ -9016,18 +8865,20 @@ static SDValue PerformVECTOR_SHUFFLECombine(SDNode *N, SelectionDAG &DAG) { DAG.getUNDEF(VT), NewMask.data()); } -/// CombineBaseUpdate - Target-specific DAG combine function for VLDDUP and -/// NEON load/store intrinsics to merge base address updates. +/// CombineBaseUpdate - Target-specific DAG combine function for VLDDUP, +/// NEON load/store intrinsics, and generic vector load/stores, to merge +/// base address updates. +/// For generic load/stores, the memory type is assumed to be a vector. +/// The caller is assumed to have checked legality. static SDValue CombineBaseUpdate(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { - if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) - return SDValue(); - SelectionDAG &DAG = DCI.DAG; - bool isIntrinsic = (N->getOpcode() == ISD::INTRINSIC_VOID || - N->getOpcode() == ISD::INTRINSIC_W_CHAIN); - unsigned AddrOpIdx = (isIntrinsic ? 2 : 1); + const bool isIntrinsic = (N->getOpcode() == ISD::INTRINSIC_VOID || + N->getOpcode() == ISD::INTRINSIC_W_CHAIN); + const bool isStore = N->getOpcode() == ISD::STORE; + const unsigned AddrOpIdx = ((isIntrinsic || isStore) ? 2 : 1); SDValue Addr = N->getOperand(AddrOpIdx); + MemSDNode *MemN = cast<MemSDNode>(N); // Search for a use of the address operand that is an increment. for (SDNode::use_iterator UI = Addr.getNode()->use_begin(), @@ -9043,7 +8894,7 @@ static SDValue CombineBaseUpdate(SDNode *N, continue; // Find the new opcode for the updating load/store. - bool isLoad = true; + bool isLoadOp = true; bool isLaneOp = false; unsigned NewOpc = 0; unsigned NumVecs = 0; @@ -9066,19 +8917,19 @@ static SDValue CombineBaseUpdate(SDNode *N, case Intrinsic::arm_neon_vld4lane: NewOpc = ARMISD::VLD4LN_UPD; NumVecs = 4; isLaneOp = true; break; case Intrinsic::arm_neon_vst1: NewOpc = ARMISD::VST1_UPD; - NumVecs = 1; isLoad = false; break; + NumVecs = 1; isLoadOp = false; break; case Intrinsic::arm_neon_vst2: NewOpc = ARMISD::VST2_UPD; - NumVecs = 2; isLoad = false; break; + NumVecs = 2; isLoadOp = false; break; case Intrinsic::arm_neon_vst3: NewOpc = ARMISD::VST3_UPD; - NumVecs = 3; isLoad = false; break; + NumVecs = 3; isLoadOp = false; break; case Intrinsic::arm_neon_vst4: NewOpc = ARMISD::VST4_UPD; - NumVecs = 4; isLoad = false; break; + NumVecs = 4; isLoadOp = false; break; case Intrinsic::arm_neon_vst2lane: NewOpc = ARMISD::VST2LN_UPD; - NumVecs = 2; isLoad = false; isLaneOp = true; break; + NumVecs = 2; isLoadOp = false; isLaneOp = true; break; case Intrinsic::arm_neon_vst3lane: NewOpc = ARMISD::VST3LN_UPD; - NumVecs = 3; isLoad = false; isLaneOp = true; break; + NumVecs = 3; isLoadOp = false; isLaneOp = true; break; case Intrinsic::arm_neon_vst4lane: NewOpc = ARMISD::VST4LN_UPD; - NumVecs = 4; isLoad = false; isLaneOp = true; break; + NumVecs = 4; isLoadOp = false; isLaneOp = true; break; } } else { isLaneOp = true; @@ -9087,15 +8938,24 @@ static SDValue CombineBaseUpdate(SDNode *N, case ARMISD::VLD2DUP: NewOpc = ARMISD::VLD2DUP_UPD; NumVecs = 2; break; case ARMISD::VLD3DUP: NewOpc = ARMISD::VLD3DUP_UPD; NumVecs = 3; break; case ARMISD::VLD4DUP: NewOpc = ARMISD::VLD4DUP_UPD; NumVecs = 4; break; + case ISD::LOAD: NewOpc = ARMISD::VLD1_UPD; + NumVecs = 1; isLaneOp = false; break; + case ISD::STORE: NewOpc = ARMISD::VST1_UPD; + NumVecs = 1; isLaneOp = false; isLoadOp = false; break; } } // Find the size of memory referenced by the load/store. 
EVT VecTy; - if (isLoad) + if (isLoadOp) { VecTy = N->getValueType(0); - else + } else if (isIntrinsic) { VecTy = N->getOperand(AddrOpIdx+1).getValueType(); + } else { + assert(isStore && "Node has to be a load, a store, or an intrinsic!"); + VecTy = N->getOperand(1).getValueType(); + } + unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8; if (isLaneOp) NumBytes /= VecTy.getVectorNumElements(); @@ -9112,32 +8972,99 @@ static SDValue CombineBaseUpdate(SDNode *N, continue; } + // OK, we found an ADD we can fold into the base update. + // Now, create a _UPD node, taking care of not breaking alignment. + + EVT AlignedVecTy = VecTy; + unsigned Alignment = MemN->getAlignment(); + + // If this is a less-than-standard-aligned load/store, change the type to + // match the standard alignment. + // The alignment is overlooked when selecting _UPD variants; and it's + // easier to introduce bitcasts here than fix that. + // There are 3 ways to get to this base-update combine: + // - intrinsics: they are assumed to be properly aligned (to the standard + // alignment of the memory type), so we don't need to do anything. + // - ARMISD::VLDx nodes: they are only generated from the aforementioned + // intrinsics, so, likewise, there's nothing to do. + // - generic load/store instructions: the alignment is specified as an + // explicit operand, rather than implicitly as the standard alignment + // of the memory type (like the intrisics). We need to change the + // memory type to match the explicit alignment. That way, we don't + // generate non-standard-aligned ARMISD::VLDx nodes. + if (isa<LSBaseSDNode>(N)) { + if (Alignment == 0) + Alignment = 1; + if (Alignment < VecTy.getScalarSizeInBits() / 8) { + MVT EltTy = MVT::getIntegerVT(Alignment * 8); + assert(NumVecs == 1 && "Unexpected multi-element generic load/store."); + assert(!isLaneOp && "Unexpected generic load/store lane."); + unsigned NumElts = NumBytes / (EltTy.getSizeInBits() / 8); + AlignedVecTy = MVT::getVectorVT(EltTy, NumElts); + } + // Don't set an explicit alignment on regular load/stores that we want + // to transform to VLD/VST 1_UPD nodes. + // This matches the behavior of regular load/stores, which only get an + // explicit alignment if the MMO alignment is larger than the standard + // alignment of the memory type. + // Intrinsics, however, always get an explicit alignment, set to the + // alignment of the MMO. + Alignment = 1; + } + // Create the new updating load/store node. + // First, create an SDVTList for the new updating node's results. EVT Tys[6]; - unsigned NumResultVecs = (isLoad ? NumVecs : 0); + unsigned NumResultVecs = (isLoadOp ? NumVecs : 0); unsigned n; for (n = 0; n < NumResultVecs; ++n) - Tys[n] = VecTy; + Tys[n] = AlignedVecTy; Tys[n++] = MVT::i32; Tys[n] = MVT::Other; SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumResultVecs+2)); + + // Then, gather the new node's operands. SmallVector<SDValue, 8> Ops; Ops.push_back(N->getOperand(0)); // incoming chain Ops.push_back(N->getOperand(AddrOpIdx)); Ops.push_back(Inc); - for (unsigned i = AddrOpIdx + 1; i < N->getNumOperands(); ++i) { - Ops.push_back(N->getOperand(i)); + + if (StoreSDNode *StN = dyn_cast<StoreSDNode>(N)) { + // Try to match the intrinsic's signature + Ops.push_back(StN->getValue()); + } else { + // Loads (and of course intrinsics) match the intrinsics' signature, + // so just add all but the alignment operand. 
+ for (unsigned i = AddrOpIdx + 1; i < N->getNumOperands() - 1; ++i) + Ops.push_back(N->getOperand(i)); + } + + // For all node types, the alignment operand is always the last one. + Ops.push_back(DAG.getConstant(Alignment, MVT::i32)); + + // If this is a non-standard-aligned STORE, the penultimate operand is the + // stored value. Bitcast it to the aligned type. + if (AlignedVecTy != VecTy && N->getOpcode() == ISD::STORE) { + SDValue &StVal = Ops[Ops.size()-2]; + StVal = DAG.getNode(ISD::BITCAST, SDLoc(N), AlignedVecTy, StVal); } - MemIntrinsicSDNode *MemInt = cast<MemIntrinsicSDNode>(N); + SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, SDLoc(N), SDTys, - Ops, MemInt->getMemoryVT(), - MemInt->getMemOperand()); + Ops, AlignedVecTy, + MemN->getMemOperand()); // Update the uses. - std::vector<SDValue> NewResults; - for (unsigned i = 0; i < NumResultVecs; ++i) { + SmallVector<SDValue, 5> NewResults; + for (unsigned i = 0; i < NumResultVecs; ++i) NewResults.push_back(SDValue(UpdN.getNode(), i)); + + // If this is an non-standard-aligned LOAD, the first result is the loaded + // value. Bitcast it to the expected result type. + if (AlignedVecTy != VecTy && N->getOpcode() == ISD::LOAD) { + SDValue &LdVal = NewResults[0]; + LdVal = DAG.getNode(ISD::BITCAST, SDLoc(N), VecTy, LdVal); } + NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs+1)); // chain DCI.CombineTo(N, NewResults); DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs)); @@ -9147,6 +9074,14 @@ static SDValue CombineBaseUpdate(SDNode *N, return SDValue(); } +static SDValue PerformVLDCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI) { + if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) + return SDValue(); + + return CombineBaseUpdate(N, DCI); +} + /// CombineVLDDUP - For a VDUPLANE node N, check if its source operand is a /// vldN-lane (N > 1) intrinsic, and if all the other uses of that intrinsic /// are also VDUPLANEs. If so, combine them to a vldN-dup operation and @@ -9260,6 +9195,164 @@ static SDValue PerformVDUPLANECombine(SDNode *N, return DCI.DAG.getNode(ISD::BITCAST, SDLoc(N), VT, Op); } +static SDValue PerformLOADCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI) { + EVT VT = N->getValueType(0); + + // If this is a legal vector load, try to combine it into a VLD1_UPD. + if (ISD::isNormalLoad(N) && VT.isVector() && + DCI.DAG.getTargetLoweringInfo().isTypeLegal(VT)) + return CombineBaseUpdate(N, DCI); + + return SDValue(); +} + +/// PerformSTORECombine - Target-specific dag combine xforms for +/// ISD::STORE. +static SDValue PerformSTORECombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI) { + StoreSDNode *St = cast<StoreSDNode>(N); + if (St->isVolatile()) + return SDValue(); + + // Optimize trunc store (of multiple scalars) to shuffle and store. First, + // pack all of the elements in one place. Next, store to memory in fewer + // chunks. 
+ SDValue StVal = St->getValue(); + EVT VT = StVal.getValueType(); + if (St->isTruncatingStore() && VT.isVector()) { + SelectionDAG &DAG = DCI.DAG; + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + EVT StVT = St->getMemoryVT(); + unsigned NumElems = VT.getVectorNumElements(); + assert(StVT != VT && "Cannot truncate to the same type"); + unsigned FromEltSz = VT.getVectorElementType().getSizeInBits(); + unsigned ToEltSz = StVT.getVectorElementType().getSizeInBits(); + + // From, To sizes and ElemCount must be pow of two + if (!isPowerOf2_32(NumElems * FromEltSz * ToEltSz)) return SDValue(); + + // We are going to use the original vector elt for storing. + // Accumulated smaller vector elements must be a multiple of the store size. + if (0 != (NumElems * FromEltSz) % ToEltSz) return SDValue(); + + unsigned SizeRatio = FromEltSz / ToEltSz; + assert(SizeRatio * NumElems * ToEltSz == VT.getSizeInBits()); + + // Create a type on which we perform the shuffle. + EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), StVT.getScalarType(), + NumElems*SizeRatio); + assert(WideVecVT.getSizeInBits() == VT.getSizeInBits()); + + SDLoc DL(St); + SDValue WideVec = DAG.getNode(ISD::BITCAST, DL, WideVecVT, StVal); + SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1); + for (unsigned i = 0; i < NumElems; ++i) + ShuffleVec[i] = TLI.isBigEndian() ? (i+1) * SizeRatio - 1 : i * SizeRatio; + + // Can't shuffle using an illegal type. + if (!TLI.isTypeLegal(WideVecVT)) return SDValue(); + + SDValue Shuff = DAG.getVectorShuffle(WideVecVT, DL, WideVec, + DAG.getUNDEF(WideVec.getValueType()), + ShuffleVec.data()); + // At this point all of the data is stored at the bottom of the + // register. We now need to save it to mem. + + // Find the largest store unit + MVT StoreType = MVT::i8; + for (MVT Tp : MVT::integer_valuetypes()) { + if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToEltSz) + StoreType = Tp; + } + // Didn't find a legal store type. + if (!TLI.isTypeLegal(StoreType)) + return SDValue(); + + // Bitcast the original vector into a vector of store-size units + EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(), + StoreType, VT.getSizeInBits()/EVT(StoreType).getSizeInBits()); + assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits()); + SDValue ShuffWide = DAG.getNode(ISD::BITCAST, DL, StoreVecVT, Shuff); + SmallVector<SDValue, 8> Chains; + SDValue Increment = DAG.getConstant(StoreType.getSizeInBits()/8, + TLI.getPointerTy()); + SDValue BasePtr = St->getBasePtr(); + + // Perform one or more big stores into memory. + unsigned E = (ToEltSz*NumElems)/StoreType.getSizeInBits(); + for (unsigned I = 0; I < E; I++) { + SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, + StoreType, ShuffWide, + DAG.getIntPtrConstant(I)); + SDValue Ch = DAG.getStore(St->getChain(), DL, SubVec, BasePtr, + St->getPointerInfo(), St->isVolatile(), + St->isNonTemporal(), St->getAlignment()); + BasePtr = DAG.getNode(ISD::ADD, DL, BasePtr.getValueType(), BasePtr, + Increment); + Chains.push_back(Ch); + } + return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains); + } + + if (!ISD::isNormalStore(St)) + return SDValue(); + + // Split a store of a VMOVDRR into two integer stores to avoid mixing NEON and + // ARM stores of arguments in the same cache line. 
+ if (StVal.getNode()->getOpcode() == ARMISD::VMOVDRR && + StVal.getNode()->hasOneUse()) { + SelectionDAG &DAG = DCI.DAG; + bool isBigEndian = DAG.getTargetLoweringInfo().isBigEndian(); + SDLoc DL(St); + SDValue BasePtr = St->getBasePtr(); + SDValue NewST1 = DAG.getStore(St->getChain(), DL, + StVal.getNode()->getOperand(isBigEndian ? 1 : 0 ), + BasePtr, St->getPointerInfo(), St->isVolatile(), + St->isNonTemporal(), St->getAlignment()); + + SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr, + DAG.getConstant(4, MVT::i32)); + return DAG.getStore(NewST1.getValue(0), DL, + StVal.getNode()->getOperand(isBigEndian ? 0 : 1), + OffsetPtr, St->getPointerInfo(), St->isVolatile(), + St->isNonTemporal(), + std::min(4U, St->getAlignment() / 2)); + } + + if (StVal.getValueType() == MVT::i64 && + StVal.getNode()->getOpcode() == ISD::EXTRACT_VECTOR_ELT) { + + // Bitcast an i64 store extracted from a vector to f64. + // Otherwise, the i64 value will be legalized to a pair of i32 values. + SelectionDAG &DAG = DCI.DAG; + SDLoc dl(StVal); + SDValue IntVec = StVal.getOperand(0); + EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, + IntVec.getValueType().getVectorNumElements()); + SDValue Vec = DAG.getNode(ISD::BITCAST, dl, FloatVT, IntVec); + SDValue ExtElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, + Vec, StVal.getOperand(1)); + dl = SDLoc(N); + SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ExtElt); + // Make the DAGCombiner fold the bitcasts. + DCI.AddToWorklist(Vec.getNode()); + DCI.AddToWorklist(ExtElt.getNode()); + DCI.AddToWorklist(V.getNode()); + return DAG.getStore(St->getChain(), dl, V, St->getBasePtr(), + St->getPointerInfo(), St->isVolatile(), + St->isNonTemporal(), St->getAlignment(), + St->getAAInfo()); + } + + // If this is a legal vector store, try to combine it into a VST1_UPD. + if (ISD::isNormalStore(N) && VT.isVector() && + DCI.DAG.getTargetLoweringInfo().isTypeLegal(VT)) + return CombineBaseUpdate(N, DCI); + + return SDValue(); +} + // isConstVecPow2 - Return true if each vector element is a power of 2, all // elements are the same constant, C, and Log2(C) ranges from 1 to 32. static bool isConstVecPow2(SDValue ConstVec, bool isSigned, uint64_t &C) @@ -9316,16 +9409,18 @@ static SDValue PerformVCVTCombine(SDNode *N, MVT FloatTy = Op.getSimpleValueType().getVectorElementType(); MVT IntTy = N->getSimpleValueType(0).getVectorElementType(); - if (FloatTy.getSizeInBits() != 32 || IntTy.getSizeInBits() > 32) { + unsigned NumLanes = Op.getValueType().getVectorNumElements(); + if (FloatTy.getSizeInBits() != 32 || IntTy.getSizeInBits() > 32 || + NumLanes > 4) { // These instructions only exist converting from f32 to i32. We can handle // smaller integers by generating an extra truncate, but larger ones would - // be lossy. + // be lossy. We also can't handle more then 4 lanes, since these intructions + // only support v2i32/v4i32 types. return SDValue(); } unsigned IntrinsicOpcode = isSigned ? Intrinsic::arm_neon_vcvtfp2fxs : Intrinsic::arm_neon_vcvtfp2fxu; - unsigned NumLanes = Op.getValueType().getVectorNumElements(); SDValue FixConv = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), NumLanes == 2 ? 
MVT::v2i32 : MVT::v4i32, DAG.getConstant(IntrinsicOpcode, MVT::i32), N0, @@ -9848,10 +9943,11 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N, case ISD::ANY_EXTEND: return PerformExtendCombine(N, DCI.DAG, Subtarget); case ISD::SELECT_CC: return PerformSELECT_CCCombine(N, DCI.DAG, Subtarget); case ARMISD::CMOV: return PerformCMOVCombine(N, DCI.DAG); + case ISD::LOAD: return PerformLOADCombine(N, DCI); case ARMISD::VLD2DUP: case ARMISD::VLD3DUP: case ARMISD::VLD4DUP: - return CombineBaseUpdate(N, DCI); + return PerformVLDCombine(N, DCI); case ARMISD::BUILD_VECTOR: return PerformARMBUILD_VECTORCombine(N, DCI); case ISD::INTRINSIC_VOID: @@ -9871,7 +9967,7 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N, case Intrinsic::arm_neon_vst2lane: case Intrinsic::arm_neon_vst3lane: case Intrinsic::arm_neon_vst4lane: - return CombineBaseUpdate(N, DCI); + return PerformVLDCombine(N, DCI); default: break; } break; @@ -9934,10 +10030,8 @@ EVT ARMTargetLowering::getOptimalMemOpType(uint64_t Size, const Function *F = MF.getFunction(); // See if we can use NEON instructions for this... - if ((!IsMemset || ZeroMemset) && - Subtarget->hasNEON() && - !F->getAttributes().hasAttribute(AttributeSet::FunctionIndex, - Attribute::NoImplicitFloat)) { + if ((!IsMemset || ZeroMemset) && Subtarget->hasNEON() && + !F->hasFnAttribute(Attribute::NoImplicitFloat)) { bool Fast; if (Size >= 16 && (memOpAlign(SrcAlign, DstAlign, 16) || @@ -10535,7 +10629,8 @@ ARMTargetLowering::getSingleConstraintMatchWeight( typedef std::pair<unsigned, const TargetRegisterClass*> RCPair; RCPair -ARMTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, +ARMTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, + const std::string &Constraint, MVT VT) const { if (Constraint.size() == 1) { // GCC ARM Constraint Letters @@ -10581,7 +10676,7 @@ ARMTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, if (StringRef("{cc}").equals_lower(Constraint)) return std::make_pair(unsigned(ARM::CPSR), &ARM::CCRRegClass); - return TargetLowering::getRegForInlineAsmConstraint(Constraint, VT); + return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); } /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops @@ -10861,11 +10956,7 @@ bool ARM::isBitFieldInvertedMask(unsigned v) { // there can be 1's on either or both "outsides", all the "inside" // bits must be 0's - unsigned TO = CountTrailingOnes_32(v); - unsigned LO = CountLeadingOnes_32(v); - v = (v >> TO) << TO; - v = (v << LO) >> LO; - return v == 0; + return isShiftedMask_32(~v); } /// isFPImmLegal - Returns true if the target can instruction select the @@ -11114,7 +11205,7 @@ bool ARMTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { // This has so far only been implemented for MachO. bool ARMTargetLowering::useLoadStackGuardNode() const { - return Subtarget->getTargetTriple().getObjectFormat() == Triple::MachO; + return Subtarget->isTargetMachO(); } bool ARMTargetLowering::canCombineStoreAndExtract(Type *VectorTy, Value *Idx, @@ -11274,7 +11365,9 @@ static bool isHomogeneousAggregate(Type *Ty, HABaseType &Base, return (Members > 0 && Members <= 4); } -/// \brief Return true if a type is an AAPCS-VFP homogeneous aggregate. +/// \brief Return true if a type is an AAPCS-VFP homogeneous aggregate or one of +/// [N x i32] or [N x i64]. This allows front-ends to skip emitting padding when +/// passing according to AAPCS rules. 
bool ARMTargetLowering::functionArgumentNeedsConsecutiveRegisters( Type *Ty, CallingConv::ID CallConv, bool isVarArg) const { if (getEffectiveCallingConv(CallConv, isVarArg) != @@ -11283,7 +11376,9 @@ bool ARMTargetLowering::functionArgumentNeedsConsecutiveRegisters( HABaseType Base = HA_UNKNOWN; uint64_t Members = 0; - bool result = isHomogeneousAggregate(Ty, Base, Members); - DEBUG(dbgs() << "isHA: " << result << " "; Ty->dump()); - return result; + bool IsHA = isHomogeneousAggregate(Ty, Base, Members); + DEBUG(dbgs() << "isHA: " << IsHA << " "; Ty->dump()); + + bool IsIntArray = Ty->isArrayTy() && Ty->getArrayElementType()->isIntegerTy(); + return IsHA || IsIntArray; } |
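
The `ARM::isBitFieldInvertedMask` hunk above collapses a hand-rolled loop (strip trailing ones, strip leading ones, check that nothing remains) into `isShiftedMask_32(~v)`. A minimal standalone sketch of that equivalence follows; `isShiftedMask32` here is a local stand-in written for illustration rather than the helper from LLVM's `MathExtras.h`, and the test values are made up:

```cpp
#include <cassert>
#include <cstdint>

// Local stand-in for llvm::isShiftedMask_32: true if V is a non-empty,
// contiguous run of set bits (possibly shifted), e.g. 0x00ff0000.
static bool isShiftedMask32(uint32_t V) {
  // Adding the lowest set bit to a contiguous run clears the whole run.
  return V != 0 && (((V + (V & (~V + 1))) & V) == 0);
}

// The rewritten predicate from the patch: a "bit-field inverted mask" may
// have ones on either outside, but all the inside bits must be zeros,
// i.e. the complement is a single shifted mask.
static bool isBitFieldInvertedMask(uint32_t v) {
  return isShiftedMask32(~v);
}

int main() {
  assert(isBitFieldInvertedMask(0xff0000ffu));   // one contiguous run of zeros
  assert(isBitFieldInvertedMask(0xfffffffeu));   // single zero bit
  assert(!isBitFieldInvertedMask(0xff00ff00u));  // two separate runs of zeros
  assert(!isBitFieldInvertedMask(0xffffffffu));  // no zero bits at all
  return 0;
}
```

Besides being shorter, checking the complement directly sidesteps the shift-by-32 case the removed `CountTrailingOnes_32`/`CountLeadingOnes_32` loop would have hit for an all-ones input.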