diff options
Diffstat (limited to 'lib/Target/ARM')
39 files changed, 1491 insertions, 467 deletions
diff --git a/lib/Target/ARM/ARM.td b/lib/Target/ARM/ARM.td index 46915ee..2d747091 100644 --- a/lib/Target/ARM/ARM.td +++ b/lib/Target/ARM/ARM.td @@ -59,6 +59,8 @@ def FeatureSlowFPBrcc : SubtargetFeature<"slow-fp-brcc", "SlowFPBrcc", "true", "FP compare + branch is slow">; def FeatureVFPOnlySP : SubtargetFeature<"fp-only-sp", "FPOnlySP", "true", "Floating point unit supports single precision only">; +def FeatureTrustZone : SubtargetFeature<"trustzone", "HasTrustZone", "true", + "Enable support for TrustZone security extensions">; // Some processors have FP multiply-accumulate instructions that don't // play nicely with other VFP / NEON instructions, and it's generally better @@ -143,32 +145,34 @@ include "ARMSchedule.td" // ARM processor families. def ProcA5 : SubtargetFeature<"a5", "ARMProcFamily", "CortexA5", "Cortex-A5 ARM processors", - [FeatureSlowFPBrcc, FeatureNEONForFP, - FeatureHasSlowFPVMLx, FeatureVMLxForwarding, - FeatureT2XtPk]>; + [FeatureSlowFPBrcc, FeatureHasSlowFPVMLx, + FeatureVMLxForwarding, FeatureT2XtPk, + FeatureTrustZone]>; def ProcA8 : SubtargetFeature<"a8", "ARMProcFamily", "CortexA8", "Cortex-A8 ARM processors", - [FeatureSlowFPBrcc, FeatureNEONForFP, - FeatureHasSlowFPVMLx, FeatureVMLxForwarding, - FeatureT2XtPk]>; + [FeatureSlowFPBrcc, FeatureHasSlowFPVMLx, + FeatureVMLxForwarding, FeatureT2XtPk, + FeatureTrustZone]>; def ProcA9 : SubtargetFeature<"a9", "ARMProcFamily", "CortexA9", "Cortex-A9 ARM processors", [FeatureVMLxForwarding, FeatureT2XtPk, FeatureFP16, - FeatureAvoidPartialCPSR]>; + FeatureAvoidPartialCPSR, + FeatureTrustZone]>; def ProcSwift : SubtargetFeature<"swift", "ARMProcFamily", "Swift", "Swift ARM processors", [FeatureNEONForFP, FeatureT2XtPk, FeatureVFP4, FeatureMP, FeatureHWDiv, FeatureHWDivARM, FeatureAvoidPartialCPSR, FeatureAvoidMOVsShOp, - FeatureHasSlowFPVMLx]>; + FeatureHasSlowFPVMLx, FeatureTrustZone]>; // FIXME: It has not been determined if A15 has these features. def ProcA15 : SubtargetFeature<"a15", "ARMProcFamily", "CortexA15", "Cortex-A15 ARM processors", [FeatureT2XtPk, FeatureFP16, - FeatureAvoidPartialCPSR]>; + FeatureAvoidPartialCPSR, + FeatureTrustZone]>; def ProcR5 : SubtargetFeature<"r5", "ARMProcFamily", "CortexR5", "Cortex-R5 ARM processors", [FeatureSlowFPBrcc, FeatureHWDivARM, diff --git a/lib/Target/ARM/ARMBaseInstrInfo.cpp b/lib/Target/ARM/ARMBaseInstrInfo.cpp index ed8b9cd..0d1417d 100644 --- a/lib/Target/ARM/ARMBaseInstrInfo.cpp +++ b/lib/Target/ARM/ARMBaseInstrInfo.cpp @@ -747,10 +747,10 @@ void ARMBaseInstrInfo::copyPhysReg(MachineBasicBlock &MBB, Mov->addRegisterKilled(SrcReg, TRI); } -static const -MachineInstrBuilder &AddDReg(MachineInstrBuilder &MIB, - unsigned Reg, unsigned SubIdx, unsigned State, - const TargetRegisterInfo *TRI) { +const MachineInstrBuilder & +ARMBaseInstrInfo::AddDReg(MachineInstrBuilder &MIB, unsigned Reg, + unsigned SubIdx, unsigned State, + const TargetRegisterInfo *TRI) const { if (!SubIdx) return MIB.addReg(Reg, State); @@ -795,12 +795,22 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, .addReg(SrcReg, getKillRegState(isKill)) .addFrameIndex(FI).addImm(0).addMemOperand(MMO)); } else if (ARM::GPRPairRegClass.hasSubClassEq(RC)) { - MachineInstrBuilder MIB = - AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::STMIA)) - .addFrameIndex(FI)) - .addMemOperand(MMO); - MIB = AddDReg(MIB, SrcReg, ARM::gsub_0, getKillRegState(isKill), TRI); - AddDReg(MIB, SrcReg, ARM::gsub_1, 0, TRI); + if (Subtarget.hasV5TEOps()) { + MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(ARM::STRD)); + AddDReg(MIB, SrcReg, ARM::gsub_0, getKillRegState(isKill), TRI); + AddDReg(MIB, SrcReg, ARM::gsub_1, 0, TRI); + MIB.addFrameIndex(FI).addReg(0).addImm(0).addMemOperand(MMO); + + AddDefaultPred(MIB); + } else { + // Fallback to STM instruction, which has existed since the dawn of + // time. + MachineInstrBuilder MIB = + AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::STMIA)) + .addFrameIndex(FI).addMemOperand(MMO)); + AddDReg(MIB, SrcReg, ARM::gsub_0, getKillRegState(isKill), TRI); + AddDReg(MIB, SrcReg, ARM::gsub_1, 0, TRI); + } } else llvm_unreachable("Unknown reg class!"); break; @@ -948,7 +958,6 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, DebugLoc DL; if (I != MBB.end()) DL = I->getDebugLoc(); MachineFunction &MF = *MBB.getParent(); - ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); MachineFrameInfo &MFI = *MF.getFrameInfo(); unsigned Align = MFI.getObjectAlignment(FI); MachineMemOperand *MMO = @@ -975,12 +984,24 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLDRD), DestReg) .addFrameIndex(FI).addImm(0).addMemOperand(MMO)); } else if (ARM::GPRPairRegClass.hasSubClassEq(RC)) { - unsigned LdmOpc = AFI->isThumbFunction() ? ARM::t2LDMIA : ARM::LDMIA; - MachineInstrBuilder MIB = - AddDefaultPred(BuildMI(MBB, I, DL, get(LdmOpc)) - .addFrameIndex(FI).addImm(0).addMemOperand(MMO)); - MIB = AddDReg(MIB, DestReg, ARM::gsub_0, RegState::DefineNoRead, TRI); - MIB = AddDReg(MIB, DestReg, ARM::gsub_1, RegState::DefineNoRead, TRI); + MachineInstrBuilder MIB; + + if (Subtarget.hasV5TEOps()) { + MIB = BuildMI(MBB, I, DL, get(ARM::LDRD)); + AddDReg(MIB, DestReg, ARM::gsub_0, RegState::DefineNoRead, TRI); + AddDReg(MIB, DestReg, ARM::gsub_1, RegState::DefineNoRead, TRI); + MIB.addFrameIndex(FI).addReg(0).addImm(0).addMemOperand(MMO); + + AddDefaultPred(MIB); + } else { + // Fallback to LDM instruction, which has existed since the dawn of + // time. + MIB = AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::LDMIA)) + .addFrameIndex(FI).addMemOperand(MMO)); + MIB = AddDReg(MIB, DestReg, ARM::gsub_0, RegState::DefineNoRead, TRI); + MIB = AddDReg(MIB, DestReg, ARM::gsub_1, RegState::DefineNoRead, TRI); + } + if (TargetRegisterInfo::isPhysicalRegister(DestReg)) MIB.addReg(DestReg, RegState::ImplicitDefine); } else @@ -3734,9 +3755,9 @@ ARMBaseInstrInfo::getExecutionDomain(const MachineInstr *MI) const { if (MI->getOpcode() == ARM::VMOVD && !isPredicated(MI)) return std::make_pair(ExeVFP, (1<<ExeVFP) | (1<<ExeNEON)); - // A9-like cores are particularly picky about mixing the two and want these + // CortexA9 is particularly picky about mixing the two and wants these // converted. - if (Subtarget.isLikeA9() && !isPredicated(MI) && + if (Subtarget.isCortexA9() && !isPredicated(MI) && (MI->getOpcode() == ARM::VMOVRS || MI->getOpcode() == ARM::VMOVSR || MI->getOpcode() == ARM::VMOVS)) @@ -4023,14 +4044,12 @@ ARMBaseInstrInfo::setExecutionDomain(MachineInstr *MI, unsigned Domain) const { // VLD1DUPd32 - Writes all D-regs, no partial reg update, 2 uops. // // FCONSTD can be used as a dependency-breaking instruction. - - unsigned ARMBaseInstrInfo:: getPartialRegUpdateClearance(const MachineInstr *MI, unsigned OpNum, const TargetRegisterInfo *TRI) const { - // Only Swift has partial register update problems. - if (!SwiftPartialUpdateClearance || !Subtarget.isSwift()) + if (!SwiftPartialUpdateClearance || + !(Subtarget.isSwift() || Subtarget.isCortexA15())) return 0; assert(TRI && "Need TRI instance"); @@ -4056,7 +4075,7 @@ getPartialRegUpdateClearance(const MachineInstr *MI, // Explicitly reads the dependency. case ARM::VLD1LNd32: - UseOp = 1; + UseOp = 3; break; default: return 0; @@ -4125,3 +4144,15 @@ breakPartialRegDependency(MachineBasicBlock::iterator MI, bool ARMBaseInstrInfo::hasNOP() const { return (Subtarget.getFeatureBits() & ARM::HasV6T2Ops) != 0; } + +bool ARMBaseInstrInfo::isSwiftFastImmShift(const MachineInstr *MI) const { + unsigned ShOpVal = MI->getOperand(3).getImm(); + unsigned ShImm = ARM_AM::getSORegOffset(ShOpVal); + // Swift supports faster shifts for: lsl 2, lsl 1, and lsr 1. + if ((ShImm == 1 && ARM_AM::getSORegShOp(ShOpVal) == ARM_AM::lsr) || + ((ShImm == 1 || ShImm == 2) && + ARM_AM::getSORegShOp(ShOpVal) == ARM_AM::lsl)) + return true; + + return false; +} diff --git a/lib/Target/ARM/ARMBaseInstrInfo.h b/lib/Target/ARM/ARMBaseInstrInfo.h index 2698132..2ef659c 100644 --- a/lib/Target/ARM/ARMBaseInstrInfo.h +++ b/lib/Target/ARM/ARMBaseInstrInfo.h @@ -141,6 +141,10 @@ public: MachineInstr *commuteInstruction(MachineInstr*, bool=false) const; + const MachineInstrBuilder &AddDReg(MachineInstrBuilder &MIB, unsigned Reg, + unsigned SubIdx, unsigned State, + const TargetRegisterInfo *TRI) const; + virtual bool produceSameValue(const MachineInstr *MI0, const MachineInstr *MI1, const MachineRegisterInfo *MRI) const; @@ -314,6 +318,10 @@ public: bool canCauseFpMLxStall(unsigned Opcode) const { return MLxHazardOpcodes.count(Opcode); } + + /// Returns true if the instruction has a shift by immediate that can be + /// executed in one cycle less. + bool isSwiftFastImmShift(const MachineInstr *MI) const; }; static inline diff --git a/lib/Target/ARM/ARMBaseRegisterInfo.cpp b/lib/Target/ARM/ARMBaseRegisterInfo.cpp index abdd251..b0d34a7 100644 --- a/lib/Target/ARM/ARMBaseRegisterInfo.cpp +++ b/lib/Target/ARM/ARMBaseRegisterInfo.cpp @@ -75,6 +75,12 @@ ARMBaseRegisterInfo::getCallPreservedMask(CallingConv::ID) const { } const uint32_t* +ARMBaseRegisterInfo::getThisReturnPreservedMask(CallingConv::ID) const { + return (STI.isTargetIOS() && !STI.isAAPCS_ABI()) + ? CSR_iOS_ThisReturn_RegMask : CSR_AAPCS_ThisReturn_RegMask; +} + +const uint32_t* ARMBaseRegisterInfo::getNoPreservedMask() const { return CSR_NoRegs_RegMask; } @@ -680,7 +686,7 @@ ARMBaseRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, // means the stack pointer cannot be used to access the emergency spill slot // when !hasReservedCallFrame(). #ifndef NDEBUG - if (RS && FrameReg == ARM::SP && FrameIndex == RS->getScavengingFrameIndex()){ + if (RS && FrameReg == ARM::SP && RS->isScavengingFrameIndex(FrameIndex)){ assert(TFI->hasReservedCallFrame(MF) && "Cannot use SP to access the emergency spill slot in " "functions without a reserved call frame"); diff --git a/lib/Target/ARM/ARMBaseRegisterInfo.h b/lib/Target/ARM/ARMBaseRegisterInfo.h index 725033b..0679919 100644 --- a/lib/Target/ARM/ARMBaseRegisterInfo.h +++ b/lib/Target/ARM/ARMBaseRegisterInfo.h @@ -96,6 +96,7 @@ public: /// Code Generation virtual methods... const uint16_t *getCalleeSavedRegs(const MachineFunction *MF = 0) const; const uint32_t *getCallPreservedMask(CallingConv::ID) const; + const uint32_t *getThisReturnPreservedMask(CallingConv::ID) const; const uint32_t *getNoPreservedMask() const; BitVector getReservedRegs(const MachineFunction &MF) const; diff --git a/lib/Target/ARM/ARMCallingConv.h b/lib/Target/ARM/ARMCallingConv.h index e6e8c3d..4f94ad2 100644 --- a/lib/Target/ARM/ARMCallingConv.h +++ b/lib/Target/ARM/ARMCallingConv.h @@ -74,9 +74,15 @@ static bool f64AssignAAPCS(unsigned &ValNo, MVT &ValVT, MVT &LocVT, static const uint16_t HiRegList[] = { ARM::R0, ARM::R2 }; static const uint16_t LoRegList[] = { ARM::R1, ARM::R3 }; static const uint16_t ShadowRegList[] = { ARM::R0, ARM::R1 }; + static const uint16_t GPRArgRegs[] = { ARM::R0, ARM::R1, ARM::R2, ARM::R3 }; unsigned Reg = State.AllocateReg(HiRegList, ShadowRegList, 2); if (Reg == 0) { + + // If we had R3 unallocated only, now we still must to waste it. + Reg = State.AllocateReg(GPRArgRegs, 4); + assert((!Reg || Reg == ARM::R3) && "Wrong GPRs usage for f64"); + // For the 2nd half of a v2f64, do not just fail. if (CanFail) return false; diff --git a/lib/Target/ARM/ARMCallingConv.td b/lib/Target/ARM/ARMCallingConv.td index b378b96..8ff666e 100644 --- a/lib/Target/ARM/ARMCallingConv.td +++ b/lib/Target/ARM/ARMCallingConv.td @@ -111,8 +111,7 @@ def CC_ARM_AAPCS_Common : CallingConv<[ // i64 is 8-aligned i32 here, so we may need to eat R1 as a pad register // (and the same is true for f64 if VFP is not enabled) CCIfType<[i32], CCIfAlign<"8", CCAssignToRegWithShadow<[R0, R2], [R0, R1]>>>, - CCIfType<[i32], CCIf<"State.getNextStackOffset() == 0 &&" - "ArgFlags.getOrigAlign() != 8", + CCIfType<[i32], CCIf<"ArgFlags.getOrigAlign() != 8", CCAssignToReg<[R0, R1, R2, R3]>>>, CCIfType<[i32], CCIfAlign<"8", CCAssignToStackWithShadow<4, 8, R3>>>, @@ -195,10 +194,21 @@ def CSR_NoRegs : CalleeSavedRegs<(add)>; def CSR_AAPCS : CalleeSavedRegs<(add LR, R11, R10, R9, R8, R7, R6, R5, R4, (sequence "D%u", 15, 8))>; +// Constructors and destructors return 'this' in the ARM C++ ABI; since 'this' +// and the pointer return value are both passed in R0 in these cases, this can +// be partially modelled by treating R0 as a callee-saved register +// Only the resulting RegMask is used; the SaveList is ignored +def CSR_AAPCS_ThisReturn : CalleeSavedRegs<(add LR, R11, R10, R9, R8, R7, R6, + R5, R4, (sequence "D%u", 15, 8), + R0)>; + // iOS ABI deviates from ARM standard ABI. R9 is not a callee-saved register. // Also save R7-R4 first to match the stack frame fixed spill areas. def CSR_iOS : CalleeSavedRegs<(add LR, R7, R6, R5, R4, (sub CSR_AAPCS, R9))>; +def CSR_iOS_ThisReturn : CalleeSavedRegs<(add LR, R7, R6, R5, R4, + (sub CSR_AAPCS_ThisReturn, R9))>; + // GHC set of callee saved regs is empty as all those regs are // used for passing STG regs around // add is a workaround for not being able to compile empty list: diff --git a/lib/Target/ARM/ARMFastISel.cpp b/lib/Target/ARM/ARMFastISel.cpp index 29fcd40..5d45f64 100644 --- a/lib/Target/ARM/ARMFastISel.cpp +++ b/lib/Target/ARM/ARMFastISel.cpp @@ -144,8 +144,8 @@ class ARMFastISel : public FastISel { virtual bool TargetSelectInstruction(const Instruction *I); virtual unsigned TargetMaterializeConstant(const Constant *C); virtual unsigned TargetMaterializeAlloca(const AllocaInst *AI); - virtual bool TryToFoldLoad(MachineInstr *MI, unsigned OpNo, - const LoadInst *LI); + virtual bool tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo, + const LoadInst *LI); virtual bool FastLowerArguments(); private: #include "ARMGenFastISel.inc" @@ -2605,7 +2605,7 @@ unsigned ARMFastISel::ARMEmitIntExt(MVT SrcVT, unsigned SrcReg, MVT DestVT, unsigned Opc; bool isBoolZext = false; - const TargetRegisterClass *RC = TLI.getRegClassFor(MVT::i32); + const TargetRegisterClass *RC; switch (SrcVT.SimpleTy) { default: return 0; case MVT::i16: @@ -2797,12 +2797,12 @@ bool ARMFastISel::TargetSelectInstruction(const Instruction *I) { return false; } -/// TryToFoldLoad - The specified machine instr operand is a vreg, and that +/// \brief The specified machine instr operand is a vreg, and that /// vreg is being provided by the specified load instruction. If possible, /// try to fold the load as an operand to the instruction, returning true if /// successful. -bool ARMFastISel::TryToFoldLoad(MachineInstr *MI, unsigned OpNo, - const LoadInst *LI) { +bool ARMFastISel::tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo, + const LoadInst *LI) { // Verify we have a legal type before going any further. MVT VT; if (!isLoadTypeLegal(LI->getType(), VT)) diff --git a/lib/Target/ARM/ARMFrameLowering.cpp b/lib/Target/ARM/ARMFrameLowering.cpp index 3b12408..483802b 100644 --- a/lib/Target/ARM/ARMFrameLowering.cpp +++ b/lib/Target/ARM/ARMFrameLowering.cpp @@ -141,7 +141,7 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF) const { assert(!AFI->isThumb1OnlyFunction() && "This emitPrologue does not support Thumb1!"); bool isARM = !AFI->isThumbFunction(); - unsigned VARegSaveSize = AFI->getVarArgsRegSaveSize(); + unsigned ArgRegsSaveSize = AFI->getArgRegsSaveSize(); unsigned NumBytes = MFI->getStackSize(); const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo(); DebugLoc dl = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc(); @@ -159,8 +159,8 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF) const { return; // Allocate the vararg register save area. This is not counted in NumBytes. - if (VARegSaveSize) - emitSPUpdate(isARM, MBB, MBBI, dl, TII, -VARegSaveSize, + if (ArgRegsSaveSize) + emitSPUpdate(isARM, MBB, MBBI, dl, TII, -ArgRegsSaveSize, MachineInstr::FrameSetup); if (!AFI->hasStackFrame()) { @@ -357,7 +357,7 @@ void ARMFrameLowering::emitEpilogue(MachineFunction &MF, "This emitEpilogue does not support Thumb1!"); bool isARM = !AFI->isThumbFunction(); - unsigned VARegSaveSize = AFI->getVarArgsRegSaveSize(); + unsigned ArgRegsSaveSize = AFI->getArgRegsSaveSize(); int NumBytes = (int)MFI->getStackSize(); unsigned FramePtr = RegInfo->getFrameRegister(MF); @@ -471,8 +471,8 @@ void ARMFrameLowering::emitEpilogue(MachineFunction &MF, MBBI = NewMI; } - if (VARegSaveSize) - emitSPUpdate(isARM, MBB, MBBI, dl, TII, VARegSaveSize); + if (ArgRegsSaveSize) + emitSPUpdate(isARM, MBB, MBBI, dl, TII, ArgRegsSaveSize); } /// getFrameIndexReference - Provide a base+offset reference to an FI slot for @@ -1003,7 +1003,7 @@ bool ARMFrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB, MachineFunction &MF = *MBB.getParent(); ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); - bool isVarArg = AFI->getVarArgsRegSaveSize() > 0; + bool isVarArg = AFI->getArgRegsSaveSize() > 0; unsigned NumAlignedDPRCS2Regs = AFI->getNumAlignedDPRCS2Regs(); // The emitPopInst calls below do not insert reloads for the aligned DPRCS2 @@ -1174,7 +1174,7 @@ ARMFrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF, if (AFI->isThumb1OnlyFunction()) { // Spill LR if Thumb1 function uses variable length argument lists. - if (AFI->getVarArgsRegSaveSize() > 0) + if (AFI->getArgRegsSaveSize() > 0) MRI.setPhysRegUsed(ARM::LR); // Spill R4 if Thumb1 epilogue has to restore SP from FP. We don't know @@ -1368,7 +1368,7 @@ ARMFrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF, // note: Thumb1 functions spill to R12, not the stack. Reserve a slot // closest to SP or frame pointer. const TargetRegisterClass *RC = &ARM::GPRRegClass; - RS->setScavengingFrameIndex(MFI->CreateStackObject(RC->getSize(), + RS->addScavengingFrameIndex(MFI->CreateStackObject(RC->getSize(), RC->getAlignment(), false)); } diff --git a/lib/Target/ARM/ARMISelDAGToDAG.cpp b/lib/Target/ARM/ARMISelDAGToDAG.cpp index 2c51de2..9e1782e 100644 --- a/lib/Target/ARM/ARMISelDAGToDAG.cpp +++ b/lib/Target/ARM/ARMISelDAGToDAG.cpp @@ -1469,14 +1469,14 @@ SDNode *ARMDAGToDAGISel::SelectARMIndexedLoad(SDNode *N) { SDValue Ops[]= { Base, AMOpc, getAL(CurDAG), CurDAG->getRegister(0, MVT::i32), Chain }; return CurDAG->getMachineNode(Opcode, N->getDebugLoc(), MVT::i32, - MVT::i32, MVT::Other, Ops, 5); + MVT::i32, MVT::Other, Ops); } else { SDValue Chain = LD->getChain(); SDValue Base = LD->getBasePtr(); SDValue Ops[]= { Base, Offset, AMOpc, getAL(CurDAG), CurDAG->getRegister(0, MVT::i32), Chain }; return CurDAG->getMachineNode(Opcode, N->getDebugLoc(), MVT::i32, - MVT::i32, MVT::Other, Ops, 6); + MVT::i32, MVT::Other, Ops); } } @@ -1525,7 +1525,7 @@ SDNode *ARMDAGToDAGISel::SelectT2IndexedLoad(SDNode *N) { SDValue Ops[]= { Base, Offset, getAL(CurDAG), CurDAG->getRegister(0, MVT::i32), Chain }; return CurDAG->getMachineNode(Opcode, N->getDebugLoc(), MVT::i32, MVT::i32, - MVT::Other, Ops, 5); + MVT::Other, Ops); } return NULL; @@ -1539,7 +1539,7 @@ SDNode *ARMDAGToDAGISel::createGPRPairNode(EVT VT, SDValue V0, SDValue V1) { SDValue SubReg0 = CurDAG->getTargetConstant(ARM::gsub_0, MVT::i32); SDValue SubReg1 = CurDAG->getTargetConstant(ARM::gsub_1, MVT::i32); const SDValue Ops[] = { RegClass, V0, SubReg0, V1, SubReg1 }; - return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, dl, VT, Ops, 5); + return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, dl, VT, Ops); } /// \brief Form a D register from a pair of S registers. @@ -1550,7 +1550,7 @@ SDNode *ARMDAGToDAGISel::createSRegPairNode(EVT VT, SDValue V0, SDValue V1) { SDValue SubReg0 = CurDAG->getTargetConstant(ARM::ssub_0, MVT::i32); SDValue SubReg1 = CurDAG->getTargetConstant(ARM::ssub_1, MVT::i32); const SDValue Ops[] = { RegClass, V0, SubReg0, V1, SubReg1 }; - return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, dl, VT, Ops, 5); + return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, dl, VT, Ops); } /// \brief Form a quad register from a pair of D registers. @@ -1560,7 +1560,7 @@ SDNode *ARMDAGToDAGISel::createDRegPairNode(EVT VT, SDValue V0, SDValue V1) { SDValue SubReg0 = CurDAG->getTargetConstant(ARM::dsub_0, MVT::i32); SDValue SubReg1 = CurDAG->getTargetConstant(ARM::dsub_1, MVT::i32); const SDValue Ops[] = { RegClass, V0, SubReg0, V1, SubReg1 }; - return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, dl, VT, Ops, 5); + return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, dl, VT, Ops); } /// \brief Form 4 consecutive D registers from a pair of Q registers. @@ -1570,7 +1570,7 @@ SDNode *ARMDAGToDAGISel::createQRegPairNode(EVT VT, SDValue V0, SDValue V1) { SDValue SubReg0 = CurDAG->getTargetConstant(ARM::qsub_0, MVT::i32); SDValue SubReg1 = CurDAG->getTargetConstant(ARM::qsub_1, MVT::i32); const SDValue Ops[] = { RegClass, V0, SubReg0, V1, SubReg1 }; - return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, dl, VT, Ops, 5); + return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, dl, VT, Ops); } /// \brief Form 4 consecutive S registers. @@ -1585,7 +1585,7 @@ SDNode *ARMDAGToDAGISel::createQuadSRegsNode(EVT VT, SDValue V0, SDValue V1, SDValue SubReg3 = CurDAG->getTargetConstant(ARM::ssub_3, MVT::i32); const SDValue Ops[] = { RegClass, V0, SubReg0, V1, SubReg1, V2, SubReg2, V3, SubReg3 }; - return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, dl, VT, Ops, 9); + return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, dl, VT, Ops); } /// \brief Form 4 consecutive D registers. @@ -1599,7 +1599,7 @@ SDNode *ARMDAGToDAGISel::createQuadDRegsNode(EVT VT, SDValue V0, SDValue V1, SDValue SubReg3 = CurDAG->getTargetConstant(ARM::dsub_3, MVT::i32); const SDValue Ops[] = { RegClass, V0, SubReg0, V1, SubReg1, V2, SubReg2, V3, SubReg3 }; - return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, dl, VT, Ops, 9); + return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, dl, VT, Ops); } /// \brief Form 4 consecutive Q registers. @@ -1613,7 +1613,7 @@ SDNode *ARMDAGToDAGISel::createQuadQRegsNode(EVT VT, SDValue V0, SDValue V1, SDValue SubReg3 = CurDAG->getTargetConstant(ARM::qsub_3, MVT::i32); const SDValue Ops[] = { RegClass, V0, SubReg0, V1, SubReg1, V2, SubReg2, V3, SubReg3 }; - return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, dl, VT, Ops, 9); + return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, dl, VT, Ops); } /// GetVLDSTAlign - Get the alignment (in bytes) for the alignment operand @@ -1761,7 +1761,7 @@ SDNode *ARMDAGToDAGISel::SelectVLD(SDNode *N, bool isUpdating, unsigned NumVecs, Ops.push_back(Pred); Ops.push_back(Reg0); Ops.push_back(Chain); - VLd = CurDAG->getMachineNode(Opc, dl, ResTys, Ops.data(), Ops.size()); + VLd = CurDAG->getMachineNode(Opc, dl, ResTys, Ops); } else { // Otherwise, quad registers are loaded with two separate instructions, @@ -1774,7 +1774,7 @@ SDNode *ARMDAGToDAGISel::SelectVLD(SDNode *N, bool isUpdating, unsigned NumVecs, SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, dl, ResTy), 0); const SDValue OpsA[] = { MemAddr, Align, Reg0, ImplDef, Pred, Reg0, Chain }; SDNode *VLdA = CurDAG->getMachineNode(QOpcodes0[OpcodeIndex], dl, - ResTy, AddrTy, MVT::Other, OpsA, 7); + ResTy, AddrTy, MVT::Other, OpsA); Chain = SDValue(VLdA, 2); // Load the odd subregs. @@ -1791,8 +1791,7 @@ SDNode *ARMDAGToDAGISel::SelectVLD(SDNode *N, bool isUpdating, unsigned NumVecs, Ops.push_back(Pred); Ops.push_back(Reg0); Ops.push_back(Chain); - VLd = CurDAG->getMachineNode(QOpcodes1[OpcodeIndex], dl, ResTys, - Ops.data(), Ops.size()); + VLd = CurDAG->getMachineNode(QOpcodes1[OpcodeIndex], dl, ResTys, Ops); } // Transfer memoperands. @@ -1913,8 +1912,7 @@ SDNode *ARMDAGToDAGISel::SelectVST(SDNode *N, bool isUpdating, unsigned NumVecs, Ops.push_back(Pred); Ops.push_back(Reg0); Ops.push_back(Chain); - SDNode *VSt = - CurDAG->getMachineNode(Opc, dl, ResTys, Ops.data(), Ops.size()); + SDNode *VSt = CurDAG->getMachineNode(Opc, dl, ResTys, Ops); // Transfer memoperands. cast<MachineSDNode>(VSt)->setMemRefs(MemOp, MemOp + 1); @@ -1939,7 +1937,7 @@ SDNode *ARMDAGToDAGISel::SelectVST(SDNode *N, bool isUpdating, unsigned NumVecs, const SDValue OpsA[] = { MemAddr, Align, Reg0, RegSeq, Pred, Reg0, Chain }; SDNode *VStA = CurDAG->getMachineNode(QOpcodes0[OpcodeIndex], dl, MemAddr.getValueType(), - MVT::Other, OpsA, 7); + MVT::Other, OpsA); cast<MachineSDNode>(VStA)->setMemRefs(MemOp, MemOp + 1); Chain = SDValue(VStA, 1); @@ -1958,7 +1956,7 @@ SDNode *ARMDAGToDAGISel::SelectVST(SDNode *N, bool isUpdating, unsigned NumVecs, Ops.push_back(Reg0); Ops.push_back(Chain); SDNode *VStB = CurDAG->getMachineNode(QOpcodes1[OpcodeIndex], dl, ResTys, - Ops.data(), Ops.size()); + Ops); cast<MachineSDNode>(VStB)->setMemRefs(MemOp, MemOp + 1); return VStB; } @@ -2063,8 +2061,7 @@ SDNode *ARMDAGToDAGISel::SelectVLDSTLane(SDNode *N, bool IsLoad, unsigned Opc = (is64BitVector ? DOpcodes[OpcodeIndex] : QOpcodes[OpcodeIndex]); - SDNode *VLdLn = CurDAG->getMachineNode(Opc, dl, ResTys, - Ops.data(), Ops.size()); + SDNode *VLdLn = CurDAG->getMachineNode(Opc, dl, ResTys, Ops); cast<MachineSDNode>(VLdLn)->setMemRefs(MemOp, MemOp + 1); if (!IsLoad) return VLdLn; @@ -2150,8 +2147,7 @@ SDNode *ARMDAGToDAGISel::SelectVLDDup(SDNode *N, bool isUpdating, if (isUpdating) ResTys.push_back(MVT::i32); ResTys.push_back(MVT::Other); - SDNode *VLdDup = - CurDAG->getMachineNode(Opc, dl, ResTys, Ops.data(), Ops.size()); + SDNode *VLdDup = CurDAG->getMachineNode(Opc, dl, ResTys, Ops); cast<MachineSDNode>(VLdDup)->setMemRefs(MemOp, MemOp + 1); SuperReg = SDValue(VLdDup, 0); @@ -2197,7 +2193,7 @@ SDNode *ARMDAGToDAGISel::SelectVTBL(SDNode *N, bool IsExt, unsigned NumVecs, Ops.push_back(N->getOperand(FirstTblReg + NumVecs)); Ops.push_back(getAL(CurDAG)); // predicate Ops.push_back(CurDAG->getRegister(0, MVT::i32)); // predicate register - return CurDAG->getMachineNode(Opc, dl, VT, Ops.data(), Ops.size()); + return CurDAG->getMachineNode(Opc, dl, VT, Ops); } SDNode *ARMDAGToDAGISel::SelectV6T2BitfieldExtractOp(SDNode *N, @@ -2542,7 +2538,7 @@ SDNode *ARMDAGToDAGISel::SelectAtomic64(SDNode *Node, unsigned Opc) { MemOp[0] = cast<MemSDNode>(Node)->getMemOperand(); SDNode *ResNode = CurDAG->getMachineNode(Opc, Node->getDebugLoc(), MVT::i32, MVT::i32, MVT::Other, - Ops.data() ,Ops.size()); + Ops); cast<MachineSDNode>(ResNode)->setMemRefs(MemOp, MemOp + 1); return ResNode; } @@ -2599,7 +2595,7 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { SDValue PredReg = CurDAG->getRegister(0, MVT::i32); SDValue Ops[] = { CPIdx, Pred, PredReg, CurDAG->getEntryNode() }; ResNode = CurDAG->getMachineNode(ARM::tLDRpci, dl, MVT::i32, MVT::Other, - Ops, 4); + Ops); } else { SDValue Ops[] = { CPIdx, @@ -2609,7 +2605,7 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { CurDAG->getEntryNode() }; ResNode=CurDAG->getMachineNode(ARM::LDRcp, dl, MVT::i32, MVT::Other, - Ops, 5); + Ops); } ReplaceUses(SDValue(N, 0), SDValue(ResNode, 0)); return NULL; @@ -2719,7 +2715,7 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { MVT::i32); SDValue Ops[] = { N0.getOperand(0), Imm16, getAL(CurDAG), CurDAG->getRegister(0, MVT::i32) }; - return CurDAG->getMachineNode(Opc, dl, VT, Ops, 4); + return CurDAG->getMachineNode(Opc, dl, VT, Ops); } } break; @@ -2733,16 +2729,15 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { break; if (Subtarget->isThumb()) { SDValue Ops[] = { N->getOperand(0), N->getOperand(1), - getAL(CurDAG), CurDAG->getRegister(0, MVT::i32), - CurDAG->getRegister(0, MVT::i32) }; - return CurDAG->getMachineNode(ARM::t2UMULL, dl, MVT::i32, MVT::i32,Ops,4); + getAL(CurDAG), CurDAG->getRegister(0, MVT::i32) }; + return CurDAG->getMachineNode(ARM::t2UMULL, dl, MVT::i32, MVT::i32, Ops); } else { SDValue Ops[] = { N->getOperand(0), N->getOperand(1), getAL(CurDAG), CurDAG->getRegister(0, MVT::i32), CurDAG->getRegister(0, MVT::i32) }; return CurDAG->getMachineNode(Subtarget->hasV6Ops() ? ARM::UMULL : ARM::UMULLv5, - dl, MVT::i32, MVT::i32, Ops, 5); + dl, MVT::i32, MVT::i32, Ops); } } case ISD::SMUL_LOHI: { @@ -2751,14 +2746,14 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { if (Subtarget->isThumb()) { SDValue Ops[] = { N->getOperand(0), N->getOperand(1), getAL(CurDAG), CurDAG->getRegister(0, MVT::i32) }; - return CurDAG->getMachineNode(ARM::t2SMULL, dl, MVT::i32, MVT::i32,Ops,4); + return CurDAG->getMachineNode(ARM::t2SMULL, dl, MVT::i32, MVT::i32, Ops); } else { SDValue Ops[] = { N->getOperand(0), N->getOperand(1), getAL(CurDAG), CurDAG->getRegister(0, MVT::i32), CurDAG->getRegister(0, MVT::i32) }; return CurDAG->getMachineNode(Subtarget->hasV6Ops() ? ARM::SMULL : ARM::SMULLv5, - dl, MVT::i32, MVT::i32, Ops, 5); + dl, MVT::i32, MVT::i32, Ops); } } case ARMISD::UMLAL:{ @@ -2766,7 +2761,7 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2), N->getOperand(3), getAL(CurDAG), CurDAG->getRegister(0, MVT::i32)}; - return CurDAG->getMachineNode(ARM::t2UMLAL, dl, MVT::i32, MVT::i32, Ops, 6); + return CurDAG->getMachineNode(ARM::t2UMLAL, dl, MVT::i32, MVT::i32, Ops); }else{ SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2), N->getOperand(3), getAL(CurDAG), @@ -2774,7 +2769,7 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { CurDAG->getRegister(0, MVT::i32) }; return CurDAG->getMachineNode(Subtarget->hasV6Ops() ? ARM::UMLAL : ARM::UMLALv5, - dl, MVT::i32, MVT::i32, Ops, 7); + dl, MVT::i32, MVT::i32, Ops); } } case ARMISD::SMLAL:{ @@ -2782,7 +2777,7 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2), N->getOperand(3), getAL(CurDAG), CurDAG->getRegister(0, MVT::i32)}; - return CurDAG->getMachineNode(ARM::t2SMLAL, dl, MVT::i32, MVT::i32, Ops, 6); + return CurDAG->getMachineNode(ARM::t2SMLAL, dl, MVT::i32, MVT::i32, Ops); }else{ SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2), N->getOperand(3), getAL(CurDAG), @@ -2790,7 +2785,7 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { CurDAG->getRegister(0, MVT::i32) }; return CurDAG->getMachineNode(Subtarget->hasV6Ops() ? ARM::SMLAL : ARM::SMLALv5, - dl, MVT::i32, MVT::i32, Ops, 7); + dl, MVT::i32, MVT::i32, Ops); } } case ISD::LOAD: { @@ -2833,7 +2828,7 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { MVT::i32); SDValue Ops[] = { N1, Tmp2, N3, Chain, InFlag }; SDNode *ResNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, - MVT::Glue, Ops, 5); + MVT::Glue, Ops); Chain = SDValue(ResNode, 0); if (N->getNumValues() == 2) { InFlag = SDValue(ResNode, 1); @@ -2863,7 +2858,7 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { SDValue Pred = getAL(CurDAG); SDValue PredReg = CurDAG->getRegister(0, MVT::i32); SDValue Ops[] = { N->getOperand(0), N->getOperand(1), Pred, PredReg }; - return CurDAG->getMachineNode(Opc, dl, VT, VT, Ops, 4); + return CurDAG->getMachineNode(Opc, dl, VT, VT, Ops); } case ARMISD::VUZP: { unsigned Opc = 0; @@ -2883,7 +2878,7 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { SDValue Pred = getAL(CurDAG); SDValue PredReg = CurDAG->getRegister(0, MVT::i32); SDValue Ops[] = { N->getOperand(0), N->getOperand(1), Pred, PredReg }; - return CurDAG->getMachineNode(Opc, dl, VT, VT, Ops, 4); + return CurDAG->getMachineNode(Opc, dl, VT, VT, Ops); } case ARMISD::VTRN: { unsigned Opc = 0; @@ -2902,7 +2897,7 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { SDValue Pred = getAL(CurDAG); SDValue PredReg = CurDAG->getRegister(0, MVT::i32); SDValue Ops[] = { N->getOperand(0), N->getOperand(1), Pred, PredReg }; - return CurDAG->getMachineNode(Opc, dl, VT, VT, Ops, 4); + return CurDAG->getMachineNode(Opc, dl, VT, VT, Ops); } case ARMISD::BUILD_VECTOR: { EVT VecVT = N->getValueType(0); @@ -3147,8 +3142,7 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { Ops.push_back(getAL(CurDAG)); Ops.push_back(CurDAG->getRegister(0, MVT::i32)); Ops.push_back(Chain); - SDNode *Ld = CurDAG->getMachineNode(NewOpc, dl, ResTys, Ops.data(), - Ops.size()); + SDNode *Ld = CurDAG->getMachineNode(NewOpc, dl, ResTys, Ops); // Transfer memoperands. MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand(); @@ -3211,8 +3205,7 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { unsigned NewOpc = isThumb ? ARM::t2STREXD : ARM::STREXD; - SDNode *St = CurDAG->getMachineNode(NewOpc, dl, ResTys, Ops.data(), - Ops.size()); + SDNode *St = CurDAG->getMachineNode(NewOpc, dl, ResTys, Ops); // Transfer memoperands. MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand(); @@ -3398,7 +3391,7 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { Ops.push_back(N->getOperand(1)); Ops.push_back(getAL(CurDAG)); // Predicate Ops.push_back(CurDAG->getRegister(0, MVT::i32)); // Predicate Register - return CurDAG->getMachineNode(ARM::VTBL1, dl, VT, Ops.data(), Ops.size()); + return CurDAG->getMachineNode(ARM::VTBL1, dl, VT, Ops); } case ARMISD::VTBL2: { DebugLoc dl = N->getDebugLoc(); @@ -3414,8 +3407,7 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { Ops.push_back(N->getOperand(2)); Ops.push_back(getAL(CurDAG)); // Predicate Ops.push_back(CurDAG->getRegister(0, MVT::i32)); // Predicate Register - return CurDAG->getMachineNode(ARM::VTBL2, dl, VT, - Ops.data(), Ops.size()); + return CurDAG->getMachineNode(ARM::VTBL2, dl, VT, Ops); } case ISD::CONCAT_VECTORS: diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp index 514971f..9475f1b 100644 --- a/lib/Target/ARM/ARMISelLowering.cpp +++ b/lib/Target/ARM/ARMISelLowering.cpp @@ -564,6 +564,16 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM) setOperationAction(ISD::FP_ROUND, MVT::v2f32, Expand); setOperationAction(ISD::FP_EXTEND, MVT::v2f64, Expand); + // Custom expand long extensions to vectors. + setOperationAction(ISD::SIGN_EXTEND, MVT::v8i32, Custom); + setOperationAction(ISD::ZERO_EXTEND, MVT::v8i32, Custom); + setOperationAction(ISD::SIGN_EXTEND, MVT::v4i64, Custom); + setOperationAction(ISD::ZERO_EXTEND, MVT::v4i64, Custom); + setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom); + setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom); + setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom); + setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom); + // NEON does not have single instruction CTPOP for vectors with element // types wider than 8-bits. However, custom lowering can leverage the // v8i8/v16i8 vcnt instruction. @@ -719,7 +729,6 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM) (Subtarget->hasV6Ops() && !Subtarget->isThumb())) { // membarrier needs custom lowering; the rest are legal and handled // normally. - setOperationAction(ISD::MEMBARRIER, MVT::Other, Custom); setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Custom); // Custom lowering for 64-bit ops setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i64, Custom); @@ -737,7 +746,6 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM) setInsertFencesForAtomic(true); } else { // Set them all for expansion, which will force libcalls. - setOperationAction(ISD::MEMBARRIER, MVT::Other, Expand); setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Expand); setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Expand); setOperationAction(ISD::ATOMIC_SWAP, MVT::i32, Expand); @@ -755,8 +763,6 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM) // Unordered/Monotonic case. setOperationAction(ISD::ATOMIC_LOAD, MVT::i32, Custom); setOperationAction(ISD::ATOMIC_STORE, MVT::i32, Custom); - // Since the libcalls include locking, fold in the fences - setShouldFoldAtomicFences(true); } setOperationAction(ISD::PREFETCH, MVT::Other, Custom); @@ -870,8 +876,6 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM) // are at least 4 bytes aligned. setMinStackArgumentAlignment(4); - BenefitFromCodePlacementOpt = true; - // Prefer likely predicted branches to selects on out-of-order cores. PredictableSelectIsExpensive = Subtarget->isLikeA9(); @@ -1230,7 +1234,8 @@ ARMTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl<ISD::InputArg> &Ins, DebugLoc dl, SelectionDAG &DAG, - SmallVectorImpl<SDValue> &InVals) const { + SmallVectorImpl<SDValue> &InVals, + bool isThisReturn, SDValue ThisVal) const { // Assign locations to each value returned by this call. SmallVector<CCValAssign, 16> RVLocs; @@ -1244,6 +1249,15 @@ ARMTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag, for (unsigned i = 0; i != RVLocs.size(); ++i) { CCValAssign VA = RVLocs[i]; + // Pass 'this' value directly from the argument to return value, to avoid + // reg unit interference + if (i == 0 && isThisReturn) { + assert(!VA.needsCustom() && VA.getLocVT() == MVT::i32 && + "unexpected return calling convention register assignment"); + InVals.push_back(ThisVal); + continue; + } + SDValue Val; if (VA.needsCustom()) { // Handle f64 or half of a v2f64. @@ -1355,21 +1369,22 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, bool isVarArg = CLI.IsVarArg; MachineFunction &MF = DAG.getMachineFunction(); - bool IsStructRet = (Outs.empty()) ? false : Outs[0].Flags.isSRet(); - bool IsSibCall = false; + bool isStructRet = (Outs.empty()) ? false : Outs[0].Flags.isSRet(); + bool isThisReturn = false; + bool isSibCall = false; // Disable tail calls if they're not supported. if (!EnableARMTailCalls && !Subtarget->supportsTailCall()) isTailCall = false; if (isTailCall) { // Check if it's really possible to do a tail call. isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv, - isVarArg, IsStructRet, MF.getFunction()->hasStructRetAttr(), + isVarArg, isStructRet, MF.getFunction()->hasStructRetAttr(), Outs, OutVals, Ins, DAG); // We don't support GuaranteedTailCallOpt for ARM, only automatically // detected sibcalls. if (isTailCall) { ++NumTailCalls; - IsSibCall = true; + isSibCall = true; } } @@ -1385,12 +1400,12 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, unsigned NumBytes = CCInfo.getNextStackOffset(); // For tail calls, memory operands are available in our caller's stack. - if (IsSibCall) + if (isSibCall) NumBytes = 0; // Adjust the stack pointer for the new arguments... // These operations are automatically eliminated by the prolog/epilog pass - if (!IsSibCall) + if (!isSibCall) Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true)); SDValue StackPtr = DAG.getCopyFromReg(Chain, dl, ARM::SP, getPointerTy()); @@ -1452,6 +1467,13 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, StackPtr, MemOpChains, Flags); } } else if (VA.isRegLoc()) { + if (realArgIdx == 0 && Flags.isReturned() && Outs[0].VT == MVT::i32) { + assert(VA.getLocVT() == MVT::i32 && + "unexpected calling convention register assignment"); + assert(!Ins.empty() && Ins[0].VT == MVT::i32 && + "unexpected use of 'returned'"); + isThisReturn = true; + } RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); } else if (isByVal) { assert(VA.isMemLoc()); @@ -1491,7 +1513,7 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, MemOpChains.push_back(DAG.getNode(ARMISD::COPY_STRUCT_BYVAL, dl, VTs, Ops, array_lengthof(Ops))); } - } else if (!IsSibCall) { + } else if (!isSibCall) { assert(VA.isMemLoc()); MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg, @@ -1531,7 +1553,7 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, RegsToPass[i].second, InFlag); InFlag = Chain.getValue(1); } - InFlag =SDValue(); + InFlag = SDValue(); } // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every @@ -1672,8 +1694,15 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, RegsToPass[i].second.getValueType())); // Add a register mask operand representing the call-preserved registers. + const uint32_t *Mask; const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo(); - const uint32_t *Mask = TRI->getCallPreservedMask(CallConv); + const ARMBaseRegisterInfo *ARI = static_cast<const ARMBaseRegisterInfo*>(TRI); + if (isThisReturn) + // For 'this' returns, use the R0-preserving mask + Mask = ARI->getThisReturnPreservedMask(CallConv); + else + Mask = ARI->getCallPreservedMask(CallConv); + assert(Mask && "Missing call preserved mask for calling convention"); Ops.push_back(DAG.getRegisterMask(Mask)); @@ -1695,8 +1724,9 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // Handle result values, copying them out of physregs into vregs that we // return. - return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins, - dl, DAG, InVals); + return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins, dl, DAG, + InVals, isThisReturn, + isThisReturn ? OutVals[0] : SDValue()); } /// HandleByVal - Every parameter *after* a byval parameter is passed @@ -1711,6 +1741,7 @@ ARMTargetLowering::HandleByVal( State->getCallOrPrologue() == Call) && "unhandled ParmContext"); if ((!State->isFirstByValRegValid()) && + (!Subtarget->isAAPCS_ABI() || State->getNextStackOffset() == 0) && (ARM::R0 <= reg) && (reg <= ARM::R3)) { if (Subtarget->isAAPCS_ABI() && Align > 4) { unsigned AlignInRegs = Align / 4; @@ -1866,7 +1897,7 @@ ARMTargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, // local frame. const ARMFunctionInfo *AFI_Caller = DAG.getMachineFunction(). getInfo<ARMFunctionInfo>(); - if (AFI_Caller->getVarArgsRegSaveSize()) + if (AFI_Caller->getArgRegsSaveSize()) return false; // If the callee takes no arguments then go on to check the results of the @@ -2453,35 +2484,6 @@ ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG, } } -static SDValue LowerMEMBARRIER(SDValue Op, SelectionDAG &DAG, - const ARMSubtarget *Subtarget) { - DebugLoc dl = Op.getDebugLoc(); - if (!Subtarget->hasDataBarrier()) { - // Some ARMv6 cpus can support data barriers with an mcr instruction. - // Thumb1 and pre-v6 ARM mode use a libcall instead and should never get - // here. - assert(Subtarget->hasV6Ops() && !Subtarget->isThumb() && - "Unexpected ISD::MEMBARRIER encountered. Should be libcall!"); - return DAG.getNode(ARMISD::MEMBARRIER_MCR, dl, MVT::Other, Op.getOperand(0), - DAG.getConstant(0, MVT::i32)); - } - - SDValue Op5 = Op.getOperand(5); - bool isDeviceBarrier = cast<ConstantSDNode>(Op5)->getZExtValue() != 0; - unsigned isLL = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); - unsigned isLS = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue(); - bool isOnlyStoreBarrier = (isLL == 0 && isLS == 0); - - ARM_MB::MemBOpt DMBOpt; - if (isDeviceBarrier) - DMBOpt = isOnlyStoreBarrier ? ARM_MB::ST : ARM_MB::SY; - else - DMBOpt = isOnlyStoreBarrier ? ARM_MB::ISHST : ARM_MB::ISH; - return DAG.getNode(ARMISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0), - DAG.getConstant(DMBOpt, MVT::i32)); -} - - static SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget) { // FIXME: handle "fence singlethread" more efficiently. @@ -2578,7 +2580,8 @@ ARMTargetLowering::GetF64FormalArgument(CCValAssign &VA, CCValAssign &NextVA, void ARMTargetLowering::computeRegArea(CCState &CCInfo, MachineFunction &MF, - unsigned &VARegSize, unsigned &VARegSaveSize) + unsigned &ArgRegsSize, + unsigned &ArgRegsSaveSize) const { unsigned NumGPRs; if (CCInfo.isFirstByValRegValid()) @@ -2592,8 +2595,8 @@ ARMTargetLowering::computeRegArea(CCState &CCInfo, MachineFunction &MF, } unsigned Align = MF.getTarget().getFrameLowering()->getStackAlignment(); - VARegSize = NumGPRs * 4; - VARegSaveSize = (VARegSize + Align - 1) & ~(Align - 1); + ArgRegsSize = NumGPRs * 4; + ArgRegsSaveSize = (ArgRegsSize + Align - 1) & ~(Align - 1); } // The remaining GPRs hold either the beginning of variable-argument @@ -2603,13 +2606,26 @@ ARMTargetLowering::computeRegArea(CCState &CCInfo, MachineFunction &MF, // If this is a variadic function, the va_list pointer will begin with // these values; otherwise, this reassembles a (byval) structure that // was split between registers and memory. -void -ARMTargetLowering::VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG, - DebugLoc dl, SDValue &Chain, - const Value *OrigArg, - unsigned OffsetFromOrigArg, - unsigned ArgOffset, - bool ForceMutable) const { +// Return: The frame index registers were stored into. +int +ARMTargetLowering::StoreByValRegs(CCState &CCInfo, SelectionDAG &DAG, + DebugLoc dl, SDValue &Chain, + const Value *OrigArg, + unsigned OffsetFromOrigArg, + unsigned ArgOffset, + bool ForceMutable) const { + + // Currently, two use-cases possible: + // Case #1. Non var-args function, and we meet first byval parameter. + // Setup first unallocated register as first byval register; + // eat all remained registers + // (these two actions are performed by HandleByVal method). + // Then, here, we initialize stack frame with + // "store-reg" instructions. + // Case #2. Var-args function, that doesn't contain byval parameters. + // The same: eat all remained unallocated registers, + // initialize stack frame. + MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo *MFI = MF.getFrameInfo(); ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); @@ -2621,19 +2637,22 @@ ARMTargetLowering::VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG, (GPRArgRegs, sizeof(GPRArgRegs) / sizeof(GPRArgRegs[0])); } - unsigned VARegSize, VARegSaveSize; - computeRegArea(CCInfo, MF, VARegSize, VARegSaveSize); - if (VARegSaveSize) { - // If this function is vararg, store any remaining integer argument regs - // to their spots on the stack so that they may be loaded by deferencing - // the result of va_next. - AFI->setVarArgsRegSaveSize(VARegSaveSize); - AFI->setVarArgsFrameIndex(MFI->CreateFixedObject(VARegSaveSize, - ArgOffset + VARegSaveSize - - VARegSize, - false)); - SDValue FIN = DAG.getFrameIndex(AFI->getVarArgsFrameIndex(), - getPointerTy()); + unsigned ArgRegsSize, ArgRegsSaveSize; + computeRegArea(CCInfo, MF, ArgRegsSize, ArgRegsSaveSize); + + // Store any by-val regs to their spots on the stack so that they may be + // loaded by deferencing the result of formal parameter pointer or va_next. + // Note: once stack area for byval/varargs registers + // was initialized, it can't be initialized again. + if (!AFI->getArgRegsSaveSize() && ArgRegsSaveSize) { + + AFI->setArgRegsSaveSize(ArgRegsSaveSize); + + int FrameIndex = MFI->CreateFixedObject( + ArgRegsSaveSize, + ArgOffset + ArgRegsSaveSize - ArgRegsSize, + false); + SDValue FIN = DAG.getFrameIndex(FrameIndex, getPointerTy()); SmallVector<SDValue, 4> MemOps; for (unsigned i = 0; firstRegToSaveIndex < 4; ++firstRegToSaveIndex, ++i) { @@ -2656,10 +2675,30 @@ ARMTargetLowering::VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG, if (!MemOps.empty()) Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &MemOps[0], MemOps.size()); + return FrameIndex; } else // This will point to the next argument passed via stack. - AFI->setVarArgsFrameIndex( - MFI->CreateFixedObject(4, ArgOffset, !ForceMutable)); + return MFI->CreateFixedObject(4, ArgOffset, !ForceMutable); +} + +// Setup stack frame, the va_list pointer will start from. +void +ARMTargetLowering::VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG, + DebugLoc dl, SDValue &Chain, + unsigned ArgOffset, + bool ForceMutable) const { + MachineFunction &MF = DAG.getMachineFunction(); + ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); + + // Try to store any remaining integer argument regs + // to their spots on the stack so that they may be loaded by deferencing + // the result of va_next. + // If there is no regs to be stored, just point address after last + // argument passed via stack. + int FrameIndex = + StoreByValRegs(CCInfo, DAG, dl, Chain, 0, 0, ArgOffset, ForceMutable); + + AFI->setVarArgsFrameIndex(FrameIndex); } SDValue @@ -2785,20 +2824,12 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain, // Since they could be overwritten by lowering of arguments in case of // a tail call. if (Flags.isByVal()) { - ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); - if (!AFI->getVarArgsFrameIndex()) { - VarArgStyleRegisters(CCInfo, DAG, - dl, Chain, CurOrigArg, - Ins[VA.getValNo()].PartOffset, - VA.getLocMemOffset(), - true /*force mutable frames*/); - int VAFrameIndex = AFI->getVarArgsFrameIndex(); - InVals.push_back(DAG.getFrameIndex(VAFrameIndex, getPointerTy())); - } else { - int FI = MFI->CreateFixedObject(Flags.getByValSize(), - VA.getLocMemOffset(), false); - InVals.push_back(DAG.getFrameIndex(FI, getPointerTy())); - } + int FrameIndex = StoreByValRegs( + CCInfo, DAG, dl, Chain, CurOrigArg, + Ins[VA.getValNo()].PartOffset, + VA.getLocMemOffset(), + true /*force mutable frames*/); + InVals.push_back(DAG.getFrameIndex(FrameIndex, getPointerTy())); } else { int FI = MFI->CreateFixedObject(VA.getLocVT().getSizeInBits()/8, VA.getLocMemOffset(), true); @@ -2816,7 +2847,7 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain, // varargs if (isVarArg) - VarArgStyleRegisters(CCInfo, DAG, dl, Chain, 0, 0, + VarArgStyleRegisters(CCInfo, DAG, dl, Chain, CCInfo.getNextStackOffset()); return Chain; @@ -3433,6 +3464,47 @@ SDValue ARMTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { return FrameAddr; } +/// Custom Expand long vector extensions, where size(DestVec) > 2*size(SrcVec), +/// and size(DestVec) > 128-bits. +/// This is achieved by doing the one extension from the SrcVec, splitting the +/// result, extending these parts, and then concatenating these into the +/// destination. +static SDValue ExpandVectorExtension(SDNode *N, SelectionDAG &DAG) { + SDValue Op = N->getOperand(0); + EVT SrcVT = Op.getValueType(); + EVT DestVT = N->getValueType(0); + + assert(DestVT.getSizeInBits() > 128 && + "Custom sext/zext expansion needs >128-bit vector."); + // If this is a normal length extension, use the default expansion. + if (SrcVT.getSizeInBits()*4 != DestVT.getSizeInBits() && + SrcVT.getSizeInBits()*8 != DestVT.getSizeInBits()) + return SDValue(); + + DebugLoc dl = N->getDebugLoc(); + unsigned SrcEltSize = SrcVT.getVectorElementType().getSizeInBits(); + unsigned DestEltSize = DestVT.getVectorElementType().getSizeInBits(); + unsigned NumElts = SrcVT.getVectorNumElements(); + LLVMContext &Ctx = *DAG.getContext(); + SDValue Mid, SplitLo, SplitHi, ExtLo, ExtHi; + + EVT MidVT = EVT::getVectorVT(Ctx, EVT::getIntegerVT(Ctx, SrcEltSize*2), + NumElts); + EVT SplitVT = EVT::getVectorVT(Ctx, EVT::getIntegerVT(Ctx, SrcEltSize*2), + NumElts/2); + EVT ExtVT = EVT::getVectorVT(Ctx, EVT::getIntegerVT(Ctx, DestEltSize), + NumElts/2); + + Mid = DAG.getNode(N->getOpcode(), dl, MidVT, Op); + SplitLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SplitVT, Mid, + DAG.getIntPtrConstant(0)); + SplitHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SplitVT, Mid, + DAG.getIntPtrConstant(NumElts/2)); + ExtLo = DAG.getNode(N->getOpcode(), dl, ExtVT, SplitLo); + ExtHi = DAG.getNode(N->getOpcode(), dl, ExtVT, SplitHi); + return DAG.getNode(ISD::CONCAT_VECTORS, dl, DestVT, ExtLo, ExtHi); +} + /// ExpandBITCAST - If the target supports VFP, this function is called to /// expand a bit convert where either the source or destination type is i64 to /// use a VMOVDRR or VMOVRRD node. This should not be done when the non-i64 @@ -5565,7 +5637,6 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::BR_CC: return LowerBR_CC(Op, DAG); case ISD::BR_JT: return LowerBR_JT(Op, DAG); case ISD::VASTART: return LowerVASTART(Op, DAG); - case ISD::MEMBARRIER: return LowerMEMBARRIER(Op, DAG, Subtarget); case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, DAG, Subtarget); case ISD::PREFETCH: return LowerPREFETCH(Op, DAG, Subtarget); case ISD::SINT_TO_FP: @@ -5621,6 +5692,10 @@ void ARMTargetLowering::ReplaceNodeResults(SDNode *N, case ISD::BITCAST: Res = ExpandBITCAST(N, DAG); break; + case ISD::SIGN_EXTEND: + case ISD::ZERO_EXTEND: + Res = ExpandVectorExtension(N, DAG); + break; case ISD::SRL: case ISD::SRA: Res = Expand64BitShift(N, DAG, Subtarget); diff --git a/lib/Target/ARM/ARMISelLowering.h b/lib/Target/ARM/ARMISelLowering.h index 9ee17f0..46b8438 100644 --- a/lib/Target/ARM/ARMISelLowering.h +++ b/lib/Target/ARM/ARMISelLowering.h @@ -464,7 +464,8 @@ namespace llvm { CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl<ISD::InputArg> &Ins, DebugLoc dl, SelectionDAG &DAG, - SmallVectorImpl<SDValue> &InVals) const; + SmallVectorImpl<SDValue> &InVals, + bool isThisReturn, SDValue ThisVal) const; virtual SDValue LowerFormalArguments(SDValue Chain, @@ -473,16 +474,21 @@ namespace llvm { DebugLoc dl, SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const; + int StoreByValRegs(CCState &CCInfo, SelectionDAG &DAG, + DebugLoc dl, SDValue &Chain, + const Value *OrigArg, + unsigned OffsetFromOrigArg, + unsigned ArgOffset, + bool ForceMutable) const; + void VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG, DebugLoc dl, SDValue &Chain, - const Value *OrigArg, - unsigned OffsetFromOrigArg, unsigned ArgOffset, - bool ForceMutable = false) - const; + bool ForceMutable = false) const; void computeRegArea(CCState &CCInfo, MachineFunction &MF, - unsigned &VARegSize, unsigned &VARegSaveSize) const; + unsigned &ArgRegsSize, + unsigned &ArgRegsSaveSize) const; virtual SDValue LowerCall(TargetLowering::CallLoweringInfo &CLI, diff --git a/lib/Target/ARM/ARMInstrInfo.td b/lib/Target/ARM/ARMInstrInfo.td index 9409f35..1bd174e 100644 --- a/lib/Target/ARM/ARMInstrInfo.td +++ b/lib/Target/ARM/ARMInstrInfo.td @@ -221,6 +221,9 @@ def HasDB : Predicate<"Subtarget->hasDataBarrier()">, def HasMP : Predicate<"Subtarget->hasMPExtension()">, AssemblerPredicate<"FeatureMP", "mp-extensions">; +def HasTrustZone : Predicate<"Subtarget->hasTrustZone()">, + AssemblerPredicate<"FeatureTrustZone", + "TrustZone">; def UseNEONForFP : Predicate<"Subtarget->useNEONForSinglePrecisionFP()">; def DontUseNEONForFP : Predicate<"!Subtarget->useNEONForSinglePrecisionFP()">; def IsThumb : Predicate<"Subtarget->isThumb()">, @@ -578,6 +581,17 @@ def imm0_1 : Operand<i32> { let ParserMatchClass = Imm0_1AsmOperand; } def Imm0_3AsmOperand: ImmAsmOperand { let Name = "Imm0_3"; } def imm0_3 : Operand<i32> { let ParserMatchClass = Imm0_3AsmOperand; } +/// imm0_4 predicate - Immediate in the range [0,4]. +def Imm0_4AsmOperand : ImmAsmOperand +{ + let Name = "Imm0_4"; + let DiagnosticType = "ImmRange0_4"; +} +def imm0_4 : Operand<i32>, ImmLeaf<i32, [{ return Imm >= 0 && Imm < 5; }]> { + let ParserMatchClass = Imm0_4AsmOperand; + let DecoderMethod = "DecodeImm0_4"; +} + /// imm0_7 predicate - Immediate in the range [0,7]. def Imm0_7AsmOperand: ImmAsmOperand { let Name = "Imm0_7"; } def imm0_7 : Operand<i32>, ImmLeaf<i32, [{ @@ -741,18 +755,26 @@ def imm1_16 : Operand<i32>, PatLeaf<(imm), [{ return Imm > 0 && Imm <= 16; }], // addrmode_imm12 := reg +/- imm12 // def MemImm12OffsetAsmOperand : AsmOperandClass { let Name = "MemImm12Offset"; } -def addrmode_imm12 : Operand<i32>, +class AddrMode_Imm12 : Operand<i32>, ComplexPattern<i32, 2, "SelectAddrModeImm12", []> { // 12-bit immediate operand. Note that instructions using this encode // #0 and #-0 differently. We flag #-0 as the magic value INT32_MIN. All other // immediate values are as normal. let EncoderMethod = "getAddrModeImm12OpValue"; - let PrintMethod = "printAddrModeImm12Operand"; let DecoderMethod = "DecodeAddrModeImm12Operand"; let ParserMatchClass = MemImm12OffsetAsmOperand; let MIOperandInfo = (ops GPR:$base, i32imm:$offsimm); } + +def addrmode_imm12 : AddrMode_Imm12 { + let PrintMethod = "printAddrModeImm12Operand<false>"; +} + +def addrmode_imm12_pre : AddrMode_Imm12 { + let PrintMethod = "printAddrModeImm12Operand<true>"; +} + // ldst_so_reg := reg +/- reg shop imm // def MemRegOffsetAsmOperand : AsmOperandClass { let Name = "MemRegOffset"; } @@ -852,14 +874,23 @@ def am2offset_imm : Operand<i32>, // // FIXME: split into imm vs. reg versions. def AddrMode3AsmOperand : AsmOperandClass { let Name = "AddrMode3"; } -def addrmode3 : Operand<i32>, - ComplexPattern<i32, 3, "SelectAddrMode3", []> { +class AddrMode3 : Operand<i32>, + ComplexPattern<i32, 3, "SelectAddrMode3", []> { let EncoderMethod = "getAddrMode3OpValue"; - let PrintMethod = "printAddrMode3Operand"; let ParserMatchClass = AddrMode3AsmOperand; let MIOperandInfo = (ops GPR:$base, GPR:$offsreg, i32imm:$offsimm); } +def addrmode3 : AddrMode3 +{ + let PrintMethod = "printAddrMode3Operand<false>"; +} + +def addrmode3_pre : AddrMode3 +{ + let PrintMethod = "printAddrMode3Operand<true>"; +} + // FIXME: split into imm vs. reg versions. // FIXME: parser method to handle +/- register. def AM3OffsetAsmOperand : AsmOperandClass { @@ -885,15 +916,22 @@ def ldstm_mode : OptionalDefOperand<OtherVT, (ops i32), (ops (i32 1))> { // addrmode5 := reg +/- imm8*4 // def AddrMode5AsmOperand : AsmOperandClass { let Name = "AddrMode5"; } -def addrmode5 : Operand<i32>, - ComplexPattern<i32, 2, "SelectAddrMode5", []> { - let PrintMethod = "printAddrMode5Operand"; +class AddrMode5 : Operand<i32>, + ComplexPattern<i32, 2, "SelectAddrMode5", []> { let EncoderMethod = "getAddrMode5OpValue"; let DecoderMethod = "DecodeAddrMode5Operand"; let ParserMatchClass = AddrMode5AsmOperand; let MIOperandInfo = (ops GPR:$base, i32imm); } +def addrmode5 : AddrMode5 { + let PrintMethod = "printAddrMode5Operand<false>"; +} + +def addrmode5_pre : AddrMode5 { + let PrintMethod = "printAddrMode5Operand<true>"; +} + // addrmode6 := reg with optional alignment // def AddrMode6AsmOperand : AsmOperandClass { let Name = "AlignedMemory"; } @@ -1010,7 +1048,8 @@ multiclass AsI1_bin_irs<bits<4> opcod, string opc, let isReMaterializable = 1 in { def ri : AsI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, so_imm:$imm), DPFrm, iii, opc, "\t$Rd, $Rn, $imm", - [(set GPR:$Rd, (opnode GPR:$Rn, so_imm:$imm))]> { + [(set GPR:$Rd, (opnode GPR:$Rn, so_imm:$imm))]>, + Sched<[WriteALU, ReadALU]> { bits<4> Rd; bits<4> Rn; bits<12> imm; @@ -1022,7 +1061,8 @@ multiclass AsI1_bin_irs<bits<4> opcod, string opc, } def rr : AsI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), DPFrm, iir, opc, "\t$Rd, $Rn, $Rm", - [(set GPR:$Rd, (opnode GPR:$Rn, GPR:$Rm))]> { + [(set GPR:$Rd, (opnode GPR:$Rn, GPR:$Rm))]>, + Sched<[WriteALU, ReadALU, ReadALU]> { bits<4> Rd; bits<4> Rn; bits<4> Rm; @@ -1037,7 +1077,8 @@ multiclass AsI1_bin_irs<bits<4> opcod, string opc, def rsi : AsI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, so_reg_imm:$shift), DPSoRegImmFrm, iis, opc, "\t$Rd, $Rn, $shift", - [(set GPR:$Rd, (opnode GPR:$Rn, so_reg_imm:$shift))]> { + [(set GPR:$Rd, (opnode GPR:$Rn, so_reg_imm:$shift))]>, + Sched<[WriteALUsi, ReadALU]> { bits<4> Rd; bits<4> Rn; bits<12> shift; @@ -1052,7 +1093,8 @@ multiclass AsI1_bin_irs<bits<4> opcod, string opc, def rsr : AsI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, so_reg_reg:$shift), DPSoRegRegFrm, iis, opc, "\t$Rd, $Rn, $shift", - [(set GPR:$Rd, (opnode GPR:$Rn, so_reg_reg:$shift))]> { + [(set GPR:$Rd, (opnode GPR:$Rn, so_reg_reg:$shift))]>, + Sched<[WriteALUsr, ReadALUsr]> { bits<4> Rd; bits<4> Rn; bits<12> shift; @@ -1079,7 +1121,8 @@ multiclass AsI1_rbin_irs<bits<4> opcod, string opc, let isReMaterializable = 1 in { def ri : AsI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, so_imm:$imm), DPFrm, iii, opc, "\t$Rd, $Rn, $imm", - [(set GPR:$Rd, (opnode so_imm:$imm, GPR:$Rn))]> { + [(set GPR:$Rd, (opnode so_imm:$imm, GPR:$Rn))]>, + Sched<[WriteALU, ReadALU]> { bits<4> Rd; bits<4> Rn; bits<12> imm; @@ -1091,7 +1134,8 @@ multiclass AsI1_rbin_irs<bits<4> opcod, string opc, } def rr : AsI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), DPFrm, iir, opc, "\t$Rd, $Rn, $Rm", - [/* pattern left blank */]> { + [/* pattern left blank */]>, + Sched<[WriteALU, ReadALU, ReadALU]> { bits<4> Rd; bits<4> Rn; bits<4> Rm; @@ -1105,7 +1149,8 @@ multiclass AsI1_rbin_irs<bits<4> opcod, string opc, def rsi : AsI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, so_reg_imm:$shift), DPSoRegImmFrm, iis, opc, "\t$Rd, $Rn, $shift", - [(set GPR:$Rd, (opnode so_reg_imm:$shift, GPR:$Rn))]> { + [(set GPR:$Rd, (opnode so_reg_imm:$shift, GPR:$Rn))]>, + Sched<[WriteALUsi, ReadALU]> { bits<4> Rd; bits<4> Rn; bits<12> shift; @@ -1120,7 +1165,8 @@ multiclass AsI1_rbin_irs<bits<4> opcod, string opc, def rsr : AsI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, so_reg_reg:$shift), DPSoRegRegFrm, iis, opc, "\t$Rd, $Rn, $shift", - [(set GPR:$Rd, (opnode so_reg_reg:$shift, GPR:$Rn))]> { + [(set GPR:$Rd, (opnode so_reg_reg:$shift, GPR:$Rn))]>, + Sched<[WriteALUsr, ReadALUsr]> { bits<4> Rd; bits<4> Rn; bits<12> shift; @@ -1145,24 +1191,28 @@ multiclass AsI1_bin_s_irs<InstrItinClass iii, InstrItinClass iir, bit Commutable = 0> { def ri : ARMPseudoInst<(outs GPR:$Rd), (ins GPR:$Rn, so_imm:$imm, pred:$p), 4, iii, - [(set GPR:$Rd, CPSR, (opnode GPR:$Rn, so_imm:$imm))]>; + [(set GPR:$Rd, CPSR, (opnode GPR:$Rn, so_imm:$imm))]>, + Sched<[WriteALU, ReadALU]>; def rr : ARMPseudoInst<(outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm, pred:$p), 4, iir, - [(set GPR:$Rd, CPSR, (opnode GPR:$Rn, GPR:$Rm))]> { + [(set GPR:$Rd, CPSR, (opnode GPR:$Rn, GPR:$Rm))]>, + Sched<[WriteALU, ReadALU, ReadALU]> { let isCommutable = Commutable; } def rsi : ARMPseudoInst<(outs GPR:$Rd), (ins GPR:$Rn, so_reg_imm:$shift, pred:$p), 4, iis, [(set GPR:$Rd, CPSR, (opnode GPR:$Rn, - so_reg_imm:$shift))]>; + so_reg_imm:$shift))]>, + Sched<[WriteALUsi, ReadALU]>; def rsr : ARMPseudoInst<(outs GPR:$Rd), (ins GPR:$Rn, so_reg_reg:$shift, pred:$p), 4, iis, [(set GPR:$Rd, CPSR, (opnode GPR:$Rn, - so_reg_reg:$shift))]>; + so_reg_reg:$shift))]>, + Sched<[WriteALUSsr, ReadALUsr]>; } } @@ -1174,19 +1224,22 @@ multiclass AsI1_rbin_s_is<InstrItinClass iii, InstrItinClass iir, bit Commutable = 0> { def ri : ARMPseudoInst<(outs GPR:$Rd), (ins GPR:$Rn, so_imm:$imm, pred:$p), 4, iii, - [(set GPR:$Rd, CPSR, (opnode so_imm:$imm, GPR:$Rn))]>; + [(set GPR:$Rd, CPSR, (opnode so_imm:$imm, GPR:$Rn))]>, + Sched<[WriteALU, ReadALU]>; def rsi : ARMPseudoInst<(outs GPR:$Rd), (ins GPR:$Rn, so_reg_imm:$shift, pred:$p), 4, iis, [(set GPR:$Rd, CPSR, (opnode so_reg_imm:$shift, - GPR:$Rn))]>; + GPR:$Rn))]>, + Sched<[WriteALUsi, ReadALU]>; def rsr : ARMPseudoInst<(outs GPR:$Rd), (ins GPR:$Rn, so_reg_reg:$shift, pred:$p), 4, iis, [(set GPR:$Rd, CPSR, (opnode so_reg_reg:$shift, - GPR:$Rn))]>; + GPR:$Rn))]>, + Sched<[WriteALUSsr, ReadALUsr]>; } } @@ -1199,7 +1252,8 @@ multiclass AI1_cmp_irs<bits<4> opcod, string opc, PatFrag opnode, bit Commutable = 0> { def ri : AI1<opcod, (outs), (ins GPR:$Rn, so_imm:$imm), DPFrm, iii, opc, "\t$Rn, $imm", - [(opnode GPR:$Rn, so_imm:$imm)]> { + [(opnode GPR:$Rn, so_imm:$imm)]>, + Sched<[WriteCMP, ReadALU]> { bits<4> Rn; bits<12> imm; let Inst{25} = 1; @@ -1212,7 +1266,8 @@ multiclass AI1_cmp_irs<bits<4> opcod, string opc, } def rr : AI1<opcod, (outs), (ins GPR:$Rn, GPR:$Rm), DPFrm, iir, opc, "\t$Rn, $Rm", - [(opnode GPR:$Rn, GPR:$Rm)]> { + [(opnode GPR:$Rn, GPR:$Rm)]>, + Sched<[WriteCMP, ReadALU, ReadALU]> { bits<4> Rn; bits<4> Rm; let isCommutable = Commutable; @@ -1228,7 +1283,8 @@ multiclass AI1_cmp_irs<bits<4> opcod, string opc, def rsi : AI1<opcod, (outs), (ins GPR:$Rn, so_reg_imm:$shift), DPSoRegImmFrm, iis, opc, "\t$Rn, $shift", - [(opnode GPR:$Rn, so_reg_imm:$shift)]> { + [(opnode GPR:$Rn, so_reg_imm:$shift)]>, + Sched<[WriteCMPsi, ReadALU]> { bits<4> Rn; bits<12> shift; let Inst{25} = 0; @@ -1244,7 +1300,8 @@ multiclass AI1_cmp_irs<bits<4> opcod, string opc, def rsr : AI1<opcod, (outs), (ins GPRnopc:$Rn, so_reg_reg:$shift), DPSoRegRegFrm, iis, opc, "\t$Rn, $shift", - [(opnode GPRnopc:$Rn, so_reg_reg:$shift)]> { + [(opnode GPRnopc:$Rn, so_reg_reg:$shift)]>, + Sched<[WriteCMPsr, ReadALU]> { bits<4> Rn; bits<12> shift; let Inst{25} = 0; @@ -1326,7 +1383,8 @@ multiclass AI1_adde_sube_irs<bits<4> opcod, string opc, PatFrag opnode, def ri : AsI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, so_imm:$imm), DPFrm, IIC_iALUi, opc, "\t$Rd, $Rn, $imm", [(set GPR:$Rd, CPSR, (opnode GPR:$Rn, so_imm:$imm, CPSR))]>, - Requires<[IsARM]> { + Requires<[IsARM]>, + Sched<[WriteALU, ReadALU]> { bits<4> Rd; bits<4> Rn; bits<12> imm; @@ -1338,7 +1396,8 @@ multiclass AI1_adde_sube_irs<bits<4> opcod, string opc, PatFrag opnode, def rr : AsI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), DPFrm, IIC_iALUr, opc, "\t$Rd, $Rn, $Rm", [(set GPR:$Rd, CPSR, (opnode GPR:$Rn, GPR:$Rm, CPSR))]>, - Requires<[IsARM]> { + Requires<[IsARM]>, + Sched<[WriteALU, ReadALU, ReadALU]> { bits<4> Rd; bits<4> Rn; bits<4> Rm; @@ -1353,7 +1412,8 @@ multiclass AI1_adde_sube_irs<bits<4> opcod, string opc, PatFrag opnode, (ins GPR:$Rn, so_reg_imm:$shift), DPSoRegImmFrm, IIC_iALUsr, opc, "\t$Rd, $Rn, $shift", [(set GPR:$Rd, CPSR, (opnode GPR:$Rn, so_reg_imm:$shift, CPSR))]>, - Requires<[IsARM]> { + Requires<[IsARM]>, + Sched<[WriteALUsi, ReadALU]> { bits<4> Rd; bits<4> Rn; bits<12> shift; @@ -1369,7 +1429,8 @@ multiclass AI1_adde_sube_irs<bits<4> opcod, string opc, PatFrag opnode, DPSoRegRegFrm, IIC_iALUsr, opc, "\t$Rd, $Rn, $shift", [(set GPRnopc:$Rd, CPSR, (opnode GPRnopc:$Rn, so_reg_reg:$shift, CPSR))]>, - Requires<[IsARM]> { + Requires<[IsARM]>, + Sched<[WriteALUsr, ReadALUsr]> { bits<4> Rd; bits<4> Rn; bits<12> shift; @@ -1392,7 +1453,8 @@ multiclass AI1_rsc_irs<bits<4> opcod, string opc, PatFrag opnode> { def ri : AsI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, so_imm:$imm), DPFrm, IIC_iALUi, opc, "\t$Rd, $Rn, $imm", [(set GPR:$Rd, CPSR, (opnode so_imm:$imm, GPR:$Rn, CPSR))]>, - Requires<[IsARM]> { + Requires<[IsARM]>, + Sched<[WriteALU, ReadALU]> { bits<4> Rd; bits<4> Rn; bits<12> imm; @@ -1403,7 +1465,8 @@ multiclass AI1_rsc_irs<bits<4> opcod, string opc, PatFrag opnode> { } def rr : AsI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), DPFrm, IIC_iALUr, opc, "\t$Rd, $Rn, $Rm", - [/* pattern left blank */]> { + [/* pattern left blank */]>, + Sched<[WriteALU, ReadALU, ReadALU]> { bits<4> Rd; bits<4> Rn; bits<4> Rm; @@ -1416,7 +1479,8 @@ multiclass AI1_rsc_irs<bits<4> opcod, string opc, PatFrag opnode> { def rsi : AsI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, so_reg_imm:$shift), DPSoRegImmFrm, IIC_iALUsr, opc, "\t$Rd, $Rn, $shift", [(set GPR:$Rd, CPSR, (opnode so_reg_imm:$shift, GPR:$Rn, CPSR))]>, - Requires<[IsARM]> { + Requires<[IsARM]>, + Sched<[WriteALUsi, ReadALU]> { bits<4> Rd; bits<4> Rn; bits<12> shift; @@ -1430,7 +1494,8 @@ multiclass AI1_rsc_irs<bits<4> opcod, string opc, PatFrag opnode> { def rsr : AsI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, so_reg_reg:$shift), DPSoRegRegFrm, IIC_iALUsr, opc, "\t$Rd, $Rn, $shift", [(set GPR:$Rd, CPSR, (opnode so_reg_reg:$shift, GPR:$Rn, CPSR))]>, - Requires<[IsARM]> { + Requires<[IsARM]>, + Sched<[WriteALUsr, ReadALUsr]> { bits<4> Rd; bits<4> Rn; bits<12> shift; @@ -1641,11 +1706,11 @@ def ATOMUMAX6432 : PseudoInst<(outs GPR:$dst1, GPR:$dst2), NoItinerary, []>; } -def HINT : AI<(outs), (ins imm0_255:$imm), MiscFrm, NoItinerary, +def HINT : AI<(outs), (ins imm0_4:$imm), MiscFrm, NoItinerary, "hint", "\t$imm", []>, Requires<[IsARM, HasV6]> { - bits<8> imm; - let Inst{27-8} = 0b00110010000011110000; - let Inst{7-0} = imm; + bits<3> imm; + let Inst{27-3} = 0b0011001000001111000000000; + let Inst{2-0} = imm; } def : InstAlias<"nop$p", (HINT 0, pred:$p)>, Requires<[IsARM, HasV6T2]>; @@ -1842,7 +1907,8 @@ let neverHasSideEffects = 1, isReMaterializable = 1 in // the instruction. The {24-21} opcode bits are set by the fixup, as we don't // know until then which form of the instruction will be used. def ADR : AI1<{0,?,?,0}, (outs GPR:$Rd), (ins adrlabel:$label), - MiscFrm, IIC_iALUi, "adr", "\t$Rd, $label", []> { + MiscFrm, IIC_iALUi, "adr", "\t$Rd, $label", []>, + Sched<[WriteALU, ReadALU]> { bits<4> Rd; bits<14> label; let Inst{27-25} = 0b001; @@ -2049,7 +2115,7 @@ let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [SP] in { // Secure Monitor Call is a system instruction. def SMC : ABI<0b0001, (outs), (ins imm0_15:$opt), NoItinerary, "smc", "\t$opt", - []> { + []>, Requires<[IsARM, HasTrustZone]> { bits<4> opt; let Inst{23-4} = 0b01100000000000000111; let Inst{3-0} = opt; @@ -2210,7 +2276,7 @@ def LDRD : AI3ld<0b1101, 0, (outs GPR:$Rd, GPR:$dst2), multiclass AI2_ldridx<bit isByte, string opc, InstrItinClass iii, InstrItinClass iir> { def _PRE_IMM : AI2ldstidx<1, isByte, 1, (outs GPR:$Rt, GPR:$Rn_wb), - (ins addrmode_imm12:$addr), IndexModePre, LdFrm, iii, + (ins addrmode_imm12_pre:$addr), IndexModePre, LdFrm, iii, opc, "\t$Rt, $addr!", "$addr.base = $Rn_wb", []> { bits<17> addr; let Inst{25} = 0; @@ -2247,6 +2313,7 @@ multiclass AI2_ldridx<bit isByte, string opc, let Inst{23} = offset{12}; let Inst{19-16} = addr; let Inst{11-0} = offset{11-0}; + let Inst{4} = 0; let DecoderMethod = "DecodeAddrMode2IdxInstruction"; } @@ -2279,7 +2346,7 @@ defm LDRB : AI2_ldridx<1, "ldrb", IIC_iLoad_bh_iu, IIC_iLoad_bh_ru>; multiclass AI3_ldridx<bits<4> op, string opc, InstrItinClass itin> { def _PRE : AI3ldstidx<op, 1, 1, (outs GPR:$Rt, GPR:$Rn_wb), - (ins addrmode3:$addr), IndexModePre, + (ins addrmode3_pre:$addr), IndexModePre, LdMiscFrm, itin, opc, "\t$Rt, $addr!", "$addr.base = $Rn_wb", []> { bits<14> addr; @@ -2313,7 +2380,7 @@ defm LDRSH : AI3_ldridx<0b1111, "ldrsh", IIC_iLoad_bh_ru>; defm LDRSB : AI3_ldridx<0b1101, "ldrsb", IIC_iLoad_bh_ru>; let hasExtraDefRegAllocReq = 1 in { def LDRD_PRE : AI3ldstidx<0b1101, 0, 1, (outs GPR:$Rt, GPR:$Rt2, GPR:$Rn_wb), - (ins addrmode3:$addr), IndexModePre, + (ins addrmode3_pre:$addr), IndexModePre, LdMiscFrm, IIC_iLoad_d_ru, "ldrd", "\t$Rt, $Rt2, $addr!", "$addr.base = $Rn_wb", []> { @@ -2469,7 +2536,7 @@ def STRD : AI3str<0b1111, (outs), (ins GPR:$Rt, GPR:$src2, addrmode3:$addr), multiclass AI2_stridx<bit isByte, string opc, InstrItinClass iii, InstrItinClass iir> { def _PRE_IMM : AI2ldstidx<0, isByte, 1, (outs GPR:$Rn_wb), - (ins GPR:$Rt, addrmode_imm12:$addr), IndexModePre, + (ins GPR:$Rt, addrmode_imm12_pre:$addr), IndexModePre, StFrm, iii, opc, "\t$Rt, $addr!", "$addr.base = $Rn_wb", []> { bits<17> addr; @@ -2591,7 +2658,7 @@ def STRH_preidx: ARMPseudoInst<(outs GPR:$Rn_wb), def STRH_PRE : AI3ldstidx<0b1011, 0, 1, (outs GPR:$Rn_wb), - (ins GPR:$Rt, addrmode3:$addr), IndexModePre, + (ins GPR:$Rt, addrmode3_pre:$addr), IndexModePre, StMiscFrm, IIC_iStore_bh_ru, "strh", "\t$Rt, $addr!", "$addr.base = $Rn_wb", []> { bits<14> addr; @@ -2623,7 +2690,7 @@ def STRH_POST : AI3ldstidx<0b1011, 0, 0, (outs GPR:$Rn_wb), let mayStore = 1, neverHasSideEffects = 1, hasExtraSrcRegAllocReq = 1 in { def STRD_PRE : AI3ldstidx<0b1111, 0, 1, (outs GPR:$Rn_wb), - (ins GPR:$Rt, GPR:$Rt2, addrmode3:$addr), + (ins GPR:$Rt, GPR:$Rt2, addrmode3_pre:$addr), IndexModePre, StMiscFrm, IIC_iStore_d_ru, "strd", "\t$Rt, $Rt2, $addr!", "$addr.base = $Rn_wb", []> { @@ -3866,28 +3933,33 @@ def UDIV : ADivA1I<0b011, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), IIC_iDIV, def CLZ : AMiscA1I<0b000010110, 0b0001, (outs GPR:$Rd), (ins GPR:$Rm), IIC_iUNAr, "clz", "\t$Rd, $Rm", - [(set GPR:$Rd, (ctlz GPR:$Rm))]>, Requires<[IsARM, HasV5T]>; + [(set GPR:$Rd, (ctlz GPR:$Rm))]>, Requires<[IsARM, HasV5T]>, + Sched<[WriteALU]>; def RBIT : AMiscA1I<0b01101111, 0b0011, (outs GPR:$Rd), (ins GPR:$Rm), IIC_iUNAr, "rbit", "\t$Rd, $Rm", [(set GPR:$Rd, (ARMrbit GPR:$Rm))]>, - Requires<[IsARM, HasV6T2]>; + Requires<[IsARM, HasV6T2]>, + Sched<[WriteALU]>; def REV : AMiscA1I<0b01101011, 0b0011, (outs GPR:$Rd), (ins GPR:$Rm), IIC_iUNAr, "rev", "\t$Rd, $Rm", - [(set GPR:$Rd, (bswap GPR:$Rm))]>, Requires<[IsARM, HasV6]>; + [(set GPR:$Rd, (bswap GPR:$Rm))]>, Requires<[IsARM, HasV6]>, + Sched<[WriteALU]>; let AddedComplexity = 5 in def REV16 : AMiscA1I<0b01101011, 0b1011, (outs GPR:$Rd), (ins GPR:$Rm), IIC_iUNAr, "rev16", "\t$Rd, $Rm", [(set GPR:$Rd, (rotr (bswap GPR:$Rm), (i32 16)))]>, - Requires<[IsARM, HasV6]>; + Requires<[IsARM, HasV6]>, + Sched<[WriteALU]>; let AddedComplexity = 5 in def REVSH : AMiscA1I<0b01101111, 0b1011, (outs GPR:$Rd), (ins GPR:$Rm), IIC_iUNAr, "revsh", "\t$Rd, $Rm", [(set GPR:$Rd, (sra (bswap GPR:$Rm), (i32 16)))]>, - Requires<[IsARM, HasV6]>; + Requires<[IsARM, HasV6]>, + Sched<[WriteALU]>; def : ARMV6Pat<(or (sra (shl GPR:$Rm, (i32 24)), (i32 16)), (and (srl GPR:$Rm, (i32 8)), 0xFF)), @@ -3899,7 +3971,8 @@ def PKHBT : APKHI<0b01101000, 0, (outs GPRnopc:$Rd), [(set GPRnopc:$Rd, (or (and GPRnopc:$Rn, 0xFFFF), (and (shl GPRnopc:$Rm, pkh_lsl_amt:$sh), 0xFFFF0000)))]>, - Requires<[IsARM, HasV6]>; + Requires<[IsARM, HasV6]>, + Sched<[WriteALUsi, ReadALU]>; // Alternate cases for PKHBT where identities eliminate some nodes. def : ARMV6Pat<(or (and GPRnopc:$Rn, 0xFFFF), (and GPRnopc:$Rm, 0xFFFF0000)), @@ -3915,7 +3988,8 @@ def PKHTB : APKHI<0b01101000, 1, (outs GPRnopc:$Rd), [(set GPRnopc:$Rd, (or (and GPRnopc:$Rn, 0xFFFF0000), (and (sra GPRnopc:$Rm, pkh_asr_amt:$sh), 0xFFFF)))]>, - Requires<[IsARM, HasV6]>; + Requires<[IsARM, HasV6]>, + Sched<[WriteALUsi, ReadALU]>; // Alternate cases for PKHTB where identities eliminate some nodes. Note that // a shift amount of 0 is *not legal* here, it is PKHBT instead. @@ -4391,7 +4465,7 @@ multiclass LdStCop<bit load, bit Dbit, string asm> { let Inst{7-0} = addr{7-0}; let DecoderMethod = "DecodeCopMemInstruction"; } - def _PRE : ACI<(outs), (ins p_imm:$cop, c_imm:$CRd, addrmode5:$addr), + def _PRE : ACI<(outs), (ins p_imm:$cop, c_imm:$CRd, addrmode5_pre:$addr), asm, "\t$cop, $CRd, $addr!", IndexModePre> { bits<13> addr; bits<4> cop; @@ -4462,7 +4536,7 @@ multiclass LdSt2Cop<bit load, bit Dbit, string asm> { let Inst{7-0} = addr{7-0}; let DecoderMethod = "DecodeCopMemInstruction"; } - def _PRE : ACInoP<(outs), (ins p_imm:$cop, c_imm:$CRd, addrmode5:$addr), + def _PRE : ACInoP<(outs), (ins p_imm:$cop, c_imm:$CRd, addrmode5_pre:$addr), asm, "\t$cop, $CRd, $addr!", IndexModePre> { bits<13> addr; bits<4> cop; diff --git a/lib/Target/ARM/ARMInstrNEON.td b/lib/Target/ARM/ARMInstrNEON.td index 0411ac4..896fd0f 100644 --- a/lib/Target/ARM/ARMInstrNEON.td +++ b/lib/Target/ARM/ARMInstrNEON.td @@ -4316,6 +4316,24 @@ def VACGTq : N3VQInt<1, 0, 0b10, 0b1110, 1, N3RegFrm, IIC_VBINQ, "vacgt", defm VTST : N3V_QHS<0, 0, 0b1000, 1, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, IIC_VBINi4Q, "vtst", "", NEONvtst, 1>; +def: NEONInstAlias<"vaclt${p}.f32 $Vd, $Vn, $Vm", + (VACGTd DPR:$Vd, DPR:$Vm, DPR:$Vn, pred:$p)>; +def: NEONInstAlias<"vaclt${p}.f32 $Vd, $Vn, $Vm", + (VACGTq QPR:$Vd, QPR:$Vm, QPR:$Vn, pred:$p)>; +def: NEONInstAlias<"vacle${p}.f32 $Vd, $Vn, $Vm", + (VACGEd DPR:$Vd, DPR:$Vm, DPR:$Vn, pred:$p)>; +def: NEONInstAlias<"vacle${p}.f32 $Vd, $Vn, $Vm", + (VACGEq QPR:$Vd, QPR:$Vm, QPR:$Vn, pred:$p)>; + +def: NEONInstAlias<"vaclt${p}.f32 $Vd, $Vm", + (VACGTd DPR:$Vd, DPR:$Vm, DPR:$Vd, pred:$p)>; +def: NEONInstAlias<"vaclt${p}.f32 $Vd, $Vm", + (VACGTq QPR:$Vd, QPR:$Vm, QPR:$Vd, pred:$p)>; +def: NEONInstAlias<"vacle${p}.f32 $Vd, $Vm", + (VACGEd DPR:$Vd, DPR:$Vm, DPR:$Vd, pred:$p)>; +def: NEONInstAlias<"vacle${p}.f32 $Vd, $Vm", + (VACGEq QPR:$Vd, QPR:$Vm, QPR:$Vd, pred:$p)>; + // Vector Bitwise Operations. def vnotd : PatFrag<(ops node:$in), @@ -4889,6 +4907,29 @@ def VABSfq : N2VQ<0b11, 0b11, 0b10, 0b01, 0b01110, 0, "vabs", "f32", v4f32, v4f32, fabs>; +def : Pat<(xor (v2i32 (bitconvert (v8i8 (NEONvshrs DPR:$src, (i32 7))))), + (v2i32 (bitconvert (v8i8 (add DPR:$src, + (NEONvshrs DPR:$src, (i32 7))))))), + (VABSv8i8 DPR:$src)>; +def : Pat<(xor (v2i32 (bitconvert (v4i16 (NEONvshrs DPR:$src, (i32 15))))), + (v2i32 (bitconvert (v4i16 (add DPR:$src, + (NEONvshrs DPR:$src, (i32 15))))))), + (VABSv4i16 DPR:$src)>; +def : Pat<(xor (v2i32 (NEONvshrs DPR:$src, (i32 31))), + (v2i32 (add DPR:$src, (NEONvshrs DPR:$src, (i32 31))))), + (VABSv2i32 DPR:$src)>; +def : Pat<(xor (v4i32 (bitconvert (v16i8 (NEONvshrs QPR:$src, (i32 7))))), + (v4i32 (bitconvert (v16i8 (add QPR:$src, + (NEONvshrs QPR:$src, (i32 7))))))), + (VABSv16i8 QPR:$src)>; +def : Pat<(xor (v4i32 (bitconvert (v8i16 (NEONvshrs QPR:$src, (i32 15))))), + (v4i32 (bitconvert (v8i16 (add QPR:$src, + (NEONvshrs QPR:$src, (i32 15))))))), + (VABSv8i16 QPR:$src)>; +def : Pat<(xor (v4i32 (NEONvshrs QPR:$src, (i32 31))), + (v4i32 (add QPR:$src, (NEONvshrs QPR:$src, (i32 31))))), + (VABSv4i32 QPR:$src)>; + def : Pat<(v2f32 (int_arm_neon_vabs (v2f32 DPR:$src))), (VABSfd DPR:$src)>; def : Pat<(v4f32 (int_arm_neon_vabs (v4f32 QPR:$src))), (VABSfq QPR:$src)>; diff --git a/lib/Target/ARM/ARMInstrThumb2.td b/lib/Target/ARM/ARMInstrThumb2.td index c9d709e..4dacb86 100644 --- a/lib/Target/ARM/ARMInstrThumb2.td +++ b/lib/Target/ARM/ARMInstrThumb2.td @@ -150,7 +150,7 @@ def lo5AllOne : PatLeaf<(i32 imm), [{ def t2addrmode_imm12_asmoperand : AsmOperandClass {let Name="MemUImm12Offset";} def t2addrmode_imm12 : Operand<i32>, ComplexPattern<i32, 2, "SelectT2AddrModeImm12", []> { - let PrintMethod = "printAddrModeImm12Operand"; + let PrintMethod = "printAddrModeImm12Operand<false>"; let EncoderMethod = "getAddrModeImm12OpValue"; let DecoderMethod = "DecodeT2AddrModeImm12"; let ParserMatchClass = t2addrmode_imm12_asmoperand; @@ -3401,12 +3401,7 @@ class t2CPS<dag iops, string asm_op> : T2XI<(outs), iops, NoItinerary, bits<5> mode; bit M; - let Inst{31-27} = 0b11110; - let Inst{26} = 0; - let Inst{25-20} = 0b111010; - let Inst{19-16} = 0b1111; - let Inst{15-14} = 0b10; - let Inst{12} = 0; + let Inst{31-11} = 0b111100111010111110000; let Inst{10-9} = imod; let Inst{8} = M; let Inst{7-5} = iflags; @@ -3425,13 +3420,13 @@ let imod = 0, iflags = 0, M = 1 in // A6.3.4 Branches and miscellaneous control // Table A6-14 Change Processor State, and hint instructions -def t2HINT : T2I<(outs), (ins imm0_255:$imm), NoItinerary, "hint", "\t$imm",[]>{ - bits<8> imm; - let Inst{31-8} = 0b111100111010111110000000; - let Inst{7-0} = imm; +def t2HINT : T2I<(outs), (ins imm0_4:$imm), NoItinerary, "hint", "\t$imm",[]> { + bits<3> imm; + let Inst{31-3} = 0b11110011101011111000000000000; + let Inst{2-0} = imm; } -def : t2InstAlias<"hint$p.w $imm", (t2HINT imm0_255:$imm, pred:$p)>; +def : t2InstAlias<"hint$p.w $imm", (t2HINT imm0_4:$imm, pred:$p)>; def : t2InstAlias<"nop$p.w", (t2HINT 0, pred:$p)>; def : t2InstAlias<"yield$p.w", (t2HINT 1, pred:$p)>; def : t2InstAlias<"wfe$p.w", (t2HINT 2, pred:$p)>; @@ -3449,7 +3444,8 @@ def t2DBG : T2I<(outs), (ins imm0_15:$opt), NoItinerary, "dbg", "\t$opt", []> { // Secure Monitor Call is a system instruction. // Option = Inst{19-16} -def t2SMC : T2I<(outs), (ins imm0_15:$opt), NoItinerary, "smc", "\t$opt", []> { +def t2SMC : T2I<(outs), (ins imm0_15:$opt), NoItinerary, "smc", "\t$opt", + []>, Requires<[IsThumb2, HasTrustZone]> { let Inst{31-27} = 0b11110; let Inst{26-20} = 0b1111111; let Inst{15-12} = 0b1000; diff --git a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp index 98bd6c1..c8ed576 100644 --- a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp +++ b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp @@ -865,7 +865,7 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineBasicBlock &MBB, bool isLd = isi32Load(Opcode) || Opcode == ARM::VLDRS || Opcode == ARM::VLDRD; // Can't do the merge if the destination register is the same as the would-be // writeback register. - if (isLd && MI->getOperand(0).getReg() == Base) + if (MI->getOperand(0).getReg() == Base) return false; unsigned PredReg = 0; @@ -1258,6 +1258,22 @@ bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) { // merge the ldr's so far, including this one. But don't try to // combine the following ldr(s). Clobber = (isi32Load(Opcode) && Base == MBBI->getOperand(0).getReg()); + + // Watch out for: + // r4 := ldr [r0, #8] + // r4 := ldr [r0, #4] + // + // The optimization may reorder the second ldr in front of the first + // ldr, which violates write after write(WAW) dependence. The same as + // str. Try to merge inst(s) already in MemOps. + bool Overlap = false; + for (MemOpQueueIter I = MemOps.begin(), E = MemOps.end(); I != E; ++I) { + if (TRI->regsOverlap(Reg, I->MBBI->getOperand(0).getReg())) { + Overlap = true; + break; + } + } + if (CurrBase == 0 && !Clobber) { // Start of a new chain. CurrBase = Base; @@ -1268,7 +1284,7 @@ bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) { MemOps.push_back(MemOpQueueEntry(Offset, Reg, isKill, Position, MBBI)); ++NumMemOps; Advance = true; - } else { + } else if (!Overlap) { if (Clobber) { TryMerge = true; Advance = true; diff --git a/lib/Target/ARM/ARMMachineFunctionInfo.h b/lib/Target/ARM/ARMMachineFunctionInfo.h index 88d96c0..f4248fc 100644 --- a/lib/Target/ARM/ARMMachineFunctionInfo.h +++ b/lib/Target/ARM/ARMMachineFunctionInfo.h @@ -38,7 +38,7 @@ class ARMFunctionInfo : public MachineFunctionInfo { /// VarArgsRegSaveSize - Size of the register save area for vararg functions. /// - unsigned VarArgsRegSaveSize; + unsigned ArgRegsSaveSize; /// HasStackFrame - True if this function has a stack frame. Set by /// processFunctionBeforeCalleeSavedScan(). @@ -117,7 +117,7 @@ public: ARMFunctionInfo() : isThumb(false), hasThumb2(false), - VarArgsRegSaveSize(0), HasStackFrame(false), RestoreSPFromFP(false), + ArgRegsSaveSize(0), HasStackFrame(false), RestoreSPFromFP(false), LRSpilledForFarJump(false), FramePtrSpillOffset(0), GPRCS1Offset(0), GPRCS2Offset(0), DPRCSOffset(0), GPRCS1Size(0), GPRCS2Size(0), DPRCSSize(0), @@ -129,7 +129,7 @@ public: explicit ARMFunctionInfo(MachineFunction &MF) : isThumb(MF.getTarget().getSubtarget<ARMSubtarget>().isThumb()), hasThumb2(MF.getTarget().getSubtarget<ARMSubtarget>().hasThumb2()), - VarArgsRegSaveSize(0), HasStackFrame(false), RestoreSPFromFP(false), + ArgRegsSaveSize(0), HasStackFrame(false), RestoreSPFromFP(false), LRSpilledForFarJump(false), FramePtrSpillOffset(0), GPRCS1Offset(0), GPRCS2Offset(0), DPRCSOffset(0), GPRCS1Size(0), GPRCS2Size(0), DPRCSSize(0), @@ -141,8 +141,8 @@ public: bool isThumb1OnlyFunction() const { return isThumb && !hasThumb2; } bool isThumb2Function() const { return isThumb && hasThumb2; } - unsigned getVarArgsRegSaveSize() const { return VarArgsRegSaveSize; } - void setVarArgsRegSaveSize(unsigned s) { VarArgsRegSaveSize = s; } + unsigned getArgRegsSaveSize() const { return ArgRegsSaveSize; } + void setArgRegsSaveSize(unsigned s) { ArgRegsSaveSize = s; } bool hasStackFrame() const { return HasStackFrame; } void setHasStackFrame(bool s) { HasStackFrame = s; } diff --git a/lib/Target/ARM/ARMSchedule.td b/lib/Target/ARM/ARMSchedule.td index 02196d0..2d088de 100644 --- a/lib/Target/ARM/ARMSchedule.td +++ b/lib/Target/ARM/ARMSchedule.td @@ -6,6 +6,77 @@ // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// +//===----------------------------------------------------------------------===// +// Instruction scheduling annotations for out-of-order CPUs. +// These annotations are independent of the itinerary class defined below. +// Here we define the subtarget independent read/write per-operand resources. +// The subtarget schedule definitions will then map these to the subtarget's +// resource usages. +// For example: +// The instruction cycle timings table might contain an entry for an operation +// like the following: +// Rd <- ADD Rn, Rm, <shift> Rs +// Uops | Latency from register | Uops - resource requirements - latency +// 2 | Rn: 1 Rm: 4 Rs: 4 | uop T0, Rm, Rs - P01 - 3 +// | | uopc Rd, Rn, T0 - P01 - 1 +// This is telling us that the result will be available in destination register +// Rd after a minimum of three cycles after the result in Rm and Rs is available +// and one cycle after the result in Rn is available. The micro-ops can execute +// on resource P01. +// To model this, we need to express that we need to dispatch two micro-ops, +// that the resource P01 is needed and that the latency to Rn is different than +// the latency to Rm and Rs. The scheduler can decrease Rn's producer latency by +// two. +// We will do this by assigning (abstract) resources to register defs/uses. +// ARMSchedule.td: +// def WriteALUsr : SchedWrite; +// def ReadAdvanceALUsr : ScheRead; +// +// ARMInstrInfo.td: +// def ADDrs : I<>, Sched<[WriteALUsr, ReadAdvanceALUsr, ReadDefault, +// ReadDefault]> { ...} +// ReadAdvance read resources allow us to define "pipeline by-passes" or +// shorter latencies to certain registers as needed in the example above. +// The "ReadDefault" can be omitted. +// Next, the subtarget td file assigns resources to the abstract resources +// defined here. +// ARMScheduleSubtarget.td: +// // Resources. +// def P01 : ProcResource<3>; // ALU unit (3 of it). +// ... +// // Resource usages. +// def : WriteRes<WriteALUsr, [P01, P01]> { +// Latency = 4; // Latency of 4. +// NumMicroOps = 2; // Dispatch 2 micro-ops. +// // The two instances of resource P01 are occupied for one cycle. It is one +// // cycle because these resources happen to be pipelined. +// ResourceCycles = [1, 1]; +// } +// def : ReadAdvance<ReadAdvanceALUsr, 3>; + +// Basic ALU operation. +def WriteALU : SchedWrite; +def ReadALU : SchedRead; + +// Basic ALU with shifts. +def WriteALUsi : SchedWrite; // Shift by immediate. +def WriteALUsr : SchedWrite; // Shift by register. +def WriteALUSsr : SchedWrite; // Shift by register (flag setting). +def ReadALUsr : SchedRead; // Some operands are read later. + +// Compares. +def WriteCMP : SchedWrite; +def WriteCMPsi : SchedWrite; +def WriteCMPsr : SchedWrite; + +// Define TII for use in SchedVariant Predicates. +def : PredicateProlog<[{ + const ARMBaseInstrInfo *TII = + static_cast<const ARMBaseInstrInfo*>(SchedModel->getInstrInfo()); + (void)TII; +}]>; + +def IsPredicatedPred : SchedPredicate<[{TII->isPredicated(MI)}]>; //===----------------------------------------------------------------------===// // Instruction Itinerary classes used for ARM diff --git a/lib/Target/ARM/ARMScheduleA9.td b/lib/Target/ARM/ARMScheduleA9.td index 4191931..9739ed2 100644 --- a/lib/Target/ARM/ARMScheduleA9.td +++ b/lib/Target/ARM/ARMScheduleA9.td @@ -1898,6 +1898,8 @@ def CortexA9Model : SchedMachineModel { //===----------------------------------------------------------------------===// // Define each kind of processor resource and number available. +let SchedModel = CortexA9Model in { + def A9UnitALU : ProcResource<2>; def A9UnitMul : ProcResource<1> { let Super = A9UnitALU; } def A9UnitAGU : ProcResource<1>; @@ -1918,11 +1920,11 @@ def A9WriteI : SchedWriteRes<[A9UnitALU]>; def A9WriteIsr : SchedWriteRes<[A9UnitALU]> { let Latency = 2; } // Basic ALU. -def A9WriteA : SchedWriteRes<[A9UnitALU]>; +def A9WriteALU : SchedWriteRes<[A9UnitALU]>; // ALU with operand shifted by immediate. -def A9WriteAsi : SchedWriteRes<[A9UnitALU]> { let Latency = 2; } +def : WriteRes<WriteALUsi, [A9UnitALU]> { let Latency = 2; } // ALU with operand shifted by register. -def A9WriteAsr : SchedWriteRes<[A9UnitALU]> { let Latency = 3; } +def A9WriteALUsr : SchedWriteRes<[A9UnitALU]> { let Latency = 3; } // Multiplication def A9WriteM : SchedWriteRes<[A9UnitMul, A9UnitMul]> { let Latency = 4; } @@ -2003,13 +2005,6 @@ foreach NumCycles = 2-8 in { def A9WriteCycle#NumCycles : WriteSequence<[A9WriteCycle1], NumCycles>; } // foreach NumCycles -// Define TII for use in SchedVariant Predicates. -def : PredicateProlog<[{ - const ARMBaseInstrInfo *TII = - static_cast<const ARMBaseInstrInfo*>(SchedModel->getInstrInfo()); - (void)TII; -}]>; - // Define address generation sequences and predicates for 8 flavors of LDMs. foreach NumAddr = 1-8 in { @@ -2254,11 +2249,11 @@ def A9WriteLMfp : SchedWriteVariant<[ // These mov immediate writers are unconditionally expanded with // additive latency. def A9WriteI2 : WriteSequence<[A9WriteI, A9WriteI]>; -def A9WriteI2pc : WriteSequence<[A9WriteI, A9WriteI, A9WriteA]>; +def A9WriteI2pc : WriteSequence<[A9WriteI, A9WriteI, WriteALU]>; def A9WriteI2ld : WriteSequence<[A9WriteI, A9WriteI, A9WriteL]>; // Some ALU operations can read loaded integer values one cycle early. -def A9ReadA : SchedReadAdvance<1, +def A9ReadALU : SchedReadAdvance<1, [A9WriteL, A9WriteLHi, A9WriteLsi, A9WriteLb, A9WriteLbsi, A9WriteL1, A9WriteL2, A9WriteL3, A9WriteL4, A9WriteL5, A9WriteL6, A9WriteL7, A9WriteL8, @@ -2279,26 +2274,25 @@ def A9Read4 : SchedReadAdvance<3>; // This table follows the ARM Cortex-A9 Technical Reference Manuals, // mostly in order. -let SchedModel = CortexA9Model in { def :ItinRW<[A9WriteI], [IIC_iMOVi,IIC_iMOVr,IIC_iMOVsi, IIC_iMVNi,IIC_iMVNsi, IIC_iCMOVi,IIC_iCMOVr,IIC_iCMOVsi]>; -def :ItinRW<[A9WriteI,A9ReadA],[IIC_iMVNr]>; +def :ItinRW<[A9WriteI,A9ReadALU],[IIC_iMVNr]>; def :ItinRW<[A9WriteIsr], [IIC_iMOVsr,IIC_iMVNsr,IIC_iCMOVsr]>; def :ItinRW<[A9WriteI2], [IIC_iMOVix2,IIC_iCMOVix2]>; def :ItinRW<[A9WriteI2pc], [IIC_iMOVix2addpc]>; def :ItinRW<[A9WriteI2ld], [IIC_iMOVix2ld]>; -def :ItinRW<[A9WriteA], [IIC_iBITi,IIC_iBITr,IIC_iUNAr,IIC_iTSTi,IIC_iTSTr]>; -def :ItinRW<[A9WriteA, A9ReadA], [IIC_iALUi, IIC_iCMPi, IIC_iCMPsi]>; -def :ItinRW<[A9WriteA, A9ReadA, A9ReadA],[IIC_iALUr,IIC_iCMPr]>; -def :ItinRW<[A9WriteAsi], [IIC_iBITsi,IIC_iUNAsi,IIC_iEXTr,IIC_iTSTsi]>; -def :ItinRW<[A9WriteAsi, A9ReadA], [IIC_iALUsi]>; -def :ItinRW<[A9WriteAsi, ReadDefault, A9ReadA], [IIC_iALUsir]>; // RSB -def :ItinRW<[A9WriteAsr], [IIC_iBITsr,IIC_iTSTsr,IIC_iEXTAr,IIC_iEXTAsr]>; -def :ItinRW<[A9WriteAsr, A9ReadA], [IIC_iALUsr,IIC_iCMPsr]>; +def :ItinRW<[WriteALU], [IIC_iBITi,IIC_iBITr,IIC_iUNAr,IIC_iTSTi,IIC_iTSTr]>; +def :ItinRW<[WriteALU, A9ReadALU], [IIC_iALUi, IIC_iCMPi, IIC_iCMPsi]>; +def :ItinRW<[WriteALU, A9ReadALU, A9ReadALU],[IIC_iALUr,IIC_iCMPr]>; +def :ItinRW<[WriteALUsi], [IIC_iBITsi,IIC_iUNAsi,IIC_iEXTr,IIC_iTSTsi]>; +def :ItinRW<[WriteALUsi, A9ReadALU], [IIC_iALUsi]>; +def :ItinRW<[WriteALUsi, ReadDefault, A9ReadALU], [IIC_iALUsir]>; // RSB +def :ItinRW<[A9WriteALUsr], [IIC_iBITsr,IIC_iTSTsr,IIC_iEXTAr,IIC_iEXTAsr]>; +def :ItinRW<[A9WriteALUsr, A9ReadALU], [IIC_iALUsr,IIC_iCMPsr]>; // A9WriteHi ignored for MUL32. def :ItinRW<[A9WriteM, A9WriteMHi], [IIC_iMUL32,IIC_iMAC32, @@ -2371,7 +2365,7 @@ def :ItinRW<[A9WriteLMAdr, A9WriteLM, A9WriteIssue], [IIC_iLoad_mu, IIC_iStore_m, IIC_iStore_mu]>; def :ItinRW<[A9WriteLM, A9WriteLMAdr, A9WriteB], [IIC_iLoad_mBr, IIC_iPop_Br]>; -def :ItinRW<[A9WriteL, A9WriteAdr, A9WriteA], [IIC_iLoadiALU]>; +def :ItinRW<[A9WriteL, A9WriteAdr, WriteALU], [IIC_iLoadiALU]>; def :ItinRW<[A9WriteLSfp, A9WriteAdr], [IIC_fpLoad32, IIC_fpLoad64]>; @@ -2486,4 +2480,17 @@ def :ItinRW<[A9WriteV9, A9Read3, A9Read2], [IIC_VMACD, IIC_VFMACD]>; def :ItinRW<[A9WriteV10, A9Read3, A9Read2], [IIC_VMACQ, IIC_VFMACQ]>; def :ItinRW<[A9WriteV9, A9Read2, A9Read2], [IIC_VRECSD]>; def :ItinRW<[A9WriteV10, A9Read2, A9Read2], [IIC_VRECSQ]>; + +// Map SchedRWs that are identical for cortexa9 to existing resources. +def : SchedAlias<WriteALU, A9WriteALU>; +def : SchedAlias<WriteALUsr, A9WriteALUsr>; +def : SchedAlias<WriteALUSsr, A9WriteALUsr>; +def : SchedAlias<ReadALU, A9ReadALU>; +def : SchedAlias<ReadALUsr, A9ReadALU>; +// FIXME: need to special case AND, ORR, EOR, BIC because they don't read +// advance. But our instrinfo claims it does. + +def : SchedAlias<WriteCMP, A9WriteALU>; +def : SchedAlias<WriteCMPsi, A9WriteALU>; +def : SchedAlias<WriteCMPsr, A9WriteALU>; } // SchedModel = CortexA9Model diff --git a/lib/Target/ARM/ARMScheduleSwift.td b/lib/Target/ARM/ARMScheduleSwift.td index e9bc3e0..7c6df41 100644 --- a/lib/Target/ARM/ARMScheduleSwift.td +++ b/lib/Target/ARM/ARMScheduleSwift.td @@ -1078,8 +1078,67 @@ def SwiftModel : SchedMachineModel { let IssueWidth = 3; // 3 micro-ops are dispatched per cycle. let MinLatency = 0; // Data dependencies are allowed within dispatch groups. let LoadLatency = 3; + let MispredictPenalty = 14; // A branch direction mispredict. let Itineraries = SwiftItineraries; } -// TODO: Add Swift processor and scheduler resources. +// Swift predicates. +def IsFastImmShiftSwiftPred : SchedPredicate<[{TII->isSwiftFastImmShift(MI)}]>; + +// Swift resource mapping. +let SchedModel = SwiftModel in { + // Processor resources. + def SwiftUnitP01 : ProcResource<2>; // ALU unit. + def SwiftUnitP0 : ProcResource<1> { let Super = SwiftUnitP01; } // Mul unit. + def SwiftUnitP1 : ProcResource<1> { let Super = SwiftUnitP01; } // Br unit. + def SwiftUnitP2 : ProcResource<1>; // LS unit. + def SwiftUnitDiv : ProcResource<1>; + + // Generic resource requirements. + def SwiftWriteP01TwoCycle : SchedWriteRes<[SwiftUnitP01]> { let Latency = 2; } + def SwiftWriteP01ThreeCycleTwoUops : + SchedWriteRes<[SwiftUnitP01, SwiftUnitP01]> { + let Latency = 3; + let NumMicroOps = 2; + } + def SwiftWriteP0ThreeCycleThreeUops : SchedWriteRes<[SwiftUnitP0]> { + let Latency = 3; + let NumMicroOps = 3; + let ResourceCycles = [3]; + } + + // 4.2.4 Arithmetic and Logical. + // ALU operation register shifted by immediate variant. + def SwiftWriteALUsi : SchedWriteVariant<[ + // lsl #2, lsl #1, or lsr #1. + SchedVar<IsFastImmShiftSwiftPred, [SwiftWriteP01TwoCycle]>, + SchedVar<NoSchedPred, [WriteALU]> + ]>; + def SwiftWriteALUsr : SchedWriteVariant<[ + SchedVar<IsPredicatedPred, [SwiftWriteP01ThreeCycleTwoUops]>, + SchedVar<NoSchedPred, [SwiftWriteP01TwoCycle]> + ]>; + def SwiftWriteALUSsr : SchedWriteVariant<[ + SchedVar<IsPredicatedPred, [SwiftWriteP0ThreeCycleThreeUops]>, + SchedVar<NoSchedPred, [SwiftWriteP01TwoCycle]> + ]>; + def SwiftReadAdvanceALUsr : SchedReadVariant<[ + SchedVar<IsPredicatedPred, [SchedReadAdvance<2>]>, + SchedVar<NoSchedPred, [NoReadAdvance]> + ]>; + // ADC,ADD,NEG,RSB,RSC,SBC,SUB,ADR + // AND,BIC,EOR,ORN,ORR + // CLZ,RBIT,REV,REV16,REVSH,PKH + def : WriteRes<WriteALU, [SwiftUnitP01]>; + def : SchedAlias<WriteALUsi, SwiftWriteALUsi>; + def : SchedAlias<WriteALUsr, SwiftWriteALUsr>; + def : SchedAlias<WriteALUSsr, SwiftWriteALUSsr>; + def : ReadAdvance<ReadALU, 0>; + def : SchedAlias<ReadALUsr, SwiftReadAdvanceALUsr>; + + // 4.2.5 Integer comparison + def : WriteRes<WriteCMP, [SwiftUnitP01]>; + def : WriteRes<WriteCMPsi, [SwiftUnitP01]>; + def : WriteRes<WriteCMPsr, [SwiftUnitP01]>; +} diff --git a/lib/Target/ARM/ARMSubtarget.cpp b/lib/Target/ARM/ARMSubtarget.cpp index f4d568c..3b8e56f 100644 --- a/lib/Target/ARM/ARMSubtarget.cpp +++ b/lib/Target/ARM/ARMSubtarget.cpp @@ -19,6 +19,7 @@ #include "llvm/IR/Function.h" #include "llvm/Support/CommandLine.h" #include "llvm/Target/TargetInstrInfo.h" +#include "llvm/Target/TargetOptions.h" #define GET_SUBTARGETINFO_TARGET_DESC #define GET_SUBTARGETINFO_CTOR @@ -42,12 +43,13 @@ StrictAlign("arm-strict-align", cl::Hidden, cl::desc("Disallow all unaligned memory accesses")); ARMSubtarget::ARMSubtarget(const std::string &TT, const std::string &CPU, - const std::string &FS) + const std::string &FS, const TargetOptions &Options) : ARMGenSubtargetInfo(TT, CPU, FS) , ARMProcFamily(Others) , stackAlignment(4) , CPUString(CPU) , TargetTriple(TT) + , Options(Options) , TargetABI(ARM_ABI_APCS) { initializeEnvironment(); resetSubtargetFeatures(CPU, FS); @@ -89,9 +91,11 @@ void ARMSubtarget::initializeEnvironment() { HasRAS = false; HasMPExtension = false; FPOnlySP = false; + HasTrustZone = false; AllowsUnalignedMem = false; Thumb2DSP = false; UseNaClTrap = false; + UnsafeFPMath = false; } void ARMSubtarget::resetSubtargetFeatures(const MachineFunction *MF) { @@ -162,6 +166,12 @@ void ARMSubtarget::resetSubtargetFeatures(StringRef CPU, StringRef FS) { // configuration. if (!StrictAlign && hasV6Ops() && isTargetDarwin()) AllowsUnalignedMem = true; + + // NEON f32 ops are non-IEEE 754 compliant. Darwin is ok with it by default. + uint64_t Bits = getFeatureBits(); + if ((Bits & ARM::ProcA5 || Bits & ARM::ProcA8) && // Where this matters + (Options.UnsafeFPMath || isTargetDarwin())) + UseNEONForSinglePrecisionFP = true; } /// GVIsIndirectSymbol - true if the GV will be accessed via an indirect symbol. diff --git a/lib/Target/ARM/ARMSubtarget.h b/lib/Target/ARM/ARMSubtarget.h index 8ce22e1..038eb76 100644 --- a/lib/Target/ARM/ARMSubtarget.h +++ b/lib/Target/ARM/ARMSubtarget.h @@ -26,6 +26,7 @@ namespace llvm { class GlobalValue; class StringRef; +class TargetOptions; class ARMSubtarget : public ARMGenSubtargetInfo { protected: @@ -147,6 +148,9 @@ protected: /// precision. bool FPOnlySP; + /// HasTrustZone - if true, processor supports TrustZone security extensions + bool HasTrustZone; + /// AllowsUnalignedMem - If true, the subtarget allows unaligned memory /// accesses for some types. For details, see /// ARMTargetLowering::allowsUnalignedMemoryAccesses(). @@ -159,6 +163,9 @@ protected: /// NaCl TRAP instruction is generated instead of the regular TRAP. bool UseNaClTrap; + /// Target machine allowed unsafe FP math (such as use of NEON fp) + bool UnsafeFPMath; + /// stackAlignment - The minimum alignment known to hold of the stack frame on /// entry to the function and which must be maintained by every function. unsigned stackAlignment; @@ -175,6 +182,9 @@ protected: /// Selected instruction itineraries (one entry per itinerary class.) InstrItineraryData InstrItins; + /// Options passed via command line that could influence the target + const TargetOptions &Options; + public: enum { isELF, isDarwin @@ -189,7 +199,7 @@ protected: /// of the specified triple. /// ARMSubtarget(const std::string &TT, const std::string &CPU, - const std::string &FS); + const std::string &FS, const TargetOptions &Options); /// getMaxInlineSizeThreshold - Returns the maximum memset / memcpy size /// that still makes it profitable to inline the call. @@ -244,6 +254,7 @@ public: bool hasVMLxForwarding() const { return HasVMLxForwarding; } bool isFPBrccSlow() const { return SlowFPBrcc; } bool isFPOnlySP() const { return FPOnlySP; } + bool hasTrustZone() const { return HasTrustZone; } bool prefers32BitThumb() const { return Pref32BitThumb; } bool avoidCPSRPartialUpdate() const { return AvoidCPSRPartialUpdate; } bool avoidMOVsShifterOperand() const { return AvoidMOVsShifterOperand; } diff --git a/lib/Target/ARM/ARMTargetMachine.cpp b/lib/Target/ARM/ARMTargetMachine.cpp index 3003760..42c7d2c 100644 --- a/lib/Target/ARM/ARMTargetMachine.cpp +++ b/lib/Target/ARM/ARMTargetMachine.cpp @@ -48,7 +48,7 @@ ARMBaseTargetMachine::ARMBaseTargetMachine(const Target &T, StringRef TT, Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL) : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL), - Subtarget(TT, CPU, FS), + Subtarget(TT, CPU, FS, Options), JITInfo(), InstrItins(Subtarget.getInstrItineraryData()) { // Default to soft float ABI @@ -185,8 +185,7 @@ bool ARMPassConfig::addPreSched2() { addPass(createARMLoadStoreOptimizationPass()); printAndVerify("After ARM load / store optimizer"); } - if ((DisableA15SDOptimization || !getARMSubtarget().isCortexA15()) && - getARMSubtarget().hasNEON()) + if (getARMSubtarget().hasNEON()) addPass(createExecutionDependencyFixPass(&ARM::DPRRegClass)); } diff --git a/lib/Target/ARM/ARMTargetTransformInfo.cpp b/lib/Target/ARM/ARMTargetTransformInfo.cpp index 140a8db..53ece66 100644 --- a/lib/Target/ARM/ARMTargetTransformInfo.cpp +++ b/lib/Target/ARM/ARMTargetTransformInfo.cpp @@ -125,6 +125,10 @@ public: unsigned getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) const; unsigned getAddressComputationCost(Type *Val) const; + + unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty, + OperandValueKind Op1Info = OK_AnyValue, + OperandValueKind Op2Info = OK_AnyValue) const; /// @} }; @@ -211,13 +215,21 @@ unsigned ARMTTI::getCastInstrCost(unsigned Opcode, Type *Dst, { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 0 }, { ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 1 }, - // Operations that we legalize using load/stores to the stack. - { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 16*2 + 4*4 }, - { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 16*2 + 4*3 }, - { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 8*2 + 2*4 }, - { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 8*2 + 2*3 }, - { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 4*1 + 16*2 + 2*1 }, - { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 2*1 + 8*2 + 1 }, + // The number of vmovl instructions for the extension. + { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3 }, + { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 }, + { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3 }, + { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3 }, + { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 7 }, + { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 7 }, + { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 6 }, + { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 6 }, + { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6 }, + { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6 }, + + // Operations that we legalize using splitting. + { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 6 }, + { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 3 }, // Vector float <-> i32 conversions. { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 }, @@ -448,3 +460,67 @@ unsigned ARMTTI::getShuffleCost(ShuffleKind Kind, Type *Tp, int Index, return LT.first * NEONShuffleTbl[Idx].Cost; } + +unsigned ARMTTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty, OperandValueKind Op1Info, + OperandValueKind Op2Info) const { + + int ISDOpcode = TLI->InstructionOpcodeToISD(Opcode); + std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Ty); + + const unsigned FunctionCallDivCost = 20; + const unsigned ReciprocalDivCost = 10; + static const CostTblEntry<MVT> CostTbl[] = { + // Division. + // These costs are somewhat random. Choose a cost of 20 to indicate that + // vectorizing devision (added function call) is going to be very expensive. + // Double registers types. + { ISD::SDIV, MVT::v1i64, 1 * FunctionCallDivCost}, + { ISD::UDIV, MVT::v1i64, 1 * FunctionCallDivCost}, + { ISD::SREM, MVT::v1i64, 1 * FunctionCallDivCost}, + { ISD::UREM, MVT::v1i64, 1 * FunctionCallDivCost}, + { ISD::SDIV, MVT::v2i32, 2 * FunctionCallDivCost}, + { ISD::UDIV, MVT::v2i32, 2 * FunctionCallDivCost}, + { ISD::SREM, MVT::v2i32, 2 * FunctionCallDivCost}, + { ISD::UREM, MVT::v2i32, 2 * FunctionCallDivCost}, + { ISD::SDIV, MVT::v4i16, ReciprocalDivCost}, + { ISD::UDIV, MVT::v4i16, ReciprocalDivCost}, + { ISD::SREM, MVT::v4i16, 4 * FunctionCallDivCost}, + { ISD::UREM, MVT::v4i16, 4 * FunctionCallDivCost}, + { ISD::SDIV, MVT::v8i8, ReciprocalDivCost}, + { ISD::UDIV, MVT::v8i8, ReciprocalDivCost}, + { ISD::SREM, MVT::v8i8, 8 * FunctionCallDivCost}, + { ISD::UREM, MVT::v8i8, 8 * FunctionCallDivCost}, + // Quad register types. + { ISD::SDIV, MVT::v2i64, 2 * FunctionCallDivCost}, + { ISD::UDIV, MVT::v2i64, 2 * FunctionCallDivCost}, + { ISD::SREM, MVT::v2i64, 2 * FunctionCallDivCost}, + { ISD::UREM, MVT::v2i64, 2 * FunctionCallDivCost}, + { ISD::SDIV, MVT::v4i32, 4 * FunctionCallDivCost}, + { ISD::UDIV, MVT::v4i32, 4 * FunctionCallDivCost}, + { ISD::SREM, MVT::v4i32, 4 * FunctionCallDivCost}, + { ISD::UREM, MVT::v4i32, 4 * FunctionCallDivCost}, + { ISD::SDIV, MVT::v8i16, 8 * FunctionCallDivCost}, + { ISD::UDIV, MVT::v8i16, 8 * FunctionCallDivCost}, + { ISD::SREM, MVT::v8i16, 8 * FunctionCallDivCost}, + { ISD::UREM, MVT::v8i16, 8 * FunctionCallDivCost}, + { ISD::SDIV, MVT::v16i8, 16 * FunctionCallDivCost}, + { ISD::UDIV, MVT::v16i8, 16 * FunctionCallDivCost}, + { ISD::SREM, MVT::v16i8, 16 * FunctionCallDivCost}, + { ISD::UREM, MVT::v16i8, 16 * FunctionCallDivCost}, + // Multiplication. + }; + + int Idx = -1; + + if (ST->hasNEON()) + Idx = CostTableLookup<MVT>(CostTbl, array_lengthof(CostTbl), ISDOpcode, + LT.second); + + if (Idx != -1) + return LT.first * CostTbl[Idx].Cost; + + + return TargetTransformInfo::getArithmeticInstrCost(Opcode, Ty, Op1Info, + Op2Info); +} + diff --git a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp index c897efd..114cc9e 100644 --- a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp +++ b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp @@ -610,6 +610,13 @@ public: int64_t Value = CE->getValue(); return ((Value & 3) == 0) && Value >= -1020 && Value <= 1020; } + bool isImm0_4() const { + if (!isImm()) return false; + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + if (!CE) return false; + int64_t Value = CE->getValue(); + return Value >= 0 && Value < 5; + } bool isImm0_1020s4() const { if (!isImm()) return false; const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); @@ -4593,20 +4600,26 @@ bool ARMAsmParser::parseOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands, Error(Parser.getTok().getLoc(), "unexpected token in operand"); return true; case AsmToken::Identifier: { - if (!tryParseRegisterWithWriteBack(Operands)) - return false; - int Res = tryParseShiftRegister(Operands); - if (Res == 0) // success - return false; - else if (Res == -1) // irrecoverable error - return true; - // If this is VMRS, check for the apsr_nzcv operand. - if (Mnemonic == "vmrs" && - Parser.getTok().getString().equals_lower("apsr_nzcv")) { - S = Parser.getTok().getLoc(); - Parser.Lex(); - Operands.push_back(ARMOperand::CreateToken("APSR_nzcv", S)); - return false; + // If we've seen a branch mnemonic, the next operand must be a label. This + // is true even if the label is a register name. So "br r1" means branch to + // label "r1". + bool ExpectLabel = Mnemonic == "b" || Mnemonic == "bl"; + if (!ExpectLabel) { + if (!tryParseRegisterWithWriteBack(Operands)) + return false; + int Res = tryParseShiftRegister(Operands); + if (Res == 0) // success + return false; + else if (Res == -1) // irrecoverable error + return true; + // If this is VMRS, check for the apsr_nzcv operand. + if (Mnemonic == "vmrs" && + Parser.getTok().getString().equals_lower("apsr_nzcv")) { + S = Parser.getTok().getLoc(); + Parser.Lex(); + Operands.push_back(ARMOperand::CreateToken("APSR_nzcv", S)); + return false; + } } // Fall though for the Identifier case that is not a register or a @@ -4739,6 +4752,7 @@ StringRef ARMAsmParser::splitMnemonic(StringRef Mnemonic, Mnemonic == "mls" || Mnemonic == "smmls" || Mnemonic == "vcls" || Mnemonic == "vmls" || Mnemonic == "vnmls" || Mnemonic == "vacge" || Mnemonic == "vcge" || Mnemonic == "vclt" || Mnemonic == "vacgt" || + Mnemonic == "vaclt" || Mnemonic == "vacle" || Mnemonic == "vcgt" || Mnemonic == "vcle" || Mnemonic == "smlal" || Mnemonic == "umaal" || Mnemonic == "umlal" || Mnemonic == "vabal" || Mnemonic == "vmlal" || Mnemonic == "vpadal" || Mnemonic == "vqdmlal" || @@ -5008,8 +5022,8 @@ static bool isDataTypeToken(StringRef Tok) { static bool doesIgnoreDataTypeSuffix(StringRef Mnemonic, StringRef DT) { return Mnemonic.startswith("vldm") || Mnemonic.startswith("vstm"); } - -static void applyMnemonicAliases(StringRef &Mnemonic, unsigned Features); +static void applyMnemonicAliases(StringRef &Mnemonic, unsigned Features, + unsigned VariantID); /// Parse an arm instruction mnemonic followed by its operands. bool ARMAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc, @@ -5020,7 +5034,8 @@ bool ARMAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, // MatchInstructionImpl(), but that's too late for aliases that include // any sort of suffix. unsigned AvailableFeatures = getAvailableFeatures(); - applyMnemonicAliases(Name, AvailableFeatures); + unsigned AssemblerDialect = getParser().getAssemblerDialect(); + applyMnemonicAliases(Name, AvailableFeatures, AssemblerDialect); // First check for the ARM-specific .req directive. if (Parser.getTok().is(AsmToken::Identifier) && @@ -7607,6 +7622,11 @@ MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, return Error(IDLoc, "instruction variant requires ARMv6 or later"); case Match_RequiresThumb2: return Error(IDLoc, "instruction variant requires Thumb2"); + case Match_ImmRange0_4: { + SMLoc ErrorLoc = ((ARMOperand*)Operands[ErrorInfo])->getStartLoc(); + if (ErrorLoc == SMLoc()) ErrorLoc = IDLoc; + return Error(ErrorLoc, "immediate operand must be in the range [0,4]"); + } case Match_ImmRange0_15: { SMLoc ErrorLoc = ((ARMOperand*)Operands[ErrorInfo])->getStartLoc(); if (ErrorLoc == SMLoc()) ErrorLoc = IDLoc; diff --git a/lib/Target/ARM/Disassembler/ARMDisassembler.cpp b/lib/Target/ARM/Disassembler/ARMDisassembler.cpp index 31a3b0b..ac937f3 100644 --- a/lib/Target/ARM/Disassembler/ARMDisassembler.cpp +++ b/lib/Target/ARM/Disassembler/ARMDisassembler.cpp @@ -308,6 +308,8 @@ static DecodeStatus DecodeVCVTD(MCInst &Inst, unsigned Insn, uint64_t Address, const void *Decoder); static DecodeStatus DecodeVCVTQ(MCInst &Inst, unsigned Insn, uint64_t Address, const void *Decoder); +static DecodeStatus DecodeImm0_4(MCInst &Inst, unsigned Insn, uint64_t Address, + const void *Decoder); static DecodeStatus DecodeThumbAddSpecialReg(MCInst &Inst, uint16_t Insn, @@ -1951,10 +1953,12 @@ static DecodeStatus DecodeT2CPSInstruction(MCInst &Inst, unsigned Insn, Inst.addOperand(MCOperand::CreateImm(mode)); if (iflags) S = MCDisassembler::SoftFail; } else { - // imod == '00' && M == '0' --> UNPREDICTABLE - Inst.setOpcode(ARM::t2CPS1p); - Inst.addOperand(MCOperand::CreateImm(mode)); - S = MCDisassembler::SoftFail; + // imod == '00' && M == '0' --> this is a HINT instruction + int imm = fieldFromInstruction(Insn, 0, 8); + // HINT are defined only for immediate in [0..4] + if(imm > 4) return MCDisassembler::Fail; + Inst.setOpcode(ARM::t2HINT); + Inst.addOperand(MCOperand::CreateImm(imm)); } return S; @@ -1996,9 +2000,10 @@ static DecodeStatus DecodeArmMOVTWInstruction(MCInst &Inst, unsigned Insn, imm |= (fieldFromInstruction(Insn, 16, 4) << 12); if (Inst.getOpcode() == ARM::MOVTi16) - if (!Check(S, DecoderGPRRegisterClass(Inst, Rd, Address, Decoder))) + if (!Check(S, DecodeGPRnopcRegisterClass(Inst, Rd, Address, Decoder))) return MCDisassembler::Fail; - if (!Check(S, DecoderGPRRegisterClass(Inst, Rd, Address, Decoder))) + + if (!Check(S, DecodeGPRnopcRegisterClass(Inst, Rd, Address, Decoder))) return MCDisassembler::Fail; if (!tryAddingSymbolicOperand(Address, imm, false, 4, Inst, Decoder)) @@ -3049,9 +3054,9 @@ static DecodeStatus DecodeT2BROperand(MCInst &Inst, unsigned Val, static DecodeStatus DecodeThumbCmpBROperand(MCInst &Inst, unsigned Val, uint64_t Address, const void *Decoder) { - if (!tryAddingSymbolicOperand(Address, Address + SignExtend32<7>(Val<<1) + 4, + if (!tryAddingSymbolicOperand(Address, Address + (Val<<1) + 4, true, 2, Inst, Decoder)) - Inst.addOperand(MCOperand::CreateImm(SignExtend32<7>(Val << 1))); + Inst.addOperand(MCOperand::CreateImm(Val << 1)); return MCDisassembler::Success; } @@ -3278,7 +3283,7 @@ static DecodeStatus DecodeT2LdStPre(MCInst &Inst, unsigned Insn, return MCDisassembler::Fail; } - if (!Check(S, DecoderGPRRegisterClass(Inst, Rt, Address, Decoder))) + if (!Check(S, DecodeGPRRegisterClass(Inst, Rt, Address, Decoder))) return MCDisassembler::Fail; if (load) { @@ -3570,7 +3575,7 @@ static DecodeStatus DecodeDoubleRegStore(MCInst &Inst, unsigned Insn, unsigned Rn = fieldFromInstruction(Insn, 16, 4); unsigned pred = fieldFromInstruction(Insn, 28, 4); - if (!Check(S, DecoderGPRRegisterClass(Inst, Rd, Address, Decoder))) + if (!Check(S, DecodeGPRnopcRegisterClass(Inst, Rd, Address, Decoder))) return MCDisassembler::Fail; if ((Rt & 1) || Rt == 0xE || Rn == 0xF) return MCDisassembler::Fail; @@ -4496,6 +4501,15 @@ static DecodeStatus DecodeVCVTQ(MCInst &Inst, unsigned Insn, return S; } +static DecodeStatus DecodeImm0_4(MCInst &Inst, unsigned Insn, uint64_t Address, + const void *Decoder) +{ + unsigned Imm = fieldFromInstruction(Insn, 0, 3); + if (Imm > 4) return MCDisassembler::Fail; + Inst.addOperand(MCOperand::CreateImm(Imm)); + return MCDisassembler::Success; +} + static DecodeStatus DecodeLDR(MCInst &Inst, unsigned Val, uint64_t Address, const void *Decoder) { DecodeStatus S = MCDisassembler::Success; diff --git a/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp b/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp index 2afb20d..3bcd083 100644 --- a/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp +++ b/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp @@ -490,7 +490,8 @@ void ARMInstPrinter::printAM3PostIndexOp(const MCInst *MI, unsigned Op, } void ARMInstPrinter::printAM3PreOrOffsetIndexOp(const MCInst *MI, unsigned Op, - raw_ostream &O) { + raw_ostream &O, + bool AlwaysPrintImm0) { const MCOperand &MO1 = MI->getOperand(Op); const MCOperand &MO2 = MI->getOperand(Op+1); const MCOperand &MO3 = MI->getOperand(Op+2); @@ -509,7 +510,7 @@ void ARMInstPrinter::printAM3PreOrOffsetIndexOp(const MCInst *MI, unsigned Op, unsigned ImmOffs = ARM_AM::getAM3Offset(MO3.getImm()); ARM_AM::AddrOpc op = ARM_AM::getAM3Op(MO3.getImm()); - if (ImmOffs || (op == ARM_AM::sub)) { + if (AlwaysPrintImm0 || ImmOffs || (op == ARM_AM::sub)) { O << ", " << markup("<imm:") << "#" @@ -520,6 +521,7 @@ void ARMInstPrinter::printAM3PreOrOffsetIndexOp(const MCInst *MI, unsigned Op, O << ']' << markup(">"); } +template <bool AlwaysPrintImm0> void ARMInstPrinter::printAddrMode3Operand(const MCInst *MI, unsigned Op, raw_ostream &O) { const MCOperand &MO1 = MI->getOperand(Op); @@ -535,7 +537,7 @@ void ARMInstPrinter::printAddrMode3Operand(const MCInst *MI, unsigned Op, printAM3PostIndexOp(MI, Op, O); return; } - printAM3PreOrOffsetIndexOp(MI, Op, O); + printAM3PreOrOffsetIndexOp(MI, Op, O, AlwaysPrintImm0); } void ARMInstPrinter::printAddrMode3OffsetOperand(const MCInst *MI, @@ -593,6 +595,7 @@ void ARMInstPrinter::printLdStmModeOperand(const MCInst *MI, unsigned OpNum, O << ARM_AM::getAMSubModeStr(Mode); } +template <bool AlwaysPrintImm0> void ARMInstPrinter::printAddrMode5Operand(const MCInst *MI, unsigned OpNum, raw_ostream &O) { const MCOperand &MO1 = MI->getOperand(OpNum); @@ -608,7 +611,7 @@ void ARMInstPrinter::printAddrMode5Operand(const MCInst *MI, unsigned OpNum, unsigned ImmOffs = ARM_AM::getAM5Offset(MO2.getImm()); unsigned Op = ARM_AM::getAM5Op(MO2.getImm()); - if (ImmOffs || Op == ARM_AM::sub) { + if (AlwaysPrintImm0 || ImmOffs || Op == ARM_AM::sub) { O << ", " << markup("<imm:") << "#" @@ -1022,6 +1025,7 @@ void ARMInstPrinter::printT2SOOperand(const MCInst *MI, unsigned OpNum, ARM_AM::getSORegOffset(MO2.getImm()), UseMarkup); } +template <bool AlwaysPrintImm0> void ARMInstPrinter::printAddrModeImm12Operand(const MCInst *MI, unsigned OpNum, raw_ostream &O) { const MCOperand &MO1 = MI->getOperand(OpNum); @@ -1042,13 +1046,13 @@ void ARMInstPrinter::printAddrModeImm12Operand(const MCInst *MI, unsigned OpNum, OffImm = 0; if (isSub) { O << ", " - << markup("<imm:") + << markup("<imm:") << "#-" << -OffImm << markup(">"); } - else if (OffImm > 0) { + else if (AlwaysPrintImm0 || OffImm > 0) { O << ", " - << markup("<imm:") + << markup("<imm:") << "#" << OffImm << markup(">"); } diff --git a/lib/Target/ARM/InstPrinter/ARMInstPrinter.h b/lib/Target/ARM/InstPrinter/ARMInstPrinter.h index edff75d..344104e 100644 --- a/lib/Target/ARM/InstPrinter/ARMInstPrinter.h +++ b/lib/Target/ARM/InstPrinter/ARMInstPrinter.h @@ -47,12 +47,13 @@ public: raw_ostream &O); void printAddrMode2OffsetOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O); - + template <bool AlwaysPrintImm0> void printAddrMode3Operand(const MCInst *MI, unsigned OpNum, raw_ostream &O); void printAddrMode3OffsetOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O); void printAM3PostIndexOp(const MCInst *MI, unsigned Op, raw_ostream &O); - void printAM3PreOrOffsetIndexOp(const MCInst *MI, unsigned Op,raw_ostream &O); + void printAM3PreOrOffsetIndexOp(const MCInst *MI, unsigned Op, raw_ostream &O, + bool AlwaysPrintImm0); void printPostIdxImm8Operand(const MCInst *MI, unsigned OpNum, raw_ostream &O); void printPostIdxRegOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O); @@ -60,6 +61,7 @@ public: raw_ostream &O); void printLdStmModeOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O); + template <bool AlwaysPrintImm0> void printAddrMode5Operand(const MCInst *MI, unsigned OpNum, raw_ostream &O); void printAddrMode6Operand(const MCInst *MI, unsigned OpNum, raw_ostream &O); void printAddrMode7Operand(const MCInst *MI, unsigned OpNum, raw_ostream &O); @@ -91,6 +93,7 @@ public: raw_ostream &O); void printT2SOOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O); + template<bool AlwaysPrintImm0> void printAddrModeImm12Operand(const MCInst *MI, unsigned OpNum, raw_ostream &O); void printT2AddrModeImm8Operand(const MCInst *MI, unsigned OpNum, diff --git a/lib/Target/ARM/LICENSE.TXT b/lib/Target/ARM/LICENSE.TXT index 68afea1..68afea1 100755..100644 --- a/lib/Target/ARM/LICENSE.TXT +++ b/lib/Target/ARM/LICENSE.TXT diff --git a/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp b/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp index 418971d..6c3d247 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp +++ b/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp @@ -13,7 +13,9 @@ // //===----------------------------------------------------------------------===// +#include "ARMRegisterInfo.h" #include "ARMUnwindOp.h" +#include "ARMUnwindOpAsm.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/Twine.h" #include "llvm/MC/MCAsmBackend.h" @@ -26,6 +28,7 @@ #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/MCObjectStreamer.h" +#include "llvm/MC/MCRegisterInfo.h" #include "llvm/MC/MCSection.h" #include "llvm/MC/MCSectionELF.h" #include "llvm/MC/MCStreamer.h" @@ -33,11 +36,15 @@ #include "llvm/MC/MCValue.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ELF.h" -#include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" using namespace llvm; +static std::string GetAEABIUnwindPersonalityName(unsigned Index) { + assert(Index < NUM_PERSONALITY_INDEX && "Invalid personality index"); + return (Twine("__aeabi_unwind_cpp_pr") + Twine(Index)).str(); +} + namespace { /// Extend the generic ELFStreamer class so that it can emit mapping symbols at @@ -57,8 +64,9 @@ public: ARMELFStreamer(MCContext &Context, MCAsmBackend &TAB, raw_ostream &OS, MCCodeEmitter *Emitter, bool IsThumb) : MCELFStreamer(SK_ARMELFStreamer, Context, TAB, OS, Emitter), - IsThumb(IsThumb), MappingSymbolCounter(0), LastEMS(EMS_None), ExTab(0), - FnStart(0), Personality(0), CantUnwind(false) {} + IsThumb(IsThumb), MappingSymbolCounter(0), LastEMS(EMS_None) { + Reset(); + } ~ARMELFStreamer() {} @@ -75,14 +83,15 @@ public: virtual void EmitRegSave(const SmallVectorImpl<unsigned> &RegList, bool isVector); - virtual void ChangeSection(const MCSection *Section) { + virtual void ChangeSection(const MCSection *Section, + const MCExpr *Subsection) { // We have to keep track of the mapping symbol state of any sections we // use. Each one should start off as EMS_None, which is provided as the // default constructor by DenseMap::lookup. - LastMappingSymbols[getPreviousSection()] = LastEMS; + LastMappingSymbols[getPreviousSection().first] = LastEMS; LastEMS = LastMappingSymbols.lookup(Section); - MCELFStreamer::ChangeSection(Section); + MCELFStreamer::ChangeSection(Section, Subsection); } /// This function is the one used to emit instruction data into the ELF @@ -175,7 +184,7 @@ private: MCELF::SetType(SD, ELF::STT_NOTYPE); MCELF::SetBinding(SD, ELF::STB_LOCAL); SD.setExternal(false); - Symbol->setSection(*getCurrentSection()); + Symbol->setSection(*getCurrentSection().first); const MCExpr *Value = MCSymbolRefExpr::Create(Start, getContext()); Symbol->setVariableValue(Value); @@ -194,6 +203,7 @@ private: void Reset(); void EmitPersonalityFixup(StringRef Name); + void CollectUnwindOpcodes(); void SwitchToEHSection(const char *Prefix, unsigned Type, unsigned Flags, SectionKind Kind, const MCSymbol &Fn); @@ -210,9 +220,16 @@ private: MCSymbol *ExTab; MCSymbol *FnStart; const MCSymbol *Personality; + uint32_t VFPRegSave; // Register mask for {d31-d0} + uint32_t RegSave; // Register mask for {r15-r0} + int64_t SPOffset; + uint16_t FPReg; + int64_t FPOffset; + bool UsedFP; bool CantUnwind; + UnwindOpcodeAssembler UnwindOpAsm; }; -} +} // end anonymous namespace inline void ARMELFStreamer::SwitchToEHSection(const char *Prefix, unsigned Type, @@ -238,7 +255,7 @@ inline void ARMELFStreamer::SwitchToEHSection(const char *Prefix, } else { EHSection = getContext().getELFSection(EHSecName, Type, Flags, Kind); } - assert(EHSection); + assert(EHSection && "Failed to get the required EH section"); // Switch to .ARM.extab or .ARM.exidx section SwitchSection(EHSection); @@ -262,10 +279,20 @@ inline void ARMELFStreamer::SwitchToExIdxSection(const MCSymbol &FnStart) { } void ARMELFStreamer::Reset() { + const MCRegisterInfo &MRI = getContext().getRegisterInfo(); + ExTab = NULL; FnStart = NULL; Personality = NULL; + VFPRegSave = 0; + RegSave = 0; + FPReg = MRI.getEncodingValue(ARM::SP); + FPOffset = 0; + SPOffset = 0; + UsedFP = false; CantUnwind = false; + + UnwindOpAsm.Reset(); } // Add the R_ARM_NONE fixup at the same position @@ -284,6 +311,18 @@ void ARMELFStreamer::EmitPersonalityFixup(StringRef Name) { MCFixup::getKindForSize(4, false))); } +void ARMELFStreamer::CollectUnwindOpcodes() { + if (UsedFP) { + UnwindOpAsm.EmitSetFP(FPReg); + UnwindOpAsm.EmitSPOffset(-FPOffset); + } else { + UnwindOpAsm.EmitSPOffset(SPOffset); + } + UnwindOpAsm.EmitVFPRegSave(VFPRegSave); + UnwindOpAsm.EmitRegSave(RegSave); + UnwindOpAsm.Finalize(); +} + void ARMELFStreamer::EmitFnStart() { assert(FnStart == 0); FnStart = getContext().CreateTempSymbol(); @@ -294,35 +333,29 @@ void ARMELFStreamer::EmitFnEnd() { assert(FnStart && ".fnstart must preceeds .fnend"); // Emit unwind opcodes if there is no .handlerdata directive - int PersonalityIndex = -1; if (!ExTab && !CantUnwind) { - // For __aeabi_unwind_cpp_pr1, we have to emit opcodes in .ARM.extab. - SwitchToExTabSection(*FnStart); - - // Create .ARM.extab label for offset in .ARM.exidx - ExTab = getContext().CreateTempSymbol(); - EmitLabel(ExTab); - - PersonalityIndex = 1; - - uint32_t Entry = 0; - uint32_t NumExtraEntryWords = 0; - Entry |= NumExtraEntryWords << 24; - Entry |= (EHT_COMPACT | PersonalityIndex) << 16; - - // TODO: This should be generated according to .save, .vsave, .setfp - // directives. Currently, we are simply generating FINISH opcode. - Entry |= UNWIND_OPCODE_FINISH << 8; - Entry |= UNWIND_OPCODE_FINISH; - - EmitIntValue(Entry, 4, 0); + CollectUnwindOpcodes(); + + unsigned PersonalityIndex = UnwindOpAsm.getPersonalityIndex(); + if (PersonalityIndex == AEABI_UNWIND_CPP_PR1 || + PersonalityIndex == AEABI_UNWIND_CPP_PR2) { + // For the __aeabi_unwind_cpp_pr1 and __aeabi_unwind_cpp_pr2, we have to + // emit the unwind opcodes in the corresponding ".ARM.extab" section, and + // then emit a reference to these unwind opcodes in the second word of + // the exception index table entry. + SwitchToExTabSection(*FnStart); + ExTab = getContext().CreateTempSymbol(); + EmitLabel(ExTab); + EmitBytes(UnwindOpAsm.data(), 0); + } } // Emit the exception index table entry SwitchToExIdxSection(*FnStart); - if (PersonalityIndex == 1) - EmitPersonalityFixup("__aeabi_unwind_cpp_pr1"); + unsigned PersonalityIndex = UnwindOpAsm.getPersonalityIndex(); + if (PersonalityIndex < NUM_PERSONALITY_INDEX) + EmitPersonalityFixup(GetAEABIUnwindPersonalityName(PersonalityIndex)); const MCSymbolRefExpr *FnStartRef = MCSymbolRefExpr::Create(FnStart, @@ -333,12 +366,22 @@ void ARMELFStreamer::EmitFnEnd() { if (CantUnwind) { EmitIntValue(EXIDX_CANTUNWIND, 4, 0); - } else { + } else if (ExTab) { + // Emit a reference to the unwind opcodes in the ".ARM.extab" section. const MCSymbolRefExpr *ExTabEntryRef = MCSymbolRefExpr::Create(ExTab, MCSymbolRefExpr::VK_ARM_PREL31, getContext()); EmitValue(ExTabEntryRef, 4, 0); + } else { + // For the __aeabi_unwind_cpp_pr0, we have to emit the unwind opcodes in + // the second word of exception index table entry. The size of the unwind + // opcodes should always be 4 bytes. + assert(PersonalityIndex == AEABI_UNWIND_CPP_PR0 && + "Compact model must use __aeabi_cpp_unwind_pr0 as personality"); + assert(UnwindOpAsm.size() == 4u && + "Unwind opcode size for __aeabi_cpp_unwind_pr0 must be equal to 4"); + EmitBytes(UnwindOpAsm.data(), 0); } // Clean exception handling frame information @@ -368,36 +411,54 @@ void ARMELFStreamer::EmitHandlerData() { EmitValue(PersonalityRef, 4, 0); // Emit unwind opcodes - uint32_t Entry = 0; - uint32_t NumExtraEntryWords = 0; - - // TODO: This should be generated according to .save, .vsave, .setfp - // directives. Currently, we are simply generating FINISH opcode. - Entry |= NumExtraEntryWords << 24; - Entry |= UNWIND_OPCODE_FINISH << 16; - Entry |= UNWIND_OPCODE_FINISH << 8; - Entry |= UNWIND_OPCODE_FINISH; - - EmitIntValue(Entry, 4, 0); + CollectUnwindOpcodes(); + EmitBytes(UnwindOpAsm.data(), 0); } void ARMELFStreamer::EmitPersonality(const MCSymbol *Per) { Personality = Per; + UnwindOpAsm.setPersonality(Per); } -void ARMELFStreamer::EmitSetFP(unsigned NewFpReg, - unsigned NewSpReg, +void ARMELFStreamer::EmitSetFP(unsigned NewFPReg, + unsigned NewSPReg, int64_t Offset) { - // TODO: Not implemented + assert(SPOffset == 0 && + "Current implementation assumes .setfp precedes .pad"); + + const MCRegisterInfo &MRI = getContext().getRegisterInfo(); + + uint16_t NewFPRegEncVal = MRI.getEncodingValue(NewFPReg); +#ifndef NDEBUG + uint16_t NewSPRegEncVal = MRI.getEncodingValue(NewSPReg); +#endif + + assert((NewSPReg == ARM::SP || NewSPRegEncVal == FPReg) && + "the operand of .setfp directive should be either $sp or $fp"); + + UsedFP = true; + FPReg = NewFPRegEncVal; + FPOffset = Offset; } void ARMELFStreamer::EmitPad(int64_t Offset) { - // TODO: Not implemented + SPOffset += Offset; } void ARMELFStreamer::EmitRegSave(const SmallVectorImpl<unsigned> &RegList, bool IsVector) { - // TODO: Not implemented + const MCRegisterInfo &MRI = getContext().getRegisterInfo(); + +#ifndef NDEBUG + unsigned Max = IsVector ? 32 : 16; +#endif + uint32_t &RegMask = IsVector ? VFPRegSave : RegSave; + + for (size_t i = 0; i < RegList.size(); ++i) { + unsigned Reg = MRI.getEncodingValue(RegList[i]); + assert(Reg < Max && "Register encoded value out of range"); + RegMask |= 1u << Reg; + } } namespace llvm { diff --git a/lib/Target/ARM/MCTargetDesc/ARMUnwindOp.h b/lib/Target/ARM/MCTargetDesc/ARMUnwindOp.h index dad5576..fa4add6 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMUnwindOp.h +++ b/lib/Target/ARM/MCTargetDesc/ARMUnwindOp.h @@ -107,6 +107,19 @@ namespace llvm { UNWIND_OPCODE_POP_VFP_REG_RANGE_FSTMFDD_D8 = 0xd0 }; + /// ARM-defined Personality Routine Index + enum ARMPersonalityRoutineIndex { + // To make the exception handling table become more compact, ARM defined + // several personality routines in EHABI. There are 3 different + // personality routines in ARM EHABI currently. It is possible to have 16 + // pre-defined personality routines at most. + AEABI_UNWIND_CPP_PR0 = 0, + AEABI_UNWIND_CPP_PR1 = 1, + AEABI_UNWIND_CPP_PR2 = 2, + + NUM_PERSONALITY_INDEX + }; + } #endif // ARM_UNWIND_OP_H diff --git a/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.cpp b/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.cpp new file mode 100644 index 0000000..191db69 --- /dev/null +++ b/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.cpp @@ -0,0 +1,198 @@ +//===-- ARMUnwindOpAsm.cpp - ARM Unwind Opcodes Assembler -------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the unwind opcode assmebler for ARM exception handling +// table. +// +//===----------------------------------------------------------------------===// + +#include "ARMUnwindOpAsm.h" + +#include "ARMUnwindOp.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/LEB128.h" + +using namespace llvm; + +void UnwindOpcodeAssembler::EmitRegSave(uint32_t RegSave) { + if (RegSave == 0u) + return; + + // One byte opcode to save register r14 and r11-r4 + if (RegSave & (1u << 4)) { + // The one byte opcode will always save r4, thus we can't use the one byte + // opcode when r4 is not in .save directive. + + // Compute the consecutive registers from r4 to r11. + uint32_t Range = 0; + uint32_t Mask = (1u << 4); + for (uint32_t Bit = (1u << 5); Bit < (1u << 12); Bit <<= 1) { + if ((RegSave & Bit) == 0u) + break; + ++Range; + Mask |= Bit; + } + + // Emit this opcode when the mask covers every registers. + uint32_t UnmaskedReg = RegSave & 0xfff0u & (~Mask); + if (UnmaskedReg == 0u) { + // Pop r[4 : (4 + n)] + Ops.push_back(UNWIND_OPCODE_POP_REG_RANGE_R4 | Range); + RegSave &= 0x000fu; + } else if (UnmaskedReg == (1u << 14)) { + // Pop r[14] + r[4 : (4 + n)] + Ops.push_back(UNWIND_OPCODE_POP_REG_RANGE_R4_R14 | Range); + RegSave &= 0x000fu; + } + } + + // Two bytes opcode to save register r15-r4 + if ((RegSave & 0xfff0u) != 0) { + uint32_t Op = UNWIND_OPCODE_POP_REG_MASK_R4 | (RegSave >> 4); + Ops.push_back(static_cast<uint8_t>(Op >> 8)); + Ops.push_back(static_cast<uint8_t>(Op & 0xff)); + } + + // Opcode to save register r3-r0 + if ((RegSave & 0x000fu) != 0) { + uint32_t Op = UNWIND_OPCODE_POP_REG_MASK | (RegSave & 0x000fu); + Ops.push_back(static_cast<uint8_t>(Op >> 8)); + Ops.push_back(static_cast<uint8_t>(Op & 0xff)); + } +} + +/// Emit unwind opcodes for .vsave directives +void UnwindOpcodeAssembler::EmitVFPRegSave(uint32_t VFPRegSave) { + size_t i = 32; + + while (i > 16) { + uint32_t Bit = 1u << (i - 1); + if ((VFPRegSave & Bit) == 0u) { + --i; + continue; + } + + uint32_t Range = 0; + + --i; + Bit >>= 1; + + while (i > 16 && (VFPRegSave & Bit)) { + --i; + ++Range; + Bit >>= 1; + } + + uint32_t Op = + UNWIND_OPCODE_POP_VFP_REG_RANGE_FSTMFDD_D16 | ((i - 16) << 4) | Range; + Ops.push_back(static_cast<uint8_t>(Op >> 8)); + Ops.push_back(static_cast<uint8_t>(Op & 0xff)); + } + + while (i > 0) { + uint32_t Bit = 1u << (i - 1); + if ((VFPRegSave & Bit) == 0u) { + --i; + continue; + } + + uint32_t Range = 0; + + --i; + Bit >>= 1; + + while (i > 0 && (VFPRegSave & Bit)) { + --i; + ++Range; + Bit >>= 1; + } + + uint32_t Op = UNWIND_OPCODE_POP_VFP_REG_RANGE_FSTMFDD | (i << 4) | Range; + Ops.push_back(static_cast<uint8_t>(Op >> 8)); + Ops.push_back(static_cast<uint8_t>(Op & 0xff)); + } +} + +/// Emit unwind opcodes for .setfp directives +void UnwindOpcodeAssembler::EmitSetFP(uint16_t FPReg) { + Ops.push_back(UNWIND_OPCODE_SET_VSP | FPReg); +} + +/// Emit unwind opcodes to update stack pointer +void UnwindOpcodeAssembler::EmitSPOffset(int64_t Offset) { + if (Offset > 0x200) { + uint8_t Buff[10]; + size_t Size = encodeULEB128((Offset - 0x204) >> 2, Buff); + Ops.push_back(UNWIND_OPCODE_INC_VSP_ULEB128); + Ops.append(Buff, Buff + Size); + } else if (Offset > 0) { + if (Offset > 0x100) { + Ops.push_back(UNWIND_OPCODE_INC_VSP | 0x3fu); + Offset -= 0x100; + } + Ops.push_back(UNWIND_OPCODE_INC_VSP | + static_cast<uint8_t>((Offset - 4) >> 2)); + } else if (Offset < 0) { + while (Offset < -0x100) { + Ops.push_back(UNWIND_OPCODE_DEC_VSP | 0x3fu); + Offset += 0x100; + } + Ops.push_back(UNWIND_OPCODE_DEC_VSP | + static_cast<uint8_t>(((-Offset) - 4) >> 2)); + } +} + +void UnwindOpcodeAssembler::AddOpcodeSizePrefix(size_t Pos) { + size_t SizeInWords = (size() + 3) / 4; + assert(SizeInWords <= 0x100u && + "Only 256 additional words are allowed for unwind opcodes"); + Ops[Pos] = static_cast<uint8_t>(SizeInWords - 1); +} + +void UnwindOpcodeAssembler::AddPersonalityIndexPrefix(size_t Pos, unsigned PI) { + assert(PI < NUM_PERSONALITY_INDEX && "Invalid personality prefix"); + Ops[Pos] = EHT_COMPACT | PI; +} + +void UnwindOpcodeAssembler::EmitFinishOpcodes() { + for (size_t i = (0x4u - (size() & 0x3u)) & 0x3u; i > 0; --i) + Ops.push_back(UNWIND_OPCODE_FINISH); +} + +void UnwindOpcodeAssembler::Finalize() { + if (HasPersonality) { + // Personality specified by .personality directive + Offset = 1; + AddOpcodeSizePrefix(1); + } else { + if (getOpcodeSize() <= 3) { + // __aeabi_unwind_cpp_pr0: [ 0x80 , OP1 , OP2 , OP3 ] + Offset = 1; + PersonalityIndex = AEABI_UNWIND_CPP_PR0; + AddPersonalityIndexPrefix(Offset, PersonalityIndex); + } else { + // __aeabi_unwind_cpp_pr1: [ 0x81 , SIZE , OP1 , OP2 , ... ] + Offset = 0; + PersonalityIndex = AEABI_UNWIND_CPP_PR1; + AddPersonalityIndexPrefix(Offset, PersonalityIndex); + AddOpcodeSizePrefix(1); + } + } + + // Emit the padding finish opcodes if the size() is not multiple of 4. + EmitFinishOpcodes(); + + // Swap the byte order + uint8_t *Ptr = Ops.begin() + Offset; + assert(size() % 4 == 0 && "Final unwind opcodes should align to 4"); + for (size_t i = 0, n = size(); i < n; i += 4) { + std::swap(Ptr[i], Ptr[i + 3]); + std::swap(Ptr[i + 1], Ptr[i + 2]); + } +} diff --git a/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.h b/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.h new file mode 100644 index 0000000..f6ecaeb --- /dev/null +++ b/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.h @@ -0,0 +1,114 @@ +//===-- ARMUnwindOpAsm.h - ARM Unwind Opcodes Assembler ---------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file declares the unwind opcode assmebler for ARM exception handling +// table. +// +//===----------------------------------------------------------------------===// + +#ifndef ARM_UNWIND_OP_ASM_H +#define ARM_UNWIND_OP_ASM_H + +#include "ARMUnwindOp.h" + +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/DataTypes.h" + +namespace llvm { + +class MCSymbol; + +class UnwindOpcodeAssembler { +private: + llvm::SmallVector<uint8_t, 8> Ops; + + unsigned Offset; + unsigned PersonalityIndex; + bool HasPersonality; + + enum { + // The number of bytes to be preserved for the size and personality index + // prefix of unwind opcodes. + NUM_PRESERVED_PREFIX_BUF = 2 + }; + +public: + UnwindOpcodeAssembler() + : Ops(NUM_PRESERVED_PREFIX_BUF), Offset(NUM_PRESERVED_PREFIX_BUF), + PersonalityIndex(NUM_PERSONALITY_INDEX), HasPersonality(0) { + } + + /// Reset the unwind opcode assembler. + void Reset() { + Ops.resize(NUM_PRESERVED_PREFIX_BUF); + Offset = NUM_PRESERVED_PREFIX_BUF; + PersonalityIndex = NUM_PERSONALITY_INDEX; + HasPersonality = 0; + } + + /// Get the size of the payload (including the size byte) + size_t size() const { + return Ops.size() - Offset; + } + + /// Get the beginning of the payload + const uint8_t *begin() const { + return Ops.begin() + Offset; + } + + /// Get the payload + StringRef data() const { + return StringRef(reinterpret_cast<const char *>(begin()), size()); + } + + /// Set the personality index + void setPersonality(const MCSymbol *Per) { + HasPersonality = 1; + } + + /// Get the personality index + unsigned getPersonalityIndex() const { + return PersonalityIndex; + } + + /// Emit unwind opcodes for .save directives + void EmitRegSave(uint32_t RegSave); + + /// Emit unwind opcodes for .vsave directives + void EmitVFPRegSave(uint32_t VFPRegSave); + + /// Emit unwind opcodes for .setfp directives + void EmitSetFP(uint16_t FPReg); + + /// Emit unwind opcodes to update stack pointer + void EmitSPOffset(int64_t Offset); + + /// Finalize the unwind opcode sequence for EmitBytes() + void Finalize(); + +private: + /// Get the size of the opcodes in bytes. + size_t getOpcodeSize() const { + return Ops.size() - NUM_PRESERVED_PREFIX_BUF; + } + + /// Add the length prefix to the payload + void AddOpcodeSizePrefix(size_t Pos); + + /// Add personality index prefix in some compact format + void AddPersonalityIndexPrefix(size_t Pos, unsigned PersonalityIndex); + + /// Fill the words with finish opcode if it is not aligned + void EmitFinishOpcodes(); +}; + +} // namespace llvm + +#endif // ARM_UNWIND_OP_ASM_H diff --git a/lib/Target/ARM/MCTargetDesc/CMakeLists.txt b/lib/Target/ARM/MCTargetDesc/CMakeLists.txt index e17eb4d..a7ac5ca 100644 --- a/lib/Target/ARM/MCTargetDesc/CMakeLists.txt +++ b/lib/Target/ARM/MCTargetDesc/CMakeLists.txt @@ -8,6 +8,7 @@ add_llvm_library(LLVMARMDesc ARMMCTargetDesc.cpp ARMMachObjectWriter.cpp ARMELFObjectWriter.cpp + ARMUnwindOpAsm.cpp ) add_dependencies(LLVMARMDesc ARMCommonTableGen) diff --git a/lib/Target/ARM/README-Thumb.txt b/lib/Target/ARM/README-Thumb.txt index 463c440..a64707e 100644 --- a/lib/Target/ARM/README-Thumb.txt +++ b/lib/Target/ARM/README-Thumb.txt @@ -173,7 +173,6 @@ GCC is doing a couple of clever things here: mov r1, #1 lsl r1, r1, #8 tst r2, r1 - //===---------------------------------------------------------------------===// @@ -196,7 +195,6 @@ This is especially bad when dynamic alloca is used. The all fixed size stack objects are referenced off the frame pointer with negative offsets. See oggenc for an example. - //===---------------------------------------------------------------------===// Poor codegen test/CodeGen/ARM/select.ll f7: diff --git a/lib/Target/ARM/Thumb1FrameLowering.cpp b/lib/Target/ARM/Thumb1FrameLowering.cpp index 2c3388c..1e2a8b0 100644 --- a/lib/Target/ARM/Thumb1FrameLowering.cpp +++ b/lib/Target/ARM/Thumb1FrameLowering.cpp @@ -88,7 +88,7 @@ void Thumb1FrameLowering::emitPrologue(MachineFunction &MF) const { const Thumb1InstrInfo &TII = *static_cast<const Thumb1InstrInfo*>(MF.getTarget().getInstrInfo()); - unsigned VARegSaveSize = AFI->getVarArgsRegSaveSize(); + unsigned ArgRegsSaveSize = AFI->getArgRegsSaveSize(); unsigned NumBytes = MFI->getStackSize(); const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo(); DebugLoc dl = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc(); @@ -104,8 +104,8 @@ void Thumb1FrameLowering::emitPrologue(MachineFunction &MF) const { unsigned GPRCS1Size = 0, GPRCS2Size = 0, DPRCSSize = 0; int FramePtrSpillFI = 0; - if (VARegSaveSize) - emitSPUpdate(MBB, MBBI, TII, dl, *RegInfo, -VARegSaveSize, + if (ArgRegsSaveSize) + emitSPUpdate(MBB, MBBI, TII, dl, *RegInfo, -ArgRegsSaveSize, MachineInstr::FrameSetup); if (!AFI->hasStackFrame()) { @@ -249,7 +249,7 @@ void Thumb1FrameLowering::emitEpilogue(MachineFunction &MF, const Thumb1InstrInfo &TII = *static_cast<const Thumb1InstrInfo*>(MF.getTarget().getInstrInfo()); - unsigned VARegSaveSize = AFI->getVarArgsRegSaveSize(); + unsigned ArgRegsSaveSize = AFI->getArgRegsSaveSize(); int NumBytes = (int)MFI->getStackSize(); const uint16_t *CSRegs = RegInfo->getCalleeSavedRegs(); unsigned FramePtr = RegInfo->getFrameRegister(MF); @@ -300,7 +300,7 @@ void Thumb1FrameLowering::emitEpilogue(MachineFunction &MF, } } - if (VARegSaveSize) { + if (ArgRegsSaveSize) { // Unlike T2 and ARM mode, the T1 pop instruction cannot restore // to LR, and we can't pop the value directly to the PC since // we need to update the SP after popping the value. Therefore, we @@ -313,7 +313,7 @@ void Thumb1FrameLowering::emitEpilogue(MachineFunction &MF, AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tPOP))) .addReg(ARM::R3, RegState::Define); - emitSPUpdate(MBB, MBBI, TII, dl, *RegInfo, VARegSaveSize); + emitSPUpdate(MBB, MBBI, TII, dl, *RegInfo, ArgRegsSaveSize); MachineInstrBuilder MIB = BuildMI(MBB, MBBI, dl, TII.get(ARM::tBX_RET_vararg)) @@ -376,7 +376,7 @@ restoreCalleeSavedRegisters(MachineBasicBlock &MBB, ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo(); - bool isVarArg = AFI->getVarArgsRegSaveSize() > 0; + bool isVarArg = AFI->getArgRegsSaveSize() > 0; DebugLoc DL = MI->getDebugLoc(); MachineInstrBuilder MIB = BuildMI(MF, DL, TII.get(ARM::tPOP)); AddDefaultPred(MIB); diff --git a/lib/Target/ARM/Thumb1RegisterInfo.cpp b/lib/Target/ARM/Thumb1RegisterInfo.cpp index 609d502..7452fb7 100644 --- a/lib/Target/ARM/Thumb1RegisterInfo.cpp +++ b/lib/Target/ARM/Thumb1RegisterInfo.cpp @@ -588,7 +588,7 @@ Thumb1RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, // means the stack pointer cannot be used to access the emergency spill slot // when !hasReservedCallFrame(). #ifndef NDEBUG - if (RS && FrameReg == ARM::SP && FrameIndex == RS->getScavengingFrameIndex()){ + if (RS && FrameReg == ARM::SP && RS->isScavengingFrameIndex(FrameIndex)){ assert(MF.getTarget().getFrameLowering()->hasReservedCallFrame(MF) && "Cannot use SP to access the emergency spill slot in " "functions without a reserved call frame"); diff --git a/lib/Target/ARM/Thumb2InstrInfo.cpp b/lib/Target/ARM/Thumb2InstrInfo.cpp index 67e8ec7..a1b48c2 100644 --- a/lib/Target/ARM/Thumb2InstrInfo.cpp +++ b/lib/Target/ARM/Thumb2InstrInfo.cpp @@ -19,6 +19,7 @@ #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineMemOperand.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/MC/MCInst.h" #include "llvm/Support/CommandLine.h" @@ -126,25 +127,41 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, unsigned SrcReg, bool isKill, int FI, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const { + DebugLoc DL; + if (I != MBB.end()) DL = I->getDebugLoc(); + + MachineFunction &MF = *MBB.getParent(); + MachineFrameInfo &MFI = *MF.getFrameInfo(); + MachineMemOperand *MMO = + MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(FI), + MachineMemOperand::MOStore, + MFI.getObjectSize(FI), + MFI.getObjectAlignment(FI)); + if (RC == &ARM::GPRRegClass || RC == &ARM::tGPRRegClass || RC == &ARM::tcGPRRegClass || RC == &ARM::rGPRRegClass || RC == &ARM::GPRnopcRegClass) { - DebugLoc DL; - if (I != MBB.end()) DL = I->getDebugLoc(); - - MachineFunction &MF = *MBB.getParent(); - MachineFrameInfo &MFI = *MF.getFrameInfo(); - MachineMemOperand *MMO = - MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(FI), - MachineMemOperand::MOStore, - MFI.getObjectSize(FI), - MFI.getObjectAlignment(FI)); AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::t2STRi12)) .addReg(SrcReg, getKillRegState(isKill)) .addFrameIndex(FI).addImm(0).addMemOperand(MMO)); return; } + if (ARM::GPRPairRegClass.hasSubClassEq(RC)) { + // Thumb2 STRD expects its dest-registers to be in rGPR. Not a problem for + // gsub_0, but needs an extra constraint for gsub_1 (which could be sp + // otherwise). + MachineRegisterInfo *MRI = &MF.getRegInfo(); + MRI->constrainRegClass(SrcReg, &ARM::GPRPair_with_gsub_1_in_rGPRRegClass); + + MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(ARM::t2STRDi8)); + AddDReg(MIB, SrcReg, ARM::gsub_0, getKillRegState(isKill), TRI); + AddDReg(MIB, SrcReg, ARM::gsub_1, 0, TRI); + MIB.addFrameIndex(FI).addImm(0).addMemOperand(MMO); + AddDefaultPred(MIB); + return; + } + ARMBaseInstrInfo::storeRegToStackSlot(MBB, I, SrcReg, isKill, FI, RC, TRI); } @@ -153,24 +170,42 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, unsigned DestReg, int FI, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const { + MachineFunction &MF = *MBB.getParent(); + MachineFrameInfo &MFI = *MF.getFrameInfo(); + MachineMemOperand *MMO = + MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(FI), + MachineMemOperand::MOLoad, + MFI.getObjectSize(FI), + MFI.getObjectAlignment(FI)); + DebugLoc DL; + if (I != MBB.end()) DL = I->getDebugLoc(); + if (RC == &ARM::GPRRegClass || RC == &ARM::tGPRRegClass || RC == &ARM::tcGPRRegClass || RC == &ARM::rGPRRegClass || RC == &ARM::GPRnopcRegClass) { - DebugLoc DL; - if (I != MBB.end()) DL = I->getDebugLoc(); - - MachineFunction &MF = *MBB.getParent(); - MachineFrameInfo &MFI = *MF.getFrameInfo(); - MachineMemOperand *MMO = - MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(FI), - MachineMemOperand::MOLoad, - MFI.getObjectSize(FI), - MFI.getObjectAlignment(FI)); AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::t2LDRi12), DestReg) .addFrameIndex(FI).addImm(0).addMemOperand(MMO)); return; } + if (ARM::GPRPairRegClass.hasSubClassEq(RC)) { + // Thumb2 LDRD expects its dest-registers to be in rGPR. Not a problem for + // gsub_0, but needs an extra constraint for gsub_1 (which could be sp + // otherwise). + MachineRegisterInfo *MRI = &MF.getRegInfo(); + MRI->constrainRegClass(DestReg, &ARM::GPRPair_with_gsub_1_in_rGPRRegClass); + + MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(ARM::t2LDRDi8)); + AddDReg(MIB, DestReg, ARM::gsub_0, RegState::DefineNoRead, TRI); + AddDReg(MIB, DestReg, ARM::gsub_1, RegState::DefineNoRead, TRI); + MIB.addFrameIndex(FI).addImm(0).addMemOperand(MMO); + AddDefaultPred(MIB); + + if (TargetRegisterInfo::isPhysicalRegister(DestReg)) + MIB.addReg(DestReg, RegState::ImplicitDefine); + return; + } + ARMBaseInstrInfo::loadRegFromStackSlot(MBB, I, DestReg, FI, RC, TRI); } @@ -514,6 +549,15 @@ bool llvm::rewriteT2FrameIndex(MachineInstr &MI, unsigned FrameRegIdx, Offset = -Offset; isSub = true; } + } else if (AddrMode == ARMII::AddrModeT2_i8s4) { + Offset += MI.getOperand(FrameRegIdx + 1).getImm() * 4; + NumBits = 8; + // MCInst operand has already scaled value. + Scale = 1; + if (Offset < 0) { + isSub = true; + Offset = -Offset; + } } else { llvm_unreachable("Unsupported addressing mode!"); } diff --git a/lib/Target/ARM/Thumb2SizeReduction.cpp b/lib/Target/ARM/Thumb2SizeReduction.cpp index 567bc05..4795aae 100644 --- a/lib/Target/ARM/Thumb2SizeReduction.cpp +++ b/lib/Target/ARM/Thumb2SizeReduction.cpp @@ -15,6 +15,7 @@ #include "MCTargetDesc/ARMAddressingModes.h" #include "Thumb2InstrInfo.h" #include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/Statistic.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstr.h" @@ -79,11 +80,8 @@ namespace { { ARM::t2LSLrr, 0, ARM::tLSLrr, 0, 0, 0, 1, 0,0, 1,0,1 }, { ARM::t2LSRri, ARM::tLSRri, 0, 5, 0, 1, 0, 0,0, 1,0,1 }, { ARM::t2LSRrr, 0, ARM::tLSRrr, 0, 0, 0, 1, 0,0, 1,0,1 }, - // FIXME: tMOVi8 and tMVN also partially update CPSR but they are less - // likely to cause issue in the loop. As a size / performance workaround, - // they are not marked as such. - { ARM::t2MOVi, ARM::tMOVi8, 0, 8, 0, 1, 0, 0,0, 0,0,0 }, - { ARM::t2MOVi16,ARM::tMOVi8, 0, 8, 0, 1, 0, 0,0, 0,1,0 }, + { ARM::t2MOVi, ARM::tMOVi8, 0, 8, 0, 1, 0, 0,0, 1,0,0 }, + { ARM::t2MOVi16,ARM::tMOVi8, 0, 8, 0, 1, 0, 0,0, 1,1,0 }, // FIXME: Do we need the 16-bit 'S' variant? { ARM::t2MOVr,ARM::tMOVr, 0, 0, 0, 0, 0, 1,0, 0,0,0 }, { ARM::t2MUL, 0, ARM::tMUL, 0, 0, 0, 1, 0,0, 1,0,0 }, @@ -149,8 +147,7 @@ namespace { /// ReduceOpcodeMap - Maps wide opcode to index of entry in ReduceTable. DenseMap<unsigned, unsigned> ReduceOpcodeMap; - bool canAddPseudoFlagDep(MachineInstr *Def, MachineInstr *Use, - bool IsSelfLoop); + bool canAddPseudoFlagDep(MachineInstr *Use, bool IsSelfLoop); bool VerifyPredAndCC(MachineInstr *MI, const ReduceEntry &Entry, bool is2Addr, ARMCC::CondCodes Pred, @@ -160,33 +157,46 @@ namespace { const ReduceEntry &Entry); bool ReduceSpecial(MachineBasicBlock &MBB, MachineInstr *MI, - const ReduceEntry &Entry, bool LiveCPSR, - MachineInstr *CPSRDef, bool IsSelfLoop); + const ReduceEntry &Entry, bool LiveCPSR, bool IsSelfLoop); /// ReduceTo2Addr - Reduce a 32-bit instruction to a 16-bit two-address /// instruction. bool ReduceTo2Addr(MachineBasicBlock &MBB, MachineInstr *MI, - const ReduceEntry &Entry, - bool LiveCPSR, MachineInstr *CPSRDef, + const ReduceEntry &Entry, bool LiveCPSR, bool IsSelfLoop); /// ReduceToNarrow - Reduce a 32-bit instruction to a 16-bit /// non-two-address instruction. bool ReduceToNarrow(MachineBasicBlock &MBB, MachineInstr *MI, - const ReduceEntry &Entry, - bool LiveCPSR, MachineInstr *CPSRDef, + const ReduceEntry &Entry, bool LiveCPSR, bool IsSelfLoop); /// ReduceMI - Attempt to reduce MI, return true on success. bool ReduceMI(MachineBasicBlock &MBB, MachineInstr *MI, - bool LiveCPSR, MachineInstr *CPSRDef, - bool IsSelfLoop); + bool LiveCPSR, bool IsSelfLoop); /// ReduceMBB - Reduce width of instructions in the specified basic block. bool ReduceMBB(MachineBasicBlock &MBB); bool OptimizeSize; bool MinimizeSize; + + // Last instruction to define CPSR in the current block. + MachineInstr *CPSRDef; + // Was CPSR last defined by a high latency instruction? + // When CPSRDef is null, this refers to CPSR defs in predecessors. + bool HighLatencyCPSR; + + struct MBBInfo { + // The flags leaving this block have high latency. + bool HighLatencyCPSR; + // Has this block been visited yet? + bool Visited; + + MBBInfo() : HighLatencyCPSR(false), Visited(false) {} + }; + + SmallVector<MBBInfo, 8> BlockInfo; }; char Thumb2SizeReduce::ID = 0; } @@ -207,6 +217,16 @@ static bool HasImplicitCPSRDef(const MCInstrDesc &MCID) { return false; } +// Check for a likely high-latency flag def. +static bool isHighLatencyCPSR(MachineInstr *Def) { + switch(Def->getOpcode()) { + case ARM::FMSTAT: + case ARM::tMUL: + return true; + } + return false; +} + /// canAddPseudoFlagDep - For A9 (and other out-of-order) implementations, /// the 's' 16-bit instruction partially update CPSR. Abort the /// transformation to avoid adding false dependency on last CPSR setting @@ -225,20 +245,19 @@ static bool HasImplicitCPSRDef(const MCInstrDesc &MCID) { /// In this case it would have been ok to narrow the mul.w to muls since there /// are indirect RAW dependency between the muls and the mul.w bool -Thumb2SizeReduce::canAddPseudoFlagDep(MachineInstr *Def, MachineInstr *Use, - bool FirstInSelfLoop) { +Thumb2SizeReduce::canAddPseudoFlagDep(MachineInstr *Use, bool FirstInSelfLoop) { // Disable the check for -Oz (aka OptimizeForSizeHarder). if (MinimizeSize || !STI->avoidCPSRPartialUpdate()) return false; - if (!Def) + if (!CPSRDef) // If this BB loops back to itself, conservatively avoid narrowing the // first instruction that does partial flag update. - return FirstInSelfLoop; + return HighLatencyCPSR || FirstInSelfLoop; SmallSet<unsigned, 2> Defs; - for (unsigned i = 0, e = Def->getNumOperands(); i != e; ++i) { - const MachineOperand &MO = Def->getOperand(i); + for (unsigned i = 0, e = CPSRDef->getNumOperands(); i != e; ++i) { + const MachineOperand &MO = CPSRDef->getOperand(i); if (!MO.isReg() || MO.isUndef() || MO.isUse()) continue; unsigned Reg = MO.getReg(); @@ -256,6 +275,16 @@ Thumb2SizeReduce::canAddPseudoFlagDep(MachineInstr *Def, MachineInstr *Use, return false; } + // If the current CPSR has high latency, try to avoid the false dependency. + if (HighLatencyCPSR) + return true; + + // tMOVi8 usually doesn't start long dependency chains, and there are a lot + // of them, so always shrink them when CPSR doesn't have high latency. + if (Use->getOpcode() == ARM::t2MOVi || + Use->getOpcode() == ARM::t2MOVi16) + return false; + // No read-after-write dependency. The narrowing will add false dependency. return true; } @@ -498,16 +527,15 @@ Thumb2SizeReduce::ReduceLoadStore(MachineBasicBlock &MBB, MachineInstr *MI, bool Thumb2SizeReduce::ReduceSpecial(MachineBasicBlock &MBB, MachineInstr *MI, const ReduceEntry &Entry, - bool LiveCPSR, MachineInstr *CPSRDef, - bool IsSelfLoop) { + bool LiveCPSR, bool IsSelfLoop) { unsigned Opc = MI->getOpcode(); if (Opc == ARM::t2ADDri) { // If the source register is SP, try to reduce to tADDrSPi, otherwise // it's a normal reduce. if (MI->getOperand(1).getReg() != ARM::SP) { - if (ReduceTo2Addr(MBB, MI, Entry, LiveCPSR, CPSRDef, IsSelfLoop)) + if (ReduceTo2Addr(MBB, MI, Entry, LiveCPSR, IsSelfLoop)) return true; - return ReduceToNarrow(MBB, MI, Entry, LiveCPSR, CPSRDef, IsSelfLoop); + return ReduceToNarrow(MBB, MI, Entry, LiveCPSR, IsSelfLoop); } // Try to reduce to tADDrSPi. unsigned Imm = MI->getOperand(2).getImm(); @@ -557,12 +585,12 @@ Thumb2SizeReduce::ReduceSpecial(MachineBasicBlock &MBB, MachineInstr *MI, switch (Opc) { default: break; case ARM::t2ADDSri: { - if (ReduceTo2Addr(MBB, MI, Entry, LiveCPSR, CPSRDef, IsSelfLoop)) + if (ReduceTo2Addr(MBB, MI, Entry, LiveCPSR, IsSelfLoop)) return true; // fallthrough } case ARM::t2ADDSrr: - return ReduceToNarrow(MBB, MI, Entry, LiveCPSR, CPSRDef, IsSelfLoop); + return ReduceToNarrow(MBB, MI, Entry, LiveCPSR, IsSelfLoop); } } break; @@ -574,13 +602,13 @@ Thumb2SizeReduce::ReduceSpecial(MachineBasicBlock &MBB, MachineInstr *MI, case ARM::t2UXTB: case ARM::t2UXTH: if (MI->getOperand(2).getImm() == 0) - return ReduceToNarrow(MBB, MI, Entry, LiveCPSR, CPSRDef, IsSelfLoop); + return ReduceToNarrow(MBB, MI, Entry, LiveCPSR, IsSelfLoop); break; case ARM::t2MOVi16: // Can convert only 'pure' immediate operands, not immediates obtained as // globals' addresses. if (MI->getOperand(1).isImm()) - return ReduceToNarrow(MBB, MI, Entry, LiveCPSR, CPSRDef, IsSelfLoop); + return ReduceToNarrow(MBB, MI, Entry, LiveCPSR, IsSelfLoop); break; case ARM::t2CMPrr: { // Try to reduce to the lo-reg only version first. Why there are two @@ -590,9 +618,9 @@ Thumb2SizeReduce::ReduceSpecial(MachineBasicBlock &MBB, MachineInstr *MI, // source insn opcode. So for now, we hack a local entry record to use. static const ReduceEntry NarrowEntry = { ARM::t2CMPrr,ARM::tCMPr, 0, 0, 0, 1, 1,2, 0, 0,1,0 }; - if (ReduceToNarrow(MBB, MI, NarrowEntry, LiveCPSR, CPSRDef, IsSelfLoop)) + if (ReduceToNarrow(MBB, MI, NarrowEntry, LiveCPSR, IsSelfLoop)) return true; - return ReduceToNarrow(MBB, MI, Entry, LiveCPSR, CPSRDef, IsSelfLoop); + return ReduceToNarrow(MBB, MI, Entry, LiveCPSR, IsSelfLoop); } } return false; @@ -601,8 +629,7 @@ Thumb2SizeReduce::ReduceSpecial(MachineBasicBlock &MBB, MachineInstr *MI, bool Thumb2SizeReduce::ReduceTo2Addr(MachineBasicBlock &MBB, MachineInstr *MI, const ReduceEntry &Entry, - bool LiveCPSR, MachineInstr *CPSRDef, - bool IsSelfLoop) { + bool LiveCPSR, bool IsSelfLoop) { if (ReduceLimit2Addr != -1 && ((int)Num2Addrs >= ReduceLimit2Addr)) return false; @@ -683,7 +710,7 @@ Thumb2SizeReduce::ReduceTo2Addr(MachineBasicBlock &MBB, MachineInstr *MI, // Avoid adding a false dependency on partial flag update by some 16-bit // instructions which has the 's' bit set. if (Entry.PartFlag && NewMCID.hasOptionalDef() && HasCC && - canAddPseudoFlagDep(CPSRDef, MI, IsSelfLoop)) + canAddPseudoFlagDep(MI, IsSelfLoop)) return false; // Add the 16-bit instruction. @@ -720,8 +747,7 @@ Thumb2SizeReduce::ReduceTo2Addr(MachineBasicBlock &MBB, MachineInstr *MI, bool Thumb2SizeReduce::ReduceToNarrow(MachineBasicBlock &MBB, MachineInstr *MI, const ReduceEntry &Entry, - bool LiveCPSR, MachineInstr *CPSRDef, - bool IsSelfLoop) { + bool LiveCPSR, bool IsSelfLoop) { if (ReduceLimit != -1 && ((int)NumNarrows >= ReduceLimit)) return false; @@ -780,7 +806,7 @@ Thumb2SizeReduce::ReduceToNarrow(MachineBasicBlock &MBB, MachineInstr *MI, // Avoid adding a false dependency on partial flag update by some 16-bit // instructions which has the 's' bit set. if (Entry.PartFlag && NewMCID.hasOptionalDef() && HasCC && - canAddPseudoFlagDep(CPSRDef, MI, IsSelfLoop)) + canAddPseudoFlagDep(MI, IsSelfLoop)) return false; // Add the 16-bit instruction. @@ -865,8 +891,7 @@ static bool UpdateCPSRUse(MachineInstr &MI, bool LiveCPSR) { } bool Thumb2SizeReduce::ReduceMI(MachineBasicBlock &MBB, MachineInstr *MI, - bool LiveCPSR, MachineInstr *CPSRDef, - bool IsSelfLoop) { + bool LiveCPSR, bool IsSelfLoop) { unsigned Opcode = MI->getOpcode(); DenseMap<unsigned, unsigned>::iterator OPI = ReduceOpcodeMap.find(Opcode); if (OPI == ReduceOpcodeMap.end()) @@ -875,16 +900,16 @@ bool Thumb2SizeReduce::ReduceMI(MachineBasicBlock &MBB, MachineInstr *MI, // Don't attempt normal reductions on "special" cases for now. if (Entry.Special) - return ReduceSpecial(MBB, MI, Entry, LiveCPSR, CPSRDef, IsSelfLoop); + return ReduceSpecial(MBB, MI, Entry, LiveCPSR, IsSelfLoop); // Try to transform to a 16-bit two-address instruction. if (Entry.NarrowOpc2 && - ReduceTo2Addr(MBB, MI, Entry, LiveCPSR, CPSRDef, IsSelfLoop)) + ReduceTo2Addr(MBB, MI, Entry, LiveCPSR, IsSelfLoop)) return true; // Try to transform to a 16-bit non-two-address instruction. if (Entry.NarrowOpc1 && - ReduceToNarrow(MBB, MI, Entry, LiveCPSR, CPSRDef, IsSelfLoop)) + ReduceToNarrow(MBB, MI, Entry, LiveCPSR, IsSelfLoop)) return true; return false; @@ -895,9 +920,25 @@ bool Thumb2SizeReduce::ReduceMBB(MachineBasicBlock &MBB) { // Yes, CPSR could be livein. bool LiveCPSR = MBB.isLiveIn(ARM::CPSR); - MachineInstr *CPSRDef = 0; MachineInstr *BundleMI = 0; + CPSRDef = 0; + HighLatencyCPSR = false; + + // Check predecessors for the latest CPSRDef. + for (MachineBasicBlock::pred_iterator + I = MBB.pred_begin(), E = MBB.pred_end(); I != E; ++I) { + const MBBInfo &PInfo = BlockInfo[(*I)->getNumber()]; + if (!PInfo.Visited) { + // Since blocks are visited in RPO, this must be a back-edge. + continue; + } + if (PInfo.HighLatencyCPSR) { + HighLatencyCPSR = true; + break; + } + } + // If this BB loops back to itself, conservatively avoid narrowing the // first instruction that does partial flag update. bool IsSelfLoop = MBB.isSuccessor(&MBB); @@ -911,13 +952,15 @@ bool Thumb2SizeReduce::ReduceMBB(MachineBasicBlock &MBB) { BundleMI = MI; continue; } + if (MI->isDebugValue()) + continue; LiveCPSR = UpdateCPSRUse(*MI, LiveCPSR); // Does NextMII belong to the same bundle as MI? bool NextInSameBundle = NextMII != E && NextMII->isBundledWithPred(); - if (ReduceMI(MBB, MI, LiveCPSR, CPSRDef, IsSelfLoop)) { + if (ReduceMI(MBB, MI, LiveCPSR, IsSelfLoop)) { Modified = true; MachineBasicBlock::instr_iterator I = prior(NextMII); MI = &*I; @@ -944,14 +987,19 @@ bool Thumb2SizeReduce::ReduceMBB(MachineBasicBlock &MBB) { if (MI->isCall()) { // Calls don't really set CPSR. CPSRDef = 0; + HighLatencyCPSR = false; IsSelfLoop = false; } else if (DefCPSR) { // This is the last CPSR defining instruction. CPSRDef = MI; + HighLatencyCPSR = isHighLatencyCPSR(CPSRDef); IsSelfLoop = false; } } + MBBInfo &Info = BlockInfo[MBB.getNumber()]; + Info.HighLatencyCPSR = HighLatencyCPSR; + Info.Visited = true; return Modified; } @@ -967,9 +1015,16 @@ bool Thumb2SizeReduce::runOnMachineFunction(MachineFunction &MF) { MinimizeSize = FnAttrs.hasAttribute(AttributeSet::FunctionIndex, Attribute::MinSize); + BlockInfo.clear(); + BlockInfo.resize(MF.getNumBlockIDs()); + + // Visit blocks in reverse post-order so LastCPSRDef is known for all + // predecessors. + ReversePostOrderTraversal<MachineFunction*> RPOT(&MF); bool Modified = false; - for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I) - Modified |= ReduceMBB(*I); + for (ReversePostOrderTraversal<MachineFunction*>::rpo_iterator + I = RPOT.begin(), E = RPOT.end(); I != E; ++I) + Modified |= ReduceMBB(**I); return Modified; } |