Diffstat (limited to 'lib/Target/AArch64')
45 files changed, 1220 insertions, 822 deletions
diff --git a/lib/Target/AArch64/AArch64.h b/lib/Target/AArch64/AArch64.h index e96d18b..21106c9 100644 --- a/lib/Target/AArch64/AArch64.h +++ b/lib/Target/AArch64/AArch64.h @@ -40,9 +40,6 @@ FunctionPass *createAArch64ConditionOptimizerPass(); FunctionPass *createAArch64AddressTypePromotionPass(); FunctionPass *createAArch64A57FPLoadBalancing(); FunctionPass *createAArch64A53Fix835769(); -/// \brief Creates an ARM-specific Target Transformation Info pass. -ImmutablePass * -createAArch64TargetTransformInfoPass(const AArch64TargetMachine *TM); FunctionPass *createAArch64CleanupLocalDynamicTLSPass(); diff --git a/lib/Target/AArch64/AArch64.td b/lib/Target/AArch64/AArch64.td index e6a27c3..dff48f9 100644 --- a/lib/Target/AArch64/AArch64.td +++ b/lib/Target/AArch64/AArch64.td @@ -91,6 +91,8 @@ def : ProcessorModel<"generic", NoSchedModel, [FeatureFPARMv8, def : ProcessorModel<"cortex-a53", CortexA53Model, [ProcA53]>; def : ProcessorModel<"cortex-a57", CortexA57Model, [ProcA57]>; +// FIXME: Cortex-A72 is currently modelled as a Cortex-A57. +def : ProcessorModel<"cortex-a72", CortexA57Model, [ProcA57]>; def : ProcessorModel<"cyclone", CycloneModel, [ProcCyclone]>; //===----------------------------------------------------------------------===// diff --git a/lib/Target/AArch64/AArch64A53Fix835769.cpp b/lib/Target/AArch64/AArch64A53Fix835769.cpp index 852a635..dd401c6 100644 --- a/lib/Target/AArch64/AArch64A53Fix835769.cpp +++ b/lib/Target/AArch64/AArch64A53Fix835769.cpp @@ -16,8 +16,6 @@ //===----------------------------------------------------------------------===// #include "AArch64.h" -#include "AArch64InstrInfo.h" -#include "AArch64Subtarget.h" #include "llvm/ADT/Statistic.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" @@ -26,6 +24,7 @@ #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" +#include "llvm/Target/TargetInstrInfo.h" using namespace llvm; @@ -79,7 +78,7 @@ static bool isSecondInstructionInSequence(MachineInstr *MI) { namespace { class AArch64A53Fix835769 : public MachineFunctionPass { - const AArch64InstrInfo *TII; + const TargetInstrInfo *TII; public: static char ID; @@ -107,17 +106,13 @@ char AArch64A53Fix835769::ID = 0; bool AArch64A53Fix835769::runOnMachineFunction(MachineFunction &F) { - const TargetMachine &TM = F.getTarget(); - - bool Changed = false; DEBUG(dbgs() << "***** AArch64A53Fix835769 *****\n"); - - TII = TM.getSubtarget<AArch64Subtarget>().getInstrInfo(); + bool Changed = false; + TII = F.getSubtarget().getInstrInfo(); for (auto &MBB : F) { Changed |= runOnBasicBlock(MBB); } - return Changed; } diff --git a/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp b/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp index 2503764..2cf3c22 100644 --- a/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp +++ b/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp @@ -38,8 +38,8 @@ #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/RegisterScavenging.h" #include "llvm/CodeGen/RegisterClassInfo.h" +#include "llvm/CodeGen/RegisterScavenging.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" @@ -96,6 +96,10 @@ static bool isMla(MachineInstr *MI) { } } +namespace llvm { +static void initializeAArch64A57FPLoadBalancingPass(PassRegistry &); +} + //===----------------------------------------------------------------------===//
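A note on the load-balancing hunk above: it adopts LLVM's standard pass-registration idiom, in which the initializer is forward-declared in namespace llvm, called from the pass constructor, and defined by the INITIALIZE_PASS machinery, making the pass visible to -debug-pass and -print-before/-after. A minimal sketch of the pattern; the pass name and flag string here are invented for illustration, not taken from this patch:

#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/PassRegistry.h"
using namespace llvm;

namespace llvm {
void initializeDemoBalancePassPass(PassRegistry &);
}

namespace {
struct DemoBalancePass : public MachineFunctionPass {
  static char ID;
  DemoBalancePass() : MachineFunctionPass(ID) {
    // Self-registration: the INITIALIZE_PASS macro below defines this
    // initializer and hooks the pass into the global PassRegistry.
    initializeDemoBalancePassPass(*PassRegistry::getPassRegistry());
  }
  bool runOnMachineFunction(MachineFunction &) override {
    return false; // sketch only: no transformation
  }
};
}

char DemoBalancePass::ID = 0;
INITIALIZE_PASS(DemoBalancePass, "demo-balance", "Demo Balance Pass", false,
                false)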
namespace { @@ -109,14 +113,15 @@ static const char *ColorNames[2] = { "Even", "Odd" }; class Chain; class AArch64A57FPLoadBalancing : public MachineFunctionPass { - const AArch64InstrInfo *TII; MachineRegisterInfo *MRI; const TargetRegisterInfo *TRI; RegisterClassInfo RCI; public: static char ID; - explicit AArch64A57FPLoadBalancing() : MachineFunctionPass(ID) {} + explicit AArch64A57FPLoadBalancing() : MachineFunctionPass(ID) { + initializeAArch64A57FPLoadBalancingPass(*PassRegistry::getPassRegistry()); + } bool runOnMachineFunction(MachineFunction &F) override; @@ -143,8 +148,16 @@ private: Color getColor(unsigned Register); Chain *getAndEraseNext(Color PreferredColor, std::vector<Chain*> &L); }; +} + char AArch64A57FPLoadBalancing::ID = 0; +INITIALIZE_PASS_BEGIN(AArch64A57FPLoadBalancing, DEBUG_TYPE, + "AArch64 A57 FP Load-Balancing", false, false) +INITIALIZE_PASS_END(AArch64A57FPLoadBalancing, DEBUG_TYPE, + "AArch64 A57 FP Load-Balancing", false, false) + +namespace { /// A Chain is a sequence of instructions that are linked together by /// an accumulation operand. For example: /// @@ -259,7 +272,7 @@ public: } /// Return true if this chain starts before Other. - bool startsBefore(Chain *Other) { + bool startsBefore(const Chain *Other) const { return StartInstIdx < Other->StartInstIdx; } @@ -297,10 +310,8 @@ bool AArch64A57FPLoadBalancing::runOnMachineFunction(MachineFunction &F) { bool Changed = false; DEBUG(dbgs() << "***** AArch64A57FPLoadBalancing *****\n"); - const TargetMachine &TM = F.getTarget(); MRI = &F.getRegInfo(); TRI = F.getRegInfo().getTargetRegisterInfo(); - TII = TM.getSubtarget<AArch64Subtarget>().getInstrInfo(); RCI.runOnMachineFunction(F); for (auto &MBB : F) { @@ -431,10 +442,17 @@ bool AArch64A57FPLoadBalancing::colorChainSet(std::vector<Chain*> GV, // chains that we cannot change before we look at those we can, // so the parity counter is updated and we know what color we should // change them to! + // Final tie-break with instruction order so pass output is stable (i.e. not + // dependent on malloc'd pointer values). std::sort(GV.begin(), GV.end(), [](const Chain *G1, const Chain *G2) { if (G1->size() != G2->size()) return G1->size() > G2->size(); - return G1->requiresFixup() > G2->requiresFixup(); + if (G1->requiresFixup() != G2->requiresFixup()) + return G1->requiresFixup() > G2->requiresFixup(); + // Make sure startsBefore() produces a stable final order. + assert((G1 == G2 || (G1->startsBefore(G2) ^ G2->startsBefore(G1))) && + "Starts before not total order!"); + return G1->startsBefore(G2); }); Color PreferredColor = Parity < 0 ? Color::Even : Color::Odd; @@ -481,10 +499,16 @@ int AArch64A57FPLoadBalancing::scavengeRegister(Chain *G, Color C, RS.forward(I); AvailableRegs &= RS.getRegsAvailable(TRI->getRegClass(RegClassID)); - // Remove any registers clobbered by a regmask. + // Remove any registers clobbered by a regmask or any def register that is + // immediately dead. 
for (auto J : I->operands()) { if (J.isRegMask()) AvailableRegs.clearBitsNotInMask(J.getRegMask()); + + if (J.isReg() && J.isDef() && AvailableRegs[J.getReg()]) { + assert(J.isDead() && "Non-dead def should have been removed by now!"); + AvailableRegs.reset(J.getReg()); + } } } diff --git a/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp b/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp index 5afe0f4..f27dfc9 100644 --- a/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp +++ b/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp @@ -64,7 +64,7 @@ STATISTIC(NumCopiesInserted, "Number of cross-class copies inserted"); namespace { class AArch64AdvSIMDScalar : public MachineFunctionPass { MachineRegisterInfo *MRI; - const AArch64InstrInfo *TII; + const TargetInstrInfo *TII; private: // isProfitableToTransform - Predicate function to determine whether an @@ -268,7 +268,7 @@ AArch64AdvSIMDScalar::isProfitableToTransform(const MachineInstr *MI) const { return TransformAll; } -static MachineInstr *insertCopy(const AArch64InstrInfo *TII, MachineInstr *MI, +static MachineInstr *insertCopy(const TargetInstrInfo *TII, MachineInstr *MI, unsigned Dst, unsigned Src, bool IsKill) { MachineInstrBuilder MIB = BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AArch64::COPY), @@ -376,10 +376,8 @@ bool AArch64AdvSIMDScalar::runOnMachineFunction(MachineFunction &mf) { bool Changed = false; DEBUG(dbgs() << "***** AArch64AdvSIMDScalar *****\n"); - const TargetMachine &TM = mf.getTarget(); MRI = &mf.getRegInfo(); - TII = static_cast<const AArch64InstrInfo *>( - TM.getSubtargetImpl()->getInstrInfo()); + TII = mf.getSubtarget().getInstrInfo(); // Just check things on a one-block-at-a-time basis. for (MachineFunction::iterator I = mf.begin(), E = mf.end(); I != E; ++I) diff --git a/lib/Target/AArch64/AArch64AsmPrinter.cpp b/lib/Target/AArch64/AArch64AsmPrinter.cpp index 8bee4f5..d64d851 100644 --- a/lib/Target/AArch64/AArch64AsmPrinter.cpp +++ b/lib/Target/AArch64/AArch64AsmPrinter.cpp @@ -43,19 +43,13 @@ using namespace llvm; namespace { class AArch64AsmPrinter : public AsmPrinter { - /// Subtarget - Keep a pointer to the AArch64Subtarget around so that we can - /// make the right decision when printing asm code for different targets. - const AArch64Subtarget *Subtarget; - AArch64MCInstLower MCInstLowering; StackMaps SM; public: - AArch64AsmPrinter(TargetMachine &TM, MCStreamer &Streamer) - : AsmPrinter(TM, Streamer), - Subtarget(&TM.getSubtarget<AArch64Subtarget>()), - MCInstLowering(OutContext, *this), SM(*this), AArch64FI(nullptr), - LOHLabelCounter(0) {} + AArch64AsmPrinter(TargetMachine &TM, std::unique_ptr<MCStreamer> Streamer) + : AsmPrinter(TM, std::move(Streamer)), MCInstLowering(OutContext, *this), + SM(*this), AArch64FI(nullptr), LOHLabelCounter(0) {} const char *getPassName() const override { return "AArch64 Assembly Printer"; @@ -124,7 +118,8 @@ private: //===----------------------------------------------------------------------===// void AArch64AsmPrinter::EmitEndOfAsmFile(Module &M) { - if (Subtarget->isTargetMachO()) { + Triple TT(TM.getTargetTriple()); + if (TT.isOSBinFormatMachO()) { // Funny Darwin hack: This flag tells the linker that no global symbols // contain code that falls through to other global symbols (e.g. the obvious // implementation of multiple entry points). If this doesn't occur, the @@ -135,7 +130,7 @@ void AArch64AsmPrinter::EmitEndOfAsmFile(Module &M) { } // Emit a .data.rel section containing any stubs that were created. 
- if (Subtarget->isTargetELF()) { + if (TT.isOSBinFormatELF()) { const TargetLoweringObjectFileELF &TLOFELF = static_cast<const TargetLoweringObjectFileELF &>(getObjFileLowering()); @@ -145,7 +140,7 @@ void AArch64AsmPrinter::EmitEndOfAsmFile(Module &M) { MachineModuleInfoELF::SymbolListTy Stubs = MMIELF.GetGVStubList(); if (!Stubs.empty()) { OutStreamer.SwitchSection(TLOFELF.getDataRelSection()); - const DataLayout *TD = TM.getSubtargetImpl()->getDataLayout(); + const DataLayout *TD = TM.getDataLayout(); for (unsigned i = 0, e = Stubs.size(); i != e; ++i) { OutStreamer.EmitLabel(Stubs[i].first); @@ -252,8 +247,8 @@ bool AArch64AsmPrinter::printAsmRegInClass(const MachineOperand &MO, const TargetRegisterClass *RC, bool isVector, raw_ostream &O) { assert(MO.isReg() && "Should only get here with a register!"); - const AArch64RegisterInfo *RI = static_cast<const AArch64RegisterInfo *>( - TM.getSubtargetImpl()->getRegisterInfo()); + const AArch64RegisterInfo *RI = + MF->getSubtarget<AArch64Subtarget>().getRegisterInfo(); unsigned Reg = MO.getReg(); unsigned RegToPrint = RC->getRegister(RI->getEncodingValue(Reg)); assert(RI->regsOverlap(RegToPrint, Reg)); @@ -381,8 +376,23 @@ void AArch64AsmPrinter::LowerSTACKMAP(MCStreamer &OutStreamer, StackMaps &SM, unsigned NumNOPBytes = MI.getOperand(1).getImm(); SM.recordStackMap(MI); - // Emit padding. assert(NumNOPBytes % 4 == 0 && "Invalid number of NOP bytes requested!"); + + // Scan ahead to trim the shadow. + const MachineBasicBlock &MBB = *MI.getParent(); + MachineBasicBlock::const_iterator MII(MI); + ++MII; + while (NumNOPBytes > 0) { + if (MII == MBB.end() || MII->isCall() || + MII->getOpcode() == AArch64::DBG_VALUE || + MII->getOpcode() == TargetOpcode::PATCHPOINT || + MII->getOpcode() == TargetOpcode::STACKMAP) + break; + ++MII; + NumNOPBytes -= 4; + } + + // Emit nops. for (unsigned i = 0; i < NumNOPBytes; i += 4) EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::HINT).addImm(0)); } diff --git a/lib/Target/AArch64/AArch64BranchRelaxation.cpp b/lib/Target/AArch64/AArch64BranchRelaxation.cpp index e2b6367..d973234 100644 --- a/lib/Target/AArch64/AArch64BranchRelaxation.cpp +++ b/lib/Target/AArch64/AArch64BranchRelaxation.cpp @@ -476,9 +476,7 @@ bool AArch64BranchRelaxation::runOnMachineFunction(MachineFunction &mf) { DEBUG(dbgs() << "***** AArch64BranchRelaxation *****\n"); - TII = (const AArch64InstrInfo *)MF->getTarget() - .getSubtargetImpl() - ->getInstrInfo(); + TII = (const AArch64InstrInfo *)MF->getSubtarget().getInstrInfo(); // Renumber all of the machine basic blocks in the function, guaranteeing that // the numbers agree with the position of the block in the function. diff --git a/lib/Target/AArch64/AArch64CallingConvention.h b/lib/Target/AArch64/AArch64CallingConvention.h new file mode 100644 index 0000000..1e2d1c3 --- /dev/null +++ b/lib/Target/AArch64/AArch64CallingConvention.h @@ -0,0 +1,141 @@ +//=== AArch64CallingConv.h - Custom Calling Convention Routines -*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the custom routines for the AArch64 Calling Convention +// that aren't done by tablegen. 
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64CALLINGCONVENTION_H +#define LLVM_LIB_TARGET_AARCH64_AARCH64CALLINGCONVENTION_H + +#include "AArch64.h" +#include "AArch64InstrInfo.h" +#include "AArch64Subtarget.h" +#include "llvm/CodeGen/CallingConvLower.h" +#include "llvm/IR/CallingConv.h" +#include "llvm/Target/TargetInstrInfo.h" + +namespace { +using namespace llvm; + +static const uint16_t XRegList[] = {AArch64::X0, AArch64::X1, AArch64::X2, + AArch64::X3, AArch64::X4, AArch64::X5, + AArch64::X6, AArch64::X7}; +static const uint16_t HRegList[] = {AArch64::H0, AArch64::H1, AArch64::H2, + AArch64::H3, AArch64::H4, AArch64::H5, + AArch64::H6, AArch64::H7}; +static const uint16_t SRegList[] = {AArch64::S0, AArch64::S1, AArch64::S2, + AArch64::S3, AArch64::S4, AArch64::S5, + AArch64::S6, AArch64::S7}; +static const uint16_t DRegList[] = {AArch64::D0, AArch64::D1, AArch64::D2, + AArch64::D3, AArch64::D4, AArch64::D5, + AArch64::D6, AArch64::D7}; +static const uint16_t QRegList[] = {AArch64::Q0, AArch64::Q1, AArch64::Q2, + AArch64::Q3, AArch64::Q4, AArch64::Q5, + AArch64::Q6, AArch64::Q7}; + +static bool finishStackBlock(SmallVectorImpl<CCValAssign> &PendingMembers, + MVT LocVT, ISD::ArgFlagsTy &ArgFlags, + CCState &State, unsigned SlotAlign) { + unsigned Size = LocVT.getSizeInBits() / 8; + unsigned StackAlign = State.getMachineFunction() + .getTarget() + .getDataLayout() + ->getStackAlignment(); + unsigned Align = std::min(ArgFlags.getOrigAlign(), StackAlign); + + for (auto &It : PendingMembers) { + It.convertToMem(State.AllocateStack(Size, std::max(Align, SlotAlign))); + State.addLoc(It); + SlotAlign = 1; + } + + // All pending members have now been allocated + PendingMembers.clear(); + return true; +} + +/// The Darwin variadic PCS places anonymous arguments in 8-byte stack slots. An +/// [N x Ty] type must still be contiguous in memory though. +static bool CC_AArch64_Custom_Stack_Block( + unsigned &ValNo, MVT &ValVT, MVT &LocVT, CCValAssign::LocInfo &LocInfo, + ISD::ArgFlagsTy &ArgFlags, CCState &State) { + SmallVectorImpl<CCValAssign> &PendingMembers = State.getPendingLocs(); + + // Add the argument to the list to be allocated once we know the size of the + // block. + PendingMembers.push_back( + CCValAssign::getPending(ValNo, ValVT, LocVT, LocInfo)); + + if (!ArgFlags.isInConsecutiveRegsLast()) + return true; + + return finishStackBlock(PendingMembers, LocVT, ArgFlags, State, 8); +} + +/// Given an [N x Ty] block, it should be passed in a consecutive sequence of +/// registers. If no such sequence is available, mark the rest of the registers +/// of that type as used and place the argument on the stack. +static bool CC_AArch64_Custom_Block(unsigned &ValNo, MVT &ValVT, MVT &LocVT, + CCValAssign::LocInfo &LocInfo, + ISD::ArgFlagsTy &ArgFlags, CCState &State) { + // Try to allocate a contiguous block of registers, each of the correct + // size to hold one member. + ArrayRef<uint16_t> RegList; + if (LocVT.SimpleTy == MVT::i64) + RegList = XRegList; + else if (LocVT.SimpleTy == MVT::f16) + RegList = HRegList; + else if (LocVT.SimpleTy == MVT::f32 || LocVT.is32BitVector()) + RegList = SRegList; + else if (LocVT.SimpleTy == MVT::f64 || LocVT.is64BitVector()) + RegList = DRegList; + else if (LocVT.SimpleTy == MVT::f128 || LocVT.is128BitVector()) + RegList = QRegList; + else { + // Not an array we want to split up after all. 
+ return false; + } + + SmallVectorImpl<CCValAssign> &PendingMembers = State.getPendingLocs(); + + // Add the argument to the list to be allocated once we know the size of the + // block. + PendingMembers.push_back( + CCValAssign::getPending(ValNo, ValVT, LocVT, LocInfo)); + + if (!ArgFlags.isInConsecutiveRegsLast()) + return true; + + unsigned RegResult = State.AllocateRegBlock(RegList, PendingMembers.size()); + if (RegResult) { + for (auto &It : PendingMembers) { + It.convertToReg(RegResult); + State.addLoc(It); + ++RegResult; + } + PendingMembers.clear(); + return true; + } + + // Mark all regs in the class as unavailable + for (auto Reg : RegList) + State.AllocateReg(Reg); + + const AArch64Subtarget &Subtarget = static_cast<const AArch64Subtarget &>( + State.getMachineFunction().getSubtarget()); + unsigned SlotAlign = Subtarget.isTargetDarwin() ? 1 : 8; + + return finishStackBlock(PendingMembers, LocVT, ArgFlags, State, SlotAlign); +} + +} + +#endif diff --git a/lib/Target/AArch64/AArch64CallingConvention.td b/lib/Target/AArch64/AArch64CallingConvention.td index 9e707e4..4691e94 100644 --- a/lib/Target/AArch64/AArch64CallingConvention.td +++ b/lib/Target/AArch64/AArch64CallingConvention.td @@ -16,7 +16,7 @@ class CCIfAlign<string Align, CCAction A> : CCIf<!strconcat("ArgFlags.getOrigAlign() == ", Align), A>; /// CCIfBigEndian - Match only if we're in big endian mode. class CCIfBigEndian<CCAction A> : - CCIf<"State.getMachineFunction().getSubtarget().getDataLayout()->isBigEndian()", A>; + CCIf<"State.getMachineFunction().getTarget().getDataLayout()->isBigEndian()", A>; //===----------------------------------------------------------------------===// // ARM AAPCS64 Calling Convention @@ -40,6 +40,8 @@ def CC_AArch64_AAPCS : CallingConv<[ // slot is 64-bit. CCIfByVal<CCPassByVal<8, 8>>, + CCIfConsecutiveRegs<CCCustom<"CC_AArch64_Custom_Block">>, + // Handle i1, i8, i16, i32, i64, f32, f64 and v2f64 by passing in registers, // up to eight each of GPR and FPR. CCIfType<[i1, i8, i16], CCPromoteToType<i32>>, @@ -119,6 +121,8 @@ def CC_AArch64_DarwinPCS : CallingConv<[ // slot is 64-bit. CCIfByVal<CCPassByVal<8, 8>>, + CCIfConsecutiveRegs<CCCustom<"CC_AArch64_Custom_Block">>, + // Handle i1, i8, i16, i32, i64, f32, f64 and v2f64 by passing in registers, // up to eight each of GPR and FPR. CCIfType<[i1, i8, i16], CCPromoteToType<i32>>, @@ -159,6 +163,8 @@ def CC_AArch64_DarwinPCS_VarArg : CallingConv<[ CCIfType<[v2f32], CCBitConvertToType<v2i32>>, CCIfType<[v2f64, v4f32, f128], CCBitConvertToType<v2i64>>, + CCIfConsecutiveRegs<CCCustom<"CC_AArch64_Custom_Stack_Block">>, + // Handle all scalar types as either i64 or f64. CCIfType<[i8, i16, i32], CCPromoteToType<i64>>, CCIfType<[f16, f32], CCPromoteToType<f64>>, @@ -198,6 +204,44 @@ def RetCC_AArch64_WebKit_JS : CallingConv<[ [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>> ]>; +//===----------------------------------------------------------------------===// +// ARM64 Calling Convention for GHC +//===----------------------------------------------------------------------===// + +// This calling convention is specific to the Glasgow Haskell Compiler. +// The only documentation is the GHC source code, specifically the C header +// file: +// +// https://github.com/ghc/ghc/blob/master/includes/stg/MachRegs.h +// +// which defines the registers for the Spineless Tagless G-Machine (STG) that +// GHC uses to implement lazy evaluation. 
The generic STG machine has a set of +// registers which are mapped to appropriate set of architecture specific +// registers for each CPU architecture. +// +// The STG Machine is documented here: +// +// https://ghc.haskell.org/trac/ghc/wiki/Commentary/Compiler/GeneratedCode +// +// The AArch64 register mapping is under the heading "The ARMv8/AArch64 ABI +// register mapping". + +def CC_AArch64_GHC : CallingConv<[ + // Handle all vector types as either f64 or v2f64. + CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>, + CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, f128], CCBitConvertToType<v2f64>>, + + CCIfType<[v2f64], CCAssignToReg<[Q4, Q5]>>, + CCIfType<[f32], CCAssignToReg<[S8, S9, S10, S11]>>, + CCIfType<[f64], CCAssignToReg<[D12, D13, D14, D15]>>, + + // Promote i8/i16/i32 arguments to i64. + CCIfType<[i8, i16, i32], CCPromoteToType<i64>>, + + // Pass in STG registers: Base, Sp, Hp, R1, R2, R3, R4, R5, R6, SpLim + CCIfType<[i64], CCAssignToReg<[X19, X20, X21, X22, X23, X24, X25, X26, X27, X28]>> +]>; + // FIXME: LR is only callee-saved in the sense that *we* preserve it and are // presumably a callee to someone. External functions may not do so, but this // is currently safe since BL has LR as an implicit-def and what happens after a @@ -243,3 +287,4 @@ def CSR_AArch64_AllRegs (sequence "S%u", 0, 31), (sequence "D%u", 0, 31), (sequence "Q%u", 0, 31))>; +def CSR_AArch64_NoRegs : CalleeSavedRegs<(add)>; diff --git a/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp b/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp index aab8e38..3b74481 100644 --- a/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp +++ b/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp @@ -92,9 +92,7 @@ struct LDTLSCleanup : public MachineFunctionPass { MachineInstr *replaceTLSBaseAddrCall(MachineInstr *I, unsigned TLSBaseAddrReg) { MachineFunction *MF = I->getParent()->getParent(); - const AArch64TargetMachine *TM = - static_cast<const AArch64TargetMachine *>(&MF->getTarget()); - const AArch64InstrInfo *TII = TM->getSubtargetImpl()->getInstrInfo(); + const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo(); // Insert a Copy from TLSBaseAddrReg to x0, which is where the rest of the // code sequence assumes the address will be. @@ -112,9 +110,7 @@ struct LDTLSCleanup : public MachineFunctionPass { // inserting a copy instruction after I. Returns the new instruction. MachineInstr *setRegister(MachineInstr *I, unsigned *TLSBaseAddrReg) { MachineFunction *MF = I->getParent()->getParent(); - const AArch64TargetMachine *TM = - static_cast<const AArch64TargetMachine *>(&MF->getTarget()); - const AArch64InstrInfo *TII = TM->getSubtargetImpl()->getInstrInfo(); + const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo(); // Create a virtual register for the TLS base address. 
MachineRegisterInfo &RegInfo = MF->getRegInfo(); diff --git a/lib/Target/AArch64/AArch64CollectLOH.cpp b/lib/Target/AArch64/AArch64CollectLOH.cpp index 87b545b..938dcb3 100644 --- a/lib/Target/AArch64/AArch64CollectLOH.cpp +++ b/lib/Target/AArch64/AArch64CollectLOH.cpp @@ -285,9 +285,7 @@ static void initReachingDef(MachineFunction &MF, BlockToSetOfInstrsPerColor &ReachableUses, const MapRegToId &RegToId, const MachineInstr *DummyOp, bool ADRPMode) { - const TargetMachine &TM = MF.getTarget(); - const TargetRegisterInfo *TRI = TM.getSubtargetImpl()->getRegisterInfo(); - + const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); unsigned NbReg = RegToId.size(); for (MachineBasicBlock &MBB : MF) { @@ -1026,8 +1024,7 @@ static void collectInvolvedReg(MachineFunction &MF, MapRegToId &RegToId, } bool AArch64CollectLOH::runOnMachineFunction(MachineFunction &MF) { - const TargetMachine &TM = MF.getTarget(); - const TargetRegisterInfo *TRI = TM.getSubtargetImpl()->getRegisterInfo(); + const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); const MachineDominatorTree *MDT = &getAnalysis<MachineDominatorTree>(); MapRegToId RegToId; @@ -1043,8 +1040,7 @@ bool AArch64CollectLOH::runOnMachineFunction(MachineFunction &MF) { MachineInstr *DummyOp = nullptr; if (BasicBlockScopeOnly) { - const AArch64InstrInfo *TII = static_cast<const AArch64InstrInfo *>( - TM.getSubtargetImpl()->getInstrInfo()); + const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo(); // For local analysis, create a dummy operation to record uses that are not // local. DummyOp = MF.CreateMachineInstr(TII->get(AArch64::COPY), DebugLoc()); diff --git a/lib/Target/AArch64/AArch64ConditionOptimizer.cpp b/lib/Target/AArch64/AArch64ConditionOptimizer.cpp index 0fbd3c6..e68571f 100644 --- a/lib/Target/AArch64/AArch64ConditionOptimizer.cpp +++ b/lib/Target/AArch64/AArch64ConditionOptimizer.cpp @@ -304,7 +304,7 @@ bool AArch64ConditionOptimizer::adjustTo(MachineInstr *CmpMI, bool AArch64ConditionOptimizer::runOnMachineFunction(MachineFunction &MF) { DEBUG(dbgs() << "********** AArch64 Conditional Compares **********\n" << "********** Function: " << MF.getName() << '\n'); - TII = MF.getTarget().getSubtargetImpl()->getInstrInfo(); + TII = MF.getSubtarget().getInstrInfo(); DomTree = &getAnalysis<MachineDominatorTree>(); bool Changed = false; diff --git a/lib/Target/AArch64/AArch64ConditionalCompares.cpp b/lib/Target/AArch64/AArch64ConditionalCompares.cpp index 54f53dc..fccd8df 100644 --- a/lib/Target/AArch64/AArch64ConditionalCompares.cpp +++ b/lib/Target/AArch64/AArch64ConditionalCompares.cpp @@ -893,15 +893,13 @@ bool AArch64ConditionalCompares::runOnMachineFunction(MachineFunction &MF) { << "********** Function: " << MF.getName() << '\n'); TII = MF.getSubtarget().getInstrInfo(); TRI = MF.getSubtarget().getRegisterInfo(); - SchedModel = - MF.getTarget().getSubtarget<TargetSubtargetInfo>().getSchedModel(); + SchedModel = MF.getSubtarget().getSchedModel(); MRI = &MF.getRegInfo(); DomTree = &getAnalysis<MachineDominatorTree>(); Loops = getAnalysisIfAvailable<MachineLoopInfo>(); Traces = &getAnalysis<MachineTraceMetrics>(); MinInstr = nullptr; - MinSize = MF.getFunction()->getAttributes().hasAttribute( - AttributeSet::FunctionIndex, Attribute::MinSize); + MinSize = MF.getFunction()->hasFnAttribute(Attribute::MinSize); bool Changed = false; CmpConv.runOnMachineFunction(MF); diff --git a/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp index c850680..41b1132 100644 --- 
a/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp +++ b/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp @@ -229,7 +229,7 @@ static bool isStartChunk(uint64_t Chunk) { if (Chunk == 0 || Chunk == UINT64_MAX) return false; - return (CountLeadingOnes_64(Chunk) + countTrailingZeros(Chunk)) == 64; + return isMask_64(~Chunk); } /// \brief Check whether this chunk matches the pattern '0...1...' This pattern @@ -239,7 +239,7 @@ static bool isEndChunk(uint64_t Chunk) { if (Chunk == 0 || Chunk == UINT64_MAX) return false; - return (countLeadingZeros(Chunk) + CountTrailingOnes_64(Chunk)) == 64; + return isMask_64(Chunk); } /// \brief Clear or set all bits in the chunk at the given index. diff --git a/lib/Target/AArch64/AArch64FastISel.cpp b/lib/Target/AArch64/AArch64FastISel.cpp index 612cb00..61017c1 100644 --- a/lib/Target/AArch64/AArch64FastISel.cpp +++ b/lib/Target/AArch64/AArch64FastISel.cpp @@ -14,6 +14,7 @@ //===----------------------------------------------------------------------===// #include "AArch64.h" +#include "AArch64CallingConvention.h" #include "AArch64Subtarget.h" #include "AArch64TargetMachine.h" #include "MCTargetDesc/AArch64AddressingModes.h" @@ -244,9 +245,10 @@ public: unsigned fastMaterializeFloatZero(const ConstantFP* CF) override; explicit AArch64FastISel(FunctionLoweringInfo &FuncInfo, - const TargetLibraryInfo *LibInfo) + const TargetLibraryInfo *LibInfo) : FastISel(FuncInfo, LibInfo, /*SkipTargetIndependentISel=*/true) { - Subtarget = &TM.getSubtarget<AArch64Subtarget>(); + Subtarget = + &static_cast<const AArch64Subtarget &>(FuncInfo.MF->getSubtarget()); Context = &FuncInfo.Fn->getContext(); } @@ -301,6 +303,8 @@ static unsigned getImplicitScaleFactor(MVT VT) { CCAssignFn *AArch64FastISel::CCAssignFnForCall(CallingConv::ID CC) const { if (CC == CallingConv::WebKit_JS) return CC_AArch64_WebKit_JS; + if (CC == CallingConv::GHC) + return CC_AArch64_GHC; return Subtarget->isTargetDarwin() ? CC_AArch64_DarwinPCS : CC_AArch64_AAPCS; } @@ -366,6 +370,24 @@ unsigned AArch64FastISel::materializeFP(const ConstantFP *CFP, MVT VT) { return fastEmitInst_i(Opc, TLI.getRegClassFor(VT), Imm); } + // For the MachO large code model materialize the FP constant in code. + if (Subtarget->isTargetMachO() && TM.getCodeModel() == CodeModel::Large) { + unsigned Opc1 = Is64Bit ? AArch64::MOVi64imm : AArch64::MOVi32imm; + const TargetRegisterClass *RC = Is64Bit ? + &AArch64::GPR64RegClass : &AArch64::GPR32RegClass; + + unsigned TmpReg = createResultReg(RC); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc1), TmpReg) + .addImm(CFP->getValueAPF().bitcastToAPInt().getZExtValue()); + + unsigned ResultReg = createResultReg(TLI.getRegClassFor(VT)); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::COPY), ResultReg) + .addReg(TmpReg, getKillRegState(true)); + + return ResultReg; + } + // Materialize via constant pool. MachineConstantPool wants an explicit // alignment. 
unsigned Align = DL.getPrefTypeAlignment(CFP->getType()); @@ -752,7 +774,7 @@ bool AArch64FastISel::computeAddress(const Value *Obj, Address &Addr, Type *Ty) if (Addr.getOffsetReg()) break; - if (DL.getTypeSizeInBits(Ty) != 8) + if (!Ty || DL.getTypeSizeInBits(Ty) != 8) break; const Value *LHS = U->getOperand(0); @@ -2112,15 +2134,15 @@ bool AArch64FastISel::emitCompareAndBranch(const BranchInst *BI) { int TestBit = -1; bool IsCmpNE; - if ((Predicate == CmpInst::ICMP_EQ) || (Predicate == CmpInst::ICMP_NE)) { - if (const auto *C = dyn_cast<Constant>(LHS)) - if (C->isNullValue()) - std::swap(LHS, RHS); - - if (!isa<Constant>(RHS)) - return false; + switch (Predicate) { + default: + return false; + case CmpInst::ICMP_EQ: + case CmpInst::ICMP_NE: + if (isa<Constant>(LHS) && cast<Constant>(LHS)->isNullValue()) + std::swap(LHS, RHS); - if (!cast<Constant>(RHS)->isNullValue()) + if (!isa<Constant>(RHS) || !cast<Constant>(RHS)->isNullValue()) return false; if (const auto *AI = dyn_cast<BinaryOperator>(LHS)) @@ -2143,26 +2165,27 @@ bool AArch64FastISel::emitCompareAndBranch(const BranchInst *BI) { TestBit = 0; IsCmpNE = Predicate == CmpInst::ICMP_NE; - } else if (Predicate == CmpInst::ICMP_SLT) { - if (!isa<Constant>(RHS)) - return false; - - if (!cast<Constant>(RHS)->isNullValue()) + break; + case CmpInst::ICMP_SLT: + case CmpInst::ICMP_SGE: + if (!isa<Constant>(RHS) || !cast<Constant>(RHS)->isNullValue()) return false; TestBit = BW - 1; - IsCmpNE = true; - } else if (Predicate == CmpInst::ICMP_SGT) { + IsCmpNE = Predicate == CmpInst::ICMP_SLT; + break; + case CmpInst::ICMP_SGT: + case CmpInst::ICMP_SLE: if (!isa<ConstantInt>(RHS)) return false; - if (cast<ConstantInt>(RHS)->getValue() != -1) + if (cast<ConstantInt>(RHS)->getValue() != APInt(BW, -1, true)) return false; TestBit = BW - 1; - IsCmpNE = false; - } else - return false; + IsCmpNE = Predicate == CmpInst::ICMP_SLE; + break; + } // end switch static const unsigned OpcTable[2][2][2] = { { {AArch64::CBZW, AArch64::CBZX }, @@ -3302,8 +3325,7 @@ bool AArch64FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) { MFI->setFrameAddressIsTaken(true); const AArch64RegisterInfo *RegInfo = - static_cast<const AArch64RegisterInfo *>( - TM.getSubtargetImpl()->getRegisterInfo()); + static_cast<const AArch64RegisterInfo *>(Subtarget->getRegisterInfo()); unsigned FramePtr = RegInfo->getFrameRegister(*(FuncInfo.MF)); unsigned SrcReg = MRI.createVirtualRegister(&AArch64::GPR64RegClass); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, diff --git a/lib/Target/AArch64/AArch64FrameLowering.cpp b/lib/Target/AArch64/AArch64FrameLowering.cpp index a7779d6..84bf317 100644 --- a/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -64,8 +64,7 @@ bool AArch64FrameLowering::canUseRedZone(const MachineFunction &MF) const { return false; // Don't use the red zone if the function explicitly asks us not to. // This is typically used for kernel code. - if (MF.getFunction()->getAttributes().hasAttribute( - AttributeSet::FunctionIndex, Attribute::NoRedZone)) + if (MF.getFunction()->hasFnAttribute(Attribute::NoRedZone)) return false; const MachineFrameInfo *MFI = MF.getFrameInfo(); @@ -167,7 +166,7 @@ void AArch64FrameLowering::emitCalleeSavedFrameMoves( if (CSI.empty()) return; - const DataLayout *TD = MF.getSubtarget().getDataLayout(); + const DataLayout *TD = MF.getTarget().getDataLayout(); bool HasFP = hasFP(MF); // Calculate amount of bytes used for return address storing. 
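A note on the emitCompareAndBranch rewrite in AArch64FastISel.cpp above: all four signed predicates reduce to a single test of the sign bit (bit BW - 1), with ICMP_SLT/ICMP_SGE requiring a zero RHS and ICMP_SGT/ICMP_SLE requiring a RHS of -1; only the branch polarity (TBNZ vs. TBZ) differs. A self-contained sketch of that mapping (the helper name is illustrative, not part of the patch):

#include "llvm/IR/InstrTypes.h"
using llvm::CmpInst;

// x <s 0 and x <=s -1 branch when the sign bit is set (TBNZ);
// x >=s 0 and x >s -1 branch when it is clear (TBZ).
static bool selectsSignBitTest(CmpInst::Predicate P, unsigned BW,
                               int &TestBit, bool &IsCmpNE) {
  switch (P) {
  case CmpInst::ICMP_SLT: // requires RHS == 0
  case CmpInst::ICMP_SGE: // requires RHS == 0
    TestBit = BW - 1;
    IsCmpNE = (P == CmpInst::ICMP_SLT);
    return true;
  case CmpInst::ICMP_SGT: // requires RHS == -1
  case CmpInst::ICMP_SLE: // requires RHS == -1
    TestBit = BW - 1;
    IsCmpNE = (P == CmpInst::ICMP_SLE);
    return true;
  default:
    return false;
  }
}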
@@ -196,7 +195,8 @@ void AArch64FrameLowering::emitCalleeSavedFrameMoves( unsigned CFIIndex = MMI.addFrameInst(MCCFIInstruction::createOffset( nullptr, DwarfReg, Offset - TotalSkipped)); BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) - .addCFIIndex(CFIIndex); + .addCFIIndex(CFIIndex) + .setMIFlags(MachineInstr::FrameSetup); } } @@ -214,6 +214,11 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF) const { bool HasFP = hasFP(MF); DebugLoc DL = MBB.findDebugLoc(MBBI); + // All calls are tail calls in GHC calling conv, and functions have no + // prologue/epilogue. + if (MF.getFunction()->getCallingConv() == CallingConv::GHC) + return; + int NumBytes = (int)MFI->getStackSize(); if (!AFI->hasStackFrame()) { assert(!HasFP && "unexpected function without stack frame but with FP"); @@ -234,7 +239,8 @@ unsigned CFIIndex = MMI.addFrameInst( MCCFIInstruction::createDefCfaOffset(FrameLabel, -NumBytes)); BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) - .addCFIIndex(CFIIndex); + .addCFIIndex(CFIIndex) + .setMIFlags(MachineInstr::FrameSetup); } else if (NumBytes) { ++NumRedZoneFunctions; } @@ -301,7 +307,7 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF) const { TII->copyPhysReg(MBB, MBBI, DL, AArch64::X19, AArch64::SP, false); if (needsFrameMoves) { - const DataLayout *TD = MF.getSubtarget().getDataLayout(); + const DataLayout *TD = MF.getTarget().getDataLayout(); const int StackGrowth = -TD->getPointerSize(0); unsigned FramePtr = RegInfo->getFrameRegister(MF); @@ -377,26 +383,30 @@ unsigned CFIIndex = MMI.addFrameInst( MCCFIInstruction::createDefCfa(nullptr, Reg, 2 * StackGrowth)); BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) - .addCFIIndex(CFIIndex); + .addCFIIndex(CFIIndex) + .setMIFlags(MachineInstr::FrameSetup); // Record the location of the stored LR unsigned LR = RegInfo->getDwarfRegNum(AArch64::LR, true); CFIIndex = MMI.addFrameInst( MCCFIInstruction::createOffset(nullptr, LR, StackGrowth)); BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) - .addCFIIndex(CFIIndex); + .addCFIIndex(CFIIndex) + .setMIFlags(MachineInstr::FrameSetup); // Record the location of the stored FP CFIIndex = MMI.addFrameInst( MCCFIInstruction::createOffset(nullptr, Reg, 2 * StackGrowth)); BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) - .addCFIIndex(CFIIndex); + .addCFIIndex(CFIIndex) + .setMIFlags(MachineInstr::FrameSetup); } else { // Encode the stack size of the leaf function. unsigned CFIIndex = MMI.addFrameInst( MCCFIInstruction::createDefCfaOffset(nullptr, -MFI->getStackSize())); BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) - .addCFIIndex(CFIIndex); + .addCFIIndex(CFIIndex) + .setMIFlags(MachineInstr::FrameSetup); } // Now emit the moves for whatever callee saved regs we have. @@ -445,6 +455,11 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF, int NumBytes = MFI->getStackSize(); const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); + // All calls are tail calls in GHC calling conv, and functions have no + // prologue/epilogue. + if (MF.getFunction()->getCallingConv() == CallingConv::GHC) + return; + // Initial and residual are named for consistency with the prologue. Note that // in the epilogue, the residual adjustment is executed first.
uint64_t ArgumentPopSize = 0; diff --git a/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp index 87a6d80..ac11c4d 100644 --- a/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp +++ b/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp @@ -53,12 +53,10 @@ public: } bool runOnMachineFunction(MachineFunction &MF) override { - AttributeSet FnAttrs = MF.getFunction()->getAttributes(); ForCodeSize = - FnAttrs.hasAttribute(AttributeSet::FunctionIndex, - Attribute::OptimizeForSize) || - FnAttrs.hasAttribute(AttributeSet::FunctionIndex, Attribute::MinSize); - Subtarget = &TM.getSubtarget<AArch64Subtarget>(); + MF.getFunction()->hasFnAttribute(Attribute::OptimizeForSize) || + MF.getFunction()->hasFnAttribute(Attribute::MinSize); + Subtarget = &MF.getSubtarget<AArch64Subtarget>(); return SelectionDAGISel::runOnMachineFunction(MF); } @@ -134,8 +132,8 @@ public: /// Generic helper for the createDTuple/createQTuple /// functions. Those should almost always be called instead. - SDValue createTuple(ArrayRef<SDValue> Vecs, unsigned RegClassIDs[], - unsigned SubRegs[]); + SDValue createTuple(ArrayRef<SDValue> Vecs, const unsigned RegClassIDs[], + const unsigned SubRegs[]); SDNode *SelectTable(SDNode *N, unsigned NumVecs, unsigned Opc, bool isExt); @@ -569,6 +567,27 @@ bool AArch64DAGToDAGISel::SelectArithExtendedRegister(SDValue N, SDValue &Reg, return isWorthFolding(N); } +/// If there's a use of this ADDlow that's not itself a load/store then we'll +/// need to create a real ADD instruction from it anyway and there's no point in +/// folding it into the mem op. Theoretically, it shouldn't matter, but there's +/// a single pseudo-instruction for an ADRP/ADD pair, so over-aggressive folding +/// leads to duplicated ADRP instructions. +static bool isWorthFoldingADDlow(SDValue N) { + for (auto Use : N->uses()) { + if (Use->getOpcode() != ISD::LOAD && Use->getOpcode() != ISD::STORE && + Use->getOpcode() != ISD::ATOMIC_LOAD && + Use->getOpcode() != ISD::ATOMIC_STORE) + return false; + + // ldar and stlr have much more restrictive addressing modes (just a + // register). + if (cast<MemSDNode>(Use)->getOrdering() > Monotonic) + return false; + } + + return true; +} + /// SelectAddrModeIndexed - Select a "register plus scaled unsigned 12-bit /// immediate" address. The "Size" argument is the size in bytes of the memory /// reference, which determines the scale.
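For context on SelectAddrModeIndexed: the scaled unsigned 12-bit immediate encodes the byte offset divided by the access size, so for example an 8-byte load reaches offsets 0 to 32760 in steps of 8 (4095 * 8, matching the removed getMaximalGlobalOffset comment). A tiny standalone checker for that encoding rule (a sketch, not the selector's actual code):

#include <cstdint>

// True if ByteOffset is encodable as "base register + scaled uimm12" for an
// access of Size bytes (1, 2, 4, 8 or 16 on AArch64): the offset must be a
// non-negative multiple of the access size, and the scaled value must fit
// in the 12-bit unsigned immediate field.
static bool isScaledUImm12(int64_t ByteOffset, unsigned Size) {
  if (ByteOffset < 0 || ByteOffset % Size != 0)
    return false;
  return (ByteOffset / Size) < (1 << 12);
}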
@@ -582,7 +601,7 @@ bool AArch64DAGToDAGISel::SelectAddrModeIndexed(SDValue N, unsigned Size, return true; } - if (N.getOpcode() == AArch64ISD::ADDlow) { + if (N.getOpcode() == AArch64ISD::ADDlow && isWorthFoldingADDlow(N)) { GlobalAddressSDNode *GAN = dyn_cast<GlobalAddressSDNode>(N.getOperand(1).getNode()); Base = N.getOperand(0); @@ -594,7 +613,7 @@ bool AArch64DAGToDAGISel::SelectAddrModeIndexed(SDValue N, unsigned Size, unsigned Alignment = GV->getAlignment(); const DataLayout *DL = TLI->getDataLayout(); Type *Ty = GV->getType()->getElementType(); - if (Alignment == 0 && Ty->isSized() && !Subtarget->isTargetDarwin()) + if (Alignment == 0 && Ty->isSized()) Alignment = DL->getABITypeAlignment(Ty); if (Alignment >= Size) @@ -869,26 +888,26 @@ bool AArch64DAGToDAGISel::SelectAddrModeXRO(SDValue N, unsigned Size, } SDValue AArch64DAGToDAGISel::createDTuple(ArrayRef<SDValue> Regs) { - static unsigned RegClassIDs[] = { + static const unsigned RegClassIDs[] = { AArch64::DDRegClassID, AArch64::DDDRegClassID, AArch64::DDDDRegClassID}; - static unsigned SubRegs[] = { AArch64::dsub0, AArch64::dsub1, - AArch64::dsub2, AArch64::dsub3 }; + static const unsigned SubRegs[] = {AArch64::dsub0, AArch64::dsub1, + AArch64::dsub2, AArch64::dsub3}; return createTuple(Regs, RegClassIDs, SubRegs); } SDValue AArch64DAGToDAGISel::createQTuple(ArrayRef<SDValue> Regs) { - static unsigned RegClassIDs[] = { + static const unsigned RegClassIDs[] = { AArch64::QQRegClassID, AArch64::QQQRegClassID, AArch64::QQQQRegClassID}; - static unsigned SubRegs[] = { AArch64::qsub0, AArch64::qsub1, - AArch64::qsub2, AArch64::qsub3 }; + static const unsigned SubRegs[] = {AArch64::qsub0, AArch64::qsub1, + AArch64::qsub2, AArch64::qsub3}; return createTuple(Regs, RegClassIDs, SubRegs); } SDValue AArch64DAGToDAGISel::createTuple(ArrayRef<SDValue> Regs, - unsigned RegClassIDs[], - unsigned SubRegs[]) { + const unsigned RegClassIDs[], + const unsigned SubRegs[]) { // There's no special register-class for a vector-list of 1 element: it's just // a vector. if (Regs.size() == 1) @@ -1033,13 +1052,10 @@ SDNode *AArch64DAGToDAGISel::SelectLoad(SDNode *N, unsigned NumVecs, EVT VT = N->getValueType(0); SDValue Chain = N->getOperand(0); - SmallVector<SDValue, 6> Ops; - Ops.push_back(N->getOperand(2)); // Mem operand; - Ops.push_back(Chain); + SDValue Ops[] = {N->getOperand(2), // Mem operand; + Chain}; - std::vector<EVT> ResTys; - ResTys.push_back(MVT::Untyped); - ResTys.push_back(MVT::Other); + EVT ResTys[] = {MVT::Untyped, MVT::Other}; SDNode *Ld = CurDAG->getMachineNode(Opc, dl, ResTys, Ops); SDValue SuperReg = SDValue(Ld, 0); @@ -1057,15 +1073,12 @@ SDNode *AArch64DAGToDAGISel::SelectPostLoad(SDNode *N, unsigned NumVecs, EVT VT = N->getValueType(0); SDValue Chain = N->getOperand(0); - SmallVector<SDValue, 6> Ops; - Ops.push_back(N->getOperand(1)); // Mem operand - Ops.push_back(N->getOperand(2)); // Incremental - Ops.push_back(Chain); + SDValue Ops[] = {N->getOperand(1), // Mem operand + N->getOperand(2), // Incremental + Chain}; - std::vector<EVT> ResTys; - ResTys.push_back(MVT::i64); // Type of the write back register - ResTys.push_back(MVT::Untyped); - ResTys.push_back(MVT::Other); + EVT ResTys[] = {MVT::i64, // Type of the write back register + MVT::Untyped, MVT::Other}; SDNode *Ld = CurDAG->getMachineNode(Opc, dl, ResTys, Ops); @@ -1096,10 +1109,7 @@ SDNode *AArch64DAGToDAGISel::SelectStore(SDNode *N, unsigned NumVecs, SmallVector<SDValue, 4> Regs(N->op_begin() + 2, N->op_begin() + 2 + NumVecs); SDValue RegSeq = Is128Bit ? 
createQTuple(Regs) : createDTuple(Regs); - SmallVector<SDValue, 6> Ops; - Ops.push_back(RegSeq); - Ops.push_back(N->getOperand(NumVecs + 2)); - Ops.push_back(N->getOperand(0)); + SDValue Ops[] = {RegSeq, N->getOperand(NumVecs + 2), N->getOperand(0)}; SDNode *St = CurDAG->getMachineNode(Opc, dl, N->getValueType(0), Ops); return St; @@ -1109,20 +1119,18 @@ SDNode *AArch64DAGToDAGISel::SelectPostStore(SDNode *N, unsigned NumVecs, unsigned Opc) { SDLoc dl(N); EVT VT = N->getOperand(2)->getValueType(0); - SmallVector<EVT, 2> ResTys; - ResTys.push_back(MVT::i64); // Type of the write back register - ResTys.push_back(MVT::Other); // Type for the Chain + EVT ResTys[] = {MVT::i64, // Type of the write back register + MVT::Other}; // Type for the Chain // Form a REG_SEQUENCE to force register allocation. bool Is128Bit = VT.getSizeInBits() == 128; SmallVector<SDValue, 4> Regs(N->op_begin() + 1, N->op_begin() + 1 + NumVecs); SDValue RegSeq = Is128Bit ? createQTuple(Regs) : createDTuple(Regs); - SmallVector<SDValue, 6> Ops; - Ops.push_back(RegSeq); - Ops.push_back(N->getOperand(NumVecs + 1)); // base register - Ops.push_back(N->getOperand(NumVecs + 2)); // Incremental - Ops.push_back(N->getOperand(0)); // Chain + SDValue Ops[] = {RegSeq, + N->getOperand(NumVecs + 1), // base register + N->getOperand(NumVecs + 2), // Incremental + N->getOperand(0)}; // Chain SDNode *St = CurDAG->getMachineNode(Opc, dl, ResTys, Ops); return St; @@ -1176,18 +1184,13 @@ SDNode *AArch64DAGToDAGISel::SelectLoadLane(SDNode *N, unsigned NumVecs, SDValue RegSeq = createQTuple(Regs); - std::vector<EVT> ResTys; - ResTys.push_back(MVT::Untyped); - ResTys.push_back(MVT::Other); + EVT ResTys[] = {MVT::Untyped, MVT::Other}; unsigned LaneNo = cast<ConstantSDNode>(N->getOperand(NumVecs + 2))->getZExtValue(); - SmallVector<SDValue, 6> Ops; - Ops.push_back(RegSeq); - Ops.push_back(CurDAG->getTargetConstant(LaneNo, MVT::i64)); - Ops.push_back(N->getOperand(NumVecs + 3)); - Ops.push_back(N->getOperand(0)); + SDValue Ops[] = {RegSeq, CurDAG->getTargetConstant(LaneNo, MVT::i64), + N->getOperand(NumVecs + 3), N->getOperand(0)}; SDNode *Ld = CurDAG->getMachineNode(Opc, dl, ResTys, Ops); SDValue SuperReg = SDValue(Ld, 0); @@ -1221,20 +1224,17 @@ SDNode *AArch64DAGToDAGISel::SelectPostLoadLane(SDNode *N, unsigned NumVecs, SDValue RegSeq = createQTuple(Regs); - std::vector<EVT> ResTys; - ResTys.push_back(MVT::i64); // Type of the write back register - ResTys.push_back(MVT::Untyped); - ResTys.push_back(MVT::Other); + EVT ResTys[] = {MVT::i64, // Type of the write back register + MVT::Untyped, MVT::Other}; unsigned LaneNo = cast<ConstantSDNode>(N->getOperand(NumVecs + 1))->getZExtValue(); - SmallVector<SDValue, 6> Ops; - Ops.push_back(RegSeq); - Ops.push_back(CurDAG->getTargetConstant(LaneNo, MVT::i64)); // Lane Number - Ops.push_back(N->getOperand(NumVecs + 2)); // Base register - Ops.push_back(N->getOperand(NumVecs + 3)); // Incremental - Ops.push_back(N->getOperand(0)); + SDValue Ops[] = {RegSeq, + CurDAG->getTargetConstant(LaneNo, MVT::i64), // Lane Number + N->getOperand(NumVecs + 2), // Base register + N->getOperand(NumVecs + 3), // Incremental + N->getOperand(0)}; SDNode *Ld = CurDAG->getMachineNode(Opc, dl, ResTys, Ops); // Update uses of the write back register @@ -1282,11 +1282,8 @@ SDNode *AArch64DAGToDAGISel::SelectStoreLane(SDNode *N, unsigned NumVecs, unsigned LaneNo = cast<ConstantSDNode>(N->getOperand(NumVecs + 2))->getZExtValue(); - SmallVector<SDValue, 6> Ops; - Ops.push_back(RegSeq); - 
Ops.push_back(CurDAG->getTargetConstant(LaneNo, MVT::i64)); - Ops.push_back(N->getOperand(NumVecs + 3)); - Ops.push_back(N->getOperand(0)); + SDValue Ops[] = {RegSeq, CurDAG->getTargetConstant(LaneNo, MVT::i64), + N->getOperand(NumVecs + 3), N->getOperand(0)}; SDNode *St = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops); // Transfer memoperands. @@ -1312,19 +1309,16 @@ SDNode *AArch64DAGToDAGISel::SelectPostStoreLane(SDNode *N, unsigned NumVecs, SDValue RegSeq = createQTuple(Regs); - SmallVector<EVT, 2> ResTys; - ResTys.push_back(MVT::i64); // Type of the write back register - ResTys.push_back(MVT::Other); + EVT ResTys[] = {MVT::i64, // Type of the write back register + MVT::Other}; unsigned LaneNo = cast<ConstantSDNode>(N->getOperand(NumVecs + 1))->getZExtValue(); - SmallVector<SDValue, 6> Ops; - Ops.push_back(RegSeq); - Ops.push_back(CurDAG->getTargetConstant(LaneNo, MVT::i64)); - Ops.push_back(N->getOperand(NumVecs + 2)); // Base Register - Ops.push_back(N->getOperand(NumVecs + 3)); // Incremental - Ops.push_back(N->getOperand(0)); + SDValue Ops[] = {RegSeq, CurDAG->getTargetConstant(LaneNo, MVT::i64), + N->getOperand(NumVecs + 2), // Base Register + N->getOperand(NumVecs + 3), // Incremental + N->getOperand(0)}; SDNode *St = CurDAG->getMachineNode(Opc, dl, ResTys, Ops); // Transfer memoperands. @@ -1403,12 +1397,17 @@ static bool isBitfieldExtractOpFromAnd(SelectionDAG *CurDAG, SDNode *N, } else return false; - assert((BiggerPattern || (Srl_imm > 0 && Srl_imm < VT.getSizeInBits())) && - "bad amount in shift node!"); + // Bail out on large immediates. This happens when no proper + // combining/constant folding was performed. + if (!BiggerPattern && (Srl_imm <= 0 || Srl_imm >= VT.getSizeInBits())) { + DEBUG((dbgs() << N + << ": Found large shift immediate, this should not happen\n")); + return false; + } LSB = Srl_imm; - MSB = Srl_imm + (VT == MVT::i32 ? CountTrailingOnes_32(And_imm) - : CountTrailingOnes_64(And_imm)) - + MSB = Srl_imm + (VT == MVT::i32 ? countTrailingOnes<uint32_t>(And_imm) + : countTrailingOnes<uint64_t>(And_imm)) - 1; if (ClampMSB) // Since we're moving the extend before the right shift operation, we need @@ -1452,7 +1451,7 @@ static bool isSeveralBitsExtractOpFromShr(SDNode *N, unsigned &Opc, return false; // Check whether we really have several bits extract here. - unsigned BitWide = 64 - CountLeadingOnes_64(~(And_mask >> Srl_imm)); + unsigned BitWide = 64 - countLeadingOnes(~(And_mask >> Srl_imm)); if (BitWide && isMask_64(And_mask >> Srl_imm)) { if (N->getValueType(0) == MVT::i32) Opc = AArch64::UBFMWri; @@ -1508,7 +1507,14 @@ static bool isBitfieldExtractOpFromShr(SDNode *N, unsigned &Opc, SDValue &Opd0, } else return false; - assert(Shl_imm < VT.getSizeInBits() && "bad amount in shift node!"); + // Missing combines/constant folding may have left us with strange + // constants. 
+ if (Shl_imm >= VT.getSizeInBits()) { + DEBUG((dbgs() << N + << ": Found large shift immediate, this should not happen\n")); + return false; + } + uint64_t Srl_imm = 0; if (!isIntImmediate(N->getOperand(1), Srl_imm)) return false; @@ -1851,7 +1857,7 @@ static bool isBitfieldPositioningOp(SelectionDAG *CurDAG, SDValue Op, return false; ShiftAmount = countTrailingZeros(NonZeroBits); - MaskWidth = CountTrailingOnes_64(NonZeroBits >> ShiftAmount); + MaskWidth = countTrailingOnes(NonZeroBits >> ShiftAmount); // BFI encompasses sufficiently many nodes that it's worth inserting an extra // LSL/LSR if the mask in NonZeroBits doesn't quite match up with the ISD::SHL @@ -2229,11 +2235,7 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) { SDValue MemAddr = Node->getOperand(4); // Place arguments in the right order. - SmallVector<SDValue, 7> Ops; - Ops.push_back(ValLo); - Ops.push_back(ValHi); - Ops.push_back(MemAddr); - Ops.push_back(Chain); + SDValue Ops[] = {ValLo, ValHi, MemAddr, Chain}; SDNode *St = CurDAG->getMachineNode(Op, DL, MVT::i32, MVT::Other, Ops); // Transfer memoperands. diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp index 7c94d83..a1b324e 100644 --- a/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -12,6 +12,7 @@ //===----------------------------------------------------------------------===// #include "AArch64ISelLowering.h" +#include "AArch64CallingConvention.h" #include "AArch64MachineFunctionInfo.h" #include "AArch64PerfectShuffle.h" #include "AArch64Subtarget.h" @@ -66,10 +67,9 @@ EnableAArch64SlrGeneration("aarch64-shift-insert-generation", cl::Hidden, cl::desc("Allow AArch64 SLI/SRI formation"), cl::init(false)); - -AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM) - : TargetLowering(TM) { - Subtarget = &TM.getSubtarget<AArch64Subtarget>(); +AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, + const AArch64Subtarget &STI) + : TargetLowering(TM), Subtarget(&STI) { // AArch64 doesn't have comparisons which set GPRs or setcc instructions, so // we have to make something up. Arbitrarily, choose ZeroOrOne. @@ -111,7 +111,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM) } // Compute derived properties from the register classes - computeRegisterProperties(); + computeRegisterProperties(Subtarget->getRegisterInfo()); // Provide all sorts of operation actions setOperationAction(ISD::GlobalAddress, MVT::i64, Custom); @@ -386,13 +386,24 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM) setOperationAction(ISD::FSINCOS, MVT::f32, Expand); } + // Make floating-point constants legal for the large code model, so they don't + // become loads from the constant pool. + if (Subtarget->isTargetMachO() && TM.getCodeModel() == CodeModel::Large) { + setOperationAction(ISD::ConstantFP, MVT::f32, Legal); + setOperationAction(ISD::ConstantFP, MVT::f64, Legal); + } + // AArch64 does not have floating-point extending loads, i1 sign-extending // load, floating-point truncating stores, or v2i32->v2i16 truncating store. 
- setLoadExtAction(ISD::EXTLOAD, MVT::f16, Expand); - setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand); - setLoadExtAction(ISD::EXTLOAD, MVT::f64, Expand); - setLoadExtAction(ISD::EXTLOAD, MVT::f80, Expand); - setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Expand); + for (MVT VT : MVT::fp_valuetypes()) { + setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand); + setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand); + setLoadExtAction(ISD::EXTLOAD, VT, MVT::f64, Expand); + setLoadExtAction(ISD::EXTLOAD, VT, MVT::f80, Expand); + } + for (MVT VT : MVT::integer_valuetypes()) + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Expand); + setTruncStoreAction(MVT::f32, MVT::f16, Expand); setTruncStoreAction(MVT::f64, MVT::f32, Expand); setTruncStoreAction(MVT::f64, MVT::f16, Expand); @@ -531,26 +542,22 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM) setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand); // Likewise, narrowing and extending vector loads/stores aren't handled // directly. - for (unsigned VT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE; - VT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++VT) { - - setOperationAction(ISD::SIGN_EXTEND_INREG, (MVT::SimpleValueType)VT, - Expand); - - setOperationAction(ISD::MULHS, (MVT::SimpleValueType)VT, Expand); - setOperationAction(ISD::SMUL_LOHI, (MVT::SimpleValueType)VT, Expand); - setOperationAction(ISD::MULHU, (MVT::SimpleValueType)VT, Expand); - setOperationAction(ISD::UMUL_LOHI, (MVT::SimpleValueType)VT, Expand); - - setOperationAction(ISD::BSWAP, (MVT::SimpleValueType)VT, Expand); - - for (unsigned InnerVT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE; - InnerVT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++InnerVT) - setTruncStoreAction((MVT::SimpleValueType)VT, - (MVT::SimpleValueType)InnerVT, Expand); - setLoadExtAction(ISD::SEXTLOAD, (MVT::SimpleValueType)VT, Expand); - setLoadExtAction(ISD::ZEXTLOAD, (MVT::SimpleValueType)VT, Expand); - setLoadExtAction(ISD::EXTLOAD, (MVT::SimpleValueType)VT, Expand); + for (MVT VT : MVT::vector_valuetypes()) { + setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand); + + setOperationAction(ISD::MULHS, VT, Expand); + setOperationAction(ISD::SMUL_LOHI, VT, Expand); + setOperationAction(ISD::MULHU, VT, Expand); + setOperationAction(ISD::UMUL_LOHI, VT, Expand); + + setOperationAction(ISD::BSWAP, VT, Expand); + + for (MVT InnerVT : MVT::vector_valuetypes()) { + setTruncStoreAction(VT, InnerVT, Expand); + setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand); + setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand); + setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand); + } } // AArch64 has implementations of a lot of rounding-like FP operations. @@ -615,7 +622,8 @@ void AArch64TargetLowering::addTypeForNEON(EVT VT, EVT PromotedBitwiseVT) { setOperationAction(ISD::SELECT, VT.getSimpleVT(), Expand); setOperationAction(ISD::SELECT_CC, VT.getSimpleVT(), Expand); setOperationAction(ISD::VSELECT, VT.getSimpleVT(), Expand); - setLoadExtAction(ISD::EXTLOAD, VT.getSimpleVT(), Expand); + for (MVT InnerVT : MVT::all_valuetypes()) + setLoadExtAction(ISD::EXTLOAD, InnerVT, VT.getSimpleVT(), Expand); // CNT supports only B element sizes. if (VT != MVT::v8i8 && VT != MVT::v16i8) @@ -722,13 +730,6 @@ MVT AArch64TargetLowering::getScalarShiftAmountTy(EVT LHSTy) const { return MVT::i64; } -unsigned AArch64TargetLowering::getMaximalGlobalOffset() const { - // FIXME: On AArch64, this depends on the type. - // Basically, the addressable offsets are up to 4095 * Ty.getSizeInBytes(). 
- // and the offset has to be a multiple of the related size in bytes. - return 4095; -} - FastISel * AArch64TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo) const { @@ -869,9 +870,8 @@ AArch64TargetLowering::EmitF128CSEL(MachineInstr *MI, // EndBB: // Dest = PHI [IfTrue, TrueBB], [IfFalse, OrigBB] - const TargetInstrInfo *TII = - getTargetMachine().getSubtargetImpl()->getInstrInfo(); MachineFunction *MF = MBB->getParent(); + const TargetInstrInfo *TII = Subtarget->getInstrInfo(); const BasicBlock *LLVM_BB = MBB->getBasicBlock(); DebugLoc DL = MI->getDebugLoc(); MachineFunction::iterator It = MBB; @@ -1330,10 +1330,7 @@ getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) { SDValue AArch64TargetLowering::LowerF128Call(SDValue Op, SelectionDAG &DAG, RTLIB::Libcall Call) const { - SmallVector<SDValue, 2> Ops; - for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) - Ops.push_back(Op.getOperand(i)); - + SmallVector<SDValue, 2> Ops(Op->op_begin(), Op->op_end()); return makeLibCall(DAG, Call, MVT::f128, &Ops[0], Ops.size(), false, SDLoc(Op)).first; } @@ -1561,10 +1558,7 @@ SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op, else LC = RTLIB::getFPTOUINT(Op.getOperand(0).getValueType(), Op.getValueType()); - SmallVector<SDValue, 2> Ops; - for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) - Ops.push_back(Op.getOperand(i)); - + SmallVector<SDValue, 2> Ops(Op->op_begin(), Op->op_end()); return makeLibCall(DAG, LC, Op.getValueType(), &Ops[0], Ops.size(), false, SDLoc(Op)).first; } @@ -1981,6 +1975,8 @@ CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC, llvm_unreachable("Unsupported calling convention."); case CallingConv::WebKit_JS: return CC_AArch64_WebKit_JS; + case CallingConv::GHC: + return CC_AArch64_GHC; case CallingConv::C: case CallingConv::Fast: if (!Subtarget->isTargetDarwin()) @@ -2012,18 +2008,19 @@ SDValue AArch64TargetLowering::LowerFormalArguments( unsigned CurArgIdx = 0; for (unsigned i = 0; i != NumArgs; ++i) { MVT ValVT = Ins[i].VT; - std::advance(CurOrigArg, Ins[i].OrigArgIndex - CurArgIdx); - CurArgIdx = Ins[i].OrigArgIndex; - - // Get type of the original argument. - EVT ActualVT = getValueType(CurOrigArg->getType(), /*AllowUnknown*/ true); - MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : MVT::Other; - // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16. - if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8) - ValVT = MVT::i8; - else if (ActualMVT == MVT::i16) - ValVT = MVT::i16; + if (Ins[i].isOrigArg()) { + std::advance(CurOrigArg, Ins[i].getOrigArgIndex() - CurArgIdx); + CurArgIdx = Ins[i].getOrigArgIndex(); + // Get type of the original argument. + EVT ActualVT = getValueType(CurOrigArg->getType(), /*AllowUnknown*/ true); + MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : MVT::Other; + // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16. 
+ if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8) + ValVT = MVT::i8; + else if (ActualMVT == MVT::i16) + ValVT = MVT::i16; + } CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, /*IsVarArg=*/false); bool Res = AssignFn(i, ValVT, ValVT, CCValAssign::Full, Ins[i].Flags, CCInfo); @@ -2106,7 +2103,8 @@ SDValue AArch64TargetLowering::LowerFormalArguments( unsigned ArgSize = VA.getValVT().getSizeInBits() / 8; uint32_t BEAlign = 0; - if (ArgSize < 8 && !Subtarget->isLittleEndian()) + if (!Subtarget->isLittleEndian() && ArgSize < 8 && + !Ins[i].Flags.isInConsecutiveRegs()) BEAlign = 8 - ArgSize; int FI = MFI->CreateFixedObject(ArgSize, ArgOffset + BEAlign, true); @@ -2198,8 +2196,7 @@ void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo, AArch64::X3, AArch64::X4, AArch64::X5, AArch64::X6, AArch64::X7 }; static const unsigned NumGPRArgRegs = array_lengthof(GPRArgRegs); - unsigned FirstVariadicGPR = - CCInfo.getFirstUnallocated(GPRArgRegs, NumGPRArgRegs); + unsigned FirstVariadicGPR = CCInfo.getFirstUnallocated(GPRArgRegs); unsigned GPRSaveSize = 8 * (NumGPRArgRegs - FirstVariadicGPR); int GPRIdx = 0; @@ -2227,8 +2224,7 @@ void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo, AArch64::Q0, AArch64::Q1, AArch64::Q2, AArch64::Q3, AArch64::Q4, AArch64::Q5, AArch64::Q6, AArch64::Q7}; static const unsigned NumFPRArgRegs = array_lengthof(FPRArgRegs); - unsigned FirstVariadicFPR = - CCInfo.getFirstUnallocated(FPRArgRegs, NumFPRArgRegs); + unsigned FirstVariadicFPR = CCInfo.getFirstUnallocated(FPRArgRegs); unsigned FPRSaveSize = 16 * (NumFPRArgRegs - FirstVariadicFPR); int FPRIdx = 0; @@ -2349,7 +2345,9 @@ bool AArch64TargetLowering::isEligibleForTailCallOptimization( // cannot rely on the linker replacing the tail call with a return. if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) { const GlobalValue *GV = G->getGlobal(); - if (GV->hasExternalWeakLinkage()) + const Triple TT(getTargetMachine().getTargetTriple()); + if (GV->hasExternalWeakLinkage() && + (!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO())) return false; } @@ -2660,7 +2658,8 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, unsigned OpSize = Flags.isByVal() ? Flags.getByValSize() * 8 : VA.getValVT().getSizeInBits(); OpSize = (OpSize + 7) / 8; - if (!Subtarget->isLittleEndian() && !Flags.isByVal()) { + if (!Subtarget->isLittleEndian() && !Flags.isByVal() && + !Flags.isInConsecutiveRegs()) { if (OpSize < 8) BEAlign = 8 - OpSize; } @@ -2782,19 +2781,16 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, // Add a register mask operand representing the call-preserved registers. 
const uint32_t *Mask; - const TargetRegisterInfo *TRI = - getTargetMachine().getSubtargetImpl()->getRegisterInfo(); - const AArch64RegisterInfo *ARI = - static_cast<const AArch64RegisterInfo *>(TRI); + const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo(); if (IsThisReturn) { // For 'this' returns, use the X0-preserving mask if applicable - Mask = ARI->getThisReturnPreservedMask(CallConv); + Mask = TRI->getThisReturnPreservedMask(CallConv); if (!Mask) { IsThisReturn = false; - Mask = ARI->getCallPreservedMask(CallConv); + Mask = TRI->getCallPreservedMask(CallConv); } } else - Mask = ARI->getCallPreservedMask(CallConv); + Mask = TRI->getCallPreservedMask(CallConv); assert(Mask && "Missing call preserved mask for calling convention"); Ops.push_back(DAG.getRegisterMask(Mask)); @@ -3014,11 +3010,8 @@ AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op, // TLS calls preserve all registers except those that absolutely must be // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be // silly). - const TargetRegisterInfo *TRI = - getTargetMachine().getSubtargetImpl()->getRegisterInfo(); - const AArch64RegisterInfo *ARI = - static_cast<const AArch64RegisterInfo *>(TRI); - const uint32_t *Mask = ARI->getTLSCallPreservedMask(); + const uint32_t *Mask = + Subtarget->getRegisterInfo()->getTLSCallPreservedMask(); // Finally, we can make the call. This is just a degenerate version of a // normal AArch64 call node: x0 takes the address of the descriptor, and @@ -3065,11 +3058,8 @@ SDValue AArch64TargetLowering::LowerELFTLSDescCall(SDValue SymAddr, // TLS calls preserve all registers except those that absolutely must be // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be // silly). - const TargetRegisterInfo *TRI = - getTargetMachine().getSubtargetImpl()->getRegisterInfo(); - const AArch64RegisterInfo *ARI = - static_cast<const AArch64RegisterInfo *>(TRI); - const uint32_t *Mask = ARI->getTLSCallPreservedMask(); + const uint32_t *Mask = + Subtarget->getRegisterInfo()->getTLSCallPreservedMask(); // The function takes only one argument: the address of the descriptor itself // in X0. 
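Editor's note (commentary, not part of the patch): the three hunks above make the same mechanical change, taking the call-preserved register mask from the AArch64RegisterInfo cached on the subtarget instead of re-deriving it, with a static_cast, through getSubtargetImpl() on every call. Assembled from the hunks above, the call-lowering code now reads roughly:

    // Attach the calling convention's preserved-register mask to the call node.
    const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
    const uint32_t *Mask;
    if (IsThisReturn) {
      // For 'this' returns, use the X0-preserving mask if one exists.
      Mask = TRI->getThisReturnPreservedMask(CallConv);
      if (!Mask) {
        IsThisReturn = false;
        Mask = TRI->getCallPreservedMask(CallConv);
      }
    } else
      Mask = TRI->getCallPreservedMask(CallConv);
    assert(Mask && "Missing call preserved mask for calling convention");
    Ops.push_back(DAG.getRegisterMask(Mask));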
@@ -3259,8 +3249,8 @@ SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const { OFCC = getInvertedCondCode(OFCC); SDValue CCVal = DAG.getConstant(OFCC, MVT::i32); - return DAG.getNode(AArch64ISD::BRCOND, SDLoc(LHS), MVT::Other, Chain, Dest, - CCVal, Overflow); + return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal, + Overflow); } if (LHS.getValueType().isInteger()) { @@ -3429,8 +3419,8 @@ SDValue AArch64TargetLowering::LowerFCOPYSIGN(SDValue Op, } SDValue AArch64TargetLowering::LowerCTPOP(SDValue Op, SelectionDAG &DAG) const { - if (DAG.getMachineFunction().getFunction()->getAttributes().hasAttribute( - AttributeSet::FunctionIndex, Attribute::NoImplicitFloat)) + if (DAG.getMachineFunction().getFunction()->hasFnAttribute( + Attribute::NoImplicitFloat)) return SDValue(); if (!Subtarget->hasNEON()) @@ -3447,18 +3437,12 @@ SDValue AArch64TargetLowering::LowerCTPOP(SDValue Op, SelectionDAG &DAG) const { SDValue Val = Op.getOperand(0); SDLoc DL(Op); EVT VT = Op.getValueType(); - SDValue ZeroVec = DAG.getUNDEF(MVT::v8i8); - SDValue VecVal; - if (VT == MVT::i32) { - VecVal = DAG.getNode(ISD::BITCAST, DL, MVT::f32, Val); - VecVal = DAG.getTargetInsertSubreg(AArch64::ssub, DL, MVT::v8i8, ZeroVec, - VecVal); - } else { - VecVal = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Val); - } + if (VT == MVT::i32) + Val = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Val); + Val = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Val); - SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v8i8, VecVal); + SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v8i8, Val); SDValue UaddLV = DAG.getNode( ISD::INTRINSIC_WO_CHAIN, DL, MVT::i32, DAG.getConstant(Intrinsic::aarch64_neon_uaddlv, MVT::i32), CtPop); @@ -4279,7 +4263,8 @@ AArch64TargetLowering::getSingleConstraintMatchWeight( std::pair<unsigned, const TargetRegisterClass *> AArch64TargetLowering::getRegForInlineAsmConstraint( - const std::string &Constraint, MVT VT) const { + const TargetRegisterInfo *TRI, const std::string &Constraint, + MVT VT) const { if (Constraint.size() == 1) { switch (Constraint[0]) { case 'r': @@ -4308,7 +4293,7 @@ AArch64TargetLowering::getRegForInlineAsmConstraint( // Use the default implementation in TargetLowering to convert the register // constraint into a member of a register class. std::pair<unsigned, const TargetRegisterClass *> Res; - Res = TargetLowering::getRegForInlineAsmConstraint(Constraint, VT); + Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); // Not found as a standard register? 
if (!Res.second) { @@ -4615,19 +4600,21 @@ SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op, // The extraction can just take the second half Src.ShuffleVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, - DAG.getIntPtrConstant(NumSrcElts)); + DAG.getConstant(NumSrcElts, MVT::i64)); Src.WindowBase = -NumSrcElts; } else if (Src.MaxElt < NumSrcElts) { // The extraction can just take the first half - Src.ShuffleVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, - Src.ShuffleVec, DAG.getIntPtrConstant(0)); + Src.ShuffleVec = + DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, + DAG.getConstant(0, MVT::i64)); } else { // An actual VEXT is needed - SDValue VEXTSrc1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, - Src.ShuffleVec, DAG.getIntPtrConstant(0)); + SDValue VEXTSrc1 = + DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, + DAG.getConstant(0, MVT::i64)); SDValue VEXTSrc2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, - DAG.getIntPtrConstant(NumSrcElts)); + DAG.getConstant(NumSrcElts, MVT::i64)); unsigned Imm = Src.MinElt * getExtFactor(VEXTSrc1); Src.ShuffleVec = DAG.getNode(AArch64ISD::EXT, dl, DestVT, VEXTSrc1, @@ -6270,6 +6257,8 @@ static SDValue EmitVectorComparison(SDValue LHS, SDValue RHS, AArch64CC::CondCode CC, bool NoNans, EVT VT, SDLoc dl, SelectionDAG &DAG) { EVT SrcVT = LHS.getValueType(); + assert(VT.getSizeInBits() == SrcVT.getSizeInBits() && + "function only supposed to emit natural comparisons"); BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(RHS.getNode()); APInt CnstBits(VT.getSizeInBits(), 0); @@ -6364,13 +6353,15 @@ SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op, ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get(); SDValue LHS = Op.getOperand(0); SDValue RHS = Op.getOperand(1); + EVT CmpVT = LHS.getValueType().changeVectorElementTypeToInteger(); SDLoc dl(Op); if (LHS.getValueType().getVectorElementType().isInteger()) { assert(LHS.getValueType() == RHS.getValueType()); AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC); - return EmitVectorComparison(LHS, RHS, AArch64CC, false, Op.getValueType(), - dl, DAG); + SDValue Cmp = + EmitVectorComparison(LHS, RHS, AArch64CC, false, CmpVT, dl, DAG); + return DAG.getSExtOrTrunc(Cmp, dl, Op.getValueType()); } assert(LHS.getValueType().getVectorElementType() == MVT::f32 || @@ -6384,19 +6375,21 @@ SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op, bool NoNaNs = getTargetMachine().Options.NoNaNsFPMath; SDValue Cmp = - EmitVectorComparison(LHS, RHS, CC1, NoNaNs, Op.getValueType(), dl, DAG); + EmitVectorComparison(LHS, RHS, CC1, NoNaNs, CmpVT, dl, DAG); if (!Cmp.getNode()) return SDValue(); if (CC2 != AArch64CC::AL) { SDValue Cmp2 = - EmitVectorComparison(LHS, RHS, CC2, NoNaNs, Op.getValueType(), dl, DAG); + EmitVectorComparison(LHS, RHS, CC2, NoNaNs, CmpVT, dl, DAG); if (!Cmp2.getNode()) return SDValue(); - Cmp = DAG.getNode(ISD::OR, dl, Cmp.getValueType(), Cmp, Cmp2); + Cmp = DAG.getNode(ISD::OR, dl, CmpVT, Cmp, Cmp2); } + Cmp = DAG.getSExtOrTrunc(Cmp, dl, Op.getValueType()); + if (ShouldInvert) return Cmp = DAG.getNOT(dl, Cmp, Cmp.getValueType()); @@ -6534,6 +6527,34 @@ bool AArch64TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const { return NumBits1 > NumBits2; } +/// Check if it is profitable to hoist an instruction in then/else to if. +/// Not profitable if I and its user can form an FMA instruction +/// because we prefer FMSUB/FMADD.
+bool AArch64TargetLowering::isProfitableToHoist(Instruction *I) const { + if (I->getOpcode() != Instruction::FMul) + return true; + + if (I->getNumUses() != 1) + return true; + + Instruction *User = I->user_back(); + + if (User && + !(User->getOpcode() == Instruction::FSub || + User->getOpcode() == Instruction::FAdd)) + return true; + + const TargetOptions &Options = getTargetMachine().Options; + EVT VT = getValueType(User->getOperand(0)->getType()); + + if (isFMAFasterThanFMulAndFAdd(VT) && + isOperationLegalOrCustom(ISD::FMA, VT) && + (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath)) + return false; + + return true; +} + // All 32-bit GPR operations implicitly zero the high-half of the corresponding // 64-bit GPR. bool AArch64TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const { @@ -6604,8 +6625,7 @@ EVT AArch64TargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign, bool Fast; const Function *F = MF.getFunction(); if (Subtarget->hasFPARMv8() && !IsMemset && Size >= 16 && - !F->getAttributes().hasAttribute(AttributeSet::FunctionIndex, - Attribute::NoImplicitFloat) && + !F->hasFnAttribute(Attribute::NoImplicitFloat) && (memOpAlign(SrcAlign, DstAlign, 16) || (allowsMisalignedMemoryAccesses(MVT::f128, 0, 1, &Fast) && Fast))) return MVT::f128; @@ -6948,7 +6968,8 @@ static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N, return SDValue(); } -static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG) { +static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG, + const AArch64Subtarget *Subtarget) { // First try to optimize away the conversion when it's conditionally from // a constant. Vectors only. SDValue Res = performVectorCompareAndMaskUnaryOpCombine(N, DAG); @@ -6967,7 +6988,7 @@ static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG) { // conversion, use an fp load and an AdvSIMD scalar {S|U}CVTF instead. // This eliminates an integer-to-vector-move UOP and improves throughput. SDValue N0 = N->getOperand(0); - if (ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() && + if (Subtarget->hasNEON() && ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() && // Do not change the width of a volatile load. !cast<LoadSDNode>(N0)->isVolatile()) { LoadSDNode *LN0 = cast<LoadSDNode>(N0); @@ -7756,9 +7777,9 @@ static SDValue performExtendCombine(SDNode *N, EVT InNVT = EVT::getVectorVT(*DAG.getContext(), SrcVT.getVectorElementType(), LoVT.getVectorNumElements()); Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InNVT, Src, - DAG.getIntPtrConstant(0)); + DAG.getConstant(0, MVT::i64)); Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InNVT, Src, - DAG.getIntPtrConstant(InNVT.getVectorNumElements())); + DAG.getConstant(InNVT.getVectorNumElements(), MVT::i64)); Lo = DAG.getNode(N->getOpcode(), DL, LoVT, Lo); Hi = DAG.getNode(N->getOpcode(), DL, HiVT, Hi); @@ -7839,14 +7860,13 @@ static SDValue performSTORECombine(SDNode *N, return SDValue(); // Cyclone has bad performance on unaligned 16B stores when crossing line and - // page boundries. We want to split such stores. + // page boundaries. We want to split such stores. if (!Subtarget->isCyclone()) return SDValue(); // Don't split at Oz.
MachineFunction &MF = DAG.getMachineFunction(); - bool IsMinSize = MF.getFunction()->getAttributes().hasAttribute( - AttributeSet::FunctionIndex, Attribute::MinSize); + bool IsMinSize = MF.getFunction()->hasFnAttribute(Attribute::MinSize); if (IsMinSize) return SDValue(); @@ -7880,9 +7900,9 @@ static SDValue performSTORECombine(SDNode *N, EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), NumElts); SDValue SubVector0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal, - DAG.getIntPtrConstant(0)); + DAG.getConstant(0, MVT::i64)); SDValue SubVector1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal, - DAG.getIntPtrConstant(NumElts)); + DAG.getConstant(NumElts, MVT::i64)); SDValue BasePtr = S->getBasePtr(); SDValue NewST1 = DAG.getStore(S->getChain(), DL, SubVector0, BasePtr, S->getPointerInfo(), @@ -7973,7 +7993,7 @@ static SDValue performPostLD1Combine(SDNode *N, LoadSDN->getMemOperand()); // Update the uses. - std::vector<SDValue> NewResults; + SmallVector<SDValue, 2> NewResults; NewResults.push_back(SDValue(LD, 0)); // The result of load NewResults.push_back(SDValue(UpdN.getNode(), 2)); // Chain DCI.CombineTo(LD, NewResults); @@ -8478,6 +8498,12 @@ static SDValue performSelectCombine(SDNode *N, SelectionDAG &DAG) { // largest real NEON comparison is 64-bits per lane, which means the result is // at most 32-bits and an illegal vector. Just bail out for now. EVT SrcVT = N0.getOperand(0).getValueType(); + + // Don't try to do this optimization when the setcc itself has i1 operands. + // There are no legal vectors of i1, so this would be pointless. + if (SrcVT == MVT::i1) + return SDValue(); + int NumMaskElts = ResVT.getSizeInBits() / SrcVT.getSizeInBits(); if (!ResVT.isVector() || NumMaskElts == 0) return SDValue(); @@ -8518,7 +8544,7 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, return performMulCombine(N, DAG, DCI, Subtarget); case ISD::SINT_TO_FP: case ISD::UINT_TO_FP: - return performIntToFpCombine(N, DAG); + return performIntToFpCombine(N, DAG, Subtarget); case ISD::OR: return performORCombine(N, DCI, Subtarget); case ISD::INTRINSIC_WO_CHAIN: @@ -8696,13 +8722,12 @@ bool AArch64TargetLowering::getPostIndexedAddressParts( static void ReplaceBITCASTResults(SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) { - if (N->getValueType(0) != MVT::i16) - return; - SDLoc DL(N); SDValue Op = N->getOperand(0); - assert(Op.getValueType() == MVT::f16 && - "Inconsistent bitcast? Only 16-bit types should be i16 or f16"); + + if (N->getValueType(0) != MVT::i16 || Op.getValueType() != MVT::f16) + return; + Op = SDValue( DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, DL, MVT::f32, DAG.getUNDEF(MVT::i32), Op, @@ -8732,6 +8757,12 @@ bool AArch64TargetLowering::useLoadStackGuardNode() const { return true; } +bool AArch64TargetLowering::combineRepeatedFPDivisors(unsigned NumUsers) const { + // Combine multiple FDIVs with the same divisor into multiple FMULs by the + // reciprocal if there are three or more FDIVs. 
+ return NumUsers > 2; +} + TargetLoweringBase::LegalizeTypeAction AArch64TargetLowering::getPreferredVectorAction(EVT VT) const { MVT SVT = VT.getSimpleVT(); @@ -8836,3 +8867,8 @@ Value *AArch64TargetLowering::emitStoreConditional(IRBuilder<> &Builder, Val, Stxr->getFunctionType()->getParamType(0)), Addr); } + +bool AArch64TargetLowering::functionArgumentNeedsConsecutiveRegisters( + Type *Ty, CallingConv::ID CallConv, bool isVarArg) const { + return Ty->isArrayTy(); +} diff --git a/lib/Target/AArch64/AArch64ISelLowering.h b/lib/Target/AArch64/AArch64ISelLowering.h index 2f5708d..e973364 100644 --- a/lib/Target/AArch64/AArch64ISelLowering.h +++ b/lib/Target/AArch64/AArch64ISelLowering.h @@ -18,6 +18,7 @@ #include "llvm/CodeGen/CallingConvLower.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/IR/CallingConv.h" +#include "llvm/IR/Instruction.h" #include "llvm/Target/TargetLowering.h" namespace llvm { @@ -207,7 +208,8 @@ class AArch64TargetLowering : public TargetLowering { bool RequireStrictAlign; public: - explicit AArch64TargetLowering(const TargetMachine &TM); + explicit AArch64TargetLowering(const TargetMachine &TM, + const AArch64Subtarget &STI); /// Selects the correct CCAssignFn for a given CallingConvention value. CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg) const; @@ -222,7 +224,7 @@ public: MVT getScalarShiftAmountTy(EVT LHSTy) const override; /// allowsMisalignedMemoryAccesses - Returns true if the target allows - /// unaligned memory accesses. of the specified type. + /// unaligned memory accesses of the specified type. bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AddrSpace = 0, unsigned Align = 1, bool *Fast = nullptr) const override { @@ -244,10 +246,6 @@ public: /// getFunctionAlignment - Return the Log2 alignment of this function. unsigned getFunctionAlignment(const Function *F) const; - /// getMaximalGlobalOffset - Returns the maximal possible offset which can - /// be used for loads / stores from the global. - unsigned getMaximalGlobalOffset() const override; - /// Returns true if a cast between SrcAS and DestAS is a noop. bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override { // Addrspacecasts are always noops. 
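Editor's note (commentary, not part of the patch): the new combineRepeatedFPDivisors() hook, implemented just above, gates the generic DAGCombiner rewrite of several divisions by one divisor into multiplications by its reciprocal; returning NumUsers > 2 enables it only once three or more FDIVs share the divisor. Illustrative C++ only (hypothetical values, fast-math assumed; the real transform operates on SelectionDAG nodes):

    // Three divides by the same divisor d...
    double x = a / d;
    double y = b / d;
    double z = c / d;
    // ...are rewritten as one divide plus three cheap multiplies:
    double r = 1.0 / d;
    x = a * r;
    y = b * r;
    z = c * r;

At two users the trade is one FDIV plus two FMULs against two FDIVs, which the patch evidently judges not worthwhile on AArch64; the "three or more FDIVs" comment encodes that threshold.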
@@ -285,6 +283,8 @@ public: bool isTruncateFree(Type *Ty1, Type *Ty2) const override; bool isTruncateFree(EVT VT1, EVT VT2) const override; + bool isProfitableToHoist(Instruction *I) const override; + bool isZExtFree(Type *Ty1, Type *Ty2) const override; bool isZExtFree(EVT VT1, EVT VT2) const override; bool isZExtFree(SDValue Val, EVT VT2) const override; @@ -440,6 +440,7 @@ private: SDValue BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG, std::vector<SDNode *> *Created) const override; + bool combineRepeatedFPDivisors(unsigned NumUsers) const override; ConstraintType getConstraintType(const std::string &Constraint) const override; @@ -452,7 +453,8 @@ private: const char *constraint) const override; std::pair<unsigned, const TargetRegisterClass *> - getRegForInlineAsmConstraint(const std::string &Constraint, + getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, + const std::string &Constraint, MVT VT) const override; void LowerAsmOperandForConstraint(SDValue Op, std::string &Constraint, std::vector<SDValue> &Ops, @@ -472,6 +474,10 @@ private: void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const override; + + bool functionArgumentNeedsConsecutiveRegisters(Type *Ty, + CallingConv::ID CallConv, + bool isVarArg) const override; }; namespace AArch64 { diff --git a/lib/Target/AArch64/AArch64InstrFormats.td b/lib/Target/AArch64/AArch64InstrFormats.td index 2b0f5d2..d295c02 100644 --- a/lib/Target/AArch64/AArch64InstrFormats.td +++ b/lib/Target/AArch64/AArch64InstrFormats.td @@ -4383,7 +4383,7 @@ class BaseSIMDVectorLShiftLongBySize<bit Q, bits<2> size, } multiclass SIMDVectorLShiftLongBySizeBHS { - let neverHasSideEffects = 1 in { + let hasSideEffects = 0 in { def v8i8 : BaseSIMDVectorLShiftLongBySize<0, 0b00, V64, "shll", ".8h", ".8b", "8">; def v16i8 : BaseSIMDVectorLShiftLongBySize<1, 0b00, V128, diff --git a/lib/Target/AArch64/AArch64InstrInfo.cpp b/lib/Target/AArch64/AArch64InstrInfo.cpp index 2dbb31c..64cec55 100644 --- a/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -12,9 +12,9 @@ //===----------------------------------------------------------------------===// #include "AArch64InstrInfo.h" +#include "AArch64MachineCombinerPattern.h" #include "AArch64Subtarget.h" #include "MCTargetDesc/AArch64AddressingModes.h" -#include "AArch64MachineCombinerPattern.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineMemOperand.h" @@ -707,9 +707,8 @@ static bool UpdateOperandRegClass(MachineInstr *Instr) { assert(MBB && "Can't get MachineBasicBlock here"); MachineFunction *MF = MBB->getParent(); assert(MF && "Can't get MachineFunction here"); - const TargetMachine *TM = &MF->getTarget(); - const TargetInstrInfo *TII = TM->getSubtargetImpl()->getInstrInfo(); - const TargetRegisterInfo *TRI = TM->getSubtargetImpl()->getRegisterInfo(); + const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo(); + const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo(); MachineRegisterInfo *MRI = &MF->getRegInfo(); for (unsigned OpIdx = 0, EndIdx = Instr->getNumOperands(); OpIdx < EndIdx; diff --git a/lib/Target/AArch64/AArch64InstrInfo.h b/lib/Target/AArch64/AArch64InstrInfo.h index 30bf650..d8f1274 100644 --- a/lib/Target/AArch64/AArch64InstrInfo.h +++ b/lib/Target/AArch64/AArch64InstrInfo.h @@ -16,8 +16,8 @@ #include "AArch64.h" #include "AArch64RegisterInfo.h" -#include "llvm/Target/TargetInstrInfo.h" #include 
"llvm/CodeGen/MachineCombinerPattern.h" +#include "llvm/Target/TargetInstrInfo.h" #define GET_INSTRINFO_HEADER #include "AArch64GenInstrInfo.inc" diff --git a/lib/Target/AArch64/AArch64InstrInfo.td b/lib/Target/AArch64/AArch64InstrInfo.td index 252ed40..6e4c0b0 100644 --- a/lib/Target/AArch64/AArch64InstrInfo.td +++ b/lib/Target/AArch64/AArch64InstrInfo.td @@ -481,6 +481,24 @@ def trunc_imm : SDNodeXForm<imm, [{ def : Pat<(i64 i64imm_32bit:$src), (SUBREG_TO_REG (i64 0), (MOVi32imm (trunc_imm imm:$src)), sub_32)>; +// Materialize FP constants via MOVi32imm/MOVi64imm (MachO large code model). +def bitcast_fpimm_to_i32 : SDNodeXForm<fpimm, [{ +return CurDAG->getTargetConstant( + N->getValueAPF().bitcastToAPInt().getZExtValue(), MVT::i32); +}]>; + +def bitcast_fpimm_to_i64 : SDNodeXForm<fpimm, [{ +return CurDAG->getTargetConstant( + N->getValueAPF().bitcastToAPInt().getZExtValue(), MVT::i64); +}]>; + + +def : Pat<(f32 fpimm:$in), + (COPY_TO_REGCLASS (MOVi32imm (bitcast_fpimm_to_i32 f32:$in)), FPR32)>; +def : Pat<(f64 fpimm:$in), + (COPY_TO_REGCLASS (MOVi64imm (bitcast_fpimm_to_i64 f64:$in)), FPR64)>; + + // Deal with the various forms of (ELF) large addressing with MOVZ/MOVK // sequences. def : Pat<(AArch64WrapperLarge tglobaladdr:$g3, tglobaladdr:$g2, @@ -639,6 +657,10 @@ def : Pat<(i32 (ineg (mul GPR32:$Rn, GPR32:$Rm))), (MSUBWrrr GPR32:$Rn, GPR32:$Rm, WZR)>; def : Pat<(i64 (ineg (mul GPR64:$Rn, GPR64:$Rm))), (MSUBXrrr GPR64:$Rn, GPR64:$Rm, XZR)>; +def : Pat<(i32 (mul (ineg GPR32:$Rn), GPR32:$Rm)), + (MSUBWrrr GPR32:$Rn, GPR32:$Rm, WZR)>; +def : Pat<(i64 (mul (ineg GPR64:$Rn), GPR64:$Rm)), + (MSUBXrrr GPR64:$Rn, GPR64:$Rm, XZR)>; } // AddedComplexity = 7 let AddedComplexity = 5 in { @@ -789,7 +811,7 @@ def : Pat<(bswap (rotr GPR64:$Rn, (i64 32))), (REV32Xr GPR64:$Rn)>; //===----------------------------------------------------------------------===// // Bitfield immediate extraction instruction. //===----------------------------------------------------------------------===// -let neverHasSideEffects = 1 in +let hasSideEffects = 0 in defm EXTR : ExtractImm<"extr">; def : InstAlias<"ror $dst, $src, $shift", (EXTRWrri GPR32:$dst, GPR32:$src, GPR32:$src, imm0_31:$shift)>; @@ -804,7 +826,7 @@ def : Pat<(rotr GPR64:$Rn, (i64 imm0_63:$imm)), //===----------------------------------------------------------------------===// // Other bitfield immediate instructions. //===----------------------------------------------------------------------===// -let neverHasSideEffects = 1 in { +let hasSideEffects = 0 in { defm BFM : BitfieldImmWith2RegArgs<0b01, "bfm">; defm SBFM : BitfieldImm<0b00, "sbfm">; defm UBFM : BitfieldImm<0b10, "ubfm">; @@ -977,9 +999,9 @@ def : InstAlias<"cneg $dst, $src, $cc", // PC-relative instructions. //===----------------------------------------------------------------------===// let isReMaterializable = 1 in { -let neverHasSideEffects = 1, mayStore = 0, mayLoad = 0 in { +let hasSideEffects = 0, mayStore = 0, mayLoad = 0 in { def ADR : ADRI<0, "adr", adrlabel, []>; -} // neverHasSideEffects = 1 +} // hasSideEffects = 0 def ADRP : ADRI<1, "adrp", adrplabel, [(set GPR64:$Xd, (AArch64adrp tglobaladdr:$label))]>; @@ -1867,6 +1889,33 @@ let Predicates = [IsLE] in { } } // AddedComplexity = 10 +// Match stores from lane 0 to the appropriate subreg's store. 
+multiclass VecROStoreLane0Pat<ROAddrMode ro, SDPatternOperator storeop, + ValueType VecTy, ValueType STy, + SubRegIndex SubRegIdx, + Instruction STRW, Instruction STRX> { + + def : Pat<(storeop (STy (vector_extract (VecTy VecListOne128:$Vt), 0)), + (ro.Wpat GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend)), + (STRW (EXTRACT_SUBREG VecListOne128:$Vt, SubRegIdx), + GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend)>; + + def : Pat<(storeop (STy (vector_extract (VecTy VecListOne128:$Vt), 0)), + (ro.Xpat GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend)), + (STRX (EXTRACT_SUBREG VecListOne128:$Vt, SubRegIdx), + GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend)>; +} + +let AddedComplexity = 19 in { + defm : VecROStoreLane0Pat<ro16, truncstorei16, v8i16, i32, hsub, STRHroW, STRHroX>; + defm : VecROStoreLane0Pat<ro16, store , v8i16, i16, hsub, STRHroW, STRHroX>; + defm : VecROStoreLane0Pat<ro32, truncstorei32, v4i32, i32, ssub, STRSroW, STRSroX>; + defm : VecROStoreLane0Pat<ro32, store , v4i32, i32, ssub, STRSroW, STRSroX>; + defm : VecROStoreLane0Pat<ro32, store , v4f32, f32, ssub, STRSroW, STRSroX>; + defm : VecROStoreLane0Pat<ro64, store , v2i64, i64, dsub, STRDroW, STRDroX>; + defm : VecROStoreLane0Pat<ro64, store , v2f64, f64, dsub, STRDroW, STRDroX>; +} + //--- // (unsigned immediate) defm STRX : StoreUI<0b11, 0, 0b00, GPR64, uimm12s8, "str", @@ -3667,29 +3716,21 @@ defm : Neon_INS_elt_pattern<v2i64, v1i64, i64, INSvi32lane>; // Floating point vector extractions are codegen'd as either a sequence of -// subregister extractions, possibly fed by an INS if the lane number is -// anything other than zero. +// subregister extractions, or a MOV (aka CPY here, alias for DUP) if +// the lane number is anything other than zero. def : Pat<(vector_extract (v2f64 V128:$Rn), 0), (f64 (EXTRACT_SUBREG V128:$Rn, dsub))>; def : Pat<(vector_extract (v4f32 V128:$Rn), 0), (f32 (EXTRACT_SUBREG V128:$Rn, ssub))>; def : Pat<(vector_extract (v8f16 V128:$Rn), 0), (f16 (EXTRACT_SUBREG V128:$Rn, hsub))>; + def : Pat<(vector_extract (v2f64 V128:$Rn), VectorIndexD:$idx), - (f64 (EXTRACT_SUBREG - (INSvi64lane (v2f64 (IMPLICIT_DEF)), 0, - V128:$Rn, VectorIndexD:$idx), - dsub))>; + (f64 (CPYi64 V128:$Rn, VectorIndexD:$idx))>; def : Pat<(vector_extract (v4f32 V128:$Rn), VectorIndexS:$idx), - (f32 (EXTRACT_SUBREG - (INSvi32lane (v4f32 (IMPLICIT_DEF)), 0, - V128:$Rn, VectorIndexS:$idx), - ssub))>; + (f32 (CPYi32 V128:$Rn, VectorIndexS:$idx))>; def : Pat<(vector_extract (v8f16 V128:$Rn), VectorIndexH:$idx), - (f16 (EXTRACT_SUBREG - (INSvi16lane (v8f16 (IMPLICIT_DEF)), 0, - V128:$Rn, VectorIndexH:$idx), - hsub))>; + (f16 (CPYi16 V128:$Rn, VectorIndexH:$idx))>; // All concat_vectors operations are canonicalised to act on i64 vectors for // AArch64. 
In the general case we need an instruction, which had just as well be @@ -4124,7 +4165,7 @@ def MVNIv4s_msl : SIMDModifiedImmMoveMSL<1, 1, {1,1,0,?}, V128, "mvni", ".4s", // AdvSIMD indexed element //---------------------------------------------------------------------------- -let neverHasSideEffects = 1 in { +let hasSideEffects = 0 in { defm FMLA : SIMDFPIndexedSDTied<0, 0b0001, "fmla">; defm FMLS : SIMDFPIndexedSDTied<0, 0b0101, "fmls">; } @@ -4678,7 +4719,7 @@ defm LD1R : SIMDLdR<0, 0b110, 0, "ld1r", "One", 1, 2, 4, 8>; defm LD2R : SIMDLdR<1, 0b110, 0, "ld2r", "Two", 2, 4, 8, 16>; defm LD3R : SIMDLdR<0, 0b111, 0, "ld3r", "Three", 3, 6, 12, 24>; defm LD4R : SIMDLdR<1, 0b111, 0, "ld4r", "Four", 4, 8, 16, 32>; -let mayLoad = 1, neverHasSideEffects = 1 in { +let mayLoad = 1, hasSideEffects = 0 in { defm LD1 : SIMDLdSingleBTied<0, 0b000, "ld1", VecListOneb, GPR64pi1>; defm LD1 : SIMDLdSingleHTied<0, 0b010, 0, "ld1", VecListOneh, GPR64pi2>; defm LD1 : SIMDLdSingleSTied<0, 0b100, 0b00, "ld1", VecListOnes, GPR64pi4>; @@ -4768,7 +4809,7 @@ defm ST1 : SIMDStSingleH<0, 0b010, 0, "st1", VecListOneh, GPR64pi2>; defm ST1 : SIMDStSingleS<0, 0b100, 0b00, "st1", VecListOnes, GPR64pi4>; defm ST1 : SIMDStSingleD<0, 0b100, 0b01, "st1", VecListOned, GPR64pi8>; -let AddedComplexity = 15 in +let AddedComplexity = 19 in class St1Lane128Pat<SDPatternOperator scalar_store, Operand VecIndex, ValueType VTy, ValueType STy, Instruction ST1> : Pat<(scalar_store @@ -4784,7 +4825,7 @@ def : St1Lane128Pat<store, VectorIndexD, v2i64, i64, ST1i64>; def : St1Lane128Pat<store, VectorIndexD, v2f64, f64, ST1i64>; def : St1Lane128Pat<store, VectorIndexH, v8f16, f16, ST1i16>; -let AddedComplexity = 15 in +let AddedComplexity = 19 in class St1Lane64Pat<SDPatternOperator scalar_store, Operand VecIndex, ValueType VTy, ValueType STy, Instruction ST1> : Pat<(scalar_store @@ -4848,7 +4889,7 @@ defm : St1LanePost128Pat<post_store, VectorIndexD, v2i64, i64, ST1i64_POST, 8>; defm : St1LanePost128Pat<post_store, VectorIndexD, v2f64, f64, ST1i64_POST, 8>; defm : St1LanePost128Pat<post_store, VectorIndexH, v8f16, f16, ST1i16_POST, 2>; -let mayStore = 1, neverHasSideEffects = 1 in { +let mayStore = 1, hasSideEffects = 0 in { defm ST2 : SIMDStSingleB<1, 0b000, "st2", VecListTwob, GPR64pi2>; defm ST2 : SIMDStSingleH<1, 0b010, 0, "st2", VecListTwoh, GPR64pi4>; defm ST2 : SIMDStSingleS<1, 0b100, 0b00, "st2", VecListTwos, GPR64pi8>; diff --git a/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp index 8157981..8463ce6 100644 --- a/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp +++ b/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp @@ -135,6 +135,8 @@ static bool isUnscaledLdst(unsigned Opc) { return true; case AArch64::LDURXi: return true; + case AArch64::LDURSWi: + return true; } } @@ -173,6 +175,9 @@ int AArch64LoadStoreOpt::getMemSize(MachineInstr *MemMI) { case AArch64::LDRXui: case AArch64::LDURXi: return 8; + case AArch64::LDRSWui: + case AArch64::LDURSWi: + return 4; } } @@ -210,6 +215,9 @@ static unsigned getMatchingPairOpcode(unsigned Opc) { case AArch64::LDRXui: case AArch64::LDURXi: return AArch64::LDPXi; + case AArch64::LDRSWui: + case AArch64::LDURSWi: + return AArch64::LDPSWi; } } @@ -237,6 +245,8 @@ static unsigned getPreIndexedOpcode(unsigned Opc) { return AArch64::LDRWpre; case AArch64::LDRXui: return AArch64::LDRXpre; + case AArch64::LDRSWui: + return AArch64::LDRSWpre; } } @@ -264,6 +274,8 @@ static unsigned getPostIndexedOpcode(unsigned Opc) { return AArch64::LDRWpost; case 
AArch64::LDRXui: return AArch64::LDRXpost; + case AArch64::LDRSWui: + return AArch64::LDRSWpost; } } @@ -780,6 +792,7 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB) { case AArch64::LDRQui: case AArch64::LDRXui: case AArch64::LDRWui: + case AArch64::LDRSWui: // do the unscaled versions as well case AArch64::STURSi: case AArch64::STURDi: @@ -790,7 +803,8 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB) { case AArch64::LDURDi: case AArch64::LDURQi: case AArch64::LDURWi: - case AArch64::LDURXi: { + case AArch64::LDURXi: + case AArch64::LDURSWi: { // If this is a volatile load/store, don't mess with it. if (MI->hasOrderedMemoryRef()) { ++MBBI; @@ -931,10 +945,8 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB) { } bool AArch64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) { - const TargetMachine &TM = Fn.getTarget(); - TII = static_cast<const AArch64InstrInfo *>( - TM.getSubtargetImpl()->getInstrInfo()); - TRI = TM.getSubtargetImpl()->getRegisterInfo(); + TII = static_cast<const AArch64InstrInfo *>(Fn.getSubtarget().getInstrInfo()); + TRI = Fn.getSubtarget().getRegisterInfo(); bool Modified = false; for (auto &MBB : Fn) diff --git a/lib/Target/AArch64/AArch64PBQPRegAlloc.cpp b/lib/Target/AArch64/AArch64PBQPRegAlloc.cpp index f942c4e..4690177 100644 --- a/lib/Target/AArch64/AArch64PBQPRegAlloc.cpp +++ b/lib/Target/AArch64/AArch64PBQPRegAlloc.cpp @@ -235,7 +235,7 @@ bool A57ChainingConstraint::addIntraChainConstraint(PBQPRAGraph &G, unsigned Rd, costs[i + 1][j + 1] = sameParityMax + 1.0; } } - G.setEdgeCosts(edge, std::move(costs)); + G.updateEdgeCosts(edge, std::move(costs)); return true; } @@ -312,7 +312,7 @@ void A57ChainingConstraint::addInterChainConstraint(PBQPRAGraph &G, unsigned Rd, costs[i + 1][j + 1] = sameParityMax + 1.0; } } - G.setEdgeCosts(edge, std::move(costs)); + G.updateEdgeCosts(edge, std::move(costs)); } } } @@ -328,7 +328,7 @@ void A57ChainingConstraint::apply(PBQPRAGraph &G) { const MachineFunction &MF = G.getMetadata().MF; LiveIntervals &LIs = G.getMetadata().LIS; - TRI = MF.getTarget().getSubtargetImpl()->getRegisterInfo(); + TRI = MF.getSubtarget().getRegisterInfo(); DEBUG(MF.dump()); for (const auto &MBB: MF) { diff --git a/lib/Target/AArch64/AArch64PromoteConstant.cpp b/lib/Target/AArch64/AArch64PromoteConstant.cpp index 16c33b7..c037c86 100644 --- a/lib/Target/AArch64/AArch64PromoteConstant.cpp +++ b/lib/Target/AArch64/AArch64PromoteConstant.cpp @@ -22,7 +22,7 @@ #include "AArch64.h" #include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/IR/Constants.h" @@ -31,6 +31,7 @@ #include "llvm/IR/GlobalVariable.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/InlineAsm.h" +#include "llvm/IR/InstIterator.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Module.h" @@ -112,44 +113,42 @@ private: AU.addPreserved<DominatorTreeWrapperPass>(); } - /// Type to store a list of User. - typedef SmallVector<Value::user_iterator, 4> Users; + /// Type to store a list of Uses. + typedef SmallVector<Use *, 4> Uses; /// Map an insertion point to all the uses it dominates. - typedef DenseMap<Instruction *, Users> InsertionPoints; + typedef DenseMap<Instruction *, Uses> InsertionPoints; /// Map a function to the required insertion point of load for a /// global variable. 
typedef DenseMap<Function *, InsertionPoints> InsertionPointsPerFunc; /// Find the closest point that dominates the given Use. - Instruction *findInsertionPoint(Value::user_iterator &Use); + Instruction *findInsertionPoint(Use &Use); /// Check if the given insertion point is dominated by an existing /// insertion point. /// If true, the given use is added to the list of dominated uses for /// the related existing point. /// \param NewPt the insertion point to be checked - /// \param UseIt the use to be added into the list of dominated uses + /// \param Use the use to be added into the list of dominated uses /// \param InsertPts existing insertion points /// \pre NewPt and all instruction in InsertPts belong to the same function /// \return true if one of the insertion point in InsertPts dominates NewPt, /// false otherwise - bool isDominated(Instruction *NewPt, Value::user_iterator &UseIt, - InsertionPoints &InsertPts); + bool isDominated(Instruction *NewPt, Use &Use, InsertionPoints &InsertPts); /// Check if the given insertion point can be merged with an existing /// insertion point in a common dominator. /// If true, the given use is added to the list of the created insertion /// point. /// \param NewPt the insertion point to be checked - /// \param UseIt the use to be added into the list of dominated uses + /// \param Use the use to be added into the list of dominated uses /// \param InsertPts existing insertion points /// \pre NewPt and all instruction in InsertPts belong to the same function /// \pre isDominated returns false for the exact same parameters. /// \return true if it exists an insertion point in InsertPts that could /// have been merged with NewPt in a common dominator, /// false otherwise - bool tryAndMerge(Instruction *NewPt, Value::user_iterator &UseIt, - InsertionPoints &InsertPts); + bool tryAndMerge(Instruction *NewPt, Use &Use, InsertionPoints &InsertPts); /// Compute the minimal insertion points to dominates all the interesting /// uses of value. @@ -182,21 +181,19 @@ private: bool promoteConstant(Constant *Cst); /// Transfer the list of dominated uses of IPI to NewPt in InsertPts. - /// Append UseIt to this list and delete the entry of IPI in InsertPts. - static void appendAndTransferDominatedUses(Instruction *NewPt, - Value::user_iterator &UseIt, + /// Append Use to this list and delete the entry of IPI in InsertPts. + static void appendAndTransferDominatedUses(Instruction *NewPt, Use &Use, InsertionPoints::iterator &IPI, InsertionPoints &InsertPts) { // Record the dominated use. - IPI->second.push_back(UseIt); + IPI->second.push_back(&Use); // Transfer the dominated uses of IPI to NewPt // Inserting into the DenseMap may invalidate existing iterator. // Keep a copy of the key to find the iterator to erase. Instruction *OldInstr = IPI->first; - InsertPts.insert(InsertionPoints::value_type(NewPt, IPI->second)); + InsertPts[NewPt] = std::move(IPI->second); // Erase IPI. - IPI = InsertPts.find(OldInstr); - InsertPts.erase(IPI); + InsertPts.erase(OldInstr); } }; } // end anonymous namespace @@ -328,23 +325,18 @@ static bool shouldConvert(const Constant *Cst) { return isConstantUsingVectorTy(Cst->getType()); } -Instruction * -AArch64PromoteConstant::findInsertionPoint(Value::user_iterator &Use) { +Instruction *AArch64PromoteConstant::findInsertionPoint(Use &Use) { + Instruction *User = cast<Instruction>(Use.getUser()); + // If this user is a phi, the insertion point is in the related // incoming basic block. 
- PHINode *PhiInst = dyn_cast<PHINode>(*Use); - Instruction *InsertionPoint; - if (PhiInst) - InsertionPoint = - PhiInst->getIncomingBlock(Use.getOperandNo())->getTerminator(); - else - InsertionPoint = dyn_cast<Instruction>(*Use); - assert(InsertionPoint && "User is not an instruction!"); - return InsertionPoint; + if (PHINode *PhiInst = dyn_cast<PHINode>(User)) + return PhiInst->getIncomingBlock(Use.getOperandNo())->getTerminator(); + + return User; } -bool AArch64PromoteConstant::isDominated(Instruction *NewPt, - Value::user_iterator &UseIt, +bool AArch64PromoteConstant::isDominated(Instruction *NewPt, Use &Use, InsertionPoints &InsertPts) { DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>( @@ -363,15 +355,14 @@ bool AArch64PromoteConstant::isDominated(Instruction *NewPt, DEBUG(dbgs() << "Insertion point dominated by:\n"); DEBUG(IPI.first->print(dbgs())); DEBUG(dbgs() << '\n'); - IPI.second.push_back(UseIt); + IPI.second.push_back(&Use); return true; } } return false; } -bool AArch64PromoteConstant::tryAndMerge(Instruction *NewPt, - Value::user_iterator &UseIt, +bool AArch64PromoteConstant::tryAndMerge(Instruction *NewPt, Use &Use, InsertionPoints &InsertPts) { DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>( *NewPt->getParent()->getParent()).getDomTree(); @@ -391,7 +382,7 @@ bool AArch64PromoteConstant::tryAndMerge(Instruction *NewPt, DEBUG(dbgs() << "Merge insertion point with:\n"); DEBUG(IPI->first->print(dbgs())); DEBUG(dbgs() << "\nat considered insertion point.\n"); - appendAndTransferDominatedUses(NewPt, UseIt, IPI, InsertPts); + appendAndTransferDominatedUses(NewPt, Use, IPI, InsertPts); return true; } @@ -415,7 +406,7 @@ bool AArch64PromoteConstant::tryAndMerge(Instruction *NewPt, DEBUG(dbgs() << '\n'); DEBUG(NewPt->print(dbgs())); DEBUG(dbgs() << '\n'); - appendAndTransferDominatedUses(NewPt, UseIt, IPI, InsertPts); + appendAndTransferDominatedUses(NewPt, Use, IPI, InsertPts); return true; } return false; @@ -424,22 +415,22 @@ bool AArch64PromoteConstant::tryAndMerge(Instruction *NewPt, void AArch64PromoteConstant::computeInsertionPoints( Constant *Val, InsertionPointsPerFunc &InsPtsPerFunc) { DEBUG(dbgs() << "** Compute insertion points **\n"); - for (Value::user_iterator UseIt = Val->user_begin(), - EndUseIt = Val->user_end(); - UseIt != EndUseIt; ++UseIt) { + for (Use &Use : Val->uses()) { + Instruction *User = dyn_cast<Instruction>(Use.getUser()); + // If the user is not an Instruction, we cannot modify it. - if (!isa<Instruction>(*UseIt)) + if (!User) continue; // Filter out uses that should not be converted. - if (!shouldConvertUse(Val, cast<Instruction>(*UseIt), UseIt.getOperandNo())) + if (!shouldConvertUse(Val, User, Use.getOperandNo())) continue; - DEBUG(dbgs() << "Considered use, opidx " << UseIt.getOperandNo() << ":\n"); - DEBUG((*UseIt)->print(dbgs())); + DEBUG(dbgs() << "Considered use, opidx " << Use.getOperandNo() << ":\n"); + DEBUG(User->print(dbgs())); DEBUG(dbgs() << '\n'); - Instruction *InsertionPoint = findInsertionPoint(UseIt); + Instruction *InsertionPoint = findInsertionPoint(Use); DEBUG(dbgs() << "Considered insertion point:\n"); DEBUG(InsertionPoint->print(dbgs())); @@ -449,17 +440,17 @@ void AArch64PromoteConstant::computeInsertionPoints( // by another one. 
InsertionPoints &InsertPts = InsPtsPerFunc[InsertionPoint->getParent()->getParent()]; - if (isDominated(InsertionPoint, UseIt, InsertPts)) + if (isDominated(InsertionPoint, Use, InsertPts)) continue; // This insertion point is useful, check if we can merge some insertion // point in a common dominator or if NewPt dominates an existing one. - if (tryAndMerge(InsertionPoint, UseIt, InsertPts)) + if (tryAndMerge(InsertionPoint, Use, InsertPts)) continue; DEBUG(dbgs() << "Keep considered insertion point\n"); // It is definitely useful by its own - InsertPts[InsertionPoint].push_back(UseIt); + InsertPts[InsertionPoint].push_back(&Use); } } @@ -470,41 +461,32 @@ bool AArch64PromoteConstant::insertDefinitions( bool HasChanged = false; // Traverse all insertion points in all the function. - for (InsertionPointsPerFunc::iterator FctToInstPtsIt = InsPtsPerFunc.begin(), - EndIt = InsPtsPerFunc.end(); - FctToInstPtsIt != EndIt; ++FctToInstPtsIt) { - InsertionPoints &InsertPts = FctToInstPtsIt->second; + for (const auto &FctToInstPtsIt : InsPtsPerFunc) { + const InsertionPoints &InsertPts = FctToInstPtsIt.second; // Do more checking for debug purposes. #ifndef NDEBUG DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>( - *FctToInstPtsIt->first).getDomTree(); + *FctToInstPtsIt.first).getDomTree(); #endif - GlobalVariable *PromotedGV; assert(!InsertPts.empty() && "Empty uses does not need a definition"); - Module *M = FctToInstPtsIt->first->getParent(); - DenseMap<Module *, GlobalVariable *>::iterator MapIt = - ModuleToMergedGV.find(M); - if (MapIt == ModuleToMergedGV.end()) { + Module *M = FctToInstPtsIt.first->getParent(); + GlobalVariable *&PromotedGV = ModuleToMergedGV[M]; + if (!PromotedGV) { PromotedGV = new GlobalVariable( *M, Cst->getType(), true, GlobalValue::InternalLinkage, nullptr, "_PromotedConst", nullptr, GlobalVariable::NotThreadLocal); PromotedGV->setInitializer(Cst); - ModuleToMergedGV[M] = PromotedGV; DEBUG(dbgs() << "Global replacement: "); DEBUG(PromotedGV->print(dbgs())); DEBUG(dbgs() << '\n'); ++NumPromoted; HasChanged = true; - } else { - PromotedGV = MapIt->second; } - for (InsertionPoints::iterator IPI = InsertPts.begin(), - EndIPI = InsertPts.end(); - IPI != EndIPI; ++IPI) { + for (const auto &IPI : InsertPts) { // Create the load of the global variable. - IRBuilder<> Builder(IPI->first->getParent(), IPI->first); + IRBuilder<> Builder(IPI.first->getParent(), IPI.first); LoadInst *LoadedCst = Builder.CreateLoad(PromotedGV); DEBUG(dbgs() << "**********\n"); DEBUG(dbgs() << "New def: "); @@ -512,18 +494,15 @@ bool AArch64PromoteConstant::insertDefinitions( DEBUG(dbgs() << '\n'); // Update the dominated uses. - Users &DominatedUsers = IPI->second; - for (Value::user_iterator Use : DominatedUsers) { + for (Use *Use : IPI.second) { #ifndef NDEBUG - assert((DT.dominates(LoadedCst, cast<Instruction>(*Use)) || - (isa<PHINode>(*Use) && - DT.dominates(LoadedCst, findInsertionPoint(Use)))) && + assert(DT.dominates(LoadedCst, findInsertionPoint(*Use)) && "Inserted definition does not dominate all its uses!"); #endif - DEBUG(dbgs() << "Use to update " << Use.getOperandNo() << ":"); - DEBUG(Use->print(dbgs())); + DEBUG(dbgs() << "Use to update " << Use->getOperandNo() << ":"); + DEBUG(Use->getUser()->print(dbgs())); DEBUG(dbgs() << '\n'); - Use->setOperand(Use.getOperandNo(), LoadedCst); + Use->set(LoadedCst); ++NumPromotedUses; } } @@ -556,22 +535,19 @@ bool AArch64PromoteConstant::runOnFunction(Function &F) { // global variable. 
Create as few loads of this variable as possible and // update the uses accordingly. bool LocalChange = false; - SmallSet<Constant *, 8> AlreadyChecked; - - for (auto &MBB : F) { - for (auto &MI : MBB) { - // Traverse the operand, looking for constant vectors. Replace them by a - // load of a global variable of constant vector type. - for (unsigned OpIdx = 0, EndOpIdx = MI.getNumOperands(); - OpIdx != EndOpIdx; ++OpIdx) { - Constant *Cst = dyn_cast<Constant>(MI.getOperand(OpIdx)); - // There is no point in promoting global values as they are already - // global. Do not promote constant expressions either, as they may - // require some code expansion. - if (Cst && !isa<GlobalValue>(Cst) && !isa<ConstantExpr>(Cst) && - AlreadyChecked.insert(Cst).second) - LocalChange |= promoteConstant(Cst); - } + SmallPtrSet<Constant *, 8> AlreadyChecked; + + for (Instruction &I : inst_range(&F)) { + // Traverse the operand, looking for constant vectors. Replace them by a + // load of a global variable of constant vector type. + for (Value *Op : I.operand_values()) { + Constant *Cst = dyn_cast<Constant>(Op); + // There is no point in promoting global values as they are already + // global. Do not promote constant expressions either, as they may + // require some code expansion. + if (Cst && !isa<GlobalValue>(Cst) && !isa<ConstantExpr>(Cst) && + AlreadyChecked.insert(Cst).second) + LocalChange |= promoteConstant(Cst); } } return LocalChange; diff --git a/lib/Target/AArch64/AArch64RegisterInfo.cpp b/lib/Target/AArch64/AArch64RegisterInfo.cpp index d734d43..206cdbb 100644 --- a/lib/Target/AArch64/AArch64RegisterInfo.cpp +++ b/lib/Target/AArch64/AArch64RegisterInfo.cpp @@ -33,6 +33,10 @@ using namespace llvm; #define GET_REGINFO_TARGET_DESC #include "AArch64GenRegisterInfo.inc" +static cl::opt<bool> +ReserveX18("aarch64-reserve-x18", cl::Hidden, + cl::desc("Reserve X18, making it unavailable as GPR")); + AArch64RegisterInfo::AArch64RegisterInfo(const AArch64InstrInfo *tii, const AArch64Subtarget *sti) : AArch64GenRegisterInfo(AArch64::LR), TII(tii), STI(sti) {} @@ -40,6 +44,10 @@ AArch64RegisterInfo::AArch64RegisterInfo(const AArch64InstrInfo *tii, const MCPhysReg * AArch64RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { assert(MF && "Invalid MachineFunction pointer."); + if (MF->getFunction()->getCallingConv() == CallingConv::GHC) + // GHC set of callee saved regs is empty as all those regs are + // used for passing STG regs around + return CSR_AArch64_NoRegs_SaveList; if (MF->getFunction()->getCallingConv() == CallingConv::AnyReg) return CSR_AArch64_AllRegs_SaveList; else @@ -48,6 +56,9 @@ AArch64RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { const uint32_t * AArch64RegisterInfo::getCallPreservedMask(CallingConv::ID CC) const { + if (CC == CallingConv::GHC) + // This is academic because all GHC calls are (supposed to be) tail calls + return CSR_AArch64_NoRegs_RegMask; if (CC == CallingConv::AnyReg) return CSR_AArch64_AllRegs_RegMask; else @@ -63,7 +74,7 @@ const uint32_t *AArch64RegisterInfo::getTLSCallPreservedMask() const { } const uint32_t * -AArch64RegisterInfo::getThisReturnPreservedMask(CallingConv::ID) const { +AArch64RegisterInfo::getThisReturnPreservedMask(CallingConv::ID CC) const { // This should return a register mask that is the same as that returned by // getCallPreservedMask but that additionally preserves the register used for // the first i64 argument (which must also be the register used to return a @@ -71,6 +82,7 @@
AArch64RegisterInfo::getThisReturnPreservedMask(CallingConv::ID) const { // // In case that the calling convention does not use the same register for // both, the function should return NULL (does not currently apply) + assert(CC != CallingConv::GHC && "should not be GHC calling convention."); return CSR_AArch64_AAPCS_ThisReturn_RegMask; } @@ -90,7 +102,7 @@ AArch64RegisterInfo::getReservedRegs(const MachineFunction &MF) const { Reserved.set(AArch64::W29); } - if (STI->isTargetDarwin()) { + if (STI->isTargetDarwin() || ReserveX18) { Reserved.set(AArch64::X18); // Platform register Reserved.set(AArch64::W18); } @@ -117,7 +129,7 @@ bool AArch64RegisterInfo::isReservedReg(const MachineFunction &MF, return true; case AArch64::X18: case AArch64::W18: - return STI->isTargetDarwin(); + return STI->isTargetDarwin() || ReserveX18; case AArch64::FP: case AArch64::W29: return TFI->hasFP(MF) || STI->isTargetDarwin(); @@ -379,7 +391,7 @@ unsigned AArch64RegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, case AArch64::GPR64commonRegClassID: return 32 - 1 // XZR/SP - (TFI->hasFP(MF) || STI->isTargetDarwin()) // FP - - STI->isTargetDarwin() // X18 reserved as platform register + - (STI->isTargetDarwin() || ReserveX18) // X18 reserved as platform register - hasBasePointer(MF); // X19 case AArch64::FPR8RegClassID: case AArch64::FPR16RegClassID: diff --git a/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp b/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp index 0cfd582..b9c5399 100644 --- a/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp +++ b/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp @@ -28,15 +28,14 @@ SDValue AArch64SelectionDAGInfo::EmitTargetCodeForMemset( // Check to see if there is a specialized entry-point for memory zeroing. ConstantSDNode *V = dyn_cast<ConstantSDNode>(Src); ConstantSDNode *SizeValue = dyn_cast<ConstantSDNode>(Size); + const AArch64Subtarget &STI = + DAG.getMachineFunction().getSubtarget<AArch64Subtarget>(); const char *bzeroEntry = - (V && V->isNullValue()) - ? DAG.getTarget().getSubtarget<AArch64Subtarget>().getBZeroEntry() - : nullptr; + (V && V->isNullValue()) ? STI.getBZeroEntry() : nullptr; // For small size (< 256), it is not beneficial to use bzero // instead of memset. 
if (bzeroEntry && (!SizeValue || SizeValue->getZExtValue() > 256)) { - const AArch64TargetLowering &TLI = - *DAG.getTarget().getSubtarget<AArch64Subtarget>().getTargetLowering(); + const AArch64TargetLowering &TLI = *STI.getTargetLowering(); EVT IntPtr = TLI.getPointerTy(); Type *IntPtrTy = getDataLayout()->getIntPtrType(*DAG.getContext()); diff --git a/lib/Target/AArch64/AArch64StorePairSuppress.cpp b/lib/Target/AArch64/AArch64StorePairSuppress.cpp index 0c36e8f..85b44a2 100644 --- a/lib/Target/AArch64/AArch64StorePairSuppress.cpp +++ b/lib/Target/AArch64/AArch64StorePairSuppress.cpp @@ -30,7 +30,6 @@ class AArch64StorePairSuppress : public MachineFunctionPass { const AArch64InstrInfo *TII; const TargetRegisterInfo *TRI; const MachineRegisterInfo *MRI; - MachineFunction *MF; TargetSchedModel SchedModel; MachineTraceMetrics *Traces; MachineTraceMetrics::Ensemble *MinInstr; @@ -115,20 +114,16 @@ bool AArch64StorePairSuppress::isNarrowFPStore(const MachineInstr &MI) { } } -bool AArch64StorePairSuppress::runOnMachineFunction(MachineFunction &mf) { - MF = &mf; - TII = - static_cast<const AArch64InstrInfo *>(MF->getSubtarget().getInstrInfo()); - TRI = MF->getSubtarget().getRegisterInfo(); - MRI = &MF->getRegInfo(); - const TargetSubtargetInfo &ST = - MF->getTarget().getSubtarget<TargetSubtargetInfo>(); +bool AArch64StorePairSuppress::runOnMachineFunction(MachineFunction &MF) { + const TargetSubtargetInfo &ST = MF.getSubtarget(); + TII = static_cast<const AArch64InstrInfo *>(ST.getInstrInfo()); + TRI = ST.getRegisterInfo(); + MRI = &MF.getRegInfo(); SchedModel.init(ST.getSchedModel(), &ST, TII); - Traces = &getAnalysis<MachineTraceMetrics>(); MinInstr = nullptr; - DEBUG(dbgs() << "*** " << getPassName() << ": " << MF->getName() << '\n'); + DEBUG(dbgs() << "*** " << getPassName() << ": " << MF.getName() << '\n'); if (!SchedModel.hasInstrSchedModel()) { DEBUG(dbgs() << " Skipping pass: no machine model present.\n"); @@ -139,7 +134,7 @@ bool AArch64StorePairSuppress::runOnMachineFunction(MachineFunction &mf) { // precisely determine whether a store pair can be formed. But we do want to // filter out most situations where we can't form store pairs to avoid // computing trace metrics in those cases. - for (auto &MBB : *MF) { + for (auto &MBB : MF) { bool SuppressSTP = false; unsigned PrevBaseReg = 0; for (auto &MI : MBB) { diff --git a/lib/Target/AArch64/AArch64Subtarget.cpp b/lib/Target/AArch64/AArch64Subtarget.cpp index 47b5d54..c613025 100644 --- a/lib/Target/AArch64/AArch64Subtarget.cpp +++ b/lib/Target/AArch64/AArch64Subtarget.cpp @@ -48,17 +48,10 @@ AArch64Subtarget::AArch64Subtarget(const std::string &TT, const TargetMachine &TM, bool LittleEndian) : AArch64GenSubtargetInfo(TT, CPU, FS), ARMProcFamily(Others), HasFPARMv8(false), HasNEON(false), HasCrypto(false), HasCRC(false), - HasZeroCycleRegMove(false), HasZeroCycleZeroing(false), CPUString(CPU), - TargetTriple(TT), - // This nested ternary is horrible, but DL needs to be properly - // initialized - // before TLInfo is constructed. - DL(isTargetMachO() - ? "e-m:o-i64:64-i128:128-n32:64-S128" - : (LittleEndian ? 
"e-m:e-i64:64-i128:128-n32:64-S128" - : "E-m:e-i64:64-i128:128-n32:64-S128")), - FrameLowering(), InstrInfo(initializeSubtargetDependencies(FS)), - TSInfo(&DL), TLInfo(TM) {} + HasZeroCycleRegMove(false), HasZeroCycleZeroing(false), + IsLittle(LittleEndian), CPUString(CPU), TargetTriple(TT), FrameLowering(), + InstrInfo(initializeSubtargetDependencies(FS)), + TSInfo(TM.getDataLayout()), TLInfo(TM, *this) {} /// ClassifyGlobalReference - Find the target operand flags that describe /// how a global value should be referenced for the current subtarget. diff --git a/lib/Target/AArch64/AArch64Subtarget.h b/lib/Target/AArch64/AArch64Subtarget.h index e2740f1..d418cc5 100644 --- a/lib/Target/AArch64/AArch64Subtarget.h +++ b/lib/Target/AArch64/AArch64Subtarget.h @@ -48,13 +48,14 @@ protected: // HasZeroCycleZeroing - Has zero-cycle zeroing instructions. bool HasZeroCycleZeroing; + bool IsLittle; + /// CPUString - String name of used CPU. std::string CPUString; /// TargetTriple - What processor and OS we're targeting. Triple TargetTriple; - const DataLayout DL; AArch64FrameLowering FrameLowering; AArch64InstrInfo InstrInfo; AArch64SelectionDAGInfo TSInfo; @@ -82,7 +83,6 @@ public: return &TLInfo; } const AArch64InstrInfo *getInstrInfo() const override { return &InstrInfo; } - const DataLayout *getDataLayout() const override { return &DL; } const AArch64RegisterInfo *getRegisterInfo() const override { return &getInstrInfo()->getRegisterInfo(); } @@ -100,7 +100,7 @@ public: bool hasCrypto() const { return HasCrypto; } bool hasCRC() const { return HasCRC; } - bool isLittleEndian() const { return DL.isLittleEndian(); } + bool isLittleEndian() const { return IsLittle; } bool isTargetDarwin() const { return TargetTriple.isOSDarwin(); } bool isTargetIOS() const { return TargetTriple.isiOS(); } diff --git a/lib/Target/AArch64/AArch64TargetMachine.cpp b/lib/Target/AArch64/AArch64TargetMachine.cpp index beed8e0..d73d0b3 100644 --- a/lib/Target/AArch64/AArch64TargetMachine.cpp +++ b/lib/Target/AArch64/AArch64TargetMachine.cpp @@ -13,10 +13,11 @@ #include "AArch64.h" #include "AArch64TargetMachine.h" #include "AArch64TargetObjectFile.h" +#include "AArch64TargetTransformInfo.h" #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/RegAllocRegistry.h" #include "llvm/IR/Function.h" -#include "llvm/PassManager.h" +#include "llvm/IR/LegacyPassManager.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/TargetRegistry.h" #include "llvm/Target/TargetOptions.h" @@ -112,6 +113,13 @@ AArch64TargetMachine::AArch64TargetMachine(const Target &T, StringRef TT, CodeGenOpt::Level OL, bool LittleEndian) : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL), + // This nested ternary is horrible, but DL needs to be properly + // initialized + // before TLInfo is constructed. + DL(Triple(TT).isOSBinFormatMachO() + ? "e-m:o-i64:64-i128:128-n32:64-S128" + : (LittleEndian ? 
"e-m:e-i64:64-i128:128-n32:64-S128" + : "E-m:e-i64:64-i128:128-n32:64-S128")), TLOF(createTLOF(Triple(getTargetTriple()))), Subtarget(TT, CPU, FS, *this, LittleEndian), isLittle(LittleEndian) { initAsmInfo(); @@ -121,11 +129,8 @@ AArch64TargetMachine::~AArch64TargetMachine() {} const AArch64Subtarget * AArch64TargetMachine::getSubtargetImpl(const Function &F) const { - AttributeSet FnAttrs = F.getAttributes(); - Attribute CPUAttr = - FnAttrs.getAttribute(AttributeSet::FunctionIndex, "target-cpu"); - Attribute FSAttr = - FnAttrs.getAttribute(AttributeSet::FunctionIndex, "target-features"); + Attribute CPUAttr = F.getFnAttribute("target-cpu"); + Attribute FSAttr = F.getFnAttribute("target-features"); std::string CPU = !CPUAttr.hasAttribute(Attribute::None) ? CPUAttr.getValueAsString().str() @@ -181,19 +186,17 @@ public: bool addPreISel() override; bool addInstSelector() override; bool addILPOpts() override; - bool addPreRegAlloc() override; - bool addPostRegAlloc() override; - bool addPreSched2() override; - bool addPreEmitPass() override; + void addPreRegAlloc() override; + void addPostRegAlloc() override; + void addPreSched2() override; + void addPreEmitPass() override; }; } // namespace -void AArch64TargetMachine::addAnalysisPasses(PassManagerBase &PM) { - // Add first the target-independent BasicTTI pass, then our AArch64 pass. This - // allows the AArch64 pass to delegate to the target independent layer when - // appropriate. - PM.add(createBasicTargetTransformInfoPass(this)); - PM.add(createAArch64TargetTransformInfoPass(this)); +TargetIRAnalysis AArch64TargetMachine::getTargetIRAnalysis() { + return TargetIRAnalysis([this](Function &F) { + return TargetTransformInfo(AArch64TTIImpl(this, F)); + }); } TargetPassConfig *AArch64TargetMachine::createPassConfig(PassManagerBase &PM) { @@ -233,8 +236,11 @@ bool AArch64PassConfig::addPreISel() { // get a chance to be merged if (TM->getOptLevel() != CodeGenOpt::None && EnablePromoteConstant) addPass(createAArch64PromoteConstantPass()); + // FIXME: On AArch64, this depends on the type. + // Basically, the addressable offsets are up to 4095 * Ty.getSizeInBytes(). + // and the offset has to be a multiple of the related size in bytes. if (TM->getOptLevel() != CodeGenOpt::None) - addPass(createGlobalMergePass(TM)); + addPass(createGlobalMergePass(TM, 4095)); if (TM->getOptLevel() != CodeGenOpt::None) addPass(createAArch64AddressTypePromotionPass()); @@ -246,7 +252,7 @@ bool AArch64PassConfig::addInstSelector() { // For ELF, cleanup any local-dynamic TLS accesses (i.e. combine as many // references to _TLS_MODULE_BASE_ as possible. - if (TM->getSubtarget<AArch64Subtarget>().isTargetELF() && + if (Triple(TM->getTargetTriple()).isOSBinFormatELF() && getOptLevel() != CodeGenOpt::None) addPass(createAArch64CleanupLocalDynamicTLSPass()); @@ -267,7 +273,7 @@ bool AArch64PassConfig::addILPOpts() { return true; } -bool AArch64PassConfig::addPreRegAlloc() { +void AArch64PassConfig::addPreRegAlloc() { // Use AdvSIMD scalar instructions whenever profitable. if (TM->getOptLevel() != CodeGenOpt::None && EnableAdvSIMDScalar) { addPass(createAArch64AdvSIMDScalar()); @@ -275,10 +281,9 @@ bool AArch64PassConfig::addPreRegAlloc() { // be register coaleascer friendly. addPass(&PeepholeOptimizerID); } - return true; } -bool AArch64PassConfig::addPostRegAlloc() { +void AArch64PassConfig::addPostRegAlloc() { // Change dead register definitions to refer to the zero register. 
if (TM->getOptLevel() != CodeGenOpt::None && EnableDeadRegisterElimination) addPass(createAArch64DeadRegisterDefinitions()); @@ -288,26 +293,23 @@ bool AArch64PassConfig::addPostRegAlloc() { usingDefaultRegAlloc()) // Improve performance for some FP/SIMD code for A57. addPass(createAArch64A57FPLoadBalancing()); - return true; } -bool AArch64PassConfig::addPreSched2() { +void AArch64PassConfig::addPreSched2() { // Expand some pseudo instructions to allow proper scheduling. addPass(createAArch64ExpandPseudoPass()); // Use load/store pair instructions when possible. if (TM->getOptLevel() != CodeGenOpt::None && EnableLoadStoreOpt) addPass(createAArch64LoadStoreOptimizationPass()); - return true; } -bool AArch64PassConfig::addPreEmitPass() { +void AArch64PassConfig::addPreEmitPass() { if (EnableA53Fix835769) addPass(createAArch64A53Fix835769()); // Relax conditional branch instructions if they're otherwise out of // range of their destination. addPass(createAArch64BranchRelaxation()); if (TM->getOptLevel() != CodeGenOpt::None && EnableCollectLOH && - TM->getSubtarget<AArch64Subtarget>().isTargetMachO()) + Triple(TM->getTargetTriple()).isOSBinFormatMachO()) addPass(createAArch64CollectLOHPass()); - return true; } diff --git a/lib/Target/AArch64/AArch64TargetMachine.h b/lib/Target/AArch64/AArch64TargetMachine.h index 75c65c5..7143adf 100644 --- a/lib/Target/AArch64/AArch64TargetMachine.h +++ b/lib/Target/AArch64/AArch64TargetMachine.h @@ -23,6 +23,7 @@ namespace llvm { class AArch64TargetMachine : public LLVMTargetMachine { protected: + const DataLayout DL; std::unique_ptr<TargetLoweringObjectFile> TLOF; AArch64Subtarget Subtarget; mutable StringMap<std::unique_ptr<AArch64Subtarget>> SubtargetMap; @@ -35,6 +36,7 @@ public: ~AArch64TargetMachine() override; + const DataLayout *getDataLayout() const override { return &DL; } const AArch64Subtarget *getSubtargetImpl() const override { return &Subtarget; } @@ -43,8 +45,8 @@ public: // Pass Pipeline Configuration TargetPassConfig *createPassConfig(PassManagerBase &PM) override; - /// \brief Register AArch64 analysis passes with a pass manager. - void addAnalysisPasses(PassManagerBase &PM) override; + /// \brief Get the TargetIRAnalysis for this target. + TargetIRAnalysis getTargetIRAnalysis() override; TargetLoweringObjectFile* getObjFileLowering() const override { return TLOF.get(); diff --git a/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index b1a2914..0646d85 100644 --- a/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -1,4 +1,4 @@ -//===-- AArch64TargetTransformInfo.cpp - AArch64 specific TTI pass --------===// +//===-- AArch64TargetTransformInfo.cpp - AArch64 specific TTI -------------===// // // The LLVM Compiler Infrastructure // @@ -6,18 +6,11 @@ // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// -/// \file -/// This file implements a TargetTransformInfo analysis pass specific to the -/// AArch64 target machine. It uses the target's detailed information to provide -/// more precise answers to certain TTI queries, while letting the target -/// independent and default TTI implementations handle the rest. 
-/// -//===----------------------------------------------------------------------===// -#include "AArch64.h" -#include "AArch64TargetMachine.h" +#include "AArch64TargetTransformInfo.h" #include "MCTargetDesc/AArch64AddressingModes.h" #include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/CodeGen/BasicTTIImpl.h" #include "llvm/Support/Debug.h" #include "llvm/Target/CostTable.h" #include "llvm/Target/TargetLowering.h" @@ -26,130 +19,10 @@ using namespace llvm; #define DEBUG_TYPE "aarch64tti" -// Declare the pass initialization routine locally as target-specific passes -// don't have a target-wide initialization entry point, and so we rely on the -// pass constructor initialization. -namespace llvm { -void initializeAArch64TTIPass(PassRegistry &); -} - -namespace { - -class AArch64TTI final : public ImmutablePass, public TargetTransformInfo { - const AArch64TargetMachine *TM; - const AArch64Subtarget *ST; - const AArch64TargetLowering *TLI; - - /// Estimate the overhead of scalarizing an instruction. Insert and Extract - /// are set if the result needs to be inserted and/or extracted from vectors. - unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) const; - -public: - AArch64TTI() : ImmutablePass(ID), TM(nullptr), ST(nullptr), TLI(nullptr) { - llvm_unreachable("This pass cannot be directly constructed"); - } - - AArch64TTI(const AArch64TargetMachine *TM) - : ImmutablePass(ID), TM(TM), ST(TM->getSubtargetImpl()), - TLI(TM->getSubtargetImpl()->getTargetLowering()) { - initializeAArch64TTIPass(*PassRegistry::getPassRegistry()); - } - - void initializePass() override { pushTTIStack(this); } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - TargetTransformInfo::getAnalysisUsage(AU); - } - - /// Pass identification. - static char ID; - - /// Provide necessary pointer adjustments for the two base classes. 
- void *getAdjustedAnalysisPointer(const void *ID) override { - if (ID == &TargetTransformInfo::ID) - return (TargetTransformInfo *)this; - return this; - } - - /// \name Scalar TTI Implementations - /// @{ - unsigned getIntImmCost(int64_t Val) const; - unsigned getIntImmCost(const APInt &Imm, Type *Ty) const override; - unsigned getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm, - Type *Ty) const override; - unsigned getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, - Type *Ty) const override; - PopcntSupportKind getPopcntSupport(unsigned TyWidth) const override; - - /// @} - - /// \name Vector TTI Implementations - /// @{ - - unsigned getNumberOfRegisters(bool Vector) const override { - if (Vector) { - if (ST->hasNEON()) - return 32; - return 0; - } - return 31; - } - - unsigned getRegisterBitWidth(bool Vector) const override { - if (Vector) { - if (ST->hasNEON()) - return 128; - return 0; - } - return 64; - } - - unsigned getMaxInterleaveFactor() const override; - - unsigned getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) const - override; - - unsigned getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) const - override; - - unsigned getArithmeticInstrCost( - unsigned Opcode, Type *Ty, OperandValueKind Opd1Info = OK_AnyValue, - OperandValueKind Opd2Info = OK_AnyValue, - OperandValueProperties Opd1PropInfo = OP_None, - OperandValueProperties Opd2PropInfo = OP_None) const override; - - unsigned getAddressComputationCost(Type *Ty, bool IsComplex) const override; - - unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy) const - override; - - unsigned getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, - unsigned AddressSpace) const override; - - unsigned getCostOfKeepingLiveOverCall(ArrayRef<Type*> Tys) const override; - - void getUnrollingPreferences(const Function *F, Loop *L, - UnrollingPreferences &UP) const override; - - - /// @} -}; - -} // end anonymous namespace - -INITIALIZE_AG_PASS(AArch64TTI, TargetTransformInfo, "aarch64tti", - "AArch64 Target Transform Info", true, true, false) -char AArch64TTI::ID = 0; - -ImmutablePass * -llvm::createAArch64TargetTransformInfoPass(const AArch64TargetMachine *TM) { - return new AArch64TTI(TM); -} - /// \brief Calculate the cost of materializing a 64-bit value. This helper /// method might only calculate a fraction of a larger immediate. Therefore it /// is valid to return a cost of ZERO. -unsigned AArch64TTI::getIntImmCost(int64_t Val) const { +unsigned AArch64TTIImpl::getIntImmCost(int64_t Val) { // Check if the immediate can be encoded within an instruction. if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, 64)) return 0; @@ -163,7 +36,7 @@ unsigned AArch64TTI::getIntImmCost(int64_t Val) const { } /// \brief Calculate the cost of materializing the given constant. -unsigned AArch64TTI::getIntImmCost(const APInt &Imm, Type *Ty) const { +unsigned AArch64TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) { assert(Ty->isIntegerTy()); unsigned BitSize = Ty->getPrimitiveSizeInBits(); @@ -187,25 +60,25 @@ unsigned AArch64TTI::getIntImmCost(const APInt &Imm, Type *Ty) const { return std::max(1U, Cost); } -unsigned AArch64TTI::getIntImmCost(unsigned Opcode, unsigned Idx, - const APInt &Imm, Type *Ty) const { +unsigned AArch64TTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx, + const APInt &Imm, Type *Ty) { assert(Ty->isIntegerTy()); unsigned BitSize = Ty->getPrimitiveSizeInBits(); // There is no cost model for constants with a bit size of 0. 
Return TCC_Free // here, so that constant hoisting will ignore this constant. if (BitSize == 0) - return TCC_Free; + return TTI::TCC_Free; unsigned ImmIdx = ~0U; switch (Opcode) { default: - return TCC_Free; + return TTI::TCC_Free; case Instruction::GetElementPtr: // Always hoist the base address of a GetElementPtr. if (Idx == 0) - return 2 * TCC_Basic; - return TCC_Free; + return 2 * TTI::TCC_Basic; + return TTI::TCC_Free; case Instruction::Store: ImmIdx = 0; break; @@ -227,7 +100,7 @@ unsigned AArch64TTI::getIntImmCost(unsigned Opcode, unsigned Idx, case Instruction::LShr: case Instruction::AShr: if (Idx == 1) - return TCC_Free; + return TTI::TCC_Free; break; case Instruction::Trunc: case Instruction::ZExt: @@ -245,26 +118,27 @@ unsigned AArch64TTI::getIntImmCost(unsigned Opcode, unsigned Idx, if (Idx == ImmIdx) { unsigned NumConstants = (BitSize + 63) / 64; - unsigned Cost = AArch64TTI::getIntImmCost(Imm, Ty); - return (Cost <= NumConstants * TCC_Basic) - ? static_cast<unsigned>(TCC_Free) : Cost; + unsigned Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty); + return (Cost <= NumConstants * TTI::TCC_Basic) + ? static_cast<unsigned>(TTI::TCC_Free) + : Cost; } - return AArch64TTI::getIntImmCost(Imm, Ty); + return AArch64TTIImpl::getIntImmCost(Imm, Ty); } -unsigned AArch64TTI::getIntImmCost(Intrinsic::ID IID, unsigned Idx, - const APInt &Imm, Type *Ty) const { +unsigned AArch64TTIImpl::getIntImmCost(Intrinsic::ID IID, unsigned Idx, + const APInt &Imm, Type *Ty) { assert(Ty->isIntegerTy()); unsigned BitSize = Ty->getPrimitiveSizeInBits(); // There is no cost model for constants with a bit size of 0. Return TCC_Free // here, so that constant hoisting will ignore this constant. if (BitSize == 0) - return TCC_Free; + return TTI::TCC_Free; switch (IID) { default: - return TCC_Free; + return TTI::TCC_Free; case Intrinsic::sadd_with_overflow: case Intrinsic::uadd_with_overflow: case Intrinsic::ssub_with_overflow: @@ -273,35 +147,36 @@ unsigned AArch64TTI::getIntImmCost(Intrinsic::ID IID, unsigned Idx, case Intrinsic::umul_with_overflow: if (Idx == 1) { unsigned NumConstants = (BitSize + 63) / 64; - unsigned Cost = AArch64TTI::getIntImmCost(Imm, Ty); - return (Cost <= NumConstants * TCC_Basic) - ? static_cast<unsigned>(TCC_Free) : Cost; + unsigned Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty); + return (Cost <= NumConstants * TTI::TCC_Basic) + ? static_cast<unsigned>(TTI::TCC_Free) + : Cost; } break; case Intrinsic::experimental_stackmap: if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue()))) - return TCC_Free; + return TTI::TCC_Free; break; case Intrinsic::experimental_patchpoint_void: case Intrinsic::experimental_patchpoint_i64: if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue()))) - return TCC_Free; + return TTI::TCC_Free; break; } - return AArch64TTI::getIntImmCost(Imm, Ty); + return AArch64TTIImpl::getIntImmCost(Imm, Ty); } -AArch64TTI::PopcntSupportKind -AArch64TTI::getPopcntSupport(unsigned TyWidth) const { +TargetTransformInfo::PopcntSupportKind +AArch64TTIImpl::getPopcntSupport(unsigned TyWidth) { assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2"); if (TyWidth == 32 || TyWidth == 64) - return PSK_FastHardware; + return TTI::PSK_FastHardware; // TODO: AArch64TargetLowering::LowerCTPOP() supports 128bit popcount. 
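// (Illustration of why 32/64-bit widths report PSK_FastHardware: CTPOP on
// AArch64 lowers to a short NEON sequence, roughly
//   fmov   d0, x0
//   cnt    v0.8b, v0.8b
//   uaddlv h0, v0.8b
//   fmov   w0, s0
// rather than a bit-twiddling software expansion.)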
- return PSK_Software; + return TTI::PSK_Software; } -unsigned AArch64TTI::getCastInstrCost(unsigned Opcode, Type *Dst, - Type *Src) const { +unsigned AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, + Type *Src) { int ISD = TLI->InstructionOpcodeToISD(Opcode); assert(ISD && "Invalid opcode"); @@ -309,7 +184,7 @@ unsigned AArch64TTI::getCastInstrCost(unsigned Opcode, Type *Dst, EVT DstTy = TLI->getValueType(Dst); if (!SrcTy.isSimple() || !DstTy.isSimple()) - return TargetTransformInfo::getCastInstrCost(Opcode, Dst, Src); + return BaseT::getCastInstrCost(Opcode, Dst, Src); static const TypeConversionCostTblEntry<MVT> ConversionTbl[] = { // LowerVectorINT_TO_FP: @@ -380,11 +255,11 @@ unsigned AArch64TTI::getCastInstrCost(unsigned Opcode, Type *Dst, if (Idx != -1) return ConversionTbl[Idx].Cost; - return TargetTransformInfo::getCastInstrCost(Opcode, Dst, Src); + return BaseT::getCastInstrCost(Opcode, Dst, Src); } -unsigned AArch64TTI::getVectorInstrCost(unsigned Opcode, Type *Val, - unsigned Index) const { +unsigned AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, + unsigned Index) { assert(Val->isVectorTy() && "This must be a vector type"); if (Index != -1U) { @@ -408,10 +283,10 @@ unsigned AArch64TTI::getVectorInstrCost(unsigned Opcode, Type *Val, return 2; } -unsigned AArch64TTI::getArithmeticInstrCost( - unsigned Opcode, Type *Ty, OperandValueKind Opd1Info, - OperandValueKind Opd2Info, OperandValueProperties Opd1PropInfo, - OperandValueProperties Opd2PropInfo) const { +unsigned AArch64TTIImpl::getArithmeticInstrCost( + unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info, + TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo, + TTI::OperandValueProperties Opd2PropInfo) { // Legalize the type. std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Ty); @@ -442,8 +317,8 @@ unsigned AArch64TTI::getArithmeticInstrCost( switch (ISD) { default: - return TargetTransformInfo::getArithmeticInstrCost( - Opcode, Ty, Opd1Info, Opd2Info, Opd1PropInfo, Opd2PropInfo); + return BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info, + Opd1PropInfo, Opd2PropInfo); case ISD::ADD: case ISD::MUL: case ISD::XOR: @@ -455,7 +330,7 @@ unsigned AArch64TTI::getArithmeticInstrCost( } } -unsigned AArch64TTI::getAddressComputationCost(Type *Ty, bool IsComplex) const { +unsigned AArch64TTIImpl::getAddressComputationCost(Type *Ty, bool IsComplex) { // Address computations in vectorized code with non-consecutive addresses will // likely result in more instructions compared to scalar code where the // computation can more often be merged into the index mode. The resulting @@ -470,8 +345,8 @@ unsigned AArch64TTI::getAddressComputationCost(Type *Ty, bool IsComplex) const { return 1; } -unsigned AArch64TTI::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, - Type *CondTy) const { +unsigned AArch64TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, + Type *CondTy) { int ISD = TLI->InstructionOpcodeToISD(Opcode); // We don't lower vector selects well that are wider than the register width. 
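With the ImmutablePass gone, the cost hooks above are reached through the analysis wrapper introduced by this same migration. A minimal sketch of how a legacy-pass-manager client would now query the AArch64 cost model (the surrounding pass boilerplate and the F/VecTy values are assumed, not shown in the patch):

  // Sketch: querying the relocated cost hooks; dispatch lands in AArch64TTIImpl.
  const TargetTransformInfo &TTI =
      getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
  unsigned MulCost = TTI.getArithmeticInstrCost(Instruction::Mul, VecTy);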
@@ -498,12 +373,12 @@ unsigned AArch64TTI::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, return VectorSelectTbl[Idx].Cost; } } - return TargetTransformInfo::getCmpSelInstrCost(Opcode, ValTy, CondTy); + return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy); } -unsigned AArch64TTI::getMemoryOpCost(unsigned Opcode, Type *Src, - unsigned Alignment, - unsigned AddressSpace) const { +unsigned AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, + unsigned Alignment, + unsigned AddressSpace) { std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Src); if (Opcode == Instruction::Store && Src->isVectorTy() && Alignment != 16 && @@ -531,7 +406,7 @@ unsigned AArch64TTI::getMemoryOpCost(unsigned Opcode, Type *Src, return LT.first; } -unsigned AArch64TTI::getCostOfKeepingLiveOverCall(ArrayRef<Type*> Tys) const { +unsigned AArch64TTIImpl::getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) { unsigned Cost = 0; for (auto *I : Tys) { if (!I->isVectorTy()) @@ -543,14 +418,94 @@ unsigned AArch64TTI::getCostOfKeepingLiveOverCall(ArrayRef<Type*> Tys) const { return Cost; } -unsigned AArch64TTI::getMaxInterleaveFactor() const { +unsigned AArch64TTIImpl::getMaxInterleaveFactor() { if (ST->isCortexA57()) return 4; return 2; } -void AArch64TTI::getUnrollingPreferences(const Function *F, Loop *L, - UnrollingPreferences &UP) const { +void AArch64TTIImpl::getUnrollingPreferences(Loop *L, + TTI::UnrollingPreferences &UP) { // Disable partial & runtime unrolling on -Os. UP.PartialOptSizeThreshold = 0; } + +Value *AArch64TTIImpl::getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst, + Type *ExpectedType) { + switch (Inst->getIntrinsicID()) { + default: + return nullptr; + case Intrinsic::aarch64_neon_st2: + case Intrinsic::aarch64_neon_st3: + case Intrinsic::aarch64_neon_st4: { + // Create a struct type + StructType *ST = dyn_cast<StructType>(ExpectedType); + if (!ST) + return nullptr; + unsigned NumElts = Inst->getNumArgOperands() - 1; + if (ST->getNumElements() != NumElts) + return nullptr; + for (unsigned i = 0, e = NumElts; i != e; ++i) { + if (Inst->getArgOperand(i)->getType() != ST->getElementType(i)) + return nullptr; + } + Value *Res = UndefValue::get(ExpectedType); + IRBuilder<> Builder(Inst); + for (unsigned i = 0, e = NumElts; i != e; ++i) { + Value *L = Inst->getArgOperand(i); + Res = Builder.CreateInsertValue(Res, L, i); + } + return Res; + } + case Intrinsic::aarch64_neon_ld2: + case Intrinsic::aarch64_neon_ld3: + case Intrinsic::aarch64_neon_ld4: + if (Inst->getType() == ExpectedType) + return Inst; + return nullptr; + } +} + +bool AArch64TTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst, + MemIntrinsicInfo &Info) { + switch (Inst->getIntrinsicID()) { + default: + break; + case Intrinsic::aarch64_neon_ld2: + case Intrinsic::aarch64_neon_ld3: + case Intrinsic::aarch64_neon_ld4: + Info.ReadMem = true; + Info.WriteMem = false; + Info.Vol = false; + Info.NumMemRefs = 1; + Info.PtrVal = Inst->getArgOperand(0); + break; + case Intrinsic::aarch64_neon_st2: + case Intrinsic::aarch64_neon_st3: + case Intrinsic::aarch64_neon_st4: + Info.ReadMem = false; + Info.WriteMem = true; + Info.Vol = false; + Info.NumMemRefs = 1; + Info.PtrVal = Inst->getArgOperand(Inst->getNumArgOperands() - 1); + break; + } + + switch (Inst->getIntrinsicID()) { + default: + return false; + case Intrinsic::aarch64_neon_ld2: + case Intrinsic::aarch64_neon_st2: + Info.MatchingId = VECTOR_LDST_TWO_ELEMENTS; + break; + case Intrinsic::aarch64_neon_ld3: + case Intrinsic::aarch64_neon_st3: + Info.MatchingId = 
VECTOR_LDST_THREE_ELEMENTS; + break; + case Intrinsic::aarch64_neon_ld4: + case Intrinsic::aarch64_neon_st4: + Info.MatchingId = VECTOR_LDST_FOUR_ELEMENTS; + break; + } + return true; +} diff --git a/lib/Target/AArch64/AArch64TargetTransformInfo.h b/lib/Target/AArch64/AArch64TargetTransformInfo.h new file mode 100644 index 0000000..dd3fd1f --- /dev/null +++ b/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -0,0 +1,147 @@ +//===-- AArch64TargetTransformInfo.h - AArch64 specific TTI -----*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file +/// This file provides a TargetTransformInfo::Concept conforming object specific +/// to the AArch64 target machine. It uses the target's detailed information to +/// provide more precise answers to certain TTI queries, while letting the +/// target independent and default TTI implementations handle the rest. +/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64TARGETTRANSFORMINFO_H +#define LLVM_LIB_TARGET_AARCH64_AARCH64TARGETTRANSFORMINFO_H + +#include "AArch64.h" +#include "AArch64TargetMachine.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/CodeGen/BasicTTIImpl.h" +#include "llvm/Target/TargetLowering.h" +#include <algorithm> + +namespace llvm { + +class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> { + typedef BasicTTIImplBase<AArch64TTIImpl> BaseT; + typedef TargetTransformInfo TTI; + friend BaseT; + + const AArch64TargetMachine *TM; + const AArch64Subtarget *ST; + const AArch64TargetLowering *TLI; + + /// Estimate the overhead of scalarizing an instruction. Insert and Extract + /// are set if the result needs to be inserted and/or extracted from vectors. + unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract); + + const AArch64Subtarget *getST() const { return ST; } + const AArch64TargetLowering *getTLI() const { return TLI; } + + enum MemIntrinsicType { + VECTOR_LDST_TWO_ELEMENTS, + VECTOR_LDST_THREE_ELEMENTS, + VECTOR_LDST_FOUR_ELEMENTS + }; + +public: + explicit AArch64TTIImpl(const AArch64TargetMachine *TM, Function &F) + : BaseT(TM), TM(TM), ST(TM->getSubtargetImpl(F)), + TLI(ST->getTargetLowering()) {} + + // Provide value semantics. MSVC requires that we spell all of these out. 
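// (Context: TargetTransformInfo now holds this implementation by value
// inside its type-erased Model, so the type must be copy- and
// move-constructible; some MSVC versions will not synthesize those members
// here, hence the explicitly written operations that follow.)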
+ AArch64TTIImpl(const AArch64TTIImpl &Arg) + : BaseT(static_cast<const BaseT &>(Arg)), TM(Arg.TM), ST(Arg.ST), + TLI(Arg.TLI) {} + AArch64TTIImpl(AArch64TTIImpl &&Arg) + : BaseT(std::move(static_cast<BaseT &>(Arg))), TM(std::move(Arg.TM)), + ST(std::move(Arg.ST)), TLI(std::move(Arg.TLI)) {} + AArch64TTIImpl &operator=(const AArch64TTIImpl &RHS) { + BaseT::operator=(static_cast<const BaseT &>(RHS)); + TM = RHS.TM; + ST = RHS.ST; + TLI = RHS.TLI; + return *this; + } + AArch64TTIImpl &operator=(AArch64TTIImpl &&RHS) { + BaseT::operator=(std::move(static_cast<BaseT &>(RHS))); + TM = std::move(RHS.TM); + ST = std::move(RHS.ST); + TLI = std::move(RHS.TLI); + return *this; + } + + /// \name Scalar TTI Implementations + /// @{ + + using BaseT::getIntImmCost; + unsigned getIntImmCost(int64_t Val); + unsigned getIntImmCost(const APInt &Imm, Type *Ty); + unsigned getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm, + Type *Ty); + unsigned getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, + Type *Ty); + TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth); + + /// @} + + /// \name Vector TTI Implementations + /// @{ + + unsigned getNumberOfRegisters(bool Vector) { + if (Vector) { + if (ST->hasNEON()) + return 32; + return 0; + } + return 31; + } + + unsigned getRegisterBitWidth(bool Vector) { + if (Vector) { + if (ST->hasNEON()) + return 128; + return 0; + } + return 64; + } + + unsigned getMaxInterleaveFactor(); + + unsigned getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src); + + unsigned getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index); + + unsigned getArithmeticInstrCost( + unsigned Opcode, Type *Ty, + TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue, + TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue, + TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None, + TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None); + + unsigned getAddressComputationCost(Type *Ty, bool IsComplex); + + unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy); + + unsigned getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, + unsigned AddressSpace); + + unsigned getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys); + + void getUnrollingPreferences(Loop *L, TTI::UnrollingPreferences &UP); + + Value *getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst, + Type *ExpectedType); + + bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info); + + /// @} +}; + +} // end namespace llvm + +#endif diff --git a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp index 98e0ea8..1960c99 100644 --- a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp +++ b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp @@ -210,9 +210,9 @@ private: struct SysRegOp { const char *Data; unsigned Length; - uint64_t FeatureBits; // We need to pass through information about which - // core we are compiling for so that the SysReg - // Mappers can appropriately conditionalize. 
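// (Net effect of the SysRegOp change: the string-to-encoding lookup now
// happens once, in tryParseSysReg below, and the operand carries the three
// resolved encodings directly; a field equal to -1U means the name is not
// valid for that use, which is exactly what the is*SystemRegister
// predicates test.)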
+ uint32_t MRSReg; + uint32_t MSRReg; + uint32_t PStateField; }; struct SysCRImmOp { @@ -374,11 +374,6 @@ public: return StringRef(SysReg.Data, SysReg.Length); } - uint64_t getSysRegFeatureBits() const { - assert(Kind == k_SysReg && "Invalid access!"); - return SysReg.FeatureBits; - } - unsigned getSysCR() const { assert(Kind == k_SysCR && "Invalid access!"); return SysCRImm.Val; @@ -855,28 +850,17 @@ public: bool isMRSSystemRegister() const { if (!isSysReg()) return false; - bool IsKnownRegister; - auto Mapper = AArch64SysReg::MRSMapper(getSysRegFeatureBits()); - Mapper.fromString(getSysReg(), IsKnownRegister); - - return IsKnownRegister; + return SysReg.MRSReg != -1U; } bool isMSRSystemRegister() const { if (!isSysReg()) return false; - bool IsKnownRegister; - auto Mapper = AArch64SysReg::MSRMapper(getSysRegFeatureBits()); - Mapper.fromString(getSysReg(), IsKnownRegister); - - return IsKnownRegister; + return SysReg.MSRReg != -1U; } bool isSystemPStateField() const { if (!isSysReg()) return false; - bool IsKnownRegister; - AArch64PState::PStateMapper().fromString(getSysReg(), IsKnownRegister); - - return IsKnownRegister; + return SysReg.PStateField != -1U; } bool isReg() const override { return Kind == k_Register && !Reg.isVector; } bool isVectorReg() const { return Kind == k_Register && Reg.isVector; } @@ -1454,31 +1438,19 @@ public: void addMRSSystemRegisterOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); - bool Valid; - auto Mapper = AArch64SysReg::MRSMapper(getSysRegFeatureBits()); - uint32_t Bits = Mapper.fromString(getSysReg(), Valid); - - Inst.addOperand(MCOperand::CreateImm(Bits)); + Inst.addOperand(MCOperand::CreateImm(SysReg.MRSReg)); } void addMSRSystemRegisterOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); - bool Valid; - auto Mapper = AArch64SysReg::MSRMapper(getSysRegFeatureBits()); - uint32_t Bits = Mapper.fromString(getSysReg(), Valid); - - Inst.addOperand(MCOperand::CreateImm(Bits)); + Inst.addOperand(MCOperand::CreateImm(SysReg.MSRReg)); } void addSystemPStateFieldOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); - bool Valid; - uint32_t Bits = - AArch64PState::PStateMapper().fromString(getSysReg(), Valid); - - Inst.addOperand(MCOperand::CreateImm(Bits)); + Inst.addOperand(MCOperand::CreateImm(SysReg.PStateField)); } void addSysCROperands(MCInst &Inst, unsigned N) const { @@ -1645,12 +1617,17 @@ public: return Op; } - static std::unique_ptr<AArch64Operand> - CreateSysReg(StringRef Str, SMLoc S, uint64_t FeatureBits, MCContext &Ctx) { + static std::unique_ptr<AArch64Operand> CreateSysReg(StringRef Str, SMLoc S, + uint32_t MRSReg, + uint32_t MSRReg, + uint32_t PStateField, + MCContext &Ctx) { auto Op = make_unique<AArch64Operand>(k_SysReg, Ctx); Op->SysReg.Data = Str.data(); Op->SysReg.Length = Str.size(); - Op->SysReg.FeatureBits = FeatureBits; + Op->SysReg.MRSReg = MRSReg; + Op->SysReg.MSRReg = MSRReg; + Op->SysReg.PStateField = PStateField; Op->StartLoc = S; Op->EndLoc = S; return Op; @@ -2643,8 +2620,24 @@ AArch64AsmParser::tryParseSysReg(OperandVector &Operands) { if (Tok.isNot(AsmToken::Identifier)) return MatchOperand_NoMatch; - Operands.push_back(AArch64Operand::CreateSysReg(Tok.getString(), getLoc(), - STI.getFeatureBits(), getContext())); + bool IsKnown; + auto MRSMapper = AArch64SysReg::MRSMapper(STI.getFeatureBits()); + uint32_t MRSReg = MRSMapper.fromString(Tok.getString(), IsKnown); + assert(IsKnown == (MRSReg != -1U) && + 
"register should be -1 if and only if it's unknown"); + + auto MSRMapper = AArch64SysReg::MSRMapper(STI.getFeatureBits()); + uint32_t MSRReg = MSRMapper.fromString(Tok.getString(), IsKnown); + assert(IsKnown == (MSRReg != -1U) && + "register should be -1 if and only if it's unknown"); + + uint32_t PStateField = + AArch64PState::PStateMapper().fromString(Tok.getString(), IsKnown); + assert(IsKnown == (PStateField != -1U) && + "register should be -1 if and only if it's unknown"); + + Operands.push_back(AArch64Operand::CreateSysReg( + Tok.getString(), getLoc(), MRSReg, MSRReg, PStateField, getContext())); Parser.Lex(); // Eat identifier return MatchOperand_Success; @@ -3927,7 +3920,6 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, } llvm_unreachable("Implement any new match types added!"); - return true; } /// ParseDirective parses the arm specific directives @@ -4140,7 +4132,7 @@ bool AArch64AsmParser::parseDirectiveReq(StringRef Name, SMLoc L) { Parser.Lex(); // Consume the EndOfStatement auto pair = std::make_pair(IsVector, RegNum); - if (!RegisterReqs.insert(std::make_pair(Name, pair)).second) + if (RegisterReqs.insert(std::make_pair(Name, pair)).first->second != pair) Warning(L, "ignoring redefinition of register alias '" + Name + "'"); return true; diff --git a/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp b/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp index 878e29c..fb25089 100644 --- a/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp +++ b/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp @@ -221,13 +221,11 @@ DecodeStatus AArch64Disassembler::getInstruction(MCInst &MI, uint64_t &Size, static MCSymbolizer * createAArch64ExternalSymbolizer(StringRef TT, LLVMOpInfoCallback GetOpInfo, - LLVMSymbolLookupCallback SymbolLookUp, - void *DisInfo, MCContext *Ctx, - MCRelocationInfo *RelInfo) { - return new llvm::AArch64ExternalSymbolizer( - *Ctx, - std::unique_ptr<MCRelocationInfo>(RelInfo), - GetOpInfo, SymbolLookUp, DisInfo); + LLVMSymbolLookupCallback SymbolLookUp, + void *DisInfo, MCContext *Ctx, + std::unique_ptr<MCRelocationInfo> &&RelInfo) { + return new llvm::AArch64ExternalSymbolizer(*Ctx, move(RelInfo), GetOpInfo, + SymbolLookUp, DisInfo); } extern "C" void LLVMInitializeAArch64Disassembler() { diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h b/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h index 1dc506a..ed24343 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h +++ b/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h @@ -51,7 +51,7 @@ enum ShiftExtendType { /// getShiftName - Get the string encoding for the shift type. 
static inline const char *getShiftExtendName(AArch64_AM::ShiftExtendType ST) { switch (ST) { - default: assert(false && "unhandled shift type!"); + default: llvm_unreachable("unhandled shift type!"); case AArch64_AM::LSL: return "lsl"; case AArch64_AM::LSR: return "lsr"; case AArch64_AM::ASR: return "asr"; @@ -236,21 +236,22 @@ static inline bool processLogicalImmediate(uint64_t Imm, unsigned RegSize, if (isShiftedMask_64(Imm)) { I = countTrailingZeros(Imm); - CTO = CountTrailingOnes_64(Imm >> I); + assert(I < 64 && "undefined behavior"); + CTO = countTrailingOnes(Imm >> I); } else { Imm |= ~Mask; if (!isShiftedMask_64(~Imm)) return false; - unsigned CLO = CountLeadingOnes_64(Imm); + unsigned CLO = countLeadingOnes(Imm); I = 64 - CLO; - CTO = CLO + CountTrailingOnes_64(Imm) - (64 - Size); + CTO = CLO + countTrailingOnes(Imm) - (64 - Size); } // Encode in Immr the number of RORs it would take to get *from* 0^m 1^n - // to our target value, where i is the number of RORs to go the opposite + // to our target value, where I is the number of RORs to go the opposite // direction. - assert(Size > I && "I should be smaller than element Size"); + assert(Size > I && "I should be smaller than element size"); unsigned Immr = (Size - I) & (Size - 1); // If size has a 1 in the n'th bit, create a value that has zeroes in diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp index 0bc2f77..423da65 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp +++ b/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp @@ -13,8 +13,8 @@ #include "llvm/ADT/Triple.h" #include "llvm/MC/MCAsmBackend.h" #include "llvm/MC/MCDirectives.h" -#include "llvm/MC/MCFixupKindInfo.h" #include "llvm/MC/MCELFObjectWriter.h" +#include "llvm/MC/MCFixupKindInfo.h" #include "llvm/MC/MCObjectWriter.h" #include "llvm/MC/MCSectionELF.h" #include "llvm/MC/MCSectionMachO.h" @@ -132,7 +132,7 @@ static uint64_t adjustFixupValue(unsigned Kind, uint64_t Value) { int64_t SignedValue = static_cast<int64_t>(Value); switch (Kind) { default: - assert(false && "Unknown fixup kind!"); + llvm_unreachable("Unknown fixup kind!"); case AArch64::fixup_aarch64_pcrel_adr_imm21: if (SignedValue > 2097151 || SignedValue < -2097152) report_fatal_error("fixup value out of range"); @@ -239,7 +239,7 @@ bool AArch64AsmBackend::fixupNeedsRelaxation(const MCFixup &Fixup, void AArch64AsmBackend::relaxInstruction(const MCInst &Inst, MCInst &Res) const { - assert(false && "AArch64AsmBackend::relaxInstruction() unimplemented"); + llvm_unreachable("AArch64AsmBackend::relaxInstruction() unimplemented"); } bool AArch64AsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const { @@ -317,42 +317,6 @@ public: MachO::CPU_SUBTYPE_ARM64_ALL); } - bool doesSectionRequireSymbols(const MCSection &Section) const override { - // Any section for which the linker breaks things into atoms needs to - // preserve symbols, including assembler local symbols, to identify - // those atoms. These sections are: - // Sections of type: - // - // S_CSTRING_LITERALS (e.g. __cstring) - // S_LITERAL_POINTERS (e.g. 
objc selector pointers) - // S_16BYTE_LITERALS, S_8BYTE_LITERALS, S_4BYTE_LITERALS - // - // Sections named: - // - // __TEXT,__eh_frame - // __TEXT,__ustring - // __DATA,__cfstring - // __DATA,__objc_classrefs - // __DATA,__objc_catlist - // - // FIXME: It would be better if the compiler used actual linker local - // symbols for each of these sections rather than preserving what - // are ostensibly assembler local symbols. - const MCSectionMachO &SMO = static_cast<const MCSectionMachO &>(Section); - return (SMO.getType() == MachO::S_CSTRING_LITERALS || - SMO.getType() == MachO::S_4BYTE_LITERALS || - SMO.getType() == MachO::S_8BYTE_LITERALS || - SMO.getType() == MachO::S_16BYTE_LITERALS || - SMO.getType() == MachO::S_LITERAL_POINTERS || - (SMO.getSegmentName() == "__TEXT" && - (SMO.getSectionName() == "__eh_frame" || - SMO.getSectionName() == "__ustring")) || - (SMO.getSegmentName() == "__DATA" && - (SMO.getSectionName() == "__cfstring" || - SMO.getSectionName() == "__objc_classrefs" || - SMO.getSectionName() == "__objc_catlist"))); - } - /// \brief Generate the compact unwind encoding from the CFI directives. uint32_t generateCompactUnwindEncoding( ArrayRef<MCCFIInstruction> Instrs) const override { diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp index e05191e..5ea49c3 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp +++ b/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp @@ -78,7 +78,7 @@ unsigned AArch64ELFObjectWriter::GetRelocType(const MCValue &Target, if (SymLoc == AArch64MCExpr::VK_GOTTPREL && !IsNC) return ELF::R_AARCH64_TLSIE_ADR_GOTTPREL_PAGE21; if (SymLoc == AArch64MCExpr::VK_TLSDESC && !IsNC) - return ELF::R_AARCH64_TLSDESC_ADR_PAGE; + return ELF::R_AARCH64_TLSDESC_ADR_PAGE21; llvm_unreachable("invalid symbol kind for ADRP relocation"); case AArch64::fixup_aarch64_pcrel_branch26: return ELF::R_AARCH64_JUMP26; diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp index 60e9c19..8dc6c30 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp +++ b/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp @@ -177,7 +177,9 @@ private: MCELF::SetType(SD, ELF::STT_NOTYPE); MCELF::SetBinding(SD, ELF::STB_LOCAL); SD.setExternal(false); - Symbol->setSection(*getCurrentSection().first); + auto Sec = getCurrentSection().first; + assert(Sec && "need a section"); + Symbol->setSection(*Sec); const MCExpr *Value = MCSymbolRefExpr::Create(Start, getContext()); Symbol->setVariableValue(Value); diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp index 70b9329..f048474 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp +++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp @@ -37,6 +37,7 @@ AArch64MCAsmInfoDarwin::AArch64MCAsmInfoDarwin() { AssemblerDialect = AsmWriterVariant == Default ? 
1 : AsmWriterVariant; PrivateGlobalPrefix = "L"; + PrivateLabelPrefix = "L"; SeparatorString = "%%"; CommentString = ";"; PointerSize = CalleeSaveStackSlotSize = 8; @@ -79,6 +80,7 @@ AArch64MCAsmInfoELF::AArch64MCAsmInfoELF(StringRef TT) { CommentString = "//"; PrivateGlobalPrefix = ".L"; + PrivateLabelPrefix = ".L"; Code32Directive = ".code\t32"; Data16bitsDirective = "\t.hword\t"; diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h index 5d03c21..9b88de7 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h +++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h @@ -15,6 +15,7 @@ #define LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64MCASMINFO_H #include "llvm/MC/MCAsmInfoDarwin.h" +#include "llvm/MC/MCAsmInfoELF.h" namespace llvm { class Target; @@ -27,7 +28,7 @@ struct AArch64MCAsmInfoDarwin : public MCAsmInfoDarwin { MCStreamer &Streamer) const override; }; -struct AArch64MCAsmInfoELF : public MCAsmInfo { +struct AArch64MCAsmInfoELF : public MCAsmInfoELF { explicit AArch64MCAsmInfoELF(StringRef TT); }; diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp index c306b11..4756a19 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp +++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp @@ -437,8 +437,7 @@ AArch64MCCodeEmitter::getVecShifterOpValue(const MCInst &MI, unsigned OpIdx, return 3; } - assert(false && "Invalid value for vector shift amount!"); - return 0; + llvm_unreachable("Invalid value for vector shift amount!"); } uint32_t diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp index e12a24b..0d9385d 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp +++ b/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp @@ -10,6 +10,7 @@ #include "MCTargetDesc/AArch64FixupKinds.h" #include "MCTargetDesc/AArch64MCTargetDesc.h" #include "llvm/ADT/Twine.h" +#include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCAsmLayout.h" #include "llvm/MC/MCAssembler.h" #include "llvm/MC/MCContext.h" @@ -33,7 +34,7 @@ public: : MCMachObjectTargetWriter(true /* is64Bit */, CPUType, CPUSubtype, /*UseAggressiveSymbolFolding=*/true) {} - void RecordRelocation(MachObjectWriter *Writer, const MCAssembler &Asm, + void RecordRelocation(MachObjectWriter *Writer, MCAssembler &Asm, const MCAsmLayout &Layout, const MCFragment *Fragment, const MCFixup &Fixup, MCValue Target, uint64_t &FixedValue) override; @@ -112,8 +113,36 @@ bool AArch64MachObjectWriter::getAArch64FixupKindMachOInfo( } } +static bool canUseLocalRelocation(const MCSectionMachO &Section, + const MCSymbol &Symbol, unsigned Log2Size) { + // Debug info sections can use local relocations. + if (Section.hasAttribute(MachO::S_ATTR_DEBUG)) + return true; + + // Otherwise, only pointer sized relocations are supported. + if (Log2Size != 3) + return false; + + // But only if they don't point to a few forbidden sections. 
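// (These are the atomizable MachO sections enumerated in the
// doesSectionRequireSymbols block deleted above -- e.g. __cstring and
// __DATA,__cfstring -- whose contents the linker may coalesce, so a
// section-relative relocation could end up addressing the wrong atom;
// relocations against them must stay symbol-based.)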
+ if (!Symbol.isInSection()) + return true; + const MCSectionMachO &RefSec = cast<MCSectionMachO>(Symbol.getSection()); + if (RefSec.getType() == MachO::S_CSTRING_LITERALS) + return false; + + if (RefSec.getSegmentName() == "__DATA" && + RefSec.getSectionName() == "__cfstring") + return false; + + if (RefSec.getSegmentName() == "__DATA" && + RefSec.getSectionName() == "__objc_classrefs") + return false; + + return true; +} + void AArch64MachObjectWriter::RecordRelocation( - MachObjectWriter *Writer, const MCAssembler &Asm, const MCAsmLayout &Layout, + MachObjectWriter *Writer, MCAssembler &Asm, const MCAsmLayout &Layout, const MCFragment *Fragment, const MCFixup &Fixup, MCValue Target, uint64_t &FixedValue) { unsigned IsPCRel = Writer->isFixupKindPCRel(Asm, Fixup.getKind()); @@ -123,9 +152,9 @@ void AArch64MachObjectWriter::RecordRelocation( unsigned Log2Size = 0; int64_t Value = 0; unsigned Index = 0; - unsigned IsExtern = 0; unsigned Type = 0; unsigned Kind = Fixup.getKind(); + const MCSymbolData *RelSymbol = nullptr; FixupOffset += Fixup.getOffset(); @@ -171,10 +200,8 @@ void AArch64MachObjectWriter::RecordRelocation( // FIXME: Should this always be extern? // SymbolNum of 0 indicates the absolute section. Type = MachO::ARM64_RELOC_UNSIGNED; - Index = 0; if (IsPCRel) { - IsExtern = 1; Asm.getContext().FatalError(Fixup.getLoc(), "PC relative absolute relocation!"); @@ -198,15 +225,12 @@ void AArch64MachObjectWriter::RecordRelocation( Layout.getSymbolOffset(&B_SD) == Layout.getFragmentOffset(Fragment) + Fixup.getOffset()) { // SymB is the PC, so use a PC-rel pointer-to-GOT relocation. - Index = A_Base->getIndex(); - IsExtern = 1; Type = MachO::ARM64_RELOC_POINTER_TO_GOT; IsPCRel = 1; MachO::any_relocation_info MRE; MRE.r_word0 = FixupOffset; - MRE.r_word1 = ((Index << 0) | (IsPCRel << 24) | (Log2Size << 25) | - (IsExtern << 27) | (Type << 28)); - Writer->addRelocation(Fragment->getParent(), MRE); + MRE.r_word1 = (IsPCRel << 24) | (Log2Size << 25) | (Type << 28); + Writer->addRelocation(A_Base, Fragment->getParent(), MRE); return; } else if (Target.getSymA()->getKind() != MCSymbolRefExpr::VK_None || Target.getSymB()->getKind() != MCSymbolRefExpr::VK_None) @@ -252,26 +276,31 @@ void AArch64MachObjectWriter::RecordRelocation( ? 
0 : Writer->getSymbolAddress(B_Base, Layout)); - Index = A_Base->getIndex(); - IsExtern = 1; Type = MachO::ARM64_RELOC_UNSIGNED; MachO::any_relocation_info MRE; MRE.r_word0 = FixupOffset; - MRE.r_word1 = ((Index << 0) | (IsPCRel << 24) | (Log2Size << 25) | - (IsExtern << 27) | (Type << 28)); - Writer->addRelocation(Fragment->getParent(), MRE); + MRE.r_word1 = (IsPCRel << 24) | (Log2Size << 25) | (Type << 28); + Writer->addRelocation(A_Base, Fragment->getParent(), MRE); - Index = B_Base->getIndex(); - IsExtern = 1; + RelSymbol = B_Base; Type = MachO::ARM64_RELOC_SUBTRACTOR; } else { // A + constant const MCSymbol *Symbol = &Target.getSymA()->getSymbol(); - const MCSymbolData &SD = Asm.getSymbolData(*Symbol); - const MCSymbolData *Base = Asm.getAtom(&SD); const MCSectionMachO &Section = static_cast<const MCSectionMachO &>( Fragment->getParent()->getSection()); + bool CanUseLocalRelocation = + canUseLocalRelocation(Section, *Symbol, Log2Size); + if (Symbol->isTemporary() && (Value || !CanUseLocalRelocation)) { + const MCSection &Sec = Symbol->getSection(); + if (!Asm.getContext().getAsmInfo()->isSectionAtomizableBySymbols(Sec)) + Asm.addLocalUsedInReloc(*Symbol); + } + + const MCSymbolData &SD = Asm.getSymbolData(*Symbol); + const MCSymbolData *Base = Asm.getAtom(&SD); + // If the symbol is a variable and we weren't able to get a Base for it // (i.e., it's not in the symbol table associated with a section) resolve // the relocation based on its expansion instead. @@ -310,16 +339,13 @@ // sections, and for pointer-sized relocations (.quad), we allow section // relocations. It's code sections that run into trouble. if (Base) { - Index = Base->getIndex(); - IsExtern = 1; + RelSymbol = Base; // Add the local offset, if needed. if (Base != &SD) Value += Layout.getSymbolOffset(&SD) - Layout.getSymbolOffset(Base); } else if (Symbol->isInSection()) { - // Pointer-sized relocations can use a local relocation. Otherwise, - // we have to be in a debug info section. - if (!Section.hasAttribute(MachO::S_ATTR_DEBUG) && Log2Size != 3) + if (!CanUseLocalRelocation) Asm.getContext().FatalError( Fixup.getLoc(), "unsupported relocation of local symbol '" + Symbol->getName() + @@ -329,7 +355,6 @@ const MCSectionData &SymSD = Asm.getSectionData(SD.getSymbol().getSection()); Index = SymSD.getOrdinal() + 1; - IsExtern = 0; Value += Writer->getSymbolAddress(&SD, Layout); if (IsPCRel) @@ -362,16 +387,16 @@ MachO::any_relocation_info MRE; MRE.r_word0 = FixupOffset; - MRE.r_word1 = ((Index << 0) | (IsPCRel << 24) | (Log2Size << 25) | - (IsExtern << 27) | (Type << 28)); - Writer->addRelocation(Fragment->getParent(), MRE); + MRE.r_word1 = + (Index << 0) | (IsPCRel << 24) | (Log2Size << 25) | (Type << 28); + Writer->addRelocation(RelSymbol, Fragment->getParent(), MRE); // Now set up the Addend relocation. Type = MachO::ARM64_RELOC_ADDEND; Index = Value; + RelSymbol = nullptr; IsPCRel = 0; Log2Size = 2; - IsExtern = 0; // Put zero into the instruction itself. The addend is in the relocation. 
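// (MachO's ARM64 relocation format cannot carry a large addend in the
// instruction itself, so the addend is emitted as a separate
// ARM64_RELOC_ADDEND entry paired with the relocation it modifies, packed
// into the entry's symbol-number field -- which is why Index was set to
// Value above.)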
Value = 0; @@ -383,9 +408,9 @@ // struct relocation_info (8 bytes) MachO::any_relocation_info MRE; MRE.r_word0 = FixupOffset; - MRE.r_word1 = ((Index << 0) | (IsPCRel << 24) | (Log2Size << 25) | - (IsExtern << 27) | (Type << 28)); - Writer->addRelocation(Fragment->getParent(), MRE); + MRE.r_word1 = + (Index << 0) | (IsPCRel << 24) | (Log2Size << 25) | (Type << 28); + Writer->addRelocation(RelSymbol, Fragment->getParent(), MRE); } MCObjectWriter *llvm::createAArch64MachObjectWriter(raw_ostream &OS,