Diffstat (limited to 'lib/Target')
385 files changed, 10436 insertions, 8245 deletions
diff --git a/lib/Target/AArch64/AArch64.td b/lib/Target/AArch64/AArch64.td index dff48f9..bb3db4b 100644 --- a/lib/Target/AArch64/AArch64.td +++ b/lib/Target/AArch64/AArch64.td @@ -32,6 +32,9 @@ def FeatureCrypto : SubtargetFeature<"crypto", "HasCrypto", "true", def FeatureCRC : SubtargetFeature<"crc", "HasCRC", "true", "Enable ARMv8 CRC-32 checksum instructions">; +def FeatureV8_1a : SubtargetFeature<"v8.1a", "HasV8_1a", "true", + "Enable ARMv8.1a extensions", [FeatureCRC]>; + /// Cyclone has register move instructions which are "free". def FeatureZCRegMove : SubtargetFeature<"zcm", "HasZeroCycleRegMove", "true", "Has zero-cycle register moves">; @@ -89,6 +92,10 @@ def : ProcessorModel<"generic", NoSchedModel, [FeatureFPARMv8, FeatureNEON, FeatureCRC]>; +def : ProcessorModel<"generic-armv8.1-a", NoSchedModel, [FeatureV8_1a, + FeatureNEON, + FeatureCrypto]>; + def : ProcessorModel<"cortex-a53", CortexA53Model, [ProcA53]>; def : ProcessorModel<"cortex-a57", CortexA57Model, [ProcA57]>; // FIXME: Cortex-A72 is currently modelled as an Cortex-A57. diff --git a/lib/Target/AArch64/AArch64A53Fix835769.cpp b/lib/Target/AArch64/AArch64A53Fix835769.cpp index dd401c6..3bc5a54 100644 --- a/lib/Target/AArch64/AArch64A53Fix835769.cpp +++ b/lib/Target/AArch64/AArch64A53Fix835769.cpp @@ -24,6 +24,7 @@ #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetInstrInfo.h" using namespace llvm; diff --git a/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp b/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp index 2cf3c22..bffd9e6 100644 --- a/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp +++ b/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp @@ -142,7 +142,7 @@ private: int scavengeRegister(Chain *G, Color C, MachineBasicBlock &MBB); void scanInstruction(MachineInstr *MI, unsigned Idx, std::map<unsigned, Chain*> &Active, - std::set<std::unique_ptr<Chain>> &AllChains); + std::vector<std::unique_ptr<Chain>> &AllChains); void maybeKillChain(MachineOperand &MO, unsigned Idx, std::map<unsigned, Chain*> &RegChains); Color getColor(unsigned Register); @@ -287,12 +287,12 @@ public: raw_string_ostream OS(S); OS << "{"; - StartInst->print(OS, NULL, true); + StartInst->print(OS, /* SkipOpers= */true); OS << " -> "; - LastInst->print(OS, NULL, true); + LastInst->print(OS, /* SkipOpers= */true); if (KillInst) { OS << " (kill @ "; - KillInst->print(OS, NULL, true); + KillInst->print(OS, /* SkipOpers= */true); OS << ")"; } OS << "}"; @@ -307,6 +307,11 @@ public: //===----------------------------------------------------------------------===// bool AArch64A57FPLoadBalancing::runOnMachineFunction(MachineFunction &F) { + // Don't do anything if this isn't an A53 or A57. + if (!(F.getSubtarget<AArch64Subtarget>().isCortexA53() || + F.getSubtarget<AArch64Subtarget>().isCortexA57())) + return false; + bool Changed = false; DEBUG(dbgs() << "***** AArch64A57FPLoadBalancing *****\n"); @@ -331,7 +336,7 @@ bool AArch64A57FPLoadBalancing::runOnBasicBlock(MachineBasicBlock &MBB) { // been killed yet. This is keyed by register - all chains can only have one // "link" register between each inst in the chain. 
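An aside on the ownership change threaded through this file: AllChains, which runOnBasicBlock sets up just below, owns every Chain and switches from std::set<std::unique_ptr<Chain>> to std::vector<std::unique_ptr<Chain>>. A set of unique_ptr orders its elements by heap address, so iteration order can differ from run to run, while a vector keeps insertion order and turns each insert into a cheap push_back; that determinism rationale is my reading, the patch itself states none. A minimal standalone sketch of the idiom:

    #include <memory>
    #include <vector>

    struct Chain {};  // stand-in for the pass's Chain class

    int main() {
      std::vector<std::unique_ptr<Chain>> AllChains;  // sole owner of all Chains
      auto G = std::make_unique<Chain>();
      Chain *Handle = G.get();            // raw pointer, as kept in ActiveChains
      AllChains.push_back(std::move(G));  // was: AllChains.insert(std::move(G))
      (void)Handle;
      return 0;
    }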
std::map<unsigned, Chain*> ActiveChains; - std::set<std::unique_ptr<Chain>> AllChains; + std::vector<std::unique_ptr<Chain>> AllChains; unsigned Idx = 0; for (auto &MI : MBB) scanInstruction(&MI, Idx++, ActiveChains, AllChains); @@ -598,10 +603,9 @@ bool AArch64A57FPLoadBalancing::colorChain(Chain *G, Color C, return Changed; } -void AArch64A57FPLoadBalancing:: -scanInstruction(MachineInstr *MI, unsigned Idx, - std::map<unsigned, Chain*> &ActiveChains, - std::set<std::unique_ptr<Chain>> &AllChains) { +void AArch64A57FPLoadBalancing::scanInstruction( + MachineInstr *MI, unsigned Idx, std::map<unsigned, Chain *> &ActiveChains, + std::vector<std::unique_ptr<Chain>> &AllChains) { // Inspect "MI", updating ActiveChains and AllChains. if (isMul(MI)) { @@ -620,7 +624,7 @@ scanInstruction(MachineInstr *MI, unsigned Idx, auto G = llvm::make_unique<Chain>(MI, Idx, getColor(DestReg)); ActiveChains[DestReg] = G.get(); - AllChains.insert(std::move(G)); + AllChains.push_back(std::move(G)); } else if (isMla(MI)) { @@ -664,7 +668,7 @@ scanInstruction(MachineInstr *MI, unsigned Idx, << TRI->getName(DestReg) << "\n"); auto G = llvm::make_unique<Chain>(MI, Idx, getColor(DestReg)); ActiveChains[DestReg] = G.get(); - AllChains.insert(std::move(G)); + AllChains.push_back(std::move(G)); } else { diff --git a/lib/Target/AArch64/AArch64AddressTypePromotion.cpp b/lib/Target/AArch64/AArch64AddressTypePromotion.cpp index 287989f..716e1a3 100644 --- a/lib/Target/AArch64/AArch64AddressTypePromotion.cpp +++ b/lib/Target/AArch64/AArch64AddressTypePromotion.cpp @@ -41,6 +41,7 @@ #include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" using namespace llvm; diff --git a/lib/Target/AArch64/AArch64AsmPrinter.cpp b/lib/Target/AArch64/AArch64AsmPrinter.cpp index d64d851..1b4483a 100644 --- a/lib/Target/AArch64/AArch64AsmPrinter.cpp +++ b/lib/Target/AArch64/AArch64AsmPrinter.cpp @@ -12,12 +12,14 @@ // //===----------------------------------------------------------------------===// +#include "MCTargetDesc/AArch64AddressingModes.h" #include "AArch64.h" #include "AArch64MCInstLower.h" #include "AArch64MachineFunctionInfo.h" #include "AArch64RegisterInfo.h" #include "AArch64Subtarget.h" #include "InstPrinter/AArch64InstPrinter.h" +#include "MCTargetDesc/AArch64MCExpr.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/ADT/Twine.h" @@ -34,8 +36,10 @@ #include "llvm/MC/MCInstBuilder.h" #include "llvm/MC/MCLinkerOptimizationHint.h" #include "llvm/MC/MCStreamer.h" +#include "llvm/MC/MCSymbol.h" #include "llvm/Support/Debug.h" #include "llvm/Support/TargetRegistry.h" +#include "llvm/Support/raw_ostream.h" using namespace llvm; #define DEBUG_TYPE "asm-printer" @@ -49,7 +53,7 @@ class AArch64AsmPrinter : public AsmPrinter { public: AArch64AsmPrinter(TargetMachine &TM, std::unique_ptr<MCStreamer> Streamer) : AsmPrinter(TM, std::move(Streamer)), MCInstLowering(OutContext, *this), - SM(*this), AArch64FI(nullptr), LOHLabelCounter(0) {} + SM(*this), AArch64FI(nullptr) {} const char *getPassName() const override { return "AArch64 Assembly Printer"; @@ -110,7 +114,6 @@ private: typedef std::map<const MachineInstr *, MCSymbol *> MInstToMCSymbol; MInstToMCSymbol LOHInstToLabel; - unsigned LOHLabelCounter; }; } // end of anonymous namespace @@ -219,6 +222,17 @@ void AArch64AsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNum, O << '#' << Imm; break; } + case MachineOperand::MO_GlobalAddress: { + const GlobalValue *GV = 
MO.getGlobal(); + MCSymbol *Sym = getSymbol(GV); + + // FIXME: Can we get anything other than a plain symbol here? + assert(!MO.getTargetFlags() && "Unknown operand target flag!"); + + O << *Sym; + printOffset(MO.getOffset(), O); + break; + } } } @@ -450,7 +464,7 @@ void AArch64AsmPrinter::EmitInstruction(const MachineInstr *MI) { if (AArch64FI->getLOHRelated().count(MI)) { // Generate a label for LOH related instruction - MCSymbol *LOHLabel = GetTempSymbol("loh", LOHLabelCounter++); + MCSymbol *LOHLabel = createTempSymbol("loh"); // Associate the instruction with the label LOHInstToLabel[MI] = LOHLabel; OutStreamer.EmitLabel(LOHLabel); @@ -489,24 +503,57 @@ void AArch64AsmPrinter::EmitInstruction(const MachineInstr *MI) { EmitToStreamer(OutStreamer, TmpInst); return; } - case AArch64::TLSDESC_BLR: { - MCOperand Callee, Sym; - MCInstLowering.lowerOperand(MI->getOperand(0), Callee); - MCInstLowering.lowerOperand(MI->getOperand(1), Sym); - - // First emit a relocation-annotation. This expands to no code, but requests + case AArch64::TLSDESC_CALLSEQ: { + /// lower this to: + /// adrp x0, :tlsdesc:var + /// ldr x1, [x0, #:tlsdesc_lo12:var] + /// add x0, x0, #:tlsdesc_lo12:var + /// .tlsdesccall var + /// blr x1 + /// (TPIDR_EL0 offset now in x0) + const MachineOperand &MO_Sym = MI->getOperand(0); + MachineOperand MO_TLSDESC_LO12(MO_Sym), MO_TLSDESC(MO_Sym); + MCOperand Sym, SymTLSDescLo12, SymTLSDesc; + MO_TLSDESC_LO12.setTargetFlags(AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | + AArch64II::MO_NC); + MO_TLSDESC.setTargetFlags(AArch64II::MO_TLS | AArch64II::MO_PAGE); + MCInstLowering.lowerOperand(MO_Sym, Sym); + MCInstLowering.lowerOperand(MO_TLSDESC_LO12, SymTLSDescLo12); + MCInstLowering.lowerOperand(MO_TLSDESC, SymTLSDesc); + + MCInst Adrp; + Adrp.setOpcode(AArch64::ADRP); + Adrp.addOperand(MCOperand::CreateReg(AArch64::X0)); + Adrp.addOperand(SymTLSDesc); + EmitToStreamer(OutStreamer, Adrp); + + MCInst Ldr; + Ldr.setOpcode(AArch64::LDRXui); + Ldr.addOperand(MCOperand::CreateReg(AArch64::X1)); + Ldr.addOperand(MCOperand::CreateReg(AArch64::X0)); + Ldr.addOperand(SymTLSDescLo12); + Ldr.addOperand(MCOperand::CreateImm(0)); + EmitToStreamer(OutStreamer, Ldr); + + MCInst Add; + Add.setOpcode(AArch64::ADDXri); + Add.addOperand(MCOperand::CreateReg(AArch64::X0)); + Add.addOperand(MCOperand::CreateReg(AArch64::X0)); + Add.addOperand(SymTLSDescLo12); + Add.addOperand(MCOperand::CreateImm(AArch64_AM::getShiftValue(0))); + EmitToStreamer(OutStreamer, Add); + + // Emit a relocation-annotation. This expands to no code, but requests // the following instruction gets an R_AARCH64_TLSDESC_CALL. MCInst TLSDescCall; TLSDescCall.setOpcode(AArch64::TLSDESCCALL); TLSDescCall.addOperand(Sym); EmitToStreamer(OutStreamer, TLSDescCall); - // Other than that it's just a normal indirect call to the function loaded - // from the descriptor. 
- MCInst BLR; - BLR.setOpcode(AArch64::BLR); - BLR.addOperand(Callee); - EmitToStreamer(OutStreamer, BLR); + MCInst Blr; + Blr.setOpcode(AArch64::BLR); + Blr.addOperand(MCOperand::CreateReg(AArch64::X1)); + EmitToStreamer(OutStreamer, Blr); return; } diff --git a/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp b/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp index 3b74481..06ff9af 100644 --- a/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp +++ b/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp @@ -62,10 +62,10 @@ struct LDTLSCleanup : public MachineFunctionPass { for (MachineBasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) { switch (I->getOpcode()) { - case AArch64::TLSDESC_BLR: + case AArch64::TLSDESC_CALLSEQ: // Make sure it's a local dynamic access. - if (!I->getOperand(1).isSymbol() || - strcmp(I->getOperand(1).getSymbolName(), "_TLS_MODULE_BASE_")) + if (!I->getOperand(0).isSymbol() || + strcmp(I->getOperand(0).getSymbolName(), "_TLS_MODULE_BASE_")) break; if (TLSBaseAddrReg) diff --git a/lib/Target/AArch64/AArch64CollectLOH.cpp b/lib/Target/AArch64/AArch64CollectLOH.cpp index 938dcb3..568f258 100644 --- a/lib/Target/AArch64/AArch64CollectLOH.cpp +++ b/lib/Target/AArch64/AArch64CollectLOH.cpp @@ -279,7 +279,7 @@ static const SetOfMachineInstr *getUses(const InstrToInstrs *sets, unsigned reg, /// definition. It also consider definitions of ADRP instructions as uses and /// ignore other uses. The ADRPMode is used to collect the information for LHO /// that involve ADRP operation only. -static void initReachingDef(MachineFunction &MF, +static void initReachingDef(const MachineFunction &MF, InstrToInstrs *ColorOpToReachedUses, BlockToInstrPerColor &Gen, BlockToRegSet &Kill, BlockToSetOfInstrsPerColor &ReachableUses, @@ -288,7 +288,7 @@ static void initReachingDef(MachineFunction &MF, const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); unsigned NbReg = RegToId.size(); - for (MachineBasicBlock &MBB : MF) { + for (const MachineBasicBlock &MBB : MF) { auto &BBGen = Gen[&MBB]; BBGen = make_unique<const MachineInstr *[]>(NbReg); std::fill(BBGen.get(), BBGen.get() + NbReg, nullptr); @@ -382,7 +382,7 @@ static void initReachingDef(MachineFunction &MF, /// op.reachedUses /// /// Out[bb] = Gen[bb] U (In[bb] - Kill[bb]) -static void reachingDefAlgorithm(MachineFunction &MF, +static void reachingDefAlgorithm(const MachineFunction &MF, InstrToInstrs *ColorOpToReachedUses, BlockToSetOfInstrsPerColor &In, BlockToSetOfInstrsPerColor &Out, @@ -392,7 +392,7 @@ static void reachingDefAlgorithm(MachineFunction &MF, bool HasChanged; do { HasChanged = false; - for (MachineBasicBlock &MBB : MF) { + for (const MachineBasicBlock &MBB : MF) { unsigned CurReg; for (CurReg = 0; CurReg < NbReg; ++CurReg) { SetOfMachineInstr &BBInSet = getSet(In, MBB, CurReg, NbReg); @@ -401,7 +401,7 @@ static void reachingDefAlgorithm(MachineFunction &MF, SetOfMachineInstr &BBOutSet = getSet(Out, MBB, CurReg, NbReg); unsigned Size = BBOutSet.size(); // In[bb][color] = U Out[bb.predecessors][color] - for (MachineBasicBlock *PredMBB : MBB.predecessors()) { + for (const MachineBasicBlock *PredMBB : MBB.predecessors()) { SetOfMachineInstr &PredOutSet = getSet(Out, *PredMBB, CurReg, NbReg); BBInSet.insert(PredOutSet.begin(), PredOutSet.end()); } @@ -433,7 +433,7 @@ static void reachingDefAlgorithm(MachineFunction &MF, /// @p DummyOp. /// \pre ColorOpToReachedUses is an array of at least number of registers of /// InstrToInstrs. 
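Before the CollectLOH hunks continue, a note on the TLSDESC_CALLSEQ rework just above: the whole five-instruction descriptor sequence is now emitted in one piece by the AsmPrinter, so no earlier pass can schedule unrelated instructions into the block the linker must see intact in order to relax it. Any general-dynamic TLS access exercises the path; for example (illustrative C++, compiled for AArch64 ELF with -fPIC):

    // Accesses to tls_counter lower to the sequence from the patch:
    //   adrp x0, :tlsdesc:tls_counter
    //   ldr  x1, [x0, #:tlsdesc_lo12:tls_counter]
    //   add  x0, x0, #:tlsdesc_lo12:tls_counter
    //   .tlsdesccall tls_counter
    //   blr  x1
    // with the TPIDR_EL0-relative offset returned in x0.
    thread_local int tls_counter = 0;

    int bump() { return ++tls_counter; }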
-static void reachingDef(MachineFunction &MF, +static void reachingDef(const MachineFunction &MF, InstrToInstrs *ColorOpToReachedUses, const MapRegToId &RegToId, bool ADRPMode = false, const MachineInstr *DummyOp = nullptr) { @@ -983,7 +983,7 @@ static void computeOthers(const InstrToInstrs &UseToDefs, /// Look for every register defined by potential LOHs candidates. /// Map these registers with dense id in @p RegToId and vice-versa in /// @p IdToReg. @p IdToReg is populated only in DEBUG mode. -static void collectInvolvedReg(MachineFunction &MF, MapRegToId &RegToId, +static void collectInvolvedReg(const MachineFunction &MF, MapRegToId &RegToId, MapIdToReg &IdToReg, const TargetRegisterInfo *TRI) { unsigned CurRegId = 0; diff --git a/lib/Target/AArch64/AArch64FastISel.cpp b/lib/Target/AArch64/AArch64FastISel.cpp index 61017c1..99cb641 100644 --- a/lib/Target/AArch64/AArch64FastISel.cpp +++ b/lib/Target/AArch64/AArch64FastISel.cpp @@ -3158,7 +3158,7 @@ bool AArch64FastISel::fastLowerCall(CallLoweringInfo &CLI) { // Add a register mask with the call-preserved registers. // Proper defs for return values will be added by setPhysRegsDeadExcept(). - MIB.addRegMask(TRI.getCallPreservedMask(CC)); + MIB.addRegMask(TRI.getCallPreservedMask(*FuncInfo.MF, CC)); CLI.Call = MIB; @@ -4563,7 +4563,7 @@ bool AArch64FastISel::selectShift(const Instruction *I) { unsigned ResultReg = 0; uint64_t ShiftVal = C->getZExtValue(); MVT SrcVT = RetVT; - bool IsZExt = (I->getOpcode() == Instruction::AShr) ? false : true; + bool IsZExt = I->getOpcode() != Instruction::AShr; const Value *Op0 = I->getOperand(0); if (const auto *ZExt = dyn_cast<ZExtInst>(Op0)) { if (!isIntExtFree(ZExt)) { diff --git a/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp index ac11c4d..0a47dcb 100644 --- a/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp +++ b/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp @@ -65,7 +65,7 @@ public: /// SelectInlineAsmMemoryOperand - Implement addressing mode selection for /// inline asm expressions. bool SelectInlineAsmMemoryOperand(const SDValue &Op, - char ConstraintCode, + unsigned ConstraintID, std::vector<SDValue> &OutOps) override; SDNode *SelectMLAV64LaneV128(SDNode *N); @@ -211,13 +211,20 @@ static bool isOpcWithIntImmediate(const SDNode *N, unsigned Opc, } bool AArch64DAGToDAGISel::SelectInlineAsmMemoryOperand( - const SDValue &Op, char ConstraintCode, std::vector<SDValue> &OutOps) { - assert(ConstraintCode == 'm' && "unexpected asm memory constraint"); - // Require the address to be in a register. That is safe for all AArch64 - // variants and it is hard to do anything much smarter without knowing - // how the operand is used. - OutOps.push_back(Op); - return false; + const SDValue &Op, unsigned ConstraintID, std::vector<SDValue> &OutOps) { + switch(ConstraintID) { + default: + llvm_unreachable("Unexpected asm memory constraint"); + case InlineAsm::Constraint_i: + case InlineAsm::Constraint_m: + case InlineAsm::Constraint_Q: + // Require the address to be in a register. That is safe for all AArch64 + // variants and it is hard to do anything much smarter without knowing + // how the operand is used. + OutOps.push_back(Op); + return false; + } + return true; } /// SelectArithImmed - Select an immediate value that can be represented as @@ -299,7 +306,7 @@ static AArch64_AM::ShiftExtendType getShiftTypeForNode(SDValue N) { } } -/// \brief Determine wether it is worth to fold V into an extended register. 
+/// \brief Determine whether it is worth to fold V into an extended register. bool AArch64DAGToDAGISel::isWorthFolding(SDValue V) const { // it hurts if the value is used at least twice, unless we are optimizing // for code size. @@ -1055,7 +1062,7 @@ SDNode *AArch64DAGToDAGISel::SelectLoad(SDNode *N, unsigned NumVecs, SDValue Ops[] = {N->getOperand(2), // Mem operand; Chain}; - EVT ResTys[] = {MVT::Untyped, MVT::Other}; + const EVT ResTys[] = {MVT::Untyped, MVT::Other}; SDNode *Ld = CurDAG->getMachineNode(Opc, dl, ResTys, Ops); SDValue SuperReg = SDValue(Ld, 0); @@ -1077,8 +1084,8 @@ SDNode *AArch64DAGToDAGISel::SelectPostLoad(SDNode *N, unsigned NumVecs, N->getOperand(2), // Incremental Chain}; - EVT ResTys[] = {MVT::i64, // Type of the write back register - MVT::Untyped, MVT::Other}; + const EVT ResTys[] = {MVT::i64, // Type of the write back register + MVT::Untyped, MVT::Other}; SDNode *Ld = CurDAG->getMachineNode(Opc, dl, ResTys, Ops); @@ -1119,8 +1126,8 @@ SDNode *AArch64DAGToDAGISel::SelectPostStore(SDNode *N, unsigned NumVecs, unsigned Opc) { SDLoc dl(N); EVT VT = N->getOperand(2)->getValueType(0); - EVT ResTys[] = {MVT::i64, // Type of the write back register - MVT::Other}; // Type for the Chain + const EVT ResTys[] = {MVT::i64, // Type of the write back register + MVT::Other}; // Type for the Chain // Form a REG_SEQUENCE to force register allocation. bool Is128Bit = VT.getSizeInBits() == 128; @@ -1136,6 +1143,7 @@ SDNode *AArch64DAGToDAGISel::SelectPostStore(SDNode *N, unsigned NumVecs, return St; } +namespace { /// WidenVector - Given a value in the V64 register class, produce the /// equivalent value in the V128 register class. class WidenVector { @@ -1156,6 +1164,7 @@ public: return DAG.getTargetInsertSubreg(AArch64::dsub, DL, WideTy, Undef, V64Reg); } }; +} // namespace /// NarrowVector - Given a value in the V128 register class, produce the /// equivalent value in the V64 register class. 
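WidenVector above is now wrapped in an anonymous namespace, giving the helper internal linkage so its name cannot clash with a same-named entity in another translation unit; that is the usual motivation for this cleanup, though the patch gives no rationale. The idiom in isolation:

    namespace {  // everything here is private to this .cpp file
    class WidenVector {
    public:
      explicit WidenVector(int Factor) : Factor(Factor) {}
      int operator()(int V) const { return V * Factor; }

    private:
      int Factor;
    };
    } // namespace

    int widen(int V) { return WidenVector(2)(V); }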
@@ -1184,7 +1193,7 @@ SDNode *AArch64DAGToDAGISel::SelectLoadLane(SDNode *N, unsigned NumVecs, SDValue RegSeq = createQTuple(Regs); - EVT ResTys[] = {MVT::Untyped, MVT::Other}; + const EVT ResTys[] = {MVT::Untyped, MVT::Other}; unsigned LaneNo = cast<ConstantSDNode>(N->getOperand(NumVecs + 2))->getZExtValue(); @@ -1224,8 +1233,8 @@ SDNode *AArch64DAGToDAGISel::SelectPostLoadLane(SDNode *N, unsigned NumVecs, SDValue RegSeq = createQTuple(Regs); - EVT ResTys[] = {MVT::i64, // Type of the write back register - MVT::Untyped, MVT::Other}; + const EVT ResTys[] = {MVT::i64, // Type of the write back register + MVT::Untyped, MVT::Other}; unsigned LaneNo = cast<ConstantSDNode>(N->getOperand(NumVecs + 1))->getZExtValue(); @@ -1309,8 +1318,8 @@ SDNode *AArch64DAGToDAGISel::SelectPostStoreLane(SDNode *N, unsigned NumVecs, SDValue RegSeq = createQTuple(Regs); - EVT ResTys[] = {MVT::i64, // Type of the write back register - MVT::Other}; + const EVT ResTys[] = {MVT::i64, // Type of the write back register + MVT::Other}; unsigned LaneNo = cast<ConstantSDNode>(N->getOperand(NumVecs + 1))->getZExtValue(); diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp index a1b324e..0c0e856 100644 --- a/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -64,8 +64,16 @@ EnableAArch64ExtrGeneration("aarch64-extr-generation", cl::Hidden, static cl::opt<bool> EnableAArch64SlrGeneration("aarch64-shift-insert-generation", cl::Hidden, - cl::desc("Allow AArch64 SLI/SRI formation"), - cl::init(false)); + cl::desc("Allow AArch64 SLI/SRI formation"), + cl::init(false)); + +// FIXME: The necessary dtprel relocations don't seem to be supported +// well in the GNU bfd and gold linkers at the moment. Therefore, by +// default, for now, fall back to GeneralDynamic code generation. +cl::opt<bool> EnableAArch64ELFLocalDynamicTLSGeneration( + "aarch64-elf-ldtls-generation", cl::Hidden, + cl::desc("Allow AArch64 Local Dynamic TLS code generation"), + cl::init(false)); AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, const AArch64Subtarget &STI) @@ -362,9 +370,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::FLOG10, MVT::v8f16, Expand); // AArch64 has implementations of a lot of rounding-like FP operations. - static MVT RoundingTypes[] = { MVT::f32, MVT::f64}; - for (unsigned I = 0; I < array_lengthof(RoundingTypes); ++I) { - MVT Ty = RoundingTypes[I]; + for (MVT Ty : {MVT::f32, MVT::f64}) { setOperationAction(ISD::FFLOOR, Ty, Legal); setOperationAction(ISD::FNEARBYINT, Ty, Legal); setOperationAction(ISD::FCEIL, Ty, Legal); @@ -561,9 +567,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, } // AArch64 has implementations of a lot of rounding-like FP operations. 
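The rounding-type hunk in the scalar setup above, and the vector one just below, replace a static array plus array_lengthof indexing with range-for over a braced list. A standalone illustration of the idiom:

    #include <cstdio>
    #include <initializer_list>

    enum class MVT { f32, f64 };

    int main() {
      // Old: static MVT Types[] = {MVT::f32, MVT::f64};
      //      for (unsigned I = 0; I < array_lengthof(Types); ++I) { ... }
      // New: iterate the initializer list directly.
      for (MVT Ty : {MVT::f32, MVT::f64})
        std::printf("%d\n", static_cast<int>(Ty));
      return 0;
    }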
- static MVT RoundingVecTypes[] = {MVT::v2f32, MVT::v4f32, MVT::v2f64 }; - for (unsigned I = 0; I < array_lengthof(RoundingVecTypes); ++I) { - MVT Ty = RoundingVecTypes[I]; + for (MVT Ty : {MVT::v2f32, MVT::v4f32, MVT::v2f64}) { setOperationAction(ISD::FFLOOR, Ty, Legal); setOperationAction(ISD::FNEARBYINT, Ty, Legal); setOperationAction(ISD::FCEIL, Ty, Legal); @@ -752,7 +756,7 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const { case AArch64ISD::CSNEG: return "AArch64ISD::CSNEG"; case AArch64ISD::CSINC: return "AArch64ISD::CSINC"; case AArch64ISD::THREAD_POINTER: return "AArch64ISD::THREAD_POINTER"; - case AArch64ISD::TLSDESC_CALL: return "AArch64ISD::TLSDESC_CALL"; + case AArch64ISD::TLSDESC_CALLSEQ: return "AArch64ISD::TLSDESC_CALLSEQ"; case AArch64ISD::ADC: return "AArch64ISD::ADC"; case AArch64ISD::SBC: return "AArch64ISD::SBC"; case AArch64ISD::ADDS: return "AArch64ISD::ADDS"; @@ -811,6 +815,12 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const { case AArch64ISD::FCMGTz: return "AArch64ISD::FCMGTz"; case AArch64ISD::FCMLEz: return "AArch64ISD::FCMLEz"; case AArch64ISD::FCMLTz: return "AArch64ISD::FCMLTz"; + case AArch64ISD::SADDV: return "AArch64ISD::SADDV"; + case AArch64ISD::UADDV: return "AArch64ISD::UADDV"; + case AArch64ISD::SMINV: return "AArch64ISD::SMINV"; + case AArch64ISD::UMINV: return "AArch64ISD::UMINV"; + case AArch64ISD::SMAXV: return "AArch64ISD::SMAXV"; + case AArch64ISD::UMAXV: return "AArch64ISD::UMAXV"; case AArch64ISD::NOT: return "AArch64ISD::NOT"; case AArch64ISD::BIT: return "AArch64ISD::BIT"; case AArch64ISD::CBZ: return "AArch64ISD::CBZ"; @@ -1247,7 +1257,7 @@ getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) { case ISD::SMULO: case ISD::UMULO: { CC = AArch64CC::NE; - bool IsSigned = (Op.getOpcode() == ISD::SMULO) ? true : false; + bool IsSigned = Op.getOpcode() == ISD::SMULO; if (Op.getValueType() == MVT::i32) { unsigned ExtendOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; // For a 32 bit multiply with overflow check we want the instruction @@ -2784,13 +2794,13 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo(); if (IsThisReturn) { // For 'this' returns, use the X0-preserving mask if applicable - Mask = TRI->getThisReturnPreservedMask(CallConv); + Mask = TRI->getThisReturnPreservedMask(MF, CallConv); if (!Mask) { IsThisReturn = false; - Mask = TRI->getCallPreservedMask(CallConv); + Mask = TRI->getCallPreservedMask(MF, CallConv); } } else - Mask = TRI->getCallPreservedMask(CallConv); + Mask = TRI->getCallPreservedMask(MF, CallConv); assert(Mask && "Missing call preserved mask for calling convention"); Ops.push_back(DAG.getRegisterMask(Mask)); @@ -3027,58 +3037,34 @@ AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op, /// When accessing thread-local variables under either the general-dynamic or /// local-dynamic system, we make a "TLS-descriptor" call. The variable will /// have a descriptor, accessible via a PC-relative ADRP, and whose first entry -/// is a function pointer to carry out the resolution. This function takes the -/// address of the descriptor in X0 and returns the TPIDR_EL0 offset in X0. All -/// other registers (except LR, NZCV) are preserved. 
-/// -/// Thus, the ideal call sequence on AArch64 is: -/// -/// adrp x0, :tlsdesc:thread_var -/// ldr x8, [x0, :tlsdesc_lo12:thread_var] -/// add x0, x0, :tlsdesc_lo12:thread_var -/// .tlsdesccall thread_var -/// blr x8 -/// (TPIDR_EL0 offset now in x0). +/// is a function pointer to carry out the resolution. /// -/// The ".tlsdesccall" directive instructs the assembler to insert a particular -/// relocation to help the linker relax this sequence if it turns out to be too -/// conservative. +/// The sequence is: +/// adrp x0, :tlsdesc:var +/// ldr x1, [x0, #:tlsdesc_lo12:var] +/// add x0, x0, #:tlsdesc_lo12:var +/// .tlsdesccall var +/// blr x1 +/// (TPIDR_EL0 offset now in x0) /// -/// FIXME: we currently produce an extra, duplicated, ADRP instruction, but this -/// is harmless. -SDValue AArch64TargetLowering::LowerELFTLSDescCall(SDValue SymAddr, - SDValue DescAddr, SDLoc DL, - SelectionDAG &DAG) const { +/// The above sequence must be produced unscheduled, to enable the linker to +/// optimize/relax this sequence. +/// Therefore, a pseudo-instruction (TLSDESC_CALLSEQ) is used to represent the +/// above sequence, and expanded really late in the compilation flow, to ensure +/// the sequence is produced as per above. +SDValue AArch64TargetLowering::LowerELFTLSDescCallSeq(SDValue SymAddr, SDLoc DL, + SelectionDAG &DAG) const { EVT PtrVT = getPointerTy(); - // The function we need to call is simply the first entry in the GOT for this - // descriptor, load it in preparation. - SDValue Func = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, SymAddr); - - // TLS calls preserve all registers except those that absolutely must be - // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be - // silly). - const uint32_t *Mask = - Subtarget->getRegisterInfo()->getTLSCallPreservedMask(); - - // The function takes only one argument: the address of the descriptor itself - // in X0. - SDValue Glue, Chain; - Chain = DAG.getCopyToReg(DAG.getEntryNode(), DL, AArch64::X0, DescAddr, Glue); - Glue = Chain.getValue(1); + SDValue Chain = DAG.getEntryNode(); + SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); - // We're now ready to populate the argument list, as with a normal call: - SmallVector<SDValue, 6> Ops; + SmallVector<SDValue, 2> Ops; Ops.push_back(Chain); - Ops.push_back(Func); Ops.push_back(SymAddr); - Ops.push_back(DAG.getRegister(AArch64::X0, PtrVT)); - Ops.push_back(DAG.getRegisterMask(Mask)); - Ops.push_back(Glue); - SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); - Chain = DAG.getNode(AArch64ISD::TLSDESC_CALL, DL, NodeTys, Ops); - Glue = Chain.getValue(1); + Chain = DAG.getNode(AArch64ISD::TLSDESC_CALLSEQ, DL, NodeTys, Ops); + SDValue Glue = Chain.getValue(1); return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Glue); } @@ -3089,9 +3075,18 @@ AArch64TargetLowering::LowerELFGlobalTLSAddress(SDValue Op, assert(Subtarget->isTargetELF() && "This function expects an ELF target"); assert(getTargetMachine().getCodeModel() == CodeModel::Small && "ELF TLS only supported in small memory model"); + // Different choices can be made for the maximum size of the TLS area for a + // module. For the small address model, the default TLS size is 16MiB and the + // maximum TLS size is 4GiB. + // FIXME: add -mtls-size command line option and make it control the 16MiB + // vs. 4GiB code sequence generation. 
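The replacement lowering above is short enough to restate whole: instead of materializing the descriptor address, loading the resolver, and issuing a register-mask call, it emits a single glued TLSDESC_CALLSEQ node and reads the result back from X0. Condensed from the patch (surrounding context elided):

    SDValue
    AArch64TargetLowering::LowerELFTLSDescCallSeq(SDValue SymAddr, SDLoc DL,
                                                  SelectionDAG &DAG) const {
      EVT PtrVT = getPointerTy();
      SDValue Chain = DAG.getEntryNode();
      SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);

      // One pseudo stands for the whole adrp/ldr/add/.tlsdesccall/blr block.
      SDValue Ops[] = {Chain, SymAddr};
      Chain = DAG.getNode(AArch64ISD::TLSDESC_CALLSEQ, DL, NodeTys, Ops);
      SDValue Glue = Chain.getValue(1);

      // The resolver leaves the TPIDR_EL0-relative offset in X0.
      return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Glue);
    }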
const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op); TLSModel::Model Model = getTargetMachine().getTLSModel(GA->getGlobal()); + if (!EnableAArch64ELFLocalDynamicTLSGeneration) { + if (Model == TLSModel::LocalDynamic) + Model = TLSModel::GeneralDynamic; + } SDValue TPOff; EVT PtrVT = getPointerTy(); @@ -3102,17 +3097,20 @@ AArch64TargetLowering::LowerELFGlobalTLSAddress(SDValue Op, if (Model == TLSModel::LocalExec) { SDValue HiVar = DAG.getTargetGlobalAddress( - GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_G1); + GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_HI12); SDValue LoVar = DAG.getTargetGlobalAddress( GV, DL, PtrVT, 0, - AArch64II::MO_TLS | AArch64II::MO_G0 | AArch64II::MO_NC); + AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC); - TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZXi, DL, PtrVT, HiVar, - DAG.getTargetConstant(16, MVT::i32)), - 0); - TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, LoVar, - DAG.getTargetConstant(0, MVT::i32)), - 0); + SDValue TPWithOff_lo = + SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, ThreadBase, + HiVar, DAG.getTargetConstant(0, MVT::i32)), + 0); + SDValue TPWithOff = + SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPWithOff_lo, + LoVar, DAG.getTargetConstant(0, MVT::i32)), + 0); + return TPWithOff; } else if (Model == TLSModel::InitialExec) { TPOff = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS); TPOff = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TPOff); @@ -3127,19 +3125,6 @@ AArch64TargetLowering::LowerELFGlobalTLSAddress(SDValue Op, DAG.getMachineFunction().getInfo<AArch64FunctionInfo>(); MFI->incNumLocalDynamicTLSAccesses(); - // Accesses used in this sequence go via the TLS descriptor which lives in - // the GOT. Prepare an address we can use to handle this. - SDValue HiDesc = DAG.getTargetExternalSymbol( - "_TLS_MODULE_BASE_", PtrVT, AArch64II::MO_TLS | AArch64II::MO_PAGE); - SDValue LoDesc = DAG.getTargetExternalSymbol( - "_TLS_MODULE_BASE_", PtrVT, - AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC); - - // First argument to the descriptor call is the address of the descriptor - // itself. - SDValue DescAddr = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, HiDesc); - DescAddr = DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, DescAddr, LoDesc); - // The call needs a relocation too for linker relaxation. It doesn't make // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of // the address. @@ -3148,40 +3133,23 @@ AArch64TargetLowering::LowerELFGlobalTLSAddress(SDValue Op, // Now we can calculate the offset from TPIDR_EL0 to this module's // thread-local area. - TPOff = LowerELFTLSDescCall(SymAddr, DescAddr, DL, DAG); + TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG); // Now use :dtprel_whatever: operations to calculate this variable's offset // in its thread-storage area. 
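The dtprel computation that follows uses the same two-ADD shape as the local-exec hunk above: a pair of ADDXri with hi12/lo12 relocations replaces the old MOVZ/MOVK (:..._g1:/:..._g0_nc:) pair, matching the 16MiB-default/4GiB-max TLS-size comment added earlier. For the local-exec model the result looks roughly like this (the assembly spellings are the standard GNU ones, shown as an illustration, not taken from the patch):

    // Compiled with -ftls-model=local-exec for AArch64 ELF:
    //   mrs x8, TPIDR_EL0
    //   add x8, x8, #:tprel_hi12:tls_var, lsl #12
    //   add x8, x8, #:tprel_lo12_nc:tls_var
    //   ldr w0, [x8]
    thread_local int tls_var;

    int readIt() { return tls_var; }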
SDValue HiVar = DAG.getTargetGlobalAddress( - GV, DL, MVT::i64, 0, AArch64II::MO_TLS | AArch64II::MO_G1); + GV, DL, MVT::i64, 0, AArch64II::MO_TLS | AArch64II::MO_HI12); SDValue LoVar = DAG.getTargetGlobalAddress( GV, DL, MVT::i64, 0, - AArch64II::MO_TLS | AArch64II::MO_G0 | AArch64II::MO_NC); - - SDValue DTPOff = - SDValue(DAG.getMachineNode(AArch64::MOVZXi, DL, PtrVT, HiVar, - DAG.getTargetConstant(16, MVT::i32)), - 0); - DTPOff = - SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, DTPOff, LoVar, - DAG.getTargetConstant(0, MVT::i32)), - 0); - - TPOff = DAG.getNode(ISD::ADD, DL, PtrVT, TPOff, DTPOff); - } else if (Model == TLSModel::GeneralDynamic) { - // Accesses used in this sequence go via the TLS descriptor which lives in - // the GOT. Prepare an address we can use to handle this. - SDValue HiDesc = DAG.getTargetGlobalAddress( - GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_PAGE); - SDValue LoDesc = DAG.getTargetGlobalAddress( - GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC); - // First argument to the descriptor call is the address of the descriptor - // itself. - SDValue DescAddr = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, HiDesc); - DescAddr = DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, DescAddr, LoDesc); - + TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, HiVar, + DAG.getTargetConstant(0, MVT::i32)), + 0); + TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, LoVar, + DAG.getTargetConstant(0, MVT::i32)), + 0); + } else if (Model == TLSModel::GeneralDynamic) { // The call needs a relocation too for linker relaxation. It doesn't make // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of // the address. @@ -3189,7 +3157,7 @@ AArch64TargetLowering::LowerELFGlobalTLSAddress(SDValue Op, DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS); // Finally we can make a call to calculate the offset from tpidr_el0. - TPOff = LowerELFTLSDescCall(SymAddr, DescAddr, DL, DAG); + TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG); } else llvm_unreachable("Unsupported ELF TLS access model"); @@ -3356,11 +3324,12 @@ SDValue AArch64TargetLowering::LowerFCOPYSIGN(SDValue Op, EVT VecVT; EVT EltVT; - SDValue EltMask, VecVal1, VecVal2; + uint64_t EltMask; + SDValue VecVal1, VecVal2; if (VT == MVT::f32 || VT == MVT::v2f32 || VT == MVT::v4f32) { EltVT = MVT::i32; VecVT = MVT::v4i32; - EltMask = DAG.getConstant(0x80000000ULL, EltVT); + EltMask = 0x80000000ULL; if (!VT.isVector()) { VecVal1 = DAG.getTargetInsertSubreg(AArch64::ssub, DL, VecVT, @@ -3378,7 +3347,7 @@ SDValue AArch64TargetLowering::LowerFCOPYSIGN(SDValue Op, // We want to materialize a mask with the the high bit set, but the AdvSIMD // immediate moves cannot materialize that in a single instruction for // 64-bit elements. Instead, materialize zero and then negate it. - EltMask = DAG.getConstant(0, EltVT); + EltMask = 0; if (!VT.isVector()) { VecVal1 = DAG.getTargetInsertSubreg(AArch64::dsub, DL, VecVT, @@ -3393,11 +3362,7 @@ SDValue AArch64TargetLowering::LowerFCOPYSIGN(SDValue Op, llvm_unreachable("Invalid type for copysign!"); } - std::vector<SDValue> BuildVectorOps; - for (unsigned i = 0; i < VecVT.getVectorNumElements(); ++i) - BuildVectorOps.push_back(EltMask); - - SDValue BuildVec = DAG.getNode(ISD::BUILD_VECTOR, DL, VecVT, BuildVectorOps); + SDValue BuildVec = DAG.getConstant(EltMask, VecVT); // If we couldn't materialize the mask above, then the mask vector will be // the zero vector, and we need to negate it here. 
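In the LowerFCOPYSIGN hunk above, the element mask becomes a plain uint64_t and the hand-rolled BUILD_VECTOR loop disappears: getConstant called with a vector type splats the scalar across every lane by itself. A sketch of the change, using the f32 values from the patch:

    // Before: one operand per lane, built in a loop.
    //   std::vector<SDValue> Ops(4, DAG.getConstant(0x80000000ULL, MVT::i32));
    //   SDValue BuildVec = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i32, Ops);
    // After: getConstant on a vector type performs the splat itself.
    uint64_t EltMask = 0x80000000ULL;  // sign bit of an f32 lane
    SDValue BuildVec = DAG.getConstant(EltMask, MVT::v4i32);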
@@ -5927,8 +5892,10 @@ FailedModImm: if (VT.getVectorElementType().isFloatingPoint()) { SmallVector<SDValue, 8> Ops; - MVT NewType = - (VT.getVectorElementType() == MVT::f32) ? MVT::i32 : MVT::i64; + EVT EltTy = VT.getVectorElementType(); + assert ((EltTy == MVT::f16 || EltTy == MVT::f32 || EltTy == MVT::f64) && + "Unsupported floating-point vector type"); + MVT NewType = MVT::getIntegerVT(EltTy.getSizeInBits()); for (unsigned i = 0; i < NumElts; ++i) Ops.push_back(DAG.getNode(ISD::BITCAST, dl, NewType, Op.getOperand(i))); EVT VecVT = EVT::getVectorVT(*DAG.getContext(), NewType, NumElts); @@ -6781,7 +6748,7 @@ bool AArch64TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm, unsigned LZ = countLeadingZeros((uint64_t)Val); unsigned Shift = (63 - LZ) / 16; // MOVZ is free so return true for one or fewer MOVK. - return (Shift < 3) ? true : false; + return Shift < 3; } // Generate SUBS and CSEL for integer abs. @@ -6898,6 +6865,15 @@ static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG, N->getOperand(0)); } } else { + // (mul x, -(2^N - 1)) => (sub x, (shl x, N)) + APInt VNP1 = -Value + 1; + if (VNP1.isPowerOf2()) { + SDValue ShiftedVal = + DAG.getNode(ISD::SHL, SDLoc(N), VT, N->getOperand(0), + DAG.getConstant(VNP1.logBase2(), MVT::i64)); + return DAG.getNode(ISD::SUB, SDLoc(N), VT, N->getOperand(0), + ShiftedVal); + } // (mul x, -(2^N + 1)) => - (add (shl x, N), x) APInt VNM1 = -Value - 1; if (VNM1.isPowerOf2()) { @@ -6908,15 +6884,6 @@ static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG, DAG.getNode(ISD::ADD, SDLoc(N), VT, ShiftedVal, N->getOperand(0)); return DAG.getNode(ISD::SUB, SDLoc(N), VT, DAG.getConstant(0, VT), Add); } - // (mul x, -(2^N - 1)) => (sub x, (shl x, N)) - APInt VNP1 = -Value + 1; - if (VNP1.isPowerOf2()) { - SDValue ShiftedVal = - DAG.getNode(ISD::SHL, SDLoc(N), VT, N->getOperand(0), - DAG.getConstant(VNP1.logBase2(), MVT::i64)); - return DAG.getNode(ISD::SUB, SDLoc(N), VT, N->getOperand(0), - ShiftedVal); - } } } return SDValue(); @@ -7211,21 +7178,54 @@ static SDValue performBitcastCombine(SDNode *N, static SDValue performConcatVectorsCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) { + SDLoc dl(N); + EVT VT = N->getValueType(0); + SDValue N0 = N->getOperand(0), N1 = N->getOperand(1); + + // Optimize concat_vectors of truncated vectors, where the intermediate + // type is illegal, to avoid said illegality, e.g., + // (v4i16 (concat_vectors (v2i16 (truncate (v2i64))), + // (v2i16 (truncate (v2i64))))) + // -> + // (v4i16 (truncate (vector_shuffle (v4i32 (bitcast (v2i64))), + // (v4i32 (bitcast (v2i64))), + // <0, 2, 4, 6>))) + // This isn't really target-specific, but ISD::TRUNCATE legality isn't keyed + // on both input and result type, so we might generate worse code. + // On AArch64 we know it's fine for v2i64->v4i16 and v4i32->v8i8. + if (N->getNumOperands() == 2 && + N0->getOpcode() == ISD::TRUNCATE && + N1->getOpcode() == ISD::TRUNCATE) { + SDValue N00 = N0->getOperand(0); + SDValue N10 = N1->getOperand(0); + EVT N00VT = N00.getValueType(); + + if (N00VT == N10.getValueType() && + (N00VT == MVT::v2i64 || N00VT == MVT::v4i32) && + N00VT.getScalarSizeInBits() == 4 * VT.getScalarSizeInBits()) { + MVT MidVT = (N00VT == MVT::v2i64 ? 
MVT::v4i32 : MVT::v8i16); + SmallVector<int, 8> Mask(MidVT.getVectorNumElements()); + for (size_t i = 0; i < Mask.size(); ++i) + Mask[i] = i * 2; + return DAG.getNode(ISD::TRUNCATE, dl, VT, + DAG.getVectorShuffle( + MidVT, dl, + DAG.getNode(ISD::BITCAST, dl, MidVT, N00), + DAG.getNode(ISD::BITCAST, dl, MidVT, N10), Mask)); + } + } + // Wait 'til after everything is legalized to try this. That way we have // legal vector types and such. if (DCI.isBeforeLegalizeOps()) return SDValue(); - SDLoc dl(N); - EVT VT = N->getValueType(0); - // If we see a (concat_vectors (v1x64 A), (v1x64 A)) it's really a vector // splat. The indexed instructions are going to be expecting a DUPLANE64, so // canonicalise to that. - if (N->getOperand(0) == N->getOperand(1) && VT.getVectorNumElements() == 2) { + if (N0 == N1 && VT.getVectorNumElements() == 2) { assert(VT.getVectorElementType().getSizeInBits() == 64); - return DAG.getNode(AArch64ISD::DUPLANE64, dl, VT, - WidenVector(N->getOperand(0), DAG), + return DAG.getNode(AArch64ISD::DUPLANE64, dl, VT, WidenVector(N0, DAG), DAG.getConstant(0, MVT::i64)); } @@ -7238,10 +7238,9 @@ static SDValue performConcatVectorsCombine(SDNode *N, // becomes // (bitconvert (concat_vectors (v4i16 (bitconvert LHS)), RHS)) - SDValue Op1 = N->getOperand(1); - if (Op1->getOpcode() != ISD::BITCAST) + if (N1->getOpcode() != ISD::BITCAST) return SDValue(); - SDValue RHS = Op1->getOperand(0); + SDValue RHS = N1->getOperand(0); MVT RHSTy = RHS.getValueType().getSimpleVT(); // If the RHS is not a vector, this is not the pattern we're looking for. if (!RHSTy.isVector()) @@ -7251,10 +7250,10 @@ static SDValue performConcatVectorsCombine(SDNode *N, MVT ConcatTy = MVT::getVectorVT(RHSTy.getVectorElementType(), RHSTy.getVectorNumElements() * 2); - return DAG.getNode( - ISD::BITCAST, dl, VT, - DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatTy, - DAG.getNode(ISD::BITCAST, dl, RHSTy, N->getOperand(0)), RHS)); + return DAG.getNode(ISD::BITCAST, dl, VT, + DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatTy, + DAG.getNode(ISD::BITCAST, dl, RHSTy, N0), + RHS)); } static SDValue tryCombineFixedPointConvert(SDNode *N, @@ -7651,6 +7650,15 @@ static SDValue tryCombineCRC32(unsigned Mask, SDNode *N, SelectionDAG &DAG) { N->getOperand(0), N->getOperand(1), AndN.getOperand(0)); } +static SDValue combineAcrossLanesIntrinsic(unsigned Opc, SDNode *N, + SelectionDAG &DAG) { + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N), N->getValueType(0), + DAG.getNode(Opc, SDLoc(N), + N->getOperand(1).getSimpleValueType(), + N->getOperand(1)), + DAG.getConstant(0, MVT::i64)); +} + static SDValue performIntrinsicCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget) { @@ -7663,6 +7671,18 @@ static SDValue performIntrinsicCombine(SDNode *N, case Intrinsic::aarch64_neon_vcvtfxu2fp: return tryCombineFixedPointConvert(N, DCI, DAG); break; + case Intrinsic::aarch64_neon_saddv: + return combineAcrossLanesIntrinsic(AArch64ISD::SADDV, N, DAG); + case Intrinsic::aarch64_neon_uaddv: + return combineAcrossLanesIntrinsic(AArch64ISD::UADDV, N, DAG); + case Intrinsic::aarch64_neon_sminv: + return combineAcrossLanesIntrinsic(AArch64ISD::SMINV, N, DAG); + case Intrinsic::aarch64_neon_uminv: + return combineAcrossLanesIntrinsic(AArch64ISD::UMINV, N, DAG); + case Intrinsic::aarch64_neon_smaxv: + return combineAcrossLanesIntrinsic(AArch64ISD::SMAXV, N, DAG); + case Intrinsic::aarch64_neon_umaxv: + return combineAcrossLanesIntrinsic(AArch64ISD::UMAXV, N, DAG); case Intrinsic::aarch64_neon_fmax: return 
DAG.getNode(AArch64ISD::FMAX, SDLoc(N), N->getValueType(0), N->getOperand(1), N->getOperand(2)); @@ -8792,9 +8812,11 @@ bool AArch64TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const { } // For the real atomic operations, we have ldxr/stxr up to 128 bits, -bool AArch64TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { +TargetLoweringBase::AtomicRMWExpansionKind +AArch64TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { unsigned Size = AI->getType()->getPrimitiveSizeInBits(); - return Size <= 128; + return Size <= 128 ? AtomicRMWExpansionKind::LLSC + : AtomicRMWExpansionKind::None; } bool AArch64TargetLowering::hasLoadLinkedStoreConditional() const { diff --git a/lib/Target/AArch64/AArch64ISelLowering.h b/lib/Target/AArch64/AArch64ISelLowering.h index e973364..5ff11e8 100644 --- a/lib/Target/AArch64/AArch64ISelLowering.h +++ b/lib/Target/AArch64/AArch64ISelLowering.h @@ -30,9 +30,9 @@ enum { WrapperLarge, // 4-instruction MOVZ/MOVK sequence for 64-bit addresses. CALL, // Function call. - // Almost the same as a normal call node, except that a TLSDesc relocation is - // needed so the linker can relax it correctly if possible. - TLSDESC_CALL, + // Produces the full sequence of instructions for getting the thread pointer + // offset of a variable into X0, using the TLSDesc model. + TLSDESC_CALLSEQ, ADRP, // Page address of a TargetGlobalAddress operand. ADDlow, // Add the low 12 bits of a TargetGlobalAddress operand. LOADgot, // Load from automatically generated descriptor (e.g. Global @@ -141,6 +141,18 @@ enum { FCMLEz, FCMLTz, + // Vector across-lanes addition + // Only the lower result lane is defined. + SADDV, + UADDV, + + // Vector across-lanes min/max + // Only the lower result lane is defined. + SMINV, + UMINV, + SMAXV, + UMAXV, + // Vector bitwise negation NOT, @@ -335,7 +347,8 @@ public: bool shouldExpandAtomicLoadInIR(LoadInst *LI) const override; bool shouldExpandAtomicStoreInIR(StoreInst *SI) const override; - bool shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override; + TargetLoweringBase::AtomicRMWExpansionKind + shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override; bool useLoadStackGuardNode() const override; TargetLoweringBase::LegalizeTypeAction @@ -399,8 +412,8 @@ private: SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const; SDValue LowerDarwinGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const; SDValue LowerELFGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerELFTLSDescCall(SDValue SymAddr, SDValue DescAddr, SDLoc DL, - SelectionDAG &DAG) const; + SDValue LowerELFTLSDescCallSeq(SDValue SymAddr, SDLoc DL, + SelectionDAG &DAG) const; SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBR_CC(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const; @@ -460,6 +473,16 @@ private: std::vector<SDValue> &Ops, SelectionDAG &DAG) const override; + unsigned getInlineAsmMemConstraint( + const std::string &ConstraintCode) const override { + if (ConstraintCode == "Q") + return InlineAsm::Constraint_Q; + // FIXME: clang has code for 'Ump', 'Utf', 'Usa', and 'Ush' but these are + // followed by llvm_unreachable so we'll leave them unimplemented in + // the backend for now. 
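shouldExpandAtomicRMWInIR, changed above, now returns an enum rather than a bool so a target can choose how an atomicrmw gets expanded; AArch64 asks for LL/SC expansion at up to 128 bits. In generated code that is the familiar exclusive-load/exclusive-store retry loop, which plain std::atomic already demonstrates (the assembly is illustrative, pre-LSE, relaxed ordering):

    #include <atomic>

    // fetch_add on AArch64 becomes the ldxr/stxr retry loop that
    // AtomicRMWExpansionKind::LLSC requests:
    //   .L1: ldxr w8, [x0]
    //        add  w8, w8, w1
    //        stxr w9, w8, [x0]
    //        cbnz w9, .L1
    int bump(std::atomic<int> &Counter, int By) {
      return Counter.fetch_add(By, std::memory_order_relaxed);
    }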
+ return TargetLowering::getInlineAsmMemConstraint(ConstraintCode); + } + bool isUsedByReturnOnly(SDNode *N, SDValue &Chain) const override; bool mayBeEmittedAsTailCall(CallInst *CI) const override; bool getIndexedAddressParts(SDNode *Op, SDValue &Base, SDValue &Offset, diff --git a/lib/Target/AArch64/AArch64InstrInfo.cpp b/lib/Target/AArch64/AArch64InstrInfo.cpp index 64cec55..8e0af2d 100644 --- a/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -31,7 +31,7 @@ using namespace llvm; AArch64InstrInfo::AArch64InstrInfo(const AArch64Subtarget &STI) : AArch64GenInstrInfo(AArch64::ADJCALLSTACKDOWN, AArch64::ADJCALLSTACKUP), - RI(this, &STI), Subtarget(STI) {} + RI(STI.getTargetTriple()), Subtarget(STI) {} /// GetInstSize - Return the number of bytes of code the specified /// instruction may be. This returns the maximum number of bytes. @@ -2068,10 +2068,10 @@ void llvm::emitFrameOffset(MachineBasicBlock &MBB, .setMIFlag(Flag); } -MachineInstr * -AArch64InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI, - const SmallVectorImpl<unsigned> &Ops, - int FrameIndex) const { +MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, + MachineInstr *MI, + ArrayRef<unsigned> Ops, + int FrameIndex) const { // This is a bit of a hack. Consider this instruction: // // %vreg0<def> = COPY %SP; GPR64all:%vreg0 diff --git a/lib/Target/AArch64/AArch64InstrInfo.h b/lib/Target/AArch64/AArch64InstrInfo.h index d8f1274..fa4b8b7 100644 --- a/lib/Target/AArch64/AArch64InstrInfo.h +++ b/lib/Target/AArch64/AArch64InstrInfo.h @@ -129,10 +129,9 @@ public: const TargetRegisterInfo *TRI) const override; using TargetInstrInfo::foldMemoryOperandImpl; - MachineInstr * - foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI, - const SmallVectorImpl<unsigned> &Ops, - int FrameIndex) const override; + MachineInstr *foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI, + ArrayRef<unsigned> Ops, + int FrameIndex) const override; bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, diff --git a/lib/Target/AArch64/AArch64InstrInfo.td b/lib/Target/AArch64/AArch64InstrInfo.td index 6e4c0b0..ec6fa5c 100644 --- a/lib/Target/AArch64/AArch64InstrInfo.td +++ b/lib/Target/AArch64/AArch64InstrInfo.td @@ -22,6 +22,8 @@ def HasCrypto : Predicate<"Subtarget->hasCrypto()">, AssemblerPredicate<"FeatureCrypto", "crypto">; def HasCRC : Predicate<"Subtarget->hasCRC()">, AssemblerPredicate<"FeatureCRC", "crc">; +def HasV8_1a : Predicate<"Subtarget->hasV8_1a()">, + AssemblerPredicate<"FeatureV8_1a", "v8.1a">; def IsLE : Predicate<"Subtarget->isLittleEndian()">; def IsBE : Predicate<"!Subtarget->isLittleEndian()">; def IsCyclone : Predicate<"Subtarget->isCyclone()">; @@ -96,6 +98,19 @@ def SDT_AArch64ITOF : SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisSameAs<0,1>]>; def SDT_AArch64TLSDescCall : SDTypeProfile<0, -2, [SDTCisPtrTy<0>, SDTCisPtrTy<1>]>; + +// Generates the general dynamic sequences, i.e. 
+// adrp x0, :tlsdesc:var +// ldr x1, [x0, #:tlsdesc_lo12:var] +// add x0, x0, #:tlsdesc_lo12:var +// .tlsdesccall var +// blr x1 + +// (the TPIDR_EL0 offset is put directly in X0, hence no "result" here) +// number of operands (the variable) +def SDT_AArch64TLSDescCallSeq : SDTypeProfile<0,1, + [SDTCisPtrTy<0>]>; + def SDT_AArch64WrapperLarge : SDTypeProfile<1, 4, [SDTCisVT<0, i64>, SDTCisVT<1, i32>, SDTCisSameAs<1, 2>, SDTCisSameAs<1, 3>, @@ -229,10 +244,11 @@ def AArch64Prefetch : SDNode<"AArch64ISD::PREFETCH", SDT_AArch64PREFETCH, def AArch64sitof: SDNode<"AArch64ISD::SITOF", SDT_AArch64ITOF>; def AArch64uitof: SDNode<"AArch64ISD::UITOF", SDT_AArch64ITOF>; -def AArch64tlsdesc_call : SDNode<"AArch64ISD::TLSDESC_CALL", - SDT_AArch64TLSDescCall, - [SDNPInGlue, SDNPOutGlue, SDNPHasChain, - SDNPVariadic]>; +def AArch64tlsdesc_callseq : SDNode<"AArch64ISD::TLSDESC_CALLSEQ", + SDT_AArch64TLSDescCallSeq, + [SDNPInGlue, SDNPOutGlue, SDNPHasChain, + SDNPVariadic]>; + def AArch64WrapperLarge : SDNode<"AArch64ISD::WrapperLarge", SDT_AArch64WrapperLarge>; @@ -244,6 +260,13 @@ def SDT_AArch64mull : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisInt<1>, def AArch64smull : SDNode<"AArch64ISD::SMULL", SDT_AArch64mull>; def AArch64umull : SDNode<"AArch64ISD::UMULL", SDT_AArch64mull>; +def AArch64saddv : SDNode<"AArch64ISD::SADDV", SDT_AArch64UnaryVec>; +def AArch64uaddv : SDNode<"AArch64ISD::UADDV", SDT_AArch64UnaryVec>; +def AArch64sminv : SDNode<"AArch64ISD::SMINV", SDT_AArch64UnaryVec>; +def AArch64uminv : SDNode<"AArch64ISD::UMINV", SDT_AArch64UnaryVec>; +def AArch64smaxv : SDNode<"AArch64ISD::SMAXV", SDT_AArch64UnaryVec>; +def AArch64umaxv : SDNode<"AArch64ISD::UMAXV", SDT_AArch64UnaryVec>; + //===----------------------------------------------------------------------===// //===----------------------------------------------------------------------===// @@ -1049,15 +1072,16 @@ def TLSDESCCALL : Pseudo<(outs), (ins i64imm:$sym), []> { let AsmString = ".tlsdesccall $sym"; } -// Pseudo-instruction representing a BLR with attached TLSDESC relocation. It -// gets expanded to two MCInsts during lowering. -let isCall = 1, Defs = [LR] in -def TLSDESC_BLR - : Pseudo<(outs), (ins GPR64:$dest, i64imm:$sym), - [(AArch64tlsdesc_call GPR64:$dest, tglobaltlsaddr:$sym)]>; +// FIXME: maybe the scratch register used shouldn't be fixed to X1? +// FIXME: can "hasSideEffects be dropped? +let isCall = 1, Defs = [LR, X0, X1], hasSideEffects = 1, + isCodeGenOnly = 1 in +def TLSDESC_CALLSEQ + : Pseudo<(outs), (ins i64imm:$sym), + [(AArch64tlsdesc_callseq tglobaltlsaddr:$sym)]>; +def : Pat<(AArch64tlsdesc_callseq texternalsym:$sym), + (TLSDESC_CALLSEQ texternalsym:$sym)>; -def : Pat<(AArch64tlsdesc_call GPR64:$dest, texternalsym:$sym), - (TLSDESC_BLR GPR64:$dest, texternalsym:$sym)>; //===----------------------------------------------------------------------===// // Conditional branch (immediate) instruction. 
//===----------------------------------------------------------------------===// @@ -2326,8 +2350,15 @@ defm UCVTF : IntegerToFP<1, "ucvtf", uint_to_fp>; defm FMOV : UnscaledConversion<"fmov">; -def : Pat<(f32 (fpimm0)), (FMOVWSr WZR)>, Requires<[NoZCZ]>; -def : Pat<(f64 (fpimm0)), (FMOVXDr XZR)>, Requires<[NoZCZ]>; +// Add pseudo ops for FMOV 0 so we can mark them as isReMaterializable +let isReMaterializable = 1, isCodeGenOnly = 1 in { +def FMOVS0 : Pseudo<(outs FPR32:$Rd), (ins), [(set f32:$Rd, (fpimm0))]>, + PseudoInstExpansion<(FMOVWSr FPR32:$Rd, WZR)>, + Requires<[NoZCZ]>; +def FMOVD0 : Pseudo<(outs FPR64:$Rd), (ins), [(set f64:$Rd, (fpimm0))]>, + PseudoInstExpansion<(FMOVXDr FPR64:$Rd, XZR)>, + Requires<[NoZCZ]>; +} //===----------------------------------------------------------------------===// // Floating point conversion instruction. @@ -3416,10 +3447,10 @@ defm FMAXNMP : SIMDPairwiseScalarSD<1, 0, 0b01100, "fmaxnmp">; defm FMAXP : SIMDPairwiseScalarSD<1, 0, 0b01111, "fmaxp">; defm FMINNMP : SIMDPairwiseScalarSD<1, 1, 0b01100, "fminnmp">; defm FMINP : SIMDPairwiseScalarSD<1, 1, 0b01111, "fminp">; -def : Pat<(i64 (int_aarch64_neon_saddv (v2i64 V128:$Rn))), - (ADDPv2i64p V128:$Rn)>; -def : Pat<(i64 (int_aarch64_neon_uaddv (v2i64 V128:$Rn))), - (ADDPv2i64p V128:$Rn)>; +def : Pat<(v2i64 (AArch64saddv V128:$Rn)), + (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)), (ADDPv2i64p V128:$Rn), dsub)>; +def : Pat<(v2i64 (AArch64uaddv V128:$Rn)), + (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)), (ADDPv2i64p V128:$Rn), dsub)>; def : Pat<(f32 (int_aarch64_neon_faddv (v2f32 V64:$Rn))), (FADDPv2i32p V64:$Rn)>; def : Pat<(f32 (int_aarch64_neon_faddv (v4f32 V128:$Rn))), @@ -3709,10 +3740,6 @@ multiclass Neon_INS_elt_pattern<ValueType VT128, ValueType VT64, defm : Neon_INS_elt_pattern<v8f16, v4f16, f16, INSvi16lane>; defm : Neon_INS_elt_pattern<v4f32, v2f32, f32, INSvi32lane>; defm : Neon_INS_elt_pattern<v2f64, v1f64, f64, INSvi64lane>; -defm : Neon_INS_elt_pattern<v16i8, v8i8, i32, INSvi8lane>; -defm : Neon_INS_elt_pattern<v8i16, v4i16, i32, INSvi16lane>; -defm : Neon_INS_elt_pattern<v4i32, v2i32, i32, INSvi32lane>; -defm : Neon_INS_elt_pattern<v2i64, v1i64, i64, INSvi32lane>; // Floating point vector extractions are codegen'd as either a sequence of @@ -3776,121 +3803,143 @@ defm FMAXV : SIMDAcrossLanesS<0b01111, 0, "fmaxv", int_aarch64_neon_fmaxv>; defm FMINNMV : SIMDAcrossLanesS<0b01100, 1, "fminnmv", int_aarch64_neon_fminnmv>; defm FMINV : SIMDAcrossLanesS<0b01111, 1, "fminv", int_aarch64_neon_fminv>; -multiclass SIMDAcrossLanesSignedIntrinsic<string baseOpc, Intrinsic intOp> { -// If there is a sign extension after this intrinsic, consume it as smov already -// performed it - def : Pat<(i32 (sext_inreg (i32 (intOp (v8i8 V64:$Rn))), i8)), - (i32 (SMOVvi8to32 - (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), - (!cast<Instruction>(!strconcat(baseOpc, "v8i8v")) V64:$Rn), bsub), - (i64 0)))>; - def : Pat<(i32 (intOp (v8i8 V64:$Rn))), - (i32 (SMOVvi8to32 +// Patterns for across-vector intrinsics, that have a node equivalent, that +// returns a vector (with only the low lane defined) instead of a scalar. +// In effect, opNode is the same as (scalar_to_vector (IntNode)). +multiclass SIMDAcrossLanesIntrinsic<string baseOpc, + SDPatternOperator opNode> { +// If a lane instruction caught the vector_extract around opNode, we can +// directly match the latter to the instruction. 
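These multiclasses are the TableGen half of the across-lanes rework; the C++ half, in the AArch64ISelLowering.cpp hunks earlier in this diff, rewrites each aarch64_neon_[su](add|min|max)v intrinsic into the new node wrapped in an extract of lane 0, which is exactly the shape the patterns below then match. Condensed from the patch:

    static SDValue combineAcrossLanesIntrinsic(unsigned Opc, SDNode *N,
                                               SelectionDAG &DAG) {
      // Only the low lane of the across-lanes node is defined, so the
      // intrinsic's scalar result is lane 0 of that node.
      return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N), N->getValueType(0),
                         DAG.getNode(Opc, SDLoc(N),
                                     N->getOperand(1).getSimpleValueType(),
                                     N->getOperand(1)),
                         DAG.getConstant(0, MVT::i64));
    }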
+def : Pat<(v8i8 (opNode V64:$Rn)), + (INSERT_SUBREG (v8i8 (IMPLICIT_DEF)), + (!cast<Instruction>(!strconcat(baseOpc, "v8i8v")) V64:$Rn), bsub)>; +def : Pat<(v16i8 (opNode V128:$Rn)), (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), - (!cast<Instruction>(!strconcat(baseOpc, "v8i8v")) V64:$Rn), bsub), - (i64 0)))>; -// If there is a sign extension after this intrinsic, consume it as smov already -// performed it -def : Pat<(i32 (sext_inreg (i32 (intOp (v16i8 V128:$Rn))), i8)), - (i32 (SMOVvi8to32 - (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), - (!cast<Instruction>(!strconcat(baseOpc, "v16i8v")) V128:$Rn), bsub), - (i64 0)))>; -def : Pat<(i32 (intOp (v16i8 V128:$Rn))), - (i32 (SMOVvi8to32 - (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), - (!cast<Instruction>(!strconcat(baseOpc, "v16i8v")) V128:$Rn), bsub), - (i64 0)))>; + (!cast<Instruction>(!strconcat(baseOpc, "v16i8v")) V128:$Rn), bsub)>; +def : Pat<(v4i16 (opNode V64:$Rn)), + (INSERT_SUBREG (v4i16 (IMPLICIT_DEF)), + (!cast<Instruction>(!strconcat(baseOpc, "v4i16v")) V64:$Rn), hsub)>; +def : Pat<(v8i16 (opNode V128:$Rn)), + (INSERT_SUBREG (v8i16 (IMPLICIT_DEF)), + (!cast<Instruction>(!strconcat(baseOpc, "v8i16v")) V128:$Rn), hsub)>; +def : Pat<(v4i32 (opNode V128:$Rn)), + (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), + (!cast<Instruction>(!strconcat(baseOpc, "v4i32v")) V128:$Rn), ssub)>; + + +// If none did, fallback to the explicit patterns, consuming the vector_extract. +def : Pat<(i32 (vector_extract (insert_subvector undef, (v8i8 (opNode V64:$Rn)), + (i32 0)), (i64 0))), + (EXTRACT_SUBREG (INSERT_SUBREG (v8i8 (IMPLICIT_DEF)), + (!cast<Instruction>(!strconcat(baseOpc, "v8i8v")) V64:$Rn), + bsub), ssub)>; +def : Pat<(i32 (vector_extract (v16i8 (opNode V128:$Rn)), (i64 0))), + (EXTRACT_SUBREG (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), + (!cast<Instruction>(!strconcat(baseOpc, "v16i8v")) V128:$Rn), + bsub), ssub)>; +def : Pat<(i32 (vector_extract (insert_subvector undef, + (v4i16 (opNode V64:$Rn)), (i32 0)), (i64 0))), + (EXTRACT_SUBREG (INSERT_SUBREG (v4i16 (IMPLICIT_DEF)), + (!cast<Instruction>(!strconcat(baseOpc, "v4i16v")) V64:$Rn), + hsub), ssub)>; +def : Pat<(i32 (vector_extract (v8i16 (opNode V128:$Rn)), (i64 0))), + (EXTRACT_SUBREG (INSERT_SUBREG (v8i16 (IMPLICIT_DEF)), + (!cast<Instruction>(!strconcat(baseOpc, "v8i16v")) V128:$Rn), + hsub), ssub)>; +def : Pat<(i32 (vector_extract (v4i32 (opNode V128:$Rn)), (i64 0))), + (EXTRACT_SUBREG (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), + (!cast<Instruction>(!strconcat(baseOpc, "v4i32v")) V128:$Rn), + ssub), ssub)>; + +} + +multiclass SIMDAcrossLanesSignedIntrinsic<string baseOpc, + SDPatternOperator opNode> + : SIMDAcrossLanesIntrinsic<baseOpc, opNode> { // If there is a sign extension after this intrinsic, consume it as smov already // performed it -def : Pat<(i32 (sext_inreg (i32 (intOp (v4i16 V64:$Rn))), i16)), +def : Pat<(i32 (sext_inreg (i32 (vector_extract (insert_subvector undef, + (opNode (v8i8 V64:$Rn)), (i32 0)), (i64 0))), i8)), + (i32 (SMOVvi8to32 + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), + (!cast<Instruction>(!strconcat(baseOpc, "v8i8v")) V64:$Rn), bsub), + (i64 0)))>; +def : Pat<(i32 (sext_inreg (i32 (vector_extract + (opNode (v16i8 V128:$Rn)), (i64 0))), i8)), + (i32 (SMOVvi8to32 + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), + (!cast<Instruction>(!strconcat(baseOpc, "v16i8v")) V128:$Rn), bsub), + (i64 0)))>; +def : Pat<(i32 (sext_inreg (i32 (vector_extract (insert_subvector undef, + (opNode (v4i16 V64:$Rn)), (i32 0)), (i64 0))), i16)), (i32 (SMOVvi16to32 (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), 
(!cast<Instruction>(!strconcat(baseOpc, "v4i16v")) V64:$Rn), hsub), (i64 0)))>; -def : Pat<(i32 (intOp (v4i16 V64:$Rn))), +def : Pat<(i32 (sext_inreg (i32 (vector_extract + (opNode (v8i16 V128:$Rn)), (i64 0))), i16)), (i32 (SMOVvi16to32 - (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), - (!cast<Instruction>(!strconcat(baseOpc, "v4i16v")) V64:$Rn), hsub), - (i64 0)))>; -// If there is a sign extension after this intrinsic, consume it as smov already -// performed it -def : Pat<(i32 (sext_inreg (i32 (intOp (v8i16 V128:$Rn))), i16)), - (i32 (SMOVvi16to32 - (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), - (!cast<Instruction>(!strconcat(baseOpc, "v8i16v")) V128:$Rn), hsub), - (i64 0)))>; -def : Pat<(i32 (intOp (v8i16 V128:$Rn))), - (i32 (SMOVvi16to32 - (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), - (!cast<Instruction>(!strconcat(baseOpc, "v8i16v")) V128:$Rn), hsub), - (i64 0)))>; - -def : Pat<(i32 (intOp (v4i32 V128:$Rn))), - (i32 (EXTRACT_SUBREG - (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), - (!cast<Instruction>(!strconcat(baseOpc, "v4i32v")) V128:$Rn), ssub), - ssub))>; + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), + (!cast<Instruction>(!strconcat(baseOpc, "v8i16v")) V128:$Rn), hsub), + (i64 0)))>; } -multiclass SIMDAcrossLanesUnsignedIntrinsic<string baseOpc, Intrinsic intOp> { -// If there is a masking operation keeping only what has been actually -// generated, consume it. - def : Pat<(i32 (and (i32 (intOp (v8i8 V64:$Rn))), maski8_or_more)), - (i32 (EXTRACT_SUBREG - (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), - (!cast<Instruction>(!strconcat(baseOpc, "v8i8v")) V64:$Rn), bsub), - ssub))>; - def : Pat<(i32 (intOp (v8i8 V64:$Rn))), - (i32 (EXTRACT_SUBREG - (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), - (!cast<Instruction>(!strconcat(baseOpc, "v8i8v")) V64:$Rn), bsub), - ssub))>; +multiclass SIMDAcrossLanesUnsignedIntrinsic<string baseOpc, + SDPatternOperator opNode> + : SIMDAcrossLanesIntrinsic<baseOpc, opNode> { // If there is a masking operation keeping only what has been actually // generated, consume it. -def : Pat<(i32 (and (i32 (intOp (v16i8 V128:$Rn))), maski8_or_more)), - (i32 (EXTRACT_SUBREG - (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), - (!cast<Instruction>(!strconcat(baseOpc, "v16i8v")) V128:$Rn), bsub), - ssub))>; -def : Pat<(i32 (intOp (v16i8 V128:$Rn))), +def : Pat<(i32 (and (i32 (vector_extract (insert_subvector undef, + (opNode (v8i8 V64:$Rn)), (i32 0)), (i64 0))), maski8_or_more)), + (i32 (EXTRACT_SUBREG + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), + (!cast<Instruction>(!strconcat(baseOpc, "v8i8v")) V64:$Rn), bsub), + ssub))>; +def : Pat<(i32 (and (i32 (vector_extract (opNode (v16i8 V128:$Rn)), (i64 0))), + maski8_or_more)), (i32 (EXTRACT_SUBREG (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), (!cast<Instruction>(!strconcat(baseOpc, "v16i8v")) V128:$Rn), bsub), ssub))>; - -// If there is a masking operation keeping only what has been actually -// generated, consume it. -def : Pat<(i32 (and (i32 (intOp (v4i16 V64:$Rn))), maski16_or_more)), - (i32 (EXTRACT_SUBREG - (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), - (!cast<Instruction>(!strconcat(baseOpc, "v4i16v")) V64:$Rn), hsub), - ssub))>; -def : Pat<(i32 (intOp (v4i16 V64:$Rn))), +def : Pat<(i32 (and (i32 (vector_extract (insert_subvector undef, + (opNode (v4i16 V64:$Rn)), (i32 0)), (i64 0))), maski16_or_more)), (i32 (EXTRACT_SUBREG (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), (!cast<Instruction>(!strconcat(baseOpc, "v4i16v")) V64:$Rn), hsub), ssub))>; -// If there is a masking operation keeping only what has been actually -// generated, consume it. 
-def : Pat<(i32 (and (i32 (intOp (v8i16 V128:$Rn))), maski16_or_more)), - (i32 (EXTRACT_SUBREG - (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), - (!cast<Instruction>(!strconcat(baseOpc, "v8i16v")) V128:$Rn), hsub), - ssub))>; -def : Pat<(i32 (intOp (v8i16 V128:$Rn))), +def : Pat<(i32 (and (i32 (vector_extract (opNode (v8i16 V128:$Rn)), (i64 0))), + maski16_or_more)), (i32 (EXTRACT_SUBREG (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), (!cast<Instruction>(!strconcat(baseOpc, "v8i16v")) V128:$Rn), hsub), ssub))>; +} -def : Pat<(i32 (intOp (v4i32 V128:$Rn))), - (i32 (EXTRACT_SUBREG - (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), - (!cast<Instruction>(!strconcat(baseOpc, "v4i32v")) V128:$Rn), ssub), - ssub))>; +defm : SIMDAcrossLanesSignedIntrinsic<"ADDV", AArch64saddv>; +// vaddv_[su]32 is special; -> ADDP Vd.2S,Vn.2S,Vm.2S; return Vd.s[0];Vn==Vm +def : Pat<(v2i32 (AArch64saddv (v2i32 V64:$Rn))), + (ADDPv2i32 V64:$Rn, V64:$Rn)>; -} +defm : SIMDAcrossLanesUnsignedIntrinsic<"ADDV", AArch64uaddv>; +// vaddv_[su]32 is special; -> ADDP Vd.2S,Vn.2S,Vm.2S; return Vd.s[0];Vn==Vm +def : Pat<(v2i32 (AArch64uaddv (v2i32 V64:$Rn))), + (ADDPv2i32 V64:$Rn, V64:$Rn)>; + +defm : SIMDAcrossLanesSignedIntrinsic<"SMAXV", AArch64smaxv>; +def : Pat<(v2i32 (AArch64smaxv (v2i32 V64:$Rn))), + (SMAXPv2i32 V64:$Rn, V64:$Rn)>; + +defm : SIMDAcrossLanesSignedIntrinsic<"SMINV", AArch64sminv>; +def : Pat<(v2i32 (AArch64sminv (v2i32 V64:$Rn))), + (SMINPv2i32 V64:$Rn, V64:$Rn)>; + +defm : SIMDAcrossLanesUnsignedIntrinsic<"UMAXV", AArch64umaxv>; +def : Pat<(v2i32 (AArch64umaxv (v2i32 V64:$Rn))), + (UMAXPv2i32 V64:$Rn, V64:$Rn)>; + +defm : SIMDAcrossLanesUnsignedIntrinsic<"UMINV", AArch64uminv>; +def : Pat<(v2i32 (AArch64uminv (v2i32 V64:$Rn))), + (UMINPv2i32 V64:$Rn, V64:$Rn)>; multiclass SIMDAcrossLanesSignedLongIntrinsic<string baseOpc, Intrinsic intOp> { def : Pat<(i32 (intOp (v8i8 V64:$Rn))), @@ -3953,32 +4002,6 @@ def : Pat<(i64 (intOp (v4i32 V128:$Rn))), dsub))>; } -defm : SIMDAcrossLanesSignedIntrinsic<"ADDV", int_aarch64_neon_saddv>; -// vaddv_[su]32 is special; -> ADDP Vd.2S,Vn.2S,Vm.2S; return Vd.s[0];Vn==Vm -def : Pat<(i32 (int_aarch64_neon_saddv (v2i32 V64:$Rn))), - (EXTRACT_SUBREG (ADDPv2i32 V64:$Rn, V64:$Rn), ssub)>; - -defm : SIMDAcrossLanesUnsignedIntrinsic<"ADDV", int_aarch64_neon_uaddv>; -// vaddv_[su]32 is special; -> ADDP Vd.2S,Vn.2S,Vm.2S; return Vd.s[0];Vn==Vm -def : Pat<(i32 (int_aarch64_neon_uaddv (v2i32 V64:$Rn))), - (EXTRACT_SUBREG (ADDPv2i32 V64:$Rn, V64:$Rn), ssub)>; - -defm : SIMDAcrossLanesSignedIntrinsic<"SMAXV", int_aarch64_neon_smaxv>; -def : Pat<(i32 (int_aarch64_neon_smaxv (v2i32 V64:$Rn))), - (EXTRACT_SUBREG (SMAXPv2i32 V64:$Rn, V64:$Rn), ssub)>; - -defm : SIMDAcrossLanesSignedIntrinsic<"SMINV", int_aarch64_neon_sminv>; -def : Pat<(i32 (int_aarch64_neon_sminv (v2i32 V64:$Rn))), - (EXTRACT_SUBREG (SMINPv2i32 V64:$Rn, V64:$Rn), ssub)>; - -defm : SIMDAcrossLanesUnsignedIntrinsic<"UMAXV", int_aarch64_neon_umaxv>; -def : Pat<(i32 (int_aarch64_neon_umaxv (v2i32 V64:$Rn))), - (EXTRACT_SUBREG (UMAXPv2i32 V64:$Rn, V64:$Rn), ssub)>; - -defm : SIMDAcrossLanesUnsignedIntrinsic<"UMINV", int_aarch64_neon_uminv>; -def : Pat<(i32 (int_aarch64_neon_uminv (v2i32 V64:$Rn))), - (EXTRACT_SUBREG (UMINPv2i32 V64:$Rn, V64:$Rn), ssub)>; - defm : SIMDAcrossLanesSignedLongIntrinsic<"SADDLV", int_aarch64_neon_saddlv>; defm : SIMDAcrossLanesUnsignedLongIntrinsic<"UADDLV", int_aarch64_neon_uaddlv>; diff --git a/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp index 8463ce6..b1499e2 100644 --- 
a/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp +++ b/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp @@ -63,16 +63,24 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass { // If a matching instruction is found, MergeForward is set to true if the // merge is to remove the first instruction and replace the second with // a pair-wise insn, and false if the reverse is true. + // \p SExtIdx[out] gives the index of the result of the load pair that + // must be extended. The value of SExtIdx assumes that the paired load + // produces the value in this order: (I, returned iterator), i.e., + // -1 means no value has to be extended, 0 means I, and 1 means the + // returned iterator. MachineBasicBlock::iterator findMatchingInsn(MachineBasicBlock::iterator I, - bool &MergeForward, + bool &MergeForward, int &SExtIdx, unsigned Limit); // Merge the two instructions indicated into a single pair-wise instruction. // If MergeForward is true, erase the first instruction and fold its // operation into the second. If false, the reverse. Return the instruction // following the first instruction (which may change during processing). + // \p SExtIdx index of the result that must be extended for a paired load. + // -1 means none, 0 means I, and 1 means Paired. MachineBasicBlock::iterator mergePairedInsns(MachineBasicBlock::iterator I, - MachineBasicBlock::iterator Paired, bool MergeForward); + MachineBasicBlock::iterator Paired, bool MergeForward, + int SExtIdx); // Scan the instruction list to find a base register update that can // be combined with the current instruction (a load or store) using @@ -181,6 +189,43 @@ int AArch64LoadStoreOpt::getMemSize(MachineInstr *MemMI) { } } +static unsigned getMatchingNonSExtOpcode(unsigned Opc, + bool *IsValidLdStrOpc = nullptr) { + if (IsValidLdStrOpc) + *IsValidLdStrOpc = true; + switch (Opc) { + default: + if (IsValidLdStrOpc) + *IsValidLdStrOpc = false; + return UINT_MAX; + case AArch64::STRDui: + case AArch64::STURDi: + case AArch64::STRQui: + case AArch64::STURQi: + case AArch64::STRWui: + case AArch64::STURWi: + case AArch64::STRXui: + case AArch64::STURXi: + case AArch64::LDRDui: + case AArch64::LDURDi: + case AArch64::LDRQui: + case AArch64::LDURQi: + case AArch64::LDRWui: + case AArch64::LDURWi: + case AArch64::LDRXui: + case AArch64::LDURXi: + case AArch64::STRSui: + case AArch64::STURSi: + case AArch64::LDRSui: + case AArch64::LDURSi: + return Opc; + case AArch64::LDRSWui: + return AArch64::LDRWui; + case AArch64::LDURSWi: + return AArch64::LDURWi; + } +} + static unsigned getMatchingPairOpcode(unsigned Opc) { switch (Opc) { default: @@ -282,7 +327,7 @@ static unsigned getPostIndexedOpcode(unsigned Opc) { MachineBasicBlock::iterator AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I, MachineBasicBlock::iterator Paired, - bool MergeForward) { + bool MergeForward, int SExtIdx) { MachineBasicBlock::iterator NextI = I; ++NextI; // If NextI is the second of the two instructions to be merged, we need @@ -292,11 +337,13 @@ AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I, if (NextI == Paired) ++NextI; - bool IsUnscaled = isUnscaledLdst(I->getOpcode()); + unsigned Opc = + SExtIdx == -1 ? I->getOpcode() : getMatchingNonSExtOpcode(I->getOpcode()); + bool IsUnscaled = isUnscaledLdst(Opc); int OffsetStride = IsUnscaled && EnableAArch64UnscaledMemOp ? 
getMemSize(I) : 1; - unsigned NewOpc = getMatchingPairOpcode(I->getOpcode()); + unsigned NewOpc = getMatchingPairOpcode(Opc); // Insert our new paired instruction after whichever of the paired // instructions MergeForward indicates. MachineBasicBlock::iterator InsertionPoint = MergeForward ? Paired : I; @@ -311,6 +358,11 @@ AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I, Paired->getOperand(2).getImm() + OffsetStride) { RtMI = Paired; Rt2MI = I; + // Here we swapped the assumption made for SExtIdx. + // I.e., we turn ldp I, Paired into ldp Paired, I. + // Update the index accordingly. + if (SExtIdx != -1) + SExtIdx = (SExtIdx + 1) % 2; } else { RtMI = I; Rt2MI = Paired; @@ -337,8 +389,47 @@ AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I, DEBUG(dbgs() << " "); DEBUG(Paired->print(dbgs())); DEBUG(dbgs() << " with instruction:\n "); - DEBUG(((MachineInstr *)MIB)->print(dbgs())); - DEBUG(dbgs() << "\n"); + + if (SExtIdx != -1) { + // Generate the sign extension for the proper result of the ldp. + // I.e., with X1, that would be: + // %W1<def> = KILL %W1, %X1<imp-def> + // %X1<def> = SBFMXri %X1<kill>, 0, 31 + MachineOperand &DstMO = MIB->getOperand(SExtIdx); + // Right now, DstMO has the extended register, since it comes from an + // extended opcode. + unsigned DstRegX = DstMO.getReg(); + // Get the W variant of that register. + unsigned DstRegW = TRI->getSubReg(DstRegX, AArch64::sub_32); + // Update the result of LDP to use the W instead of the X variant. + DstMO.setReg(DstRegW); + DEBUG(((MachineInstr *)MIB)->print(dbgs())); + DEBUG(dbgs() << "\n"); + // Make the machine verifier happy by providing a definition for + // the X register. + // Insert this definition right after the generated LDP, i.e., before + // InsertionPoint. + MachineInstrBuilder MIBKill = + BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(), + TII->get(TargetOpcode::KILL), DstRegW) + .addReg(DstRegW) + .addReg(DstRegX, RegState::Define); + MIBKill->getOperand(2).setImplicit(); + // Create the sign extension. + MachineInstrBuilder MIBSXTW = + BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(), + TII->get(AArch64::SBFMXri), DstRegX) + .addReg(DstRegX) + .addImm(0) + .addImm(31); + (void)MIBSXTW; + DEBUG(dbgs() << " Extend operand:\n "); + DEBUG(((MachineInstr *)MIBSXTW)->print(dbgs())); + DEBUG(dbgs() << "\n"); + } else { + DEBUG(((MachineInstr *)MIB)->print(dbgs())); + DEBUG(dbgs() << "\n"); + } // Erase the old instructions. I->eraseFromParent(); @@ -396,7 +487,8 @@ static int alignTo(int Num, int PowOf2) { /// be combined with the current instruction into a load/store pair. MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I, - bool &MergeForward, unsigned Limit) { + bool &MergeForward, int &SExtIdx, + unsigned Limit) { MachineBasicBlock::iterator E = I->getParent()->end(); MachineBasicBlock::iterator MBBI = I; MachineInstr *FirstMI = I; @@ -436,7 +528,19 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I, // Now that we know this is a real instruction, count it. ++Count; - if (Opc == MI->getOpcode() && MI->getOperand(2).isImm()) { + bool CanMergeOpc = Opc == MI->getOpcode(); + SExtIdx = -1; + if (!CanMergeOpc) { + bool IsValidLdStrOpc; + unsigned NonSExtOpc = getMatchingNonSExtOpcode(Opc, &IsValidLdStrOpc); + if (!IsValidLdStrOpc) + continue; + // Opc will be the first instruction in the pair. + SExtIdx = NonSExtOpc == (unsigned)Opc ? 
1 : 0; + CanMergeOpc = NonSExtOpc == getMatchingNonSExtOpcode(MI->getOpcode()); + } + + if (CanMergeOpc && MI->getOperand(2).isImm()) { // If we've found another instruction with the same opcode, check to see // if the base and offset are compatible with our starting instruction. // These instructions all have scaled immediate operands, so we just @@ -823,13 +927,14 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB) { } // Look ahead up to ScanLimit instructions for a pairable instruction. bool MergeForward = false; + int SExtIdx = -1; MachineBasicBlock::iterator Paired = - findMatchingInsn(MBBI, MergeForward, ScanLimit); + findMatchingInsn(MBBI, MergeForward, SExtIdx, ScanLimit); if (Paired != E) { // Merge the loads into a pair. Keeping the iterator straight is a // pain, so we let the merge routine tell us what the next instruction // is after it's done mucking about. - MBBI = mergePairedInsns(MBBI, Paired, MergeForward); + MBBI = mergePairedInsns(MBBI, Paired, MergeForward, SExtIdx); Modified = true; ++NumPairCreated; diff --git a/lib/Target/AArch64/AArch64MCInstLower.cpp b/lib/Target/AArch64/AArch64MCInstLower.cpp index e57b0f4..b829341 100644 --- a/lib/Target/AArch64/AArch64MCInstLower.cpp +++ b/lib/Target/AArch64/AArch64MCInstLower.cpp @@ -22,9 +22,12 @@ #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInst.h" #include "llvm/Support/CodeGen.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Target/TargetMachine.h" using namespace llvm; +extern cl::opt<bool> EnableAArch64ELFLocalDynamicTLSGeneration; + AArch64MCInstLower::AArch64MCInstLower(MCContext &ctx, AsmPrinter &printer) : Ctx(ctx), Printer(printer), TargetTriple(printer.getTargetTriple()) {} @@ -84,10 +87,16 @@ MCOperand AArch64MCInstLower::lowerSymbolOperandELF(const MachineOperand &MO, if (MO.isGlobal()) { const GlobalValue *GV = MO.getGlobal(); Model = Printer.TM.getTLSModel(GV); + if (!EnableAArch64ELFLocalDynamicTLSGeneration && + Model == TLSModel::LocalDynamic) + Model = TLSModel::GeneralDynamic; + } else { assert(MO.isSymbol() && StringRef(MO.getSymbolName()) == "_TLS_MODULE_BASE_" && "unexpected external TLS symbol"); + // The general dynamic access sequence is used to get the + // address of _TLS_MODULE_BASE_. 
Model = TLSModel::GeneralDynamic; } switch (Model) { @@ -123,6 +132,8 @@ MCOperand AArch64MCInstLower::lowerSymbolOperandELF(const MachineOperand &MO, RefFlags |= AArch64MCExpr::VK_G1; else if ((MO.getTargetFlags() & AArch64II::MO_FRAGMENT) == AArch64II::MO_G0) RefFlags |= AArch64MCExpr::VK_G0; + else if ((MO.getTargetFlags() & AArch64II::MO_FRAGMENT) == AArch64II::MO_HI12) + RefFlags |= AArch64MCExpr::VK_HI12; if (MO.getTargetFlags() & AArch64II::MO_NC) RefFlags |= AArch64MCExpr::VK_NC; diff --git a/lib/Target/AArch64/AArch64PBQPRegAlloc.cpp b/lib/Target/AArch64/AArch64PBQPRegAlloc.cpp index 4690177..5394875 100644 --- a/lib/Target/AArch64/AArch64PBQPRegAlloc.cpp +++ b/lib/Target/AArch64/AArch64PBQPRegAlloc.cpp @@ -319,7 +319,7 @@ void A57ChainingConstraint::addInterChainConstraint(PBQPRAGraph &G, unsigned Rd, static bool regJustKilledBefore(const LiveIntervals &LIs, unsigned reg, const MachineInstr &MI) { - LiveInterval LI = LIs.getInterval(reg); + const LiveInterval &LI = LIs.getInterval(reg); SlotIndex SI = LIs.getInstructionIndex(&MI); return LI.expiredAt(SI); } diff --git a/lib/Target/AArch64/AArch64PromoteConstant.cpp b/lib/Target/AArch64/AArch64PromoteConstant.cpp index c037c86..e1b93bf 100644 --- a/lib/Target/AArch64/AArch64PromoteConstant.cpp +++ b/lib/Target/AArch64/AArch64PromoteConstant.cpp @@ -38,6 +38,7 @@ #include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" using namespace llvm; @@ -189,9 +190,11 @@ private: IPI->second.push_back(&Use); // Transfer the dominated uses of IPI to NewPt // Inserting into the DenseMap may invalidate existing iterator. - // Keep a copy of the key to find the iterator to erase. + // Keep a copy of the key to find the iterator to erase. Keep a copy of the + // value so that we don't have to dereference IPI->second. Instruction *OldInstr = IPI->first; - InsertPts[NewPt] = std::move(IPI->second); + Uses OldUses = std::move(IPI->second); + InsertPts[NewPt] = std::move(OldUses); // Erase IPI. 
InsertPts.erase(OldInstr); } diff --git a/lib/Target/AArch64/AArch64RegisterInfo.cpp b/lib/Target/AArch64/AArch64RegisterInfo.cpp index 206cdbb..33c11fe 100644 --- a/lib/Target/AArch64/AArch64RegisterInfo.cpp +++ b/lib/Target/AArch64/AArch64RegisterInfo.cpp @@ -18,6 +18,7 @@ #include "AArch64Subtarget.h" #include "MCTargetDesc/AArch64AddressingModes.h" #include "llvm/ADT/BitVector.h" +#include "llvm/ADT/Triple.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" @@ -37,9 +38,8 @@ static cl::opt<bool> ReserveX18("aarch64-reserve-x18", cl::Hidden, cl::desc("Reserve X18, making it unavailable as GPR")); -AArch64RegisterInfo::AArch64RegisterInfo(const AArch64InstrInfo *tii, - const AArch64Subtarget *sti) - : AArch64GenRegisterInfo(AArch64::LR), TII(tii), STI(sti) {} +AArch64RegisterInfo::AArch64RegisterInfo(const Triple &TT) + : AArch64GenRegisterInfo(AArch64::LR), TT(TT) {} const MCPhysReg * AArch64RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { @@ -55,7 +55,8 @@ AArch64RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { } const uint32_t * -AArch64RegisterInfo::getCallPreservedMask(CallingConv::ID CC) const { +AArch64RegisterInfo::getCallPreservedMask(const MachineFunction &MF, + CallingConv::ID CC) const { if (CC == CallingConv::GHC) // This is academic because all GHC calls are (supposed to be) tail calls return CSR_AArch64_NoRegs_RegMask; @@ -66,15 +67,16 @@ AArch64RegisterInfo::getCallPreservedMask(CallingConv::ID CC) const { } const uint32_t *AArch64RegisterInfo::getTLSCallPreservedMask() const { - if (STI->isTargetDarwin()) + if (TT.isOSDarwin()) return CSR_AArch64_TLS_Darwin_RegMask; - assert(STI->isTargetELF() && "only expect Darwin or ELF TLS"); + assert(TT.isOSBinFormatELF() && "only expect Darwin or ELF TLS"); return CSR_AArch64_TLS_ELF_RegMask; } const uint32_t * -AArch64RegisterInfo::getThisReturnPreservedMask(CallingConv::ID CC) const { +AArch64RegisterInfo::getThisReturnPreservedMask(const MachineFunction &MF, + CallingConv::ID CC) const { // This should return a register mask that is the same as that returned by // getCallPreservedMask but that additionally preserves the register used for // the first i64 argument (which must also be the register used to return a @@ -97,12 +99,12 @@ AArch64RegisterInfo::getReservedRegs(const MachineFunction &MF) const { Reserved.set(AArch64::WSP); Reserved.set(AArch64::WZR); - if (TFI->hasFP(MF) || STI->isTargetDarwin()) { + if (TFI->hasFP(MF) || TT.isOSDarwin()) { Reserved.set(AArch64::FP); Reserved.set(AArch64::W29); } - if (STI->isTargetDarwin() || ReserveX18) { + if (TT.isOSDarwin() || ReserveX18) { Reserved.set(AArch64::X18); // Platform register Reserved.set(AArch64::W18); } @@ -129,10 +131,10 @@ bool AArch64RegisterInfo::isReservedReg(const MachineFunction &MF, return true; case AArch64::X18: case AArch64::W18: - return STI->isTargetDarwin() || ReserveX18; + return TT.isOSDarwin() || ReserveX18; case AArch64::FP: case AArch64::W29: - return TFI->hasFP(MF) || STI->isTargetDarwin(); + return TFI->hasFP(MF) || TT.isOSDarwin(); case AArch64::W19: case AArch64::X19: return hasBasePointer(MF); @@ -269,7 +271,7 @@ bool AArch64RegisterInfo::needsFrameBaseReg(MachineInstr *MI, // The FP is only available if there is no dynamic realignment. We // don't know for sure yet whether we'll need that, so we guess based // on whether there are any local variables that would trigger it.
- if (TFI->hasFP(MF) && isFrameOffsetLegal(MI, FPOffset)) + if (TFI->hasFP(MF) && isFrameOffsetLegal(MI, AArch64::FP, FPOffset)) return false; // If we can reference via the stack pointer or base pointer, try that. @@ -277,7 +279,7 @@ bool AArch64RegisterInfo::needsFrameBaseReg(MachineInstr *MI, // to only disallow SP relative references in the live range of // the VLA(s). In practice, it's unclear how much difference that // would make, but it may be worth doing. - if (isFrameOffsetLegal(MI, Offset)) + if (isFrameOffsetLegal(MI, AArch64::SP, Offset)) return false; // The offset likely isn't legal; we want to allocate a virtual base register. @@ -285,6 +287,7 @@ bool AArch64RegisterInfo::needsFrameBaseReg(MachineInstr *MI, } bool AArch64RegisterInfo::isFrameOffsetLegal(const MachineInstr *MI, + unsigned BaseReg, int64_t Offset) const { assert(Offset <= INT_MAX && "Offset too big to fit in int."); assert(MI && "Unable to get the legal offset for nil instruction."); @@ -302,10 +305,11 @@ void AArch64RegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB, DebugLoc DL; // Defaults to "unknown" if (Ins != MBB->end()) DL = Ins->getDebugLoc(); - + const MachineFunction &MF = *MBB->getParent(); + const AArch64InstrInfo *TII = + MF.getSubtarget<AArch64Subtarget>().getInstrInfo(); const MCInstrDesc &MCID = TII->get(AArch64::ADDXri); MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); - const MachineFunction &MF = *MBB->getParent(); MRI.constrainRegClass(BaseReg, TII->getRegClass(MCID, 0, this, MF)); unsigned Shifter = AArch64_AM::getShifterImm(AArch64_AM::LSL, 0); @@ -324,6 +328,9 @@ void AArch64RegisterInfo::resolveFrameIndex(MachineInstr &MI, unsigned BaseReg, ++i; assert(i < MI.getNumOperands() && "Instr doesn't have FrameIndex operand!"); } + const MachineFunction *MF = MI.getParent()->getParent(); + const AArch64InstrInfo *TII = + MF->getSubtarget<AArch64Subtarget>().getInstrInfo(); bool Done = rewriteAArch64FrameIndex(MI, i, BaseReg, Off, TII); assert(Done && "Unable to resolve frame index!"); (void)Done; @@ -337,6 +344,8 @@ void AArch64RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, MachineInstr &MI = *II; MachineBasicBlock &MBB = *MI.getParent(); MachineFunction &MF = *MBB.getParent(); + const AArch64InstrInfo *TII = + MF.getSubtarget<AArch64Subtarget>().getInstrInfo(); const AArch64FrameLowering *TFI = static_cast<const AArch64FrameLowering *>( MF.getSubtarget().getFrameLowering()); @@ -389,10 +398,10 @@ unsigned AArch64RegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, case AArch64::GPR64RegClassID: case AArch64::GPR32commonRegClassID: case AArch64::GPR64commonRegClassID: - return 32 - 1 // XZR/SP - - (TFI->hasFP(MF) || STI->isTargetDarwin()) // FP - - (STI->isTargetDarwin() || ReserveX18) // X18 reserved as platform register - - hasBasePointer(MF); // X19 + return 32 - 1 // XZR/SP + - (TFI->hasFP(MF) || TT.isOSDarwin()) // FP + - (TT.isOSDarwin() || ReserveX18) // X18 reserved as platform register + - hasBasePointer(MF); // X19 case AArch64::FPR8RegClassID: case AArch64::FPR16RegClassID: case AArch64::FPR32RegClassID: diff --git a/lib/Target/AArch64/AArch64RegisterInfo.h b/lib/Target/AArch64/AArch64RegisterInfo.h index 51a5034..c01bfa5 100644 --- a/lib/Target/AArch64/AArch64RegisterInfo.h +++ b/lib/Target/AArch64/AArch64RegisterInfo.h @@ -19,26 +19,24 @@ namespace llvm { -class AArch64InstrInfo; -class AArch64Subtarget; class MachineFunction; class RegScavenger; class TargetRegisterClass; +class Triple; struct AArch64RegisterInfo : public 
AArch64GenRegisterInfo { private: - const AArch64InstrInfo *TII; - const AArch64Subtarget *STI; + const Triple &TT; public: - AArch64RegisterInfo(const AArch64InstrInfo *tii, const AArch64Subtarget *sti); + AArch64RegisterInfo(const Triple &TT); bool isReservedReg(const MachineFunction &MF, unsigned Reg) const; /// Code Generation virtual methods... - const MCPhysReg * - getCalleeSavedRegs(const MachineFunction *MF = nullptr) const override; - const uint32_t *getCallPreservedMask(CallingConv::ID) const override; + const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF) const override; + const uint32_t *getCallPreservedMask(const MachineFunction &MF, + CallingConv::ID) const override; unsigned getCSRFirstUseCost() const override { // The cost will be compared against BlockFrequency where entry has the @@ -59,7 +57,8 @@ public: /// /// Should return NULL in the case that the calling convention does not have /// this property - const uint32_t *getThisReturnPreservedMask(CallingConv::ID) const; + const uint32_t *getThisReturnPreservedMask(const MachineFunction &MF, + CallingConv::ID) const; BitVector getReservedRegs(const MachineFunction &MF) const override; const TargetRegisterClass * @@ -73,7 +72,7 @@ public: bool requiresFrameIndexScavenging(const MachineFunction &MF) const override; bool needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const override; - bool isFrameOffsetLegal(const MachineInstr *MI, + bool isFrameOffsetLegal(const MachineInstr *MI, unsigned BaseReg, int64_t Offset) const override; void materializeFrameBaseRegister(MachineBasicBlock *MBB, unsigned BaseReg, int FrameIdx, diff --git a/lib/Target/AArch64/AArch64Subtarget.cpp b/lib/Target/AArch64/AArch64Subtarget.cpp index c613025..221d70d 100644 --- a/lib/Target/AArch64/AArch64Subtarget.cpp +++ b/lib/Target/AArch64/AArch64Subtarget.cpp @@ -48,7 +48,7 @@ AArch64Subtarget::AArch64Subtarget(const std::string &TT, const TargetMachine &TM, bool LittleEndian) : AArch64GenSubtargetInfo(TT, CPU, FS), ARMProcFamily(Others), HasFPARMv8(false), HasNEON(false), HasCrypto(false), HasCRC(false), - HasZeroCycleRegMove(false), HasZeroCycleZeroing(false), + HasV8_1a(false), HasZeroCycleRegMove(false), HasZeroCycleZeroing(false), IsLittle(LittleEndian), CPUString(CPU), TargetTriple(TT), FrameLowering(), InstrInfo(initializeSubtargetDependencies(FS)), TSInfo(TM.getDataLayout()), TLInfo(TM, *this) {} diff --git a/lib/Target/AArch64/AArch64Subtarget.h b/lib/Target/AArch64/AArch64Subtarget.h index d418cc5..bcab97d 100644 --- a/lib/Target/AArch64/AArch64Subtarget.h +++ b/lib/Target/AArch64/AArch64Subtarget.h @@ -41,6 +41,7 @@ protected: bool HasNEON; bool HasCrypto; bool HasCRC; + bool HasV8_1a; // HasZeroCycleRegMove - Has zero-cycle register mov instructions. 
bool HasZeroCycleRegMove; @@ -86,6 +87,7 @@ public: const AArch64RegisterInfo *getRegisterInfo() const override { return &getInstrInfo()->getRegisterInfo(); } + const Triple &getTargetTriple() const { return TargetTriple; } bool enableMachineScheduler() const override { return true; } bool enablePostMachineScheduler() const override { return isCortexA53() || isCortexA57(); @@ -99,6 +101,7 @@ public: bool hasNEON() const { return HasNEON; } bool hasCrypto() const { return HasCrypto; } bool hasCRC() const { return HasCRC; } + bool hasV8_1a() const { return HasV8_1a; } bool isLittleEndian() const { return IsLittle; } diff --git a/lib/Target/AArch64/AArch64TargetMachine.cpp b/lib/Target/AArch64/AArch64TargetMachine.cpp index d73d0b3..f902f64 100644 --- a/lib/Target/AArch64/AArch64TargetMachine.cpp +++ b/lib/Target/AArch64/AArch64TargetMachine.cpp @@ -104,6 +104,16 @@ static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) { return make_unique<AArch64_ELFTargetObjectFile>(); } +// Helper function to build a DataLayout string +static std::string computeDataLayout(StringRef TT, bool LittleEndian) { + Triple Triple(TT); + if (Triple.isOSBinFormatMachO()) + return "e-m:o-i64:64-i128:128-n32:64-S128"; + if (LittleEndian) + return "e-m:e-i64:64-i128:128-n32:64-S128"; + return "E-m:e-i64:64-i128:128-n32:64-S128"; +} + /// TargetMachine ctor - Create an AArch64 architecture model. /// AArch64TargetMachine::AArch64TargetMachine(const Target &T, StringRef TT, @@ -112,16 +122,12 @@ AArch64TargetMachine::AArch64TargetMachine(const Target &T, StringRef TT, Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL, bool LittleEndian) - : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL), - // This nested ternary is horrible, but DL needs to be properly - // initialized - // before TLInfo is constructed. - DL(Triple(TT).isOSBinFormatMachO() - ? "e-m:o-i64:64-i128:128-n32:64-S128" - : (LittleEndian ? "e-m:e-i64:64-i128:128-n32:64-S128" - : "E-m:e-i64:64-i128:128-n32:64-S128")), + // This nested ternary is horrible, but DL needs to be properly + // initialized before TLInfo is constructed. + : LLVMTargetMachine(T, computeDataLayout(TT, LittleEndian), TT, CPU, FS, + Options, RM, CM, OL), TLOF(createTLOF(Triple(getTargetTriple()))), - Subtarget(TT, CPU, FS, *this, LittleEndian), isLittle(LittleEndian) { + isLittle(LittleEndian) { initAsmInfo(); } @@ -239,7 +245,7 @@ bool AArch64PassConfig::addPreISel() { // FIXME: On AArch64, this depends on the type. // Basically, the addressable offsets are up to 4095 * Ty.getSizeInBytes(). // and the offset has to be a multiple of the related size in bytes. - if (TM->getOptLevel() != CodeGenOpt::None) + if (TM->getOptLevel() == CodeGenOpt::Aggressive) addPass(createGlobalMergePass(TM, 4095)); if (TM->getOptLevel() != CodeGenOpt::None) addPass(createAArch64AddressTypePromotionPass()); @@ -287,10 +293,7 @@ void AArch64PassConfig::addPostRegAlloc() { // Change dead register definitions to refer to the zero register. if (TM->getOptLevel() != CodeGenOpt::None && EnableDeadRegisterElimination) addPass(createAArch64DeadRegisterDefinitions()); - if (TM->getOptLevel() != CodeGenOpt::None && - (TM->getSubtarget<AArch64Subtarget>().isCortexA53() || - TM->getSubtarget<AArch64Subtarget>().isCortexA57()) && - usingDefaultRegAlloc()) + if (TM->getOptLevel() != CodeGenOpt::None && usingDefaultRegAlloc()) // Improve performance for some FP/SIMD code for A57. 
addPass(createAArch64A57FPLoadBalancing()); } diff --git a/lib/Target/AArch64/AArch64TargetMachine.h b/lib/Target/AArch64/AArch64TargetMachine.h index 7143adf..ec34fad 100644 --- a/lib/Target/AArch64/AArch64TargetMachine.h +++ b/lib/Target/AArch64/AArch64TargetMachine.h @@ -23,9 +23,7 @@ namespace llvm { class AArch64TargetMachine : public LLVMTargetMachine { protected: - const DataLayout DL; std::unique_ptr<TargetLoweringObjectFile> TLOF; - AArch64Subtarget Subtarget; mutable StringMap<std::unique_ptr<AArch64Subtarget>> SubtargetMap; public: @@ -35,11 +33,6 @@ public: CodeGenOpt::Level OL, bool IsLittleEndian); ~AArch64TargetMachine() override; - - const DataLayout *getDataLayout() const override { return &DL; } - const AArch64Subtarget *getSubtargetImpl() const override { - return &Subtarget; - } const AArch64Subtarget *getSubtargetImpl(const Function &F) const override; // Pass Pipeline Configuration diff --git a/lib/Target/AArch64/AArch64TargetObjectFile.cpp b/lib/Target/AArch64/AArch64TargetObjectFile.cpp index 4069038..8ff58e9 100644 --- a/lib/Target/AArch64/AArch64TargetObjectFile.cpp +++ b/lib/Target/AArch64/AArch64TargetObjectFile.cpp @@ -13,6 +13,7 @@ #include "llvm/MC/MCContext.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCStreamer.h" +#include "llvm/MC/MCValue.h" #include "llvm/Support/Dwarf.h" using namespace llvm; using namespace dwarf; @@ -23,6 +24,11 @@ void AArch64_ELFTargetObjectFile::Initialize(MCContext &Ctx, InitializeELF(TM.Options.UseInitArray); } +AArch64_MachoTargetObjectFile::AArch64_MachoTargetObjectFile() + : TargetLoweringObjectFileMachO() { + SupportGOTPCRelWithOffset = false; +} + const MCExpr *AArch64_MachoTargetObjectFile::getTTypeGlobalReference( const GlobalValue *GV, unsigned Encoding, Mangler &Mang, const TargetMachine &TM, MachineModuleInfo *MMI, @@ -50,3 +56,18 @@ MCSymbol *AArch64_MachoTargetObjectFile::getCFIPersonalitySymbol( MachineModuleInfo *MMI) const { return TM.getSymbol(GV, Mang); } + +const MCExpr *AArch64_MachoTargetObjectFile::getIndirectSymViaGOTPCRel( + const MCSymbol *Sym, const MCValue &MV, int64_t Offset, + MachineModuleInfo *MMI, MCStreamer &Streamer) const { + assert((Offset+MV.getConstant() == 0) && + "AArch64 does not support GOT PC rel with extra offset"); + // On ARM64 Darwin, we can reference symbols with foo@GOT-., which + // is an indirect pc-relative reference. + const MCExpr *Res = + MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_GOT, getContext()); + MCSymbol *PCSym = getContext().CreateTempSymbol(); + Streamer.EmitLabel(PCSym); + const MCExpr *PC = MCSymbolRefExpr::Create(PCSym, getContext()); + return MCBinaryExpr::CreateSub(Res, PC, getContext()); +} diff --git a/lib/Target/AArch64/AArch64TargetObjectFile.h b/lib/Target/AArch64/AArch64TargetObjectFile.h index 2e595f9..d41f445 100644 --- a/lib/Target/AArch64/AArch64TargetObjectFile.h +++ b/lib/Target/AArch64/AArch64TargetObjectFile.h @@ -24,6 +24,8 @@ class AArch64_ELFTargetObjectFile : public TargetLoweringObjectFileELF { /// AArch64_MachoTargetObjectFile - This TLOF implementation is used for Darwin.
class AArch64_MachoTargetObjectFile : public TargetLoweringObjectFileMachO { public: + AArch64_MachoTargetObjectFile(); + const MCExpr *getTTypeGlobalReference(const GlobalValue *GV, unsigned Encoding, Mangler &Mang, const TargetMachine &TM, @@ -33,6 +35,11 @@ public: MCSymbol *getCFIPersonalitySymbol(const GlobalValue *GV, Mangler &Mang, const TargetMachine &TM, MachineModuleInfo *MMI) const override; + + const MCExpr *getIndirectSymViaGOTPCRel(const MCSymbol *Sym, + const MCValue &MV, int64_t Offset, + MachineModuleInfo *MMI, + MCStreamer &Streamer) const override; }; } // end namespace llvm diff --git a/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 0646d85..0533355 100644 --- a/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -10,6 +10,7 @@ #include "AArch64TargetTransformInfo.h" #include "MCTargetDesc/AArch64AddressingModes.h" #include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/Analysis/LoopInfo.h" #include "llvm/CodeGen/BasicTTIImpl.h" #include "llvm/Support/Debug.h" #include "llvm/Target/CostTable.h" @@ -352,7 +353,7 @@ unsigned AArch64TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, // We don't lower vector selects well that are wider than the register width. if (ValTy->isVectorTy() && ISD == ISD::SELECT) { // We would need this many instructions to hide the scalarization happening. - unsigned AmortizationCost = 20; + const unsigned AmortizationCost = 20; static const TypeConversionCostTblEntry<MVT::SimpleValueType> VectorSelectTbl[] = { { ISD::SELECT, MVT::v16i1, MVT::v16i16, 16 * AmortizationCost }, @@ -426,6 +427,15 @@ unsigned AArch64TTIImpl::getMaxInterleaveFactor() { void AArch64TTIImpl::getUnrollingPreferences(Loop *L, TTI::UnrollingPreferences &UP) { + // Enable partial unrolling and runtime unrolling. + BaseT::getUnrollingPreferences(L, UP); + + // An inner loop is more likely to be hot, and its runtime check can be + // hoisted out by the LICM pass, so the overhead is lower; try a larger + // threshold to unroll more loops. + if (L->getLoopDepth() > 1) + UP.PartialThreshold *= 2; + // Disable partial & runtime unrolling on -Os. UP.PartialOptSizeThreshold = 0; } diff --git a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp index 1960c99..1219ffc 100644 --- a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp +++ b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp @@ -113,11 +113,10 @@ public: #define GET_OPERAND_DIAGNOSTIC_TYPES #include "AArch64GenAsmMatcher.inc" }; - AArch64AsmParser(MCSubtargetInfo &_STI, MCAsmParser &_Parser, - const MCInstrInfo &MII, - const MCTargetOptions &Options) - : MCTargetAsmParser(), STI(_STI) { - MCAsmParserExtension::Initialize(_Parser); + AArch64AsmParser(MCSubtargetInfo &STI, MCAsmParser &Parser, + const MCInstrInfo &MII, const MCTargetOptions &Options) - : MCTargetAsmParser(), STI(STI) { + MCAsmParserExtension::Initialize(Parser); MCStreamer &S = getParser().getStreamer(); if (S.getTargetStreamer() == nullptr) new AArch64TargetStreamer(S); @@ -205,6 +204,8 @@ private: struct BarrierOp { unsigned Val; // Not the enum since not all values have names.
+ const char *Data; + unsigned Length; }; struct SysRegOp { @@ -221,6 +222,8 @@ private: struct PrefetchOp { unsigned Val; + const char *Data; + unsigned Length; }; struct ShiftExtendOp { @@ -254,8 +257,7 @@ private: MCContext &Ctx; public: - AArch64Operand(KindTy K, MCContext &_Ctx) - : MCParsedAsmOperand(), Kind(K), Ctx(_Ctx) {} + AArch64Operand(KindTy K, MCContext &Ctx) : Kind(K), Ctx(Ctx) {} AArch64Operand(const AArch64Operand &o) : MCParsedAsmOperand(), Ctx(o.Ctx) { Kind = o.Kind; @@ -349,6 +351,11 @@ public: return Barrier.Val; } + StringRef getBarrierName() const { + assert(Kind == k_Barrier && "Invalid access!"); + return StringRef(Barrier.Data, Barrier.Length); + } + unsigned getReg() const override { assert(Kind == k_Register && "Invalid access!"); return Reg.RegNum; @@ -384,6 +391,11 @@ public: return Prefetch.Val; } + StringRef getPrefetchName() const { + assert(Kind == k_Prefetch && "Invalid access!"); + return StringRef(Prefetch.Data, Prefetch.Length); + } + AArch64_AM::ShiftExtendType getShiftExtendType() const { assert(Kind == k_ShiftExtend && "Invalid access!"); return ShiftExtend.Type; @@ -752,58 +764,47 @@ public: } bool isMovZSymbolG3() const { - static AArch64MCExpr::VariantKind Variants[] = { AArch64MCExpr::VK_ABS_G3 }; - return isMovWSymbol(Variants); + return isMovWSymbol(AArch64MCExpr::VK_ABS_G3); } bool isMovZSymbolG2() const { - static AArch64MCExpr::VariantKind Variants[] = { - AArch64MCExpr::VK_ABS_G2, AArch64MCExpr::VK_ABS_G2_S, - AArch64MCExpr::VK_TPREL_G2, AArch64MCExpr::VK_DTPREL_G2}; - return isMovWSymbol(Variants); + return isMovWSymbol({AArch64MCExpr::VK_ABS_G2, AArch64MCExpr::VK_ABS_G2_S, + AArch64MCExpr::VK_TPREL_G2, + AArch64MCExpr::VK_DTPREL_G2}); } bool isMovZSymbolG1() const { - static AArch64MCExpr::VariantKind Variants[] = { - AArch64MCExpr::VK_ABS_G1, AArch64MCExpr::VK_ABS_G1_S, + return isMovWSymbol({ + AArch64MCExpr::VK_ABS_G1, AArch64MCExpr::VK_ABS_G1_S, AArch64MCExpr::VK_GOTTPREL_G1, AArch64MCExpr::VK_TPREL_G1, AArch64MCExpr::VK_DTPREL_G1, - }; - return isMovWSymbol(Variants); + }); } bool isMovZSymbolG0() const { - static AArch64MCExpr::VariantKind Variants[] = { - AArch64MCExpr::VK_ABS_G0, AArch64MCExpr::VK_ABS_G0_S, - AArch64MCExpr::VK_TPREL_G0, AArch64MCExpr::VK_DTPREL_G0}; - return isMovWSymbol(Variants); + return isMovWSymbol({AArch64MCExpr::VK_ABS_G0, AArch64MCExpr::VK_ABS_G0_S, + AArch64MCExpr::VK_TPREL_G0, + AArch64MCExpr::VK_DTPREL_G0}); } bool isMovKSymbolG3() const { - static AArch64MCExpr::VariantKind Variants[] = { AArch64MCExpr::VK_ABS_G3 }; - return isMovWSymbol(Variants); + return isMovWSymbol(AArch64MCExpr::VK_ABS_G3); } bool isMovKSymbolG2() const { - static AArch64MCExpr::VariantKind Variants[] = { - AArch64MCExpr::VK_ABS_G2_NC}; - return isMovWSymbol(Variants); + return isMovWSymbol(AArch64MCExpr::VK_ABS_G2_NC); } bool isMovKSymbolG1() const { - static AArch64MCExpr::VariantKind Variants[] = { - AArch64MCExpr::VK_ABS_G1_NC, AArch64MCExpr::VK_TPREL_G1_NC, - AArch64MCExpr::VK_DTPREL_G1_NC - }; - return isMovWSymbol(Variants); + return isMovWSymbol({AArch64MCExpr::VK_ABS_G1_NC, + AArch64MCExpr::VK_TPREL_G1_NC, + AArch64MCExpr::VK_DTPREL_G1_NC}); } bool isMovKSymbolG0() const { - static AArch64MCExpr::VariantKind Variants[] = { - AArch64MCExpr::VK_ABS_G0_NC, AArch64MCExpr::VK_GOTTPREL_G0_NC, - AArch64MCExpr::VK_TPREL_G0_NC, AArch64MCExpr::VK_DTPREL_G0_NC - }; - return isMovWSymbol(Variants); + return isMovWSymbol( + {AArch64MCExpr::VK_ABS_G0_NC, AArch64MCExpr::VK_GOTTPREL_G0_NC, + AArch64MCExpr::VK_TPREL_G0_NC, 
AArch64MCExpr::VK_DTPREL_G0_NC}); } template<int RegWidth, int Shift> @@ -1608,10 +1609,14 @@ public: return Op; } - static std::unique_ptr<AArch64Operand> CreateBarrier(unsigned Val, SMLoc S, + static std::unique_ptr<AArch64Operand> CreateBarrier(unsigned Val, + StringRef Str, + SMLoc S, MCContext &Ctx) { auto Op = make_unique<AArch64Operand>(k_Barrier, Ctx); Op->Barrier.Val = Val; + Op->Barrier.Data = Str.data(); + Op->Barrier.Length = Str.size(); Op->StartLoc = S; Op->EndLoc = S; return Op; @@ -1642,10 +1647,14 @@ public: return Op; } - static std::unique_ptr<AArch64Operand> CreatePrefetch(unsigned Val, SMLoc S, + static std::unique_ptr<AArch64Operand> CreatePrefetch(unsigned Val, + StringRef Str, + SMLoc S, MCContext &Ctx) { auto Op = make_unique<AArch64Operand>(k_Prefetch, Ctx); Op->Prefetch.Val = Val; + Op->Barrier.Data = Str.data(); + Op->Barrier.Length = Str.size(); Op->StartLoc = S; Op->EndLoc = S; return Op; @@ -1673,9 +1682,8 @@ void AArch64Operand::print(raw_ostream &OS) const { << AArch64_AM::getFPImmFloat(getFPImm()) << ") >"; break; case k_Barrier: { - bool Valid; - StringRef Name = AArch64DB::DBarrierMapper().toString(getBarrier(), Valid); - if (Valid) + StringRef Name = getBarrierName(); + if (!Name.empty()) OS << "<barrier " << Name << ">"; else OS << "<barrier invalid #" << getBarrier() << ">"; @@ -1718,9 +1726,8 @@ void AArch64Operand::print(raw_ostream &OS) const { OS << "c" << getSysCR(); break; case k_Prefetch: { - bool Valid; - StringRef Name = AArch64PRFM::PRFMMapper().toString(getPrefetch(), Valid); - if (Valid) + StringRef Name = getPrefetchName(); + if (!Name.empty()) OS << "<prfop " << Name << ">"; else OS << "<prfop invalid #" << getPrefetch() << ">"; @@ -1963,7 +1970,11 @@ AArch64AsmParser::tryParsePrefetch(OperandVector &Operands) { return MatchOperand_ParseFail; } - Operands.push_back(AArch64Operand::CreatePrefetch(prfop, S, getContext())); + bool Valid; + auto Mapper = AArch64PRFM::PRFMMapper(); + StringRef Name = Mapper.toString(MCE->getValue(), Valid); + Operands.push_back(AArch64Operand::CreatePrefetch(prfop, Name, + S, getContext())); return MatchOperand_Success; } @@ -1973,14 +1984,16 @@ AArch64AsmParser::tryParsePrefetch(OperandVector &Operands) { } bool Valid; - unsigned prfop = AArch64PRFM::PRFMMapper().fromString(Tok.getString(), Valid); + auto Mapper = AArch64PRFM::PRFMMapper(); + unsigned prfop = Mapper.fromString(Tok.getString(), Valid); if (!Valid) { TokError("pre-fetch hint expected"); return MatchOperand_ParseFail; } Parser.Lex(); // Eat identifier token. 
- Operands.push_back(AArch64Operand::CreatePrefetch(prfop, S, getContext())); + Operands.push_back(AArch64Operand::CreatePrefetch(prfop, Tok.getString(), + S, getContext())); return MatchOperand_Success; } @@ -2582,8 +2595,11 @@ AArch64AsmParser::tryParseBarrierOperand(OperandVector &Operands) { Error(ExprLoc, "barrier operand out of range"); return MatchOperand_ParseFail; } - Operands.push_back( - AArch64Operand::CreateBarrier(MCE->getValue(), ExprLoc, getContext())); + bool Valid; + auto Mapper = AArch64DB::DBarrierMapper(); + StringRef Name = Mapper.toString(MCE->getValue(), Valid); + Operands.push_back( AArch64Operand::CreateBarrier(MCE->getValue(), Name, + ExprLoc, getContext())); return MatchOperand_Success; } @@ -2593,7 +2609,8 @@ AArch64AsmParser::tryParseBarrierOperand(OperandVector &Operands) { } bool Valid; - unsigned Opt = AArch64DB::DBarrierMapper().fromString(Tok.getString(), Valid); + auto Mapper = AArch64DB::DBarrierMapper(); + unsigned Opt = Mapper.fromString(Tok.getString(), Valid); if (!Valid) { TokError("invalid barrier option name"); return MatchOperand_ParseFail; @@ -2605,8 +2622,8 @@ AArch64AsmParser::tryParseBarrierOperand(OperandVector &Operands) { return MatchOperand_ParseFail; } - Operands.push_back( - AArch64Operand::CreateBarrier(Opt, getLoc(), getContext())); + Operands.push_back( AArch64Operand::CreateBarrier(Opt, Tok.getString(), + getLoc(), getContext())); Parser.Lex(); // Consume the option return MatchOperand_Success; @@ -2631,8 +2648,8 @@ AArch64AsmParser::tryParseSysReg(OperandVector &Operands) { assert(IsKnown == (MSRReg != -1U) && "register should be -1 if and only if it's unknown"); - uint32_t PStateField = - AArch64PState::PStateMapper().fromString(Tok.getString(), IsKnown); + auto PStateMapper = AArch64PState::PStateMapper(); + uint32_t PStateField = PStateMapper.fromString(Tok.getString(), IsKnown); assert(IsKnown == (PStateField != -1U) && "register should be -1 if and only if it's unknown"); diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp index 423da65..84b63a0 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp +++ b/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp @@ -18,6 +18,7 @@ #include "llvm/MC/MCObjectWriter.h" #include "llvm/MC/MCSectionELF.h" #include "llvm/MC/MCSectionMachO.h" +#include "llvm/MC/MCValue.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MachO.h" using namespace llvm; @@ -493,14 +494,28 @@ void ELFAArch64AsmBackend::processFixupValue( IsResolved = false; } +// Returns whether this fixup is based on an address in the .eh_frame section, +// and therefore should be byte swapped. +// FIXME: Should be replaced with something more principled. 
+static bool isByteSwappedFixup(const MCExpr *E) { + MCValue Val; + if (!E->EvaluateAsRelocatable(Val, nullptr, nullptr)) + return false; + + if (!Val.getSymA() || Val.getSymA()->getSymbol().isUndefined()) + return false; + + const MCSectionELF *SecELF = + dyn_cast<MCSectionELF>(&Val.getSymA()->getSymbol().getSection()); + return SecELF->getSectionName() == ".eh_frame"; +} + void ELFAArch64AsmBackend::applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize, uint64_t Value, bool IsPCRel) const { // store fixups in .eh_frame section in big endian order if (!IsLittleEndian && Fixup.getKind() == FK_Data_4) { - const MCSection *Sec = Fixup.getValue()->FindAssociatedSection(); - const MCSectionELF *SecELF = dyn_cast_or_null<const MCSectionELF>(Sec); - if (SecELF && SecELF->getSectionName() == ".eh_frame") + if (isByteSwappedFixup(Fixup.getValue())) Value = ByteSwap_32(unsigned(Value)); } AArch64AsmBackend::applyFixup (Fixup, Data, DataSize, Value, IsPCRel); diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp index 8dc6c30..8f780d2 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp +++ b/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp @@ -203,24 +203,27 @@ void AArch64TargetELFStreamer::emitInst(uint32_t Inst) { } namespace llvm { -MCStreamer * -createAArch64MCAsmStreamer(MCContext &Ctx, formatted_raw_ostream &OS, - bool isVerboseAsm, bool useDwarfDirectory, - MCInstPrinter *InstPrint, MCCodeEmitter *CE, - MCAsmBackend *TAB, bool ShowInst) { - MCStreamer *S = llvm::createAsmStreamer( - Ctx, OS, isVerboseAsm, useDwarfDirectory, InstPrint, CE, TAB, ShowInst); - new AArch64TargetAsmStreamer(*S, OS); - return S; +MCTargetStreamer *createAArch64AsmTargetStreamer(MCStreamer &S, + formatted_raw_ostream &OS, + MCInstPrinter *InstPrint, + bool isVerboseAsm) { + return new AArch64TargetAsmStreamer(S, OS); } MCELFStreamer *createAArch64ELFStreamer(MCContext &Context, MCAsmBackend &TAB, raw_ostream &OS, MCCodeEmitter *Emitter, bool RelaxAll) { AArch64ELFStreamer *S = new AArch64ELFStreamer(Context, TAB, OS, Emitter); - new AArch64TargetELFStreamer(*S); if (RelaxAll) S->getAssembler().setRelaxAll(true); return S; } + +MCTargetStreamer * +createAArch64ObjectTargetStreamer(MCStreamer &S, const MCSubtargetInfo &STI) { + Triple TT(STI.getTargetTriple()); + if (TT.getObjectFormat() == Triple::ELF) + return new AArch64TargetELFStreamer(S); + return nullptr; +} } diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp index 4756a19..9ea49f0 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp +++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp @@ -38,9 +38,7 @@ class AArch64MCCodeEmitter : public MCCodeEmitter { AArch64MCCodeEmitter(const AArch64MCCodeEmitter &); // DO NOT IMPLEMENT void operator=(const AArch64MCCodeEmitter &); // DO NOT IMPLEMENT public: - AArch64MCCodeEmitter(const MCInstrInfo &mcii, const MCSubtargetInfo &sti, - MCContext &ctx) - : Ctx(ctx) {} + AArch64MCCodeEmitter(const MCInstrInfo &mcii, MCContext &ctx) : Ctx(ctx) {} ~AArch64MCCodeEmitter() {} @@ -205,9 +203,8 @@ public: MCCodeEmitter *llvm::createAArch64MCCodeEmitter(const MCInstrInfo &MCII, const MCRegisterInfo &MRI, - const MCSubtargetInfo &STI, MCContext &Ctx) { - return new AArch64MCCodeEmitter(MCII, STI, Ctx); + return new AArch64MCCodeEmitter(MCII, Ctx); } /// getMachineOpValue - Return binary encoding of operand. 
If the machine diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp index e396df8..9e31508 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp +++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp @@ -16,6 +16,7 @@ #include "llvm/MC/MCAssembler.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCELF.h" +#include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSymbol.h" #include "llvm/MC/MCValue.h" #include "llvm/Object/ELF.h" diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp index 0f7a6b8..38b399d 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp +++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp @@ -123,94 +123,61 @@ static MCInstPrinter *createAArch64MCInstPrinter(const Target &T, return nullptr; } -static MCStreamer *createMCStreamer(const Target &T, StringRef TT, - MCContext &Ctx, MCAsmBackend &TAB, - raw_ostream &OS, MCCodeEmitter *Emitter, - const MCSubtargetInfo &STI, bool RelaxAll) { - Triple TheTriple(TT); - - if (TheTriple.isOSDarwin()) - return createMachOStreamer(Ctx, TAB, OS, Emitter, RelaxAll, - /*LabelSections*/ true); - +static MCStreamer *createELFStreamer(const Triple &T, MCContext &Ctx, + MCAsmBackend &TAB, raw_ostream &OS, + MCCodeEmitter *Emitter, bool RelaxAll) { return createAArch64ELFStreamer(Ctx, TAB, OS, Emitter, RelaxAll); } +static MCStreamer *createMachOStreamer(MCContext &Ctx, MCAsmBackend &TAB, + raw_ostream &OS, MCCodeEmitter *Emitter, + bool RelaxAll, + bool DWARFMustBeAtTheEnd) { + return createMachOStreamer(Ctx, TAB, OS, Emitter, RelaxAll, + DWARFMustBeAtTheEnd, + /*LabelSections*/ true); +} + // Force static initialization. extern "C" void LLVMInitializeAArch64TargetMC() { - // Register the MC asm info. - RegisterMCAsmInfoFn X(TheAArch64leTarget, createAArch64MCAsmInfo); - RegisterMCAsmInfoFn Y(TheAArch64beTarget, createAArch64MCAsmInfo); - RegisterMCAsmInfoFn Z(TheARM64Target, createAArch64MCAsmInfo); - - // Register the MC codegen info. - TargetRegistry::RegisterMCCodeGenInfo(TheAArch64leTarget, - createAArch64MCCodeGenInfo); - TargetRegistry::RegisterMCCodeGenInfo(TheAArch64beTarget, - createAArch64MCCodeGenInfo); - TargetRegistry::RegisterMCCodeGenInfo(TheARM64Target, - createAArch64MCCodeGenInfo); - - // Register the MC instruction info. - TargetRegistry::RegisterMCInstrInfo(TheAArch64leTarget, - createAArch64MCInstrInfo); - TargetRegistry::RegisterMCInstrInfo(TheAArch64beTarget, - createAArch64MCInstrInfo); - TargetRegistry::RegisterMCInstrInfo(TheARM64Target, - createAArch64MCInstrInfo); - - // Register the MC register info. - TargetRegistry::RegisterMCRegInfo(TheAArch64leTarget, - createAArch64MCRegisterInfo); - TargetRegistry::RegisterMCRegInfo(TheAArch64beTarget, - createAArch64MCRegisterInfo); - TargetRegistry::RegisterMCRegInfo(TheARM64Target, - createAArch64MCRegisterInfo); - - // Register the MC subtarget info. - TargetRegistry::RegisterMCSubtargetInfo(TheAArch64leTarget, - createAArch64MCSubtargetInfo); - TargetRegistry::RegisterMCSubtargetInfo(TheAArch64beTarget, - createAArch64MCSubtargetInfo); - TargetRegistry::RegisterMCSubtargetInfo(TheARM64Target, - createAArch64MCSubtargetInfo); + for (Target *T : + {&TheAArch64leTarget, &TheAArch64beTarget, &TheARM64Target}) { + // Register the MC asm info. + RegisterMCAsmInfoFn X(*T, createAArch64MCAsmInfo); + + // Register the MC codegen info. 
+ TargetRegistry::RegisterMCCodeGenInfo(*T, createAArch64MCCodeGenInfo); + + // Register the MC instruction info. + TargetRegistry::RegisterMCInstrInfo(*T, createAArch64MCInstrInfo); + + // Register the MC register info. + TargetRegistry::RegisterMCRegInfo(*T, createAArch64MCRegisterInfo); + + // Register the MC subtarget info. + TargetRegistry::RegisterMCSubtargetInfo(*T, createAArch64MCSubtargetInfo); + + // Register the MC Code Emitter + TargetRegistry::RegisterMCCodeEmitter(*T, createAArch64MCCodeEmitter); + + // Register the obj streamers. + TargetRegistry::RegisterELFStreamer(*T, createELFStreamer); + TargetRegistry::RegisterMachOStreamer(*T, createMachOStreamer); + + // Register the obj target streamer. + TargetRegistry::RegisterObjectTargetStreamer( + *T, createAArch64ObjectTargetStreamer); + + // Register the asm streamer. + TargetRegistry::RegisterAsmTargetStreamer(*T, + createAArch64AsmTargetStreamer); + // Register the MCInstPrinter. + TargetRegistry::RegisterMCInstPrinter(*T, createAArch64MCInstPrinter); + } // Register the asm backend. - TargetRegistry::RegisterMCAsmBackend(TheAArch64leTarget, - createAArch64leAsmBackend); + for (Target *T : {&TheAArch64leTarget, &TheARM64Target}) + TargetRegistry::RegisterMCAsmBackend(*T, createAArch64leAsmBackend); TargetRegistry::RegisterMCAsmBackend(TheAArch64beTarget, createAArch64beAsmBackend); - TargetRegistry::RegisterMCAsmBackend(TheARM64Target, - createAArch64leAsmBackend); - - // Register the MC Code Emitter - TargetRegistry::RegisterMCCodeEmitter(TheAArch64leTarget, - createAArch64MCCodeEmitter); - TargetRegistry::RegisterMCCodeEmitter(TheAArch64beTarget, - createAArch64MCCodeEmitter); - TargetRegistry::RegisterMCCodeEmitter(TheARM64Target, - createAArch64MCCodeEmitter); - - // Register the object streamer. - TargetRegistry::RegisterMCObjectStreamer(TheAArch64leTarget, - createMCStreamer); - TargetRegistry::RegisterMCObjectStreamer(TheAArch64beTarget, - createMCStreamer); - TargetRegistry::RegisterMCObjectStreamer(TheARM64Target, createMCStreamer); - - // Register the asm streamer. - TargetRegistry::RegisterAsmStreamer(TheAArch64leTarget, - createAArch64MCAsmStreamer); - TargetRegistry::RegisterAsmStreamer(TheAArch64beTarget, - createAArch64MCAsmStreamer); - TargetRegistry::RegisterAsmStreamer(TheARM64Target, - createAArch64MCAsmStreamer); - - // Register the MCInstPrinter. 
- TargetRegistry::RegisterMCInstPrinter(TheAArch64leTarget, - createAArch64MCInstPrinter); - TargetRegistry::RegisterMCInstPrinter(TheAArch64beTarget, - createAArch64MCInstPrinter); - TargetRegistry::RegisterMCInstPrinter(TheARM64Target, - createAArch64MCInstPrinter); } diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h b/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h index 1553115..7ce303b 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h +++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h @@ -28,8 +28,10 @@ class MCRegisterInfo; class MCObjectWriter; class MCStreamer; class MCSubtargetInfo; +class MCTargetStreamer; class StringRef; class Target; +class Triple; class raw_ostream; extern Target TheAArch64leTarget; @@ -37,9 +39,8 @@ extern Target TheAArch64beTarget; extern Target TheARM64Target; MCCodeEmitter *createAArch64MCCodeEmitter(const MCInstrInfo &MCII, - const MCRegisterInfo &MRI, - const MCSubtargetInfo &STI, - MCContext &Ctx); + const MCRegisterInfo &MRI, + MCContext &Ctx); MCAsmBackend *createAArch64leAsmBackend(const Target &T, const MCRegisterInfo &MRI, StringRef TT, StringRef CPU); @@ -53,11 +54,14 @@ MCObjectWriter *createAArch64ELFObjectWriter(raw_ostream &OS, uint8_t OSABI, MCObjectWriter *createAArch64MachObjectWriter(raw_ostream &OS, uint32_t CPUType, uint32_t CPUSubtype); -MCStreamer * -createAArch64MCAsmStreamer(MCContext &Ctx, formatted_raw_ostream &OS, - bool isVerboseAsm, bool useDwarfDirectory, - MCInstPrinter *InstPrint, MCCodeEmitter *CE, - MCAsmBackend *TAB, bool ShowInst); +MCTargetStreamer *createAArch64AsmTargetStreamer(MCStreamer &S, + formatted_raw_ostream &OS, + MCInstPrinter *InstPrint, + bool isVerboseAsm); + +MCTargetStreamer *createAArch64ObjectTargetStreamer(MCStreamer &S, + const MCSubtargetInfo &STI); + } // End llvm namespace // Defines symbolic names for AArch64 registers. 
This defines a mapping from register name to register number. diff --git a/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp b/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp index bc6c7a9..160c1c5 100644 --- a/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp +++ b/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp @@ -19,10 +19,10 @@ using namespace llvm; StringRef AArch64NamedImmMapper::toString(uint32_t Value, bool &Valid) const { - for (unsigned i = 0; i < NumPairs; ++i) { - if (Pairs[i].Value == Value) { + for (unsigned i = 0; i < NumMappings; ++i) { + if (Mappings[i].Value == Value) { Valid = true; - return Pairs[i].Name; + return Mappings[i].Name; } } @@ -32,10 +32,10 @@ StringRef AArch64NamedImmMapper::toString(uint32_t Value, bool &Valid) const { uint32_t AArch64NamedImmMapper::fromString(StringRef Name, bool &Valid) const { std::string LowerCaseName = Name.lower(); - for (unsigned i = 0; i < NumPairs; ++i) { - if (Pairs[i].Name == LowerCaseName) { + for (unsigned i = 0; i < NumMappings; ++i) { + if (Mappings[i].Name == LowerCaseName) { Valid = true; - return Pairs[i].Value; + return Mappings[i].Value; } } @@ -47,7 +47,7 @@ bool AArch64NamedImmMapper::validImm(uint32_t Value) const { return Value < TooBigImm; } -const AArch64NamedImmMapper::Mapping AArch64AT::ATMapper::ATPairs[] = { +const AArch64NamedImmMapper::Mapping AArch64AT::ATMapper::ATMappings[] = { {"s1e1r", S1E1R}, {"s1e2r", S1E2R}, {"s1e3r", S1E3R}, @@ -63,9 +63,9 @@ const AArch64NamedImmMapper::Mapping AArch64AT::ATMapper::ATPairs[] = { }; AArch64AT::ATMapper::ATMapper() - : AArch64NamedImmMapper(ATPairs, 0) {} + : AArch64NamedImmMapper(ATMappings, 0) {} -const AArch64NamedImmMapper::Mapping AArch64DB::DBarrierMapper::DBarrierPairs[] = { +const AArch64NamedImmMapper::Mapping AArch64DB::DBarrierMapper::DBarrierMappings[] = { {"oshld", OSHLD}, {"oshst", OSHST}, {"osh", OSH}, @@ -81,9 +81,9 @@ const AArch64NamedImmMapper::Mapping AArch64DB::DBarrierMapper::DBarrierPairs[] }; AArch64DB::DBarrierMapper::DBarrierMapper() - : AArch64NamedImmMapper(DBarrierPairs, 16u) {} + : AArch64NamedImmMapper(DBarrierMappings, 16u) {} -const AArch64NamedImmMapper::Mapping AArch64DC::DCMapper::DCPairs[] = { +const AArch64NamedImmMapper::Mapping AArch64DC::DCMapper::DCMappings[] = { {"zva", ZVA}, {"ivac", IVAC}, {"isw", ISW}, @@ -95,25 +95,25 @@ const AArch64NamedImmMapper::Mapping AArch64DC::DCMapper::DCPairs[] = { }; AArch64DC::DCMapper::DCMapper() - : AArch64NamedImmMapper(DCPairs, 0) {} + : AArch64NamedImmMapper(DCMappings, 0) {} -const AArch64NamedImmMapper::Mapping AArch64IC::ICMapper::ICPairs[] = { +const AArch64NamedImmMapper::Mapping AArch64IC::ICMapper::ICMappings[] = { {"ialluis", IALLUIS}, {"iallu", IALLU}, {"ivau", IVAU} }; AArch64IC::ICMapper::ICMapper() - : AArch64NamedImmMapper(ICPairs, 0) {} + : AArch64NamedImmMapper(ICMappings, 0) {} -const AArch64NamedImmMapper::Mapping AArch64ISB::ISBMapper::ISBPairs[] = { +const AArch64NamedImmMapper::Mapping AArch64ISB::ISBMapper::ISBMappings[] = { {"sy", SY}, }; AArch64ISB::ISBMapper::ISBMapper() - : AArch64NamedImmMapper(ISBPairs, 16) {} + : AArch64NamedImmMapper(ISBMappings, 16) {} -const AArch64NamedImmMapper::Mapping AArch64PRFM::PRFMMapper::PRFMPairs[] = { +const AArch64NamedImmMapper::Mapping AArch64PRFM::PRFMMapper::PRFMMappings[] = { {"pldl1keep", PLDL1KEEP}, {"pldl1strm", PLDL1STRM}, {"pldl2keep", PLDL2KEEP}, @@ -135,18 +135,18 @@ const AArch64NamedImmMapper::Mapping AArch64PRFM::PRFMMapper::PRFMPairs[] = { }; AArch64PRFM::PRFMMapper::PRFMMapper() - :
AArch64NamedImmMapper(PRFMMappings, 32) {} -const AArch64NamedImmMapper::Mapping AArch64PState::PStateMapper::PStatePairs[] = { +const AArch64NamedImmMapper::Mapping AArch64PState::PStateMapper::PStateMappings[] = { {"spsel", SPSel}, {"daifset", DAIFSet}, {"daifclr", DAIFClr} }; AArch64PState::PStateMapper::PStateMapper() - : AArch64NamedImmMapper(PStatePairs, 0) {} + : AArch64NamedImmMapper(PStateMappings, 0) {} -const AArch64NamedImmMapper::Mapping AArch64SysReg::MRSMapper::MRSPairs[] = { +const AArch64NamedImmMapper::Mapping AArch64SysReg::MRSMapper::MRSMappings[] = { {"mdccsr_el0", MDCCSR_EL0}, {"dbgdtrrx_el0", DBGDTRRX_EL0}, {"mdrar_el1", MDRAR_EL1}, @@ -247,11 +247,11 @@ const AArch64NamedImmMapper::Mapping AArch64SysReg::MRSMapper::MRSPairs[] = { AArch64SysReg::MRSMapper::MRSMapper(uint64_t FeatureBits) : SysRegMapper(FeatureBits) { - InstPairs = &MRSPairs[0]; - NumInstPairs = llvm::array_lengthof(MRSPairs); + InstMappings = &MRSMappings[0]; + NumInstMappings = llvm::array_lengthof(MRSMappings); } -const AArch64NamedImmMapper::Mapping AArch64SysReg::MSRMapper::MSRPairs[] = { +const AArch64NamedImmMapper::Mapping AArch64SysReg::MSRMapper::MSRMappings[] = { {"dbgdtrtx_el0", DBGDTRTX_EL0}, {"oslar_el1", OSLAR_EL1}, {"pmswinc_el0", PMSWINC_EL0}, @@ -271,12 +271,12 @@ const AArch64NamedImmMapper::Mapping AArch64SysReg::MSRMapper::MSRPairs[] = { AArch64SysReg::MSRMapper::MSRMapper(uint64_t FeatureBits) : SysRegMapper(FeatureBits) { - InstPairs = &MSRPairs[0]; - NumInstPairs = llvm::array_lengthof(MSRPairs); + InstMappings = &MSRMappings[0]; + NumInstMappings = llvm::array_lengthof(MSRMappings); } -const AArch64NamedImmMapper::Mapping AArch64SysReg::SysRegMapper::SysRegPairs[] = { +const AArch64NamedImmMapper::Mapping AArch64SysReg::SysRegMapper::SysRegMappings[] = { {"osdtrrx_el1", OSDTRRX_EL1}, {"osdtrtx_el1", OSDTRTX_EL1}, {"teecr32_el1", TEECR32_EL1}, @@ -756,7 +756,7 @@ const AArch64NamedImmMapper::Mapping AArch64SysReg::SysRegMapper::SysRegPairs[] }; const AArch64NamedImmMapper::Mapping -AArch64SysReg::SysRegMapper::CycloneSysRegPairs[] = { +AArch64SysReg::SysRegMapper::CycloneSysRegMappings[] = { {"cpm_ioacc_ctl_el3", CPM_IOACC_CTL_EL3} }; @@ -765,29 +765,29 @@ AArch64SysReg::SysRegMapper::fromString(StringRef Name, bool &Valid) const { std::string NameLower = Name.lower(); // First search the registers shared by all - for (unsigned i = 0; i < array_lengthof(SysRegPairs); ++i) { - if (SysRegPairs[i].Name == NameLower) { + for (unsigned i = 0; i < array_lengthof(SysRegMappings); ++i) { + if (SysRegMappings[i].Name == NameLower) { Valid = true; - return SysRegPairs[i].Value; + return SysRegMappings[i].Value; } } // Next search for target specific registers if (FeatureBits & AArch64::ProcCyclone) { - for (unsigned i = 0; i < array_lengthof(CycloneSysRegPairs); ++i) { - if (CycloneSysRegPairs[i].Name == NameLower) { + for (unsigned i = 0; i < array_lengthof(CycloneSysRegMappings); ++i) { + if (CycloneSysRegMappings[i].Name == NameLower) { Valid = true; - return CycloneSysRegPairs[i].Value; + return CycloneSysRegMappings[i].Value; } } } // Now try the instruction-specific registers (either read-only or // write-only). 
- for (unsigned i = 0; i < NumInstPairs; ++i) { - if (InstPairs[i].Name == NameLower) { + for (unsigned i = 0; i < NumInstMappings; ++i) { + if (InstMappings[i].Name == NameLower) { Valid = true; - return InstPairs[i].Value; + return InstMappings[i].Value; } } @@ -816,26 +816,26 @@ AArch64SysReg::SysRegMapper::fromString(StringRef Name, bool &Valid) const { std::string AArch64SysReg::SysRegMapper::toString(uint32_t Bits) const { // First search the registers shared by all - for (unsigned i = 0; i < array_lengthof(SysRegPairs); ++i) { - if (SysRegPairs[i].Value == Bits) { - return SysRegPairs[i].Name; + for (unsigned i = 0; i < array_lengthof(SysRegMappings); ++i) { + if (SysRegMappings[i].Value == Bits) { + return SysRegMappings[i].Name; } } // Next search for target specific registers if (FeatureBits & AArch64::ProcCyclone) { - for (unsigned i = 0; i < array_lengthof(CycloneSysRegPairs); ++i) { - if (CycloneSysRegPairs[i].Value == Bits) { - return CycloneSysRegPairs[i].Name; + for (unsigned i = 0; i < array_lengthof(CycloneSysRegMappings); ++i) { + if (CycloneSysRegMappings[i].Value == Bits) { + return CycloneSysRegMappings[i].Name; } } } // Now try the instruction-specific registers (either read-only or // write-only). - for (unsigned i = 0; i < NumInstPairs; ++i) { - if (InstPairs[i].Value == Bits) { - return InstPairs[i].Name; + for (unsigned i = 0; i < NumInstMappings; ++i) { + if (InstMappings[i].Value == Bits) { + return InstMappings[i].Name; } } @@ -850,7 +850,7 @@ AArch64SysReg::SysRegMapper::toString(uint32_t Bits) const { + "_c" + utostr(CRm) + "_" + utostr(Op2); } -const AArch64NamedImmMapper::Mapping AArch64TLBI::TLBIMapper::TLBIPairs[] = { +const AArch64NamedImmMapper::Mapping AArch64TLBI::TLBIMapper::TLBIMappings[] = { {"ipas2e1is", IPAS2E1IS}, {"ipas2le1is", IPAS2LE1IS}, {"vmalle1is", VMALLE1IS}, @@ -886,4 +886,4 @@ const AArch64NamedImmMapper::Mapping AArch64TLBI::TLBIMapper::TLBIPairs[] = { }; AArch64TLBI::TLBIMapper::TLBIMapper() - : AArch64NamedImmMapper(TLBIPairs, 0) {} + : AArch64NamedImmMapper(TLBIMappings, 0) {} diff --git a/lib/Target/AArch64/Utils/AArch64BaseInfo.h b/lib/Target/AArch64/Utils/AArch64BaseInfo.h index c60b09a..2ae6f52 100644 --- a/lib/Target/AArch64/Utils/AArch64BaseInfo.h +++ b/lib/Target/AArch64/Utils/AArch64BaseInfo.h @@ -283,8 +283,8 @@ struct AArch64NamedImmMapper { }; template<int N> - AArch64NamedImmMapper(const Mapping (&Pairs)[N], uint32_t TooBigImm) - : Pairs(&Pairs[0]), NumPairs(N), TooBigImm(TooBigImm) {} + AArch64NamedImmMapper(const Mapping (&Mappings)[N], uint32_t TooBigImm) + : Mappings(&Mappings[0]), NumMappings(N), TooBigImm(TooBigImm) {} StringRef toString(uint32_t Value, bool &Valid) const; uint32_t fromString(StringRef Name, bool &Valid) const; @@ -294,8 +294,8 @@ struct AArch64NamedImmMapper { /// N being 0 indicates no immediate syntax-form is allowed. 
bool validImm(uint32_t Value) const; protected: - const Mapping *Pairs; - size_t NumPairs; + const Mapping *Mappings; + size_t NumMappings; uint32_t TooBigImm; }; @@ -317,7 +317,7 @@ namespace AArch64AT { }; struct ATMapper : AArch64NamedImmMapper { - const static Mapping ATPairs[]; + const static Mapping ATMappings[]; ATMapper(); }; @@ -341,7 +341,7 @@ namespace AArch64DB { }; struct DBarrierMapper : AArch64NamedImmMapper { - const static Mapping DBarrierPairs[]; + const static Mapping DBarrierMappings[]; DBarrierMapper(); }; @@ -361,7 +361,7 @@ namespace AArch64DC { }; struct DCMapper : AArch64NamedImmMapper { - const static Mapping DCPairs[]; + const static Mapping DCMappings[]; DCMapper(); }; @@ -378,7 +378,7 @@ namespace AArch64IC { struct ICMapper : AArch64NamedImmMapper { - const static Mapping ICPairs[]; + const static Mapping ICMappings[]; ICMapper(); }; @@ -394,7 +394,7 @@ namespace AArch64ISB { SY = 0xf }; struct ISBMapper : AArch64NamedImmMapper { - const static Mapping ISBPairs[]; + const static Mapping ISBMappings[]; ISBMapper(); }; @@ -424,7 +424,7 @@ namespace AArch64PRFM { }; struct PRFMMapper : AArch64NamedImmMapper { - const static Mapping PRFMPairs[]; + const static Mapping PRFMMappings[]; PRFMMapper(); }; @@ -439,7 +439,7 @@ namespace AArch64PState { }; struct PStateMapper : AArch64NamedImmMapper { - const static Mapping PStatePairs[]; + const static Mapping PStateMappings[]; PStateMapper(); }; @@ -1134,11 +1134,11 @@ namespace AArch64SysReg { // burdening the common AArch64NamedImmMapper with abstractions only needed in // this one case. struct SysRegMapper { - static const AArch64NamedImmMapper::Mapping SysRegPairs[]; - static const AArch64NamedImmMapper::Mapping CycloneSysRegPairs[]; + static const AArch64NamedImmMapper::Mapping SysRegMappings[]; + static const AArch64NamedImmMapper::Mapping CycloneSysRegMappings[]; - const AArch64NamedImmMapper::Mapping *InstPairs; - size_t NumInstPairs; + const AArch64NamedImmMapper::Mapping *InstMappings; + size_t NumInstMappings; uint64_t FeatureBits; SysRegMapper(uint64_t FeatureBits) : FeatureBits(FeatureBits) { } @@ -1147,12 +1147,12 @@ namespace AArch64SysReg { }; struct MSRMapper : SysRegMapper { - static const AArch64NamedImmMapper::Mapping MSRPairs[]; + static const AArch64NamedImmMapper::Mapping MSRMappings[]; MSRMapper(uint64_t FeatureBits); }; struct MRSMapper : SysRegMapper { - static const AArch64NamedImmMapper::Mapping MRSPairs[]; + static const AArch64NamedImmMapper::Mapping MRSMappings[]; MRSMapper(uint64_t FeatureBits); }; @@ -1197,7 +1197,7 @@ namespace AArch64TLBI { }; struct TLBIMapper : AArch64NamedImmMapper { - const static Mapping TLBIPairs[]; + const static Mapping TLBIMappings[]; TLBIMapper(); }; @@ -1229,7 +1229,7 @@ namespace AArch64II { MO_NO_FLAG, - MO_FRAGMENT = 0x7, + MO_FRAGMENT = 0xf, /// MO_PAGE - A symbol operand with this flag represents the pc-relative /// offset of the 4K page containing the symbol. This is used with the @@ -1257,26 +1257,31 @@ namespace AArch64II { /// 0-15 of a 64-bit address, used in a MOVZ or MOVK instruction MO_G0 = 6, + /// MO_HI12 - This flag indicates that a symbol operand represents the bits + /// 13-24 of a 64-bit address, used in an arithmetic immediate-shifted-left- + /// by-12-bits instruction. + MO_HI12 = 7, + /// MO_GOT - This flag indicates that a symbol operand represents the /// address of the GOT entry for the symbol, rather than the address of /// the symbol itself.
- MO_GOT = 8, + MO_GOT = 0x10, /// MO_NC - Indicates whether the linker is expected to check the symbol /// reference for overflow. For example in an ADRP/ADD pair of relocations /// the ADRP usually does check, but not the ADD. - MO_NC = 0x10, + MO_NC = 0x20, /// MO_TLS - Indicates that the operand being accessed is some kind of /// thread-local symbol. On Darwin, only one type of thread-local access /// exists (pre linker-relaxation), but on ELF the TLSModel used for the /// referee will affect interpretation. - MO_TLS = 0x20, + MO_TLS = 0x40, /// MO_CONSTPOOL - This flag indicates that a symbol operand represents /// the address of a constant pool entry for the symbol, rather than the /// address of the symbol itself. - MO_CONSTPOOL = 0x40 + MO_CONSTPOOL = 0x80 }; } // end namespace AArch64II diff --git a/lib/Target/ARM/A15SDOptimizer.cpp b/lib/Target/ARM/A15SDOptimizer.cpp index 387f1f6..7a1865c 100644 --- a/lib/Target/ARM/A15SDOptimizer.cpp +++ b/lib/Target/ARM/A15SDOptimizer.cpp @@ -27,12 +27,15 @@ #include "ARM.h" #include "ARMBaseInstrInfo.h" #include "ARMBaseRegisterInfo.h" +#include "ARMSubtarget.h" #include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetRegisterInfo.h" #include "llvm/Target/TargetSubtargetInfo.h" #include <map> @@ -678,8 +681,13 @@ bool A15SDOptimizer::runOnInstruction(MachineInstr *MI) { } bool A15SDOptimizer::runOnMachineFunction(MachineFunction &Fn) { - TII = static_cast<const ARMBaseInstrInfo *>(Fn.getSubtarget().getInstrInfo()); - TRI = Fn.getSubtarget().getRegisterInfo(); + const ARMSubtarget &STI = Fn.getSubtarget<ARMSubtarget>(); + // Since the A15SDOptimizer pass can insert VDUP instructions, it can only be + // enabled when NEON is available. + if (!(STI.isCortexA15() && STI.hasNEON())) + return false; + TII = STI.getInstrInfo(); + TRI = STI.getRegisterInfo(); MRI = &Fn.getRegInfo(); bool Modified = false; diff --git a/lib/Target/ARM/ARM.td b/lib/Target/ARM/ARM.td index f080c60..ce0aed9 100644 --- a/lib/Target/ARM/ARM.td +++ b/lib/Target/ARM/ARM.td @@ -167,9 +167,12 @@ def HasV6Ops : SubtargetFeature<"v6", "HasV6Ops", "true", def HasV6MOps : SubtargetFeature<"v6m", "HasV6MOps", "true", "Support ARM v6M instructions", [HasV6Ops]>; +def HasV6KOps : SubtargetFeature<"v6k", "HasV6KOps", "true", + "Support ARM v6k instructions", + [HasV6Ops]>; def HasV6T2Ops : SubtargetFeature<"v6t2", "HasV6T2Ops", "true", "Support ARM v6t2 instructions", - [HasV6MOps, FeatureThumb2]>; + [HasV6MOps, HasV6KOps, FeatureThumb2]>; def HasV7Ops : SubtargetFeature<"v7", "HasV7Ops", "true", "Support ARM v7 instructions", [HasV6T2Ops, FeaturePerfMon]>; @@ -177,6 +180,9 @@ def HasV8Ops : SubtargetFeature<"v8", "HasV8Ops", "true", "Support ARM v8 instructions", [HasV7Ops, FeatureVirtualization, FeatureMP]>; +def FeatureV8_1a : SubtargetFeature<"v8.1a", "HasV8_1a", "true", + "Support ARM v8.1a instructions", + [HasV8Ops, FeatureAClass, FeatureCRC]>; //===----------------------------------------------------------------------===// // ARM Processors supported. 
@@ -320,12 +326,6 @@ def : ProcNoItin<"iwmmxt", [HasV5TEOps]>; def : Processor<"arm1136j-s", ARMV6Itineraries, [HasV6Ops]>; def : Processor<"arm1136jf-s", ARMV6Itineraries, [HasV6Ops, FeatureVFP2, FeatureHasSlowFPVMLx]>; -def : Processor<"arm1176jz-s", ARMV6Itineraries, [HasV6Ops]>; -def : Processor<"arm1176jzf-s", ARMV6Itineraries, [HasV6Ops, FeatureVFP2, - FeatureHasSlowFPVMLx]>; -def : Processor<"mpcorenovfp", ARMV6Itineraries, [HasV6Ops]>; -def : Processor<"mpcore", ARMV6Itineraries, [HasV6Ops, FeatureVFP2, - FeatureHasSlowFPVMLx]>; // V6M Processors. def : Processor<"cortex-m0", ARMV6Itineraries, [HasV6MOps, FeatureNoARM, @@ -337,6 +337,14 @@ def : Processor<"cortex-m1", ARMV6Itineraries, [HasV6MOps, FeatureNoARM, def : Processor<"sc000", ARMV6Itineraries, [HasV6MOps, FeatureNoARM, FeatureDB, FeatureMClass]>; +// V6K Processors. +def : Processor<"arm1176jz-s", ARMV6Itineraries, [HasV6KOps]>; +def : Processor<"arm1176jzf-s", ARMV6Itineraries, [HasV6KOps, FeatureVFP2, + FeatureHasSlowFPVMLx]>; +def : Processor<"mpcorenovfp", ARMV6Itineraries, [HasV6KOps]>; +def : Processor<"mpcore", ARMV6Itineraries, [HasV6KOps, FeatureVFP2, + FeatureHasSlowFPVMLx]>; + // V6T2 Processors. def : Processor<"arm1156t2-s", ARMV6Itineraries, [HasV6T2Ops, FeatureDSPThumb2]>; @@ -449,6 +457,14 @@ def : ProcessorModel<"cyclone", SwiftModel, FeatureDB,FeatureDSPThumb2, FeatureHasRAS, FeatureZCZeroing]>; +// V8.1 Processors +def : ProcNoItin<"generic-armv8.1-a", [HasV8Ops, FeatureV8_1a, + FeatureDB, FeatureFPARMv8, + FeatureNEON, FeatureDSPThumb2, + FeatureHWDiv, FeatureHWDivARM, + FeatureTrustZone, FeatureT2XtPk, + FeatureCrypto]>; + //===----------------------------------------------------------------------===// // Register File Description //===----------------------------------------------------------------------===// diff --git a/lib/Target/ARM/ARMAsmPrinter.cpp b/lib/Target/ARM/ARMAsmPrinter.cpp index 2544a01..102def1 100644 --- a/lib/Target/ARM/ARMAsmPrinter.cpp +++ b/lib/Target/ARM/ARMAsmPrinter.cpp @@ -120,9 +120,6 @@ bool ARMAsmPrinter::runOnMachineFunction(MachineFunction &MF) { OutStreamer.EndCOFFSymbolDef(); } - // Have common code print out the function header with linkage info etc. - EmitFunctionHeader(); - // Emit the rest of the function body. EmitFunctionBody(); @@ -438,65 +435,6 @@ void ARMAsmPrinter::emitInlineAsmEnd(const MCSubtargetInfo &StartInfo, void ARMAsmPrinter::EmitStartOfAsmFile(Module &M) { Triple TT(TM.getTargetTriple()); - if (TT.isOSBinFormatMachO()) { - Reloc::Model RelocM = TM.getRelocationModel(); - if (RelocM == Reloc::PIC_ || RelocM == Reloc::DynamicNoPIC) { - // Declare all the text sections up front (before the DWARF sections - // emitted by AsmPrinter::doInitialization) so the assembler will keep - // them together at the beginning of the object file. This helps - // avoid out-of-range branches that are due a fundamental limitation of - // the way symbol offsets are encoded with the current Darwin ARM - // relocations. - const TargetLoweringObjectFileMachO &TLOFMacho = - static_cast<const TargetLoweringObjectFileMachO &>( - getObjFileLowering()); - - // Collect the set of sections our functions will go into. - SetVector<const MCSection *, SmallVector<const MCSection *, 8>, - SmallPtrSet<const MCSection *, 8> > TextSections; - // Default text section comes first. - TextSections.insert(TLOFMacho.getTextSection()); - // Now any user defined text sections from function attributes. 
- for (Module::iterator F = M.begin(), e = M.end(); F != e; ++F) - if (!F->isDeclaration() && !F->hasAvailableExternallyLinkage()) - TextSections.insert(TLOFMacho.SectionForGlobal(F, *Mang, TM)); - // Now the coalescable sections. - TextSections.insert(TLOFMacho.getTextCoalSection()); - TextSections.insert(TLOFMacho.getConstTextCoalSection()); - - // Emit the sections in the .s file header to fix the order. - for (unsigned i = 0, e = TextSections.size(); i != e; ++i) - OutStreamer.SwitchSection(TextSections[i]); - - if (RelocM == Reloc::DynamicNoPIC) { - const MCSection *sect = - OutContext.getMachOSection("__TEXT", "__symbol_stub4", - MachO::S_SYMBOL_STUBS, - 12, SectionKind::getText()); - OutStreamer.SwitchSection(sect); - } else { - const MCSection *sect = - OutContext.getMachOSection("__TEXT", "__picsymbolstub4", - MachO::S_SYMBOL_STUBS, - 16, SectionKind::getText()); - OutStreamer.SwitchSection(sect); - } - const MCSection *StaticInitSect = - OutContext.getMachOSection("__TEXT", "__StaticInit", - MachO::S_REGULAR | - MachO::S_ATTR_PURE_INSTRUCTIONS, - SectionKind::getText()); - OutStreamer.SwitchSection(StaticInitSect); - } - - // Compiling with debug info should not affect the code - // generation. Ensure the cstring section comes before the - // optional __DWARF secion. Otherwise, PC-relative loads would - // have to use different instruction sequences at "-g" in order to - // reach global data in the same object file. - OutStreamer.SwitchSection(getObjFileLowering().getCStringSection()); - } - // Use unified assembler syntax. OutStreamer.EmitAssemblerFlag(MCAF_SyntaxUnified); @@ -669,7 +607,7 @@ void ARMAsmPrinter::emitAttributes() { std::string CPUString = STI.getCPUString(); - if (CPUString != "generic") { + if (CPUString.find("generic") != 0) { //CPUString doesn't start with "generic" // FIXME: remove krait check when GNU tools support krait cpu if (STI.isKrait()) { ATS.emitTextAttribute(ARMBuildAttrs::CPU_name, "cortex-a9"); @@ -723,7 +661,8 @@ void ARMAsmPrinter::emitAttributes() { // Emit Tag_Advanced_SIMD_arch for ARMv8 architecture if (STI.hasV8Ops()) ATS.emitAttribute(ARMBuildAttrs::Advanced_SIMD_arch, - ARMBuildAttrs::AllowNeonARMv8); + STI.hasV8_1a() ? ARMBuildAttrs::AllowNeonARMv8_1a: + ARMBuildAttrs::AllowNeonARMv8); } else { if (STI.hasFPARMv8()) // FPv5 and FP-ARMv8 have the same instructions, so are modeled as one @@ -960,10 +899,7 @@ EmitMachineConstantPoolValue(MachineConstantPoolValue *MCPV) { MCSymbol *MCSym; if (ACPV->isLSDA()) { - SmallString<128> Str; - raw_svector_ostream OS(Str); - OS << DL->getPrivateGlobalPrefix() << "_LSDA_" << getFunctionNumber(); - MCSym = OutContext.GetOrCreateSymbol(OS.str()); + MCSym = getCurExceptionSym(); } else if (ACPV->isBlockAddress()) { const BlockAddress *BA = cast<ARMConstantPoolConstant>(ACPV)->getBlockAddress(); diff --git a/lib/Target/ARM/ARMAsmPrinter.h b/lib/Target/ARM/ARMAsmPrinter.h index 50cb954..e475ae4 100644 --- a/lib/Target/ARM/ARMAsmPrinter.h +++ b/lib/Target/ARM/ARMAsmPrinter.h @@ -103,13 +103,16 @@ private: const MachineInstr *MI); public: - unsigned getISAEncoding(const Function *F) override { + unsigned getISAEncoding() override { // ARM/Darwin adds ISA to the DWARF info for each function. Triple TT(TM.getTargetTriple()); if (!TT.isOSBinFormatMachO()) return 0; - const ARMSubtarget &STI = TM.getSubtarget<ARMSubtarget>(*F); - return STI.isThumb() ? 
ARM::DW_ISA_ARM_thumb : ARM::DW_ISA_ARM_arm; + bool isThumb = TT.getArch() == Triple::thumb || + TT.getArch() == Triple::thumbeb || + TT.getSubArch() == Triple::ARMSubArch_v7m || + TT.getSubArch() == Triple::ARMSubArch_v6m; + return isThumb ? ARM::DW_ISA_ARM_thumb : ARM::DW_ISA_ARM_arm; } private: diff --git a/lib/Target/ARM/ARMBaseInstrInfo.cpp b/lib/Target/ARM/ARMBaseInstrInfo.cpp index 29ee22e..7ee3cb0 100644 --- a/lib/Target/ARM/ARMBaseInstrInfo.cpp +++ b/lib/Target/ARM/ARMBaseInstrInfo.cpp @@ -37,6 +37,7 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/raw_ostream.h" using namespace llvm; @@ -4115,19 +4116,21 @@ enum ARMExeDomain { // std::pair<uint16_t, uint16_t> ARMBaseInstrInfo::getExecutionDomain(const MachineInstr *MI) const { - // VMOVD, VMOVRS and VMOVSR are VFP instructions, but can be changed to NEON - // if they are not predicated. - if (MI->getOpcode() == ARM::VMOVD && !isPredicated(MI)) - return std::make_pair(ExeVFP, (1<<ExeVFP) | (1<<ExeNEON)); - - // CortexA9 is particularly picky about mixing the two and wants these - // converted. - if (Subtarget.isCortexA9() && !isPredicated(MI) && - (MI->getOpcode() == ARM::VMOVRS || - MI->getOpcode() == ARM::VMOVSR || - MI->getOpcode() == ARM::VMOVS)) - return std::make_pair(ExeVFP, (1<<ExeVFP) | (1<<ExeNEON)); - + // If we don't have access to NEON instructions then we won't be able + // to swizzle anything to the NEON domain. Check to make sure. + if (Subtarget.hasNEON()) { + // VMOVD, VMOVRS and VMOVSR are VFP instructions, but can be changed to NEON + // if they are not predicated. + if (MI->getOpcode() == ARM::VMOVD && !isPredicated(MI)) + return std::make_pair(ExeVFP, (1 << ExeVFP) | (1 << ExeNEON)); + + // CortexA9 is particularly picky about mixing the two and wants these + // converted. + if (Subtarget.isCortexA9() && !isPredicated(MI) && + (MI->getOpcode() == ARM::VMOVRS || MI->getOpcode() == ARM::VMOVSR || + MI->getOpcode() == ARM::VMOVS)) + return std::make_pair(ExeVFP, (1 << ExeVFP) | (1 << ExeNEON)); + } // No other instructions can be swizzled, so just determine their domain. unsigned Domain = MI->getDesc().TSFlags & ARMII::DomainMask; @@ -4220,6 +4223,9 @@ ARMBaseInstrInfo::setExecutionDomain(MachineInstr *MI, unsigned Domain) const { // Zap the predicate operands. assert(!isPredicated(MI) && "Cannot predicate a VORRd"); + // Make sure we've got NEON instructions. 
+ assert(Subtarget.hasNEON() && "VORRd requires NEON"); + // Source instruction is %DDst = VMOVD %DSrc, 14, %noreg (; implicits) DstReg = MI->getOperand(0).getReg(); SrcReg = MI->getOperand(1).getReg(); @@ -4507,7 +4513,7 @@ breakPartialRegDependency(MachineBasicBlock::iterator MI, } bool ARMBaseInstrInfo::hasNOP() const { - return (Subtarget.getFeatureBits() & ARM::HasV6T2Ops) != 0; + return (Subtarget.getFeatureBits() & ARM::HasV6KOps) != 0; } bool ARMBaseInstrInfo::isSwiftFastImmShift(const MachineInstr *MI) const { diff --git a/lib/Target/ARM/ARMBaseRegisterInfo.cpp b/lib/Target/ARM/ARMBaseRegisterInfo.cpp index 7574727..a8c7657 100644 --- a/lib/Target/ARM/ARMBaseRegisterInfo.cpp +++ b/lib/Target/ARM/ARMBaseRegisterInfo.cpp @@ -45,26 +45,27 @@ using namespace llvm; -ARMBaseRegisterInfo::ARMBaseRegisterInfo(const ARMSubtarget &sti) - : ARMGenRegisterInfo(ARM::LR, 0, 0, ARM::PC), STI(sti), BasePtr(ARM::R6) { +ARMBaseRegisterInfo::ARMBaseRegisterInfo() + : ARMGenRegisterInfo(ARM::LR, 0, 0, ARM::PC), BasePtr(ARM::R6) {} + +static unsigned getFramePointerReg(const ARMSubtarget &STI) { if (STI.isTargetMachO()) { if (STI.isTargetDarwin() || STI.isThumb1Only()) - FramePtr = ARM::R7; + return ARM::R7; else - FramePtr = ARM::R11; + return ARM::R11; } else if (STI.isTargetWindows()) - FramePtr = ARM::R11; + return ARM::R11; else // ARM EABI - FramePtr = STI.isThumb() ? ARM::R7 : ARM::R11; + return STI.isThumb() ? ARM::R7 : ARM::R11; } const MCPhysReg* ARMBaseRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { + const ARMSubtarget &STI = MF->getSubtarget<ARMSubtarget>(); const MCPhysReg *RegList = STI.isTargetDarwin() ? CSR_iOS_SaveList : CSR_AAPCS_SaveList; - if (!MF) return RegList; - const Function *F = MF->getFunction(); if (F->getCallingConv() == CallingConv::GHC) { // GHC set of callee saved regs is empty as all those regs are @@ -89,8 +90,10 @@ ARMBaseRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { return RegList; } -const uint32_t* -ARMBaseRegisterInfo::getCallPreservedMask(CallingConv::ID CC) const { +const uint32_t * +ARMBaseRegisterInfo::getCallPreservedMask(const MachineFunction &MF, + CallingConv::ID CC) const { + const ARMSubtarget &STI = MF.getSubtarget<ARMSubtarget>(); if (CC == CallingConv::GHC) // This is academic becase all GHC calls are (supposed to be) tail calls return CSR_NoRegs_RegMask; @@ -102,8 +105,10 @@ ARMBaseRegisterInfo::getNoPreservedMask() const { return CSR_NoRegs_RegMask; } -const uint32_t* -ARMBaseRegisterInfo::getThisReturnPreservedMask(CallingConv::ID CC) const { +const uint32_t * +ARMBaseRegisterInfo::getThisReturnPreservedMask(const MachineFunction &MF, + CallingConv::ID CC) const { + const ARMSubtarget &STI = MF.getSubtarget<ARMSubtarget>(); // This should return a register mask that is the same as that returned by // getCallPreservedMask but that additionally preserves the register used for // the first i32 argument (which must also be the register used to return a @@ -121,7 +126,8 @@ ARMBaseRegisterInfo::getThisReturnPreservedMask(CallingConv::ID CC) const { BitVector ARMBaseRegisterInfo:: getReservedRegs(const MachineFunction &MF) const { - const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering(); + const ARMSubtarget &STI = MF.getSubtarget<ARMSubtarget>(); + const TargetFrameLowering *TFI = STI.getFrameLowering(); // FIXME: avoid re-calculating this every time. 
BitVector Reserved(getNumRegs()); @@ -130,7 +136,7 @@ getReservedRegs(const MachineFunction &MF) const { Reserved.set(ARM::FPSCR); Reserved.set(ARM::APSR_NZCV); if (TFI->hasFP(MF)) - Reserved.set(FramePtr); + Reserved.set(getFramePointerReg(STI)); if (hasBasePointer(MF)) Reserved.set(BasePtr); // Some targets reserve R9. @@ -150,9 +156,9 @@ getReservedRegs(const MachineFunction &MF) const { return Reserved; } -const TargetRegisterClass* -ARMBaseRegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC) - const { +const TargetRegisterClass * +ARMBaseRegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC, + const MachineFunction &) const { const TargetRegisterClass *Super = RC; TargetRegisterClass::sc_iterator I = RC->getSuperClasses(); do { @@ -187,7 +193,8 @@ ARMBaseRegisterInfo::getCrossCopyRegClass(const TargetRegisterClass *RC) const { unsigned ARMBaseRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, MachineFunction &MF) const { - const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering(); + const ARMSubtarget &STI = MF.getSubtarget<ARMSubtarget>(); + const TargetFrameLowering *TFI = STI.getFrameLowering(); switch (RC->getID()) { default: @@ -283,29 +290,6 @@ ARMBaseRegisterInfo::updateRegAllocHint(unsigned Reg, unsigned NewReg, } } -bool -ARMBaseRegisterInfo::avoidWriteAfterWrite(const TargetRegisterClass *RC) const { - // CortexA9 has a Write-after-write hazard for NEON registers. - if (!STI.isLikeA9()) - return false; - - switch (RC->getID()) { - case ARM::DPRRegClassID: - case ARM::DPR_8RegClassID: - case ARM::DPR_VFP2RegClassID: - case ARM::QPRRegClassID: - case ARM::QPR_8RegClassID: - case ARM::QPR_VFP2RegClassID: - case ARM::SPRRegClassID: - case ARM::SPR_8RegClassID: - // Avoid reusing S, D, and Q registers. - // Don't increase register pressure for QQ and QQQQ. - return true; - default: - return false; - } -} - bool ARMBaseRegisterInfo::hasBasePointer(const MachineFunction &MF) const { const MachineFrameInfo *MFI = MF.getFrameInfo(); const ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); @@ -350,7 +334,7 @@ bool ARMBaseRegisterInfo::canRealignStack(const MachineFunction &MF) const { return false; // Stack realignment requires a frame pointer. If we already started // register allocation with frame pointer elimination, it is too late now. - if (!MRI->canReserveReg(FramePtr)) + if (!MRI->canReserveReg(getFramePointerReg(MF.getSubtarget<ARMSubtarget>()))) return false; // We may also need a base pointer if there are dynamic allocas or stack // pointer adjustments around calls. @@ -384,10 +368,11 @@ cannotEliminateFrame(const MachineFunction &MF) const { unsigned ARMBaseRegisterInfo::getFrameRegister(const MachineFunction &MF) const { - const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering(); + const ARMSubtarget &STI = MF.getSubtarget<ARMSubtarget>(); + const TargetFrameLowering *TFI = STI.getFrameLowering(); if (TFI->hasFP(MF)) - return FramePtr; + return getFramePointerReg(STI); return ARM::SP; } @@ -539,7 +524,6 @@ needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const { // The incoming offset is relating to the SP at the start of the function, // but when we access the local it'll be relative to the SP after local // allocation, so adjust our SP-relative offset by that allocation size. - Offset = -Offset; Offset += MFI->getLocalFrameSize(); // Assume that we'll have at least some spill slots allocated. // FIXME: This is a total SWAG number. 
We should run some statistics @@ -552,9 +536,8 @@ needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const { // on whether there are any local variables that would trigger it. unsigned StackAlign = TFI->getStackAlignment(); if (TFI->hasFP(MF) && - (MI->getDesc().TSFlags & ARMII::AddrModeMask) != ARMII::AddrModeT1_s && !((MFI->getLocalFrameMaxAlign() > StackAlign) && canRealignStack(MF))) { - if (isFrameOffsetLegal(MI, FPOffset)) + if (isFrameOffsetLegal(MI, getFrameRegister(MF), FPOffset)) return false; } // If we can reference via the stack pointer, try that. @@ -562,7 +545,7 @@ needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const { // to only disallow SP relative references in the live range of // the VLA(s). In practice, it's unclear how much difference that // would make, but it may be worth doing. - if (!MFI->hasVarSizedObjects() && isFrameOffsetLegal(MI, Offset)) + if (!MFI->hasVarSizedObjects() && isFrameOffsetLegal(MI, ARM::SP, Offset)) return false; // The offset likely isn't legal, we want to allocate a virtual base register. @@ -625,7 +608,7 @@ void ARMBaseRegisterInfo::resolveFrameIndex(MachineInstr &MI, unsigned BaseReg, (void)Done; } -bool ARMBaseRegisterInfo::isFrameOffsetLegal(const MachineInstr *MI, +bool ARMBaseRegisterInfo::isFrameOffsetLegal(const MachineInstr *MI, unsigned BaseReg, int64_t Offset) const { const MCInstrDesc &Desc = MI->getDesc(); unsigned AddrMode = (Desc.TSFlags & ARMII::AddrModeMask); @@ -669,7 +652,7 @@ bool ARMBaseRegisterInfo::isFrameOffsetLegal(const MachineInstr *MI, NumBits = 8; break; case ARMII::AddrModeT1_s: - NumBits = 8; + NumBits = (BaseReg == ARM::SP ? 8 : 5); Scale = 4; isSigned = false; break; diff --git a/lib/Target/ARM/ARMBaseRegisterInfo.h b/lib/Target/ARM/ARMBaseRegisterInfo.h index 17027c2..fdc1ef9 100644 --- a/lib/Target/ARM/ARMBaseRegisterInfo.h +++ b/lib/Target/ARM/ARMBaseRegisterInfo.h @@ -21,10 +21,6 @@ #include "ARMGenRegisterInfo.inc" namespace llvm { - class ARMSubtarget; - class ARMBaseInstrInfo; - class Type; - /// Register allocation hints. namespace ARMRI { enum { @@ -82,27 +78,22 @@ static inline bool isCalleeSavedRegister(unsigned Reg, class ARMBaseRegisterInfo : public ARMGenRegisterInfo { protected: - const ARMSubtarget &STI; - - /// FramePtr - ARM physical register used as frame ptr. - unsigned FramePtr; - /// BasePtr - ARM physical register used as a base ptr in complex stack /// frames. I.e., when we need a 3rd base, not just SP and FP, due to /// variable size stack objects. unsigned BasePtr; // Can be only subclassed. - explicit ARMBaseRegisterInfo(const ARMSubtarget &STI); + explicit ARMBaseRegisterInfo(); // Return the opcode that implements 'Op', or 0 if no opcode unsigned getOpcode(int Op) const; public: /// Code Generation virtual methods... 
- const MCPhysReg * - getCalleeSavedRegs(const MachineFunction *MF = nullptr) const override; - const uint32_t *getCallPreservedMask(CallingConv::ID) const override; + const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF) const override; + const uint32_t *getCallPreservedMask(const MachineFunction &MF, + CallingConv::ID) const override; const uint32_t *getNoPreservedMask() const; /// getThisReturnPreservedMask - Returns a call preserved mask specific to the @@ -113,7 +104,8 @@ public: /// /// Should return NULL in the case that the calling convention does not have /// this property - const uint32_t *getThisReturnPreservedMask(CallingConv::ID) const; + const uint32_t *getThisReturnPreservedMask(const MachineFunction &MF, + CallingConv::ID) const; BitVector getReservedRegs(const MachineFunction &MF) const override; @@ -124,7 +116,8 @@ public: getCrossCopyRegClass(const TargetRegisterClass *RC) const override; const TargetRegisterClass * - getLargestLegalSuperClass(const TargetRegisterClass *RC) const override; + getLargestLegalSuperClass(const TargetRegisterClass *RC, + const MachineFunction &MF) const override; unsigned getRegPressureLimit(const TargetRegisterClass *RC, MachineFunction &MF) const override; @@ -138,8 +131,6 @@ public: void updateRegAllocHint(unsigned Reg, unsigned NewReg, MachineFunction &MF) const override; - bool avoidWriteAfterWrite(const TargetRegisterClass *RC) const override; - bool hasBasePointer(const MachineFunction &MF) const; bool canRealignStack(const MachineFunction &MF) const; @@ -152,7 +143,7 @@ public: int64_t Offset) const override; void resolveFrameIndex(MachineInstr &MI, unsigned BaseReg, int64_t Offset) const override; - bool isFrameOffsetLegal(const MachineInstr *MI, + bool isFrameOffsetLegal(const MachineInstr *MI, unsigned BaseReg, int64_t Offset) const override; bool cannotEliminateFrame(const MachineFunction &MF) const; diff --git a/lib/Target/ARM/ARMFastISel.cpp b/lib/Target/ARM/ARMFastISel.cpp index 375d394..9c8d228 100644 --- a/lib/Target/ARM/ARMFastISel.cpp +++ b/lib/Target/ARM/ARMFastISel.cpp @@ -2265,7 +2265,7 @@ bool ARMFastISel::ARMEmitLibcall(const Instruction *I, RTLIB::Libcall Call) { // Add a register mask with the call-preserved registers. // Proper defs for return values will be added by setPhysRegsDeadExcept(). - MIB.addRegMask(TRI.getCallPreservedMask(CC)); + MIB.addRegMask(TRI.getCallPreservedMask(*FuncInfo.MF, CC)); // Finish off the call including any return values. SmallVector<unsigned, 4> UsedRegs; @@ -2416,7 +2416,7 @@ bool ARMFastISel::SelectCall(const Instruction *I, // Add a register mask with the call-preserved registers. // Proper defs for return values will be added by setPhysRegsDeadExcept(). - MIB.addRegMask(TRI.getCallPreservedMask(CC)); + MIB.addRegMask(TRI.getCallPreservedMask(*FuncInfo.MF, CC)); // Finish off the call including any return values. 
SmallVector<unsigned, 4> UsedRegs; diff --git a/lib/Target/ARM/ARMFrameLowering.cpp b/lib/Target/ARM/ARMFrameLowering.cpp index 5a5bd57..830953b 100644 --- a/lib/Target/ARM/ARMFrameLowering.cpp +++ b/lib/Target/ARM/ARMFrameLowering.cpp @@ -293,7 +293,7 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF) const { "This emitPrologue does not support Thumb1!"); bool isARM = !AFI->isThumbFunction(); unsigned Align = STI.getFrameLowering()->getStackAlignment(); - unsigned ArgRegsSaveSize = AFI->getArgRegsSaveSize(Align); + unsigned ArgRegsSaveSize = AFI->getArgRegsSaveSize(); unsigned NumBytes = MFI->getStackSize(); const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo(); DebugLoc dl = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc(); @@ -742,8 +742,7 @@ void ARMFrameLowering::emitEpilogue(MachineFunction &MF, "This emitEpilogue does not support Thumb1!"); bool isARM = !AFI->isThumbFunction(); - unsigned Align = STI.getFrameLowering()->getStackAlignment(); - unsigned ArgRegsSaveSize = AFI->getArgRegsSaveSize(Align); + unsigned ArgRegsSaveSize = AFI->getArgRegsSaveSize(); int NumBytes = (int)MFI->getStackSize(); unsigned FramePtr = RegInfo->getFrameRegister(MF); diff --git a/lib/Target/ARM/ARMISelDAGToDAG.cpp b/lib/Target/ARM/ARMISelDAGToDAG.cpp index 6ebf640..44cd1ef 100644 --- a/lib/Target/ARM/ARMISelDAGToDAG.cpp +++ b/lib/Target/ARM/ARMISelDAGToDAG.cpp @@ -257,7 +257,7 @@ private: /// SelectInlineAsmMemoryOperand - Implement addressing mode selection for /// inline asm expressions. - bool SelectInlineAsmMemoryOperand(const SDValue &Op, char ConstraintCode, + bool SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintID, std::vector<SDValue> &OutOps) override; // Form pairs of consecutive R, S, D, or Q registers. @@ -3086,7 +3086,7 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { // Store exclusive double return a i32 value which is the return status // of the issued store. - EVT ResTys[] = { MVT::i32, MVT::Other }; + const EVT ResTys[] = {MVT::i32, MVT::Other}; bool isThumb = Subtarget->isThumb() && Subtarget->hasThumb2(); // Place arguments in the right order. @@ -3472,9 +3472,10 @@ SDNode *ARMDAGToDAGISel::SelectInlineAsm(SDNode *N){ bool ARMDAGToDAGISel:: -SelectInlineAsmMemoryOperand(const SDValue &Op, char ConstraintCode, +SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintID, std::vector<SDValue> &OutOps) { - assert(ConstraintCode == 'm' && "unexpected asm memory constraint"); + assert(ConstraintID == InlineAsm::Constraint_m && + "unexpected asm memory constraint"); // Require the address to be in a register. That is safe for all ARM // variants and it is hard to do anything much smarter without knowing // how the operand is used. 
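For context on the SelectInlineAsmMemoryOperand change above: targets now receive an unsigned InlineAsm constraint enumerator (InlineAsm::Constraint_m) rather than the raw character 'm'. The sketch below is illustrative only, not part of this patch; "MyDAGToDAGISel" is a hypothetical class name, and the body simply mirrors the register-fallback behavior the ARM override keeps using (push the address operand and return false for success, true for an unsupported constraint).

bool MyDAGToDAGISel::SelectInlineAsmMemoryOperand(
    const SDValue &Op, unsigned ConstraintID, std::vector<SDValue> &OutOps) {
  switch (ConstraintID) {
  default:
    return true; // Returning true reports the constraint as unsupported.
  case InlineAsm::Constraint_m:
    // Simplest legal lowering: require the address to be in a register,
    // as the ARM implementation above does for all its variants.
    OutOps.push_back(Op);
    return false;
  }
}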
diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp index 56290aa..3b1b8dd 100644 --- a/lib/Target/ARM/ARMISelLowering.cpp +++ b/lib/Target/ARM/ARMISelLowering.cpp @@ -23,6 +23,7 @@ #include "MCTargetDesc/ARMAddressingModes.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/StringSwitch.h" #include "llvm/CodeGen/CallingConvLower.h" #include "llvm/CodeGen/IntrinsicLowering.h" #include "llvm/CodeGen/MachineBasicBlock.h" @@ -40,6 +41,7 @@ #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/Type.h" #include "llvm/MC/MCSectionMachO.h" @@ -47,6 +49,7 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" +#include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetOptions.h" #include <utility> using namespace llvm; @@ -568,14 +571,12 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setTargetDAGCombine(ISD::LOAD); // It is legal to extload from v4i8 to v4i16 or v4i32. - MVT Tys[6] = {MVT::v8i8, MVT::v4i8, MVT::v2i8, - MVT::v4i16, MVT::v2i16, - MVT::v2i32}; - for (unsigned i = 0; i < 6; ++i) { + for (MVT Ty : {MVT::v8i8, MVT::v4i8, MVT::v2i8, MVT::v4i16, MVT::v2i16, + MVT::v2i32}) { for (MVT VT : MVT::integer_vector_valuetypes()) { - setLoadExtAction(ISD::EXTLOAD, VT, Tys[i], Legal); - setLoadExtAction(ISD::ZEXTLOAD, VT, Tys[i], Legal); - setLoadExtAction(ISD::SEXTLOAD, VT, Tys[i], Legal); + setLoadExtAction(ISD::EXTLOAD, VT, Ty, Legal); + setLoadExtAction(ISD::ZEXTLOAD, VT, Ty, Legal); + setLoadExtAction(ISD::SEXTLOAD, VT, Ty, Legal); } } } @@ -614,6 +615,12 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setOperationAction(ISD::FRINT, MVT::f64, Expand); setOperationAction(ISD::FNEARBYINT, MVT::f64, Expand); setOperationAction(ISD::FFLOOR, MVT::f64, Expand); + setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom); + setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom); + setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom); + setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom); + setOperationAction(ISD::FP_TO_SINT, MVT::f64, Custom); + setOperationAction(ISD::FP_TO_UINT, MVT::f64, Custom); setOperationAction(ISD::FP_ROUND, MVT::f32, Custom); setOperationAction(ISD::FP_EXTEND, MVT::f64, Custom); } @@ -869,14 +876,6 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, // Various VFP goodness if (!TM.Options.UseSoftFloat && !Subtarget->isThumb1Only()) { - // int <-> fp are custom expanded into bit_convert + ARMISD ops. - if (Subtarget->hasVFP2()) { - setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom); - setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom); - setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom); - setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom); - } - // FP-ARMv8 adds f64 <-> f16 conversion. Before that it should be expanded. 
if (!Subtarget->hasFPARMv8() || Subtarget->isFPOnlySP()) { setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand); @@ -1033,11 +1032,6 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const { case ARMISD::RBIT: return "ARMISD::RBIT"; - case ARMISD::FTOSI: return "ARMISD::FTOSI"; - case ARMISD::FTOUI: return "ARMISD::FTOUI"; - case ARMISD::SITOF: return "ARMISD::SITOF"; - case ARMISD::UITOF: return "ARMISD::UITOF"; - case ARMISD::SRL_FLAG: return "ARMISD::SRL_FLAG"; case ARMISD::SRA_FLAG: return "ARMISD::SRA_FLAG"; case ARMISD::RRX: return "ARMISD::RRX"; @@ -1164,6 +1158,20 @@ const TargetRegisterClass *ARMTargetLowering::getRegClassFor(MVT VT) const { return TargetLowering::getRegClassFor(VT); } +// memcpy, and other memory intrinsics, typically tries to use LDM/STM if the +// source/dest is aligned and the copy size is large enough. We therefore want +// to align such objects passed to memory intrinsics. +bool ARMTargetLowering::shouldAlignPointerArgs(CallInst *CI, unsigned &MinSize, + unsigned &PrefAlign) const { + if (!isa<MemIntrinsic>(CI)) + return false; + MinSize = 8; + // On ARM11 onwards (excluding M class) 8-byte aligned LDM is typically 1 + // cycle faster than 4-byte aligned LDM. + PrefAlign = (Subtarget->hasV6Ops() && !Subtarget->isMClass() ? 8 : 4); + return true; +} + // Create a fast isel object. FastISel * ARMTargetLowering::createFastISel(FunctionLoweringInfo &funcInfo, @@ -1815,16 +1823,16 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, const ARMBaseRegisterInfo *ARI = Subtarget->getRegisterInfo(); if (isThisReturn) { // For 'this' returns, use the R0-preserving mask if applicable - Mask = ARI->getThisReturnPreservedMask(CallConv); + Mask = ARI->getThisReturnPreservedMask(MF, CallConv); if (!Mask) { // Set isThisReturn to false if the calling convention is not one that // allows 'returned' to be modeled in this way, so LowerCallResult does // not try to pass 'this' straight through isThisReturn = false; - Mask = ARI->getCallPreservedMask(CallConv); + Mask = ARI->getCallPreservedMask(MF, CallConv); } } else - Mask = ARI->getCallPreservedMask(CallConv); + Mask = ARI->getCallPreservedMask(MF, CallConv); assert(Mask && "Missing call preserved mask for calling convention"); Ops.push_back(DAG.getRegisterMask(Mask)); @@ -1857,60 +1865,61 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, /// on the stack. Remember the next parameter register to allocate, /// and then confiscate the rest of the parameter registers to insure /// this. -void -ARMTargetLowering::HandleByVal( - CCState *State, unsigned &size, unsigned Align) const { - unsigned reg = State->AllocateReg(GPRArgRegs); +void ARMTargetLowering::HandleByVal(CCState *State, unsigned &Size, + unsigned Align) const { assert((State->getCallOrPrologue() == Prologue || State->getCallOrPrologue() == Call) && "unhandled ParmContext"); - if ((ARM::R0 <= reg) && (reg <= ARM::R3)) { - if (Subtarget->isAAPCS_ABI() && Align > 4) { - unsigned AlignInRegs = Align / 4; - unsigned Waste = (ARM::R4 - reg) % AlignInRegs; - for (unsigned i = 0; i < Waste; ++i) - reg = State->AllocateReg(GPRArgRegs); - } - if (reg != 0) { - unsigned excess = 4 * (ARM::R4 - reg); - - // Special case when NSAA != SP and parameter size greater than size of - // all remained GPR regs. In that case we can't split parameter, we must - // send it to stack. We also must set NCRN to R4, so waste all - // remained registers. 
- const unsigned NSAAOffset = State->getNextStackOffset(); - if (Subtarget->isAAPCS_ABI() && NSAAOffset != 0 && size > excess) { - while (State->AllocateReg(GPRArgRegs)) - ; - return; - } + // Byval (as with any stack) slots are always at least 4-byte aligned. + Align = std::max(Align, 4U); - // First register for byval parameter is the first register that wasn't - // allocated before this method call, so it would be "reg". - // If parameter is small enough to be saved in range [reg, r4), then - // the end (first after last) register would be reg + param-size-in-regs, - // else parameter would be splitted between registers and stack, - // end register would be r4 in this case. - unsigned ByValRegBegin = reg; - unsigned ByValRegEnd = (size < excess) ? reg + size/4 : (unsigned)ARM::R4; - State->addInRegsParamInfo(ByValRegBegin, ByValRegEnd); - // Note, first register is allocated in the beginning of function already, - // allocate remained amount of registers we need. - for (unsigned i = reg+1; i != ByValRegEnd; ++i) - State->AllocateReg(GPRArgRegs); - // A byval parameter that is split between registers and memory needs its - // size truncated here. - // In the case where the entire structure fits in registers, we set the - // size in memory to zero. - if (size < excess) - size = 0; - else - size -= excess; - } + unsigned Reg = State->AllocateReg(GPRArgRegs); + if (!Reg) + return; + + unsigned AlignInRegs = Align / 4; + unsigned Waste = (ARM::R4 - Reg) % AlignInRegs; + for (unsigned i = 0; i < Waste; ++i) + Reg = State->AllocateReg(GPRArgRegs); + + if (!Reg) + return; + + unsigned Excess = 4 * (ARM::R4 - Reg); + + // Special case when NSAA != SP and the parameter size is greater than the + // size of all remaining GPR regs. In that case we can't split the parameter; + // we must send it to the stack. We also must set NCRN to R4, so waste all + // remaining registers. + const unsigned NSAAOffset = State->getNextStackOffset(); + if (NSAAOffset != 0 && Size > Excess) { + while (State->AllocateReg(GPRArgRegs)) + ; + return; } + + // The first register for a byval parameter is the first register that + // wasn't allocated before this method call, so it would be "Reg". + // If the parameter is small enough to be saved in the range [Reg, r4), then + // the end (first after last) register would be Reg + param-size-in-regs; + // else the parameter would be split between registers and the stack, and + // the end register would be r4 in this case. + unsigned ByValRegBegin = Reg; + unsigned ByValRegEnd = std::min<unsigned>(Reg + Size / 4, ARM::R4); + State->addInRegsParamInfo(ByValRegBegin, ByValRegEnd); + // Note, the first register was already allocated at the beginning of the + // function, so allocate only the remaining registers we need. + for (unsigned i = Reg + 1; i != ByValRegEnd; ++i) + State->AllocateReg(GPRArgRegs); + // A byval parameter that is split between registers and memory needs its + // size truncated here. + // In the case where the entire structure fits in registers, we set the + // size in memory to zero. + Size = std::max<int>(Size - Excess, 0); } + /// MatchingStackOffset - Return true if the given stack call argument is /// already available in the same position (relatively) of the caller's /// incoming argument stack.
@@ -1991,7 +2000,7 @@ ARMTargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, if (isCalleeStructRet || isCallerStructRet) return false; - // FIXME: Completely disable sibcall for Thumb1 since Thumb1RegisterInfo:: + // FIXME: Completely disable sibcall for Thumb1 since ThumbRegisterInfo:: // emitEpilogue is not ready for them. Thumb tail calls also use t2B, as // the Thumb1 16-bit unconditional branch doesn't have sufficient relocation // support in the assembler and linker to be used. This would need to be @@ -2819,50 +2828,6 @@ ARMTargetLowering::GetF64FormalArgument(CCValAssign &VA, CCValAssign &NextVA, return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, ArgValue, ArgValue2); } -void -ARMTargetLowering::computeRegArea(CCState &CCInfo, MachineFunction &MF, - unsigned InRegsParamRecordIdx, - unsigned ArgSize, - unsigned &ArgRegsSize, - unsigned &ArgRegsSaveSize) - const { - unsigned NumGPRs; - if (InRegsParamRecordIdx < CCInfo.getInRegsParamsCount()) { - unsigned RBegin, REnd; - CCInfo.getInRegsParamInfo(InRegsParamRecordIdx, RBegin, REnd); - NumGPRs = REnd - RBegin; - } else { - unsigned int firstUnalloced; - firstUnalloced = CCInfo.getFirstUnallocated(GPRArgRegs); - NumGPRs = (firstUnalloced <= 3) ? (4 - firstUnalloced) : 0; - } - - unsigned Align = Subtarget->getFrameLowering()->getStackAlignment(); - ArgRegsSize = NumGPRs * 4; - - // If parameter is split between stack and GPRs... - if (NumGPRs && Align > 4 && - (ArgRegsSize < ArgSize || - InRegsParamRecordIdx >= CCInfo.getInRegsParamsCount())) { - // Add padding for part of param recovered from GPRs. For example, - // if Align == 8, its last byte must be at address K*8 - 1. - // We need to do it, since remained (stack) part of parameter has - // stack alignment, and we need to "attach" "GPRs head" without gaps - // to it: - // Stack: - // |---- 8 bytes block ----| |---- 8 bytes block ----| |---- 8 bytes... - // [ [padding] [GPRs head] ] [ Tail passed via stack .... - // - ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); - unsigned Padding = - OffsetToAlignment(ArgRegsSize + AFI->getArgRegsSaveSize(), Align); - ArgRegsSaveSize = ArgRegsSize + Padding; - } else - // We don't need to extend regs save size for byval parameters if they - // are passed via GPRs only. - ArgRegsSaveSize = ArgRegsSize; -} - // The remaining GPRs hold either the beginning of variable-argument // data, or the beginning of an aggregate passed by value (usually // byval). Either way, we allocate stack slots adjacent to the data @@ -2876,13 +2841,8 @@ ARMTargetLowering::StoreByValRegs(CCState &CCInfo, SelectionDAG &DAG, SDLoc dl, SDValue &Chain, const Value *OrigArg, unsigned InRegsParamRecordIdx, - unsigned OffsetFromOrigArg, - unsigned ArgOffset, - unsigned ArgSize, - bool ForceMutable, - unsigned ByValStoreOffset, - unsigned TotalArgRegsSaveSize) const { - + int ArgOffset, + unsigned ArgSize) const { // Currently, two use-cases possible: // Case #1. Non-var-args function, and we meet first byval parameter. 
// Setup first unallocated register as first byval register; @@ -2897,82 +2857,39 @@ ARMTargetLowering::StoreByValRegs(CCState &CCInfo, SelectionDAG &DAG, MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo *MFI = MF.getFrameInfo(); ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); - unsigned firstRegToSaveIndex, lastRegToSaveIndex; unsigned RBegin, REnd; if (InRegsParamRecordIdx < CCInfo.getInRegsParamsCount()) { CCInfo.getInRegsParamInfo(InRegsParamRecordIdx, RBegin, REnd); - firstRegToSaveIndex = RBegin - ARM::R0; - lastRegToSaveIndex = REnd - ARM::R0; } else { - firstRegToSaveIndex = CCInfo.getFirstUnallocated(GPRArgRegs); - lastRegToSaveIndex = 4; - } - - unsigned ArgRegsSize, ArgRegsSaveSize; - computeRegArea(CCInfo, MF, InRegsParamRecordIdx, ArgSize, - ArgRegsSize, ArgRegsSaveSize); - - // Store any by-val regs to their spots on the stack so that they may be - // loaded by deferencing the result of formal parameter pointer or va_next. - // Note: once stack area for byval/varargs registers - // was initialized, it can't be initialized again. - if (ArgRegsSaveSize) { - unsigned Padding = ArgRegsSaveSize - ArgRegsSize; - - if (Padding) { - assert(AFI->getStoredByValParamsPadding() == 0 && - "The only parameter may be padded."); - AFI->setStoredByValParamsPadding(Padding); - } - - int FrameIndex = MFI->CreateFixedObject(ArgRegsSaveSize, - Padding + - ByValStoreOffset - - (int64_t)TotalArgRegsSaveSize, - false); - SDValue FIN = DAG.getFrameIndex(FrameIndex, getPointerTy()); - if (Padding) { - MFI->CreateFixedObject(Padding, - ArgOffset + ByValStoreOffset - - (int64_t)ArgRegsSaveSize, - false); - } - - SmallVector<SDValue, 4> MemOps; - for (unsigned i = 0; firstRegToSaveIndex < lastRegToSaveIndex; - ++firstRegToSaveIndex, ++i) { - const TargetRegisterClass *RC; - if (AFI->isThumb1OnlyFunction()) - RC = &ARM::tGPRRegClass; - else - RC = &ARM::GPRRegClass; + unsigned RBeginIdx = CCInfo.getFirstUnallocated(GPRArgRegs); + RBegin = RBeginIdx == 4 ? (unsigned)ARM::R4 : GPRArgRegs[RBeginIdx]; + REnd = ARM::R4; + } - unsigned VReg = MF.addLiveIn(GPRArgRegs[firstRegToSaveIndex], RC); - SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32); - SDValue Store = - DAG.getStore(Val.getValue(1), dl, Val, FIN, - MachinePointerInfo(OrigArg, OffsetFromOrigArg + 4*i), - false, false, 0); - MemOps.push_back(Store); - FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), FIN, - DAG.getConstant(4, getPointerTy())); - } + if (REnd != RBegin) + ArgOffset = -4 * (ARM::R4 - RBegin); - AFI->setArgRegsSaveSize(ArgRegsSaveSize + AFI->getArgRegsSaveSize()); + int FrameIndex = MFI->CreateFixedObject(ArgSize, ArgOffset, false); + SDValue FIN = DAG.getFrameIndex(FrameIndex, getPointerTy()); - if (!MemOps.empty()) - Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps); - return FrameIndex; - } else { - if (ArgSize == 0) { - // We cannot allocate a zero-byte object for the first variadic argument, - // so just make up a size. - ArgSize = 4; - } - // This will point to the next argument passed via stack. - return MFI->CreateFixedObject( - ArgSize, ArgOffset, !ForceMutable); + SmallVector<SDValue, 4> MemOps; + const TargetRegisterClass *RC = + AFI->isThumb1OnlyFunction() ? 
&ARM::tGPRRegClass : &ARM::GPRRegClass; + + for (unsigned Reg = RBegin, i = 0; Reg < REnd; ++Reg, ++i) { + unsigned VReg = MF.addLiveIn(Reg, RC); + SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32); + SDValue Store = + DAG.getStore(Val.getValue(1), dl, Val, FIN, + MachinePointerInfo(OrigArg, 4 * i), false, false, 0); + MemOps.push_back(Store); + FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), FIN, + DAG.getConstant(4, getPointerTy())); } + + if (!MemOps.empty()) + Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps); + return FrameIndex; } // Setup stack frame, the va_list pointer will start from. @@ -2990,11 +2907,9 @@ ARMTargetLowering::VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG, // the result of va_next. // If there is no regs to be stored, just point address after last // argument passed via stack. - int FrameIndex = - StoreByValRegs(CCInfo, DAG, dl, Chain, nullptr, - CCInfo.getInRegsParamsCount(), 0, ArgOffset, 0, ForceMutable, - 0, TotalArgRegsSaveSize); - + int FrameIndex = StoreByValRegs(CCInfo, DAG, dl, Chain, nullptr, + CCInfo.getInRegsParamsCount(), + CCInfo.getNextStackOffset(), 4); AFI->setVarArgsFrameIndex(FrameIndex); } @@ -3020,7 +2935,6 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain, isVarArg)); SmallVector<SDValue, 16> ArgValues; - int lastInsIndex = -1; SDValue ArgValue; Function::const_arg_iterator CurOrigArg = MF.getFunction()->arg_begin(); unsigned CurArgIdx = 0; @@ -3030,50 +2944,40 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain, // We also increase this value in case of varargs function. AFI->setArgRegsSaveSize(0); - unsigned ByValStoreOffset = 0; - unsigned TotalArgRegsSaveSize = 0; - unsigned ArgRegsSaveSizeMaxAlign = 4; - // Calculate the amount of stack space that we need to allocate to store // byval and variadic arguments that are passed in registers. // We need to know this before we allocate the first byval or variadic // argument, as they will be allocated a stack slot below the CFA (Canonical // Frame Address, the stack pointer at entry to the function). 
+ unsigned ArgRegBegin = ARM::R4; for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { + if (CCInfo.getInRegsParamsProcessed() >= CCInfo.getInRegsParamsCount()) + break; + CCValAssign &VA = ArgLocs[i]; - if (VA.isMemLoc()) { - int index = VA.getValNo(); - if (index != lastInsIndex) { - ISD::ArgFlagsTy Flags = Ins[index].Flags; - if (Flags.isByVal()) { - unsigned ExtraArgRegsSize; - unsigned ExtraArgRegsSaveSize; - computeRegArea(CCInfo, MF, CCInfo.getInRegsParamsProcessed(), - Flags.getByValSize(), - ExtraArgRegsSize, ExtraArgRegsSaveSize); - - TotalArgRegsSaveSize += ExtraArgRegsSaveSize; - if (Flags.getByValAlign() > ArgRegsSaveSizeMaxAlign) - ArgRegsSaveSizeMaxAlign = Flags.getByValAlign(); - CCInfo.nextInRegsParam(); - } - lastInsIndex = index; - } - } + unsigned Index = VA.getValNo(); + ISD::ArgFlagsTy Flags = Ins[Index].Flags; + if (!Flags.isByVal()) + continue; + + assert(VA.isMemLoc() && "unexpected byval pointer in reg"); + unsigned RBegin, REnd; + CCInfo.getInRegsParamInfo(CCInfo.getInRegsParamsProcessed(), RBegin, REnd); + ArgRegBegin = std::min(ArgRegBegin, RBegin); + + CCInfo.nextInRegsParam(); } CCInfo.rewindByValRegsInfo(); - lastInsIndex = -1; + + int lastInsIndex = -1; if (isVarArg && MFI->hasVAStart()) { - unsigned ExtraArgRegsSize; - unsigned ExtraArgRegsSaveSize; - computeRegArea(CCInfo, MF, CCInfo.getInRegsParamsCount(), 0, - ExtraArgRegsSize, ExtraArgRegsSaveSize); - TotalArgRegsSaveSize += ExtraArgRegsSaveSize; + unsigned RegIdx = CCInfo.getFirstUnallocated(GPRArgRegs); + if (RegIdx != array_lengthof(GPRArgRegs)) + ArgRegBegin = std::min(ArgRegBegin, (unsigned)GPRArgRegs[RegIdx]); } - // If the arg regs save area contains N-byte aligned values, the - // bottom of it must be at least N-byte aligned. - TotalArgRegsSaveSize = RoundUpToAlignment(TotalArgRegsSaveSize, ArgRegsSaveSizeMaxAlign); - TotalArgRegsSaveSize = std::min(TotalArgRegsSaveSize, 16U); + + unsigned TotalArgRegsSaveSize = 4 * (ARM::R4 - ArgRegBegin); + AFI->setArgRegsSaveSize(TotalArgRegsSaveSize); for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { CCValAssign &VA = ArgLocs[i]; @@ -3178,18 +3082,9 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain, "Byval arguments cannot be implicit"); unsigned CurByValIndex = CCInfo.getInRegsParamsProcessed(); - ByValStoreOffset = RoundUpToAlignment(ByValStoreOffset, Flags.getByValAlign()); - int FrameIndex = StoreByValRegs( - CCInfo, DAG, dl, Chain, CurOrigArg, - CurByValIndex, - Ins[VA.getValNo()].PartOffset, - VA.getLocMemOffset(), - Flags.getByValSize(), - true /*force mutable frames*/, - ByValStoreOffset, - TotalArgRegsSaveSize); - ByValStoreOffset += Flags.getByValSize(); - ByValStoreOffset = std::min(ByValStoreOffset, 16U); + int FrameIndex = StoreByValRegs(CCInfo, DAG, dl, Chain, CurOrigArg, + CurByValIndex, VA.getLocMemOffset(), + Flags.getByValSize()); InVals.push_back(DAG.getFrameIndex(FrameIndex, getPointerTy())); CCInfo.nextInRegsParam(); } else { @@ -3894,7 +3789,6 @@ SDValue ARMTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); if (VT.isVector()) return LowerVectorFP_TO_INT(Op, DAG); - if (Subtarget->isFPOnlySP() && Op.getOperand(0).getValueType() == MVT::f64) { RTLIB::Libcall LC; if (Op.getOpcode() == ISD::FP_TO_SINT) @@ -3907,20 +3801,7 @@ SDValue ARMTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const { /*isSigned*/ false, SDLoc(Op)).first; } - SDLoc dl(Op); - unsigned Opc; - - switch (Op.getOpcode()) { - default: llvm_unreachable("Invalid opcode!"); - case ISD::FP_TO_SINT: - 
Opc = ARMISD::FTOSI; - break; - case ISD::FP_TO_UINT: - Opc = ARMISD::FTOUI; - break; - } - Op = DAG.getNode(Opc, dl, MVT::f32, Op.getOperand(0)); - return DAG.getNode(ISD::BITCAST, dl, MVT::i32, Op); + return Op; } static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) { @@ -3960,7 +3841,6 @@ SDValue ARMTargetLowering::LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); if (VT.isVector()) return LowerVectorINT_TO_FP(Op, DAG); - if (Subtarget->isFPOnlySP() && Op.getValueType() == MVT::f64) { RTLIB::Libcall LC; if (Op.getOpcode() == ISD::SINT_TO_FP) @@ -3973,21 +3853,7 @@ SDValue ARMTargetLowering::LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const { /*isSigned*/ false, SDLoc(Op)).first; } - SDLoc dl(Op); - unsigned Opc; - - switch (Op.getOpcode()) { - default: llvm_unreachable("Invalid opcode!"); - case ISD::SINT_TO_FP: - Opc = ARMISD::SITOF; - break; - case ISD::UINT_TO_FP: - Opc = ARMISD::UITOF; - break; - } - - Op = DAG.getNode(ISD::BITCAST, dl, MVT::f32, Op.getOperand(0)); - return DAG.getNode(Opc, dl, VT, Op); + return Op; } SDValue ARMTargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const { @@ -7239,16 +7105,20 @@ ARMTargetLowering::EmitStructByval(MachineInstr *MI, // Load an immediate to varEnd. unsigned varEnd = MRI.createVirtualRegister(TRC); - if (IsThumb2) { + if (Subtarget->useMovt(*MF)) { unsigned Vtmp = varEnd; if ((LoopSize & 0xFFFF0000) != 0) Vtmp = MRI.createVirtualRegister(TRC); - AddDefaultPred(BuildMI(BB, dl, TII->get(ARM::t2MOVi16), Vtmp) - .addImm(LoopSize & 0xFFFF)); + AddDefaultPred(BuildMI(BB, dl, + TII->get(IsThumb2 ? ARM::t2MOVi16 : ARM::MOVi16), + Vtmp).addImm(LoopSize & 0xFFFF)); if ((LoopSize & 0xFFFF0000) != 0) - AddDefaultPred(BuildMI(BB, dl, TII->get(ARM::t2MOVTi16), varEnd) - .addReg(Vtmp).addImm(LoopSize >> 16)); + AddDefaultPred(BuildMI(BB, dl, + TII->get(IsThumb2 ? ARM::t2MOVTi16 : ARM::MOVTi16), + varEnd) + .addReg(Vtmp) + .addImm(LoopSize >> 16)); } else { MachineConstantPool *ConstantPool = MF->getConstantPool(); Type *Int32Ty = Type::getInt32Ty(MF->getFunction()->getContext()); @@ -10076,6 +9946,28 @@ bool ARMTargetLowering::isZExtFree(SDValue Val, EVT VT2) const { return false; } +bool ARMTargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const { + EVT VT = ExtVal.getValueType(); + + if (!isTypeLegal(VT)) + return false; + + // Don't create a loadext if we can fold the extension into a wide/long + // instruction. + // If there's more than one user instruction, the loadext is desirable no + // matter what. There can be two uses by the same instruction. + if (ExtVal->use_empty() || + !ExtVal->use_begin()->isOnlyUserOf(ExtVal.getNode())) + return true; + + SDNode *U = *ExtVal->use_begin(); + if ((U->getOpcode() == ISD::ADD || U->getOpcode() == ISD::SUB || + U->getOpcode() == ISD::SHL || U->getOpcode() == ARMISD::VSHL)) + return false; + + return true; +} + bool ARMTargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const { if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy()) return false; @@ -10289,9 +10181,9 @@ bool ARMTargetLowering::isLegalAddressingMode(const AddrMode &AM, bool ARMTargetLowering::isLegalICmpImmediate(int64_t Imm) const { // Thumb2 and ARM modes can use cmn for negative immediates. 
if (!Subtarget->isThumb()) - return ARM_AM::getSOImmVal(llvm::abs64(Imm)) != -1; + return ARM_AM::getSOImmVal(std::abs(Imm)) != -1; if (Subtarget->isThumb2()) - return ARM_AM::getT2SOImmVal(llvm::abs64(Imm)) != -1; + return ARM_AM::getT2SOImmVal(std::abs(Imm)) != -1; // Thumb1 doesn't have cmn, and only 8-bit immediates. return Imm >= 0 && Imm <= 255; } @@ -10302,7 +10194,7 @@ bool ARMTargetLowering::isLegalICmpImmediate(int64_t Imm) const { /// immediate into a register. bool ARMTargetLowering::isLegalAddImmediate(int64_t Imm) const { // Same encoding for add/sub, just flip the sign. - int64_t AbsImm = llvm::abs64(Imm); + int64_t AbsImm = std::abs(Imm); if (!Subtarget->isThumb()) return ARM_AM::getSOImmVal(AbsImm) != -1; if (Subtarget->isThumb2()) @@ -11198,9 +11090,12 @@ bool ARMTargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const { // For the real atomic operations, we have ldrex/strex up to 32 bits, // and up to 64 bits on the non-M profiles -bool ARMTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { +TargetLoweringBase::AtomicRMWExpansionKind +ARMTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { unsigned Size = AI->getType()->getPrimitiveSizeInBits(); - return Size <= (Subtarget->isMClass() ? 32U : 64U); + return (Size <= (Subtarget->isMClass() ? 32U : 64U)) + ? AtomicRMWExpansionKind::LLSC + : AtomicRMWExpansionKind::None; } // This has so far only been implemented for MachO. diff --git a/lib/Target/ARM/ARMISelLowering.h b/lib/Target/ARM/ARMISelLowering.h index ec1407d..dd4c954 100644 --- a/lib/Target/ARM/ARMISelLowering.h +++ b/lib/Target/ARM/ARMISelLowering.h @@ -65,11 +65,6 @@ namespace llvm { RBIT, // ARM bitreverse instruction - FTOSI, // FP to sint within a FP register. - FTOUI, // FP to uint within a FP register. - SITOF, // sint to FP within a FP register. - UITOF, // uint to FP within a FP register. - SRL_FLAG, // V,Flag = srl_flag X -> srl X, 1 + save carry out. SRA_FLAG, // V,Flag = sra_flag X -> sra X, 1 + save carry out. RRX, // V = RRX X, Flag -> srl X, 1 + shift in carry flag. @@ -283,6 +278,8 @@ namespace llvm { using TargetLowering::isZExtFree; bool isZExtFree(SDValue Val, EVT VT2) const override; + bool isVectorLoadExtDesirable(SDValue ExtVal) const override; + bool allowTruncateForTailCall(Type *Ty1, Type *Ty2) const override; @@ -346,6 +343,12 @@ namespace llvm { std::vector<SDValue> &Ops, SelectionDAG &DAG) const override; + unsigned getInlineAsmMemConstraint( + const std::string &ConstraintCode) const override { + // FIXME: Map different constraints differently. + return InlineAsm::Constraint_m; + } + const ARMSubtarget* getSubtarget() const { return Subtarget; } @@ -360,6 +363,9 @@ namespace llvm { return true; } + bool shouldAlignPointerArgs(CallInst *CI, unsigned &MinSize, + unsigned &PrefAlign) const override; + /// createFastISel - This method returns a target specific FastISel object, /// or null if the target does not support "fast" ISel. 
FastISel *createFastISel(FunctionLoweringInfo &funcInfo, @@ -404,7 +410,8 @@ namespace llvm { bool shouldExpandAtomicLoadInIR(LoadInst *LI) const override; bool shouldExpandAtomicStoreInIR(StoreInst *SI) const override; - bool shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override; + TargetLoweringBase::AtomicRMWExpansionKind + shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override; bool useLoadStackGuardNode() const override; @@ -525,12 +532,8 @@ namespace llvm { SDLoc dl, SDValue &Chain, const Value *OrigArg, unsigned InRegsParamRecordIdx, - unsigned OffsetFromOrigArg, - unsigned ArgOffset, - unsigned ArgSize, - bool ForceMutable, - unsigned ByValStoreOffset, - unsigned TotalArgRegsSaveSize) const; + int ArgOffset, + unsigned ArgSize) const; void VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG, SDLoc dl, SDValue &Chain, @@ -538,12 +541,6 @@ namespace llvm { unsigned TotalArgRegsSaveSize, bool ForceMutable = false) const; - void computeRegArea(CCState &CCInfo, MachineFunction &MF, - unsigned InRegsParamRecordIdx, - unsigned ArgSize, - unsigned &ArgRegsSize, - unsigned &ArgRegsSaveSize) const; - SDValue LowerCall(TargetLowering::CallLoweringInfo &CLI, SmallVectorImpl<SDValue> &InVals) const override; diff --git a/lib/Target/ARM/ARMInstrFormats.td b/lib/Target/ARM/ARMInstrFormats.td index 7d27cf3..e79608d 100644 --- a/lib/Target/ARM/ARMInstrFormats.td +++ b/lib/Target/ARM/ARMInstrFormats.td @@ -983,7 +983,12 @@ class ARMV5MOPat<dag pattern, dag result> : Pat<pattern, result> { class ARMV6Pat<dag pattern, dag result> : Pat<pattern, result> { list<Predicate> Predicates = [IsARM, HasV6]; } - +class VFPPat<dag pattern, dag result> : Pat<pattern, result> { + list<Predicate> Predicates = [HasVFP2]; +} +class VFPNoNEONPat<dag pattern, dag result> : Pat<pattern, result> { + list<Predicate> Predicates = [HasVFP2, DontUseNEONForFP]; +} //===----------------------------------------------------------------------===// // Thumb Instruction Format Definitions. // diff --git a/lib/Target/ARM/ARMInstrInfo.cpp b/lib/Target/ARM/ARMInstrInfo.cpp index bc617f0..7c004c9 100644 --- a/lib/Target/ARM/ARMInstrInfo.cpp +++ b/lib/Target/ARM/ARMInstrInfo.cpp @@ -30,8 +30,7 @@ using namespace llvm; ARMInstrInfo::ARMInstrInfo(const ARMSubtarget &STI) - : ARMBaseInstrInfo(STI), RI(STI) { -} + : ARMBaseInstrInfo(STI), RI() {} /// getNoopForMachoTarget - Return the noop instruction to use for a noop. void ARMInstrInfo::getNoopForMachoTarget(MCInst &NopInst) const { @@ -146,6 +145,10 @@ namespace { return false; const ARMSubtarget &STI = static_cast<const ARMSubtarget &>(MF.getSubtarget()); + // Don't do this for Thumb1. 
+ if (STI.isThumb1Only()) + return false; + const TargetMachine &TM = MF.getTarget(); if (TM.getRelocationModel() != Reloc::PIC_) return false; diff --git a/lib/Target/ARM/ARMInstrInfo.td b/lib/Target/ARM/ARMInstrInfo.td index 126c552..c3984ca 100644 --- a/lib/Target/ARM/ARMInstrInfo.td +++ b/lib/Target/ARM/ARMInstrInfo.td @@ -199,6 +199,9 @@ def HasV6M : Predicate<"Subtarget->hasV6MOps()">, def HasV6T2 : Predicate<"Subtarget->hasV6T2Ops()">, AssemblerPredicate<"HasV6T2Ops", "armv6t2">; def NoV6T2 : Predicate<"!Subtarget->hasV6T2Ops()">; +def HasV6K : Predicate<"Subtarget->hasV6KOps()">, + AssemblerPredicate<"HasV6KOps", "armv6k">; +def NoV6K : Predicate<"!Subtarget->hasV6KOps()">; def HasV7 : Predicate<"Subtarget->hasV7Ops()">, AssemblerPredicate<"HasV7Ops", "armv7">; def HasV8 : Predicate<"Subtarget->hasV8Ops()">, @@ -223,6 +226,8 @@ def HasCrypto : Predicate<"Subtarget->hasCrypto()">, AssemblerPredicate<"FeatureCrypto", "crypto">; def HasCRC : Predicate<"Subtarget->hasCRC()">, AssemblerPredicate<"FeatureCRC", "crc">; +def HasV8_1a : Predicate<"Subtarget->hasV8_1a()">, + AssemblerPredicate<"FeatureV8_1a", "v8.1a">; def HasFP16 : Predicate<"Subtarget->hasFP16()">, AssemblerPredicate<"FeatureFP16","half-float">; def HasDivide : Predicate<"Subtarget->hasDivide()">, @@ -1835,11 +1840,11 @@ def HINT : AI<(outs), (ins imm0_239:$imm), MiscFrm, NoItinerary, let Inst{7-0} = imm; } -def : InstAlias<"nop$p", (HINT 0, pred:$p)>, Requires<[IsARM, HasV6T2]>; -def : InstAlias<"yield$p", (HINT 1, pred:$p)>, Requires<[IsARM, HasV6T2]>; -def : InstAlias<"wfe$p", (HINT 2, pred:$p)>, Requires<[IsARM, HasV6T2]>; -def : InstAlias<"wfi$p", (HINT 3, pred:$p)>, Requires<[IsARM, HasV6T2]>; -def : InstAlias<"sev$p", (HINT 4, pred:$p)>, Requires<[IsARM, HasV6T2]>; +def : InstAlias<"nop$p", (HINT 0, pred:$p)>, Requires<[IsARM, HasV6K]>; +def : InstAlias<"yield$p", (HINT 1, pred:$p)>, Requires<[IsARM, HasV6K]>; +def : InstAlias<"wfe$p", (HINT 2, pred:$p)>, Requires<[IsARM, HasV6K]>; +def : InstAlias<"wfi$p", (HINT 3, pred:$p)>, Requires<[IsARM, HasV6K]>; +def : InstAlias<"sev$p", (HINT 4, pred:$p)>, Requires<[IsARM, HasV6K]>; def : InstAlias<"sevl$p", (HINT 5, pred:$p)>, Requires<[IsARM, HasV8]>; def SEL : AI<(outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), DPFrm, NoItinerary, "sel", diff --git a/lib/Target/ARM/ARMInstrNEON.td b/lib/Target/ARM/ARMInstrNEON.td index 2a7b4b5..a6a07a8 100644 --- a/lib/Target/ARM/ARMInstrNEON.td +++ b/lib/Target/ARM/ARMInstrNEON.td @@ -2790,7 +2790,7 @@ class N3VDMulOpSL<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, imm:$lane)))))))]>; class N3VDMulOpSL16<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, string OpcodeStr, string Dt, - ValueType Ty, SDNode MulOp, SDNode ShOp> + ValueType Ty, SDPatternOperator MulOp, SDPatternOperator ShOp> : N3VLane16<0, 1, op21_20, op11_8, 1, 0, (outs DPR:$Vd), (ins DPR:$src1, DPR:$Vn, DPR_8:$Vm, VectorIndex16:$lane), @@ -2826,7 +2826,7 @@ class N3VQMulOpSL<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, class N3VQMulOpSL16<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy, - SDNode MulOp, SDNode ShOp> + SDPatternOperator MulOp, SDPatternOperator ShOp> : N3VLane16<1, 1, op21_20, op11_8, 1, 0, (outs QPR:$Vd), (ins QPR:$src1, QPR:$Vn, DPR_8:$Vm, VectorIndex16:$lane), @@ -3674,7 +3674,7 @@ multiclass N3VMulOp_QHS<bit op24, bit op23, bits<4> op11_8, bit op4, multiclass N3VMulOpSL_HS<bits<4> op11_8, InstrItinClass itinD16, InstrItinClass itinD32, InstrItinClass itinQ16, InstrItinClass 
itinQ32, - string OpcodeStr, string Dt, SDNode ShOp> { + string OpcodeStr, string Dt, SDPatternOperator ShOp> { def v4i16 : N3VDMulOpSL16<0b01, op11_8, itinD16, OpcodeStr, !strconcat(Dt, "16"), v4i16, mul, ShOp>; def v2i32 : N3VDMulOpSL<0b10, op11_8, itinD32, @@ -3711,27 +3711,38 @@ multiclass N3VIntOp_QHS<bit op24, bit op23, bits<4> op11_8, bit op4, } // Neon 3-argument intrinsics, -// element sizes of 8, 16 and 32 bits: -multiclass N3VInt3_QHS<bit op24, bit op23, bits<4> op11_8, bit op4, - InstrItinClass itinD, InstrItinClass itinQ, +// element sizes of 16 and 32 bits: +multiclass N3VInt3_HS<bit op24, bit op23, bits<4> op11_8, bit op4, + InstrItinClass itinD16, InstrItinClass itinD32, + InstrItinClass itinQ16, InstrItinClass itinQ32, string OpcodeStr, string Dt, SDPatternOperator IntOp> { // 64-bit vector types. - def v8i8 : N3VDInt3<op24, op23, 0b00, op11_8, op4, itinD, - OpcodeStr, !strconcat(Dt, "8"), v8i8, v8i8, IntOp>; - def v4i16 : N3VDInt3<op24, op23, 0b01, op11_8, op4, itinD, + def v4i16 : N3VDInt3<op24, op23, 0b01, op11_8, op4, itinD16, OpcodeStr, !strconcat(Dt, "16"), v4i16, v4i16, IntOp>; - def v2i32 : N3VDInt3<op24, op23, 0b10, op11_8, op4, itinD, + def v2i32 : N3VDInt3<op24, op23, 0b10, op11_8, op4, itinD32, OpcodeStr, !strconcat(Dt, "32"), v2i32, v2i32, IntOp>; // 128-bit vector types. - def v16i8 : N3VQInt3<op24, op23, 0b00, op11_8, op4, itinQ, - OpcodeStr, !strconcat(Dt, "8"), v16i8, v16i8, IntOp>; - def v8i16 : N3VQInt3<op24, op23, 0b01, op11_8, op4, itinQ, + def v8i16 : N3VQInt3<op24, op23, 0b01, op11_8, op4, itinQ16, OpcodeStr, !strconcat(Dt, "16"), v8i16, v8i16, IntOp>; - def v4i32 : N3VQInt3<op24, op23, 0b10, op11_8, op4, itinQ, + def v4i32 : N3VQInt3<op24, op23, 0b10, op11_8, op4, itinQ32, OpcodeStr, !strconcat(Dt, "32"), v4i32, v4i32, IntOp>; } +// element sizes of 8, 16 and 32 bits: +multiclass N3VInt3_QHS<bit op24, bit op23, bits<4> op11_8, bit op4, + InstrItinClass itinD16, InstrItinClass itinD32, + InstrItinClass itinQ16, InstrItinClass itinQ32, + string OpcodeStr, string Dt, SDPatternOperator IntOp> + :N3VInt3_HS <op24, op23, op11_8, op4, itinD16, itinD32, + itinQ16, itinQ32, OpcodeStr, Dt, IntOp>{ + // 64-bit vector types. + def v8i8 : N3VDInt3<op24, op23, 0b00, op11_8, op4, itinD16, + OpcodeStr, !strconcat(Dt, "8"), v8i8, v8i8, IntOp>; + // 128-bit vector types. 
+ def v16i8 : N3VQInt3<op24, op23, 0b00, op11_8, op4, itinQ16, + OpcodeStr, !strconcat(Dt, "8"), v16i8, v16i8, IntOp>; +} // Neon Long Multiply-Op vector operations, // element sizes of 8, 16 and 32 bits: @@ -4305,6 +4316,147 @@ defm VMLALu : N3VLMulOp_QHS<1,1,0b1000,0, IIC_VMACi16D, IIC_VMACi32D, defm VMLALsls : N3VLMulOpSL_HS<0, 0b0010, "vmlal", "s", NEONvmulls, add>; defm VMLALslu : N3VLMulOpSL_HS<1, 0b0010, "vmlal", "u", NEONvmullu, add>; +let Predicates = [HasNEON, HasV8_1a] in { + // v8.1a Neon Rounding Double Multiply-Op vector operations, + // VQRDMLAH : Vector Saturating Rounding Doubling Multiply Accumulate Long + // (Q += D * D) + defm VQRDMLAH : N3VInt3_HS<1, 0, 0b1011, 1, IIC_VMACi16D, IIC_VMACi32D, + IIC_VMACi16Q, IIC_VMACi32Q, "vqrdmlah", "s", + null_frag>; + def : Pat<(v4i16 (int_arm_neon_vqadds + (v4i16 DPR:$src1), + (v4i16 (int_arm_neon_vqrdmulh (v4i16 DPR:$Vn), + (v4i16 DPR:$Vm))))), + (v4i16 (VQRDMLAHv4i16 DPR:$src1, DPR:$Vn, DPR:$Vm))>; + def : Pat<(v2i32 (int_arm_neon_vqadds + (v2i32 DPR:$src1), + (v2i32 (int_arm_neon_vqrdmulh (v2i32 DPR:$Vn), + (v2i32 DPR:$Vm))))), + (v2i32 (VQRDMLAHv2i32 DPR:$src1, DPR:$Vn, DPR:$Vm))>; + def : Pat<(v8i16 (int_arm_neon_vqadds + (v8i16 QPR:$src1), + (v8i16 (int_arm_neon_vqrdmulh (v8i16 QPR:$Vn), + (v8i16 QPR:$Vm))))), + (v8i16 (VQRDMLAHv8i16 QPR:$src1, QPR:$Vn, QPR:$Vm))>; + def : Pat<(v4i32 (int_arm_neon_vqadds + (v4i32 QPR:$src1), + (v4i32 (int_arm_neon_vqrdmulh (v4i32 QPR:$Vn), + (v4i32 QPR:$Vm))))), + (v4i32 (VQRDMLAHv4i32 QPR:$src1, QPR:$Vn, QPR:$Vm))>; + + defm VQRDMLAHsl : N3VMulOpSL_HS<0b1110, IIC_VMACi16D, IIC_VMACi32D, + IIC_VMACi16Q, IIC_VMACi32Q, "vqrdmlah", "s", + null_frag>; + def : Pat<(v4i16 (int_arm_neon_vqadds + (v4i16 DPR:$src1), + (v4i16 (int_arm_neon_vqrdmulh + (v4i16 DPR:$Vn), + (v4i16 (NEONvduplane (v4i16 DPR_8:$Vm), + imm:$lane)))))), + (v4i16 (VQRDMLAHslv4i16 DPR:$src1, DPR:$Vn, DPR_8:$Vm, + imm:$lane))>; + def : Pat<(v2i32 (int_arm_neon_vqadds + (v2i32 DPR:$src1), + (v2i32 (int_arm_neon_vqrdmulh + (v2i32 DPR:$Vn), + (v2i32 (NEONvduplane (v2i32 DPR_VFP2:$Vm), + imm:$lane)))))), + (v2i32 (VQRDMLAHslv2i32 DPR:$src1, DPR:$Vn, DPR_VFP2:$Vm, + imm:$lane))>; + def : Pat<(v8i16 (int_arm_neon_vqadds + (v8i16 QPR:$src1), + (v8i16 (int_arm_neon_vqrdmulh + (v8i16 QPR:$src2), + (v8i16 (NEONvduplane (v8i16 QPR:$src3), + imm:$lane)))))), + (v8i16 (VQRDMLAHslv8i16 (v8i16 QPR:$src1), + (v8i16 QPR:$src2), + (v4i16 (EXTRACT_SUBREG + QPR:$src3, + (DSubReg_i16_reg imm:$lane))), + (SubReg_i16_lane imm:$lane)))>; + def : Pat<(v4i32 (int_arm_neon_vqadds + (v4i32 QPR:$src1), + (v4i32 (int_arm_neon_vqrdmulh + (v4i32 QPR:$src2), + (v4i32 (NEONvduplane (v4i32 QPR:$src3), + imm:$lane)))))), + (v4i32 (VQRDMLAHslv4i32 (v4i32 QPR:$src1), + (v4i32 QPR:$src2), + (v2i32 (EXTRACT_SUBREG + QPR:$src3, + (DSubReg_i32_reg imm:$lane))), + (SubReg_i32_lane imm:$lane)))>; + + // VQRDMLSH : Vector Saturating Rounding Doubling Multiply Subtract Long + // (Q -= D * D) + defm VQRDMLSH : N3VInt3_HS<1, 0, 0b1100, 1, IIC_VMACi16D, IIC_VMACi32D, + IIC_VMACi16Q, IIC_VMACi32Q, "vqrdmlsh", "s", + null_frag>; + def : Pat<(v4i16 (int_arm_neon_vqsubs + (v4i16 DPR:$src1), + (v4i16 (int_arm_neon_vqrdmulh (v4i16 DPR:$Vn), + (v4i16 DPR:$Vm))))), + (v4i16 (VQRDMLSHv4i16 DPR:$src1, DPR:$Vn, DPR:$Vm))>; + def : Pat<(v2i32 (int_arm_neon_vqsubs + (v2i32 DPR:$src1), + (v2i32 (int_arm_neon_vqrdmulh (v2i32 DPR:$Vn), + (v2i32 DPR:$Vm))))), + (v2i32 (VQRDMLSHv2i32 DPR:$src1, DPR:$Vn, DPR:$Vm))>; + def : Pat<(v8i16 (int_arm_neon_vqsubs + (v8i16 QPR:$src1), + (v8i16 
(int_arm_neon_vqrdmulh (v8i16 QPR:$Vn), + (v8i16 QPR:$Vm))))), + (v8i16 (VQRDMLSHv8i16 QPR:$src1, QPR:$Vn, QPR:$Vm))>; + def : Pat<(v4i32 (int_arm_neon_vqsubs + (v4i32 QPR:$src1), + (v4i32 (int_arm_neon_vqrdmulh (v4i32 QPR:$Vn), + (v4i32 QPR:$Vm))))), + (v4i32 (VQRDMLSHv4i32 QPR:$src1, QPR:$Vn, QPR:$Vm))>; + + defm VQRDMLSHsl : N3VMulOpSL_HS<0b1111, IIC_VMACi16D, IIC_VMACi32D, + IIC_VMACi16Q, IIC_VMACi32Q, "vqrdmlsh", "s", + null_frag>; + def : Pat<(v4i16 (int_arm_neon_vqsubs + (v4i16 DPR:$src1), + (v4i16 (int_arm_neon_vqrdmulh + (v4i16 DPR:$Vn), + (v4i16 (NEONvduplane (v4i16 DPR_8:$Vm), + imm:$lane)))))), + (v4i16 (VQRDMLSHslv4i16 DPR:$src1, DPR:$Vn, DPR_8:$Vm, imm:$lane))>; + def : Pat<(v2i32 (int_arm_neon_vqsubs + (v2i32 DPR:$src1), + (v2i32 (int_arm_neon_vqrdmulh + (v2i32 DPR:$Vn), + (v2i32 (NEONvduplane (v2i32 DPR_VFP2:$Vm), + imm:$lane)))))), + (v2i32 (VQRDMLSHslv2i32 DPR:$src1, DPR:$Vn, DPR_VFP2:$Vm, + imm:$lane))>; + def : Pat<(v8i16 (int_arm_neon_vqsubs + (v8i16 QPR:$src1), + (v8i16 (int_arm_neon_vqrdmulh + (v8i16 QPR:$src2), + (v8i16 (NEONvduplane (v8i16 QPR:$src3), + imm:$lane)))))), + (v8i16 (VQRDMLSHslv8i16 (v8i16 QPR:$src1), + (v8i16 QPR:$src2), + (v4i16 (EXTRACT_SUBREG + QPR:$src3, + (DSubReg_i16_reg imm:$lane))), + (SubReg_i16_lane imm:$lane)))>; + def : Pat<(v4i32 (int_arm_neon_vqsubs + (v4i32 QPR:$src1), + (v4i32 (int_arm_neon_vqrdmulh + (v4i32 QPR:$src2), + (v4i32 (NEONvduplane (v4i32 QPR:$src3), + imm:$lane)))))), + (v4i32 (VQRDMLSHslv4i32 (v4i32 QPR:$src1), + (v4i32 QPR:$src2), + (v2i32 (EXTRACT_SUBREG + QPR:$src3, + (DSubReg_i32_reg imm:$lane))), + (SubReg_i32_lane imm:$lane)))>; +} // VQDMLAL : Vector Saturating Doubling Multiply Accumulate Long (Q += D * D) defm VQDMLAL : N3VLInt3_HS<0, 1, 0b1001, 0, IIC_VMACi16D, IIC_VMACi32D, "vqdmlal", "s", null_frag>; @@ -6158,6 +6310,21 @@ class N3VSMulOpPat<SDNode MulNode, SDNode OpNode, NeonI Inst> (v2f32 (COPY_TO_REGCLASS (v2f32 (IMPLICIT_DEF)), DPR_VFP2)), SPR:$b, ssub_0)), DPR_VFP2)), ssub_0)>; +class NVCVTIFPat<SDNode OpNode, NeonI Inst> + : NEONFPPat<(f32 (OpNode GPR:$a)), + (f32 (EXTRACT_SUBREG + (v2f32 (Inst + (INSERT_SUBREG + (v2f32 (IMPLICIT_DEF)), + (i32 (COPY_TO_REGCLASS GPR:$a, SPR)), ssub_0))), + ssub_0))>; +class NVCVTFIPat<SDNode OpNode, NeonI Inst> + : NEONFPPat<(i32 (OpNode SPR:$a)), + (i32 (EXTRACT_SUBREG + (v2f32 (Inst (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)), + SPR:$a, ssub_0))), + ssub_0))>; + def : N3VSPat<fadd, VADDfd>; def : N3VSPat<fsub, VSUBfd>; def : N3VSPat<fmul, VMULfd>; @@ -6173,10 +6340,22 @@ def : N2VSPat<fabs, VABSfd>; def : N2VSPat<fneg, VNEGfd>; def : N3VSPat<NEONfmax, VMAXfd>; def : N3VSPat<NEONfmin, VMINfd>; -def : N2VSPat<arm_ftosi, VCVTf2sd>; -def : N2VSPat<arm_ftoui, VCVTf2ud>; -def : N2VSPat<arm_sitof, VCVTs2fd>; -def : N2VSPat<arm_uitof, VCVTu2fd>; +def : NVCVTFIPat<fp_to_sint, VCVTf2sd>; +def : NVCVTFIPat<fp_to_uint, VCVTf2ud>; +def : NVCVTIFPat<sint_to_fp, VCVTs2fd>; +def : NVCVTIFPat<uint_to_fp, VCVTu2fd>; + +// NEON doesn't have any f64 conversions, so provide patterns to make +// sure the VFP conversions match when extracting from a vector. 
+def : VFPPat<(f64 (sint_to_fp (extractelt (v2i32 DPR:$src), imm:$lane))), + (VSITOD (EXTRACT_SUBREG DPR:$src, (SSubReg_f32_reg imm:$lane)))>; +def : VFPPat<(f64 (sint_to_fp (extractelt (v4i32 QPR:$src), imm:$lane))), + (VSITOD (EXTRACT_SUBREG QPR:$src, (SSubReg_f32_reg imm:$lane)))>; +def : VFPPat<(f64 (uint_to_fp (extractelt (v2i32 DPR:$src), imm:$lane))), + (VUITOD (EXTRACT_SUBREG DPR:$src, (SSubReg_f32_reg imm:$lane)))>; +def : VFPPat<(f64 (uint_to_fp (extractelt (v4i32 QPR:$src), imm:$lane))), + (VUITOD (EXTRACT_SUBREG QPR:$src, (SSubReg_f32_reg imm:$lane)))>; + // Prefer VMOVDRR for i32 -> f32 bitcasts, it can write all DPR registers. def : Pat<(f32 (bitconvert GPR:$a)), diff --git a/lib/Target/ARM/ARMInstrVFP.td b/lib/Target/ARM/ARMInstrVFP.td index e0a9314..afff016 100644 --- a/lib/Target/ARM/ARMInstrVFP.td +++ b/lib/Target/ARM/ARMInstrVFP.td @@ -11,16 +11,10 @@ // //===----------------------------------------------------------------------===// -def SDT_FTOI : SDTypeProfile<1, 1, [SDTCisVT<0, f32>, SDTCisFP<1>]>; -def SDT_ITOF : SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisVT<1, f32>]>; def SDT_CMPFP0 : SDTypeProfile<0, 1, [SDTCisFP<0>]>; def SDT_VMOVDRR : SDTypeProfile<1, 2, [SDTCisVT<0, f64>, SDTCisVT<1, i32>, SDTCisSameAs<1, 2>]>; -def arm_ftoui : SDNode<"ARMISD::FTOUI", SDT_FTOI>; -def arm_ftosi : SDNode<"ARMISD::FTOSI", SDT_FTOI>; -def arm_sitof : SDNode<"ARMISD::SITOF", SDT_ITOF>; -def arm_uitof : SDNode<"ARMISD::UITOF", SDT_ITOF>; def arm_fmstat : SDNode<"ARMISD::FMSTAT", SDTNone, [SDNPInGlue, SDNPOutGlue]>; def arm_cmpfp : SDNode<"ARMISD::CMPFP", SDT_ARMCmp, [SDNPOutGlue]>; def arm_cmpfp0 : SDNode<"ARMISD::CMPFPw0", SDT_CMPFP0, [SDNPOutGlue]>; @@ -633,7 +627,7 @@ multiclass vcvt_inst<string opc, bits<2> rm, def SS : ASuInp<0b11101, 0b11, 0b1100, 0b11, 0, (outs SPR:$Sd), (ins SPR:$Sm), NoItinerary, !strconcat("vcvt", opc, ".s32.f32\t$Sd, $Sm"), - [(set SPR:$Sd, (arm_ftosi (node SPR:$Sm)))]>, + []>, Requires<[HasFPARMv8]> { let Inst{17-16} = rm; } @@ -641,7 +635,7 @@ multiclass vcvt_inst<string opc, bits<2> rm, def US : ASuInp<0b11101, 0b11, 0b1100, 0b01, 0, (outs SPR:$Sd), (ins SPR:$Sm), NoItinerary, !strconcat("vcvt", opc, ".u32.f32\t$Sd, $Sm"), - [(set SPR:$Sd, (arm_ftoui (node SPR:$Sm)))]>, + []>, Requires<[HasFPARMv8]> { let Inst{17-16} = rm; } @@ -649,7 +643,7 @@ multiclass vcvt_inst<string opc, bits<2> rm, def SD : ASuInp<0b11101, 0b11, 0b1100, 0b11, 0, (outs SPR:$Sd), (ins DPR:$Dm), NoItinerary, !strconcat("vcvt", opc, ".s32.f64\t$Sd, $Dm"), - [(set SPR:$Sd, (arm_ftosi (f64 (node (f64 DPR:$Dm)))))]>, + []>, Requires<[HasFPARMv8, HasDPVFP]> { bits<5> Dm; @@ -664,7 +658,7 @@ multiclass vcvt_inst<string opc, bits<2> rm, def UD : ASuInp<0b11101, 0b11, 0b1100, 0b01, 0, (outs SPR:$Sd), (ins DPR:$Dm), NoItinerary, !strconcat("vcvt", opc, ".u32.f64\t$Sd, $Dm"), - [(set SPR:$Sd, (arm_ftoui (f64 (node (f64 DPR:$Dm)))))]>, + []>, Requires<[HasFPARMv8, HasDPVFP]> { bits<5> Dm; @@ -676,6 +670,27 @@ multiclass vcvt_inst<string opc, bits<2> rm, let Inst{8} = 1; } } + + let Predicates = [HasFPARMv8] in { + def : Pat<(i32 (fp_to_sint (node SPR:$a))), + (COPY_TO_REGCLASS + (!cast<Instruction>(NAME#"SS") SPR:$a), + GPR)>; + def : Pat<(i32 (fp_to_uint (node SPR:$a))), + (COPY_TO_REGCLASS + (!cast<Instruction>(NAME#"US") SPR:$a), + GPR)>; + } + let Predicates = [HasFPARMv8, HasDPVFP] in { + def : Pat<(i32 (fp_to_sint (node (f64 DPR:$a)))), + (COPY_TO_REGCLASS + (!cast<Instruction>(NAME#"SD") DPR:$a), + GPR)>; + def : Pat<(i32 (fp_to_uint (node (f64 DPR:$a)))), + (COPY_TO_REGCLASS + 
(!cast<Instruction>(NAME#"UD") DPR:$a), + GPR)>; + } } defm VCVTA : vcvt_inst<"a", 0b00, frnd>; @@ -980,14 +995,22 @@ class AVConv1InSs_Encode<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3, def VSITOD : AVConv1IDs_Encode<0b11101, 0b11, 0b1000, 0b1011, (outs DPR:$Dd), (ins SPR:$Sm), IIC_fpCVTID, "vcvt", ".f64.s32\t$Dd, $Sm", - [(set DPR:$Dd, (f64 (arm_sitof SPR:$Sm)))]> { + []> { let Inst{7} = 1; // s32 } +let Predicates=[HasVFP2, HasDPVFP] in { + def : VFPPat<(f64 (sint_to_fp GPR:$a)), + (VSITOD (COPY_TO_REGCLASS GPR:$a, SPR))>; + + def : VFPPat<(f64 (sint_to_fp (i32 (load addrmode5:$a)))), + (VSITOD (VLDRS addrmode5:$a))>; +} + def VSITOS : AVConv1InSs_Encode<0b11101, 0b11, 0b1000, 0b1010, (outs SPR:$Sd),(ins SPR:$Sm), IIC_fpCVTIS, "vcvt", ".f32.s32\t$Sd, $Sm", - [(set SPR:$Sd, (arm_sitof SPR:$Sm))]> { + []> { let Inst{7} = 1; // s32 // Some single precision VFP instructions may be executed on both NEON and @@ -995,17 +1018,31 @@ def VSITOS : AVConv1InSs_Encode<0b11101, 0b11, 0b1000, 0b1010, let D = VFPNeonA8Domain; } +def : VFPNoNEONPat<(f32 (sint_to_fp GPR:$a)), + (VSITOS (COPY_TO_REGCLASS GPR:$a, SPR))>; + +def : VFPNoNEONPat<(f32 (sint_to_fp (i32 (load addrmode5:$a)))), + (VSITOS (VLDRS addrmode5:$a))>; + def VUITOD : AVConv1IDs_Encode<0b11101, 0b11, 0b1000, 0b1011, (outs DPR:$Dd), (ins SPR:$Sm), IIC_fpCVTID, "vcvt", ".f64.u32\t$Dd, $Sm", - [(set DPR:$Dd, (f64 (arm_uitof SPR:$Sm)))]> { + []> { let Inst{7} = 0; // u32 } +let Predicates=[HasVFP2, HasDPVFP] in { + def : VFPPat<(f64 (uint_to_fp GPR:$a)), + (VUITOD (COPY_TO_REGCLASS GPR:$a, SPR))>; + + def : VFPPat<(f64 (uint_to_fp (i32 (load addrmode5:$a)))), + (VUITOD (VLDRS addrmode5:$a))>; +} + def VUITOS : AVConv1InSs_Encode<0b11101, 0b11, 0b1000, 0b1010, (outs SPR:$Sd), (ins SPR:$Sm), IIC_fpCVTIS, "vcvt", ".f32.u32\t$Sd, $Sm", - [(set SPR:$Sd, (arm_uitof SPR:$Sm))]> { + []> { let Inst{7} = 0; // u32 // Some single precision VFP instructions may be executed on both NEON and @@ -1013,6 +1050,12 @@ def VUITOS : AVConv1InSs_Encode<0b11101, 0b11, 0b1000, 0b1010, let D = VFPNeonA8Domain; } +def : VFPNoNEONPat<(f32 (uint_to_fp GPR:$a)), + (VUITOS (COPY_TO_REGCLASS GPR:$a, SPR))>; + +def : VFPNoNEONPat<(f32 (uint_to_fp (i32 (load addrmode5:$a)))), + (VUITOS (VLDRS addrmode5:$a))>; + // FP -> Int: class AVConv1IsD_Encode<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3, @@ -1055,14 +1098,22 @@ class AVConv1InsS_Encode<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3, def VTOSIZD : AVConv1IsD_Encode<0b11101, 0b11, 0b1101, 0b1011, (outs SPR:$Sd), (ins DPR:$Dm), IIC_fpCVTDI, "vcvt", ".s32.f64\t$Sd, $Dm", - [(set SPR:$Sd, (arm_ftosi (f64 DPR:$Dm)))]> { + []> { let Inst{7} = 1; // Z bit } +let Predicates=[HasVFP2, HasDPVFP] in { + def : VFPPat<(i32 (fp_to_sint (f64 DPR:$a))), + (COPY_TO_REGCLASS (VTOSIZD DPR:$a), GPR)>; + + def : VFPPat<(store (i32 (fp_to_sint (f64 DPR:$a))), addrmode5:$ptr), + (VSTRS (VTOSIZD DPR:$a), addrmode5:$ptr)>; +} + def VTOSIZS : AVConv1InsS_Encode<0b11101, 0b11, 0b1101, 0b1010, (outs SPR:$Sd), (ins SPR:$Sm), IIC_fpCVTSI, "vcvt", ".s32.f32\t$Sd, $Sm", - [(set SPR:$Sd, (arm_ftosi SPR:$Sm))]> { + []> { let Inst{7} = 1; // Z bit // Some single precision VFP instructions may be executed on both NEON and @@ -1070,17 +1121,31 @@ def VTOSIZS : AVConv1InsS_Encode<0b11101, 0b11, 0b1101, 0b1010, let D = VFPNeonA8Domain; } +def : VFPNoNEONPat<(i32 (fp_to_sint SPR:$a)), + (COPY_TO_REGCLASS (VTOSIZS SPR:$a), GPR)>; + +def : VFPNoNEONPat<(store (i32 (fp_to_sint (f32 SPR:$a))), addrmode5:$ptr), + (VSTRS (VTOSIZS SPR:$a), addrmode5:$ptr)>; 
+ def VTOUIZD : AVConv1IsD_Encode<0b11101, 0b11, 0b1100, 0b1011, (outs SPR:$Sd), (ins DPR:$Dm), IIC_fpCVTDI, "vcvt", ".u32.f64\t$Sd, $Dm", - [(set SPR:$Sd, (arm_ftoui (f64 DPR:$Dm)))]> { + []> { let Inst{7} = 1; // Z bit } +let Predicates=[HasVFP2, HasDPVFP] in { + def : VFPPat<(i32 (fp_to_uint (f64 DPR:$a))), + (COPY_TO_REGCLASS (VTOUIZD DPR:$a), GPR)>; + + def : VFPPat<(store (i32 (fp_to_uint (f64 DPR:$a))), addrmode5:$ptr), + (VSTRS (VTOUIZD DPR:$a), addrmode5:$ptr)>; +} + def VTOUIZS : AVConv1InsS_Encode<0b11101, 0b11, 0b1100, 0b1010, (outs SPR:$Sd), (ins SPR:$Sm), IIC_fpCVTSI, "vcvt", ".u32.f32\t$Sd, $Sm", - [(set SPR:$Sd, (arm_ftoui SPR:$Sm))]> { + []> { let Inst{7} = 1; // Z bit // Some single precision VFP instructions may be executed on both NEON and @@ -1088,6 +1153,12 @@ def VTOUIZS : AVConv1InsS_Encode<0b11101, 0b11, 0b1100, 0b1010, let D = VFPNeonA8Domain; } +def : VFPNoNEONPat<(i32 (fp_to_uint SPR:$a)), + (COPY_TO_REGCLASS (VTOUIZS SPR:$a), GPR)>; + +def : VFPNoNEONPat<(store (i32 (fp_to_uint (f32 SPR:$a))), addrmode5:$ptr), + (VSTRS (VTOUIZS SPR:$a), addrmode5:$ptr)>; + // And the Z bit '0' variants, i.e. use the rounding mode specified by FPSCR. let Uses = [FPSCR] in { // FIXME: Verify encoding after integrated assembler is working. diff --git a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp index a8d0981..eca8e28 100644 --- a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp +++ b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp @@ -19,7 +19,7 @@ #include "ARMMachineFunctionInfo.h" #include "ARMSubtarget.h" #include "MCTargetDesc/ARMAddressingModes.h" -#include "Thumb1RegisterInfo.h" +#include "ThumbRegisterInfo.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallPtrSet.h" @@ -38,6 +38,7 @@ #include "llvm/IR/Function.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetRegisterInfo.h" diff --git a/lib/Target/ARM/ARMMachineFunctionInfo.h b/lib/Target/ARM/ARMMachineFunctionInfo.h index ddfdb52..a68ab1b 100644 --- a/lib/Target/ARM/ARMMachineFunctionInfo.h +++ b/lib/Target/ARM/ARMMachineFunctionInfo.h @@ -149,11 +149,7 @@ public: unsigned getStoredByValParamsPadding() const { return StByValParamsPadding; } void setStoredByValParamsPadding(unsigned p) { StByValParamsPadding = p; } - unsigned getArgRegsSaveSize(unsigned Align = 0) const { - if (!Align) - return ArgRegsSaveSize; - return (ArgRegsSaveSize + Align - 1) & ~(Align - 1); - } + unsigned getArgRegsSaveSize() const { return ArgRegsSaveSize; } void setArgRegsSaveSize(unsigned s) { ArgRegsSaveSize = s; } unsigned getReturnRegsCount() const { return ReturnRegsCount; } diff --git a/lib/Target/ARM/ARMRegisterInfo.cpp b/lib/Target/ARM/ARMRegisterInfo.cpp index 80b4b48..e6e8cdf 100644 --- a/lib/Target/ARM/ARMRegisterInfo.cpp +++ b/lib/Target/ARM/ARMRegisterInfo.cpp @@ -16,6 +16,4 @@ using namespace llvm; void ARMRegisterInfo::anchor() { } -ARMRegisterInfo::ARMRegisterInfo(const ARMSubtarget &sti) - : ARMBaseRegisterInfo(sti) { -} +ARMRegisterInfo::ARMRegisterInfo() : ARMBaseRegisterInfo() {} diff --git a/lib/Target/ARM/ARMRegisterInfo.h b/lib/Target/ARM/ARMRegisterInfo.h index b623173..e2e650e 100644 --- a/lib/Target/ARM/ARMRegisterInfo.h +++ b/lib/Target/ARM/ARMRegisterInfo.h @@ -23,7 +23,7 @@ class ARMSubtarget; struct ARMRegisterInfo : public ARMBaseRegisterInfo { virtual void anchor(); public: - 
ARMRegisterInfo(const ARMSubtarget &STI); + ARMRegisterInfo(); }; } // end namespace llvm diff --git a/lib/Target/ARM/ARMSubtarget.cpp b/lib/Target/ARM/ARMSubtarget.cpp index 89624dd..fbec9e6 100644 --- a/lib/Target/ARM/ARMSubtarget.cpp +++ b/lib/Target/ARM/ARMSubtarget.cpp @@ -129,6 +129,7 @@ void ARMSubtarget::initializeEnvironment() { HasV5TEOps = false; HasV6Ops = false; HasV6MOps = false; + HasV6KOps = false; HasV6T2Ops = false; HasV7Ops = false; HasV8Ops = false; @@ -165,6 +166,7 @@ void ARMSubtarget::initializeEnvironment() { HasTrustZone = false; HasCrypto = false; HasCRC = false; + HasV8_1a = false; HasZeroCycleZeroing = false; AllowsUnalignedMem = false; Thumb2DSP = false; diff --git a/lib/Target/ARM/ARMSubtarget.h b/lib/Target/ARM/ARMSubtarget.h index f4deddf..f36cd5c 100644 --- a/lib/Target/ARM/ARMSubtarget.h +++ b/lib/Target/ARM/ARMSubtarget.h @@ -56,13 +56,14 @@ protected: ARMProcClassEnum ARMProcClass; /// HasV4TOps, HasV5TOps, HasV5TEOps, - /// HasV6Ops, HasV6MOps, HasV6T2Ops, HasV7Ops, HasV8Ops - + /// HasV6Ops, HasV6MOps, HasV6KOps, HasV6T2Ops, HasV7Ops, HasV8Ops - /// Specify whether target support specific ARM ISA variants. bool HasV4TOps; bool HasV5TOps; bool HasV5TEOps; bool HasV6Ops; bool HasV6MOps; + bool HasV6KOps; bool HasV6T2Ops; bool HasV7Ops; bool HasV8Ops; @@ -181,6 +182,9 @@ protected: /// HasCRC - if true, processor supports CRC instructions bool HasCRC; + /// HasV8_1a - if true, the processor has V8.1a: PAN and RDMA extensions + bool HasV8_1a; + /// If true, the instructions "vmov.i32 d0, #0" and "vmov.i32 q0, #0" are /// particularly effective at zeroing a VFP register. bool HasZeroCycleZeroing; @@ -287,6 +291,7 @@ public: bool hasV5TEOps() const { return HasV5TEOps; } bool hasV6Ops() const { return HasV6Ops; } bool hasV6MOps() const { return HasV6MOps; } + bool hasV6KOps() const { return HasV6KOps; } bool hasV6T2Ops() const { return HasV6T2Ops; } bool hasV7Ops() const { return HasV7Ops; } bool hasV8Ops() const { return HasV8Ops; } @@ -311,6 +316,7 @@ public: bool hasNEON() const { return HasNEON; } bool hasCrypto() const { return HasCrypto; } bool hasCRC() const { return HasCRC; } + bool hasV8_1a() const { return HasV8_1a; } bool hasVirtualization() const { return HasVirtualization; } bool useNEONForSinglePrecisionFP() const { return hasNEON() && UseNEONForSinglePrecisionFP; diff --git a/lib/Target/ARM/ARMTargetMachine.cpp b/lib/Target/ARM/ARMTargetMachine.cpp index a97a058..1bee1b0 100644 --- a/lib/Target/ARM/ARMTargetMachine.cpp +++ b/lib/Target/ARM/ARMTargetMachine.cpp @@ -37,6 +37,11 @@ EnableAtomicTidy("arm-atomic-cfg-tidy", cl::Hidden, " to make use of cmpxchg flow-based information"), cl::init(true)); +static cl::opt<bool> +EnableARMLoadStoreOpt("arm-load-store-opt", cl::Hidden, + cl::desc("Enable ARM load/store optimization pass"), + cl::init(true)); + extern "C" void LLVMInitializeARMTarget() { // Register the target. RegisterTargetMachine<ARMLETargetMachine> X(TheARMLETarget); @@ -105,9 +110,11 @@ computeTargetABI(const Triple &TT, StringRef CPU, return TargetABI; } -static std::string computeDataLayout(const Triple &TT, - ARMBaseTargetMachine::ARMABI ABI, +static std::string computeDataLayout(StringRef TT, StringRef CPU, + const TargetOptions &Options, bool isLittle) { + const Triple Triple(TT); + auto ABI = computeTargetABI(Triple, CPU, Options); std::string Ret = ""; if (isLittle) @@ -117,7 +124,7 @@ static std::string computeDataLayout(const Triple &TT, // Big endian. 
Ret += "E"; - Ret += DataLayout::getManglingComponent(TT); + Ret += DataLayout::getManglingComponent(Triple); // Pointers are 32 bits and aligned to 32 bits. Ret += "-p:32:32"; @@ -147,7 +154,7 @@ static std::string computeDataLayout(const Triple &TT, // The stack is 128 bit aligned on NaCl, 64 bit aligned on AAPCS and 32 bit // aligned everywhere else. - if (TT.isOSNaCl()) + if (Triple.isOSNaCl()) Ret += "-S128"; else if (ABI == ARMBaseTargetMachine::ARM_ABI_AAPCS) Ret += "-S64"; @@ -164,9 +171,9 @@ ARMBaseTargetMachine::ARMBaseTargetMachine(const Target &T, StringRef TT, const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL, bool isLittle) - : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL), + : LLVMTargetMachine(T, computeDataLayout(TT, CPU, Options, isLittle), TT, + CPU, FS, Options, RM, CM, OL), TargetABI(computeTargetABI(Triple(TT), CPU, Options)), - DL(computeDataLayout(Triple(TT), TargetABI, isLittle)), TLOF(createTLOF(Triple(getTargetTriple()))), Subtarget(TT, CPU, FS, *this, isLittle), isLittle(isLittle) { @@ -325,7 +332,7 @@ void ARMPassConfig::addIRPasses() { } bool ARMPassConfig::addPreISel() { - if (TM->getOptLevel() != CodeGenOpt::None) + if (TM->getOptLevel() == CodeGenOpt::Aggressive) // FIXME: This is using the thumb1 only constant value for // maximal global offset for merging globals. We may want // to look into using the old value for non-thumb1 code of @@ -339,32 +346,30 @@ bool ARMPassConfig::addPreISel() { bool ARMPassConfig::addInstSelector() { addPass(createARMISelDag(getARMTargetMachine(), getOptLevel())); - const ARMSubtarget *Subtarget = &getARMSubtarget(); - if (Subtarget->isTargetELF() && !Subtarget->isThumb1Only() && + if (Triple(TM->getTargetTriple()).isOSBinFormatELF() && TM->Options.EnableFastISel) addPass(createARMGlobalBaseRegPass()); return false; } void ARMPassConfig::addPreRegAlloc() { - if (getOptLevel() != CodeGenOpt::None) - addPass(createARMLoadStoreOptimizationPass(true)); - if (getOptLevel() != CodeGenOpt::None && getARMSubtarget().isCortexA9()) + if (getOptLevel() != CodeGenOpt::None) { addPass(createMLxExpansionPass()); - // Since the A15SDOptimizer pass can insert VDUP instructions, it can only be - // enabled when NEON is available. 
- if (getOptLevel() != CodeGenOpt::None && getARMSubtarget().isCortexA15() && - getARMSubtarget().hasNEON() && !DisableA15SDOptimization) { - addPass(createA15SDOptimizerPass()); + + if (EnableARMLoadStoreOpt) + addPass(createARMLoadStoreOptimizationPass(/* pre-register alloc */ true)); + + if (!DisableA15SDOptimization) + addPass(createA15SDOptimizerPass()); } } void ARMPassConfig::addPreSched2() { if (getOptLevel() != CodeGenOpt::None) { - addPass(createARMLoadStoreOptimizationPass()); + if (EnableARMLoadStoreOpt) + addPass(createARMLoadStoreOptimizationPass()); - if (getARMSubtarget().hasNEON()) - addPass(createExecutionDependencyFixPass(&ARM::DPRRegClass)); + addPass(createExecutionDependencyFixPass(&ARM::DPRRegClass)); } // Expand some pseudo instructions into multiple instructions to allow @@ -372,26 +377,21 @@ void ARMPassConfig::addPreSched2() { addPass(createARMExpandPseudoPass()); if (getOptLevel() != CodeGenOpt::None) { - if (!getARMSubtarget().isThumb1Only()) { - // in v8, IfConversion depends on Thumb instruction widths - if (getARMSubtarget().restrictIT() && - !getARMSubtarget().prefers32BitThumb()) - addPass(createThumb2SizeReductionPass()); + // in v8, IfConversion depends on Thumb instruction widths + if (getARMSubtarget().restrictIT()) + addPass(createThumb2SizeReductionPass()); + if (!getARMSubtarget().isThumb1Only()) addPass(&IfConverterID); - } } - if (getARMSubtarget().isThumb2()) - addPass(createThumb2ITBlockPass()); + addPass(createThumb2ITBlockPass()); } void ARMPassConfig::addPreEmitPass() { - if (getARMSubtarget().isThumb2()) { - if (!getARMSubtarget().prefers32BitThumb()) - addPass(createThumb2SizeReductionPass()); + addPass(createThumb2SizeReductionPass()); - // Constant island pass work on unbundled instructions. + // Constant island pass work on unbundled instructions. + if (getARMSubtarget().isThumb2()) addPass(&UnpackMachineBundlesID); - } addPass(createARMOptimizeBarriersPass()); addPass(createARMConstantIslandPass()); diff --git a/lib/Target/ARM/ARMTargetMachine.h b/lib/Target/ARM/ARMTargetMachine.h index 7f6a1ee..20ca97b 100644 --- a/lib/Target/ARM/ARMTargetMachine.h +++ b/lib/Target/ARM/ARMTargetMachine.h @@ -30,7 +30,6 @@ public: } TargetABI; protected: - const DataLayout DL; std::unique_ptr<TargetLoweringObjectFile> TLOF; ARMSubtarget Subtarget; bool isLittle; @@ -45,9 +44,8 @@ public: bool isLittle); ~ARMBaseTargetMachine() override; - const ARMSubtarget *getSubtargetImpl() const override { return &Subtarget; } + const ARMSubtarget *getSubtargetImpl() const { return &Subtarget; } const ARMSubtarget *getSubtargetImpl(const Function &F) const override; - const DataLayout *getDataLayout() const override { return &DL; } bool isLittleEndian() const { return isLittle; } /// \brief Get the TargetIRAnalysis for this target. 
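
The pass-pipeline rework above puts the ARM load/store optimizer behind the new hidden, default-on EnableARMLoadStoreOpt option instead of scheduling it unconditionally. A minimal standalone sketch of that cl::opt gating pattern follows; the flag name demo-load-store-opt and the printf stand-in for addPass are illustrative, not part of the patch.

    // Hidden, default-on flag gating one pipeline step, mirroring
    // EnableARMLoadStoreOpt. Links against LLVMSupport.
    #include "llvm/Support/CommandLine.h"
    #include <cstdio>

    static llvm::cl::opt<bool> EnableLoadStoreOpt(
        "demo-load-store-opt", llvm::cl::Hidden,
        llvm::cl::desc("Enable the demo load/store optimization step"),
        llvm::cl::init(true));

    static void addPreRegAllocPasses(bool Optimizing) {
      if (!Optimizing)
        return;
      if (EnableLoadStoreOpt) // flips off via -demo-load-store-opt=false
        std::printf("addPass(createARMLoadStoreOptimizationPass(true))\n");
    }

    int main(int argc, char **argv) {
      llvm::cl::ParseCommandLineOptions(argc, argv);
      addPreRegAllocPasses(/*Optimizing=*/true);
      return 0;
    }

Disabling the pass for triage then becomes a command-line switch rather than a rebuild, which is presumably why the flag lands together with the pipeline changes.
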
diff --git a/lib/Target/ARM/Android.mk b/lib/Target/ARM/Android.mk index 55a5775..6694b53 100644 --- a/lib/Target/ARM/Android.mk +++ b/lib/Target/ARM/Android.mk @@ -4,6 +4,7 @@ arm_codegen_TBLGEN_TABLES := \ ARMGenRegisterInfo.inc \ ARMGenInstrInfo.inc \ ARMGenCodeEmitter.inc \ + ARMGenCodeEmitter.inc \ ARMGenMCCodeEmitter.inc \ ARMGenMCPseudoLowering.inc \ ARMGenAsmWriter.inc \ @@ -41,10 +42,9 @@ arm_codegen_SRC_FILES := \ MLxExpansionPass.cpp \ Thumb1FrameLowering.cpp \ Thumb1InstrInfo.cpp \ - Thumb1RegisterInfo.cpp \ + ThumbRegisterInfo.cpp \ Thumb2ITBlockPass.cpp \ Thumb2InstrInfo.cpp \ - Thumb2RegisterInfo.cpp \ Thumb2SizeReduction.cpp # For the host diff --git a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp index 59461e8..2215efb 100644 --- a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp +++ b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp @@ -276,6 +276,9 @@ class ARMAsmParser : public MCTargetAsmParser { bool hasD16() const { return STI.getFeatureBits() & ARM::FeatureD16; } + bool hasV8_1a() const { + return STI.getFeatureBits() & ARM::FeatureV8_1a; + } void SwitchMode() { uint64_t FB = ComputeAvailableFeatures(STI.ToggleFeature(ARM::ModeThumb)); @@ -342,10 +345,10 @@ public: }; - ARMAsmParser(MCSubtargetInfo & _STI, MCAsmParser & _Parser, + ARMAsmParser(MCSubtargetInfo &STI, MCAsmParser &Parser, const MCInstrInfo &MII, const MCTargetOptions &Options) - : MCTargetAsmParser(), STI(_STI), MII(MII), UC(_Parser) { - MCAsmParserExtension::Initialize(_Parser); + : STI(STI), MII(MII), UC(Parser) { + MCAsmParserExtension::Initialize(Parser); // Cache the MCRegisterInfo. MRI = getContext().getRegisterInfo(); diff --git a/lib/Target/ARM/CMakeLists.txt b/lib/Target/ARM/CMakeLists.txt index 2530640..0b698197 100644 --- a/lib/Target/ARM/CMakeLists.txt +++ b/lib/Target/ARM/CMakeLists.txt @@ -40,10 +40,9 @@ add_llvm_target(ARMCodeGen MLxExpansionPass.cpp Thumb1FrameLowering.cpp Thumb1InstrInfo.cpp - Thumb1RegisterInfo.cpp + ThumbRegisterInfo.cpp Thumb2ITBlockPass.cpp Thumb2InstrInfo.cpp - Thumb2RegisterInfo.cpp Thumb2SizeReduction.cpp ) diff --git a/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp b/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp index 16eea33..e15323d 100644 --- a/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp +++ b/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp @@ -637,12 +637,12 @@ void ARMInstPrinter::printAddrMode5Operand(const MCInst *MI, unsigned OpNum, printRegName(O, MO1.getReg()); unsigned ImmOffs = ARM_AM::getAM5Offset(MO2.getImm()); - unsigned Op = ARM_AM::getAM5Op(MO2.getImm()); + ARM_AM::AddrOpc Op = ARM_AM::getAM5Op(MO2.getImm()); if (AlwaysPrintImm0 || ImmOffs || Op == ARM_AM::sub) { O << ", " << markup("<imm:") << "#" - << ARM_AM::getAddrOpcStr(ARM_AM::getAM5Op(MO2.getImm())) + << ARM_AM::getAddrOpcStr(Op) << ImmOffs * 4 << markup(">"); } diff --git a/lib/Target/ARM/MCTargetDesc/ARMArchName.def b/lib/Target/ARM/MCTargetDesc/ARMArchName.def index 9f007a0..96a0c1a 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMArchName.def +++ b/lib/Target/ARM/MCTargetDesc/ARMArchName.def @@ -30,6 +30,7 @@ ARM_ARCH_NAME("armv5t", ARMV5T, "5T", v5T) ARM_ARCH_NAME("armv5te", ARMV5TE, "5TE", v5TE) ARM_ARCH_NAME("armv6", ARMV6, "6", v6) ARM_ARCH_NAME("armv6j", ARMV6J, "6J", v6) +ARM_ARCH_NAME("armv6k", ARMV6K, "6K", v6K) ARM_ARCH_NAME("armv6t2", ARMV6T2, "6T2", v6T2) ARM_ARCH_NAME("armv6z", ARMV6Z, "6Z", v6KZ) ARM_ARCH_NAME("armv6zk", ARMV6ZK, "6ZK", v6KZ) @@ -43,6 +44,8 @@ ARM_ARCH_NAME("armv7-m", ARMV7M, "7-M", v7) ARM_ARCH_ALIAS("armv7m", ARMV7M) 
ARM_ARCH_NAME("armv8-a", ARMV8A, "8-A", v8) ARM_ARCH_ALIAS("armv8a", ARMV8A) +ARM_ARCH_NAME("armv8.1-a", ARMV8_1A, "8.1-A", v8) +ARM_ARCH_ALIAS("armv8.1a", ARMV8_1A) ARM_ARCH_NAME("iwmmxt", IWMMXT, "iwmmxt", v5TE) ARM_ARCH_NAME("iwmmxt2", IWMMXT2, "iwmmxt2", v5TE) diff --git a/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp b/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp index 2b65520..9648ffa 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp +++ b/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp @@ -783,6 +783,7 @@ void ARMTargetELFStreamer::emitArchDefaultAttributes() { setAttributeItem(THUMB_ISA_use, AllowThumb32, false); break; + case ARM::ARMV6K: case ARM::ARMV6Z: case ARM::ARMV6ZK: setAttributeItem(ARM_ISA_use, Allowed, false); @@ -816,6 +817,7 @@ void ARMTargetELFStreamer::emitArchDefaultAttributes() { break; case ARM::ARMV8A: + case ARM::ARMV8_1A: setAttributeItem(CPU_arch_profile, ApplicationProfile, false); setAttributeItem(ARM_ISA_use, Allowed, false); setAttributeItem(THUMB_ISA_use, AllowThumb32, false); @@ -913,9 +915,8 @@ void ARMTargetELFStreamer::emitFPUDefaultAttributes() { setAttributeItem(ARMBuildAttrs::FP_arch, ARMBuildAttrs::AllowFPARMv8A, /* OverwriteExisting= */ false); - setAttributeItem(ARMBuildAttrs::Advanced_SIMD_arch, - ARMBuildAttrs::AllowNeonARMv8, - /* OverwriteExisting= */ false); + // 'Advanced_SIMD_arch' must be emitted not here, but within + // ARMAsmPrinter::emitAttributes(), depending on hasV8Ops() and hasV8_1a() break; case ARM::SOFTVFP: @@ -1362,25 +1363,29 @@ void ARMELFStreamer::emitUnwindRaw(int64_t Offset, namespace llvm { -MCStreamer *createMCAsmStreamer(MCContext &Ctx, formatted_raw_ostream &OS, - bool isVerboseAsm, bool useDwarfDirectory, - MCInstPrinter *InstPrint, MCCodeEmitter *CE, - MCAsmBackend *TAB, bool ShowInst) { - MCStreamer *S = llvm::createAsmStreamer( - Ctx, OS, isVerboseAsm, useDwarfDirectory, InstPrint, CE, TAB, ShowInst); - new ARMTargetAsmStreamer(*S, OS, *InstPrint, isVerboseAsm); - return S; +MCTargetStreamer *createARMTargetAsmStreamer(MCStreamer &S, + formatted_raw_ostream &OS, + MCInstPrinter *InstPrint, + bool isVerboseAsm) { + return new ARMTargetAsmStreamer(S, OS, *InstPrint, isVerboseAsm); } MCTargetStreamer *createARMNullTargetStreamer(MCStreamer &S) { return new ARMTargetStreamer(S); } +MCTargetStreamer *createARMObjectTargetStreamer(MCStreamer &S, + const MCSubtargetInfo &STI) { + Triple TT(STI.getTargetTriple()); + if (TT.getObjectFormat() == Triple::ELF) + return new ARMTargetELFStreamer(S); + return new ARMTargetStreamer(S); +} + MCELFStreamer *createARMELFStreamer(MCContext &Context, MCAsmBackend &TAB, raw_ostream &OS, MCCodeEmitter *Emitter, bool RelaxAll, bool IsThumb) { ARMELFStreamer *S = new ARMELFStreamer(Context, TAB, OS, Emitter, IsThumb); - new ARMTargetELFStreamer(*S); // FIXME: This should eventually end up somewhere else where more // intelligent flag decisions can be made. For now we are just maintaining // the status quo for ARM and setting EF_ARM_EABI_VER5 as the default. 
diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp b/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp index 66a1618..caa8736 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp +++ b/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp @@ -59,6 +59,7 @@ ARMELFMCAsmInfo::ARMELFMCAsmInfo(StringRef TT) { // Exceptions handling switch (TheTriple.getOS()) { + case Triple::Bitrig: case Triple::NetBSD: ExceptionsType = ExceptionHandling::DwarfCFI; break; diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp b/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp index efbebd3..e48cabb 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp +++ b/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp @@ -441,14 +441,12 @@ public: MCCodeEmitter *llvm::createARMLEMCCodeEmitter(const MCInstrInfo &MCII, const MCRegisterInfo &MRI, - const MCSubtargetInfo &STI, MCContext &Ctx) { return new ARMMCCodeEmitter(MCII, Ctx, true); } MCCodeEmitter *llvm::createARMBEMCCodeEmitter(const MCInstrInfo &MCII, const MCRegisterInfo &MRI, - const MCSubtargetInfo &STI, MCContext &Ctx) { return new ARMMCCodeEmitter(MCII, Ctx, false); } diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCExpr.cpp b/lib/Target/ARM/MCTargetDesc/ARMMCExpr.cpp index 68d32b2..5b90de3 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMMCExpr.cpp +++ b/lib/Target/ARM/MCTargetDesc/ARMMCExpr.cpp @@ -10,6 +10,7 @@ #include "ARMMCExpr.h" #include "llvm/MC/MCAssembler.h" #include "llvm/MC/MCContext.h" +#include "llvm/MC/MCStreamer.h" using namespace llvm; #define DEBUG_TYPE "armmcexpr" diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCExpr.h b/lib/Target/ARM/MCTargetDesc/ARMMCExpr.h index 06bf6c9..2be98d2 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMMCExpr.h +++ b/lib/Target/ARM/MCTargetDesc/ARMMCExpr.h @@ -26,8 +26,8 @@ private: const VariantKind Kind; const MCExpr *Expr; - explicit ARMMCExpr(VariantKind _Kind, const MCExpr *_Expr) - : Kind(_Kind), Expr(_Expr) {} + explicit ARMMCExpr(VariantKind Kind, const MCExpr *Expr) + : Kind(Kind), Expr(Expr) {} public: /// @name Construction diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp b/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp index 8c19785..7ff7f9a 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp +++ b/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp @@ -153,6 +153,17 @@ std::string ARM_MC::ParseARMTriple(StringRef TT, StringRef CPU) { // Use CPU to figure out the exact features ARMArchFeature = "+v8"; break; + case Triple::ARMSubArch_v8_1a: + if (NoCPU) + // v8.1a: FeatureDB, FeatureFPARMv8, FeatureNEON, FeatureDSPThumb2, + // FeatureMP, FeatureHWDiv, FeatureHWDivARM, FeatureTrustZone, + // FeatureT2XtPk, FeatureCrypto, FeatureCRC, FeatureV8_1a + ARMArchFeature = "+v8.1a,+db,+fp-armv8,+neon,+t2dsp,+mp,+hwdiv,+hwdiv-arm," + "+trustzone,+t2xtpk,+crypto,+crc"; + else + // Use CPU to figure out the exact features + ARMArchFeature = "+v8.1a"; + break; case Triple::ARMSubArch_v7m: isThumb = true; if (NoCPU) @@ -195,6 +206,9 @@ std::string ARM_MC::ParseARMTriple(StringRef TT, StringRef CPU) { case Triple::ARMSubArch_v6t2: ARMArchFeature = "+v6t2"; break; + case Triple::ARMSubArch_v6k: + ARMArchFeature = "+v6k"; + break; case Triple::ARMSubArch_v6m: isThumb = true; if (NoCPU) @@ -295,27 +309,18 @@ static MCCodeGenInfo *createARMMCCodeGenInfo(StringRef TT, Reloc::Model RM, return X; } -// This is duplicated code. Refactor this. 
-static MCStreamer *createMCStreamer(const Target &T, StringRef TT, - MCContext &Ctx, MCAsmBackend &MAB, - raw_ostream &OS, MCCodeEmitter *Emitter, - const MCSubtargetInfo &STI, bool RelaxAll) { - Triple TheTriple(TT); +static MCStreamer *createELFStreamer(const Triple &T, MCContext &Ctx, + MCAsmBackend &MAB, raw_ostream &OS, + MCCodeEmitter *Emitter, bool RelaxAll) { + return createARMELFStreamer(Ctx, MAB, OS, Emitter, false, + T.getArch() == Triple::thumb); +} - switch (TheTriple.getObjectFormat()) { - default: llvm_unreachable("unsupported object format"); - case Triple::MachO: { - MCStreamer *S = createMachOStreamer(Ctx, MAB, OS, Emitter, false); - new ARMTargetStreamer(*S); - return S; - } - case Triple::COFF: - assert(TheTriple.isOSWindows() && "non-Windows ARM COFF is not supported"); - return createARMWinCOFFStreamer(Ctx, MAB, *Emitter, OS); - case Triple::ELF: - return createARMELFStreamer(Ctx, MAB, OS, Emitter, false, - TheTriple.getArch() == Triple::thumb); - } +static MCStreamer *createARMMachOStreamer(MCContext &Ctx, MCAsmBackend &MAB, + raw_ostream &OS, + MCCodeEmitter *Emitter, bool RelaxAll, + bool DWARFMustBeAtTheEnd) { + return createMachOStreamer(Ctx, MAB, OS, Emitter, false, DWARFMustBeAtTheEnd); } static MCInstPrinter *createARMMCInstPrinter(const Target &T, @@ -379,61 +384,53 @@ static MCInstrAnalysis *createARMMCInstrAnalysis(const MCInstrInfo *Info) { // Force static initialization. extern "C" void LLVMInitializeARMTargetMC() { - // Register the MC asm info. - RegisterMCAsmInfoFn X(TheARMLETarget, createARMMCAsmInfo); - RegisterMCAsmInfoFn Y(TheARMBETarget, createARMMCAsmInfo); - RegisterMCAsmInfoFn A(TheThumbLETarget, createARMMCAsmInfo); - RegisterMCAsmInfoFn B(TheThumbBETarget, createARMMCAsmInfo); - - // Register the MC codegen info. - TargetRegistry::RegisterMCCodeGenInfo(TheARMLETarget, createARMMCCodeGenInfo); - TargetRegistry::RegisterMCCodeGenInfo(TheARMBETarget, createARMMCCodeGenInfo); - TargetRegistry::RegisterMCCodeGenInfo(TheThumbLETarget, - createARMMCCodeGenInfo); - TargetRegistry::RegisterMCCodeGenInfo(TheThumbBETarget, - createARMMCCodeGenInfo); - - // Register the MC instruction info. - TargetRegistry::RegisterMCInstrInfo(TheARMLETarget, createARMMCInstrInfo); - TargetRegistry::RegisterMCInstrInfo(TheARMBETarget, createARMMCInstrInfo); - TargetRegistry::RegisterMCInstrInfo(TheThumbLETarget, createARMMCInstrInfo); - TargetRegistry::RegisterMCInstrInfo(TheThumbBETarget, createARMMCInstrInfo); - - // Register the MC register info. - TargetRegistry::RegisterMCRegInfo(TheARMLETarget, createARMMCRegisterInfo); - TargetRegistry::RegisterMCRegInfo(TheARMBETarget, createARMMCRegisterInfo); - TargetRegistry::RegisterMCRegInfo(TheThumbLETarget, createARMMCRegisterInfo); - TargetRegistry::RegisterMCRegInfo(TheThumbBETarget, createARMMCRegisterInfo); - - // Register the MC subtarget info. - TargetRegistry::RegisterMCSubtargetInfo(TheARMLETarget, - ARM_MC::createARMMCSubtargetInfo); - TargetRegistry::RegisterMCSubtargetInfo(TheARMBETarget, - ARM_MC::createARMMCSubtargetInfo); - TargetRegistry::RegisterMCSubtargetInfo(TheThumbLETarget, - ARM_MC::createARMMCSubtargetInfo); - TargetRegistry::RegisterMCSubtargetInfo(TheThumbBETarget, - ARM_MC::createARMMCSubtargetInfo); - - // Register the MC instruction analyzer. 
- TargetRegistry::RegisterMCInstrAnalysis(TheARMLETarget, - createARMMCInstrAnalysis); - TargetRegistry::RegisterMCInstrAnalysis(TheARMBETarget, - createARMMCInstrAnalysis); - TargetRegistry::RegisterMCInstrAnalysis(TheThumbLETarget, - createARMMCInstrAnalysis); - TargetRegistry::RegisterMCInstrAnalysis(TheThumbBETarget, - createARMMCInstrAnalysis); + for (Target *T : {&TheARMLETarget, &TheARMBETarget, &TheThumbLETarget, + &TheThumbBETarget}) { + // Register the MC asm info. + RegisterMCAsmInfoFn X(*T, createARMMCAsmInfo); + + // Register the MC codegen info. + TargetRegistry::RegisterMCCodeGenInfo(*T, createARMMCCodeGenInfo); + + // Register the MC instruction info. + TargetRegistry::RegisterMCInstrInfo(*T, createARMMCInstrInfo); + + // Register the MC register info. + TargetRegistry::RegisterMCRegInfo(*T, createARMMCRegisterInfo); + + // Register the MC subtarget info. + TargetRegistry::RegisterMCSubtargetInfo(*T, + ARM_MC::createARMMCSubtargetInfo); + + // Register the MC instruction analyzer. + TargetRegistry::RegisterMCInstrAnalysis(*T, createARMMCInstrAnalysis); + + TargetRegistry::RegisterELFStreamer(*T, createELFStreamer); + TargetRegistry::RegisterCOFFStreamer(*T, createARMWinCOFFStreamer); + TargetRegistry::RegisterMachOStreamer(*T, createARMMachOStreamer); + + // Register the obj target streamer. + TargetRegistry::RegisterObjectTargetStreamer(*T, + createARMObjectTargetStreamer); + + // Register the asm streamer. + TargetRegistry::RegisterAsmTargetStreamer(*T, createARMTargetAsmStreamer); + + // Register the null TargetStreamer. + TargetRegistry::RegisterNullTargetStreamer(*T, createARMNullTargetStreamer); + + // Register the MCInstPrinter. + TargetRegistry::RegisterMCInstPrinter(*T, createARMMCInstPrinter); + + // Register the MC relocation info. + TargetRegistry::RegisterMCRelocationInfo(*T, createARMMCRelocationInfo); + } // Register the MC Code Emitter - TargetRegistry::RegisterMCCodeEmitter(TheARMLETarget, - createARMLEMCCodeEmitter); - TargetRegistry::RegisterMCCodeEmitter(TheARMBETarget, - createARMBEMCCodeEmitter); - TargetRegistry::RegisterMCCodeEmitter(TheThumbLETarget, - createARMLEMCCodeEmitter); - TargetRegistry::RegisterMCCodeEmitter(TheThumbBETarget, - createARMBEMCCodeEmitter); + for (Target *T : {&TheARMLETarget, &TheThumbLETarget}) + TargetRegistry::RegisterMCCodeEmitter(*T, createARMLEMCCodeEmitter); + for (Target *T : {&TheARMBETarget, &TheThumbBETarget}) + TargetRegistry::RegisterMCCodeEmitter(*T, createARMBEMCCodeEmitter); // Register the asm backend. TargetRegistry::RegisterMCAsmBackend(TheARMLETarget, createARMLEAsmBackend); @@ -442,44 +439,4 @@ extern "C" void LLVMInitializeARMTargetMC() { createThumbLEAsmBackend); TargetRegistry::RegisterMCAsmBackend(TheThumbBETarget, createThumbBEAsmBackend); - - // Register the object streamer. - TargetRegistry::RegisterMCObjectStreamer(TheARMLETarget, createMCStreamer); - TargetRegistry::RegisterMCObjectStreamer(TheARMBETarget, createMCStreamer); - TargetRegistry::RegisterMCObjectStreamer(TheThumbLETarget, createMCStreamer); - TargetRegistry::RegisterMCObjectStreamer(TheThumbBETarget, createMCStreamer); - - // Register the asm streamer. - TargetRegistry::RegisterAsmStreamer(TheARMLETarget, createMCAsmStreamer); - TargetRegistry::RegisterAsmStreamer(TheARMBETarget, createMCAsmStreamer); - TargetRegistry::RegisterAsmStreamer(TheThumbLETarget, createMCAsmStreamer); - TargetRegistry::RegisterAsmStreamer(TheThumbBETarget, createMCAsmStreamer); - - // Register the null TargetStreamer. 
- TargetRegistry::RegisterNullTargetStreamer(TheARMLETarget, - createARMNullTargetStreamer); - TargetRegistry::RegisterNullTargetStreamer(TheARMBETarget, - createARMNullTargetStreamer); - TargetRegistry::RegisterNullTargetStreamer(TheThumbLETarget, - createARMNullTargetStreamer); - TargetRegistry::RegisterNullTargetStreamer(TheThumbBETarget, - createARMNullTargetStreamer); - - // Register the MCInstPrinter. - TargetRegistry::RegisterMCInstPrinter(TheARMLETarget, createARMMCInstPrinter); - TargetRegistry::RegisterMCInstPrinter(TheARMBETarget, createARMMCInstPrinter); - TargetRegistry::RegisterMCInstPrinter(TheThumbLETarget, - createARMMCInstPrinter); - TargetRegistry::RegisterMCInstPrinter(TheThumbBETarget, - createARMMCInstPrinter); - - // Register the MC relocation info. - TargetRegistry::RegisterMCRelocationInfo(TheARMLETarget, - createARMMCRelocationInfo); - TargetRegistry::RegisterMCRelocationInfo(TheARMBETarget, - createARMMCRelocationInfo); - TargetRegistry::RegisterMCRelocationInfo(TheThumbLETarget, - createARMMCRelocationInfo); - TargetRegistry::RegisterMCRelocationInfo(TheThumbBETarget, - createARMMCRelocationInfo); } diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h b/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h index c17e959..7e9ba66 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h +++ b/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h @@ -32,6 +32,7 @@ class MCRelocationInfo; class MCTargetStreamer; class StringRef; class Target; +class Triple; class raw_ostream; extern Target TheARMLETarget, TheThumbLETarget; @@ -47,21 +48,20 @@ namespace ARM_MC { StringRef FS); } -MCStreamer *createMCAsmStreamer(MCContext &Ctx, formatted_raw_ostream &OS, - bool isVerboseAsm, bool useDwarfDirectory, - MCInstPrinter *InstPrint, MCCodeEmitter *CE, - MCAsmBackend *TAB, bool ShowInst); - MCTargetStreamer *createARMNullTargetStreamer(MCStreamer &S); +MCTargetStreamer *createARMTargetAsmStreamer(MCStreamer &S, + formatted_raw_ostream &OS, + MCInstPrinter *InstPrint, + bool isVerboseAsm); +MCTargetStreamer *createARMObjectTargetStreamer(MCStreamer &S, + const MCSubtargetInfo &STI); MCCodeEmitter *createARMLEMCCodeEmitter(const MCInstrInfo &MCII, const MCRegisterInfo &MRI, - const MCSubtargetInfo &STI, MCContext &Ctx); MCCodeEmitter *createARMBEMCCodeEmitter(const MCInstrInfo &MCII, const MCRegisterInfo &MRI, - const MCSubtargetInfo &STI, MCContext &Ctx); MCAsmBackend *createARMAsmBackend(const Target &T, const MCRegisterInfo &MRI, @@ -80,10 +80,11 @@ MCAsmBackend *createThumbLEAsmBackend(const Target &T, const MCRegisterInfo &MRI MCAsmBackend *createThumbBEAsmBackend(const Target &T, const MCRegisterInfo &MRI, StringRef TT, StringRef CPU); -/// createARMWinCOFFStreamer - Construct a PE/COFF machine code streamer which -/// will generate a PE/COFF object file. +// Construct a PE/COFF machine code streamer which will generate a PE/COFF +// object file. MCStreamer *createARMWinCOFFStreamer(MCContext &Context, MCAsmBackend &MAB, - MCCodeEmitter &Emitter, raw_ostream &OS); + raw_ostream &OS, MCCodeEmitter *Emitter, + bool RelaxAll); /// createARMELFObjectWriter - Construct an ELF Mach-O object writer. 
MCObjectWriter *createARMELFObjectWriter(raw_ostream &OS, diff --git a/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.cpp b/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.cpp index 593fe34..173cc93 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.cpp +++ b/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.cpp @@ -72,14 +72,10 @@ void UnwindOpcodeAssembler::EmitRegSave(uint32_t RegSave) { // opcode when r4 is not in .save directive. // Compute the consecutive registers from r4 to r11. - uint32_t Range = 0; - uint32_t Mask = (1u << 4); - for (uint32_t Bit = (1u << 5); Bit < (1u << 12); Bit <<= 1) { - if ((RegSave & Bit) == 0u) - break; - ++Range; - Mask |= Bit; - } + uint32_t Mask = RegSave & 0xff0u; + uint32_t Range = countTrailingOnes(Mask >> 5); // Exclude r4. + // Mask off non-consecutive registers. Keep r4. + Mask &= ~(0xffffffe0u << Range); // Emit this opcode when the mask covers every registers. uint32_t UnmaskedReg = RegSave & 0xfff0u & (~Mask); @@ -105,50 +101,24 @@ void UnwindOpcodeAssembler::EmitRegSave(uint32_t RegSave) { /// Emit unwind opcodes for .vsave directives void UnwindOpcodeAssembler::EmitVFPRegSave(uint32_t VFPRegSave) { - size_t i = 32; - - while (i > 16) { - uint32_t Bit = 1u << (i - 1); - if ((VFPRegSave & Bit) == 0u) { - --i; - continue; - } - - uint32_t Range = 0; - - --i; - Bit >>= 1; - - while (i > 16 && (VFPRegSave & Bit)) { - --i; - ++Range; - Bit >>= 1; + // We only have 4 bits to save the offset in the opcode so look at the lower + // and upper 16 bits separately. + for (uint32_t Regs : {VFPRegSave & 0xffff0000u, VFPRegSave & 0x0000ffffu}) { + while (Regs) { + // Now look for a run of set bits. Remember the MSB and LSB of the run. + auto RangeMSB = 32 - countLeadingZeros(Regs); + auto RangeLen = countLeadingOnes(Regs << (32 - RangeMSB)); + auto RangeLSB = RangeMSB - RangeLen; + + int Opcode = RangeLSB >= 16 + ? ARM::EHABI::UNWIND_OPCODE_POP_VFP_REG_RANGE_FSTMFDD_D16 + : ARM::EHABI::UNWIND_OPCODE_POP_VFP_REG_RANGE_FSTMFDD; + + EmitInt16(Opcode | ((RangeLSB % 16) << 4) | (RangeLen - 1)); + + // Zero out bits we're done with. 
+ Regs &= ~(-1u << RangeLSB); } - - EmitInt16(ARM::EHABI::UNWIND_OPCODE_POP_VFP_REG_RANGE_FSTMFDD_D16 | - ((i - 16) << 4) | Range); - } - - while (i > 0) { - uint32_t Bit = 1u << (i - 1); - if ((VFPRegSave & Bit) == 0u) { - --i; - continue; - } - - uint32_t Range = 0; - - --i; - Bit >>= 1; - - while (i > 0 && (VFPRegSave & Bit)) { - --i; - ++Range; - Bit >>= 1; - } - - EmitInt16(ARM::EHABI::UNWIND_OPCODE_POP_VFP_REG_RANGE_FSTMFDD | (i << 4) | - Range); } } diff --git a/lib/Target/ARM/MCTargetDesc/ARMWinCOFFStreamer.cpp b/lib/Target/ARM/MCTargetDesc/ARMWinCOFFStreamer.cpp index b344ced..dc707dc 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMWinCOFFStreamer.cpp +++ b/lib/Target/ARM/MCTargetDesc/ARMWinCOFFStreamer.cpp @@ -37,10 +37,10 @@ void ARMWinCOFFStreamer::EmitThumbFunc(MCSymbol *Symbol) { } } -namespace llvm { -MCStreamer *createARMWinCOFFStreamer(MCContext &Context, MCAsmBackend &MAB, - MCCodeEmitter &Emitter, raw_ostream &OS) { - return new ARMWinCOFFStreamer(Context, MAB, Emitter, OS); -} +MCStreamer *llvm::createARMWinCOFFStreamer(MCContext &Context, + MCAsmBackend &MAB, raw_ostream &OS, + MCCodeEmitter *Emitter, + bool RelaxAll) { + return new ARMWinCOFFStreamer(Context, MAB, *Emitter, OS); } diff --git a/lib/Target/ARM/MLxExpansionPass.cpp b/lib/Target/ARM/MLxExpansionPass.cpp index 51e519d..ed2deea 100644 --- a/lib/Target/ARM/MLxExpansionPass.cpp +++ b/lib/Target/ARM/MLxExpansionPass.cpp @@ -382,6 +382,9 @@ bool MLxExpansion::runOnMachineFunction(MachineFunction &Fn) { TRI = Fn.getSubtarget().getRegisterInfo(); MRI = &Fn.getRegInfo(); const ARMSubtarget *STI = &Fn.getSubtarget<ARMSubtarget>(); + // Only run this for Cortex-A9. + if (!STI->isCortexA9()) + return false; isLikeA9 = STI->isLikeA9() || STI->isSwift(); isSwift = STI->isSwift(); diff --git a/lib/Target/ARM/README-Thumb.txt b/lib/Target/ARM/README-Thumb.txt index f4d9be3..2d031d0 100644 --- a/lib/Target/ARM/README-Thumb.txt +++ b/lib/Target/ARM/README-Thumb.txt @@ -232,7 +232,7 @@ Make use of hi register variants of cmp: tCMPhir / tCMPZhir. //===---------------------------------------------------------------------===// Thumb1 immediate field sometimes keep pre-scaled values. See -Thumb1RegisterInfo::eliminateFrameIndex. This is inconsistent from ARM and +ThumbRegisterInfo::eliminateFrameIndex. This is inconsistent with ARM and Thumb2.
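The rewritten UnwindOpcodeAssembler::EmitVFPRegSave above replaces two hand-rolled bit-walking loops with countLeadingZeros/countLeadingOnes arithmetic: each pass of the new loop finds the highest run of consecutive set bits in the register mask, emits one pop opcode for that run, and clears the run. A self-contained sketch of the run-finding logic, using plain C++ with compiler builtins standing in for LLVM's MathExtras helpers and a d2/d3/d5/d6 save mask chosen purely as an example:

// Illustration only, not part of the patch: peel runs of consecutive set
// bits off a VFP save mask, highest run first, as the new loop does.
#include <cstdint>
#include <cstdio>

static unsigned clz(uint32_t V) { return V ? __builtin_clz(V) : 32; }
static unsigned clo(uint32_t V) { return clz(~V); } // count leading ones

int main() {
  uint32_t Regs = 0x6Cu; // d2, d3, d5 and d6 are saved
  while (Regs) {
    unsigned RangeMSB = 32 - clz(Regs);               // one past the highest set bit
    unsigned RangeLen = clo(Regs << (32 - RangeMSB)); // length of the top run
    unsigned RangeLSB = RangeMSB - RangeLen;
    std::printf("pop d%u..d%u\n", RangeLSB, RangeMSB - 1); // d5..d6, then d2..d3
    Regs &= ~(~0u << RangeLSB);                       // zero out the bits just handled
  }
  return 0;
}

The real emitter additionally splits the mask into upper and lower 16-bit halves first and picks between the FSTMFDD and FSTMFDD_D16 pop opcodes based on RangeLSB, since the opcode leaves only four bits for the starting register.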
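A pattern worth noting across this commit: several subtarget-specific MachineFunction passes now bail out up front when they do not apply (Cortex-A9 for MLxExpansion above, Thumb2 for the IT-block and size-reduction passes further down) instead of relying on never being scheduled for the wrong core. A minimal sketch of the idiom; MyARMPass and the choice of predicate are placeholders, not code from this commit:

// Hypothetical pass showing the early-bailout idiom used by these hunks.
bool MyARMPass::runOnMachineFunction(MachineFunction &Fn) {
  const ARMSubtarget &STI = Fn.getSubtarget<ARMSubtarget>();
  if (!STI.isCortexA9())
    return false; // "nothing changed": the pass is a no-op elsewhere
  bool Changed = false;
  // ... the actual transformation, setting Changed as it rewrites code ...
  return Changed;
}

Returning false reports "IR unchanged", so the pass can stay registered in a statically-configured pipeline for every ARM subtarget and costs only one subtarget query per function where it does not fire.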
//===---------------------------------------------------------------------===// diff --git a/lib/Target/ARM/Thumb1FrameLowering.cpp b/lib/Target/ARM/Thumb1FrameLowering.cpp index 7dcc64e..c496cd7 100644 --- a/lib/Target/ARM/Thumb1FrameLowering.cpp +++ b/lib/Target/ARM/Thumb1FrameLowering.cpp @@ -41,7 +41,7 @@ static void emitSPUpdate(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI, const TargetInstrInfo &TII, DebugLoc dl, - const Thumb1RegisterInfo &MRI, + const ThumbRegisterInfo &MRI, int NumBytes, unsigned MIFlags = MachineInstr::NoFlags) { emitThumbRegPlusImmediate(MBB, MBBI, dl, ARM::SP, ARM::SP, NumBytes, TII, MRI, MIFlags); @@ -53,8 +53,8 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const { const Thumb1InstrInfo &TII = *static_cast<const Thumb1InstrInfo *>(STI.getInstrInfo()); - const Thumb1RegisterInfo *RegInfo = - static_cast<const Thumb1RegisterInfo *>(STI.getRegisterInfo()); + const ThumbRegisterInfo *RegInfo = + static_cast<const ThumbRegisterInfo *>(STI.getRegisterInfo()); if (!hasReservedCallFrame(MF)) { // If we have alloca, convert as follows: // ADJCALLSTACKDOWN -> sub, sp, sp, amount @@ -89,13 +89,12 @@ void Thumb1FrameLowering::emitPrologue(MachineFunction &MF) const { ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); MachineModuleInfo &MMI = MF.getMMI(); const MCRegisterInfo *MRI = MMI.getContext().getRegisterInfo(); - const Thumb1RegisterInfo *RegInfo = - static_cast<const Thumb1RegisterInfo *>(STI.getRegisterInfo()); + const ThumbRegisterInfo *RegInfo = + static_cast<const ThumbRegisterInfo *>(STI.getRegisterInfo()); const Thumb1InstrInfo &TII = *static_cast<const Thumb1InstrInfo *>(STI.getInstrInfo()); - unsigned Align = STI.getFrameLowering()->getStackAlignment(); - unsigned ArgRegsSaveSize = AFI->getArgRegsSaveSize(Align); + unsigned ArgRegsSaveSize = AFI->getArgRegsSaveSize(); unsigned NumBytes = MFI->getStackSize(); assert(NumBytes >= ArgRegsSaveSize && "ArgRegsSaveSize is included in NumBytes"); @@ -328,17 +327,16 @@ void Thumb1FrameLowering::emitEpilogue(MachineFunction &MF, DebugLoc dl = MBBI->getDebugLoc(); MachineFrameInfo *MFI = MF.getFrameInfo(); ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); - const Thumb1RegisterInfo *RegInfo = - static_cast<const Thumb1RegisterInfo *>(STI.getRegisterInfo()); + const ThumbRegisterInfo *RegInfo = + static_cast<const ThumbRegisterInfo *>(STI.getRegisterInfo()); const Thumb1InstrInfo &TII = *static_cast<const Thumb1InstrInfo *>(STI.getInstrInfo()); - unsigned Align = STI.getFrameLowering()->getStackAlignment(); - unsigned ArgRegsSaveSize = AFI->getArgRegsSaveSize(Align); + unsigned ArgRegsSaveSize = AFI->getArgRegsSaveSize(); int NumBytes = (int)MFI->getStackSize(); assert((unsigned)NumBytes >= ArgRegsSaveSize && "ArgRegsSaveSize is included in NumBytes"); - const MCPhysReg *CSRegs = RegInfo->getCalleeSavedRegs(); + const MCPhysReg *CSRegs = RegInfo->getCalleeSavedRegs(&MF); unsigned FramePtr = RegInfo->getFrameRegister(MF); if (!AFI->hasStackFrame()) { diff --git a/lib/Target/ARM/Thumb1FrameLowering.h b/lib/Target/ARM/Thumb1FrameLowering.h index b785b28..cf93203 100644 --- a/lib/Target/ARM/Thumb1FrameLowering.h +++ b/lib/Target/ARM/Thumb1FrameLowering.h @@ -16,7 +16,7 @@ #include "ARMFrameLowering.h" #include "Thumb1InstrInfo.h" -#include "Thumb1RegisterInfo.h" +#include "ThumbRegisterInfo.h" #include "llvm/Target/TargetFrameLowering.h" namespace llvm { diff --git a/lib/Target/ARM/Thumb1InstrInfo.cpp b/lib/Target/ARM/Thumb1InstrInfo.cpp 
index c24f740..29aaa15 100644 --- a/lib/Target/ARM/Thumb1InstrInfo.cpp +++ b/lib/Target/ARM/Thumb1InstrInfo.cpp @@ -22,8 +22,7 @@ using namespace llvm; Thumb1InstrInfo::Thumb1InstrInfo(const ARMSubtarget &STI) - : ARMBaseInstrInfo(STI), RI(STI) { -} + : ARMBaseInstrInfo(STI), RI() {} /// getNoopForMachoTarget - Return the noop instruction to use for a noop. void Thumb1InstrInfo::getNoopForMachoTarget(MCInst &NopInst) const { diff --git a/lib/Target/ARM/Thumb1InstrInfo.h b/lib/Target/ARM/Thumb1InstrInfo.h index 9fba760..f3f493d 100644 --- a/lib/Target/ARM/Thumb1InstrInfo.h +++ b/lib/Target/ARM/Thumb1InstrInfo.h @@ -15,13 +15,13 @@ #define LLVM_LIB_TARGET_ARM_THUMB1INSTRINFO_H #include "ARMBaseInstrInfo.h" -#include "Thumb1RegisterInfo.h" +#include "ThumbRegisterInfo.h" namespace llvm { class ARMSubtarget; class Thumb1InstrInfo : public ARMBaseInstrInfo { - Thumb1RegisterInfo RI; + ThumbRegisterInfo RI; public: explicit Thumb1InstrInfo(const ARMSubtarget &STI); @@ -36,7 +36,7 @@ public: /// such, whenever a client has an instance of instruction info, it should /// always be able to get register info as well (through this method). /// - const Thumb1RegisterInfo &getRegisterInfo() const override { return RI; } + const ThumbRegisterInfo &getRegisterInfo() const override { return RI; } void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, DebugLoc DL, diff --git a/lib/Target/ARM/Thumb2ITBlockPass.cpp b/lib/Target/ARM/Thumb2ITBlockPass.cpp index b657f2d..7bb2265 100644 --- a/lib/Target/ARM/Thumb2ITBlockPass.cpp +++ b/lib/Target/ARM/Thumb2ITBlockPass.cpp @@ -255,6 +255,8 @@ bool Thumb2ITBlockPass::InsertITInstructions(MachineBasicBlock &MBB) { bool Thumb2ITBlockPass::runOnMachineFunction(MachineFunction &Fn) { const ARMSubtarget &STI = static_cast<const ARMSubtarget &>(Fn.getSubtarget()); + if (!STI.isThumb2()) + return false; AFI = Fn.getInfo<ARMFunctionInfo>(); TII = static_cast<const Thumb2InstrInfo *>(STI.getInstrInfo()); TRI = STI.getRegisterInfo(); diff --git a/lib/Target/ARM/Thumb2InstrInfo.cpp b/lib/Target/ARM/Thumb2InstrInfo.cpp index 62c3752..26ca7e9 100644 --- a/lib/Target/ARM/Thumb2InstrInfo.cpp +++ b/lib/Target/ARM/Thumb2InstrInfo.cpp @@ -30,8 +30,7 @@ OldT2IfCvt("old-thumb2-ifcvt", cl::Hidden, cl::init(false)); Thumb2InstrInfo::Thumb2InstrInfo(const ARMSubtarget &STI) - : ARMBaseInstrInfo(STI), RI(STI) { -} + : ARMBaseInstrInfo(STI), RI() {} /// getNoopForMachoTarget - Return the noop instruction to use for a noop. void Thumb2InstrInfo::getNoopForMachoTarget(MCInst &NopInst) const { diff --git a/lib/Target/ARM/Thumb2InstrInfo.h b/lib/Target/ARM/Thumb2InstrInfo.h index 46a1f6d..916ab06 100644 --- a/lib/Target/ARM/Thumb2InstrInfo.h +++ b/lib/Target/ARM/Thumb2InstrInfo.h @@ -15,14 +15,14 @@ #define LLVM_LIB_TARGET_ARM_THUMB2INSTRINFO_H #include "ARMBaseInstrInfo.h" -#include "Thumb2RegisterInfo.h" +#include "ThumbRegisterInfo.h" namespace llvm { class ARMSubtarget; class ScheduleHazardRecognizer; class Thumb2InstrInfo : public ARMBaseInstrInfo { - Thumb2RegisterInfo RI; + ThumbRegisterInfo RI; public: explicit Thumb2InstrInfo(const ARMSubtarget &STI); @@ -60,7 +60,7 @@ public: /// such, whenever a client has an instance of instruction info, it should /// always be able to get register info as well (through this method). 
/// - const Thumb2RegisterInfo &getRegisterInfo() const override { return RI; } + const ThumbRegisterInfo &getRegisterInfo() const override { return RI; } private: void expandLoadStackGuard(MachineBasicBlock::iterator MI, diff --git a/lib/Target/ARM/Thumb2RegisterInfo.cpp b/lib/Target/ARM/Thumb2RegisterInfo.cpp deleted file mode 100644 index 0d5d85a..0000000 --- a/lib/Target/ARM/Thumb2RegisterInfo.cpp +++ /dev/null @@ -1,53 +0,0 @@ -//===-- Thumb2RegisterInfo.cpp - Thumb-2 Register Information -------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file contains the Thumb-2 implementation of the TargetRegisterInfo -// class. -// -//===----------------------------------------------------------------------===// - -#include "Thumb2RegisterInfo.h" -#include "ARM.h" -#include "ARMSubtarget.h" -#include "llvm/CodeGen/MachineConstantPool.h" -#include "llvm/CodeGen/MachineFunction.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/IR/Constants.h" -#include "llvm/IR/DerivedTypes.h" -#include "llvm/IR/Function.h" -#include "llvm/Target/TargetInstrInfo.h" -#include "llvm/Target/TargetMachine.h" -using namespace llvm; - -Thumb2RegisterInfo::Thumb2RegisterInfo(const ARMSubtarget &sti) - : ARMBaseRegisterInfo(sti) { -} - -/// emitLoadConstPool - Emits a load from constpool to materialize the -/// specified immediate. -void -Thumb2RegisterInfo::emitLoadConstPool(MachineBasicBlock &MBB, - MachineBasicBlock::iterator &MBBI, - DebugLoc dl, - unsigned DestReg, unsigned SubIdx, - int Val, - ARMCC::CondCodes Pred, unsigned PredReg, - unsigned MIFlags) const { - MachineFunction &MF = *MBB.getParent(); - const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); - MachineConstantPool *ConstantPool = MF.getConstantPool(); - const Constant *C = ConstantInt::get( - Type::getInt32Ty(MBB.getParent()->getFunction()->getContext()), Val); - unsigned Idx = ConstantPool->getConstantPoolIndex(C, 4); - - BuildMI(MBB, MBBI, dl, TII.get(ARM::t2LDRpci)) - .addReg(DestReg, getDefRegState(true), SubIdx) - .addConstantPoolIndex(Idx).addImm((int64_t)ARMCC::AL).addReg(0) - .setMIFlags(MIFlags); -} diff --git a/lib/Target/ARM/Thumb2RegisterInfo.h b/lib/Target/ARM/Thumb2RegisterInfo.h deleted file mode 100644 index 1dd94cc..0000000 --- a/lib/Target/ARM/Thumb2RegisterInfo.h +++ /dev/null @@ -1,38 +0,0 @@ -//===- Thumb2RegisterInfo.h - Thumb-2 Register Information Impl -*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file contains the Thumb-2 implementation of the TargetRegisterInfo -// class. -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIB_TARGET_ARM_THUMB2REGISTERINFO_H -#define LLVM_LIB_TARGET_ARM_THUMB2REGISTERINFO_H - -#include "ARMBaseRegisterInfo.h" - -namespace llvm { - -class ARMSubtarget; - -struct Thumb2RegisterInfo : public ARMBaseRegisterInfo { -public: - Thumb2RegisterInfo(const ARMSubtarget &STI); - - /// emitLoadConstPool - Emits a load from constpool to materialize the - /// specified immediate. 
- void - emitLoadConstPool(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI, - DebugLoc dl, unsigned DestReg, unsigned SubIdx, int Val, - ARMCC::CondCodes Pred = ARMCC::AL, unsigned PredReg = 0, - unsigned MIFlags = MachineInstr::NoFlags) const override; -}; -} - -#endif diff --git a/lib/Target/ARM/Thumb2SizeReduction.cpp b/lib/Target/ARM/Thumb2SizeReduction.cpp index 2ee908b..e967e53 100644 --- a/lib/Target/ARM/Thumb2SizeReduction.cpp +++ b/lib/Target/ARM/Thumb2SizeReduction.cpp @@ -21,6 +21,7 @@ #include "llvm/IR/Function.h" // To access Function attributes #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetMachine.h" using namespace llvm; @@ -1002,6 +1003,9 @@ bool Thumb2SizeReduce::ReduceMBB(MachineBasicBlock &MBB) { bool Thumb2SizeReduce::runOnMachineFunction(MachineFunction &MF) { STI = &static_cast<const ARMSubtarget &>(MF.getSubtarget()); + if (STI->isThumb1Only() || STI->prefers32BitThumb()) + return false; + TII = static_cast<const Thumb2InstrInfo *>(STI->getInstrInfo()); // Optimizing / minimizing size? diff --git a/lib/Target/ARM/Thumb1RegisterInfo.cpp b/lib/Target/ARM/ThumbRegisterInfo.cpp index 5e2cbdc..b5f9d7e 100644 --- a/lib/Target/ARM/Thumb1RegisterInfo.cpp +++ b/lib/Target/ARM/ThumbRegisterInfo.cpp @@ -1,4 +1,4 @@ -//===-- Thumb1RegisterInfo.cpp - Thumb-1 Register Information -------------===// +//===-- ThumbRegisterInfo.cpp - Thumb-1 Register Information -------------===// // // The LLVM Compiler Infrastructure // @@ -12,7 +12,7 @@ // //===----------------------------------------------------------------------===// -#include "Thumb1RegisterInfo.h" +#include "ThumbRegisterInfo.h" #include "ARMBaseInstrInfo.h" #include "ARMMachineFunctionInfo.h" #include "ARMSubtarget.h" @@ -38,39 +38,35 @@ extern cl::opt<bool> ReuseFrameIndexVals; using namespace llvm; -Thumb1RegisterInfo::Thumb1RegisterInfo(const ARMSubtarget &sti) - : ARMBaseRegisterInfo(sti) { -} +ThumbRegisterInfo::ThumbRegisterInfo() : ARMBaseRegisterInfo() {} + +const TargetRegisterClass * +ThumbRegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC, + const MachineFunction &MF) const { + if (!MF.getSubtarget<ARMSubtarget>().isThumb1Only()) + return ARMBaseRegisterInfo::getLargestLegalSuperClass(RC, MF); -const TargetRegisterClass* -Thumb1RegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC) - const { if (ARM::tGPRRegClass.hasSubClassEq(RC)) return &ARM::tGPRRegClass; - return ARMBaseRegisterInfo::getLargestLegalSuperClass(RC); + return ARMBaseRegisterInfo::getLargestLegalSuperClass(RC, MF); } const TargetRegisterClass * -Thumb1RegisterInfo::getPointerRegClass(const MachineFunction &MF, unsigned Kind) - const { +ThumbRegisterInfo::getPointerRegClass(const MachineFunction &MF, + unsigned Kind) const { + if (!MF.getSubtarget<ARMSubtarget>().isThumb1Only()) + return ARMBaseRegisterInfo::getPointerRegClass(MF, Kind); return &ARM::tGPRRegClass; } -/// emitLoadConstPool - Emits a load from constpool to materialize the -/// specified immediate. 
-void -Thumb1RegisterInfo::emitLoadConstPool(MachineBasicBlock &MBB, - MachineBasicBlock::iterator &MBBI, - DebugLoc dl, - unsigned DestReg, unsigned SubIdx, - int Val, - ARMCC::CondCodes Pred, unsigned PredReg, - unsigned MIFlags) const { - assert((isARMLowRegister(DestReg) || - isVirtualRegister(DestReg)) && - "Thumb1 does not have ldr to high register"); - +static void emitThumb1LoadConstPool(MachineBasicBlock &MBB, + MachineBasicBlock::iterator &MBBI, + DebugLoc dl, unsigned DestReg, + unsigned SubIdx, int Val, + ARMCC::CondCodes Pred, unsigned PredReg, + unsigned MIFlags) { MachineFunction &MF = *MBB.getParent(); + const ARMSubtarget &STI = MF.getSubtarget<ARMSubtarget>(); const TargetInstrInfo &TII = *STI.getInstrInfo(); MachineConstantPool *ConstantPool = MF.getConstantPool(); const Constant *C = ConstantInt::get( @@ -83,6 +79,42 @@ Thumb1RegisterInfo::emitLoadConstPool(MachineBasicBlock &MBB, .setMIFlags(MIFlags); } +static void emitThumb2LoadConstPool(MachineBasicBlock &MBB, + MachineBasicBlock::iterator &MBBI, + DebugLoc dl, unsigned DestReg, + unsigned SubIdx, int Val, + ARMCC::CondCodes Pred, unsigned PredReg, + unsigned MIFlags) { + MachineFunction &MF = *MBB.getParent(); + const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); + MachineConstantPool *ConstantPool = MF.getConstantPool(); + const Constant *C = ConstantInt::get( + Type::getInt32Ty(MBB.getParent()->getFunction()->getContext()), Val); + unsigned Idx = ConstantPool->getConstantPoolIndex(C, 4); + + BuildMI(MBB, MBBI, dl, TII.get(ARM::t2LDRpci)) + .addReg(DestReg, getDefRegState(true), SubIdx) + .addConstantPoolIndex(Idx).addImm((int64_t)ARMCC::AL).addReg(0) + .setMIFlags(MIFlags); +} + +/// emitLoadConstPool - Emits a load from constpool to materialize the +/// specified immediate. +void ThumbRegisterInfo::emitLoadConstPool( + MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI, DebugLoc dl, + unsigned DestReg, unsigned SubIdx, int Val, ARMCC::CondCodes Pred, + unsigned PredReg, unsigned MIFlags) const { + MachineFunction &MF = *MBB.getParent(); + const ARMSubtarget &STI = MF.getSubtarget<ARMSubtarget>(); + if (STI.isThumb1Only()) { + assert((isARMLowRegister(DestReg) || isVirtualRegister(DestReg)) && + "Thumb1 does not have ldr to high register"); + return emitThumb1LoadConstPool(MBB, MBBI, dl, DestReg, SubIdx, Val, Pred, + PredReg, MIFlags); + } + return emitThumb2LoadConstPool(MBB, MBBI, dl, DestReg, SubIdx, Val, Pred, + PredReg, MIFlags); +} /// emitThumbRegPlusImmInReg - Emits a series of instructions to materialize /// a destreg = basereg + immediate in Thumb code. 
Materialize the immediate @@ -317,12 +349,14 @@ static unsigned convertToNonSPOpcode(unsigned Opcode) { return Opcode; } -bool Thumb1RegisterInfo:: -rewriteFrameIndex(MachineBasicBlock::iterator II, unsigned FrameRegIdx, - unsigned FrameReg, int &Offset, - const ARMBaseInstrInfo &TII) const { +bool ThumbRegisterInfo::rewriteFrameIndex(MachineBasicBlock::iterator II, + unsigned FrameRegIdx, + unsigned FrameReg, int &Offset, + const ARMBaseInstrInfo &TII) const { MachineInstr &MI = *II; MachineBasicBlock &MBB = *MI.getParent(); + assert(MBB.getParent()->getSubtarget<ARMSubtarget>().isThumb1Only() && + "This isn't needed for thumb2!"); DebugLoc dl = MI.getDebugLoc(); MachineInstrBuilder MIB(*MBB.getParent(), &MI); unsigned Opcode = MI.getOpcode(); @@ -386,8 +420,13 @@ rewriteFrameIndex(MachineBasicBlock::iterator II, unsigned FrameRegIdx, return Offset == 0; } -void Thumb1RegisterInfo::resolveFrameIndex(MachineInstr &MI, unsigned BaseReg, +void ThumbRegisterInfo::resolveFrameIndex(MachineInstr &MI, unsigned BaseReg, int64_t Offset) const { + const MachineFunction &MF = *MI.getParent()->getParent(); + const ARMSubtarget &STI = MF.getSubtarget<ARMSubtarget>(); + if (!STI.isThumb1Only()) + return ARMBaseRegisterInfo::resolveFrameIndex(MI, BaseReg, Offset); + const ARMBaseInstrInfo &TII = *STI.getInstrInfo(); int Off = Offset; // ARM doesn't need the general 64-bit offsets unsigned i = 0; @@ -403,12 +442,15 @@ void Thumb1RegisterInfo::resolveFrameIndex(MachineInstr &MI, unsigned BaseReg, /// saveScavengerRegister - Spill the register so it can be used by the /// register scavenger. Return true. -bool -Thumb1RegisterInfo::saveScavengerRegister(MachineBasicBlock &MBB, - MachineBasicBlock::iterator I, - MachineBasicBlock::iterator &UseMI, - const TargetRegisterClass *RC, - unsigned Reg) const { +bool ThumbRegisterInfo::saveScavengerRegister( + MachineBasicBlock &MBB, MachineBasicBlock::iterator I, + MachineBasicBlock::iterator &UseMI, const TargetRegisterClass *RC, + unsigned Reg) const { + + const ARMSubtarget &STI = MBB.getParent()->getSubtarget<ARMSubtarget>(); + if (!STI.isThumb1Only()) + return ARMBaseRegisterInfo::saveScavengerRegister(MBB, I, UseMI, RC, Reg); + // Thumb1 can't use the emergency spill slot on the stack because // ldr/str immediate offsets must be positive, and if we're referencing // off the frame pointer (if, for example, there are alloca() calls in @@ -452,14 +494,18 @@ Thumb1RegisterInfo::saveScavengerRegister(MachineBasicBlock &MBB, return true; } -void -Thumb1RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, - int SPAdj, unsigned FIOperandNum, - RegScavenger *RS) const { - unsigned VReg = 0; +void ThumbRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, + int SPAdj, unsigned FIOperandNum, + RegScavenger *RS) const { MachineInstr &MI = *II; MachineBasicBlock &MBB = *MI.getParent(); MachineFunction &MF = *MBB.getParent(); + const ARMSubtarget &STI = MF.getSubtarget<ARMSubtarget>(); + if (!STI.isThumb1Only()) + return ARMBaseRegisterInfo::eliminateFrameIndex(II, SPAdj, FIOperandNum, + RS); + + unsigned VReg = 0; const ARMBaseInstrInfo &TII = *STI.getInstrInfo(); ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); DebugLoc dl = MI.getDebugLoc(); diff --git a/lib/Target/ARM/Thumb1RegisterInfo.h b/lib/Target/ARM/ThumbRegisterInfo.h index 5feaf52..23aaff3 100644 --- a/lib/Target/ARM/Thumb1RegisterInfo.h +++ b/lib/Target/ARM/ThumbRegisterInfo.h @@ -1,4 +1,4 @@ -//===- Thumb1RegisterInfo.h - Thumb-1 Register Information Impl -*- C++ -*-===// +//===- 
ThumbRegisterInfo.h - Thumb Register Information Impl -*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -7,8 +7,9 @@ // //===----------------------------------------------------------------------===// // -// This file contains the Thumb-1 implementation of the TargetRegisterInfo -// class. +// This file contains the Thumb implementation of the TargetRegisterInfo +// class. With the exception of emitLoadConstPool, Thumb2 tracks +// ARMBaseRegisterInfo; Thumb1 overrides the functions below. // //===----------------------------------------------------------------------===// @@ -22,12 +23,13 @@ namespace llvm { class ARMSubtarget; class ARMBaseInstrInfo; -struct Thumb1RegisterInfo : public ARMBaseRegisterInfo { +struct ThumbRegisterInfo : public ARMBaseRegisterInfo { public: - Thumb1RegisterInfo(const ARMSubtarget &STI); + ThumbRegisterInfo(); const TargetRegisterClass * - getLargestLegalSuperClass(const TargetRegisterClass *RC) const override; + getLargestLegalSuperClass(const TargetRegisterClass *RC, + const MachineFunction &MF) const override; const TargetRegisterClass * getPointerRegClass(const MachineFunction &MF, diff --git a/lib/Target/BPF/BPFISelDAGToDAG.cpp b/lib/Target/BPF/BPFISelDAGToDAG.cpp index 07f62a9..b91b0e1 100644 --- a/lib/Target/BPF/BPFISelDAGToDAG.cpp +++ b/lib/Target/BPF/BPFISelDAGToDAG.cpp @@ -17,16 +17,16 @@ #include "BPFSubtarget.h" #include "BPFTargetMachine.h" #include "llvm/CodeGen/MachineConstantPool.h" -#include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/SelectionDAGISel.h" -#include "llvm/Target/TargetMachine.h" +#include "llvm/IR/IntrinsicInst.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/IR/IntrinsicInst.h" +#include "llvm/Target/TargetMachine.h" using namespace llvm; #define DEBUG_TYPE "bpf-isel" diff --git a/lib/Target/BPF/BPFISelLowering.h b/lib/Target/BPF/BPFISelLowering.h index 04d7908..5a6f0f7 100644 --- a/lib/Target/BPF/BPFISelLowering.h +++ b/lib/Target/BPF/BPFISelLowering.h @@ -20,6 +20,7 @@ #include "llvm/Target/TargetLowering.h" namespace llvm { +class BPFSubtarget; namespace BPFISD { enum { FIRST_NUMBER = ISD::BUILTIN_OP_END, diff --git a/lib/Target/BPF/BPFRegisterInfo.h b/lib/Target/BPF/BPFRegisterInfo.h index 364d6f6..7072dd0 100644 --- a/lib/Target/BPF/BPFRegisterInfo.h +++ b/lib/Target/BPF/BPFRegisterInfo.h @@ -25,8 +25,7 @@ struct BPFRegisterInfo : public BPFGenRegisterInfo { BPFRegisterInfo(); - const MCPhysReg * - getCalleeSavedRegs(const MachineFunction *MF = nullptr) const override; + const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF) const override; BitVector getReservedRegs(const MachineFunction &MF) const override; diff --git a/lib/Target/BPF/BPFTargetMachine.cpp b/lib/Target/BPF/BPFTargetMachine.cpp index 5245395..9487427 100644 --- a/lib/Target/BPF/BPFTargetMachine.cpp +++ b/lib/Target/BPF/BPFTargetMachine.cpp @@ -35,9 +35,9 @@ BPFTargetMachine::BPFTargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS, const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL) - : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL), + : LLVMTargetMachine(T, "e-m:e-p:64:64-i64:64-n32:64-S128", TT, CPU, FS, + Options, RM, CM, OL), TLOF(make_unique<TargetLoweringObjectFileELF>()), -
DL("e-m:e-p:64:64-i64:64-n32:64-S128"), Subtarget(TT, CPU, FS, *this) { initAsmInfo(); } diff --git a/lib/Target/BPF/BPFTargetMachine.h b/lib/Target/BPF/BPFTargetMachine.h index 821cffc..6aeafb9 100644 --- a/lib/Target/BPF/BPFTargetMachine.h +++ b/lib/Target/BPF/BPFTargetMachine.h @@ -20,7 +20,6 @@ namespace llvm { class BPFTargetMachine : public LLVMTargetMachine { std::unique_ptr<TargetLoweringObjectFile> TLOF; - const DataLayout DL; BPFSubtarget Subtarget; public: @@ -28,8 +27,10 @@ public: const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL); - const DataLayout *getDataLayout() const override { return &DL; } - const BPFSubtarget *getSubtargetImpl() const override { return &Subtarget; } + const BPFSubtarget *getSubtargetImpl() const { return &Subtarget; } + const BPFSubtarget *getSubtargetImpl(const Function &) const override { + return &Subtarget; + } TargetPassConfig *createPassConfig(PassManagerBase &PM) override; diff --git a/lib/Target/BPF/MCTargetDesc/BPFMCCodeEmitter.cpp b/lib/Target/BPF/MCTargetDesc/BPFMCCodeEmitter.cpp index b94693a..9c51d66 100644 --- a/lib/Target/BPF/MCTargetDesc/BPFMCCodeEmitter.cpp +++ b/lib/Target/BPF/MCTargetDesc/BPFMCCodeEmitter.cpp @@ -60,7 +60,6 @@ public: MCCodeEmitter *llvm::createBPFMCCodeEmitter(const MCInstrInfo &MCII, const MCRegisterInfo &MRI, - const MCSubtargetInfo &STI, MCContext &Ctx) { return new BPFMCCodeEmitter(MRI); } diff --git a/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.cpp b/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.cpp index f82f009..fd04001 100644 --- a/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.cpp +++ b/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.cpp @@ -61,13 +61,11 @@ static MCCodeGenInfo *createBPFMCCodeGenInfo(StringRef TT, Reloc::Model RM, return X; } -static MCStreamer *createBPFMCStreamer(const Target &T, StringRef TT, +static MCStreamer *createBPFMCStreamer(const Triple &T, MCContext &Ctx, MCAsmBackend &MAB, - raw_ostream &_OS, - MCCodeEmitter *_Emitter, - const MCSubtargetInfo &STI, + raw_ostream &OS, MCCodeEmitter *Emitter, bool RelaxAll) { - return createELFStreamer(Ctx, MAB, _OS, _Emitter, RelaxAll); + return createELFStreamer(Ctx, MAB, OS, Emitter, RelaxAll); } static MCInstPrinter * @@ -104,7 +102,7 @@ extern "C" void LLVMInitializeBPFTargetMC() { TargetRegistry::RegisterMCAsmBackend(TheBPFTarget, createBPFAsmBackend); // Register the object streamer - TargetRegistry::RegisterMCObjectStreamer(TheBPFTarget, createBPFMCStreamer); + TargetRegistry::RegisterELFStreamer(TheBPFTarget, createBPFMCStreamer); // Register the MCInstPrinter. 
TargetRegistry::RegisterMCInstPrinter(TheBPFTarget, createBPFMCInstPrinter); diff --git a/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.h b/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.h index 55901cc..1fd2bec 100644 --- a/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.h +++ b/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.h @@ -33,7 +33,6 @@ extern Target TheBPFTarget; MCCodeEmitter *createBPFMCCodeEmitter(const MCInstrInfo &MCII, const MCRegisterInfo &MRI, - const MCSubtargetInfo &STI, MCContext &Ctx); MCAsmBackend *createBPFAsmBackend(const Target &T, const MCRegisterInfo &MRI, diff --git a/lib/Target/BPF/TargetInfo/BPFTargetInfo.cpp b/lib/Target/BPF/TargetInfo/BPFTargetInfo.cpp index 818a992..87716e6 100644 --- a/lib/Target/BPF/TargetInfo/BPFTargetInfo.cpp +++ b/lib/Target/BPF/TargetInfo/BPFTargetInfo.cpp @@ -14,5 +14,5 @@ using namespace llvm; Target llvm::TheBPFTarget; extern "C" void LLVMInitializeBPFTargetInfo() { - RegisterTarget<Triple::bpf> X(TheBPFTarget, "bpf", "BPF"); + RegisterTarget<Triple::bpf, /*HasJIT=*/true> X(TheBPFTarget, "bpf", "BPF"); } diff --git a/lib/Target/CppBackend/CPPBackend.cpp b/lib/Target/CppBackend/CPPBackend.cpp index c7fec52..d0e2010 100644 --- a/lib/Target/CppBackend/CPPBackend.cpp +++ b/lib/Target/CppBackend/CPPBackend.cpp @@ -1981,7 +1981,8 @@ void CppWriter::printModule(const std::string& fname, printEscapedString(mName); Out << "\", getGlobalContext());"; if (!TheModule->getTargetTriple().empty()) { - nl(Out) << "mod->setDataLayout(\"" << TheModule->getDataLayout() << "\");"; + nl(Out) << "mod->setDataLayout(\"" << TheModule->getDataLayoutStr() + << "\");"; } if (!TheModule->getTargetTriple().empty()) { nl(Out) << "mod->setTargetTriple(\"" << TheModule->getTargetTriple() diff --git a/lib/Target/CppBackend/CPPTargetMachine.h b/lib/Target/CppBackend/CPPTargetMachine.h index 4bae7f8..678a932 100644 --- a/lib/Target/CppBackend/CPPTargetMachine.h +++ b/lib/Target/CppBackend/CPPTargetMachine.h @@ -22,20 +22,13 @@ namespace llvm { class formatted_raw_ostream; -class CPPSubtarget : public TargetSubtargetInfo { -}; - struct CPPTargetMachine : public TargetMachine { - CPPTargetMachine(const Target &T, StringRef TT, - StringRef CPU, StringRef FS, const TargetOptions &Options, - Reloc::Model RM, CodeModel::Model CM, - CodeGenOpt::Level OL) - : TargetMachine(T, TT, CPU, FS, Options), Subtarget() {} -private: - CPPSubtarget Subtarget; + CPPTargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS, + const TargetOptions &Options, Reloc::Model RM, + CodeModel::Model CM, CodeGenOpt::Level OL) + : TargetMachine(T, "", TT, CPU, FS, Options) {} public: - const CPPSubtarget *getSubtargetImpl() const override { return &Subtarget; } bool addPassesToEmitFile(PassManagerBase &PM, formatted_raw_ostream &Out, CodeGenFileType FileType, bool DisableVerify, AnalysisID StartAfter, diff --git a/lib/Target/Hexagon/CMakeLists.txt b/lib/Target/Hexagon/CMakeLists.txt index eaa8bef..c6ffb96 100644 --- a/lib/Target/Hexagon/CMakeLists.txt +++ b/lib/Target/Hexagon/CMakeLists.txt @@ -31,7 +31,6 @@ add_llvm_target(HexagonCodeGen HexagonRemoveSZExtArgs.cpp HexagonSelectionDAGInfo.cpp HexagonSplitConst32AndConst64.cpp - HexagonSplitTFRCondSets.cpp HexagonSubtarget.cpp HexagonTargetMachine.cpp HexagonTargetObjectFile.cpp diff --git a/lib/Target/Hexagon/Hexagon.h b/lib/Target/Hexagon/Hexagon.h index e0a3b2f..dfe79f9 100644 --- a/lib/Target/Hexagon/Hexagon.h +++ b/lib/Target/Hexagon/Hexagon.h @@ -36,7 +36,6 @@ namespace llvm { FunctionPass *createHexagonRemoveExtendArgs(const 
HexagonTargetMachine &TM); FunctionPass *createHexagonCFGOptimizer(); - FunctionPass *createHexagonSplitTFRCondSets(); FunctionPass *createHexagonSplitConst32AndConst64(); FunctionPass *createHexagonExpandPredSpillCode(); FunctionPass *createHexagonHardwareLoops(); diff --git a/lib/Target/Hexagon/Hexagon.td b/lib/Target/Hexagon/Hexagon.td index f892c9f..53a687c 100644 --- a/lib/Target/Hexagon/Hexagon.td +++ b/lib/Target/Hexagon/Hexagon.td @@ -28,10 +28,10 @@ def ArchV5: SubtargetFeature<"v5", "HexagonArchVersion", "V5", "Hexagon V5">; //===----------------------------------------------------------------------===// // Hexagon Instruction Predicate Definitions. //===----------------------------------------------------------------------===// -def HasV5T : Predicate<"Subtarget->hasV5TOps()">; -def NoV5T : Predicate<"!Subtarget->hasV5TOps()">; -def UseMEMOP : Predicate<"Subtarget->useMemOps()">; -def IEEERndNearV5T : Predicate<"Subtarget->modeIEEERndNear()">; +def HasV5T : Predicate<"HST->hasV5TOps()">; +def NoV5T : Predicate<"!HST->hasV5TOps()">; +def UseMEMOP : Predicate<"HST->useMemOps()">; +def IEEERndNearV5T : Predicate<"HST->modeIEEERndNear()">; //===----------------------------------------------------------------------===// // Classes used for relation maps. @@ -168,14 +168,6 @@ def getRegForm : InstrMapping { let ValueCols = [["reg"]]; } -def getRegShlForm : InstrMapping { - let FilterClass = "ImmRegShl"; - let RowFields = ["CextOpcode", "PredSense", "PNewValue", "isNVStore"]; - let ColFields = ["InputType"]; - let KeyCol = ["imm"]; - let ValueCols = [["reg"]]; -} - //===----------------------------------------------------------------------===// // Register File, Calling Conv, Instruction Descriptions //===----------------------------------------------------------------------===// diff --git a/lib/Target/Hexagon/HexagonCopyToCombine.cpp b/lib/Target/Hexagon/HexagonCopyToCombine.cpp index dd193f9..5a26045 100644 --- a/lib/Target/Hexagon/HexagonCopyToCombine.cpp +++ b/lib/Target/Hexagon/HexagonCopyToCombine.cpp @@ -127,12 +127,21 @@ static bool isCombinableInstType(MachineInstr *MI, case Hexagon::A2_tfrsi: { // A transfer-immediate can be combined if its argument is a signed 8bit // value. - assert(MI->getOperand(0).isReg() && MI->getOperand(1).isImm()); - unsigned DestReg = MI->getOperand(0).getReg(); + const MachineOperand &Op0 = MI->getOperand(0); + const MachineOperand &Op1 = MI->getOperand(1); + assert(Op0.isReg()); + + unsigned DestReg = Op0.getReg(); + // Ensure that TargetFlags are MO_NO_FLAG for a global. This is a + // workaround for an ABI bug that prevents GOT relocations on combine + // instructions + if (!Op1.isImm() && Op1.getTargetFlags() != HexagonII::MO_NO_FLAG) + return false; - // Only combine constant extended TFRI if we are in aggressive mode. + // Only combine constant extended A2_tfrsi if we are in aggressive mode. 
+ bool NotExt = Op1.isImm() && isInt<8>(Op1.getImm()); return Hexagon::IntRegsRegClass.contains(DestReg) && - (ShouldCombineAggressively || isInt<8>(MI->getOperand(1).getImm())); + (ShouldCombineAggressively || NotExt); } case Hexagon::TFRI_V4: { diff --git a/lib/Target/Hexagon/HexagonExpandPredSpillCode.cpp b/lib/Target/Hexagon/HexagonExpandPredSpillCode.cpp index 8176598..40059fb 100644 --- a/lib/Target/Hexagon/HexagonExpandPredSpillCode.cpp +++ b/lib/Target/Hexagon/HexagonExpandPredSpillCode.cpp @@ -79,7 +79,166 @@ bool HexagonExpandPredSpillCode::runOnMachineFunction(MachineFunction &Fn) { ++MII) { MachineInstr *MI = MII; int Opc = MI->getOpcode(); - if (Opc == Hexagon::STriw_pred) { + if (Opc == Hexagon::S2_storerb_pci_pseudo || + Opc == Hexagon::S2_storerh_pci_pseudo || + Opc == Hexagon::S2_storeri_pci_pseudo || + Opc == Hexagon::S2_storerd_pci_pseudo || + Opc == Hexagon::S2_storerf_pci_pseudo) { + unsigned Opcode; + if (Opc == Hexagon::S2_storerd_pci_pseudo) + Opcode = Hexagon::S2_storerd_pci; + else if (Opc == Hexagon::S2_storeri_pci_pseudo) + Opcode = Hexagon::S2_storeri_pci; + else if (Opc == Hexagon::S2_storerh_pci_pseudo) + Opcode = Hexagon::S2_storerh_pci; + else if (Opc == Hexagon::S2_storerf_pci_pseudo) + Opcode = Hexagon::S2_storerf_pci; + else if (Opc == Hexagon::S2_storerb_pci_pseudo) + Opcode = Hexagon::S2_storerb_pci; + else + llvm_unreachable("wrong Opc"); + MachineOperand &Op0 = MI->getOperand(0); + MachineOperand &Op1 = MI->getOperand(1); + MachineOperand &Op2 = MI->getOperand(2); + MachineOperand &Op3 = MI->getOperand(3); // Modifier value. + MachineOperand &Op4 = MI->getOperand(4); + // Emit "C6 = Rn"; C6 is the control register for M0. + BuildMI(*MBB, MII, MI->getDebugLoc(), TII->get(Hexagon::A2_tfrrcr), + Hexagon::C6)->addOperand(Op3); + // Replace the pseudo circ_std by the real circ_std. + MachineInstr *NewMI = BuildMI(*MBB, MII, MI->getDebugLoc(), + TII->get(Opcode)); + NewMI->addOperand(Op0); + NewMI->addOperand(Op1); + NewMI->addOperand(Op4); + NewMI->addOperand(MachineOperand::CreateReg(Hexagon::M0, + false, /*isDef*/ + false, /*isImpl*/ + true /*isKill*/)); + NewMI->addOperand(Op2); + MII = MBB->erase(MI); + --MII; + } else if (Opc == Hexagon::L2_loadrd_pci_pseudo || + Opc == Hexagon::L2_loadri_pci_pseudo || + Opc == Hexagon::L2_loadrh_pci_pseudo || + Opc == Hexagon::L2_loadruh_pci_pseudo|| + Opc == Hexagon::L2_loadrb_pci_pseudo || + Opc == Hexagon::L2_loadrub_pci_pseudo) { + unsigned Opcode; + if (Opc == Hexagon::L2_loadrd_pci_pseudo) + Opcode = Hexagon::L2_loadrd_pci; + else if (Opc == Hexagon::L2_loadri_pci_pseudo) + Opcode = Hexagon::L2_loadri_pci; + else if (Opc == Hexagon::L2_loadrh_pci_pseudo) + Opcode = Hexagon::L2_loadrh_pci; + else if (Opc == Hexagon::L2_loadruh_pci_pseudo) + Opcode = Hexagon::L2_loadruh_pci; + else if (Opc == Hexagon::L2_loadrb_pci_pseudo) + Opcode = Hexagon::L2_loadrb_pci; + else if (Opc == Hexagon::L2_loadrub_pci_pseudo) + Opcode = Hexagon::L2_loadrub_pci; + else + llvm_unreachable("wrong Opc"); + + MachineOperand &Op0 = MI->getOperand(0); + MachineOperand &Op1 = MI->getOperand(1); + MachineOperand &Op2 = MI->getOperand(2); + MachineOperand &Op4 = MI->getOperand(4); // Modifier value. + MachineOperand &Op5 = MI->getOperand(5); + // Emit "C6 = Rn"; C6 is the control register for M0. + BuildMI(*MBB, MII, MI->getDebugLoc(), TII->get(Hexagon::A2_tfrrcr), + Hexagon::C6)->addOperand(Op4); + // Replace the pseudo circ_ldd by the real circ_ldd.
+ MachineInstr *NewMI = BuildMI(*MBB, MII, MI->getDebugLoc(), + TII->get(Opcode)); + NewMI->addOperand(Op1); + NewMI->addOperand(Op0); + NewMI->addOperand(Op2); + NewMI->addOperand(Op5); + NewMI->addOperand(MachineOperand::CreateReg(Hexagon::M0, + false, /*isDef*/ + false, /*isImpl*/ + true /*isKill*/)); + MII = MBB->erase(MI); + --MII; + } else if (Opc == Hexagon::L2_loadrd_pbr_pseudo || + Opc == Hexagon::L2_loadri_pbr_pseudo || + Opc == Hexagon::L2_loadrh_pbr_pseudo || + Opc == Hexagon::L2_loadruh_pbr_pseudo|| + Opc == Hexagon::L2_loadrb_pbr_pseudo || + Opc == Hexagon::L2_loadrub_pbr_pseudo) { + unsigned Opcode; + if (Opc == Hexagon::L2_loadrd_pbr_pseudo) + Opcode = Hexagon::L2_loadrd_pbr; + else if (Opc == Hexagon::L2_loadri_pbr_pseudo) + Opcode = Hexagon::L2_loadri_pbr; + else if (Opc == Hexagon::L2_loadrh_pbr_pseudo) + Opcode = Hexagon::L2_loadrh_pbr; + else if (Opc == Hexagon::L2_loadruh_pbr_pseudo) + Opcode = Hexagon::L2_loadruh_pbr; + else if (Opc == Hexagon::L2_loadrb_pbr_pseudo) + Opcode = Hexagon::L2_loadrb_pbr; + else if (Opc == Hexagon::L2_loadrub_pbr_pseudo) + Opcode = Hexagon::L2_loadrub_pbr; + else + llvm_unreachable("wrong Opc"); + MachineOperand &Op0 = MI->getOperand(0); + MachineOperand &Op1 = MI->getOperand(1); + MachineOperand &Op2 = MI->getOperand(2); + MachineOperand &Op4 = MI->getOperand(4); // Modifier value. + // Emit "C6 = Rn"; C6 is the control register for M0. + BuildMI(*MBB, MII, MI->getDebugLoc(), TII->get(Hexagon::A2_tfrrcr), + Hexagon::C6)->addOperand(Op4); + // Replace the pseudo brev_ldd by the real brev_ldd. + MachineInstr *NewMI = BuildMI(*MBB, MII, MI->getDebugLoc(), + TII->get(Opcode)); + NewMI->addOperand(Op1); + NewMI->addOperand(Op0); + NewMI->addOperand(Op2); + NewMI->addOperand(MachineOperand::CreateReg(Hexagon::M0, + false, /*isDef*/ + false, /*isImpl*/ + true /*isKill*/)); + MII = MBB->erase(MI); + --MII; + } else if (Opc == Hexagon::S2_storerd_pbr_pseudo || + Opc == Hexagon::S2_storeri_pbr_pseudo || + Opc == Hexagon::S2_storerh_pbr_pseudo || + Opc == Hexagon::S2_storerb_pbr_pseudo || + Opc == Hexagon::S2_storerf_pbr_pseudo) { + unsigned Opcode; + if (Opc == Hexagon::S2_storerd_pbr_pseudo) + Opcode = Hexagon::S2_storerd_pbr; + else if (Opc == Hexagon::S2_storeri_pbr_pseudo) + Opcode = Hexagon::S2_storeri_pbr; + else if (Opc == Hexagon::S2_storerh_pbr_pseudo) + Opcode = Hexagon::S2_storerh_pbr; + else if (Opc == Hexagon::S2_storerf_pbr_pseudo) + Opcode = Hexagon::S2_storerf_pbr; + else if (Opc == Hexagon::S2_storerb_pbr_pseudo) + Opcode = Hexagon::S2_storerb_pbr; + else + llvm_unreachable("wrong Opc"); + MachineOperand &Op0 = MI->getOperand(0); + MachineOperand &Op1 = MI->getOperand(1); + MachineOperand &Op2 = MI->getOperand(2); + MachineOperand &Op3 = MI->getOperand(3); // Modifier value. + // Emit "C6 = Rn"; C6 is the control register for M0. + BuildMI(*MBB, MII, MI->getDebugLoc(), TII->get(Hexagon::A2_tfrrcr), + Hexagon::C6)->addOperand(Op3); + // Replace the pseudo brev_std by the real brev_std.
+ MachineInstr *NewMI = BuildMI(*MBB, MII, MI->getDebugLoc(), + TII->get(Opcode)); + NewMI->addOperand(Op0); + NewMI->addOperand(Op1); + NewMI->addOperand(MachineOperand::CreateReg(Hexagon::M0, + false, /*isDef*/ + false, /*isImpl*/ + true /*isKill*/)); + NewMI->addOperand(Op2); + MII = MBB->erase(MI); + --MII; + } else if (Opc == Hexagon::STriw_pred) { // STriw_pred [R30], ofst, SrcReg; unsigned FP = MI->getOperand(0).getReg(); assert(FP == QST.getRegisterInfo()->getFrameRegister() && diff --git a/lib/Target/Hexagon/HexagonFrameLowering.cpp b/lib/Target/Hexagon/HexagonFrameLowering.cpp index 2b1992f..65d689b 100644 --- a/lib/Target/Hexagon/HexagonFrameLowering.cpp +++ b/lib/Target/Hexagon/HexagonFrameLowering.cpp @@ -140,7 +140,7 @@ bool HexagonFrameLowering::hasTailCall(MachineBasicBlock &MBB) const { MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr(); unsigned RetOpcode = MBBI->getOpcode(); - return RetOpcode == Hexagon::TCRETURNtg || RetOpcode == Hexagon::TCRETURNtext; + return RetOpcode == Hexagon::TCRETURNi || RetOpcode == Hexagon::TCRETURNr; } void HexagonFrameLowering::emitEpilogue(MachineFunction &MF, diff --git a/lib/Target/Hexagon/HexagonHardwareLoops.cpp b/lib/Target/Hexagon/HexagonHardwareLoops.cpp index 1577c33..c47ee9c 100644 --- a/lib/Target/Hexagon/HexagonHardwareLoops.cpp +++ b/lib/Target/Hexagon/HexagonHardwareLoops.cpp @@ -690,7 +690,7 @@ CountValue *HexagonHardwareLoops::computeCount(MachineLoop *Loop, // If the induction variable bump is not a power of 2, quit. // Othwerise we'd need a general integer division. - if (!isPowerOf2_64(abs64(IVBump))) + if (!isPowerOf2_64(std::abs(IVBump))) return nullptr; MachineBasicBlock *PH = Loop->getLoopPreheader(); diff --git a/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp b/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp index fb056b5..aaccac8 100644 --- a/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp +++ b/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp @@ -45,37 +45,25 @@ namespace llvm { /// namespace { class HexagonDAGToDAGISel : public SelectionDAGISel { - /// Subtarget - Keep a pointer to the Hexagon Subtarget around so that we can - /// make the right decision when generating code for different targets. - const HexagonSubtarget *Subtarget; - - // Keep a reference to HexagonTargetMachine. - const HexagonTargetMachine& TM; - DenseMap<const GlobalValue *, unsigned> GlobalAddressUseCountMap; + const HexagonTargetMachine& HTM; + const HexagonSubtarget *HST; public: - explicit HexagonDAGToDAGISel(HexagonTargetMachine &targetmachine, + explicit HexagonDAGToDAGISel(HexagonTargetMachine &tm, CodeGenOpt::Level OptLevel) - : SelectionDAGISel(targetmachine, OptLevel), TM(targetmachine) { + : SelectionDAGISel(tm, OptLevel), HTM(tm) { initializeHexagonDAGToDAGISelPass(*PassRegistry::getPassRegistry()); } - bool hasNumUsesBelowThresGA(SDNode *N) const; - SDNode *Select(SDNode *N) override; + bool runOnMachineFunction(MachineFunction &MF) override { + // Reset the subtarget each time through. + HST = &MF.getSubtarget<HexagonSubtarget>(); + SelectionDAGISel::runOnMachineFunction(MF); + return true; + } - // Complex Pattern Selectors. 
- inline bool foldGlobalAddress(SDValue &N, SDValue &R); - inline bool foldGlobalAddressGP(SDValue &N, SDValue &R); - bool foldGlobalAddressImpl(SDValue &N, SDValue &R, bool ShouldLookForGP); - bool SelectADDRri(SDValue& N, SDValue &R1, SDValue &R2); - bool SelectADDRriS11_0(SDValue& N, SDValue &R1, SDValue &R2); - bool SelectADDRriS11_1(SDValue& N, SDValue &R1, SDValue &R2); - bool SelectADDRriS11_2(SDValue& N, SDValue &R1, SDValue &R2); - bool SelectMEMriS11_2(SDValue& Addr, SDValue &Base, SDValue &Offset); - bool SelectADDRriS11_3(SDValue& N, SDValue &R1, SDValue &R2); - bool SelectADDRrr(SDValue &Addr, SDValue &Base, SDValue &Offset); - bool SelectADDRriU6_0(SDValue& N, SDValue &R1, SDValue &R2); - bool SelectADDRriU6_1(SDValue& N, SDValue &R1, SDValue &R2); - bool SelectADDRriU6_2(SDValue& N, SDValue &R1, SDValue &R2); + virtual void PreprocessISelDAG() override; + + SDNode *Select(SDNode *N) override; // Complex Pattern Selectors. inline bool SelectAddrGA(SDValue &N, SDValue &R); @@ -87,18 +75,12 @@ public: return "Hexagon DAG->DAG Pattern Instruction Selection"; } - bool runOnMachineFunction(MachineFunction &MF) override { - Subtarget = &MF.getSubtarget<HexagonSubtarget>(); - return SelectionDAGISel::runOnMachineFunction(MF); - } - + SDNode *SelectFrameIndex(SDNode *N); /// SelectInlineAsmMemoryOperand - Implement addressing mode selection for /// inline asm expressions. bool SelectInlineAsmMemoryOperand(const SDValue &Op, - char ConstraintCode, + unsigned ConstraintID, std::vector<SDValue> &OutOps) override; - bool SelectAddr(SDNode *Op, SDValue Addr, SDValue &Base, SDValue &Offset); - SDNode *SelectLoad(SDNode *N); SDNode *SelectBaseOffsetLoad(LoadSDNode *LD, SDLoc dl); SDNode *SelectIndexedLoad(LoadSDNode *LD, SDLoc dl); @@ -110,99 +92,98 @@ public: SDNode *SelectIndexedStore(StoreSDNode *ST, SDLoc dl); SDNode *SelectStore(SDNode *N); SDNode *SelectSHL(SDNode *N); - SDNode *SelectSelect(SDNode *N); - SDNode *SelectTruncate(SDNode *N); SDNode *SelectMul(SDNode *N); SDNode *SelectZeroExtend(SDNode *N); - SDNode *SelectIntrinsicWOChain(SDNode *N); SDNode *SelectIntrinsicWChain(SDNode *N); + SDNode *SelectIntrinsicWOChain(SDNode *N); SDNode *SelectConstant(SDNode *N); SDNode *SelectConstantFP(SDNode *N); SDNode *SelectAdd(SDNode *N); - bool isConstExtProfitable(SDNode *N) const; - -// XformMskToBitPosU5Imm - Returns the bit position which -// the single bit 32 bit mask represents. -// Used in Clr and Set bit immediate memops. -SDValue XformMskToBitPosU5Imm(uint32_t Imm) { - int32_t bitPos; - bitPos = Log2_32(Imm); - assert(bitPos >= 0 && bitPos < 32 && - "Constant out of range for 32 BitPos Memops"); - return CurDAG->getTargetConstant(bitPos, MVT::i32); -} - -// XformMskToBitPosU4Imm - Returns the bit position which the single bit 16 bit -// mask represents. Used in Clr and Set bit immediate memops. -SDValue XformMskToBitPosU4Imm(uint16_t Imm) { - return XformMskToBitPosU5Imm(Imm); -} + SDNode *SelectBitOp(SDNode *N); + + // XformMskToBitPosU5Imm - Returns the bit position which + // the single bit 32 bit mask represents. + // Used in Clr and Set bit immediate memops. + SDValue XformMskToBitPosU5Imm(uint32_t Imm) { + int32_t bitPos; + bitPos = Log2_32(Imm); + assert(bitPos >= 0 && bitPos < 32 && + "Constant out of range for 32 BitPos Memops"); + return CurDAG->getTargetConstant(bitPos, MVT::i32); + } -// XformMskToBitPosU3Imm - Returns the bit position which the single bit 8 bit -// mask represents. Used in Clr and Set bit immediate memops. 
-SDValue XformMskToBitPosU3Imm(uint8_t Imm) { - return XformMskToBitPosU5Imm(Imm); -} + // XformMskToBitPosU4Imm - Returns the bit position which the single-bit + // 16 bit mask represents. Used in Clr and Set bit immediate memops. + SDValue XformMskToBitPosU4Imm(uint16_t Imm) { + return XformMskToBitPosU5Imm(Imm); + } -// Return true if there is exactly one bit set in V, i.e., if V is one of the -// following integers: 2^0, 2^1, ..., 2^31. -bool ImmIsSingleBit(uint32_t v) const { - return isPowerOf2_32(v); -} + // XformMskToBitPosU3Imm - Returns the bit position which the single-bit + // 8 bit mask represents. Used in Clr and Set bit immediate memops. + SDValue XformMskToBitPosU3Imm(uint8_t Imm) { + return XformMskToBitPosU5Imm(Imm); + } -// XformM5ToU5Imm - Return a target constant with the specified value, of type -// i32 where the negative literal is transformed into a positive literal for -// use in -= memops. -inline SDValue XformM5ToU5Imm(signed Imm) { - assert( (Imm >= -31 && Imm <= -1) && "Constant out of range for Memops"); - return CurDAG->getTargetConstant( - Imm, MVT::i32); -} + // Return true if there is exactly one bit set in V, i.e., if V is one of the + // following integers: 2^0, 2^1, ..., 2^31. + bool ImmIsSingleBit(uint32_t v) const { + return isPowerOf2_32(v); + } + // XformM5ToU5Imm - Return a target constant with the specified value, of + // type i32 where the negative literal is transformed into a positive literal + // for use in -= memops. + inline SDValue XformM5ToU5Imm(signed Imm) { + assert( (Imm >= -31 && Imm <= -1) && "Constant out of range for Memops"); + return CurDAG->getTargetConstant( - Imm, MVT::i32); + } -// XformU7ToU7M1Imm - Return a target constant decremented by 1, in range -// [1..128], used in cmpb.gtu instructions. -inline SDValue XformU7ToU7M1Imm(signed Imm) { - assert((Imm >= 1 && Imm <= 128) && "Constant out of range for cmpb op"); - return CurDAG->getTargetConstant(Imm - 1, MVT::i8); -} + // XformU7ToU7M1Imm - Return a target constant decremented by 1, in range + // [1..128], used in cmpb.gtu instructions. + inline SDValue XformU7ToU7M1Imm(signed Imm) { + assert((Imm >= 1 && Imm <= 128) && "Constant out of range for cmpb op"); + return CurDAG->getTargetConstant(Imm - 1, MVT::i8); + } -// XformS8ToS8M1Imm - Return a target constant decremented by 1. -inline SDValue XformSToSM1Imm(signed Imm) { - return CurDAG->getTargetConstant(Imm - 1, MVT::i32); -} + // XformS8ToS8M1Imm - Return a target constant decremented by 1. + inline SDValue XformSToSM1Imm(signed Imm) { + return CurDAG->getTargetConstant(Imm - 1, MVT::i32); + } -// XformU8ToU8M1Imm - Return a target constant decremented by 1. -inline SDValue XformUToUM1Imm(unsigned Imm) { - assert((Imm >= 1) && "Cannot decrement unsigned int less than 1"); - return CurDAG->getTargetConstant(Imm - 1, MVT::i32); -} + // XformU8ToU8M1Imm - Return a target constant decremented by 1. + inline SDValue XformUToUM1Imm(unsigned Imm) { + assert((Imm >= 1) && "Cannot decrement unsigned int less than 1"); + return CurDAG->getTargetConstant(Imm - 1, MVT::i32); + } -// XformSToSM2Imm - Return a target constant decremented by 2. -inline SDValue XformSToSM2Imm(unsigned Imm) { - return CurDAG->getTargetConstant(Imm - 2, MVT::i32); -} + // XformSToSM2Imm - Return a target constant decremented by 2. + inline SDValue XformSToSM2Imm(unsigned Imm) { + return CurDAG->getTargetConstant(Imm - 2, MVT::i32); + } -// XformSToSM3Imm - Return a target constant decremented by 3. 
-inline SDValue XformSToSM3Imm(unsigned Imm) { - return CurDAG->getTargetConstant(Imm - 3, MVT::i32); -} + // XformSToSM3Imm - Return a target constant decremented by 3. + inline SDValue XformSToSM3Imm(unsigned Imm) { + return CurDAG->getTargetConstant(Imm - 3, MVT::i32); + } -// Include the pieces autogenerated from the target description. -#include "HexagonGenDAGISel.inc" + // Include the pieces autogenerated from the target description. + #include "HexagonGenDAGISel.inc" private: - bool isValueExtension(SDValue const &Val, unsigned FromBits, SDValue &Src); -}; + bool isValueExtension(const SDValue &Val, unsigned FromBits, SDValue &Src); +}; // end HexagonDAGToDAGISel } // end anonymous namespace /// createHexagonISelDag - This pass converts a legalized DAG into a /// Hexagon-specific DAG, ready for instruction scheduling. /// -FunctionPass *llvm::createHexagonISelDag(HexagonTargetMachine &TM, - CodeGenOpt::Level OptLevel) { +namespace llvm { +FunctionPass *createHexagonISelDag(HexagonTargetMachine &TM, + CodeGenOpt::Level OptLevel) { return new HexagonDAGToDAGISel(TM, OptLevel); } +} static void initializePassOnce(PassRegistry &Registry) { const char *Name = "Hexagon DAG->DAG Pattern Instruction Selection"; @@ -216,76 +197,6 @@ void llvm::initializeHexagonDAGToDAGISelPass(PassRegistry &Registry) { } -static bool IsS11_0_Offset(SDNode * S) { - ConstantSDNode *N = cast<ConstantSDNode>(S); - - // immS16 predicate - True if the immediate fits in a 16-bit sign extended - // field. - int64_t v = (int64_t)N->getSExtValue(); - return isInt<11>(v); -} - - -static bool IsS11_1_Offset(SDNode * S) { - ConstantSDNode *N = cast<ConstantSDNode>(S); - - // immS16 predicate - True if the immediate fits in a 16-bit sign extended - // field. - int64_t v = (int64_t)N->getSExtValue(); - return isShiftedInt<11,1>(v); -} - - -static bool IsS11_2_Offset(SDNode * S) { - ConstantSDNode *N = cast<ConstantSDNode>(S); - - // immS16 predicate - True if the immediate fits in a 16-bit sign extended - // field. - int64_t v = (int64_t)N->getSExtValue(); - return isShiftedInt<11,2>(v); -} - - -static bool IsS11_3_Offset(SDNode * S) { - ConstantSDNode *N = cast<ConstantSDNode>(S); - - // immS16 predicate - True if the immediate fits in a 16-bit sign extended - // field. - int64_t v = (int64_t)N->getSExtValue(); - return isShiftedInt<11,3>(v); -} - - -static bool IsU6_0_Offset(SDNode * S) { - ConstantSDNode *N = cast<ConstantSDNode>(S); - - // u6 predicate - True if the immediate fits in a 6-bit unsigned extended - // field. - int64_t v = (int64_t)N->getSExtValue(); - return isUInt<6>(v); -} - - -static bool IsU6_1_Offset(SDNode * S) { - ConstantSDNode *N = cast<ConstantSDNode>(S); - - // u6 predicate - True if the immediate fits in a 6-bit unsigned extended - // field. - int64_t v = (int64_t)N->getSExtValue(); - return isShiftedUInt<6,1>(v); -} - - -static bool IsU6_2_Offset(SDNode * S) { - ConstantSDNode *N = cast<ConstantSDNode>(S); - - // u6 predicate - True if the immediate fits in a 6-bit unsigned extended - // field. - int64_t v = (int64_t)N->getSExtValue(); - return isShiftedUInt<6,2>(v); -} - - // Intrinsics that return a a predicate. 
static unsigned doesIntrinsicReturnPredicate(unsigned ID) { @@ -332,216 +243,119 @@ static unsigned doesIntrinsicReturnPredicate(unsigned ID) } } -static bool OffsetFitsS11(EVT MemType, int64_t Offset) { - if (MemType == MVT::i64 && isShiftedInt<11,3>(Offset)) { - return true; - } - if (MemType == MVT::i32 && isShiftedInt<11,2>(Offset)) { - return true; - } - if (MemType == MVT::i16 && isShiftedInt<11,1>(Offset)) { - return true; - } - if (MemType == MVT::i8 && isInt<11>(Offset)) { - return true; - } - return false; -} - - -// -// Try to lower loads of GlobalAdresses into base+offset loads. Custom -// lowering for GlobalAddress nodes has already turned it into a -// CONST32. -// -SDNode *HexagonDAGToDAGISel::SelectBaseOffsetLoad(LoadSDNode *LD, SDLoc dl) { - SDValue Chain = LD->getChain(); - SDNode* Const32 = LD->getBasePtr().getNode(); - unsigned Opcode = 0; - - if (Const32->getOpcode() == HexagonISD::CONST32 && - ISD::isNormalLoad(LD)) { - SDValue Base = Const32->getOperand(0); - EVT LoadedVT = LD->getMemoryVT(); - int64_t Offset = cast<GlobalAddressSDNode>(Base)->getOffset(); - if (Offset != 0 && OffsetFitsS11(LoadedVT, Offset)) { - MVT PointerTy = getTargetLowering()->getPointerTy(); - const GlobalValue* GV = - cast<GlobalAddressSDNode>(Base)->getGlobal(); - SDValue TargAddr = - CurDAG->getTargetGlobalAddress(GV, dl, PointerTy, 0); - SDNode* NewBase = CurDAG->getMachineNode(Hexagon::CONST32_set, - dl, PointerTy, - TargAddr); - // Figure out base + offset opcode - if (LoadedVT == MVT::i64) Opcode = Hexagon::L2_loadrd_io; - else if (LoadedVT == MVT::i32) Opcode = Hexagon::L2_loadri_io; - else if (LoadedVT == MVT::i16) Opcode = Hexagon::L2_loadrh_io; - else if (LoadedVT == MVT::i8) Opcode = Hexagon::L2_loadrb_io; - else llvm_unreachable("unknown memory type"); - - // Build indexed load. 
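A note on the Is*_Offset predicates removed above: isShiftedInt<N,S> accepts multiples of 2^S whose quotient fits in N signed bits. The <11,2> case used for i32 accesses, written out as a sketch:

  #include <cstdint>

  // isShiftedInt<11,2>: a multiple of 4 in [-4096, 4092], i.e. value/4
  // fits in an 11-bit signed field.
  bool isShiftedInt_11_2(int64_t V) {
    return (V & 3) == 0 && V >= -4096 && V <= 4092;
  }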
- SDValue TargetConstOff = CurDAG->getTargetConstant(Offset, PointerTy); - SDNode* Result = CurDAG->getMachineNode(Opcode, dl, - LD->getValueType(0), - MVT::Other, - SDValue(NewBase,0), - TargetConstOff, - Chain); - MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); - MemOp[0] = LD->getMemOperand(); - cast<MachineSDNode>(Result)->setMemRefs(MemOp, MemOp + 1); - ReplaceUses(LD, Result); - return Result; - } - } - - return SelectCode(LD); -} - - SDNode *HexagonDAGToDAGISel::SelectIndexedLoadSignExtend64(LoadSDNode *LD, unsigned Opcode, - SDLoc dl) -{ + SDLoc dl) { SDValue Chain = LD->getChain(); EVT LoadedVT = LD->getMemoryVT(); SDValue Base = LD->getBasePtr(); SDValue Offset = LD->getOffset(); SDNode *OffsetNode = Offset.getNode(); int32_t Val = cast<ConstantSDNode>(OffsetNode)->getSExtValue(); - SDValue N1 = LD->getOperand(1); - SDValue CPTmpN1_0; - SDValue CPTmpN1_1; - - if (SelectADDRriS11_2(N1, CPTmpN1_0, CPTmpN1_1) && - N1.getNode()->getValueType(0) == MVT::i32) { - const HexagonInstrInfo *TII = Subtarget->getInstrInfo(); - if (TII->isValidAutoIncImm(LoadedVT, Val)) { - SDValue TargetConst = CurDAG->getTargetConstant(Val, MVT::i32); - SDNode *Result_1 = CurDAG->getMachineNode(Opcode, dl, MVT::i32, MVT::i32, - MVT::Other, Base, TargetConst, - Chain); - SDNode *Result_2 = CurDAG->getMachineNode(Hexagon::A2_sxtw, dl, MVT::i64, - SDValue(Result_1, 0)); - MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); - MemOp[0] = LD->getMemOperand(); - cast<MachineSDNode>(Result_1)->setMemRefs(MemOp, MemOp + 1); - const SDValue Froms[] = { SDValue(LD, 0), - SDValue(LD, 1), - SDValue(LD, 2) - }; - const SDValue Tos[] = { SDValue(Result_2, 0), - SDValue(Result_1, 1), - SDValue(Result_1, 2) - }; - ReplaceUses(Froms, Tos, 3); - return Result_2; - } - SDValue TargetConst0 = CurDAG->getTargetConstant(0, MVT::i32); - SDValue TargetConstVal = CurDAG->getTargetConstant(Val, MVT::i32); - SDNode *Result_1 = CurDAG->getMachineNode(Opcode, dl, MVT::i32, - MVT::Other, Base, TargetConst0, + + const HexagonInstrInfo &TII = *HST->getInstrInfo(); + if (TII.isValidAutoIncImm(LoadedVT, Val)) { + SDValue TargetConst = CurDAG->getTargetConstant(Val, MVT::i32); + SDNode *Result_1 = CurDAG->getMachineNode(Opcode, dl, MVT::i32, MVT::i32, + MVT::Other, Base, TargetConst, Chain); - SDNode *Result_2 = CurDAG->getMachineNode(Hexagon::A2_sxtw, dl, - MVT::i64, SDValue(Result_1, 0)); - SDNode* Result_3 = CurDAG->getMachineNode(Hexagon::A2_addi, dl, - MVT::i32, Base, TargetConstVal, - SDValue(Result_1, 1)); + SDNode *Result_2 = CurDAG->getMachineNode(Hexagon::A2_sxtw, dl, MVT::i64, + SDValue(Result_1, 0)); MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); MemOp[0] = LD->getMemOperand(); cast<MachineSDNode>(Result_1)->setMemRefs(MemOp, MemOp + 1); const SDValue Froms[] = { SDValue(LD, 0), SDValue(LD, 1), - SDValue(LD, 2) - }; + SDValue(LD, 2) }; const SDValue Tos[] = { SDValue(Result_2, 0), - SDValue(Result_3, 0), - SDValue(Result_1, 1) - }; + SDValue(Result_1, 1), + SDValue(Result_1, 2) }; ReplaceUses(Froms, Tos, 3); return Result_2; } - return SelectCode(LD); + + SDValue TargetConst0 = CurDAG->getTargetConstant(0, MVT::i32); + SDValue TargetConstVal = CurDAG->getTargetConstant(Val, MVT::i32); + SDNode *Result_1 = CurDAG->getMachineNode(Opcode, dl, MVT::i32, MVT::Other, + Base, TargetConst0, Chain); + SDNode *Result_2 = CurDAG->getMachineNode(Hexagon::A2_sxtw, dl, MVT::i64, + SDValue(Result_1, 0)); + SDNode* Result_3 = CurDAG->getMachineNode(Hexagon::A2_addi, dl, MVT::i32, + Base, 
TargetConstVal, + SDValue(Result_1, 1)); + MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); + MemOp[0] = LD->getMemOperand(); + cast<MachineSDNode>(Result_1)->setMemRefs(MemOp, MemOp + 1); + const SDValue Froms[] = { SDValue(LD, 0), + SDValue(LD, 1), + SDValue(LD, 2) }; + const SDValue Tos[] = { SDValue(Result_2, 0), + SDValue(Result_3, 0), + SDValue(Result_1, 1) }; + ReplaceUses(Froms, Tos, 3); + return Result_2; } SDNode *HexagonDAGToDAGISel::SelectIndexedLoadZeroExtend64(LoadSDNode *LD, unsigned Opcode, - SDLoc dl) -{ + SDLoc dl) { SDValue Chain = LD->getChain(); EVT LoadedVT = LD->getMemoryVT(); SDValue Base = LD->getBasePtr(); SDValue Offset = LD->getOffset(); SDNode *OffsetNode = Offset.getNode(); int32_t Val = cast<ConstantSDNode>(OffsetNode)->getSExtValue(); - SDValue N1 = LD->getOperand(1); - SDValue CPTmpN1_0; - SDValue CPTmpN1_1; - - if (SelectADDRriS11_2(N1, CPTmpN1_0, CPTmpN1_1) && - N1.getNode()->getValueType(0) == MVT::i32) { - const HexagonInstrInfo *TII = Subtarget->getInstrInfo(); - if (TII->isValidAutoIncImm(LoadedVT, Val)) { - SDValue TargetConstVal = CurDAG->getTargetConstant(Val, MVT::i32); - SDValue TargetConst0 = CurDAG->getTargetConstant(0, MVT::i32); - SDNode *Result_1 = CurDAG->getMachineNode(Opcode, dl, MVT::i32, - MVT::i32, MVT::Other, Base, - TargetConstVal, Chain); - SDNode *Result_2 = CurDAG->getMachineNode(Hexagon::A2_tfrsi, dl, MVT::i32, - TargetConst0); - SDNode *Result_3 = CurDAG->getMachineNode(Hexagon::A2_combinew, dl, - MVT::i64, MVT::Other, - SDValue(Result_2,0), - SDValue(Result_1,0)); - MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); - MemOp[0] = LD->getMemOperand(); - cast<MachineSDNode>(Result_1)->setMemRefs(MemOp, MemOp + 1); - const SDValue Froms[] = { SDValue(LD, 0), - SDValue(LD, 1), - SDValue(LD, 2) - }; - const SDValue Tos[] = { SDValue(Result_3, 0), - SDValue(Result_1, 1), - SDValue(Result_1, 2) - }; - ReplaceUses(Froms, Tos, 3); - return Result_3; - } - // Generate an indirect load. - SDValue TargetConst0 = CurDAG->getTargetConstant(0, MVT::i32); + const HexagonInstrInfo &TII = *HST->getInstrInfo(); + if (TII.isValidAutoIncImm(LoadedVT, Val)) { SDValue TargetConstVal = CurDAG->getTargetConstant(Val, MVT::i32); + SDValue TargetConst0 = CurDAG->getTargetConstant(0, MVT::i32); SDNode *Result_1 = CurDAG->getMachineNode(Opcode, dl, MVT::i32, - MVT::Other, - Base, TargetConst0, Chain); - SDNode *Result_2 = CurDAG->getMachineNode(Hexagon::A2_tfrsi, dl, MVT::i32, - TargetConst0); - SDNode *Result_3 = CurDAG->getMachineNode(Hexagon::A2_combinew, dl, + MVT::i32, MVT::Other, Base, + TargetConstVal, Chain); + SDNode *Result_2 = CurDAG->getMachineNode(Hexagon::A4_combineir, dl, MVT::i64, MVT::Other, - SDValue(Result_2,0), + TargetConst0, SDValue(Result_1,0)); - // Add offset to base. - SDNode* Result_4 = CurDAG->getMachineNode(Hexagon::A2_addi, dl, MVT::i32, - Base, TargetConstVal, - SDValue(Result_1, 1)); MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); MemOp[0] = LD->getMemOperand(); cast<MachineSDNode>(Result_1)->setMemRefs(MemOp, MemOp + 1); const SDValue Froms[] = { SDValue(LD, 0), SDValue(LD, 1), - SDValue(LD, 2) - }; - const SDValue Tos[] = { SDValue(Result_3, 0), // Load value. - SDValue(Result_4, 0), // New address. - SDValue(Result_1, 1) - }; + SDValue(LD, 2) }; + const SDValue Tos[] = { SDValue(Result_2, 0), + SDValue(Result_1, 1), + SDValue(Result_1, 2) }; ReplaceUses(Froms, Tos, 3); - return Result_3; + return Result_2; } - return SelectCode(LD); + // Generate an indirect load. 
+ SDValue TargetConst0 = CurDAG->getTargetConstant(0, MVT::i32); + SDValue TargetConstVal = CurDAG->getTargetConstant(Val, MVT::i32); + SDNode *Result_1 = CurDAG->getMachineNode(Opcode, dl, MVT::i32, + MVT::Other, Base, TargetConst0, + Chain); + SDNode *Result_2 = CurDAG->getMachineNode(Hexagon::A4_combineir, dl, + MVT::i64, MVT::Other, + TargetConst0, + SDValue(Result_1,0)); + // Add offset to base. + SDNode* Result_3 = CurDAG->getMachineNode(Hexagon::A2_addi, dl, MVT::i32, + Base, TargetConstVal, + SDValue(Result_1, 1)); + MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); + MemOp[0] = LD->getMemOperand(); + cast<MachineSDNode>(Result_1)->setMemRefs(MemOp, MemOp + 1); + const SDValue Froms[] = { SDValue(LD, 0), + SDValue(LD, 1), + SDValue(LD, 2) }; + const SDValue Tos[] = { SDValue(Result_2, 0), // Load value. + SDValue(Result_3, 0), // New address. + SDValue(Result_1, 1) }; + ReplaceUses(Froms, Tos, 3); + return Result_2; } @@ -555,45 +369,44 @@ SDNode *HexagonDAGToDAGISel::SelectIndexedLoad(LoadSDNode *LD, SDLoc dl) { EVT LoadedVT = LD->getMemoryVT(); unsigned Opcode = 0; - // Check for zero ext loads. - bool zextval = (LD->getExtensionType() == ISD::ZEXTLOAD); + // Check for zero extended loads. Treat any-extend loads as zero extended + // loads. + ISD::LoadExtType ExtType = LD->getExtensionType(); + bool IsZeroExt = (ExtType == ISD::ZEXTLOAD || ExtType == ISD::EXTLOAD); // Figure out the opcode. - const HexagonInstrInfo *TII = Subtarget->getInstrInfo(); + const HexagonInstrInfo &TII = *HST->getInstrInfo(); if (LoadedVT == MVT::i64) { - if (TII->isValidAutoIncImm(LoadedVT, Val)) + if (TII.isValidAutoIncImm(LoadedVT, Val)) Opcode = Hexagon::L2_loadrd_pi; else Opcode = Hexagon::L2_loadrd_io; } else if (LoadedVT == MVT::i32) { - if (TII->isValidAutoIncImm(LoadedVT, Val)) + if (TII.isValidAutoIncImm(LoadedVT, Val)) Opcode = Hexagon::L2_loadri_pi; else Opcode = Hexagon::L2_loadri_io; } else if (LoadedVT == MVT::i16) { - if (TII->isValidAutoIncImm(LoadedVT, Val)) - Opcode = zextval ? Hexagon::L2_loadruh_pi : Hexagon::L2_loadrh_pi; + if (TII.isValidAutoIncImm(LoadedVT, Val)) + Opcode = IsZeroExt ? Hexagon::L2_loadruh_pi : Hexagon::L2_loadrh_pi; else - Opcode = zextval ? Hexagon::L2_loadruh_io : Hexagon::L2_loadrh_io; + Opcode = IsZeroExt ? Hexagon::L2_loadruh_io : Hexagon::L2_loadrh_io; } else if (LoadedVT == MVT::i8) { - if (TII->isValidAutoIncImm(LoadedVT, Val)) - Opcode = zextval ? Hexagon::L2_loadrub_pi : Hexagon::L2_loadrb_pi; + if (TII.isValidAutoIncImm(LoadedVT, Val)) + Opcode = IsZeroExt ? Hexagon::L2_loadrub_pi : Hexagon::L2_loadrb_pi; else - Opcode = zextval ? Hexagon::L2_loadrub_io : Hexagon::L2_loadrb_io; + Opcode = IsZeroExt ? Hexagon::L2_loadrub_io : Hexagon::L2_loadrb_io; } else llvm_unreachable("unknown memory type"); - // For zero ext i64 loads, we need to add combine instructions. - if (LD->getValueType(0) == MVT::i64 && - LD->getExtensionType() == ISD::ZEXTLOAD) { + // For zero extended i64 loads, we need to add combine instructions. + if (LD->getValueType(0) == MVT::i64 && IsZeroExt) return SelectIndexedLoadZeroExtend64(LD, Opcode, dl); - } - if (LD->getValueType(0) == MVT::i64 && - LD->getExtensionType() == ISD::SEXTLOAD) { - // Handle sign ext i64 loads. + // Handle sign extended i64 loads. 
+ if (LD->getValueType(0) == MVT::i64 && ExtType == ISD::SEXTLOAD) return SelectIndexedLoadSignExtend64(LD, Opcode, dl); - } - if (TII->isValidAutoIncImm(LoadedVT, Val)) { + + if (TII.isValidAutoIncImm(LoadedVT, Val)) { SDValue TargetConstVal = CurDAG->getTargetConstant(Val, MVT::i32); SDNode* Result = CurDAG->getMachineNode(Opcode, dl, LD->getValueType(0), @@ -649,7 +462,7 @@ SDNode *HexagonDAGToDAGISel::SelectLoad(SDNode *N) { if (AM != ISD::UNINDEXED) { result = SelectIndexedLoad(LD, dl); } else { - result = SelectBaseOffsetLoad(LD, dl); + result = SelectCode(LD); } return result; @@ -665,13 +478,12 @@ SDNode *HexagonDAGToDAGISel::SelectIndexedStore(StoreSDNode *ST, SDLoc dl) { // Get the constant value. int32_t Val = cast<ConstantSDNode>(OffsetNode)->getSExtValue(); EVT StoredVT = ST->getMemoryVT(); + EVT ValueVT = Value.getValueType(); // Offset value must be within representable range // and must have correct alignment properties. - const HexagonInstrInfo *TII = Subtarget->getInstrInfo(); - if (TII->isValidAutoIncImm(StoredVT, Val)) { - SDValue Ops[] = {Base, CurDAG->getTargetConstant(Val, MVT::i32), Value, - Chain}; + const HexagonInstrInfo &TII = *HST->getInstrInfo(); + if (TII.isValidAutoIncImm(StoredVT, Val)) { unsigned Opcode = 0; // Figure out the post inc version of opcode. @@ -681,6 +493,13 @@ SDNode *HexagonDAGToDAGISel::SelectIndexedStore(StoreSDNode *ST, SDLoc dl) { else if (StoredVT == MVT::i8) Opcode = Hexagon::S2_storerb_pi; else llvm_unreachable("unknown memory type"); + if (ST->isTruncatingStore() && ValueVT.getSizeInBits() == 64) { + assert(StoredVT.getSizeInBits() < 64 && "Not a truncating store"); + Value = CurDAG->getTargetExtractSubreg(Hexagon::subreg_loreg, + dl, MVT::i32, Value); + } + SDValue Ops[] = {Base, CurDAG->getTargetConstant(Val, MVT::i32), Value, + Chain}; // Build post increment store. SDNode* Result = CurDAG->getMachineNode(Opcode, dl, MVT::i32, MVT::Other, Ops); @@ -694,7 +513,8 @@ SDNode *HexagonDAGToDAGISel::SelectIndexedStore(StoreSDNode *ST, SDLoc dl) { } // Note: Order of operands matches the def of instruction: - // def STrid : STInst<(outs), (ins MEMri:$addr, DoubleRegs:$src1), ... + // def S2_storerd_io + // : STInst<(outs), (ins IntRegs:$base, imm:$offset, DoubleRegs:$src1), ... // and it differs for POST_ST* for instance. SDValue Ops[] = { Base, CurDAG->getTargetConstant(0, MVT::i32), Value, Chain}; @@ -724,61 +544,6 @@ SDNode *HexagonDAGToDAGISel::SelectIndexedStore(StoreSDNode *ST, SDLoc dl) { return Result_2; } - -SDNode *HexagonDAGToDAGISel::SelectBaseOffsetStore(StoreSDNode *ST, - SDLoc dl) { - SDValue Chain = ST->getChain(); - SDNode* Const32 = ST->getBasePtr().getNode(); - SDValue Value = ST->getValue(); - unsigned Opcode = 0; - - // Try to lower stores of GlobalAdresses into indexed stores. Custom - // lowering for GlobalAddress nodes has already turned it into a - // CONST32. Avoid truncating stores for the moment. Post-inc stores - // do the same. Don't think there's a reason for it, so will file a - // bug to fix. 
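Back on the post-increment store hunk above: the new subreg_loreg extraction matches what a narrowing store means at the value level, since only the low half of the 64-bit register pair reaches memory. Illustratively:

  #include <cstdint>

  // A truncating store of an i64 to a 32-bit slot writes only the low word,
  // which is exactly the subreg_loreg half of the register pair.
  void truncStore32(uint64_t V, uint32_t *P) {
    *P = static_cast<uint32_t>(V);   // high word dropped
  }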
- if ((Const32->getOpcode() == HexagonISD::CONST32) && - !(Value.getValueType() == MVT::i64 && ST->isTruncatingStore())) { - SDValue Base = Const32->getOperand(0); - if (Base.getOpcode() == ISD::TargetGlobalAddress) { - EVT StoredVT = ST->getMemoryVT(); - int64_t Offset = cast<GlobalAddressSDNode>(Base)->getOffset(); - if (Offset != 0 && OffsetFitsS11(StoredVT, Offset)) { - MVT PointerTy = getTargetLowering()->getPointerTy(); - const GlobalValue* GV = - cast<GlobalAddressSDNode>(Base)->getGlobal(); - SDValue TargAddr = - CurDAG->getTargetGlobalAddress(GV, dl, PointerTy, 0); - SDNode* NewBase = CurDAG->getMachineNode(Hexagon::CONST32_set, - dl, PointerTy, - TargAddr); - - // Figure out base + offset opcode - if (StoredVT == MVT::i64) Opcode = Hexagon::S2_storerd_io; - else if (StoredVT == MVT::i32) Opcode = Hexagon::S2_storeri_io; - else if (StoredVT == MVT::i16) Opcode = Hexagon::S2_storerh_io; - else if (StoredVT == MVT::i8) Opcode = Hexagon::S2_storerb_io; - else llvm_unreachable("unknown memory type"); - - SDValue Ops[] = {SDValue(NewBase,0), - CurDAG->getTargetConstant(Offset,PointerTy), - Value, Chain}; - // build indexed store - SDNode* Result = CurDAG->getMachineNode(Opcode, dl, - MVT::Other, Ops); - MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); - MemOp[0] = ST->getMemOperand(); - cast<MachineSDNode>(Result)->setMemRefs(MemOp, MemOp + 1); - ReplaceUses(ST, Result); - return Result; - } - } - } - - return SelectCode(ST); -} - - SDNode *HexagonDAGToDAGISel::SelectStore(SDNode *N) { SDLoc dl(N); StoreSDNode *ST = cast<StoreSDNode>(N); @@ -789,7 +554,7 @@ SDNode *HexagonDAGToDAGISel::SelectStore(SDNode *N) { return SelectIndexedStore(ST, dl); } - return SelectBaseOffsetStore(ST, dl); + return SelectCode(ST); } SDNode *HexagonDAGToDAGISel::SelectMul(SDNode *N) { @@ -875,187 +640,6 @@ SDNode *HexagonDAGToDAGISel::SelectMul(SDNode *N) { return SelectCode(N); } - -SDNode *HexagonDAGToDAGISel::SelectSelect(SDNode *N) { - SDLoc dl(N); - SDValue N0 = N->getOperand(0); - if (N0.getOpcode() == ISD::SETCC) { - SDValue N00 = N0.getOperand(0); - if (N00.getOpcode() == ISD::SIGN_EXTEND_INREG) { - SDValue N000 = N00.getOperand(0); - SDValue N001 = N00.getOperand(1); - if (cast<VTSDNode>(N001)->getVT() == MVT::i16) { - SDValue N01 = N0.getOperand(1); - SDValue N02 = N0.getOperand(2); - - // Pattern: (select:i32 (setcc:i1 (sext_inreg:i32 IntRegs:i32:$src2, - // i16:Other),IntRegs:i32:$src1, SETLT:Other),IntRegs:i32:$src1, - // IntRegs:i32:$src2) - // Emits: (MAXh_rr:i32 IntRegs:i32:$src1, IntRegs:i32:$src2) - // Pattern complexity = 9 cost = 1 size = 0. - if (cast<CondCodeSDNode>(N02)->get() == ISD::SETLT) { - SDValue N1 = N->getOperand(1); - if (N01 == N1) { - SDValue N2 = N->getOperand(2); - if (N000 == N2 && - N0.getNode()->getValueType(N0.getResNo()) == MVT::i1 && - N00.getNode()->getValueType(N00.getResNo()) == MVT::i32) { - SDNode *SextNode = CurDAG->getMachineNode(Hexagon::A2_sxth, dl, - MVT::i32, N000); - SDNode *Result = CurDAG->getMachineNode(Hexagon::A2_max, dl, - MVT::i32, - SDValue(SextNode, 0), - N1); - ReplaceUses(N, Result); - return Result; - } - } - } - - // Pattern: (select:i32 (setcc:i1 (sext_inreg:i32 IntRegs:i32:$src2, - // i16:Other), IntRegs:i32:$src1, SETGT:Other), IntRegs:i32:$src1, - // IntRegs:i32:$src2) - // Emits: (MINh_rr:i32 IntRegs:i32:$src1, IntRegs:i32:$src2) - // Pattern complexity = 9 cost = 1 size = 0. 
- if (cast<CondCodeSDNode>(N02)->get() == ISD::SETGT) { - SDValue N1 = N->getOperand(1); - if (N01 == N1) { - SDValue N2 = N->getOperand(2); - if (N000 == N2 && - N0.getNode()->getValueType(N0.getResNo()) == MVT::i1 && - N00.getNode()->getValueType(N00.getResNo()) == MVT::i32) { - SDNode *SextNode = CurDAG->getMachineNode(Hexagon::A2_sxth, dl, - MVT::i32, N000); - SDNode *Result = CurDAG->getMachineNode(Hexagon::A2_min, dl, - MVT::i32, - SDValue(SextNode, 0), - N1); - ReplaceUses(N, Result); - return Result; - } - } - } - } - } - } - - return SelectCode(N); -} - - -SDNode *HexagonDAGToDAGISel::SelectTruncate(SDNode *N) { - SDLoc dl(N); - SDValue Shift = N->getOperand(0); - - // - // %conv.i = sext i32 %tmp1 to i64 - // %conv2.i = sext i32 %add to i64 - // %mul.i = mul nsw i64 %conv2.i, %conv.i - // %shr5.i = lshr i64 %mul.i, 32 - // %conv3.i = trunc i64 %shr5.i to i32 - // - // --- match with the following --- - // - // %conv3.i = mpy (%tmp1, %add) - // - // Trunc to i32. - if (N->getValueType(0) == MVT::i32) { - // Trunc from i64. - if (Shift.getNode()->getValueType(0) == MVT::i64) { - // Trunc child is logical shift right. - if (Shift.getOpcode() != ISD::SRL) { - return SelectCode(N); - } - - SDValue ShiftOp0 = Shift.getOperand(0); - SDValue ShiftOp1 = Shift.getOperand(1); - - // Shift by const 32 - if (ShiftOp1.getOpcode() != ISD::Constant) { - return SelectCode(N); - } - - int32_t ShiftConst = - cast<ConstantSDNode>(ShiftOp1.getNode())->getSExtValue(); - if (ShiftConst != 32) { - return SelectCode(N); - } - - // Shifting a i64 signed multiply - SDValue Mul = ShiftOp0; - if (Mul.getOpcode() != ISD::MUL) { - return SelectCode(N); - } - - SDValue MulOp0 = Mul.getOperand(0); - SDValue MulOp1 = Mul.getOperand(1); - - SDValue OP0; - SDValue OP1; - - // Handle sign_extend and sextload - if (MulOp0.getOpcode() == ISD::SIGN_EXTEND) { - SDValue Sext0 = MulOp0.getOperand(0); - if (Sext0.getNode()->getValueType(0) != MVT::i32) { - return SelectCode(N); - } - - OP0 = Sext0; - } else if (MulOp0.getOpcode() == ISD::LOAD) { - LoadSDNode *LD = cast<LoadSDNode>(MulOp0.getNode()); - if (LD->getMemoryVT() != MVT::i32 || - LD->getExtensionType() != ISD::SEXTLOAD || - LD->getAddressingMode() != ISD::UNINDEXED) { - return SelectCode(N); - } - - SDValue Chain = LD->getChain(); - SDValue TargetConst0 = CurDAG->getTargetConstant(0, MVT::i32); - OP0 = SDValue(CurDAG->getMachineNode(Hexagon::L2_loadri_io, dl, MVT::i32, - MVT::Other, - LD->getBasePtr(), - TargetConst0, Chain), 0); - } else { - return SelectCode(N); - } - - // Same goes for the second operand. - if (MulOp1.getOpcode() == ISD::SIGN_EXTEND) { - SDValue Sext1 = MulOp1.getOperand(0); - if (Sext1.getNode()->getValueType(0) != MVT::i32) - return SelectCode(N); - - OP1 = Sext1; - } else if (MulOp1.getOpcode() == ISD::LOAD) { - LoadSDNode *LD = cast<LoadSDNode>(MulOp1.getNode()); - if (LD->getMemoryVT() != MVT::i32 || - LD->getExtensionType() != ISD::SEXTLOAD || - LD->getAddressingMode() != ISD::UNINDEXED) { - return SelectCode(N); - } - - SDValue Chain = LD->getChain(); - SDValue TargetConst0 = CurDAG->getTargetConstant(0, MVT::i32); - OP1 = SDValue(CurDAG->getMachineNode(Hexagon::L2_loadri_io, dl, MVT::i32, - MVT::Other, - LD->getBasePtr(), - TargetConst0, Chain), 0); - } else { - return SelectCode(N); - } - - // Generate a mpy instruction. 
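The idiom this removed matcher recognized is the classic signed multiply-high, which M2_mpy_up computes; in scalar terms (presumably the generated patterns cover it after this change):

  #include <cstdint>

  // (sext a * sext b) >> 32, keeping the upper half of the 64-bit product.
  int32_t mulHighSigned(int32_t A, int32_t B) {
    return static_cast<int32_t>((static_cast<int64_t>(A) * B) >> 32);
  }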
-    SDNode *Result = CurDAG->getMachineNode(Hexagon::M2_mpy_up, dl, MVT::i32,
-                                            OP0, OP1);
-    ReplaceUses(N, Result);
-    return Result;
-  }
-  }
-
-  return SelectCode(N);
-}
-
-
 SDNode *HexagonDAGToDAGISel::SelectSHL(SDNode *N) {
   SDLoc dl(N);
   if (N->getValueType(0) == MVT::i32) {
@@ -1134,6 +718,36 @@ SDNode *HexagonDAGToDAGISel::SelectSHL(SDNode *N) {
 //
 SDNode *HexagonDAGToDAGISel::SelectZeroExtend(SDNode *N) {
   SDLoc dl(N);
+
+  SDValue Op0 = N->getOperand(0);
+  EVT OpVT = Op0.getValueType();
+  unsigned OpBW = OpVT.getSizeInBits();
+
+  // Special handling for zero-extending a vector of booleans.
+  if (OpVT.isVector() && OpVT.getVectorElementType() == MVT::i1 && OpBW <= 64) {
+    SDNode *Mask = CurDAG->getMachineNode(Hexagon::C2_mask, dl, MVT::i64, Op0);
+    unsigned NE = OpVT.getVectorNumElements();
+    EVT ExVT = N->getValueType(0);
+    unsigned ES = ExVT.getVectorElementType().getSizeInBits();
+    uint64_t MV = 0, Bit = 1;
+    for (unsigned i = 0; i < NE; ++i) {
+      MV |= Bit;
+      Bit <<= ES;
+    }
+    SDValue Ones = CurDAG->getTargetConstant(MV, MVT::i64);
+    SDNode *OnesReg = CurDAG->getMachineNode(Hexagon::CONST64_Int_Real, dl,
+                                             MVT::i64, Ones);
+    if (ExVT.getSizeInBits() == 32) {
+      SDNode *And = CurDAG->getMachineNode(Hexagon::A2_andp, dl, MVT::i64,
+                        SDValue(Mask,0), SDValue(OnesReg,0));
+      SDValue SubR = CurDAG->getTargetConstant(Hexagon::subreg_loreg, MVT::i32);
+      return CurDAG->getMachineNode(Hexagon::EXTRACT_SUBREG, dl, ExVT,
+                                    SDValue(And,0), SubR);
+    }
+    return CurDAG->getMachineNode(Hexagon::A2_andp, dl, ExVT,
+                                  SDValue(Mask,0), SDValue(OnesReg,0));
+  }
+
   SDNode *IsIntrinsic = N->getOperand(0).getNode();
   if ((IsIntrinsic->getOpcode() == ISD::INTRINSIC_WO_CHAIN)) {
     unsigned ID =
@@ -1141,7 +755,7 @@ SDNode *HexagonDAGToDAGISel::SelectZeroExtend(SDNode *N) {
     if (doesIntrinsicReturnPredicate(ID)) {
       // Now we need to differentiate target data types.
       if (N->getValueType(0) == MVT::i64) {
-        // Convert the zero_extend to Rs = Pd followed by COMBINE_rr(0,Rs).
+        // Convert the zero_extend to Rs = Pd followed by A2_combinew(0,Rs).
         SDValue TargetConst0 = CurDAG->getTargetConstant(0, MVT::i32);
         SDNode *Result_1 = CurDAG->getMachineNode(Hexagon::C2_tfrpr, dl,
                                                   MVT::i32,
@@ -1171,6 +785,203 @@ SDNode *HexagonDAGToDAGISel::SelectZeroExtend(SDNode *N) {
 }
 
 //
+// Check for circular load/store and bit-reverse load/store intrinsics in
+// order to select the correct lowered operation.
+//
+SDNode *HexagonDAGToDAGISel::SelectIntrinsicWChain(SDNode *N) {
+  unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
+  if (IntNo == Intrinsic::hexagon_circ_ldd ||
+      IntNo == Intrinsic::hexagon_circ_ldw ||
+      IntNo == Intrinsic::hexagon_circ_lduh ||
+      IntNo == Intrinsic::hexagon_circ_ldh ||
+      IntNo == Intrinsic::hexagon_circ_ldub ||
+      IntNo == Intrinsic::hexagon_circ_ldb) {
+    SDLoc dl(N);
+    SDValue Chain = N->getOperand(0);
+    SDValue Base = N->getOperand(2);
+    SDValue Load = N->getOperand(3);
+    SDValue ModifierExpr = N->getOperand(4);
+    SDValue Offset = N->getOperand(5);
+
+    // We need to add the return type for the load. This intrinsic has
+    // two return types, one for the load and one for the post-increment.
+    // Only the *_ld instructions push the extra return type, and bump the
+    // result node operand number correspondingly.
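As background for the operand shuffling that follows, a loose C model of a circular word load (semantics inferred from the node shape here, not quoted from the ISA manual; note the selector also stores the loaded element through the destination pointer operand):

  #include <cstdint>

  // Load the element, publish it through Dst, advance the cursor, and wrap
  // at the end of the buffer: the "circular" in circ_ldw.
  int32_t *circLoadWord(int32_t *Cursor, int32_t *Dst, int32_t *Buf,
                        unsigned NumElts) {
    *Dst = *Cursor++;
    if (Cursor == Buf + NumElts)
      Cursor = Buf;
    return Cursor;                   // the post-incremented pointer result
  }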
+    std::vector<EVT> ResTys;
+    unsigned opc;
+    unsigned memsize, align;
+    MVT MvtSize = MVT::i32;
+
+    if (IntNo == Intrinsic::hexagon_circ_ldd) {
+      ResTys.push_back(MVT::i32);
+      ResTys.push_back(MVT::i64);
+      opc = Hexagon::L2_loadrd_pci_pseudo;
+      memsize = 8;
+      align = 8;
+    } else if (IntNo == Intrinsic::hexagon_circ_ldw) {
+      ResTys.push_back(MVT::i32);
+      ResTys.push_back(MVT::i32);
+      opc = Hexagon::L2_loadri_pci_pseudo;
+      memsize = 4;
+      align = 4;
+    } else if (IntNo == Intrinsic::hexagon_circ_ldh) {
+      ResTys.push_back(MVT::i32);
+      ResTys.push_back(MVT::i32);
+      opc = Hexagon::L2_loadrh_pci_pseudo;
+      memsize = 2;
+      align = 2;
+      MvtSize = MVT::i16;
+    } else if (IntNo == Intrinsic::hexagon_circ_lduh) {
+      ResTys.push_back(MVT::i32);
+      ResTys.push_back(MVT::i32);
+      opc = Hexagon::L2_loadruh_pci_pseudo;
+      memsize = 2;
+      align = 2;
+      MvtSize = MVT::i16;
+    } else if (IntNo == Intrinsic::hexagon_circ_ldb) {
+      ResTys.push_back(MVT::i32);
+      ResTys.push_back(MVT::i32);
+      opc = Hexagon::L2_loadrb_pci_pseudo;
+      memsize = 1;
+      align = 1;
+      MvtSize = MVT::i8;
+    } else if (IntNo == Intrinsic::hexagon_circ_ldub) {
+      ResTys.push_back(MVT::i32);
+      ResTys.push_back(MVT::i32);
+      opc = Hexagon::L2_loadrub_pci_pseudo;
+      memsize = 1;
+      align = 1;
+      MvtSize = MVT::i8;
+    } else
+      llvm_unreachable("no opc");
+
+    ResTys.push_back(MVT::Other);
+
+    // Copy over the arguments, which are the same mostly.
+    SmallVector<SDValue, 5> Ops;
+    Ops.push_back(Base);
+    Ops.push_back(Load);
+    Ops.push_back(ModifierExpr);
+    int32_t Val = cast<ConstantSDNode>(Offset.getNode())->getSExtValue();
+    Ops.push_back(CurDAG->getTargetConstant(Val, MVT::i32));
+    Ops.push_back(Chain);
+    SDNode* Result = CurDAG->getMachineNode(opc, dl, ResTys, Ops);
+
+    SDValue ST;
+    MachineMemOperand *Mem =
+      MF->getMachineMemOperand(MachinePointerInfo(),
+                               MachineMemOperand::MOStore, memsize, align);
+    if (MvtSize != MVT::i32)
+      ST = CurDAG->getTruncStore(Chain, dl, SDValue(Result, 1), Load,
+                                 MvtSize, Mem);
+    else
+      ST = CurDAG->getStore(Chain, dl, SDValue(Result, 1), Load, Mem);
+
+    SDNode* Store = SelectStore(ST.getNode());
+
+    const SDValue Froms[] = { SDValue(N, 0),
+                              SDValue(N, 1) };
+    const SDValue Tos[]   = { SDValue(Result, 0),
+                              SDValue(Store, 0) };
+    ReplaceUses(Froms, Tos, 2);
+    return Result;
+  }
+
+  if (IntNo == Intrinsic::hexagon_brev_ldd ||
+      IntNo == Intrinsic::hexagon_brev_ldw ||
+      IntNo == Intrinsic::hexagon_brev_ldh ||
+      IntNo == Intrinsic::hexagon_brev_lduh ||
+      IntNo == Intrinsic::hexagon_brev_ldb ||
+      IntNo == Intrinsic::hexagon_brev_ldub) {
+    SDLoc dl(N);
+    SDValue Chain = N->getOperand(0);
+    SDValue Base = N->getOperand(2);
+    SDValue Load = N->getOperand(3);
+    SDValue ModifierExpr = N->getOperand(4);
+
+    // We need to add the return type for the load. This intrinsic has
+    // two return types, one for the load and one for the post-increment.
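Likewise for the brev flavors: the cursor walks the buffer in bit-reversed index order, the access pattern FFT butterflies use. At the index level (sketch):

  // Reverse the low LogN bits of I: the order in which a bit-reversed
  // pointer visits an N = 2^LogN element buffer.
  unsigned bitReverseIndex(unsigned I, unsigned LogN) {
    unsigned R = 0;
    for (unsigned B = 0; B < LogN; ++B)
      R = (R << 1) | ((I >> B) & 1u);
    return R;
  }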
+ std::vector<EVT> ResTys; + unsigned opc; + unsigned memsize, align; + MVT MvtSize = MVT::i32; + + if (IntNo == Intrinsic::hexagon_brev_ldd) { + ResTys.push_back(MVT::i32); + ResTys.push_back(MVT::i64); + opc = Hexagon::L2_loadrd_pbr_pseudo; + memsize = 8; + align = 8; + } else if (IntNo == Intrinsic::hexagon_brev_ldw) { + ResTys.push_back(MVT::i32); + ResTys.push_back(MVT::i32); + opc = Hexagon::L2_loadri_pbr_pseudo; + memsize = 4; + align = 4; + } else if (IntNo == Intrinsic::hexagon_brev_ldh) { + ResTys.push_back(MVT::i32); + ResTys.push_back(MVT::i32); + opc = Hexagon::L2_loadrh_pbr_pseudo; + memsize = 2; + align = 2; + MvtSize = MVT::i16; + } else if (IntNo == Intrinsic::hexagon_brev_lduh) { + ResTys.push_back(MVT::i32); + ResTys.push_back(MVT::i32); + opc = Hexagon::L2_loadruh_pbr_pseudo; + memsize = 2; + align = 2; + MvtSize = MVT::i16; + } else if (IntNo == Intrinsic::hexagon_brev_ldb) { + ResTys.push_back(MVT::i32); + ResTys.push_back(MVT::i32); + opc = Hexagon::L2_loadrb_pbr_pseudo; + memsize = 1; + align = 1; + MvtSize = MVT::i8; + } else if (IntNo == Intrinsic::hexagon_brev_ldub) { + ResTys.push_back(MVT::i32); + ResTys.push_back(MVT::i32); + opc = Hexagon::L2_loadrub_pbr_pseudo; + memsize = 1; + align = 1; + MvtSize = MVT::i8; + } else + llvm_unreachable("no opc"); + + ResTys.push_back(MVT::Other); + + // Copy over the arguments, which are the same mostly. + SmallVector<SDValue, 4> Ops; + Ops.push_back(Base); + Ops.push_back(Load); + Ops.push_back(ModifierExpr); + Ops.push_back(Chain); + SDNode* Result = CurDAG->getMachineNode(opc, dl, ResTys, Ops); + SDValue ST; + MachineMemOperand *Mem = + MF->getMachineMemOperand(MachinePointerInfo(), + MachineMemOperand::MOStore, memsize, align); + if (MvtSize != MVT::i32) + ST = CurDAG->getTruncStore(Chain, dl, SDValue(Result, 1), Load, + MvtSize, Mem); + else + ST = CurDAG->getStore(Chain, dl, SDValue(Result, 1), Load, Mem); + + SDNode* Store = SelectStore(ST.getNode()); + + const SDValue Froms[] = { SDValue(N, 0), + SDValue(N, 1) }; + const SDValue Tos[] = { SDValue(Result, 0), + SDValue(Store, 0) }; + ReplaceUses(Froms, Tos, 2); + return Result; + } + + return SelectCode(N); +} + +// // Checking for intrinsics which have predicate registers as operand(s) // and lowering to the actual intrinsic. // @@ -1217,37 +1028,20 @@ SDNode *HexagonDAGToDAGISel::SelectConstantFP(SDNode *N) { return SelectCode(N); } - // // Map predicate true (encoded as -1 in LLVM) to a XOR. // SDNode *HexagonDAGToDAGISel::SelectConstant(SDNode *N) { SDLoc dl(N); if (N->getValueType(0) == MVT::i1) { - SDNode* Result; + SDNode* Result = 0; int32_t Val = cast<ConstantSDNode>(N)->getSExtValue(); if (Val == -1) { - // Create the IntReg = 1 node. 
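The sequence removed below materialized the constant-true predicate from the identity p ^ !p == true, spending several nodes on it; the new TFR_PdTrue/TFR_PdFalse pseudos encode the constants directly. The identity, as a one-liner:

  // Holds for either value of P, which is why the old expansion worked.
  bool pdTrue(bool P) { return P ^ !P; }  // always true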
-    SDNode* IntRegTFR =
-        CurDAG->getMachineNode(Hexagon::A2_tfrsi, dl, MVT::i32,
-                               CurDAG->getTargetConstant(0, MVT::i32));
-
-    // Pd = IntReg
-    SDNode* Pd = CurDAG->getMachineNode(Hexagon::C2_tfrrp, dl, MVT::i1,
-                                        SDValue(IntRegTFR, 0));
-
-    // not(Pd)
-    SDNode* NotPd = CurDAG->getMachineNode(Hexagon::C2_not, dl, MVT::i1,
-                                           SDValue(Pd, 0));
-
-    // xor(not(Pd))
-    Result = CurDAG->getMachineNode(Hexagon::C2_xor, dl, MVT::i1,
-                                    SDValue(Pd, 0), SDValue(NotPd, 0));
-
-    // We have just built:
-    // Rs = Pd
-    // Pd = xor(not(Pd), Pd)
-
+      Result = CurDAG->getMachineNode(Hexagon::TFR_PdTrue, dl, MVT::i1);
+    } else if (Val == 0) {
+      Result = CurDAG->getMachineNode(Hexagon::TFR_PdFalse, dl, MVT::i1);
+    }
+    if (Result) {
       ReplaceUses(N, Result);
       return Result;
     }
@@ -1283,347 +1077,282 @@ SDNode *HexagonDAGToDAGISel::SelectAdd(SDNode *N) {
   return Result;
 }
 
-
-SDNode *HexagonDAGToDAGISel::Select(SDNode *N) {
-  if (N->isMachineOpcode()) {
-    N->setNodeId(-1);
-    return nullptr; // Already selected.
-  }
-
-
-  switch (N->getOpcode()) {
-  case ISD::Constant:
-    return SelectConstant(N);
-
-  case ISD::ConstantFP:
-    return SelectConstantFP(N);
-
-  case ISD::ADD:
-    return SelectAdd(N);
-
-  case ISD::SHL:
-    return SelectSHL(N);
-
-  case ISD::LOAD:
-    return SelectLoad(N);
-
-  case ISD::STORE:
-    return SelectStore(N);
-
-  case ISD::SELECT:
-    return SelectSelect(N);
-
-  case ISD::TRUNCATE:
-    return SelectTruncate(N);
-
-  case ISD::MUL:
-    return SelectMul(N);
-
-  case ISD::ZERO_EXTEND:
-    return SelectZeroExtend(N);
-
-  case ISD::INTRINSIC_WO_CHAIN:
-    return SelectIntrinsicWOChain(N);
-  }
-
-  return SelectCode(N);
-}
-
-
 //
-// Hexagon_TODO: Five functions for ADDRri?! Surely there must be a better way
-// to define these instructions.
+// Map the following, where possible.
+// AND/FABS -> clrbit
+// OR -> setbit
+// XOR/FNEG -> togglebit.
 //
-bool HexagonDAGToDAGISel::SelectADDRri(SDValue& Addr, SDValue &Base,
-                                       SDValue &Offset) {
-  if (Addr.getOpcode() == ISD::TargetExternalSymbol ||
-      Addr.getOpcode() == ISD::TargetGlobalAddress)
-    return false; // Direct calls.
-
-  if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) {
-    Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i32);
-    Offset = CurDAG->getTargetConstant(0, MVT::i32);
-    return true;
-  }
-  Base = Addr;
-  Offset = CurDAG->getTargetConstant(0, MVT::i32);
-  return true;
-}
+SDNode *HexagonDAGToDAGISel::SelectBitOp(SDNode *N) {
+  SDLoc dl(N);
+  EVT ValueVT = N->getValueType(0);
 
+  // We handle only 32 and 64-bit bit ops.
+  if (!(ValueVT == MVT::i32 || ValueVT == MVT::i64 ||
+        ValueVT == MVT::f32 || ValueVT == MVT::f64))
+    return SelectCode(N);
 
-bool HexagonDAGToDAGISel::SelectADDRriS11_0(SDValue& Addr, SDValue &Base,
-                                            SDValue &Offset) {
-  if (Addr.getOpcode() == ISD::TargetExternalSymbol ||
-      Addr.getOpcode() == ISD::TargetGlobalAddress)
-    return false; // Direct calls.
 
+  // We handle only fabs and fneg for V5.
+  unsigned Opc = N->getOpcode();
+  if ((Opc == ISD::FABS || Opc == ISD::FNEG) && !HST->hasV5TOps())
+    return SelectCode(N);
 
-  if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) {
-    Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i32);
-    Offset = CurDAG->getTargetConstant(0, MVT::i32);
-    return (IsS11_0_Offset(Offset.getNode()));
+  int64_t Val = 0;
+  if (Opc != ISD::FABS && Opc != ISD::FNEG) {
+    if (N->getOperand(1).getOpcode() == ISD::Constant)
+      Val = cast<ConstantSDNode>((N)->getOperand(1))->getSExtValue();
+    else
+      return SelectCode(N);
+  }
-  Base = Addr;
-  Offset = CurDAG->getTargetConstant(0, MVT::i32);
-  return (IsS11_0_Offset(Offset.getNode()));
-}
-
-bool HexagonDAGToDAGISel::SelectADDRriS11_1(SDValue& Addr, SDValue &Base,
-                                            SDValue &Offset) {
-  if (Addr.getOpcode() == ISD::TargetExternalSymbol ||
-      Addr.getOpcode() == ISD::TargetGlobalAddress)
-    return false; // Direct calls.
-
-  if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) {
-    Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i32);
-    Offset = CurDAG->getTargetConstant(0, MVT::i32);
-    return (IsS11_1_Offset(Offset.getNode()));
+  if (Opc == ISD::AND) {
+    if (((ValueVT == MVT::i32) &&
+                  (!((Val & 0x80000000) || (Val & 0x7fffffff)))) ||
+        ((ValueVT == MVT::i64) &&
+                  (!((Val & 0x8000000000000000) || (Val & 0x7fffffff)))))
+      // If it's simple AND, do the normal op.
+      return SelectCode(N);
+    else
+      Val = ~Val;
+  }
-  Base = Addr;
-  Offset = CurDAG->getTargetConstant(0, MVT::i32);
-  return (IsS11_1_Offset(Offset.getNode()));
-}
-
-
-bool HexagonDAGToDAGISel::SelectADDRriS11_2(SDValue& Addr, SDValue &Base,
-                                            SDValue &Offset) {
-  if (Addr.getOpcode() == ISD::TargetExternalSymbol ||
-      Addr.getOpcode() == ISD::TargetGlobalAddress)
-    return false; // Direct calls.
-  if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) {
-    Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i32);
-    Offset = CurDAG->getTargetConstant(0, MVT::i32);
-    return (IsS11_2_Offset(Offset.getNode()));
+  // If OR or AND is being fed by shl, srl, or sra, don't do this change,
+  // because Hexagon provides |= and &= forms of shl, srl, and sra.
+  // Traverse the DAG to see if there is a shl, srl, or sra.
+  if (Opc == ISD::OR || Opc == ISD::AND) {
+    switch (N->getOperand(0)->getOpcode()) {
+      default: break;
+      case ISD::SRA:
+      case ISD::SRL:
+      case ISD::SHL:
+        return SelectCode(N);
+    }
+  }
-  Base = Addr;
-  Offset = CurDAG->getTargetConstant(0, MVT::i32);
-  return (IsS11_2_Offset(Offset.getNode()));
-}
 
+  // Make sure it's a power of 2.
+  unsigned bitpos = 0;
+  if (Opc != ISD::FABS && Opc != ISD::FNEG) {
+    if (((ValueVT == MVT::i32) && !isPowerOf2_32(Val)) ||
+        ((ValueVT == MVT::i64) && !isPowerOf2_64(Val)))
+      return SelectCode(N);
 
-bool HexagonDAGToDAGISel::SelectADDRriU6_0(SDValue& Addr, SDValue &Base,
-                                           SDValue &Offset) {
-  if (Addr.getOpcode() == ISD::TargetExternalSymbol ||
-      Addr.getOpcode() == ISD::TargetGlobalAddress)
-    return false; // Direct calls.
-
-  if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) {
-    Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i32);
-    Offset = CurDAG->getTargetConstant(0, MVT::i32);
-    return (IsU6_0_Offset(Offset.getNode()));
+    // Get the bit position.
+    bitpos = countTrailingZeros(uint64_t(Val));
+  } else {
+    // For fabs and fneg, it's always the 31st bit.
+    bitpos = 31;
+  }
-  Base = Addr;
-  Offset = CurDAG->getTargetConstant(0, MVT::i32);
-  return (IsU6_0_Offset(Offset.getNode()));
-}
 
+  unsigned BitOpc = 0;
+  // Set the right opcode for bitwise operations.
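For reference, the scalar identities this selection exploits when the constant operand is a single bit 1 << P (bit 31, the IEEE-754 sign bit, in the fabs/fneg cases):

  #include <cstdint>

  uint32_t clrBit(uint32_t X, unsigned P)    { return X & ~(1u << P); }  // AND, FABS
  uint32_t setBit(uint32_t X, unsigned P)    { return X |  (1u << P); }  // OR
  uint32_t toggleBit(uint32_t X, unsigned P) { return X ^  (1u << P); }  // XOR, FNEG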
+ switch(Opc) { + default: llvm_unreachable("Only bit-wise/abs/neg operations are allowed."); + case ISD::AND: + case ISD::FABS: + BitOpc = Hexagon::S2_clrbit_i; + break; + case ISD::OR: + BitOpc = Hexagon::S2_setbit_i; + break; + case ISD::XOR: + case ISD::FNEG: + BitOpc = Hexagon::S2_togglebit_i; + break; + } -bool HexagonDAGToDAGISel::SelectADDRriU6_1(SDValue& Addr, SDValue &Base, - SDValue &Offset) { - if (Addr.getOpcode() == ISD::TargetExternalSymbol || - Addr.getOpcode() == ISD::TargetGlobalAddress) - return false; // Direct calls. + SDNode *Result; + // Get the right SDVal for the opcode. + SDValue SDVal = CurDAG->getTargetConstant(bitpos, MVT::i32); - if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) { - Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i32); - Offset = CurDAG->getTargetConstant(0, MVT::i32); - return (IsU6_1_Offset(Offset.getNode())); + if (ValueVT == MVT::i32 || ValueVT == MVT::f32) { + Result = CurDAG->getMachineNode(BitOpc, dl, ValueVT, + N->getOperand(0), SDVal); + } else { + // 64-bit gymnastic to use REG_SEQUENCE. But it's worth it. + EVT SubValueVT; + if (ValueVT == MVT::i64) + SubValueVT = MVT::i32; + else + SubValueVT = MVT::f32; + + SDNode *Reg = N->getOperand(0).getNode(); + SDValue RegClass = CurDAG->getTargetConstant(Hexagon::DoubleRegsRegClassID, + MVT::i64); + + SDValue SubregHiIdx = CurDAG->getTargetConstant(Hexagon::subreg_hireg, + MVT::i32); + SDValue SubregLoIdx = CurDAG->getTargetConstant(Hexagon::subreg_loreg, + MVT::i32); + + SDValue SubregHI = CurDAG->getTargetExtractSubreg(Hexagon::subreg_hireg, dl, + MVT::i32, SDValue(Reg, 0)); + + SDValue SubregLO = CurDAG->getTargetExtractSubreg(Hexagon::subreg_loreg, dl, + MVT::i32, SDValue(Reg, 0)); + + // Clear/set/toggle hi or lo registers depending on the bit position. + if (SubValueVT != MVT::f32 && bitpos < 32) { + SDNode *Result0 = CurDAG->getMachineNode(BitOpc, dl, SubValueVT, + SubregLO, SDVal); + const SDValue Ops[] = { RegClass, SubregHI, SubregHiIdx, + SDValue(Result0, 0), SubregLoIdx }; + Result = CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, + dl, ValueVT, Ops); + } else { + if (Opc != ISD::FABS && Opc != ISD::FNEG) + SDVal = CurDAG->getTargetConstant(bitpos-32, MVT::i32); + SDNode *Result0 = CurDAG->getMachineNode(BitOpc, dl, SubValueVT, + SubregHI, SDVal); + const SDValue Ops[] = { RegClass, SDValue(Result0, 0), SubregHiIdx, + SubregLO, SubregLoIdx }; + Result = CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, + dl, ValueVT, Ops); + } } - Base = Addr; - Offset = CurDAG->getTargetConstant(0, MVT::i32); - return (IsU6_1_Offset(Offset.getNode())); + + ReplaceUses(N, Result); + return Result; } -bool HexagonDAGToDAGISel::SelectADDRriU6_2(SDValue& Addr, SDValue &Base, - SDValue &Offset) { - if (Addr.getOpcode() == ISD::TargetExternalSymbol || - Addr.getOpcode() == ISD::TargetGlobalAddress) - return false; // Direct calls. 
+SDNode *HexagonDAGToDAGISel::SelectFrameIndex(SDNode *N) { + int FX = cast<FrameIndexSDNode>(N)->getIndex(); + SDValue FI = CurDAG->getTargetFrameIndex(FX, MVT::i32); + SDValue Zero = CurDAG->getTargetConstant(0, MVT::i32); + SDLoc DL(N); - if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) { - Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i32); - Offset = CurDAG->getTargetConstant(0, MVT::i32); - return (IsU6_2_Offset(Offset.getNode())); - } - Base = Addr; - Offset = CurDAG->getTargetConstant(0, MVT::i32); - return (IsU6_2_Offset(Offset.getNode())); -} + SDNode *R = CurDAG->getMachineNode(Hexagon::TFR_FI, DL, MVT::i32, FI, Zero); + if (N->getHasDebugValue()) + CurDAG->TransferDbgValues(SDValue(N, 0), SDValue(R, 0)); + return R; +} -bool HexagonDAGToDAGISel::SelectMEMriS11_2(SDValue& Addr, SDValue &Base, - SDValue &Offset) { - if (Addr.getOpcode() != ISD::ADD) { - return(SelectADDRriS11_2(Addr, Base, Offset)); +SDNode *HexagonDAGToDAGISel::Select(SDNode *N) { + if (N->isMachineOpcode()) { + N->setNodeId(-1); + return nullptr; // Already selected. } - return SelectADDRriS11_2(Addr, Base, Offset); -} + switch (N->getOpcode()) { + case ISD::Constant: + return SelectConstant(N); + case ISD::ConstantFP: + return SelectConstantFP(N); -bool HexagonDAGToDAGISel::SelectADDRriS11_3(SDValue& Addr, SDValue &Base, - SDValue &Offset) { - if (Addr.getOpcode() == ISD::TargetExternalSymbol || - Addr.getOpcode() == ISD::TargetGlobalAddress) - return false; // Direct calls. + case ISD::FrameIndex: + return SelectFrameIndex(N); - if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) { - Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i32); - Offset = CurDAG->getTargetConstant(0, MVT::i32); - return (IsS11_3_Offset(Offset.getNode())); - } - Base = Addr; - Offset = CurDAG->getTargetConstant(0, MVT::i32); - return (IsS11_3_Offset(Offset.getNode())); -} + case ISD::ADD: + return SelectAdd(N); -bool HexagonDAGToDAGISel::SelectADDRrr(SDValue &Addr, SDValue &R1, - SDValue &R2) { - if (Addr.getOpcode() == ISD::FrameIndex) return false; - if (Addr.getOpcode() == ISD::TargetExternalSymbol || - Addr.getOpcode() == ISD::TargetGlobalAddress) - return false; // Direct calls. - - if (Addr.getOpcode() == ISD::ADD) { - if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1))) - if (isInt<13>(CN->getSExtValue())) - return false; // Let the reg+imm pattern catch this! - R1 = Addr.getOperand(0); - R2 = Addr.getOperand(1); - return true; - } + case ISD::SHL: + return SelectSHL(N); - R1 = Addr; + case ISD::LOAD: + return SelectLoad(N); - return true; -} + case ISD::STORE: + return SelectStore(N); + case ISD::MUL: + return SelectMul(N); -// Handle generic address case. It is accessed from inlined asm =m constraints, -// which could have any kind of pointer. -bool HexagonDAGToDAGISel::SelectAddr(SDNode *Op, SDValue Addr, - SDValue &Base, SDValue &Offset) { - if (Addr.getOpcode() == ISD::TargetExternalSymbol || - Addr.getOpcode() == ISD::TargetGlobalAddress) - return false; // Direct calls. 
+ case ISD::AND: + case ISD::OR: + case ISD::XOR: + case ISD::FABS: + case ISD::FNEG: + return SelectBitOp(N); - if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) { - Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i32); - Offset = CurDAG->getTargetConstant(0, MVT::i32); - return true; - } + case ISD::ZERO_EXTEND: + return SelectZeroExtend(N); - if (Addr.getOpcode() == ISD::ADD) { - Base = Addr.getOperand(0); - Offset = Addr.getOperand(1); - return true; + case ISD::INTRINSIC_W_CHAIN: + return SelectIntrinsicWChain(N); + + case ISD::INTRINSIC_WO_CHAIN: + return SelectIntrinsicWOChain(N); } - Base = Addr; - Offset = CurDAG->getTargetConstant(0, MVT::i32); - return true; + return SelectCode(N); } bool HexagonDAGToDAGISel:: -SelectInlineAsmMemoryOperand(const SDValue &Op, char ConstraintCode, +SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintID, std::vector<SDValue> &OutOps) { - SDValue Op0, Op1; - - switch (ConstraintCode) { - case 'o': // Offsetable. - case 'v': // Not offsetable. - default: return true; - case 'm': // Memory. - if (!SelectAddr(Op.getNode(), Op, Op0, Op1)) - return true; + SDValue Inp = Op, Res; + + switch (ConstraintID) { + default: + return true; + case InlineAsm::Constraint_i: + case InlineAsm::Constraint_o: // Offsetable. + case InlineAsm::Constraint_v: // Not offsetable. + case InlineAsm::Constraint_m: // Memory. + if (SelectAddrFI(Inp, Res)) + OutOps.push_back(Res); + else + OutOps.push_back(Inp); break; } - OutOps.push_back(Op0); - OutOps.push_back(Op1); + OutOps.push_back(CurDAG->getTargetConstant(0, MVT::i32)); return false; } -bool HexagonDAGToDAGISel::isConstExtProfitable(SDNode *N) const { - unsigned UseCount = 0; - for (SDNode::use_iterator I = N->use_begin(), E = N->use_end(); I != E; ++I) { - UseCount++; - } - - return (UseCount <= 1); - -} - -//===--------------------------------------------------------------------===// -// Return 'true' if use count of the global address is below threshold. -//===--------------------------------------------------------------------===// -bool HexagonDAGToDAGISel::hasNumUsesBelowThresGA(SDNode *N) const { - assert(N->getOpcode() == ISD::TargetGlobalAddress && - "Expecting a target global address"); - - // Always try to fold the address. - if (TM.getOptLevel() == CodeGenOpt::Aggressive) - return true; - - GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(N); - DenseMap<const GlobalValue *, unsigned>::const_iterator GI = - GlobalAddressUseCountMap.find(GA->getGlobal()); - - if (GI == GlobalAddressUseCountMap.end()) - return false; - - return GI->second <= MaxNumOfUsesForConstExtenders; -} - -//===--------------------------------------------------------------------===// -// Return true if the non-GP-relative global address can be folded. -//===--------------------------------------------------------------------===// -inline bool HexagonDAGToDAGISel::foldGlobalAddress(SDValue &N, SDValue &R) { - return foldGlobalAddressImpl(N, R, false); -} - -//===--------------------------------------------------------------------===// -// Return true if the GP-relative global address can be folded. 
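The PreprocessISelDAG added below rewrites (or (select c, x, 0), z); the identity it relies on is easy to check exhaustively:

  #include <cstdint>

  // (c ? x : 0) | z == c ? (x | z) : z, and symmetrically for (c ? 0 : y).
  uint32_t beforeRewrite(bool C, uint32_t X, uint32_t Z) { return (C ? X : 0u) | Z; }
  uint32_t afterRewrite(bool C, uint32_t X, uint32_t Z)  { return C ? (X | Z) : Z; }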
-//===--------------------------------------------------------------------===// -inline bool HexagonDAGToDAGISel::foldGlobalAddressGP(SDValue &N, SDValue &R) { - return foldGlobalAddressImpl(N, R, true); -} +void HexagonDAGToDAGISel::PreprocessISelDAG() { + SelectionDAG &DAG = *CurDAG; + std::vector<SDNode*> Nodes; + for (auto I = DAG.allnodes_begin(), E = DAG.allnodes_end(); I != E; ++I) + Nodes.push_back(I); + + // Simplify: (or (select c x 0) z) -> (select c (or x z) z) + // (or (select c 0 y) z) -> (select c z (or y z)) + // This may not be the right thing for all targets, so do it here. + for (auto I: Nodes) { + if (I->getOpcode() != ISD::OR) + continue; + + auto IsZero = [] (const SDValue &V) -> bool { + if (ConstantSDNode *SC = dyn_cast<ConstantSDNode>(V.getNode())) + return SC->isNullValue(); + return false; + }; + auto IsSelect0 = [IsZero] (const SDValue &Op) -> bool { + if (Op.getOpcode() != ISD::SELECT) + return false; + return IsZero(Op.getOperand(1)) || IsZero(Op.getOperand(2)); + }; -//===--------------------------------------------------------------------===// -// Fold offset of the global address if number of uses are below threshold. -//===--------------------------------------------------------------------===// -bool HexagonDAGToDAGISel::foldGlobalAddressImpl(SDValue &N, SDValue &R, - bool ShouldLookForGP) { - if (N.getOpcode() == ISD::ADD) { - SDValue N0 = N.getOperand(0); - SDValue N1 = N.getOperand(1); - if ((ShouldLookForGP && (N0.getOpcode() == HexagonISD::CONST32_GP)) || - (!ShouldLookForGP && (N0.getOpcode() == HexagonISD::CONST32))) { - ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N1); - GlobalAddressSDNode *GA = - dyn_cast<GlobalAddressSDNode>(N0.getOperand(0)); - - if (Const && GA && - (GA->getOpcode() == ISD::TargetGlobalAddress)) { - if ((N0.getOpcode() == HexagonISD::CONST32) && - !hasNumUsesBelowThresGA(GA)) - return false; - R = CurDAG->getTargetGlobalAddress(GA->getGlobal(), - SDLoc(Const), - N.getValueType(), - GA->getOffset() + - (uint64_t)Const->getSExtValue()); - return true; + SDValue N0 = I->getOperand(0), N1 = I->getOperand(1); + EVT VT = I->getValueType(0); + bool SelN0 = IsSelect0(N0); + SDValue SOp = SelN0 ? N0 : N1; + SDValue VOp = SelN0 ? 
N1 : N0; + + if (SOp.getOpcode() == ISD::SELECT && SOp.getNode()->hasOneUse()) { + SDValue SC = SOp.getOperand(0); + SDValue SX = SOp.getOperand(1); + SDValue SY = SOp.getOperand(2); + SDLoc DLS = SOp; + if (IsZero(SY)) { + SDValue NewOr = DAG.getNode(ISD::OR, DLS, VT, SX, VOp); + SDValue NewSel = DAG.getNode(ISD::SELECT, DLS, VT, SC, NewOr, VOp); + DAG.ReplaceAllUsesWith(I, NewSel.getNode()); + } else if (IsZero(SX)) { + SDValue NewOr = DAG.getNode(ISD::OR, DLS, VT, SY, VOp); + SDValue NewSel = DAG.getNode(ISD::SELECT, DLS, VT, SC, VOp, NewOr); + DAG.ReplaceAllUsesWith(I, NewSel.getNode()); } } } - return false; } + bool HexagonDAGToDAGISel::SelectAddrFI(SDValue& N, SDValue &R) { if (N.getOpcode() != ISD::FrameIndex) return false; @@ -1681,8 +1410,8 @@ bool HexagonDAGToDAGISel::SelectGlobalAddress(SDValue &N, SDValue &R, return false; } -bool HexagonDAGToDAGISel::isValueExtension(SDValue const &Val, - unsigned FromBits, SDValue &Src) { +bool HexagonDAGToDAGISel::isValueExtension(const SDValue &Val, + unsigned FromBits, SDValue &Src) { unsigned Opc = Val.getOpcode(); switch (Opc) { case ISD::SIGN_EXTEND: diff --git a/lib/Target/Hexagon/HexagonISelLowering.cpp b/lib/Target/Hexagon/HexagonISelLowering.cpp index 0072994..a2209ab 100644 --- a/lib/Target/Hexagon/HexagonISelLowering.cpp +++ b/lib/Target/Hexagon/HexagonISelLowering.cpp @@ -164,6 +164,12 @@ CC_Hexagon (unsigned ValNo, MVT ValVT, LocInfo = CCValAssign::ZExt; else LocInfo = CCValAssign::AExt; + } else if (LocVT == MVT::v4i8 || LocVT == MVT::v2i16) { + LocVT = MVT::i32; + LocInfo = CCValAssign::BCvt; + } else if (LocVT == MVT::v8i8 || LocVT == MVT::v4i16 || LocVT == MVT::v2i32) { + LocVT = MVT::i64; + LocInfo = CCValAssign::BCvt; } if (LocVT == MVT::i32 || LocVT == MVT::f32) { @@ -239,6 +245,12 @@ static bool RetCC_Hexagon(unsigned ValNo, MVT ValVT, LocInfo = CCValAssign::ZExt; else LocInfo = CCValAssign::AExt; + } else if (LocVT == MVT::v4i8 || LocVT == MVT::v2i16) { + LocVT = MVT::i32; + LocInfo = CCValAssign::BCvt; + } else if (LocVT == MVT::v8i8 || LocVT == MVT::v4i16 || LocVT == MVT::v2i32) { + LocVT = MVT::i64; + LocInfo = CCValAssign::BCvt; } if (LocVT == MVT::i32 || LocVT == MVT::f32) { @@ -764,7 +776,7 @@ LowerBR_JT(SDValue Op, SelectionDAG &DAG) const BlockAddress::get(const_cast<BasicBlock *>(MBB->getBasicBlock())); } - SDValue JumpTableBase = DAG.getNode(HexagonISD::WrapperJT, dl, + SDValue JumpTableBase = DAG.getNode(HexagonISD::JT, dl, getPointerTy(), TargetJT); SDValue ShiftIndex = DAG.getNode(ISD::SHL, dl, MVT::i32, Index, DAG.getConstant(2, MVT::i32)); @@ -944,6 +956,192 @@ HexagonTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { false, 0); } +// Creates a SPLAT instruction for a constant value VAL. +static SDValue createSplat(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue Val) { + if (VT.getSimpleVT() == MVT::v4i8) + return DAG.getNode(HexagonISD::VSPLATB, dl, VT, Val); + + if (VT.getSimpleVT() == MVT::v4i16) + return DAG.getNode(HexagonISD::VSPLATH, dl, VT, Val); + + return SDValue(); +} + +static bool isSExtFree(SDValue N) { + // A sign-extend of a truncate of a sign-extend is free. + if (N.getOpcode() == ISD::TRUNCATE && + N.getOperand(0).getOpcode() == ISD::AssertSext) + return true; + // We have sign-extended loads. 
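This feeds the LowerSETCC logic that follows: for eq/ne compares of i8/i16 operands against a negative constant, zero-extending changes the answer while sign-extending preserves it. Concretely:

  #include <cstdint>

  // With an i8 operand, (x == -1) must see 0xFF as -1.
  bool eqViaSext(int8_t X) { return static_cast<int32_t>(X) == -1; }  // correct
  bool eqViaZext(int8_t X) { return int32_t(uint8_t(X)) == -1; }      // never true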
+ if (N.getOpcode() == ISD::LOAD) + return true; + return false; +} + +SDValue HexagonTargetLowering::LowerCTPOP(SDValue Op, SelectionDAG &DAG) const { + SDLoc dl(Op); + SDValue InpVal = Op.getOperand(0); + if (isa<ConstantSDNode>(InpVal)) { + uint64_t V = cast<ConstantSDNode>(InpVal)->getZExtValue(); + return DAG.getTargetConstant(countPopulation(V), MVT::i64); + } + SDValue PopOut = DAG.getNode(HexagonISD::POPCOUNT, dl, MVT::i32, InpVal); + return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, PopOut); +} + +SDValue HexagonTargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { + SDLoc dl(Op); + + SDValue LHS = Op.getOperand(0); + SDValue RHS = Op.getOperand(1); + SDValue Cmp = Op.getOperand(2); + ISD::CondCode CC = cast<CondCodeSDNode>(Cmp)->get(); + + EVT VT = Op.getValueType(); + EVT LHSVT = LHS.getValueType(); + EVT RHSVT = RHS.getValueType(); + + if (LHSVT == MVT::v2i16) { + assert(ISD::isSignedIntSetCC(CC) || ISD::isUnsignedIntSetCC(CC)); + unsigned ExtOpc = ISD::isSignedIntSetCC(CC) ? ISD::SIGN_EXTEND + : ISD::ZERO_EXTEND; + SDValue LX = DAG.getNode(ExtOpc, dl, MVT::v2i32, LHS); + SDValue RX = DAG.getNode(ExtOpc, dl, MVT::v2i32, RHS); + SDValue SC = DAG.getNode(ISD::SETCC, dl, MVT::v2i1, LX, RX, Cmp); + return SC; + } + + // Treat all other vector types as legal. + if (VT.isVector()) + return Op; + + // Equals and not equals should use sign-extend, not zero-extend, since + // we can represent small negative values in the compare instructions. + // The LLVM default is to use zero-extend arbitrarily in these cases. + if ((CC == ISD::SETEQ || CC == ISD::SETNE) && + (RHSVT == MVT::i8 || RHSVT == MVT::i16) && + (LHSVT == MVT::i8 || LHSVT == MVT::i16)) { + ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS); + if (C && C->getAPIntValue().isNegative()) { + LHS = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i32, LHS); + RHS = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i32, RHS); + return DAG.getNode(ISD::SETCC, dl, Op.getValueType(), + LHS, RHS, Op.getOperand(2)); + } + if (isSExtFree(LHS) || isSExtFree(RHS)) { + LHS = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i32, LHS); + RHS = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i32, RHS); + return DAG.getNode(ISD::SETCC, dl, Op.getValueType(), + LHS, RHS, Op.getOperand(2)); + } + } + return SDValue(); +} + +SDValue HexagonTargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) + const { + SDValue PredOp = Op.getOperand(0); + SDValue Op1 = Op.getOperand(1), Op2 = Op.getOperand(2); + EVT OpVT = Op1.getValueType(); + SDLoc DL(Op); + + if (OpVT == MVT::v2i16) { + SDValue X1 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v2i32, Op1); + SDValue X2 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v2i32, Op2); + SDValue SL = DAG.getNode(ISD::VSELECT, DL, MVT::v2i32, PredOp, X1, X2); + SDValue TR = DAG.getNode(ISD::TRUNCATE, DL, MVT::v2i16, SL); + return TR; + } + + return SDValue(); +} + +// Handle only specific vector loads. +SDValue HexagonTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { + EVT VT = Op.getValueType(); + SDLoc DL(Op); + LoadSDNode *LoadNode = cast<LoadSDNode>(Op); + SDValue Chain = LoadNode->getChain(); + SDValue Ptr = Op.getOperand(1); + SDValue LoweredLoad; + SDValue Result; + SDValue Base = LoadNode->getBasePtr(); + ISD::LoadExtType Ext = LoadNode->getExtensionType(); + unsigned Alignment = LoadNode->getAlignment(); + SDValue LoadChain; + + if(Ext == ISD::NON_EXTLOAD) + Ext = ISD::ZEXTLOAD; + + if (VT == MVT::v4i16) { + if (Alignment == 2) { + SDValue Loads[4]; + // Base load. 
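The four halfword loads assembled below pack into the i64 result as follows (value-level sketch, little-endian element order; we read HexagonISD::COMBINE(Hi, Lo) as placing its first operand in the upper word, which is what the operand order in the code implies):

  #include <cstdint>

  uint64_t packV4i16(uint16_t H0, uint16_t H1, uint16_t H2, uint16_t H3) {
    uint32_t Lo = (uint32_t(H1) << 16) | H0;  // loads at base, base+2
    uint32_t Hi = (uint32_t(H3) << 16) | H2;  // loads at base+4, base+6
    return (uint64_t(Hi) << 32) | Lo;         // COMBINE(Hi, Lo)
  }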
+ Loads[0] = DAG.getExtLoad(Ext, DL, MVT::i32, Chain, Base, + LoadNode->getPointerInfo(), MVT::i16, + LoadNode->isVolatile(), + LoadNode->isNonTemporal(), + LoadNode->isInvariant(), + Alignment); + // Base+2 load. + SDValue Increment = DAG.getConstant(2, MVT::i32); + Ptr = DAG.getNode(ISD::ADD, DL, Base.getValueType(), Base, Increment); + Loads[1] = DAG.getExtLoad(Ext, DL, MVT::i32, Chain, Ptr, + LoadNode->getPointerInfo(), MVT::i16, + LoadNode->isVolatile(), + LoadNode->isNonTemporal(), + LoadNode->isInvariant(), + Alignment); + // SHL 16, then OR base and base+2. + SDValue ShiftAmount = DAG.getConstant(16, MVT::i32); + SDValue Tmp1 = DAG.getNode(ISD::SHL, DL, MVT::i32, Loads[1], ShiftAmount); + SDValue Tmp2 = DAG.getNode(ISD::OR, DL, MVT::i32, Tmp1, Loads[0]); + // Base + 4. + Increment = DAG.getConstant(4, MVT::i32); + Ptr = DAG.getNode(ISD::ADD, DL, Base.getValueType(), Base, Increment); + Loads[2] = DAG.getExtLoad(Ext, DL, MVT::i32, Chain, Ptr, + LoadNode->getPointerInfo(), MVT::i16, + LoadNode->isVolatile(), + LoadNode->isNonTemporal(), + LoadNode->isInvariant(), + Alignment); + // Base + 6. + Increment = DAG.getConstant(6, MVT::i32); + Ptr = DAG.getNode(ISD::ADD, DL, Base.getValueType(), Base, Increment); + Loads[3] = DAG.getExtLoad(Ext, DL, MVT::i32, Chain, Ptr, + LoadNode->getPointerInfo(), MVT::i16, + LoadNode->isVolatile(), + LoadNode->isNonTemporal(), + LoadNode->isInvariant(), + Alignment); + // SHL 16, then OR base+4 and base+6. + Tmp1 = DAG.getNode(ISD::SHL, DL, MVT::i32, Loads[3], ShiftAmount); + SDValue Tmp4 = DAG.getNode(ISD::OR, DL, MVT::i32, Tmp1, Loads[2]); + // Combine to i64. This could be optimised out later if we can + // affect reg allocation of this code. + Result = DAG.getNode(HexagonISD::COMBINE, DL, MVT::i64, Tmp4, Tmp2); + LoadChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, + Loads[0].getValue(1), Loads[1].getValue(1), + Loads[2].getValue(1), Loads[3].getValue(1)); + } else { + // Perform default type expansion. + Result = DAG.getLoad(MVT::i64, DL, Chain, Ptr, LoadNode->getPointerInfo(), + LoadNode->isVolatile(), LoadNode->isNonTemporal(), + LoadNode->isInvariant(), LoadNode->getAlignment()); + LoadChain = Result.getValue(1); + } + } else + llvm_unreachable("Custom lowering unsupported load"); + + Result = DAG.getNode(ISD::BITCAST, DL, VT, Result); + // Since we pretend to lower a load, we need the original chain + // info attached to the result. + SDValue Ops[] = { Result, LoadChain }; + + return DAG.getMergeValues(Ops, DL); +} + + SDValue HexagonTargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const { EVT ValTy = Op.getValueType(); @@ -1028,6 +1226,19 @@ SDValue HexagonTargetLowering::LowerGLOBALADDRESS(SDValue Op, return DAG.getNode(HexagonISD::CONST32, dl, getPointerTy(), Result); } +// Specifies that for loads and stores VT can be promoted to PromotedLdStVT. 
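+// Recording both the Promote action and the promoted type lets the legalizer
+// rewrite, for example, a v4i8 load as an i32 load followed by a bitcast.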
+void HexagonTargetLowering::promoteLdStType(EVT VT, EVT PromotedLdStVT) {
+  if (VT != PromotedLdStVT) {
+    setOperationAction(ISD::LOAD, VT.getSimpleVT(), Promote);
+    AddPromotedToType(ISD::LOAD, VT.getSimpleVT(),
+                      PromotedLdStVT.getSimpleVT());
+
+    setOperationAction(ISD::STORE, VT.getSimpleVT(), Promote);
+    AddPromotedToType(ISD::STORE, VT.getSimpleVT(),
+                      PromotedLdStVT.getSimpleVT());
+  }
+}
+
 SDValue
 HexagonTargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
   const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
@@ -1045,14 +1256,105 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM,
   : TargetLowering(TM), Subtarget(&STI) {
 
   // Set up the register classes.
+  addRegisterClass(MVT::v2i1, &Hexagon::PredRegsRegClass);  // bbbbaaaa
+  addRegisterClass(MVT::v4i1, &Hexagon::PredRegsRegClass);  // ddccbbaa
+  addRegisterClass(MVT::v8i1, &Hexagon::PredRegsRegClass);  // hgfedcba
   addRegisterClass(MVT::i32, &Hexagon::IntRegsRegClass);
-  addRegisterClass(MVT::i64, &Hexagon::DoubleRegsRegClass);
+  addRegisterClass(MVT::v4i8, &Hexagon::IntRegsRegClass);
+  addRegisterClass(MVT::v2i16, &Hexagon::IntRegsRegClass);
+  promoteLdStType(MVT::v4i8, MVT::i32);
+  promoteLdStType(MVT::v2i16, MVT::i32);
 
   if (Subtarget->hasV5TOps()) {
     addRegisterClass(MVT::f32, &Hexagon::IntRegsRegClass);
     addRegisterClass(MVT::f64, &Hexagon::DoubleRegsRegClass);
   }
+  addRegisterClass(MVT::i64, &Hexagon::DoubleRegsRegClass);
+  addRegisterClass(MVT::v8i8, &Hexagon::DoubleRegsRegClass);
+  addRegisterClass(MVT::v4i16, &Hexagon::DoubleRegsRegClass);
+  addRegisterClass(MVT::v2i32, &Hexagon::DoubleRegsRegClass);
+  promoteLdStType(MVT::v8i8, MVT::i64);
+
+  // Custom-lower only the v4i16 load; let the v4i16 store be
+  // promoted for now.
+  setOperationAction(ISD::LOAD, MVT::v4i16, Custom);
+  AddPromotedToType(ISD::LOAD, MVT::v4i16, MVT::i64);
+  setOperationAction(ISD::STORE, MVT::v4i16, Promote);
+  AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::i64);
+  promoteLdStType(MVT::v2i32, MVT::i64);
+
+  for (unsigned i = (unsigned) MVT::FIRST_VECTOR_VALUETYPE;
+       i <= (unsigned) MVT::LAST_VECTOR_VALUETYPE; ++i) {
+    MVT::SimpleValueType VT = (MVT::SimpleValueType) i;
+
+    // Hexagon does not have support for the following operations,
+    // so they need to be expanded.
+    setOperationAction(ISD::SELECT, VT, Expand);
+    setOperationAction(ISD::SDIV, VT, Expand);
+    setOperationAction(ISD::SREM, VT, Expand);
+    setOperationAction(ISD::UDIV, VT, Expand);
+    setOperationAction(ISD::UREM, VT, Expand);
+    setOperationAction(ISD::ROTL, VT, Expand);
+    setOperationAction(ISD::ROTR, VT, Expand);
+    setOperationAction(ISD::FDIV, VT, Expand);
+    setOperationAction(ISD::FNEG, VT, Expand);
+    setOperationAction(ISD::UMUL_LOHI, VT, Expand);
+    setOperationAction(ISD::SMUL_LOHI, VT, Expand);
+    setOperationAction(ISD::UDIVREM, VT, Expand);
+    setOperationAction(ISD::SDIVREM, VT, Expand);
+    setOperationAction(ISD::FPOW, VT, Expand);
+    setOperationAction(ISD::CTPOP, VT, Expand);
+    setOperationAction(ISD::CTLZ, VT, Expand);
+    setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Expand);
+    setOperationAction(ISD::CTTZ, VT, Expand);
+    setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Expand);
+
+    // Expand all any-extend loads.
+    for (unsigned j = (unsigned) MVT::FIRST_VECTOR_VALUETYPE;
+         j <= (unsigned) MVT::LAST_VECTOR_VALUETYPE; ++j)
+      setLoadExtAction(ISD::EXTLOAD, (MVT::SimpleValueType) j, VT, Expand);
+
+    // Expand all trunc stores.
+ for (unsigned TargetVT = (unsigned) MVT::FIRST_VECTOR_VALUETYPE; + TargetVT <= (unsigned) MVT::LAST_VECTOR_VALUETYPE; ++TargetVT) + setTruncStoreAction(VT, (MVT::SimpleValueType) TargetVT, Expand); + + setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand); + setOperationAction(ISD::ConstantPool, VT, Expand); + setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Expand); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Expand); + setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand); + setOperationAction(ISD::BUILD_VECTOR, VT, Expand); + setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Expand); + setOperationAction(ISD::INSERT_SUBVECTOR, VT, Expand); + setOperationAction(ISD::CONCAT_VECTORS, VT, Expand); + setOperationAction(ISD::SRA, VT, Custom); + setOperationAction(ISD::SHL, VT, Custom); + setOperationAction(ISD::SRL, VT, Custom); + + if (!isTypeLegal(VT)) + continue; + + setOperationAction(ISD::ADD, VT, Legal); + setOperationAction(ISD::SUB, VT, Legal); + setOperationAction(ISD::MUL, VT, Legal); + + setOperationAction(ISD::BUILD_VECTOR, VT, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); + setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom); + setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom); + setOperationAction(ISD::CONCAT_VECTORS, VT, Custom); + } + + setOperationAction(ISD::SETCC, MVT::v2i16, Custom); + setOperationAction(ISD::VSELECT, MVT::v2i16, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i8, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i16, Custom); + + setOperationAction(ISD::ConstantPool, MVT::i32, Custom); + addRegisterClass(MVT::i1, &Hexagon::PredRegsRegClass); computeRegisterProperties(Subtarget->getRegisterInfo()); @@ -1308,9 +1610,14 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM, // Turn FP extload into load/fextend. for (MVT VT : MVT::fp_valuetypes()) setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand); - // Hexagon has a i1 sign extending load. - for (MVT VT : MVT::integer_valuetypes()) - setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Expand); + + // No extending loads from i32. + for (MVT VT : MVT::integer_valuetypes()) { + setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i32, Expand); + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i32, Expand); + setLoadExtAction(ISD::EXTLOAD, VT, MVT::i32, Expand); + } + // Turn FP truncstore into trunc + store. setTruncStoreAction(MVT::f64, MVT::f32, Expand); @@ -1358,6 +1665,10 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM, setOperationAction(ISD::SELECT, MVT::f64, Expand); } + // Hexagon needs to optimize cases with negative constants. 
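+  // For example, (setcc i8 %x, -1, seteq) would by default zero-extend the
+  // operands, turning -1 into 255; LowerSETCC sign-extends both sides to i32
+  // instead, so the comparison sees the intended values.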
+ setOperationAction(ISD::SETCC, MVT::i16, Custom); + setOperationAction(ISD::SETCC, MVT::i8, Custom); + if (EmitJumpTables) { setOperationAction(ISD::BR_JT, MVT::Other, Custom); } else { @@ -1415,9 +1726,17 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM, setOperationAction(ISD::CTLZ, MVT::i64, Expand); setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Expand); setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Expand); + setOperationAction(ISD::ROTL, MVT::i32, Expand); setOperationAction(ISD::ROTR, MVT::i32, Expand); setOperationAction(ISD::BSWAP, MVT::i32, Expand); + setOperationAction(ISD::ROTL, MVT::i64, Expand); + setOperationAction(ISD::ROTR, MVT::i64, Expand); + setOperationAction(ISD::SHL_PARTS, MVT::i64, Expand); + setOperationAction(ISD::SRA_PARTS, MVT::i64, Expand); + setOperationAction(ISD::SRL_PARTS, MVT::i64, Expand); + setOperationAction(ISD::BR_CC, MVT::i64, Expand); + setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand); setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand); setOperationAction(ISD::FPOW, MVT::f64, Expand); @@ -1429,7 +1748,7 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM, setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand); setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand); - + setOperationAction(ISD::MULHS, MVT::i64, Expand); setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand); setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand); @@ -1463,27 +1782,63 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM, const char* HexagonTargetLowering::getTargetNodeName(unsigned Opcode) const { switch (Opcode) { - default: return nullptr; - case HexagonISD::CONST32: return "HexagonISD::CONST32"; - case HexagonISD::CONST32_GP: return "HexagonISD::CONST32_GP"; - case HexagonISD::CONST32_Int_Real: return "HexagonISD::CONST32_Int_Real"; - case HexagonISD::ADJDYNALLOC: return "HexagonISD::ADJDYNALLOC"; - case HexagonISD::CMPICC: return "HexagonISD::CMPICC"; - case HexagonISD::CMPFCC: return "HexagonISD::CMPFCC"; - case HexagonISD::BRICC: return "HexagonISD::BRICC"; - case HexagonISD::BRFCC: return "HexagonISD::BRFCC"; - case HexagonISD::SELECT_ICC: return "HexagonISD::SELECT_ICC"; - case HexagonISD::SELECT_FCC: return "HexagonISD::SELECT_FCC"; - case HexagonISD::Hi: return "HexagonISD::Hi"; - case HexagonISD::Lo: return "HexagonISD::Lo"; - case HexagonISD::FTOI: return "HexagonISD::FTOI"; - case HexagonISD::ITOF: return "HexagonISD::ITOF"; - case HexagonISD::CALLv3: return "HexagonISD::CALLv3"; - case HexagonISD::CALLv3nr: return "HexagonISD::CALLv3nr"; - case HexagonISD::CALLR: return "HexagonISD::CALLR"; - case HexagonISD::RET_FLAG: return "HexagonISD::RET_FLAG"; - case HexagonISD::BR_JT: return "HexagonISD::BR_JT"; - case HexagonISD::TC_RETURN: return "HexagonISD::TC_RETURN"; + default: return nullptr; + case HexagonISD::CONST32: return "HexagonISD::CONST32"; + case HexagonISD::CONST32_GP: return "HexagonISD::CONST32_GP"; + case HexagonISD::CONST32_Int_Real: return "HexagonISD::CONST32_Int_Real"; + case HexagonISD::ADJDYNALLOC: return "HexagonISD::ADJDYNALLOC"; + case HexagonISD::CMPICC: return "HexagonISD::CMPICC"; + case HexagonISD::CMPFCC: return "HexagonISD::CMPFCC"; + case HexagonISD::BRICC: return "HexagonISD::BRICC"; + case HexagonISD::BRFCC: return "HexagonISD::BRFCC"; + case HexagonISD::SELECT_ICC: return "HexagonISD::SELECT_ICC"; + case HexagonISD::SELECT_FCC: return "HexagonISD::SELECT_FCC"; + case HexagonISD::Hi: return "HexagonISD::Hi"; + case HexagonISD::Lo: return "HexagonISD::Lo"; + 
case HexagonISD::JT:          return "HexagonISD::JT";
+    case HexagonISD::CP:          return "HexagonISD::CP";
+    case HexagonISD::POPCOUNT:    return "HexagonISD::POPCOUNT";
+    case HexagonISD::COMBINE:     return "HexagonISD::COMBINE";
+    case HexagonISD::PACKHL:      return "HexagonISD::PACKHL";
+    case HexagonISD::VSPLATB:     return "HexagonISD::VSPLATB";
+    case HexagonISD::VSPLATH:     return "HexagonISD::VSPLATH";
+    case HexagonISD::SHUFFEB:     return "HexagonISD::SHUFFEB";
+    case HexagonISD::SHUFFEH:     return "HexagonISD::SHUFFEH";
+    case HexagonISD::SHUFFOB:     return "HexagonISD::SHUFFOB";
+    case HexagonISD::SHUFFOH:     return "HexagonISD::SHUFFOH";
+    case HexagonISD::VSXTBH:      return "HexagonISD::VSXTBH";
+    case HexagonISD::VSXTBW:      return "HexagonISD::VSXTBW";
+    case HexagonISD::VSRAW:       return "HexagonISD::VSRAW";
+    case HexagonISD::VSRAH:       return "HexagonISD::VSRAH";
+    case HexagonISD::VSRLW:       return "HexagonISD::VSRLW";
+    case HexagonISD::VSRLH:       return "HexagonISD::VSRLH";
+    case HexagonISD::VSHLW:       return "HexagonISD::VSHLW";
+    case HexagonISD::VSHLH:       return "HexagonISD::VSHLH";
+    case HexagonISD::VCMPBEQ:     return "HexagonISD::VCMPBEQ";
+    case HexagonISD::VCMPBGT:     return "HexagonISD::VCMPBGT";
+    case HexagonISD::VCMPBGTU:    return "HexagonISD::VCMPBGTU";
+    case HexagonISD::VCMPHEQ:     return "HexagonISD::VCMPHEQ";
+    case HexagonISD::VCMPHGT:     return "HexagonISD::VCMPHGT";
+    case HexagonISD::VCMPHGTU:    return "HexagonISD::VCMPHGTU";
+    case HexagonISD::VCMPWEQ:     return "HexagonISD::VCMPWEQ";
+    case HexagonISD::VCMPWGT:     return "HexagonISD::VCMPWGT";
+    case HexagonISD::VCMPWGTU:    return "HexagonISD::VCMPWGTU";
+    case HexagonISD::INSERT_ri:   return "HexagonISD::INSERT_ri";
+    case HexagonISD::INSERT_rd:   return "HexagonISD::INSERT_rd";
+    case HexagonISD::INSERT_riv:  return "HexagonISD::INSERT_riv";
+    case HexagonISD::INSERT_rdv:  return "HexagonISD::INSERT_rdv";
+    case HexagonISD::EXTRACTU_ri: return "HexagonISD::EXTRACTU_ri";
+    case HexagonISD::EXTRACTU_rd: return "HexagonISD::EXTRACTU_rd";
+    case HexagonISD::EXTRACTU_riv: return "HexagonISD::EXTRACTU_riv";
+    case HexagonISD::EXTRACTU_rdv: return "HexagonISD::EXTRACTU_rdv";
+    case HexagonISD::FTOI:        return "HexagonISD::FTOI";
+    case HexagonISD::ITOF:        return "HexagonISD::ITOF";
+    case HexagonISD::CALLv3:      return "HexagonISD::CALLv3";
+    case HexagonISD::CALLv3nr:    return "HexagonISD::CALLv3nr";
+    case HexagonISD::CALLR:       return "HexagonISD::CALLR";
+    case HexagonISD::RET_FLAG:    return "HexagonISD::RET_FLAG";
+    case HexagonISD::BR_JT:       return "HexagonISD::BR_JT";
+    case HexagonISD::TC_RETURN:   return "HexagonISD::TC_RETURN";
   case HexagonISD::EH_RETURN: return "HexagonISD::EH_RETURN";
   }
 }
@@ -1505,6 +1860,474 @@ bool HexagonTargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
   return ((VT1.getSimpleVT() == MVT::i64) && (VT2.getSimpleVT() == MVT::i32));
 }
 
+// shouldExpandBuildVectorWithShuffles - Should we expand this build vector
+// with shuffles?
+bool
+HexagonTargetLowering::shouldExpandBuildVectorWithShuffles(EVT VT,
+                                  unsigned DefinedValues) const {
+
+  // Hexagon vector shuffles operate on element sizes of bytes or halfwords.
+  EVT EltVT = VT.getVectorElementType();
+  int EltBits = EltVT.getSizeInBits();
+  if ((EltBits != 8) && (EltBits != 16))
+    return false;
+
+  return TargetLowering::shouldExpandBuildVectorWithShuffles(VT, DefinedValues);
+}
+
+// LowerVECTOR_SHUFFLE - Lower a vector shuffle (V1, V2, V3). V1 and
+// V2 are the two vectors to select data from, V3 is the permutation.
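+// For example, a v4i8 or v4i16 splat shuffle whose source is a
+// SCALAR_TO_VECTOR is emitted as a single VSPLATB/VSPLATH of that scalar.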
+static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
+  const ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op);
+  SDValue V1 = Op.getOperand(0);
+  SDValue V2 = Op.getOperand(1);
+  SDLoc dl(Op);
+  EVT VT = Op.getValueType();
+
+  if (V2.getOpcode() == ISD::UNDEF)
+    V2 = V1;
+
+  if (SVN->isSplat()) {
+    int Lane = SVN->getSplatIndex();
+    if (Lane == -1) Lane = 0;
+
+    // Test if V1 is a SCALAR_TO_VECTOR.
+    if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR)
+      return createSplat(DAG, dl, VT, V1.getOperand(0));
+
+    // Test if V1 is a BUILD_VECTOR which is equivalent to a SCALAR_TO_VECTOR
+    // (and probably will turn into a SCALAR_TO_VECTOR once legalization
+    // reaches it).
+    if (Lane == 0 && V1.getOpcode() == ISD::BUILD_VECTOR &&
+        !isa<ConstantSDNode>(V1.getOperand(0))) {
+      bool IsScalarToVector = true;
+      for (unsigned i = 1, e = V1.getNumOperands(); i != e; ++i)
+        if (V1.getOperand(i).getOpcode() != ISD::UNDEF) {
+          IsScalarToVector = false;
+          break;
+        }
+      if (IsScalarToVector)
+        return createSplat(DAG, dl, VT, V1.getOperand(0));
+    }
+    return createSplat(DAG, dl, VT, DAG.getConstant(Lane, MVT::i32));
+  }
+
+  // FIXME: We need to support more general vector shuffles; for now, let
+  // expand handle them. If a shuffle is not directly supported and has four
+  // elements, a PerfectShuffle-generated table (see the general-case handling
+  // in the ARM backend) could synthesize it from other shuffles.
+  return SDValue();
+}
+
+// Report true if every operand of the BUILD_VECTOR repeats the same base
+// element.
+static bool isCommonSplatElement(BuildVectorSDNode *BVN) {
+  unsigned NElts = BVN->getNumOperands();
+  SDValue V0 = BVN->getOperand(0);
+
+  for (unsigned i = 1, e = NElts; i != e; ++i) {
+    if (BVN->getOperand(i) != V0)
+      return false;
+  }
+  return true;
+}
+
+// LowerVECTOR_SHIFT - Lower a vector shift. Try to convert
+// <VT> = SHL/SRA/SRL <VT> by <VT> to the Hexagon-specific
+// <VT> = SHL/SRA/SRL <VT> by <IT/i32>.
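+// For example, (shl v4i16:$v, (build_vector 4, 4, 4, 4)) becomes
+// (VSHLH $v, 4), with the splatted shift amount as one scalar operand.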
+static SDValue LowerVECTOR_SHIFT(SDValue Op, SelectionDAG &DAG) { + BuildVectorSDNode *BVN = 0; + SDValue V1 = Op.getOperand(0); + SDValue V2 = Op.getOperand(1); + SDValue V3; + SDLoc dl(Op); + EVT VT = Op.getValueType(); + + if ((BVN = dyn_cast<BuildVectorSDNode>(V1.getNode())) && + isCommonSplatElement(BVN)) + V3 = V2; + else if ((BVN = dyn_cast<BuildVectorSDNode>(V2.getNode())) && + isCommonSplatElement(BVN)) + V3 = V1; + else + return SDValue(); + + SDValue CommonSplat = BVN->getOperand(0); + SDValue Result; + + if (VT.getSimpleVT() == MVT::v4i16) { + switch (Op.getOpcode()) { + case ISD::SRA: + Result = DAG.getNode(HexagonISD::VSRAH, dl, VT, V3, CommonSplat); + break; + case ISD::SHL: + Result = DAG.getNode(HexagonISD::VSHLH, dl, VT, V3, CommonSplat); + break; + case ISD::SRL: + Result = DAG.getNode(HexagonISD::VSRLH, dl, VT, V3, CommonSplat); + break; + default: + return SDValue(); + } + } else if (VT.getSimpleVT() == MVT::v2i32) { + switch (Op.getOpcode()) { + case ISD::SRA: + Result = DAG.getNode(HexagonISD::VSRAW, dl, VT, V3, CommonSplat); + break; + case ISD::SHL: + Result = DAG.getNode(HexagonISD::VSHLW, dl, VT, V3, CommonSplat); + break; + case ISD::SRL: + Result = DAG.getNode(HexagonISD::VSRLW, dl, VT, V3, CommonSplat); + break; + default: + return SDValue(); + } + } else { + return SDValue(); + } + + return DAG.getNode(ISD::BITCAST, dl, VT, Result); +} + +SDValue +HexagonTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { + BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode()); + SDLoc dl(Op); + EVT VT = Op.getValueType(); + + unsigned Size = VT.getSizeInBits(); + + // A vector larger than 64 bits cannot be represented in Hexagon. + // Expand will split the vector. + if (Size > 64) + return SDValue(); + + APInt APSplatBits, APSplatUndef; + unsigned SplatBitSize; + bool HasAnyUndefs; + unsigned NElts = BVN->getNumOperands(); + + // Try to generate a SPLAT instruction. + if ((VT.getSimpleVT() == MVT::v4i8 || VT.getSimpleVT() == MVT::v4i16) && + (BVN->isConstantSplat(APSplatBits, APSplatUndef, SplatBitSize, + HasAnyUndefs, 0, true) && SplatBitSize <= 16)) { + unsigned SplatBits = APSplatBits.getZExtValue(); + int32_t SextVal = ((int32_t) (SplatBits << (32 - SplatBitSize)) >> + (32 - SplatBitSize)); + return createSplat(DAG, dl, VT, DAG.getConstant(SextVal, MVT::i32)); + } + + // Try to generate COMBINE to build v2i32 vectors. + if (VT.getSimpleVT() == MVT::v2i32) { + SDValue V0 = BVN->getOperand(0); + SDValue V1 = BVN->getOperand(1); + + if (V0.getOpcode() == ISD::UNDEF) + V0 = DAG.getConstant(0, MVT::i32); + if (V1.getOpcode() == ISD::UNDEF) + V1 = DAG.getConstant(0, MVT::i32); + + ConstantSDNode *C0 = dyn_cast<ConstantSDNode>(V0); + ConstantSDNode *C1 = dyn_cast<ConstantSDNode>(V1); + // If the element isn't a constant, it is in a register: + // generate a COMBINE Register Register instruction. + if (!C0 || !C1) + return DAG.getNode(HexagonISD::COMBINE, dl, VT, V1, V0); + + // If one of the operands is an 8 bit integer constant, generate + // a COMBINE Immediate Immediate instruction. + if (isInt<8>(C0->getSExtValue()) || + isInt<8>(C1->getSExtValue())) + return DAG.getNode(HexagonISD::COMBINE, dl, VT, V1, V0); + } + + // Try to generate a S2_packhl to build v2i16 vectors. 
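+  // (S2_packhl packs the low halfwords of its two source registers into the
+  // low word of a 64-bit pair, so the v2i16 result is just the pair's low
+  // subregister.)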
+  if (VT.getSimpleVT() == MVT::v2i16) {
+    for (unsigned i = 0, e = NElts; i != e; ++i) {
+      if (BVN->getOperand(i).getOpcode() == ISD::UNDEF)
+        continue;
+      ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(BVN->getOperand(i));
+      // If the element isn't a constant, it is in a register:
+      // generate an S2_packhl instruction.
+      if (!Cst) {
+        SDValue Pack = DAG.getNode(HexagonISD::PACKHL, dl, MVT::v4i16,
+                                   BVN->getOperand(1), BVN->getOperand(0));
+
+        return DAG.getTargetExtractSubreg(Hexagon::subreg_loreg, dl, MVT::v2i16,
+                                          Pack);
+      }
+    }
+  }
+
+  // In the general case, generate a CONST32 or a CONST64 for constant vectors,
+  // and insert_vector_elt for all the other cases.
+  uint64_t Res = 0;
+  unsigned EltSize = Size / NElts;
+  SDValue ConstVal;
+  uint64_t Mask = ~uint64_t(0ULL) >> (64 - EltSize);
+  bool HasNonConstantElements = false;
+
+  for (unsigned i = 0, e = NElts; i != e; ++i) {
+    // LLVM's BUILD_VECTOR operands are in little-endian order, whereas
+    // Hexagon's combine, const64, etc. are big-endian.
+    unsigned OpIdx = NElts - i - 1;
+    SDValue Operand = BVN->getOperand(OpIdx);
+    if (Operand.getOpcode() == ISD::UNDEF)
+      continue;
+
+    int64_t Val = 0;
+    if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Operand))
+      Val = Cst->getSExtValue();
+    else
+      HasNonConstantElements = true;
+
+    Val &= Mask;
+    Res = (Res << EltSize) | Val;
+  }
+
+  if (Size == 64)
+    ConstVal = DAG.getConstant(Res, MVT::i64);
+  else
+    ConstVal = DAG.getConstant(Res, MVT::i32);
+
+  // When there are non-constant operands, add them with INSERT_VECTOR_ELT to
+  // ConstVal, the constant part of the vector.
+  if (HasNonConstantElements) {
+    EVT EltVT = VT.getVectorElementType();
+    SDValue Width = DAG.getConstant(EltVT.getSizeInBits(), MVT::i64);
+    SDValue Shifted = DAG.getNode(ISD::SHL, dl, MVT::i64, Width,
+                                  DAG.getConstant(32, MVT::i64));
+
+    for (unsigned i = 0, e = NElts; i != e; ++i) {
+      // LLVM's BUILD_VECTOR operands are in little-endian order, whereas
+      // Hexagon is big-endian.
+      unsigned OpIdx = NElts - i - 1;
+      SDValue Operand = BVN->getOperand(OpIdx);
+      if (isa<ConstantSDNode>(Operand))
+        // This operand is already in ConstVal.
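+        // (its bits were already folded into Res by the first pass above)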
+ continue; + + if (VT.getSizeInBits() == 64 && + Operand.getValueType().getSizeInBits() == 32) { + SDValue C = DAG.getConstant(0, MVT::i32); + Operand = DAG.getNode(HexagonISD::COMBINE, dl, VT, C, Operand); + } + + SDValue Idx = DAG.getConstant(OpIdx, MVT::i64); + SDValue Offset = DAG.getNode(ISD::MUL, dl, MVT::i64, Idx, Width); + SDValue Combined = DAG.getNode(ISD::OR, dl, MVT::i64, Shifted, Offset); + const SDValue Ops[] = {ConstVal, Operand, Combined}; + + if (VT.getSizeInBits() == 32) + ConstVal = DAG.getNode(HexagonISD::INSERT_riv, dl, MVT::i32, Ops); + else + ConstVal = DAG.getNode(HexagonISD::INSERT_rdv, dl, MVT::i64, Ops); + } + } + + return DAG.getNode(ISD::BITCAST, dl, VT, ConstVal); +} + +SDValue +HexagonTargetLowering::LowerCONCAT_VECTORS(SDValue Op, + SelectionDAG &DAG) const { + SDLoc dl(Op); + EVT VT = Op.getValueType(); + unsigned NElts = Op.getNumOperands(); + SDValue Vec = Op.getOperand(0); + EVT VecVT = Vec.getValueType(); + SDValue Width = DAG.getConstant(VecVT.getSizeInBits(), MVT::i64); + SDValue Shifted = DAG.getNode(ISD::SHL, dl, MVT::i64, Width, + DAG.getConstant(32, MVT::i64)); + SDValue ConstVal = DAG.getConstant(0, MVT::i64); + + ConstantSDNode *W = dyn_cast<ConstantSDNode>(Width); + ConstantSDNode *S = dyn_cast<ConstantSDNode>(Shifted); + + if ((VecVT.getSimpleVT() == MVT::v2i16) && (NElts == 2) && W && S) { + if ((W->getZExtValue() == 32) && ((S->getZExtValue() >> 32) == 32)) { + // We are trying to concat two v2i16 to a single v4i16. + SDValue Vec0 = Op.getOperand(1); + SDValue Combined = DAG.getNode(HexagonISD::COMBINE, dl, VT, Vec0, Vec); + return DAG.getNode(ISD::BITCAST, dl, VT, Combined); + } + } + + if ((VecVT.getSimpleVT() == MVT::v4i8) && (NElts == 2) && W && S) { + if ((W->getZExtValue() == 32) && ((S->getZExtValue() >> 32) == 32)) { + // We are trying to concat two v4i8 to a single v8i8. + SDValue Vec0 = Op.getOperand(1); + SDValue Combined = DAG.getNode(HexagonISD::COMBINE, dl, VT, Vec0, Vec); + return DAG.getNode(ISD::BITCAST, dl, VT, Combined); + } + } + + for (unsigned i = 0, e = NElts; i != e; ++i) { + unsigned OpIdx = NElts - i - 1; + SDValue Operand = Op.getOperand(OpIdx); + + if (VT.getSizeInBits() == 64 && + Operand.getValueType().getSizeInBits() == 32) { + SDValue C = DAG.getConstant(0, MVT::i32); + Operand = DAG.getNode(HexagonISD::COMBINE, dl, VT, C, Operand); + } + + SDValue Idx = DAG.getConstant(OpIdx, MVT::i64); + SDValue Offset = DAG.getNode(ISD::MUL, dl, MVT::i64, Idx, Width); + SDValue Combined = DAG.getNode(ISD::OR, dl, MVT::i64, Shifted, Offset); + const SDValue Ops[] = {ConstVal, Operand, Combined}; + + if (VT.getSizeInBits() == 32) + ConstVal = DAG.getNode(HexagonISD::INSERT_riv, dl, MVT::i32, Ops); + else + ConstVal = DAG.getNode(HexagonISD::INSERT_rdv, dl, MVT::i64, Ops); + } + + return DAG.getNode(ISD::BITCAST, dl, VT, ConstVal); +} + +SDValue +HexagonTargetLowering::LowerEXTRACT_VECTOR(SDValue Op, + SelectionDAG &DAG) const { + EVT VT = Op.getValueType(); + int VTN = VT.isVector() ? VT.getVectorNumElements() : 1; + SDLoc dl(Op); + SDValue Idx = Op.getOperand(1); + SDValue Vec = Op.getOperand(0); + EVT VecVT = Vec.getValueType(); + EVT EltVT = VecVT.getVectorElementType(); + int EltSize = EltVT.getSizeInBits(); + SDValue Width = DAG.getConstant(Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT ? + EltSize : VTN * EltSize, MVT::i64); + + // Constant element number. 
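+  // With a constant index the selected bit range is known statically, so the
+  // extract can often be folded into a plain hi/lo subregister copy below.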
+  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Idx)) {
+    SDValue Offset = DAG.getConstant(C->getZExtValue() * EltSize, MVT::i32);
+    const SDValue Ops[] = {Vec, Width, Offset};
+
+    ConstantSDNode *W = dyn_cast<ConstantSDNode>(Width);
+    assert(W && "Non-constant width in LowerEXTRACT_VECTOR");
+
+    SDValue N;
+    // For certain extracts, it is a simple _hi/_lo subreg.
+    if (VecVT.getSimpleVT() == MVT::v2i32) {
+      // v2i32 -> i32: pick the lo or hi subregister.
+      if (C->getZExtValue() == 0)
+        N = DAG.getTargetExtractSubreg(Hexagon::subreg_loreg, dl,
+                                       MVT::i32, Vec);
+      else if (C->getZExtValue() == 1)
+        N = DAG.getTargetExtractSubreg(Hexagon::subreg_hireg, dl,
+                                       MVT::i32, Vec);
+      else
+        llvm_unreachable("Bad offset");
+    } else if ((VecVT.getSimpleVT() == MVT::v4i16) &&
+               (W->getZExtValue() == 32)) {
+      // v4i16 -> v2i16/i32: pick the lo or hi subregister.
+      if (C->getZExtValue() == 0)
+        N = DAG.getTargetExtractSubreg(Hexagon::subreg_loreg, dl,
+                                       MVT::i32, Vec);
+      else if (C->getZExtValue() == 2)
+        N = DAG.getTargetExtractSubreg(Hexagon::subreg_hireg, dl,
+                                       MVT::i32, Vec);
+      else
+        llvm_unreachable("Bad offset");
+    } else if ((VecVT.getSimpleVT() == MVT::v8i8) &&
+               (W->getZExtValue() == 32)) {
+      // v8i8 -> v4i8/i32: pick the lo or hi subregister.
+      if (C->getZExtValue() == 0)
+        N = DAG.getTargetExtractSubreg(Hexagon::subreg_loreg, dl,
+                                       MVT::i32, Vec);
+      else if (C->getZExtValue() == 4)
+        N = DAG.getTargetExtractSubreg(Hexagon::subreg_hireg, dl,
+                                       MVT::i32, Vec);
+      else
+        llvm_unreachable("Bad offset");
+    } else if (VecVT.getSizeInBits() == 32) {
+      N = DAG.getNode(HexagonISD::EXTRACTU_ri, dl, MVT::i32, Ops);
+    } else {
+      N = DAG.getNode(HexagonISD::EXTRACTU_rd, dl, MVT::i64, Ops);
+      if (VT.getSizeInBits() == 32)
+        N = DAG.getTargetExtractSubreg(Hexagon::subreg_loreg, dl, MVT::i32, N);
+    }
+
+    return DAG.getNode(ISD::BITCAST, dl, VT, N);
+  }
+
+  // Variable element number.
+  SDValue Offset = DAG.getNode(ISD::MUL, dl, MVT::i32, Idx,
+                               DAG.getConstant(EltSize, MVT::i32));
+  SDValue Shifted = DAG.getNode(ISD::SHL, dl, MVT::i64, Width,
+                                DAG.getConstant(32, MVT::i64));
+  SDValue Combined = DAG.getNode(ISD::OR, dl, MVT::i64, Shifted, Offset);
+
+  const SDValue Ops[] = {Vec, Combined};
+
+  SDValue N;
+  if (VecVT.getSizeInBits() == 32) {
+    N = DAG.getNode(HexagonISD::EXTRACTU_riv, dl, MVT::i32, Ops);
+  } else {
+    N = DAG.getNode(HexagonISD::EXTRACTU_rdv, dl, MVT::i64, Ops);
+    if (VT.getSizeInBits() == 32)
+      N = DAG.getTargetExtractSubreg(Hexagon::subreg_loreg, dl, MVT::i32, N);
+  }
+  return DAG.getNode(ISD::BITCAST, dl, VT, N);
+}
+
+SDValue
+HexagonTargetLowering::LowerINSERT_VECTOR(SDValue Op,
+                                          SelectionDAG &DAG) const {
+  EVT VT = Op.getValueType();
+  int VTN = VT.isVector() ? VT.getVectorNumElements() : 1;
+  SDLoc dl(Op);
+  SDValue Vec = Op.getOperand(0);
+  SDValue Val = Op.getOperand(1);
+  SDValue Idx = Op.getOperand(2);
+  EVT VecVT = Vec.getValueType();
+  EVT EltVT = VecVT.getVectorElementType();
+  int EltSize = EltVT.getSizeInBits();
+  SDValue Width = DAG.getConstant(Op.getOpcode() == ISD::INSERT_VECTOR_ELT ?
+                                  EltSize : VTN * EltSize, MVT::i64);
+
+  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Idx)) {
+    SDValue Offset = DAG.getConstant(C->getSExtValue() * EltSize, MVT::i32);
+    const SDValue Ops[] = {Vec, Val, Width, Offset};
+
+    SDValue N;
+    if (VT.getSizeInBits() == 32)
+      N = DAG.getNode(HexagonISD::INSERT_ri, dl, MVT::i32, Ops);
+    else
+      N = DAG.getNode(HexagonISD::INSERT_rd, dl, MVT::i64, Ops);
+
+    return DAG.getNode(ISD::BITCAST, dl, VT, N);
+  }
+
+  // Variable element number.
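+  // The (width << 32) | offset pair is packed into a single i64 operand, the
+  // form taken by the INSERT_riv/INSERT_rdv nodes.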
+ SDValue Offset = DAG.getNode(ISD::MUL, dl, MVT::i32, Idx, + DAG.getConstant(EltSize, MVT::i32)); + SDValue Shifted = DAG.getNode(ISD::SHL, dl, MVT::i64, Width, + DAG.getConstant(32, MVT::i64)); + SDValue Combined = DAG.getNode(ISD::OR, dl, MVT::i64, Shifted, Offset); + + if (VT.getSizeInBits() == 64 && + Val.getValueType().getSizeInBits() == 32) { + SDValue C = DAG.getConstant(0, MVT::i32); + Val = DAG.getNode(HexagonISD::COMBINE, dl, VT, C, Val); + } + + const SDValue Ops[] = {Vec, Val, Combined}; + + SDValue N; + if (VT.getSizeInBits() == 32) + N = DAG.getNode(HexagonISD::INSERT_riv, dl, MVT::i32, Ops); + else + N = DAG.getNode(HexagonISD::INSERT_rdv, dl, MVT::i64, Ops); + + return DAG.getNode(ISD::BITCAST, dl, VT, N); +} + bool HexagonTargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const { // Assuming the caller does not have either a signext or zeroext modifier, and @@ -1549,7 +2372,19 @@ SDValue HexagonTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { switch (Op.getOpcode()) { default: llvm_unreachable("Should not custom lower this!"); - case ISD::ConstantPool: return LowerConstantPool(Op, DAG); + case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG); + case ISD::INSERT_SUBVECTOR: return LowerINSERT_VECTOR(Op, DAG); + case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR(Op, DAG); + case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_VECTOR(Op, DAG); + case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR(Op, DAG); + case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG); + case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG); + case ISD::SRA: + case ISD::SHL: + case ISD::SRL: + return LowerVECTOR_SHIFT(Op, DAG); + case ISD::ConstantPool: + return LowerConstantPool(Op, DAG); case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG); // Frame & Return address. Currently unimplemented. case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG); @@ -1561,9 +2396,14 @@ HexagonTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::BlockAddress: return LowerBlockAddress(Op, DAG); case ISD::VASTART: return LowerVASTART(Op, DAG); case ISD::BR_JT: return LowerBR_JT(Op, DAG); + // Custom lower some vector loads. + case ISD::LOAD: return LowerLOAD(Op, DAG); case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG); case ISD::SELECT: return Op; + case ISD::SETCC: return LowerSETCC(Op, DAG); + case ISD::VSELECT: return LowerVSELECT(Op, DAG); + case ISD::CTPOP: return LowerCTPOP(Op, DAG); case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG); case ISD::INLINEASM: return LowerINLINEASM(Op, DAG); diff --git a/lib/Target/Hexagon/HexagonISelLowering.h b/lib/Target/Hexagon/HexagonISelLowering.h index 151c28f..34b1ebb 100644 --- a/lib/Target/Hexagon/HexagonISelLowering.h +++ b/lib/Target/Hexagon/HexagonISelLowering.h @@ -37,6 +37,10 @@ bool isPositiveHalfWord(SDNode *N); ADJDYNALLOC, ARGEXTEND, + PIC_ADD, + AT_GOT, + AT_PCREL, + CMPICC, // Compare two GPR operands, set icc. CMPFCC, // Compare two FP operands, set fcc. BRICC, // Branch to dest on icc condition @@ -54,23 +58,44 @@ bool isPositiveHalfWord(SDNode *N); CALLR, RET_FLAG, // Return with a flag operand. - BR_JT, // Jump table. - BARRIER, // Memory barrier + BR_JT, // Branch through jump table. + BARRIER, // Memory barrier. + JT, // Jump table. + CP, // Constant pool. 
POPCOUNT, COMBINE, - WrapperJT, - WrapperCP, - WrapperCombineII, - WrapperCombineRR, - WrapperCombineRI_V4, - WrapperCombineIR_V4, - WrapperPackhl, - WrapperSplatB, - WrapperSplatH, - WrapperShuffEB, - WrapperShuffEH, - WrapperShuffOB, - WrapperShuffOH, + PACKHL, + VSPLATB, + VSPLATH, + SHUFFEB, + SHUFFEH, + SHUFFOB, + SHUFFOH, + VSXTBH, + VSXTBW, + VSRAW, + VSRAH, + VSRLW, + VSRLH, + VSHLW, + VSHLH, + VCMPBEQ, + VCMPBGT, + VCMPBGTU, + VCMPHEQ, + VCMPHGT, + VCMPHGTU, + VCMPWEQ, + VCMPWGT, + VCMPWGTU, + INSERT_ri, + INSERT_rd, + INSERT_riv, + INSERT_rdv, + EXTRACTU_ri, + EXTRACTU_rd, + EXTRACTU_riv, + EXTRACTU_rdv, TC_RETURN, EH_RETURN, DCFETCH @@ -85,6 +110,8 @@ bool isPositiveHalfWord(SDNode *N); bool CanReturnSmallStruct(const Function* CalleeFn, unsigned& RetSize) const; + void promoteLdStType(EVT VT, EVT PromotedLdStVT); + public: const HexagonSubtarget *Subtarget; explicit HexagonTargetLowering(const TargetMachine &TM, @@ -110,10 +137,17 @@ bool isPositiveHalfWord(SDNode *N); bool allowTruncateForTailCall(Type *Ty1, Type *Ty2) const override; - SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override; + // Should we expand the build vector with shuffles? + bool shouldExpandBuildVectorWithShuffles(EVT VT, + unsigned DefinedValues) const override; + SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override; const char *getTargetNodeName(unsigned Opcode) const override; - SDValue LowerBR_JT(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerEXTRACT_VECTOR(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerINSERT_VECTOR(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerBR_JT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const; SDValue LowerINLINEASM(SDValue Op, SelectionDAG &DAG) const; SDValue LowerEH_LABEL(SDValue Op, SelectionDAG &DAG) const; @@ -137,9 +171,13 @@ bool isPositiveHalfWord(SDNode *N); const SmallVectorImpl<SDValue> &OutVals, SDValue Callee) const; + SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerVSELECT(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerCTPOP(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG& DAG) const; SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const; SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, @@ -170,6 +208,15 @@ bool isPositiveHalfWord(SDNode *N); const std::string &Constraint, MVT VT) const override; + unsigned getInlineAsmMemConstraint( + const std::string &ConstraintCode) const override { + if (ConstraintCode == "o") + return InlineAsm::Constraint_o; + else if (ConstraintCode == "v") + return InlineAsm::Constraint_v; + return TargetLowering::getInlineAsmMemConstraint(ConstraintCode); + } + // Intrinsics SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const; /// isLegalAddressingMode - Return true if the addressing mode represented diff --git a/lib/Target/Hexagon/HexagonInstrFormats.td b/lib/Target/Hexagon/HexagonInstrFormats.td index 3d04678..36a7e9f 100644 --- a/lib/Target/Hexagon/HexagonInstrFormats.td +++ b/lib/Target/Hexagon/HexagonInstrFormats.td @@ -76,7 +76,7 @@ class OpcodeHexagon { class InstHexagon<dag outs, dag ins, string asmstr, list<dag> pattern, string cstr, InstrItinClass itin, 
IType type> - : Instruction, OpcodeHexagon { + : Instruction { let Namespace = "Hexagon"; dag OutOperandList = outs; @@ -84,18 +84,18 @@ class InstHexagon<dag outs, dag ins, string asmstr, list<dag> pattern, let AsmString = asmstr; let Pattern = pattern; let Constraints = cstr; - let Itinerary = itin;
- let Size = 4;
-
- // SoftFail is a field the disassembler can use to provide a way for
- // instructions to not match without killing the whole decode process. It is
- // mainly used for ARM, but Tablegen expects this field to exist or it fails
- // to build the decode table.
- field bits<32> SoftFail = 0;
-
- // *** Must match MCTargetDesc/HexagonBaseInfo.h ***
-
- // Instruction type according to the ISA.
+ let Itinerary = itin; + let Size = 4; + + // SoftFail is a field the disassembler can use to provide a way for + // instructions to not match without killing the whole decode process. It is + // mainly used for ARM, but Tablegen expects this field to exist or it fails + // to build the decode table. + field bits<32> SoftFail = 0; + + // *** Must match MCTargetDesc/HexagonBaseInfo.h *** + + // Instruction type according to the ISA. IType Type = type; let TSFlags{4-0} = Type.Value; @@ -197,7 +197,7 @@ class InstHexagon<dag outs, dag ins, string asmstr, list<dag> pattern, let mayLoad = 1 in class LDInst<dag outs, dag ins, string asmstr, list<dag> pattern = [], string cstr = "", InstrItinClass itin = LD_tc_ld_SLOT01> - : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeLD>; + : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeLD>, OpcodeHexagon; let mayLoad = 1 in class LDInst2<dag outs, dag ins, string asmstr, list<dag> pattern = [], @@ -217,7 +217,7 @@ class LDInstPost<dag outs, dag ins, string asmstr, list<dag> pattern = [], let mayLoad = 1 in class LD0Inst<dag outs, dag ins, string asmstr, list<dag> pattern = [], string cstr = "", InstrItinClass itin=LD_tc_ld_SLOT0> - : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeLD>; + : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeLD>, OpcodeHexagon; // ST Instruction Class in V2/V3 can take SLOT0 only. // ST Instruction Class in V4 can take SLOT0 & SLOT1. @@ -225,7 +225,7 @@ class LD0Inst<dag outs, dag ins, string asmstr, list<dag> pattern = [], let mayStore = 1 in class STInst<dag outs, dag ins, string asmstr, list<dag> pattern = [], string cstr = "", InstrItinClass itin = ST_tc_st_SLOT01> - : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeST>; + : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeST>, OpcodeHexagon; class STInst2<dag outs, dag ins, string asmstr, list<dag> pattern = [], string cstr = ""> @@ -234,7 +234,7 @@ class STInst2<dag outs, dag ins, string asmstr, list<dag> pattern = [], let mayStore = 1 in class ST0Inst<dag outs, dag ins, string asmstr, list<dag> pattern = [], string cstr = "", InstrItinClass itin = ST_tc_ld_SLOT0> - : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeST>; + : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeST>, OpcodeHexagon; // ST Instruction Class in V2/V3 can take SLOT0 only. // ST Instruction Class in V4 can take SLOT0 & SLOT1. @@ -247,13 +247,14 @@ class STInstPost<dag outs, dag ins, string asmstr, list<dag> pattern = [], // In V2/V3 we used ST for this but in v4 ST can take SLOT0 or SLOT1. class SYSInst<dag outs, dag ins, string asmstr, list<dag> pattern = [], string cstr = "", InstrItinClass itin = ST_tc_3stall_SLOT0> - : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeSYSTEM>; + : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeSYSTEM>, + OpcodeHexagon; // ALU32 Instruction Class in V2/V3/V4. // Definition of the instruction class NOT CHANGED. class ALU32Inst<dag outs, dag ins, string asmstr, list<dag> pattern = [], string cstr = "", InstrItinClass itin = ALU32_2op_tc_1_SLOT0123> - : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeALU32>; + : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeALU32>, OpcodeHexagon; // ALU64 Instruction Class in V2/V3. // XTYPE Instruction Class in V4. @@ -261,7 +262,8 @@ class ALU32Inst<dag outs, dag ins, string asmstr, list<dag> pattern = [], // Name of the Instruction Class changed from ALU64 to XTYPE from V2/V3 to V4. 
class ALU64Inst<dag outs, dag ins, string asmstr, list<dag> pattern = [], string cstr = "", InstrItinClass itin = ALU64_tc_2_SLOT23> - : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeXTYPE>; + : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeXTYPE>, + OpcodeHexagon; class ALU64_acc<dag outs, dag ins, string asmstr, list<dag> pattern = [], string cstr = "", InstrItinClass itin = ALU64_tc_2_SLOT23> @@ -274,7 +276,8 @@ class ALU64_acc<dag outs, dag ins, string asmstr, list<dag> pattern = [], // Name of the Instruction Class changed from M to XTYPE from V2/V3 to V4. class MInst<dag outs, dag ins, string asmstr, list<dag> pattern = [], string cstr = "", InstrItinClass itin = M_tc_3x_SLOT23> - : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeXTYPE>; + : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeXTYPE>, + OpcodeHexagon; // M Instruction Class in V2/V3. // XTYPE Instruction Class in V4. @@ -290,7 +293,8 @@ class MInst_acc<dag outs, dag ins, string asmstr, list<dag> pattern = [], // Name of the Instruction Class changed from S to XTYPE from V2/V3 to V4. class SInst<dag outs, dag ins, string asmstr, list<dag> pattern = [], string cstr = "", InstrItinClass itin = S_2op_tc_1_SLOT23> - : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeXTYPE>; + : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeXTYPE>, + OpcodeHexagon; // S Instruction Class in V2/V3. // XTYPE Instruction Class in V4. @@ -304,34 +308,37 @@ class SInst_acc<dag outs, dag ins, string asmstr, list<dag> pattern = [], // Definition of the instruction class NOT CHANGED. class JInst<dag outs, dag ins, string asmstr, list<dag> pattern = [], string cstr = "", InstrItinClass itin = J_tc_2early_SLOT23> - : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeJ>; + : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeJ>, OpcodeHexagon; // JR Instruction Class in V2/V3/V4. // Definition of the instruction class NOT CHANGED. class JRInst<dag outs, dag ins, string asmstr, list<dag> pattern = [], string cstr = "", InstrItinClass itin = J_tc_2early_SLOT2> - : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeJR>; + : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeJR>, OpcodeHexagon; // CR Instruction Class in V2/V3/V4. // Definition of the instruction class NOT CHANGED. 
class CRInst<dag outs, dag ins, string asmstr, list<dag> pattern = [], string cstr = "", InstrItinClass itin = CR_tc_2early_SLOT3> - : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCR>; + : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCR>, OpcodeHexagon; let isCodeGenOnly = 1, isPseudo = 1 in class Endloop<dag outs, dag ins, string asmstr, list<dag> pattern = [], string cstr = "", InstrItinClass itin = J_tc_2early_SLOT0123> - : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeENDLOOP>; + : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeENDLOOP>, + OpcodeHexagon; let isCodeGenOnly = 1, isPseudo = 1 in class Pseudo<dag outs, dag ins, string asmstr, list<dag> pattern = [], string cstr = ""> - : InstHexagon<outs, ins, asmstr, pattern, cstr, PSEUDO, TypePSEUDO>; + : InstHexagon<outs, ins, asmstr, pattern, cstr, PSEUDO, TypePSEUDO>, + OpcodeHexagon; let isCodeGenOnly = 1, isPseudo = 1 in class PseudoM<dag outs, dag ins, string asmstr, list<dag> pattern = [], string cstr=""> - : InstHexagon<outs, ins, asmstr, pattern, cstr, PSEUDOM, TypePSEUDO>; + : InstHexagon<outs, ins, asmstr, pattern, cstr, PSEUDOM, TypePSEUDO>, + OpcodeHexagon; //===----------------------------------------------------------------------===// // Instruction Classes Definitions - diff --git a/lib/Target/Hexagon/HexagonInstrFormatsV4.td b/lib/Target/Hexagon/HexagonInstrFormatsV4.td index 5fec80b..7f7b2c9 100644 --- a/lib/Target/Hexagon/HexagonInstrFormatsV4.td +++ b/lib/Target/Hexagon/HexagonInstrFormatsV4.td @@ -17,10 +17,88 @@ // *** Must match BaseInfo.h *** //----------------------------------------------------------------------------// -def TypeMEMOP : IType<9>; -def TypeNV : IType<10>; +def TypeMEMOP : IType<9>; +def TypeNV : IType<10>; +def TypeDUPLEX : IType<11>; def TypeCOMPOUND : IType<12>; -def TypePREFIX : IType<30>; +def TypeAG_VX : IType<28>; +def TypeAG_VM : IType<29>; +def TypePREFIX : IType<30>; + +// Duplex Instruction Class Declaration +//===----------------------------------------------------------------------===// + +class OpcodeDuplex { + field bits<32> Inst = ?; // Default to an invalid insn. + bits<4> IClass = 0; // ICLASS + bits<13> ISubHi = 0; // Low sub-insn + bits<13> ISubLo = 0; // High sub-insn + + let Inst{31-29} = IClass{3-1}; + let Inst{13} = IClass{0}; + let Inst{15-14} = 0; + let Inst{28-16} = ISubHi; + let Inst{12-0} = ISubLo; +} + +class InstDuplex<bits<4> iClass, list<dag> pattern = [], + string cstr = ""> + : Instruction, OpcodeDuplex { + let Namespace = "Hexagon"; + IType Type = TypeDUPLEX; // uses slot 0,1 + let isCodeGenOnly = 1; + let hasSideEffects = 0; + dag OutOperandList = (outs); + dag InOperandList = (ins); + let IClass = iClass; + let Constraints = cstr; + let Itinerary = DUPLEX; + let Size = 4; + + // SoftFail is a field the disassembler can use to provide a way for + // instructions to not match without killing the whole decode process. It is + // mainly used for ARM, but Tablegen expects this field to exist or it fails + // to build the decode table. + field bits<32> SoftFail = 0; + + // *** Must match MCTargetDesc/HexagonBaseInfo.h *** + + let TSFlags{4-0} = Type.Value; + + // Predicated instructions. + bits<1> isPredicated = 0; + let TSFlags{6} = isPredicated; + bits<1> isPredicatedFalse = 0; + let TSFlags{7} = isPredicatedFalse; + bits<1> isPredicatedNew = 0; + let TSFlags{8} = isPredicatedNew; + + // New-value insn helper fields. + bits<1> isNewValue = 0; + let TSFlags{9} = isNewValue; // New-value consumer insn. 
+ bits<1> hasNewValue = 0; + let TSFlags{10} = hasNewValue; // New-value producer insn. + bits<3> opNewValue = 0; + let TSFlags{13-11} = opNewValue; // New-value produced operand. + bits<1> isNVStorable = 0; + let TSFlags{14} = isNVStorable; // Store that can become new-value store. + bits<1> isNVStore = 0; + let TSFlags{15} = isNVStore; // New-value store insn. + + // Immediate extender helper fields. + bits<1> isExtendable = 0; + let TSFlags{16} = isExtendable; // Insn may be extended. + bits<1> isExtended = 0; + let TSFlags{17} = isExtended; // Insn must be extended. + bits<3> opExtendable = 0; + let TSFlags{20-18} = opExtendable; // Which operand may be extended. + bits<1> isExtentSigned = 0; + let TSFlags{21} = isExtentSigned; // Signed or unsigned range. + bits<5> opExtentBits = 0; + let TSFlags{26-22} = opExtentBits; //Number of bits of range before extending. + bits<2> opExtentAlign = 0; + let TSFlags{28-27} = opExtentAlign; // Alignment exponent before extending. +} //----------------------------------------------------------------------------// // Instruction Classes Definitions @@ -31,7 +109,7 @@ def TypePREFIX : IType<30>; // class NVInst<dag outs, dag ins, string asmstr, list<dag> pattern = [], string cstr = "", InstrItinClass itin = NCJ_tc_3or4stall_SLOT0> - : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeNV>; + : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeNV>, OpcodeHexagon; class NVInst_V4<dag outs, dag ins, string asmstr, list<dag> pattern = [], string cstr = "", InstrItinClass itin = NCJ_tc_3or4stall_SLOT0> @@ -56,7 +134,8 @@ class NCJInst<dag outs, dag ins, string asmstr, list<dag> pattern = [], let mayLoad = 1, mayStore = 1 in class MEMInst<dag outs, dag ins, string asmstr, list<dag> pattern = [], string cstr = "", InstrItinClass itin = V4LDST_tc_st_SLOT0> - : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeMEMOP>; + : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeMEMOP>, + OpcodeHexagon; class MEMInst_V4<dag outs, dag ins, string asmstr, list<dag> pattern = [], string cstr = "", InstrItinClass itin = V4LDST_tc_st_SLOT0> @@ -65,8 +144,9 @@ class MEMInst_V4<dag outs, dag ins, string asmstr, list<dag> pattern = [], let isCodeGenOnly = 1 in class EXTENDERInst<dag outs, dag ins, string asmstr, list<dag> pattern = []> : InstHexagon<outs, ins, asmstr, pattern, "", EXTENDER_tc_1_SLOT0123, - TypePREFIX>; + TypePREFIX>, OpcodeHexagon; class CJInst<dag outs, dag ins, string asmstr, list<dag> pattern = [], string cstr = ""> - : InstHexagon<outs, ins, asmstr, pattern, cstr, COMPOUND, TypeCOMPOUND>; + : InstHexagon<outs, ins, asmstr, pattern, cstr, COMPOUND, TypeCOMPOUND>, + OpcodeHexagon; diff --git a/lib/Target/Hexagon/HexagonInstrInfo.cpp b/lib/Target/Hexagon/HexagonInstrInfo.cpp index 9bae12c..fbf1ca9 100644 --- a/lib/Target/Hexagon/HexagonInstrInfo.cpp +++ b/lib/Target/Hexagon/HexagonInstrInfo.cpp @@ -62,10 +62,8 @@ const int Hexagon_MEMB_AUTOINC_MIN = -8; void HexagonInstrInfo::anchor() {} HexagonInstrInfo::HexagonInstrInfo(HexagonSubtarget &ST) - : HexagonGenInstrInfo(Hexagon::ADJCALLSTACKDOWN, Hexagon::ADJCALLSTACKUP), - RI(ST), Subtarget(ST) { -} - + : HexagonGenInstrInfo(Hexagon::ADJCALLSTACKDOWN, Hexagon::ADJCALLSTACKUP), + RI(), Subtarget(ST) {} /// isLoadFromStackSlot - If the specified machine instruction is a direct /// load from a stack slot, return the virtual or physical register number of @@ -159,15 +157,19 @@ HexagonInstrInfo::InsertBranch(MachineBasicBlock &MBB,MachineBasicBlock *TBB, } BuildMI(&MBB, DL, 
get(BOpc)).addMBB(TBB); } else { - BuildMI(&MBB, DL, - get(BccOpc)).addReg(Cond[regPos].getReg()).addMBB(TBB); + // If Cond[0] is a basic block, insert ENDLOOP0. + if (Cond[0].isMBB()) + BuildMI(&MBB, DL, get(Hexagon::ENDLOOP0)).addMBB(Cond[0].getMBB()); + else + BuildMI(&MBB, DL, + get(BccOpc)).addReg(Cond[regPos].getReg()).addMBB(TBB); } return 1; } + // We don't handle ENDLOOP0 with a conditional branch in AnalyzeBranch. BuildMI(&MBB, DL, get(BccOpc)).addReg(Cond[regPos].getReg()).addMBB(TBB); BuildMI(&MBB, DL, get(BOpc)).addMBB(FBB); - return 2; } @@ -211,9 +213,11 @@ bool HexagonInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, return false; --I; } - + + bool JumpToBlock = I->getOpcode() == Hexagon::J2_jump && + I->getOperand(0).isMBB(); // Delete the JMP if it's equivalent to a fall-through. - if (AllowModify && I->getOpcode() == Hexagon::J2_jump && + if (AllowModify && JumpToBlock && MBB.isLayoutSuccessor(I->getOperand(0).getMBB())) { DEBUG(dbgs()<< "\nErasing the jump to successor block\n";); I->eraseFromParent(); @@ -243,6 +247,14 @@ bool HexagonInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, } while(I); int LastOpcode = LastInst->getOpcode(); + int SecLastOpcode = SecondLastInst ? SecondLastInst->getOpcode() : 0; + // If the branch target is not a basic block, it could be a tail call. + // (It is, if the target is a function.) + if (LastOpcode == Hexagon::J2_jump && !LastInst->getOperand(0).isMBB()) + return true; + if (SecLastOpcode == Hexagon::J2_jump && + !SecondLastInst->getOperand(0).isMBB()) + return true; bool LastOpcodeHasJMP_c = PredOpcodeHasJMP_c(LastOpcode); bool LastOpcodeHasNot = PredOpcodeHasNot(LastOpcode); @@ -270,8 +282,6 @@ bool HexagonInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, return true; } - int SecLastOpcode = SecondLastInst->getOpcode(); - bool SecLastOpcodeHasJMP_c = PredOpcodeHasJMP_c(SecLastOpcode); bool SecLastOpcodeHasNot = PredOpcodeHasNot(SecLastOpcode); if (SecLastOpcodeHasJMP_c && (LastOpcode == Hexagon::J2_jump)) { @@ -308,30 +318,35 @@ bool HexagonInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, unsigned HexagonInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const { - int BOpc = Hexagon::J2_jump; - int BccOpc = Hexagon::J2_jumpt; - int BccOpcNot = Hexagon::J2_jumpf; - MachineBasicBlock::iterator I = MBB.end(); if (I == MBB.begin()) return 0; --I; - if (I->getOpcode() != BOpc && I->getOpcode() != BccOpc && - I->getOpcode() != BccOpcNot) - return 0; - - // Remove the branch. - I->eraseFromParent(); + unsigned Opc1 = I->getOpcode(); + switch (Opc1) { + case Hexagon::J2_jump: + case Hexagon::J2_jumpt: + case Hexagon::J2_jumpf: + case Hexagon::ENDLOOP0: + I->eraseFromParent(); + break; + default: + return 0; + } I = MBB.end(); if (I == MBB.begin()) return 1; --I; - if (I->getOpcode() != BccOpc && I->getOpcode() != BccOpcNot) - return 1; - - // Remove the branch. 
- I->eraseFromParent(); - return 2; + unsigned Opc2 = I->getOpcode(); + switch (Opc2) { + case Hexagon::J2_jumpt: + case Hexagon::J2_jumpf: + case Hexagon::ENDLOOP0: + I->eraseFromParent(); + return 2; + default: + return 1; + } } @@ -549,12 +564,95 @@ void HexagonInstrInfo::loadRegFromAddr(MachineFunction &MF, unsigned DestReg, SmallVectorImpl<MachineInstr*> &NewMIs) const { llvm_unreachable("Unimplemented"); } +bool +HexagonInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { + const HexagonRegisterInfo &TRI = getRegisterInfo(); + MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); + MachineBasicBlock &MBB = *MI->getParent(); + DebugLoc DL = MI->getDebugLoc(); + unsigned Opc = MI->getOpcode(); + switch (Opc) { + case Hexagon::TFR_PdTrue: { + unsigned Reg = MI->getOperand(0).getReg(); + BuildMI(MBB, MI, DL, get(Hexagon::C2_orn), Reg) + .addReg(Reg, RegState::Undef) + .addReg(Reg, RegState::Undef); + MBB.erase(MI); + return true; + } + case Hexagon::TFR_PdFalse: { + unsigned Reg = MI->getOperand(0).getReg(); + BuildMI(MBB, MI, DL, get(Hexagon::C2_andn), Reg) + .addReg(Reg, RegState::Undef) + .addReg(Reg, RegState::Undef); + MBB.erase(MI); + return true; + } + case Hexagon::VMULW: { + // Expand a 64-bit vector multiply into 2 32-bit scalar multiplies. + unsigned DstReg = MI->getOperand(0).getReg(); + unsigned Src1Reg = MI->getOperand(1).getReg(); + unsigned Src2Reg = MI->getOperand(2).getReg(); + unsigned Src1SubHi = TRI.getSubReg(Src1Reg, Hexagon::subreg_hireg); + unsigned Src1SubLo = TRI.getSubReg(Src1Reg, Hexagon::subreg_loreg); + unsigned Src2SubHi = TRI.getSubReg(Src2Reg, Hexagon::subreg_hireg); + unsigned Src2SubLo = TRI.getSubReg(Src2Reg, Hexagon::subreg_loreg); + BuildMI(MBB, MI, MI->getDebugLoc(), get(Hexagon::M2_mpyi), + TRI.getSubReg(DstReg, Hexagon::subreg_hireg)).addReg(Src1SubHi) + .addReg(Src2SubHi); + BuildMI(MBB, MI, MI->getDebugLoc(), get(Hexagon::M2_mpyi), + TRI.getSubReg(DstReg, Hexagon::subreg_loreg)).addReg(Src1SubLo) + .addReg(Src2SubLo); + MBB.erase(MI); + MRI.clearKillFlags(Src1SubHi); + MRI.clearKillFlags(Src1SubLo); + MRI.clearKillFlags(Src2SubHi); + MRI.clearKillFlags(Src2SubLo); + return true; + } + case Hexagon::VMULW_ACC: { + // Expand 64-bit vector multiply with addition into 2 scalar multiplies. 
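+    // As with VMULW above, the hi and lo 32-bit halves are computed
+    // independently, here with one multiply-accumulate (M2_maci) per half.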
+ unsigned DstReg = MI->getOperand(0).getReg(); + unsigned Src1Reg = MI->getOperand(1).getReg(); + unsigned Src2Reg = MI->getOperand(2).getReg(); + unsigned Src3Reg = MI->getOperand(3).getReg(); + unsigned Src1SubHi = TRI.getSubReg(Src1Reg, Hexagon::subreg_hireg); + unsigned Src1SubLo = TRI.getSubReg(Src1Reg, Hexagon::subreg_loreg); + unsigned Src2SubHi = TRI.getSubReg(Src2Reg, Hexagon::subreg_hireg); + unsigned Src2SubLo = TRI.getSubReg(Src2Reg, Hexagon::subreg_loreg); + unsigned Src3SubHi = TRI.getSubReg(Src3Reg, Hexagon::subreg_hireg); + unsigned Src3SubLo = TRI.getSubReg(Src3Reg, Hexagon::subreg_loreg); + BuildMI(MBB, MI, MI->getDebugLoc(), get(Hexagon::M2_maci), + TRI.getSubReg(DstReg, Hexagon::subreg_hireg)).addReg(Src1SubHi) + .addReg(Src2SubHi).addReg(Src3SubHi); + BuildMI(MBB, MI, MI->getDebugLoc(), get(Hexagon::M2_maci), + TRI.getSubReg(DstReg, Hexagon::subreg_loreg)).addReg(Src1SubLo) + .addReg(Src2SubLo).addReg(Src3SubLo); + MBB.erase(MI); + MRI.clearKillFlags(Src1SubHi); + MRI.clearKillFlags(Src1SubLo); + MRI.clearKillFlags(Src2SubHi); + MRI.clearKillFlags(Src2SubLo); + MRI.clearKillFlags(Src3SubHi); + MRI.clearKillFlags(Src3SubLo); + return true; + } + case Hexagon::TCRETURNi: + MI->setDesc(get(Hexagon::J2_jump)); + return true; + case Hexagon::TCRETURNr: + MI->setDesc(get(Hexagon::J2_jumpr)); + return true; + } + + return false; +} MachineInstr *HexagonInstrInfo::foldMemoryOperandImpl(MachineFunction &MF, - MachineInstr* MI, - const SmallVectorImpl<unsigned> &Ops, - int FI) const { + MachineInstr *MI, + ArrayRef<unsigned> Ops, + int FI) const { // Hexagon_TODO: Implement. return nullptr; } @@ -641,7 +739,7 @@ bool HexagonInstrInfo::isPredicable(MachineInstr *MI) const { switch(Opc) { case Hexagon::A2_tfrsi: - return isInt<12>(MI->getOperand(1).getImm()); + return (isOperandExtended(MI, 1) && isConstExtended(MI)) || isInt<12>(MI->getOperand(1).getImm()); case Hexagon::S2_storerd_io: return isShiftedUInt<6,3>(MI->getOperand(1).getImm()); @@ -1036,6 +1134,8 @@ SubsumesPredicate(const SmallVectorImpl<MachineOperand> &Pred1, // bool HexagonInstrInfo:: ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const { + if (!Cond.empty() && Cond[0].isMBB()) + return true; if (!Cond.empty() && Cond[0].isImm() && Cond[0].getImm() == 0) { Cond.erase(Cond.begin()); } else { @@ -1521,7 +1621,6 @@ int HexagonInstrInfo::GetDotNewOp(const MachineInstr* MI) const { switch (MI->getOpcode()) { default: llvm_unreachable("Unknown .new type"); - // store new value byte case Hexagon::S4_storerb_ur: return Hexagon::S4_storerbnew_ur; @@ -1531,6 +1630,20 @@ int HexagonInstrInfo::GetDotNewOp(const MachineInstr* MI) const { case Hexagon::S4_storeri_ur: return Hexagon::S4_storerinew_ur; + case Hexagon::S2_storerb_pci: + return Hexagon::S2_storerb_pci; + + case Hexagon::S2_storeri_pci: + return Hexagon::S2_storeri_pci; + + case Hexagon::S2_storerh_pci: + return Hexagon::S2_storerh_pci; + + case Hexagon::S2_storerd_pci: + return Hexagon::S2_storerd_pci; + + case Hexagon::S2_storerf_pci: + return Hexagon::S2_storerf_pci; } return 0; } @@ -1647,7 +1760,7 @@ bool HexagonInstrInfo::isConstExtended(MachineInstr *MI) const { // We currently only handle isGlobal() because it is the only kind of // object we are going to end up with here for now. // In the future we probably should add isSymbol(), etc. 
- if (MO.isGlobal() || MO.isSymbol()) + if (MO.isGlobal() || MO.isSymbol() || MO.isBlockAddress()) return true; // If the extendable operand is not 'Immediate' type, the instruction should diff --git a/lib/Target/Hexagon/HexagonInstrInfo.h b/lib/Target/Hexagon/HexagonInstrInfo.h index 6acfbec..2644248 100644 --- a/lib/Target/Hexagon/HexagonInstrInfo.h +++ b/lib/Target/Hexagon/HexagonInstrInfo.h @@ -26,7 +26,7 @@ namespace llvm { struct EVT; - +class HexagonSubtarget; class HexagonInstrInfo : public HexagonGenInstrInfo { virtual void anchor(); const HexagonRegisterInfo RI; @@ -102,15 +102,21 @@ public: const TargetRegisterClass *RC, SmallVectorImpl<MachineInstr*> &NewMIs) const; - MachineInstr* foldMemoryOperandImpl(MachineFunction &MF, - MachineInstr* MI, - const SmallVectorImpl<unsigned> &Ops, + /// expandPostRAPseudo - This function is called for all pseudo instructions + /// that remain after register allocation. Many pseudo instructions are + /// created to help register allocation. This is the place to convert them + /// into real instructions. The target can edit MI in place, or it can insert + /// new instructions and erase MI. The function should return true if + /// anything was changed. + bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const override; + + MachineInstr *foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI, + ArrayRef<unsigned> Ops, int FrameIndex) const override; - MachineInstr* foldMemoryOperandImpl(MachineFunction &MF, - MachineInstr* MI, - const SmallVectorImpl<unsigned> &Ops, - MachineInstr* LoadMI) const override { + MachineInstr *foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI, + ArrayRef<unsigned> Ops, + MachineInstr *LoadMI) const override { return nullptr; } diff --git a/lib/Target/Hexagon/HexagonInstrInfo.td b/lib/Target/Hexagon/HexagonInstrInfo.td index 60635cf..19cf993 100644 --- a/lib/Target/Hexagon/HexagonInstrInfo.td +++ b/lib/Target/Hexagon/HexagonInstrInfo.td @@ -104,10 +104,16 @@ def : T_CMP_pat <C2_cmpgtui, setugt, u9ImmPred>; //===----------------------------------------------------------------------===// // ALU32/ALU + //===----------------------------------------------------------------------===// +// Add. 
+ +def SDT_Int32Leaf : SDTypeProfile<1, 0, [SDTCisVT<0, i32>]>; +def SDT_Int32Unary : SDTypeProfile<1, 1, [SDTCisVT<0, i32>, SDTCisVT<1, i32>]>; + def SDTHexagonI64I32I32 : SDTypeProfile<1, 2, [SDTCisVT<0, i64>, SDTCisVT<1, i32>, SDTCisSameAs<1, 2>]>; def HexagonCOMBINE : SDNode<"HexagonISD::COMBINE", SDTHexagonI64I32I32>; +def HexagonPACKHL : SDNode<"HexagonISD::PACKHL", SDTHexagonI64I32I32>; let hasSideEffects = 0, hasNewValue = 1, InputType = "reg" in class T_ALU32_3op<string mnemonic, bits<3> MajOp, bits<3> MinOp, bit OpsRev, @@ -243,6 +249,9 @@ let OutOperandList = (outs DoubleRegs:$Rd), hasNewValue = 0 in { def C2_ccombinewnewf : T_ALU32_3op_pred<"combine", 0b101, 0b000, 0, 1, 1>; } +def: BinOp32_pat<HexagonCOMBINE, A2_combinew, i64>; +def: BinOp32_pat<HexagonPACKHL, S2_packhl, i64>; + let hasSideEffects = 0, hasNewValue = 1, isCompare = 1, InputType = "reg" in class T_ALU32_3op_cmp<string mnemonic, bits<2> MinOp, bit IsNeg, bit IsComm> : ALU32_rr<(outs PredRegs:$Pd), (ins IntRegs:$Rs, IntRegs:$Rt), @@ -321,7 +330,7 @@ let isReMaterializable = 1, isMoveImm = 1, isAsCheapAsAMove = 1, def A2_combineii: ALU32Inst <(outs DoubleRegs:$Rdd), (ins s8Ext:$s8, s8Imm:$S8), "$Rdd = combine(#$s8, #$S8)", [(set (i64 DoubleRegs:$Rdd), - (i64 (HexagonCOMBINE(i32 s8ExtPred:$s8), (i32 s8ImmPred:$S8))))]> { + (i64 (HexagonCOMBINE(i32 s32ImmPred:$s8), (i32 s8ImmPred:$S8))))]> { bits<5> Rdd; bits<8> s8; bits<8> S8; @@ -406,7 +415,7 @@ multiclass Addri_base<string mnemonic, SDNode OpNode> { defm addi : Addri_base<"add", add>, ImmRegRel, PredNewRel; -def: Pat<(i32 (add I32:$Rs, s16ExtPred:$s16)), +def: Pat<(i32 (add I32:$Rs, s32ImmPred:$s16)), (i32 (A2_addi I32:$Rs, imm:$s16))>; //===----------------------------------------------------------------------===// @@ -420,7 +429,7 @@ class T_ALU32ri_logical <string mnemonic, SDNode OpNode, bits<2> MinOp> : ALU32_ri <(outs IntRegs:$Rd), (ins IntRegs:$Rs, s10Ext:$s10), "$Rd = "#mnemonic#"($Rs, #$s10)" , - [(set (i32 IntRegs:$Rd), (OpNode (i32 IntRegs:$Rs), s10ExtPred:$s10))]> { + [(set (i32 IntRegs:$Rd), (OpNode (i32 IntRegs:$Rs), s32ImmPred:$s10))]> { bits<5> Rd; bits<5> Rs; bits<10> s10; @@ -465,7 +474,7 @@ def A2_nop: ALU32Inst <(outs), (ins), "nop" > { let Inst{27-24} = 0b1111; } -def: Pat<(sub s10ExtPred:$s10, IntRegs:$Rs), +def: Pat<(sub s32ImmPred:$s10, IntRegs:$Rs), (A2_subri imm:$s10, IntRegs:$Rs)>; // Rd = not(Rs) gets mapped to Rd=sub(#-1, Rs). @@ -613,7 +622,7 @@ let InputType = "imm", isExtendable = 1, isExtentSigned = 1, isAsCheapAsAMove = 1 , opExtendable = 1, opExtentBits = 16, isMoveImm = 1, isPredicated = 0, isPredicable = 1, isReMaterializable = 1 in def A2_tfrsi : ALU32Inst<(outs IntRegs:$Rd), (ins s16Ext:$s16), "$Rd = #$s16", - [(set (i32 IntRegs:$Rd), s16ExtPred:$s16)], "", ALU32_2op_tc_1_SLOT0123>, + [(set (i32 IntRegs:$Rd), s32ImmPred:$s16)], "", ALU32_2op_tc_1_SLOT0123>, ImmRegRel, PredRel { bits<5> Rd; bits<16> s16; @@ -637,9 +646,13 @@ def A2_tfrpi : ALU64_rr<(outs DoubleRegs:$dst), (ins s8Imm64:$src1), // TODO: see if this instruction can be deleted.. 
let isExtendable = 1, opExtendable = 1, opExtentBits = 6, - isAsmParserOnly = 1 in -def TFRI64_V4 : ALU64_rr<(outs DoubleRegs:$dst), (ins u6Ext:$src1), + isAsmParserOnly = 1 in { +def TFRI64_V4 : ALU64_rr<(outs DoubleRegs:$dst), (ins u64Imm:$src1), "$dst = #$src1">; +def TFRI64_V2_ext : ALU64_rr<(outs DoubleRegs:$dst), + (ins s8Ext:$src1, s8Imm:$src2), + "$dst = combine(##$src1, #$src2)">; +} //===----------------------------------------------------------------------===// // ALU32/ALU - @@ -677,11 +690,11 @@ let opExtendable = 3 in def C2_muxir : T_MUX1<0b0, (ins PredRegs:$Pu, IntRegs:$Rs, s8Ext:$s8), "$Rd = mux($Pu, $Rs, #$s8)">; -def : Pat<(i32 (select I1:$Pu, s8ExtPred:$s8, I32:$Rs)), - (C2_muxri I1:$Pu, s8ExtPred:$s8, I32:$Rs)>; +def : Pat<(i32 (select I1:$Pu, s32ImmPred:$s8, I32:$Rs)), + (C2_muxri I1:$Pu, s32ImmPred:$s8, I32:$Rs)>; -def : Pat<(i32 (select I1:$Pu, I32:$Rs, s8ExtPred:$s8)), - (C2_muxir I1:$Pu, I32:$Rs, s8ExtPred:$s8)>; +def : Pat<(i32 (select I1:$Pu, I32:$Rs, s32ImmPred:$s8)), + (C2_muxir I1:$Pu, I32:$Rs, s32ImmPred:$s8)>; // C2_muxii: Scalar mux immediates. let isExtentSigned = 1, hasNewValue = 1, isExtendable = 1, @@ -690,7 +703,7 @@ def C2_muxii: ALU32Inst <(outs IntRegs:$Rd), (ins PredRegs:$Pu, s8Ext:$s8, s8Imm:$S8), "$Rd = mux($Pu, #$s8, #$S8)" , [(set (i32 IntRegs:$Rd), - (i32 (select I1:$Pu, s8ExtPred:$s8, s8ImmPred:$S8)))] > { + (i32 (select I1:$Pu, s32ImmPred:$s8, s8ImmPred:$S8)))] > { bits<5> Rd; bits<2> Pu; bits<8> s8; @@ -706,6 +719,12 @@ def C2_muxii: ALU32Inst <(outs IntRegs:$Rd), let Inst{4-0} = Rd; } +let isCodeGenOnly = 1, isPseudo = 1 in +def MUX64_rr : ALU64_rr<(outs DoubleRegs:$Rd), + (ins PredRegs:$Pu, DoubleRegs:$Rs, DoubleRegs:$Rt), + ".error \"should not emit\" ", []>; + + //===----------------------------------------------------------------------===// // template class for non-predicated alu32_2op instructions // - aslh, asrh, sxtb, sxth, zxth @@ -987,6 +1006,17 @@ def: T_vcmp_pat<A2_vcmpwgtu, setugt, v2i32>; //===----------------------------------------------------------------------===// // ALU32/PRED + //===----------------------------------------------------------------------===// +// No bits needed. If cmp.ge is found the assembler parser will +// transform it to cmp.gt subtracting 1 from the immediate. 
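// The identities the rewrite relies on: Rs >= #s8 iff Rs > #s8-1 for the
// signed form (when the decrement does not underflow), and
// Rs >= #u8 iff Rs > #u8-1 for the unsigned form with #u8 >= 1
// (Rs >= #0 is trivially true for unsigned).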
+let isPseudo = 1 in { +def C2_cmpgei: ALU32Inst < + (outs PredRegs:$Pd), (ins IntRegs:$Rs, s8Ext:$s8), + "$Pd = cmp.ge($Rs, #$s8)">; +def C2_cmpgeui: ALU32Inst < + (outs PredRegs:$Pd), (ins IntRegs:$Rs, u8Ext:$s8), + "$Pd = cmp.geu($Rs, #$s8)">; +} + //===----------------------------------------------------------------------===// // ALU32/PRED - @@ -1742,27 +1772,29 @@ def L2_loadalignb_io: T_loadalign_io <"memb_fifo", 0b0100, s11_0Ext>; multiclass Loadx_pat<PatFrag Load, ValueType VT, PatLeaf ImmPred, InstHexagon MI> { def: Pat<(VT (Load AddrFI:$fi)), (VT (MI AddrFI:$fi, 0))>; + def: Pat<(VT (Load (add (i32 AddrFI:$fi), ImmPred:$Off))), + (VT (MI AddrFI:$fi, imm:$Off))>; def: Pat<(VT (Load (add (i32 IntRegs:$Rs), ImmPred:$Off))), (VT (MI IntRegs:$Rs, imm:$Off))>; def: Pat<(VT (Load (i32 IntRegs:$Rs))), (VT (MI IntRegs:$Rs, 0))>; } let AddedComplexity = 20 in { - defm: Loadx_pat<load, i32, s11_2ExtPred, L2_loadri_io>; - defm: Loadx_pat<load, i64, s11_3ExtPred, L2_loadrd_io>; - defm: Loadx_pat<atomic_load_8 , i32, s11_0ExtPred, L2_loadrub_io>; - defm: Loadx_pat<atomic_load_16, i32, s11_1ExtPred, L2_loadruh_io>; - defm: Loadx_pat<atomic_load_32, i32, s11_2ExtPred, L2_loadri_io>; - defm: Loadx_pat<atomic_load_64, i64, s11_3ExtPred, L2_loadrd_io>; - - defm: Loadx_pat<extloadi1, i32, s11_0ExtPred, L2_loadrub_io>; - defm: Loadx_pat<extloadi8, i32, s11_0ExtPred, L2_loadrub_io>; - defm: Loadx_pat<extloadi16, i32, s11_1ExtPred, L2_loadruh_io>; - defm: Loadx_pat<sextloadi8, i32, s11_0ExtPred, L2_loadrb_io>; - defm: Loadx_pat<sextloadi16, i32, s11_1ExtPred, L2_loadrh_io>; - defm: Loadx_pat<zextloadi1, i32, s11_0ExtPred, L2_loadrub_io>; - defm: Loadx_pat<zextloadi8, i32, s11_0ExtPred, L2_loadrub_io>; - defm: Loadx_pat<zextloadi16, i32, s11_1ExtPred, L2_loadruh_io>; + defm: Loadx_pat<load, i32, s30_2ImmPred, L2_loadri_io>; + defm: Loadx_pat<load, i64, s29_3ImmPred, L2_loadrd_io>; + defm: Loadx_pat<atomic_load_8 , i32, s32_0ImmPred, L2_loadrub_io>; + defm: Loadx_pat<atomic_load_16, i32, s31_1ImmPred, L2_loadruh_io>; + defm: Loadx_pat<atomic_load_32, i32, s30_2ImmPred, L2_loadri_io>; + defm: Loadx_pat<atomic_load_64, i64, s29_3ImmPred, L2_loadrd_io>; + + defm: Loadx_pat<extloadi1, i32, s32_0ImmPred, L2_loadrub_io>; + defm: Loadx_pat<extloadi8, i32, s32_0ImmPred, L2_loadrub_io>; + defm: Loadx_pat<extloadi16, i32, s31_1ImmPred, L2_loadruh_io>; + defm: Loadx_pat<sextloadi8, i32, s32_0ImmPred, L2_loadrb_io>; + defm: Loadx_pat<sextloadi16, i32, s31_1ImmPred, L2_loadrh_io>; + defm: Loadx_pat<zextloadi1, i32, s32_0ImmPred, L2_loadrub_io>; + defm: Loadx_pat<zextloadi8, i32, s32_0ImmPred, L2_loadrub_io>; + defm: Loadx_pat<zextloadi16, i32, s31_1ImmPred, L2_loadruh_io>; // No sextloadi1. 
} @@ -2707,7 +2739,7 @@ class T_MType_mpy_ri <bit isNeg, Operand ImmOp, list<dag> pattern> let isExtendable = 1, opExtentBits = 8, opExtendable = 2 in def M2_mpysip : T_MType_mpy_ri <0, u8Ext, - [(set (i32 IntRegs:$Rd), (mul IntRegs:$Rs, u8ExtPred:$u8))]>; + [(set (i32 IntRegs:$Rd), (mul IntRegs:$Rs, u32ImmPred:$u8))]>; def M2_mpysin : T_MType_mpy_ri <1, u8Imm, [(set (i32 IntRegs:$Rd), (ineg (mul IntRegs:$Rs, @@ -2729,7 +2761,7 @@ let isExtendable = 1, opExtendable = 2, isExtentSigned = 1, opExtentBits = 9, def M2_mpysmi : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, s9Ext:$src2), "$dst = mpyi($src1, #$src2)", [(set (i32 IntRegs:$dst), (mul (i32 IntRegs:$src1), - s9ExtPred:$src2))]>, ImmRegRel; + s32ImmPred:$src2))]>, ImmRegRel; let hasNewValue = 1, isExtendable = 1, opExtentBits = 8, opExtendable = 3, InputType = "imm" in @@ -2780,7 +2812,7 @@ class T_MType_acc_rr <string mnemonic, bits<3> MajOp, bits<3> MinOp, let CextOpcode = "MPYI_acc", Itinerary = M_tc_3x_SLOT23 in { def M2_macsip : T_MType_acc_ri <"+= mpyi", 0b010, u8Ext, [(set (i32 IntRegs:$dst), - (add (mul IntRegs:$src2, u8ExtPred:$src3), + (add (mul IntRegs:$src2, u32ImmPred:$src3), IntRegs:$src1))]>, ImmRegRel; def M2_maci : T_MType_acc_rr <"+= mpyi", 0b000, 0b000, 0, @@ -2793,7 +2825,7 @@ let CextOpcode = "ADD_acc" in { let isExtentSigned = 1 in def M2_accii : T_MType_acc_ri <"+= add", 0b100, s8Ext, [(set (i32 IntRegs:$dst), - (add (add (i32 IntRegs:$src2), s8_16ExtPred:$src3), + (add (add (i32 IntRegs:$src2), s16_16ImmPred:$src3), (i32 IntRegs:$src1)))]>, ImmRegRel; def M2_acci : T_MType_acc_rr <"+= add", 0b000, 0b001, 0, @@ -2825,9 +2857,9 @@ class T_MType_acc_pat2 <InstHexagon MI, SDNode firstOp, SDNode secOp> (MI IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>; def : T_MType_acc_pat2 <M2_xor_xacc, xor, xor>; -def : T_MType_acc_pat1 <M2_macsin, mul, sub, u8ExtPred>; +def : T_MType_acc_pat1 <M2_macsin, mul, sub, u32ImmPred>; -def : T_MType_acc_pat1 <M2_naccii, add, sub, s8_16ExtPred>; +def : T_MType_acc_pat1 <M2_naccii, add, sub, s16_16ImmPred>; def : T_MType_acc_pat2 <M2_nacci, add, sub>; //===----------------------------------------------------------------------===// @@ -3514,7 +3546,8 @@ let addrMode = BaseImmOffset, InputType = "imm" in { } // Patterns for generating stores, where the address takes different forms: -// - frameindex,, +// - frameindex, +// - frameindex + offset, // - base + offset, // - simple (base address without offset). // These would usually be used together (via Storex_pat defined below), but @@ -3522,6 +3555,10 @@ let addrMode = BaseImmOffset, InputType = "imm" in { // AddedComplexity) to the individual patterns. 
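For illustration, a rough C++ sketch of the address-shape dispatch these
pattern families encode; all types and names here are hypothetical stand-ins,
not LLVM API (the real choice is made by pattern complexity during selection):

struct Addr {
  bool IsFrameIndex = false;   // bare frame index
  bool IsAdd = false;          // (base + constant offset)
  bool LhsIsFrameIndex = false;
  unsigned BaseReg = 0;
  int FrameIdx = 0;
  int Offset = 0;
};
struct BaseOff { bool BaseIsFI; unsigned Base; int Off; };

// Most specific shape first; the bare base register is the fallback.
BaseOff selectBaseOff(const Addr &A) {
  if (A.IsFrameIndex)               return {true,  (unsigned)A.FrameIdx, 0};
  if (A.IsAdd && A.LhsIsFrameIndex) return {true,  (unsigned)A.FrameIdx, A.Offset};
  if (A.IsAdd)                      return {false, A.BaseReg, A.Offset};
  return {false, A.BaseReg, 0};
}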
class Storex_fi_pat<PatFrag Store, PatFrag Value, InstHexagon MI> : Pat<(Store Value:$Rs, AddrFI:$fi), (MI AddrFI:$fi, 0, Value:$Rs)>; +class Storex_fi_add_pat<PatFrag Store, PatFrag Value, PatFrag ImmPred, + InstHexagon MI> + : Pat<(Store Value:$Rs, (add (i32 AddrFI:$fi), ImmPred:$Off)), + (MI AddrFI:$fi, imm:$Off, Value:$Rs)>; class Storex_add_pat<PatFrag Store, PatFrag Value, PatFrag ImmPred, InstHexagon MI> : Pat<(Store Value:$Rt, (add (i32 IntRegs:$Rs), ImmPred:$Off)), @@ -3537,6 +3574,10 @@ class Storexm_fi_pat<PatFrag Store, PatFrag Value, PatFrag ValueMod, InstHexagon MI> : Pat<(Store Value:$Rs, AddrFI:$fi), (MI AddrFI:$fi, 0, (ValueMod Value:$Rs))>; +class Storexm_fi_add_pat<PatFrag Store, PatFrag Value, PatFrag ImmPred, + PatFrag ValueMod, InstHexagon MI> + : Pat<(Store Value:$Rs, (add (i32 AddrFI:$fi), ImmPred:$Off)), + (MI AddrFI:$fi, imm:$Off, (ValueMod Value:$Rs))>; class Storexm_add_pat<PatFrag Store, PatFrag Value, PatFrag ImmPred, PatFrag ValueMod, InstHexagon MI> : Pat<(Store Value:$Rt, (add (i32 IntRegs:$Rs), ImmPred:$Off)), @@ -3548,14 +3589,16 @@ class Storexm_simple_pat<PatFrag Store, PatFrag Value, PatFrag ValueMod, multiclass Storex_pat<PatFrag Store, PatFrag Value, PatLeaf ImmPred, InstHexagon MI> { - def: Storex_fi_pat <Store, Value, MI>; - def: Storex_add_pat <Store, Value, ImmPred, MI>; + def: Storex_fi_pat <Store, Value, MI>; + def: Storex_fi_add_pat <Store, Value, ImmPred, MI>; + def: Storex_add_pat <Store, Value, ImmPred, MI>; } multiclass Storexm_pat<PatFrag Store, PatFrag Value, PatLeaf ImmPred, PatFrag ValueMod, InstHexagon MI> { - def: Storexm_fi_pat <Store, Value, ValueMod, MI>; - def: Storexm_add_pat <Store, Value, ImmPred, ValueMod, MI>; + def: Storexm_fi_pat <Store, Value, ValueMod, MI>; + def: Storexm_fi_add_pat <Store, Value, ImmPred, ValueMod, MI>; + def: Storexm_add_pat <Store, Value, ImmPred, ValueMod, MI>; } // Regular stores in the DAG have two operands: value and address. @@ -3567,15 +3610,15 @@ class SwapSt<PatFrag F> : PatFrag<(ops node:$val, node:$ptr), F.Fragment>; let AddedComplexity = 20 in { - defm: Storex_pat<truncstorei8, I32, s11_0ExtPred, S2_storerb_io>; - defm: Storex_pat<truncstorei16, I32, s11_1ExtPred, S2_storerh_io>; - defm: Storex_pat<store, I32, s11_2ExtPred, S2_storeri_io>; - defm: Storex_pat<store, I64, s11_3ExtPred, S2_storerd_io>; + defm: Storex_pat<truncstorei8, I32, s32_0ImmPred, S2_storerb_io>; + defm: Storex_pat<truncstorei16, I32, s31_1ImmPred, S2_storerh_io>; + defm: Storex_pat<store, I32, s30_2ImmPred, S2_storeri_io>; + defm: Storex_pat<store, I64, s29_3ImmPred, S2_storerd_io>; - defm: Storex_pat<SwapSt<atomic_store_8>, I32, s11_0ExtPred, S2_storerb_io>; - defm: Storex_pat<SwapSt<atomic_store_16>, I32, s11_1ExtPred, S2_storerh_io>; - defm: Storex_pat<SwapSt<atomic_store_32>, I32, s11_2ExtPred, S2_storeri_io>; - defm: Storex_pat<SwapSt<atomic_store_64>, I64, s11_3ExtPred, S2_storerd_io>; + defm: Storex_pat<SwapSt<atomic_store_8>, I32, s32_0ImmPred, S2_storerb_io>; + defm: Storex_pat<SwapSt<atomic_store_16>, I32, s31_1ImmPred, S2_storerh_io>; + defm: Storex_pat<SwapSt<atomic_store_32>, I32, s30_2ImmPred, S2_storeri_io>; + defm: Storex_pat<SwapSt<atomic_store_64>, I64, s29_3ImmPred, S2_storerd_io>; } // Simple patterns should be tried with the least priority. 
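// "Least priority" here means no AddedComplexity boost: a store to
// (Rs + #8) should match the base+offset patterns above rather than first
// computing Rs + 8 into a register and using the simple form.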
@@ -3590,9 +3633,9 @@ def: Storex_simple_pat<SwapSt<atomic_store_32>, I32, S2_storeri_io>; def: Storex_simple_pat<SwapSt<atomic_store_64>, I64, S2_storerd_io>; let AddedComplexity = 20 in { - defm: Storexm_pat<truncstorei8, I64, s11_0ExtPred, LoReg, S2_storerb_io>; - defm: Storexm_pat<truncstorei16, I64, s11_1ExtPred, LoReg, S2_storerh_io>; - defm: Storexm_pat<truncstorei32, I64, s11_2ExtPred, LoReg, S2_storeri_io>; + defm: Storexm_pat<truncstorei8, I64, s32_0ImmPred, LoReg, S2_storerb_io>; + defm: Storexm_pat<truncstorei16, I64, s31_1ImmPred, LoReg, S2_storerh_io>; + defm: Storexm_pat<truncstorei32, I64, s30_2ImmPred, LoReg, S2_storeri_io>; } def: Storexm_simple_pat<truncstorei8, I64, LoReg, S2_storerb_io>; @@ -4321,6 +4364,14 @@ def: Pat<(i1 (seteq (and (i32 IntRegs:$Rs), (i32 IntRegs:$Rt)), IntRegs:$Rt)), // XTYPE/PERM + //===----------------------------------------------------------------------===// +def: Pat<(or (or (shl (or (shl (i32 (extloadi8 (add (i32 IntRegs:$b), 3))), + (i32 8)), + (i32 (zextloadi8 (add (i32 IntRegs:$b), 2)))), + (i32 16)), + (shl (i32 (zextloadi8 (add (i32 IntRegs:$b), 1))), (i32 8))), + (zextloadi8 (i32 IntRegs:$b))), + (A2_swiz (L2_loadri_io IntRegs:$b, 0))>; + //===----------------------------------------------------------------------===// // XTYPE/PERM - //===----------------------------------------------------------------------===// @@ -4364,7 +4415,7 @@ def C2_pxfer_map: SInst<(outs PredRegs:$dst), (ins PredRegs:$src), // Patterns for loads of i1: def: Pat<(i1 (load AddrFI:$fi)), (C2_tfrrp (L2_loadrub_io AddrFI:$fi, 0))>; -def: Pat<(i1 (load (add (i32 IntRegs:$Rs), s11_0ExtPred:$Off))), +def: Pat<(i1 (load (add (i32 IntRegs:$Rs), s32ImmPred:$Off))), (C2_tfrrp (L2_loadrub_io IntRegs:$Rs, imm:$Off))>; def: Pat<(i1 (load (i32 IntRegs:$Rs))), (C2_tfrrp (L2_loadrub_io IntRegs:$Rs, 0))>; @@ -4375,7 +4426,7 @@ def I1toI32: OutPatFrag<(ops node:$Rs), def I32toI1: OutPatFrag<(ops node:$Rs), (i1 (C2_tfrrp (i32 $Rs)))>; -defm: Storexm_pat<store, I1, s11_0ExtPred, I1toI32, S2_storerb_io>; +defm: Storexm_pat<store, I1, s32ImmPred, I1toI32, S2_storerb_io>; def: Storexm_simple_pat<store, I1, I1toI32, S2_storerb_io>; //===----------------------------------------------------------------------===// @@ -4474,6 +4525,12 @@ def Y2_barrier : SYSInst<(outs), (ins), //===----------------------------------------------------------------------===// // SYSTEM/SUPER - //===----------------------------------------------------------------------===// + +// Generate frameindex addresses. +let isMoveImm = 1, isAsCheapAsAMove = 1, isReMaterializable = 1, + isPseudo = 1, isCodeGenOnly = 1, hasSideEffects = 0 in +def TFR_FI: ALU32_ri<(outs IntRegs:$Rd), (ins IntRegs:$fi, s32Imm:$Off), "">; + //===----------------------------------------------------------------------===// // CRUSER - Type. 
//===----------------------------------------------------------------------===// @@ -4519,6 +4576,11 @@ class LOOP_rBase<string mnemonic, Operand brOp, bit mustExtend = 0> multiclass LOOP_ri<string mnemonic> { def i : LOOP_iBase<mnemonic, brtarget>; def r : LOOP_rBase<mnemonic, brtarget>; + + let isCodeGenOnly = 1, isExtended = 1, opExtendable = 0 in { + def iext: LOOP_iBase<mnemonic, brtargetExt, 1>; + def rext: LOOP_rBase<mnemonic, brtargetExt, 1>; + } } @@ -4676,36 +4738,6 @@ def Y4_trace: CRInst <(outs), (ins IntRegs:$Rs), let Inst{20-16} = Rs; } -let AddedComplexity = 100, isPredicated = 1, isCodeGenOnly = 1 in -def TFR_condset_ri : ALU32_rr<(outs IntRegs:$dst), - (ins PredRegs:$src1, IntRegs:$src2, s12Imm:$src3), - "Error; should not emit", - [(set (i32 IntRegs:$dst), - (i32 (select (i1 PredRegs:$src1), (i32 IntRegs:$src2), - s12ImmPred:$src3)))]>; - -let AddedComplexity = 100, isPredicated = 1, isCodeGenOnly = 1 in -def TFR_condset_ir : ALU32_rr<(outs IntRegs:$dst), - (ins PredRegs:$src1, s12Imm:$src2, IntRegs:$src3), - "Error; should not emit", - [(set (i32 IntRegs:$dst), - (i32 (select (i1 PredRegs:$src1), s12ImmPred:$src2, - (i32 IntRegs:$src3))))]>; - -let AddedComplexity = 100, isPredicated = 1, isCodeGenOnly = 1 in -def TFR_condset_ii : ALU32_rr<(outs IntRegs:$dst), - (ins PredRegs:$src1, s12Imm:$src2, s12Imm:$src3), - "Error; should not emit", - [(set (i32 IntRegs:$dst), - (i32 (select (i1 PredRegs:$src1), s12ImmPred:$src2, - s12ImmPred:$src3)))]>; - -// Generate frameindex addresses. -let isReMaterializable = 1, isCodeGenOnly = 1 in -def TFR_FI : ALU32_ri<(outs IntRegs:$dst), (ins FrameIndex:$src1), - "$dst = add($src1)", - [(set (i32 IntRegs:$dst), ADDRri:$src1)]>; - // Support for generating global address. // Taken from X86InstrInfo.td. 
def SDTHexagonCONST32 : SDTypeProfile<1, 1, [SDTCisVT<0, i32>, @@ -4750,30 +4782,29 @@ def HI_PIC : ALU32_ri<(outs IntRegs:$dst), (ins bblabel:$label), "$dst.h = #HI($label@GOTREL)", []>; -let isReMaterializable = 1, isMoveImm = 1, hasSideEffects = 0, - isAsmParserOnly = 1 in -def LOi : ALU32_ri<(outs IntRegs:$dst), (ins i32imm:$imm_value), - "$dst.l = #LO($imm_value)", - []>; - +let isReMaterializable = 1, isMoveImm = 1, + isCodeGenOnly = 1, hasSideEffects = 0 in +def HI_GOT : ALU32_ri<(outs IntRegs:$dst), (ins globaladdress:$global), + "$dst.h = #HI($global@GOT)", + []>; -let isReMaterializable = 1, isMoveImm = 1, hasSideEffects = 0, - isAsmParserOnly = 1 in -def HIi : ALU32_ri<(outs IntRegs:$dst), (ins i32imm:$imm_value), - "$dst.h = #HI($imm_value)", - []>; +let isReMaterializable = 1, isMoveImm = 1, + isCodeGenOnly = 1, hasSideEffects = 0 in +def LO_GOT : ALU32_ri<(outs IntRegs:$dst), (ins globaladdress:$global), + "$dst.l = #LO($global@GOT)", + []>; -let isReMaterializable = 1, isMoveImm = 1, hasSideEffects = 0, - isAsmParserOnly = 1 in -def LO_jt : ALU32_ri<(outs IntRegs:$dst), (ins jumptablebase:$jt), - "$dst.l = #LO($jt)", - []>; +let isReMaterializable = 1, isMoveImm = 1, + isCodeGenOnly = 1, hasSideEffects = 0 in +def HI_GOTREL : ALU32_ri<(outs IntRegs:$dst), (ins globaladdress:$global), + "$dst.h = #HI($global@GOTREL)", + []>; -let isReMaterializable = 1, isMoveImm = 1, hasSideEffects = 0, - isAsmParserOnly = 1 in -def HI_jt : ALU32_ri<(outs IntRegs:$dst), (ins jumptablebase:$jt), - "$dst.h = #HI($jt)", - []>; +let isReMaterializable = 1, isMoveImm = 1, + isCodeGenOnly = 1, hasSideEffects = 0 in +def LO_GOTREL : ALU32_ri<(outs IntRegs:$dst), (ins globaladdress:$global), + "$dst.l = #LO($global@GOTREL)", + []>; // This pattern is incorrect. When we add small data, we should change // this pattern to use memw(#foo). @@ -4785,31 +4816,19 @@ def CONST32 : CONSTLDInst<(outs IntRegs:$dst), (ins globaladdress:$global), (load (HexagonCONST32 tglobaltlsaddr:$global)))]>; let isReMaterializable = 1, isMoveImm = 1 in -def CONST32_set : LDInst2<(outs IntRegs:$dst), (ins globaladdress:$global), - "$dst = CONST32(#$global)", - [(set (i32 IntRegs:$dst), - (HexagonCONST32 tglobaladdr:$global))]>; - -let isReMaterializable = 1, isMoveImm = 1, isAsmParserOnly = 1 in def CONST32_set_jt : CONSTLDInst<(outs IntRegs:$dst), (ins jumptablebase:$jt), "$dst = CONST32(#$jt)", [(set (i32 IntRegs:$dst), (HexagonCONST32 tjumptable:$jt))]>; let isReMaterializable = 1, isMoveImm = 1, isAsmParserOnly = 1 in -def CONST32GP_set : LDInst2<(outs IntRegs:$dst), (ins globaladdress:$global), - "$dst = CONST32(#$global)", - [(set (i32 IntRegs:$dst), - (HexagonCONST32_GP tglobaladdr:$global))]>; - -let isReMaterializable = 1, isMoveImm = 1, isAsmParserOnly = 1 in def CONST32_Int_Real : CONSTLDInst<(outs IntRegs:$dst), (ins i32imm:$global), "$dst = CONST32(#$global)", [(set (i32 IntRegs:$dst), imm:$global) ]>; -// Map BlockAddress lowering to CONST32_Int_Real -def : Pat<(HexagonCONST32_GP tblockaddress:$addr), - (CONST32_Int_Real tblockaddress:$addr)>; +// Map TLS addressses to a CONST32 instruction +def: Pat<(HexagonCONST32 tglobaltlsaddr:$addr), (A2_tfrsi s16Ext:$addr)>; +def: Pat<(HexagonCONST32 bbl:$label), (A2_tfrsi s16Ext:$label)>; let isReMaterializable = 1, isMoveImm = 1, isAsmParserOnly = 1 in def CONST32_Label : LDInst2<(outs IntRegs:$dst), (ins bblabel:$label), @@ -4869,21 +4888,17 @@ let isPseudo = 1, isCall = 1, isReturn = 1, isBarrier = 1, isPredicable = 0, def TCRETURNr : T_JMPr; // Direct tail-calls. 
-let isCall = 1, isReturn = 1, isBarrier = 1, isPredicable = 0, -isTerminator = 1, isCodeGenOnly = 1 in { - def TCRETURNtg : JInst<(outs), (ins calltarget:$dst), "jump $dst", - [], "", J_tc_2early_SLOT23>; - def TCRETURNtext : JInst<(outs), (ins calltarget:$dst), "jump $dst", - [], "", J_tc_2early_SLOT23>; -} +let isPseudo = 1, isCall = 1, isReturn = 1, isBarrier = 1, isPredicable = 0, + isTerminator = 1, isCodeGenOnly = 1 in +def TCRETURNi : JInst<(outs), (ins calltarget:$dst), "", []>; //Tail calls. def: Pat<(HexagonTCRet tglobaladdr:$dst), - (TCRETURNtg tglobaladdr:$dst)>; + (TCRETURNi tglobaladdr:$dst)>; def: Pat<(HexagonTCRet texternalsym:$dst), - (TCRETURNtext texternalsym:$dst)>; + (TCRETURNi texternalsym:$dst)>; def: Pat<(HexagonTCRet (i32 IntRegs:$dst)), - (TCRETURNr (i32 IntRegs:$dst))>; + (TCRETURNr IntRegs:$dst)>; // Map from r0 = and(r1, 65535) to r0 = zxth(r1) def: Pat<(and (i32 IntRegs:$src1), 65535), @@ -4900,19 +4915,19 @@ def: Pat<(add (i1 PredRegs:$src1), -1), (C2_not PredRegs:$src1)>; // Map from p0 = pnot(p0); r0 = mux(p0, #i, #j) => r0 = mux(p0, #j, #i). -def: Pat<(select (not (i1 PredRegs:$src1)), s8ImmPred:$src2, s8ExtPred:$src3), - (C2_muxii PredRegs:$src1, s8ExtPred:$src3, s8ImmPred:$src2)>; +def: Pat<(select (not (i1 PredRegs:$src1)), s8ImmPred:$src2, s32ImmPred:$src3), + (C2_muxii PredRegs:$src1, s32ImmPred:$src3, s8ImmPred:$src2)>; // Map from p0 = pnot(p0); r0 = select(p0, #i, r1) // => r0 = C2_muxir(p0, r1, #i) -def: Pat<(select (not (i1 PredRegs:$src1)), s8ExtPred:$src2, +def: Pat<(select (not (i1 PredRegs:$src1)), s32ImmPred:$src2, (i32 IntRegs:$src3)), - (C2_muxir PredRegs:$src1, IntRegs:$src3, s8ExtPred:$src2)>; + (C2_muxir PredRegs:$src1, IntRegs:$src3, s32ImmPred:$src2)>; // Map from p0 = pnot(p0); r0 = mux(p0, r1, #i) // => r0 = C2_muxri (p0, #i, r1) -def: Pat<(select (not (i1 PredRegs:$src1)), IntRegs:$src2, s8ExtPred:$src3), - (C2_muxri PredRegs:$src1, s8ExtPred:$src3, IntRegs:$src2)>; +def: Pat<(select (not (i1 PredRegs:$src1)), IntRegs:$src2, s32ImmPred:$src3), + (C2_muxri PredRegs:$src1, s32ImmPred:$src3, IntRegs:$src2)>; // Map from p0 = pnot(p0); if (p0) jump => if (!p0) jump. def: Pat<(brcond (not (i1 PredRegs:$src1)), bb:$offset), @@ -4952,26 +4967,6 @@ def: Pat<(brcond (i1 (setlt (i32 IntRegs:$src1), s8ImmPred:$src2)), bb:$offset), (J2_jumpf (C2_cmpgti IntRegs:$src1, (DEC_CONST_SIGNED s8ImmPred:$src2)), bb:$offset)>; -// cmp.lt(r0, r1) -> cmp.gt(r1, r0) -def : Pat <(brcond (i1 (setlt (i32 IntRegs:$src1), (i32 IntRegs:$src2))), - bb:$offset), - (J2_jumpt (C2_cmpgt (i32 IntRegs:$src2), (i32 IntRegs:$src1)), bb:$offset)>; - -def : Pat <(brcond (i1 (setuge (i64 DoubleRegs:$src1), (i64 DoubleRegs:$src2))), - bb:$offset), - (J2_jumpf (C2_cmpgtup (i64 DoubleRegs:$src2), (i64 DoubleRegs:$src1)), - bb:$offset)>; - -def : Pat <(brcond (i1 (setule (i32 IntRegs:$src1), (i32 IntRegs:$src2))), - bb:$offset), - (J2_jumpf (C2_cmpgtu (i32 IntRegs:$src1), (i32 IntRegs:$src2)), - bb:$offset)>; - -def : Pat <(brcond (i1 (setule (i64 DoubleRegs:$src1), (i64 DoubleRegs:$src2))), - bb:$offset), - (J2_jumpf (C2_cmpgtup (i64 DoubleRegs:$src1), (i64 DoubleRegs:$src2)), - bb:$offset)>; - // Map from a 64-bit select to an emulated 64-bit mux. // Hexagon does not support 64-bit MUXes; so emulate with combines. 
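// Concretely: hi = mux(Pu, Rs.hi, Rt.hi), lo = mux(Pu, Rs.lo, Rt.lo),
// Rdd = combine(hi, lo); two conditional 32-bit transfers plus a combine
// stand in for the missing 64-bit mux.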
def: Pat<(select (i1 PredRegs:$src1), (i64 DoubleRegs:$src2), @@ -4987,10 +4982,6 @@ def: Pat<(select (i1 PredRegs:$src1), (i1 PredRegs:$src2), (i1 PredRegs:$src3)), (C2_or (C2_and PredRegs:$src1, PredRegs:$src2), (C2_and (C2_not PredRegs:$src1), PredRegs:$src3))>; -// Map Pd = load(addr) -> Rs = load(addr); Pd = Rs. -def : Pat<(i1 (load ADDRriS11_2:$addr)), - (i1 (C2_tfrrp (i32 (L2_loadrb_io AddrFI:$addr, 0))))>; - // Map for truncating from 64 immediates to 32 bit immediates. def: Pat<(i32 (trunc (i64 DoubleRegs:$src))), (LoReg DoubleRegs:$src)>; @@ -4999,42 +4990,10 @@ def: Pat<(i32 (trunc (i64 DoubleRegs:$src))), def: Pat<(i1 (trunc (i64 DoubleRegs:$src))), (C2_tfrrp (LoReg DoubleRegs:$src))>; -// Map memb(Rs) = Rdd -> memb(Rs) = Rt. -def : Pat<(truncstorei8 (i64 DoubleRegs:$src), ADDRriS11_0:$addr), - (S2_storerb_io AddrFI:$addr, 0, (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src), - subreg_loreg)))>; - -// Map memh(Rs) = Rdd -> memh(Rs) = Rt. -def : Pat<(truncstorei16 (i64 DoubleRegs:$src), ADDRriS11_0:$addr), - (S2_storerh_io AddrFI:$addr, 0, (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src), - subreg_loreg)))>; -// Map memw(Rs) = Rdd -> memw(Rs) = Rt -def : Pat<(truncstorei32 (i64 DoubleRegs:$src), ADDRriS11_0:$addr), - (S2_storeri_io AddrFI:$addr, 0, (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src), - subreg_loreg)))>; - -// Map memw(Rs) = Rdd -> memw(Rs) = Rt. -def : Pat<(truncstorei32 (i64 DoubleRegs:$src), ADDRriS11_0:$addr), - (S2_storeri_io AddrFI:$addr, 0, (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src), - subreg_loreg)))>; - -// Map from i1 = constant<-1>; memw(addr) = i1 -> r0 = 1; memw(addr) = r0. -def : Pat<(store (i1 -1), ADDRriS11_2:$addr), - (S2_storerb_io AddrFI:$addr, 0, (A2_tfrsi 1))>; - - -// Map from i1 = constant<-1>; store i1 -> r0 = 1; store r0. -def : Pat<(store (i1 -1), ADDRriS11_2:$addr), - (S2_storerb_io AddrFI:$addr, 0, (A2_tfrsi 1))>; - -// Map from memb(Rs) = Pd -> Rt = mux(Pd, #0, #1); store Rt. -def : Pat<(store (i1 PredRegs:$src1), ADDRriS11_2:$addr), - (S2_storerb_io AddrFI:$addr, 0, (i32 (C2_muxii (i1 PredRegs:$src1), 1, 0)) )>; - // rs <= rt -> !(rs > rt). let AddedComplexity = 30 in -def: Pat<(i1 (setle (i32 IntRegs:$src1), s10ExtPred:$src2)), - (C2_not (C2_cmpgti IntRegs:$src1, s10ExtPred:$src2))>; +def: Pat<(i1 (setle (i32 IntRegs:$src1), s32ImmPred:$src2)), + (C2_not (C2_cmpgti IntRegs:$src1, s32ImmPred:$src2))>; // rs <= rt -> !(rs > rt). def : Pat<(i1 (setle (i32 IntRegs:$src1), (i32 IntRegs:$src2))), @@ -5048,13 +5007,8 @@ def: Pat<(i1 (setle (i64 DoubleRegs:$src1), (i64 DoubleRegs:$src2))), // Hexagon_TODO: We should improve on this. // rs != rt -> !(rs == rt). let AddedComplexity = 30 in -def: Pat<(i1 (setne (i32 IntRegs:$src1), s10ExtPred:$src2)), - (C2_not (C2_cmpeqi IntRegs:$src1, s10ExtPred:$src2))>; - -// Map cmpne(Rs) -> !cmpeqe(Rs). -// rs != rt -> !(rs == rt). -def : Pat <(i1 (setne (i32 IntRegs:$src1), (i32 IntRegs:$src2))), - (i1 (C2_not (i1 (C2_cmpeq (i32 IntRegs:$src1), (i32 IntRegs:$src2)))))>; +def: Pat<(i1 (setne (i32 IntRegs:$src1), s32ImmPred:$src2)), + (C2_not (C2_cmpeqi IntRegs:$src1, s32ImmPred:$src2))>; // Convert setne back to xor for hexagon since we compute w/ pred registers. 
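// On i1 this is exact: a != b and a ^ b are both true precisely when the
// two predicate bits differ, so the xor form needs no extra compare.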
def: Pat<(i1 (setne (i1 PredRegs:$src1), (i1 PredRegs:$src2))), @@ -5072,8 +5026,8 @@ def : Pat <(i1 (setge (i32 IntRegs:$src1), (i32 IntRegs:$src2))), // cmpge(Rs, Imm) -> cmpgt(Rs, Imm-1) let AddedComplexity = 30 in -def: Pat<(i1 (setge (i32 IntRegs:$src1), s8ExtPred:$src2)), - (C2_cmpgti IntRegs:$src1, (DEC_CONST_SIGNED s8ExtPred:$src2))>; +def: Pat<(i1 (setge (i32 IntRegs:$src1), s32ImmPred:$src2)), + (C2_cmpgti IntRegs:$src1, (DEC_CONST_SIGNED s32ImmPred:$src2))>; // Map cmpge(Rss, Rtt) -> !cmpgt(Rtt, Rss). // rss >= rtt -> !(rtt > rss). @@ -5084,20 +5038,21 @@ def: Pat<(i1 (setge (i64 DoubleRegs:$src1), (i64 DoubleRegs:$src2))), // !cmpge(Rs, Imm) -> !cmpgt(Rs, Imm-1). // rs < rt -> !(rs >= rt). let AddedComplexity = 30 in -def: Pat<(i1 (setlt (i32 IntRegs:$src1), s8ExtPred:$src2)), - (C2_not (C2_cmpgti IntRegs:$src1, (DEC_CONST_SIGNED s8ExtPred:$src2)))>; +def: Pat<(i1 (setlt (i32 IntRegs:$src1), s32ImmPred:$src2)), + (C2_not (C2_cmpgti IntRegs:$src1, + (DEC_CONST_SIGNED s32ImmPred:$src2)))>; // Generate cmpgeu(Rs, #0) -> cmpeq(Rs, Rs) def: Pat<(i1 (setuge (i32 IntRegs:$src1), 0)), (C2_cmpeq IntRegs:$src1, IntRegs:$src1)>; // Generate cmpgeu(Rs, #u8) -> cmpgtu(Rs, #u8 -1) -def: Pat<(i1 (setuge (i32 IntRegs:$src1), u8ExtPred:$src2)), - (C2_cmpgtui IntRegs:$src1, (DEC_CONST_UNSIGNED u8ExtPred:$src2))>; +def: Pat<(i1 (setuge (i32 IntRegs:$src1), u32ImmPred:$src2)), + (C2_cmpgtui IntRegs:$src1, (DEC_CONST_UNSIGNED u32ImmPred:$src2))>; // Generate cmpgtu(Rs, #u9) -def: Pat<(i1 (setugt (i32 IntRegs:$src1), u9ExtPred:$src2)), - (C2_cmpgtui IntRegs:$src1, u9ExtPred:$src2)>; +def: Pat<(i1 (setugt (i32 IntRegs:$src1), u32ImmPred:$src2)), + (C2_cmpgtui IntRegs:$src1, u32ImmPred:$src2)>; // Map from Rs >= Rt -> !(Rt > Rs). // rs >= rt -> !(rt > rs). @@ -5118,11 +5073,6 @@ def: Pat<(i32 (sext (i1 PredRegs:$src1))), def: Pat<(i64 (sext (i1 PredRegs:$src1))), (A2_combinew (A2_tfrsi -1), (C2_muxii PredRegs:$src1, -1, 0))>; -// Convert sign-extended load back to load and sign extend. -// i32 -> i64 -def: Pat <(i64 (sextloadi32 ADDRriS11_2:$src1)), - (i64 (A2_sxtw (L2_loadri_io AddrFI:$src1, 0)))>; - // Zero extends. // i1 -> i32 def: Pat<(i32 (zext (i1 PredRegs:$src1))), @@ -5136,12 +5086,6 @@ def: Pat<(i32 (anyext (i1 PredRegs:$src1))), def: Pat<(i64 (anyext (i1 PredRegs:$src1))), (A2_sxtw (C2_muxii PredRegs:$src1, 1, 0))>; -def: Pat<(i64 (or (i64 (shl (i64 DoubleRegs:$srcHigh), - (i32 32))), - (i64 (zextloadi32 ADDRriS11_2:$srcLow)))), - (i64 (A2_combinew (EXTRACT_SUBREG (i64 DoubleRegs:$srcHigh), subreg_loreg), - (L2_loadri_io AddrFI:$srcLow, 0)))>; - // Multiply 64-bit unsigned and use upper result. 
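// The arithmetic behind "use upper result": writing a = (AH:AL) and
// b = (BH:BL) as 32-bit halves,
//   mulhu(a, b) = AH*BH + (AH*BL >> 32) + (AL*BH >> 32) + C, where
//   C = ((AL*BL >> 32) + lo32(AH*BL) + lo32(AL*BH)) >> 32,
// i.e. the top 64 bits of the 128-bit product built from four 32x32->64
// partial products.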
def : Pat <(mulhu (i64 DoubleRegs:$src1), (i64 DoubleRegs:$src2)), (A2_addp @@ -5186,10 +5130,13 @@ let AddedComplexity = 100 in def: Pat<(i32 (sext_inreg (Hexagon_ARGEXTEND (i32 IntRegs:$src1)), i16)), (i32 IntRegs:$src1)>; -def HexagonWrapperJT: SDNode<"HexagonISD::WrapperJT", SDTIntUnaryOp>; +def HexagonJT: SDNode<"HexagonISD::JT", SDTIntUnaryOp>; +def HexagonCP: SDNode<"HexagonISD::CP", SDTIntUnaryOp>; -def : Pat<(HexagonWrapperJT tjumptable:$dst), - (i32 (CONST32_set_jt tjumptable:$dst))>; +def: Pat<(HexagonJT tjumptable:$dst), + (CONST32_set_jt tjumptable:$dst)>; +def: Pat<(HexagonCP tconstpool :$dst), + (CONST32_set_jt tconstpool:$dst)>; // XTYPE/SHIFT // @@ -5626,6 +5573,43 @@ let hasNewValue = 1 in { def S2_insertp_rp : T_S3op_insert<"insert", DoubleRegs>; def S2_insertp : T_S2op_insert <0b0011, DoubleRegs, u6Imm>; + +def SDTHexagonINSERT_ri : SDTypeProfile<1, 4, [SDTCisVT<0, i32>, + SDTCisVT<1, i32>, + SDTCisVT<2, i32>, + SDTCisVT<3, i32>, + SDTCisVT<4, i32>]>; +def SDTHexagonINSERT_rd : SDTypeProfile<1, 4, [SDTCisVT<0, i64>, + SDTCisVT<1, i64>, + SDTCisVT<2, i64>, + SDTCisVT<3, i32>, + SDTCisVT<4, i32>]>; +def SDTHexagonINSERT_riv : SDTypeProfile<1, 3, [SDTCisVT<0, i32>, + SDTCisVT<1, i32>, + SDTCisVT<2, i32>, + SDTCisVT<3, i64>]>; +def SDTHexagonINSERT_rdv : SDTypeProfile<1, 3, [SDTCisVT<0, i64>, + SDTCisVT<1, i64>, + SDTCisVT<2, i64>, + SDTCisVT<3, i64>]>; +def HexagonINSERT_ri : SDNode<"HexagonISD::INSERT_ri", SDTHexagonINSERT_ri>; +def HexagonINSERT_rd : SDNode<"HexagonISD::INSERT_rd", SDTHexagonINSERT_rd>; +def HexagonINSERT_riv: SDNode<"HexagonISD::INSERT_riv", SDTHexagonINSERT_riv>; +def HexagonINSERT_rdv: SDNode<"HexagonISD::INSERT_rdv", SDTHexagonINSERT_rdv>; + +def: Pat<(HexagonINSERT_ri I32:$Rs, I32:$Rt, u5ImmPred:$u1, u5ImmPred:$u2), + (S2_insert I32:$Rs, I32:$Rt, u5ImmPred:$u1, u5ImmPred:$u2)>; + +def: Pat<(HexagonINSERT_rd I64:$Rs, I64:$Rt, u6ImmPred:$u1, u6ImmPred:$u2), + (S2_insertp I64:$Rs, I64:$Rt, u6ImmPred:$u1, u6ImmPred:$u2)>; + +def: Pat<(HexagonINSERT_riv I32:$Rs, I32:$Rt, I64:$Ru), + (S2_insert_rp I32:$Rs, I32:$Rt, I64:$Ru)>; + +def: Pat<(HexagonINSERT_rdv I64:$Rs, I64:$Rt, I64:$Ru), + (S2_insertp_rp I64:$Rs, I64:$Rt, I64:$Ru)>; + + //===----------------------------------------------------------------------===// // Template class for 'extract bitfield' instructions //===----------------------------------------------------------------------===// @@ -5692,6 +5676,37 @@ let hasNewValue = 1 in { def S2_extractu : T_S2op_extract <"extractu", 0b1101, IntRegs, u5Imm>; } +def SDTHexagonEXTRACTU_ri : SDTypeProfile<1, 3, [SDTCisVT<0, i32>, + SDTCisVT<1, i32>, + SDTCisVT<2, i32>, + SDTCisVT<3, i32>]>; +def SDTHexagonEXTRACTU_rd : SDTypeProfile<1, 3, [SDTCisVT<0, i64>, + SDTCisVT<1, i64>, + SDTCisVT<2, i32>, + SDTCisVT<3, i32>]>; +def SDTHexagonEXTRACTU_riv : SDTypeProfile<1, 2, [SDTCisVT<0, i32>, + SDTCisVT<1, i32>, + SDTCisVT<2, i64>]>; +def SDTHexagonEXTRACTU_rdv : SDTypeProfile<1, 2, [SDTCisVT<0, i64>, + SDTCisVT<1, i64>, + SDTCisVT<2, i64>]>; +def HexagonEXTRACTU_ri : SDNode<"HexagonISD::EXTRACTU_ri", SDTHexagonEXTRACTU_ri>; +def HexagonEXTRACTU_rd : SDNode<"HexagonISD::EXTRACTU_rd", SDTHexagonEXTRACTU_rd>; +def HexagonEXTRACTU_riv: SDNode<"HexagonISD::EXTRACTU_riv", SDTHexagonEXTRACTU_riv>; +def HexagonEXTRACTU_rdv: SDNode<"HexagonISD::EXTRACTU_rdv", SDTHexagonEXTRACTU_rdv>; + +def: Pat<(HexagonEXTRACTU_ri I32:$src1, u5ImmPred:$src2, u5ImmPred:$src3), + (S2_extractu I32:$src1, u5ImmPred:$src2, u5ImmPred:$src3)>; + +def: Pat<(HexagonEXTRACTU_rd I64:$src1, 
u6ImmPred:$src2, u6ImmPred:$src3), + (S2_extractup I64:$src1, u6ImmPred:$src2, u6ImmPred:$src3)>; + +def: Pat<(HexagonEXTRACTU_riv I32:$src1, I64:$src2), + (S2_extractu_rp I32:$src1, I64:$src2)>; + +def: Pat<(HexagonEXTRACTU_rdv I64:$src1, I64:$src2), + (S2_extractup_rp I64:$src1, I64:$src2)>; + // Change the sign of the immediate for Rd=-mpyi(Rs,#u8) def: Pat<(mul (i32 IntRegs:$src1), (ineg n8ImmPred:$src2)), (M2_mpysin IntRegs:$src1, u8ImmPred:$src2)>; @@ -5728,6 +5743,22 @@ def S2_tableidxw : tableidxRaw<"tableidxw", 0b10>; def S2_tableidxd : tableidxRaw<"tableidxd", 0b11>; //===----------------------------------------------------------------------===// +// Template class for 'table index' instructions which are assembler mapped +// to their :raw format. +//===----------------------------------------------------------------------===// +let isPseudo = 1 in +class tableidx_goodsyntax <string mnemonic> + : SInst <(outs IntRegs:$Rx), + (ins IntRegs:$_dst_, IntRegs:$Rs, u4Imm:$u4, u5Imm:$u5), + "$Rx = "#mnemonic#"($Rs, #$u4, #$u5)", + [], "$Rx = $_dst_" >; + +def S2_tableidxb_goodsyntax : tableidx_goodsyntax<"tableidxb">; +def S2_tableidxh_goodsyntax : tableidx_goodsyntax<"tableidxh">; +def S2_tableidxw_goodsyntax : tableidx_goodsyntax<"tableidxw">; +def S2_tableidxd_goodsyntax : tableidx_goodsyntax<"tableidxd">; + +//===----------------------------------------------------------------------===// // V3 Instructions + //===----------------------------------------------------------------------===// @@ -5761,4 +5792,4 @@ include "HexagonInstrInfoV5.td" // ALU32/64/Vector + //===----------------------------------------------------------------------===/// -include "HexagonInstrInfoVector.td"
\ No newline at end of file +include "HexagonInstrInfoVector.td" diff --git a/lib/Target/Hexagon/HexagonInstrInfoV4.td b/lib/Target/Hexagon/HexagonInstrInfoV4.td index 0e4dde3..918b482 100644 --- a/lib/Target/Hexagon/HexagonInstrInfoV4.td +++ b/lib/Target/Hexagon/HexagonInstrInfoV4.td @@ -11,6 +11,25 @@ // //===----------------------------------------------------------------------===// +def DuplexIClass0: InstDuplex < 0 >; +def DuplexIClass1: InstDuplex < 1 >; +def DuplexIClass2: InstDuplex < 2 >; +let isExtendable = 1 in { + def DuplexIClass3: InstDuplex < 3 >; + def DuplexIClass4: InstDuplex < 4 >; + def DuplexIClass5: InstDuplex < 5 >; + def DuplexIClass6: InstDuplex < 6 >; + def DuplexIClass7: InstDuplex < 7 >; +} +def DuplexIClass8: InstDuplex < 8 >; +def DuplexIClass9: InstDuplex < 9 >; +def DuplexIClassA: InstDuplex < 0xA >; +def DuplexIClassB: InstDuplex < 0xB >; +def DuplexIClassC: InstDuplex < 0xC >; +def DuplexIClassD: InstDuplex < 0xD >; +def DuplexIClassE: InstDuplex < 0xE >; +def DuplexIClassF: InstDuplex < 0xF >; + def addrga: PatLeaf<(i32 AddrGA:$Addr)>; def addrgp: PatLeaf<(i32 AddrGP:$Addr)>; @@ -137,6 +156,9 @@ def: T_cmp32_rr_pat<A4_rcmpeq, CmpInReg<seteq>, i32>; def: T_cmp32_rr_pat<A4_rcmpneq, CmpInReg<setne>, i32>; def: T_cmp32_rr_pat<C4_cmpneq, setne, i1>; +def: T_cmp32_rr_pat<C4_cmplteu, setule, i1>; + +def: T_cmp32_rr_pat<C4_cmplteu, RevCmp<setuge>, i1>; class T_CMP_rrbh<string mnemonic, bits<3> MinOp, bit IsComm> : SInst<(outs PredRegs:$Pd), (ins IntRegs:$Rs, IntRegs:$Rt), @@ -247,10 +269,10 @@ class T_RCMP_EQ_ri<string mnemonic, bit IsNeg> def A4_rcmpeqi : T_RCMP_EQ_ri<"cmp.eq", 0>; def A4_rcmpneqi : T_RCMP_EQ_ri<"!cmp.eq", 1>; -def: Pat<(i32 (zext (i1 (seteq (i32 IntRegs:$Rs), s8ExtPred:$s8)))), - (A4_rcmpeqi IntRegs:$Rs, s8ExtPred:$s8)>; -def: Pat<(i32 (zext (i1 (setne (i32 IntRegs:$Rs), s8ExtPred:$s8)))), - (A4_rcmpneqi IntRegs:$Rs, s8ExtPred:$s8)>; +def: Pat<(i32 (zext (i1 (seteq (i32 IntRegs:$Rs), s32ImmPred:$s8)))), + (A4_rcmpeqi IntRegs:$Rs, s32ImmPred:$s8)>; +def: Pat<(i32 (zext (i1 (setne (i32 IntRegs:$Rs), s32ImmPred:$s8)))), + (A4_rcmpneqi IntRegs:$Rs, s32ImmPred:$s8)>; // Preserve the S2_tstbit_r generation def: Pat<(i32 (zext (i1 (setne (i32 (and (i32 (shl 1, (i32 IntRegs:$src2))), @@ -292,16 +314,15 @@ let opExtendable = 1 in def A4_combineir : T_Combine1<0b01, (ins s8Ext:$s8, IntRegs:$Rs), "$Rdd = combine(#$s8, $Rs)">; -def HexagonWrapperCombineRI_V4 : - SDNode<"HexagonISD::WrapperCombineRI_V4", SDTHexagonI64I32I32>; -def HexagonWrapperCombineIR_V4 : - SDNode<"HexagonISD::WrapperCombineIR_V4", SDTHexagonI64I32I32>; +// The complexity of the combines involving immediates should be greater +// than the complexity of the combine with two registers. +let AddedComplexity = 50 in { +def: Pat<(HexagonCOMBINE IntRegs:$r, s32ImmPred:$i), + (A4_combineri IntRegs:$r, s32ImmPred:$i)>; -def : Pat <(HexagonWrapperCombineRI_V4 IntRegs:$r, s8ExtPred:$i), - (A4_combineri IntRegs:$r, s8ExtPred:$i)>; - -def : Pat <(HexagonWrapperCombineIR_V4 s8ExtPred:$i, IntRegs:$r), - (A4_combineir s8ExtPred:$i, IntRegs:$r)>; +def: Pat<(HexagonCOMBINE s32ImmPred:$i, IntRegs:$r), + (A4_combineir s32ImmPred:$i, IntRegs:$r)>; +} // A4_combineii: Set two small immediates. let hasSideEffects = 0, isExtendable = 1, opExtentBits = 6, opExtendable = 2 in @@ -322,7 +343,7 @@ def A4_combineii: ALU32Inst<(outs DoubleRegs:$Rdd), (ins s8Imm:$s8, u6Ext:$U6), // The complexity of the combine with two immediates should be greater than // the complexity of a combine involving a register. 
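// For example, combine(#2, #10) is matched both by the two-immediate
// pattern below and, once #2 is materialized into a register, by the
// reg/imm patterns above (complexity 50); the value 75 makes selection
// try the two-immediate form first.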
let AddedComplexity = 75 in -def: Pat<(HexagonCOMBINE s8ImmPred:$s8, u6ExtPred:$u6), +def: Pat<(HexagonCOMBINE s8ImmPred:$s8, u32ImmPred:$u6), (A4_combineii imm:$s8, imm:$u6)>; //===----------------------------------------------------------------------===// @@ -346,20 +367,22 @@ multiclass Loadxm_pat<PatFrag Load, ValueType VT, PatFrag ValueMod, PatLeaf ImmPred, InstHexagon MI> { def: Pat<(VT (Load AddrFI:$fi)), (VT (ValueMod (MI AddrFI:$fi, 0)))>; + def: Pat<(VT (Load (add AddrFI:$fi, ImmPred:$Off))), + (VT (ValueMod (MI AddrFI:$fi, imm:$Off)))>; def: Pat<(VT (Load (add IntRegs:$Rs, ImmPred:$Off))), (VT (ValueMod (MI IntRegs:$Rs, imm:$Off)))>; def: Pat<(VT (Load (i32 IntRegs:$Rs))), (VT (ValueMod (MI IntRegs:$Rs, 0)))>; } -defm: Loadxm_pat<extloadi1, i64, Zext64, s11_0ExtPred, L2_loadrub_io>; -defm: Loadxm_pat<extloadi8, i64, Zext64, s11_0ExtPred, L2_loadrub_io>; -defm: Loadxm_pat<extloadi16, i64, Zext64, s11_1ExtPred, L2_loadruh_io>; -defm: Loadxm_pat<zextloadi1, i64, Zext64, s11_0ExtPred, L2_loadrub_io>; -defm: Loadxm_pat<zextloadi8, i64, Zext64, s11_0ExtPred, L2_loadrub_io>; -defm: Loadxm_pat<zextloadi16, i64, Zext64, s11_1ExtPred, L2_loadruh_io>; -defm: Loadxm_pat<sextloadi8, i64, Sext64, s11_0ExtPred, L2_loadrb_io>; -defm: Loadxm_pat<sextloadi16, i64, Sext64, s11_1ExtPred, L2_loadrh_io>; +defm: Loadxm_pat<extloadi1, i64, Zext64, s32_0ImmPred, L2_loadrub_io>; +defm: Loadxm_pat<extloadi8, i64, Zext64, s32_0ImmPred, L2_loadrub_io>; +defm: Loadxm_pat<extloadi16, i64, Zext64, s31_1ImmPred, L2_loadruh_io>; +defm: Loadxm_pat<zextloadi1, i64, Zext64, s32_0ImmPred, L2_loadrub_io>; +defm: Loadxm_pat<zextloadi8, i64, Zext64, s32_0ImmPred, L2_loadrub_io>; +defm: Loadxm_pat<zextloadi16, i64, Zext64, s31_1ImmPred, L2_loadruh_io>; +defm: Loadxm_pat<sextloadi8, i64, Sext64, s32_0ImmPred, L2_loadrb_io>; +defm: Loadxm_pat<sextloadi16, i64, Sext64, s31_1ImmPred, L2_loadrh_io>; // Map Rdd = anyext(Rs) -> Rdd = combine(#0, Rs). def: Pat<(i64 (anyext (i32 IntRegs:$src1))), (Zext64 IntRegs:$src1)>; @@ -635,19 +658,6 @@ def: Pat<(i64 (zext (i1 PredRegs:$src1))), def: Pat<(i64 (zext (i32 IntRegs:$src1))), (Zext64 IntRegs:$src1)>; -// zext i32->i64 -def: Pat <(i64 (zextloadi32 ADDRriS11_2:$src1)), - (i64 (A4_combineir 0, (L2_loadri_io AddrFI:$src1, 0)))>; - -let AddedComplexity = 100 in -def: Pat <(i64 (zextloadi32 (i32 (add IntRegs:$src1, s11_2ExtPred:$offset)))), - (i64 (A4_combineir 0, (L2_loadri_io IntRegs:$src1, - s11_2ExtPred:$offset)))>; - -// anyext i32->i64 -def: Pat <(i64 (extloadi32 ADDRriS11_2:$src1)), - (i64 (A4_combineir 0, (L2_loadri_io AddrFI:$src1, 0)))>; - //===----------------------------------------------------------------------===// // LD - //===----------------------------------------------------------------------===// @@ -768,8 +778,8 @@ multiclass T_StoreAbsReg_Pats <InstHexagon MI, RegisterClass RC, ValueType VT, PatFrag stOp> { def : Pat<(stOp (VT RC:$src4), (add (shl (i32 IntRegs:$src1), u2ImmPred:$src2), - u0AlwaysExtPred:$src3)), - (MI IntRegs:$src1, u2ImmPred:$src2, u0AlwaysExtPred:$src3, RC:$src4)>; + u32ImmPred:$src3)), + (MI IntRegs:$src1, u2ImmPred:$src2, u32ImmPred:$src3, RC:$src4)>; def : Pat<(stOp (VT RC:$src4), (add (shl IntRegs:$src1, u2ImmPred:$src2), @@ -1157,17 +1167,17 @@ let AddedComplexity = 40 in { // is not extendable. This could cause problems during removing the frame // indices, since the offset with respect to R29/R30 may not fit in the // u6 field. 
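// For scale: u6_0 reaches byte offsets 0..63, u6_1 reaches 0..126, and
// u6_2 reaches 0..252, so sp/fp-relative offsets beyond these have no
// encoding in the non-extendable immediate-store forms.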
- def: Storexm_add_pat<truncstorei8, s8ExtPred, u6_0ImmPred, ToImmByte, + def: Storexm_add_pat<truncstorei8, s32ImmPred, u6_0ImmPred, ToImmByte, S4_storeirb_io>; - def: Storexm_add_pat<truncstorei16, s8ExtPred, u6_1ImmPred, ToImmHalf, + def: Storexm_add_pat<truncstorei16, s32ImmPred, u6_1ImmPred, ToImmHalf, S4_storeirh_io>; - def: Storexm_add_pat<store, s8ExtPred, u6_2ImmPred, ToImmWord, + def: Storexm_add_pat<store, s32ImmPred, u6_2ImmPred, ToImmWord, S4_storeiri_io>; } -def: Storexm_simple_pat<truncstorei8, s8ExtPred, ToImmByte, S4_storeirb_io>; -def: Storexm_simple_pat<truncstorei16, s8ExtPred, ToImmHalf, S4_storeirh_io>; -def: Storexm_simple_pat<store, s8ExtPred, ToImmWord, S4_storeiri_io>; +def: Storexm_simple_pat<truncstorei8, s32ImmPred, ToImmByte, S4_storeirb_io>; +def: Storexm_simple_pat<truncstorei16, s32ImmPred, ToImmHalf, S4_storeirh_io>; +def: Storexm_simple_pat<store, s32ImmPred, ToImmWord, S4_storeiri_io>; // memb(Rx++#s4:0:circ(Mu))=Rt // memb(Rx++I:circ(Mu))=Rt @@ -1798,6 +1808,49 @@ def: LogLogNot_pat<or, and, C4_or_andn>; def: LogLogNot_pat<or, or, C4_or_orn>; //===----------------------------------------------------------------------===// +// PIC: Support for PIC compilations. The patterns and SD nodes defined +// below are needed to support code generation for PIC +//===----------------------------------------------------------------------===// + +def SDT_HexagonPICAdd + : SDTypeProfile<1, 1, [SDTCisVT<0, i32>, SDTCisVT<1, i32>]>; +def SDT_HexagonGOTAdd + : SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisVT<1, i32>]>; + +def SDT_HexagonGOTAddInternal : SDTypeProfile<1, 1, [SDTCisVT<0, i32>]>; +def SDT_HexagonGOTAddInternalJT : SDTypeProfile<1, 1, [SDTCisVT<0, i32>]>; +def SDT_HexagonGOTAddInternalBA : SDTypeProfile<1, 1, [SDTCisVT<0, i32>]>; + +def Hexagonpic_add : SDNode<"HexagonISD::PIC_ADD", SDT_HexagonPICAdd>; +def Hexagonat_got : SDNode<"HexagonISD::AT_GOT", SDT_HexagonGOTAdd>; +def Hexagongat_pcrel : SDNode<"HexagonISD::AT_PCREL", + SDT_HexagonGOTAddInternal>; +def Hexagongat_pcrel_jt : SDNode<"HexagonISD::AT_PCREL", + SDT_HexagonGOTAddInternalJT>; +def Hexagongat_pcrel_ba : SDNode<"HexagonISD::AT_PCREL", + SDT_HexagonGOTAddInternalBA>; + +// PIC: Map from a block address computation to a PC-relative add +def: Pat<(Hexagongat_pcrel_ba tblockaddress:$src1), + (C4_addipc u32ImmPred:$src1)>; + +// PIC: Map from the computation to generate a GOT pointer to a PC-relative add +def: Pat<(Hexagonpic_add texternalsym:$src1), + (C4_addipc u32ImmPred:$src1)>; + +// PIC: Map from a jump table address computation to a PC-relative add +def: Pat<(Hexagongat_pcrel_jt tjumptable:$src1), + (C4_addipc u32ImmPred:$src1)>; + +// PIC: Map from a GOT-relative symbol reference to a load +def: Pat<(Hexagonat_got (i32 IntRegs:$src1), tglobaladdr:$src2), + (L2_loadri_io IntRegs:$src1, s30_2ImmPred:$src2)>; + +// PIC: Map from a static symbol reference to a PC-relative add +def: Pat<(Hexagongat_pcrel tglobaladdr:$src1), + (C4_addipc u32ImmPred:$src1)>; + +//===----------------------------------------------------------------------===// // CR - //===----------------------------------------------------------------------===// @@ -1836,7 +1889,7 @@ def S4_addaddi : ALU64Inst <(outs IntRegs:$Rd), (ins IntRegs:$Rs, IntRegs:$Ru, s6Ext:$s6), "$Rd = add($Rs, add($Ru, #$s6))" , [(set (i32 IntRegs:$Rd), (add (i32 IntRegs:$Rs), - (add (i32 IntRegs:$Ru), s6_16ExtPred:$s6)))], + (add (i32 IntRegs:$Ru), s16_16ImmPred:$s6)))], "", ALU64_tc_2_SLOT23> { bits<5> Rd; bits<5> Rs; @@ -1877,19 +1930,19 @@ def 
S4_subaddi: ALU64Inst <(outs IntRegs:$Rd), } // Rd=add(Rs,sub(#s6,Ru)) -def: Pat<(add (i32 IntRegs:$src1), (sub s6_10ExtPred:$src2, +def: Pat<(add (i32 IntRegs:$src1), (sub s32ImmPred:$src2, (i32 IntRegs:$src3))), - (S4_subaddi IntRegs:$src1, s6_10ExtPred:$src2, IntRegs:$src3)>; + (S4_subaddi IntRegs:$src1, s32ImmPred:$src2, IntRegs:$src3)>; // Rd=sub(add(Rs,#s6),Ru) -def: Pat<(sub (add (i32 IntRegs:$src1), s6_10ExtPred:$src2), +def: Pat<(sub (add (i32 IntRegs:$src1), s32ImmPred:$src2), (i32 IntRegs:$src3)), - (S4_subaddi IntRegs:$src1, s6_10ExtPred:$src2, IntRegs:$src3)>; + (S4_subaddi IntRegs:$src1, s32ImmPred:$src2, IntRegs:$src3)>; // Rd=add(sub(Rs,Ru),#s6) def: Pat<(add (sub (i32 IntRegs:$src1), (i32 IntRegs:$src3)), - (s6_10ExtPred:$src2)), - (S4_subaddi IntRegs:$src1, s6_10ExtPred:$src2, IntRegs:$src3)>; + (s32ImmPred:$src2)), + (S4_subaddi IntRegs:$src1, s32ImmPred:$src2, IntRegs:$src3)>; // Add or subtract doublewords with carry. @@ -2042,7 +2095,7 @@ def S4_or_andix: (ins IntRegs:$Ru, IntRegs:$_src_, s10Ext:$s10), "$Rx = or($Ru, and($_src_, #$s10))" , [(set (i32 IntRegs:$Rx), - (or (i32 IntRegs:$Ru), (and (i32 IntRegs:$_src_), s10ExtPred:$s10)))] , + (or (i32 IntRegs:$Ru), (and (i32 IntRegs:$_src_), s32ImmPred:$s10)))] , "$_src_ = $Rx", ALU64_tc_2_SLOT23> { bits<5> Rx; bits<5> Ru; @@ -2187,7 +2240,7 @@ class T_CompOR <string mnemonic, bits<2> MajOp, SDNode OpNode> (ins IntRegs:$src1, IntRegs:$Rs, s10Ext:$s10), "$Rx |= "#mnemonic#"($Rs, #$s10)", [(set (i32 IntRegs:$Rx), (or (i32 IntRegs:$src1), - (OpNode (i32 IntRegs:$Rs), s10ExtPred:$s10)))], + (OpNode (i32 IntRegs:$Rs), s32ImmPred:$s10)))], "$src1 = $Rx", ALU64_tc_2_SLOT23>, ImmRegRel { bits<5> Rx; bits<5> Rs; @@ -2349,7 +2402,7 @@ def M4_mpyri_addi : MInst<(outs IntRegs:$Rd), "$Rd = add(#$u6, mpyi($Rs, #$U6))" , [(set (i32 IntRegs:$Rd), (add (mul (i32 IntRegs:$Rs), u6ImmPred:$U6), - u6ExtPred:$u6))] ,"",ALU64_tc_3x_SLOT23> { + u32ImmPred:$u6))] ,"",ALU64_tc_3x_SLOT23> { bits<5> Rd; bits<6> u6; bits<5> Rs; @@ -2374,7 +2427,7 @@ def M4_mpyrr_addi : MInst <(outs IntRegs:$Rd), (ins u6Ext:$u6, IntRegs:$Rs, IntRegs:$Rt), "$Rd = add(#$u6, mpyi($Rs, $Rt))" , [(set (i32 IntRegs:$Rd), - (add (mul (i32 IntRegs:$Rs), (i32 IntRegs:$Rt)), u6ExtPred:$u6))], + (add (mul (i32 IntRegs:$Rs), (i32 IntRegs:$Rt)), u32ImmPred:$u6))], "", ALU64_tc_3x_SLOT23>, ImmRegRel { bits<5> Rd; bits<6> u6; @@ -2424,7 +2477,7 @@ def M4_mpyri_addr_u2 : T_AddMpy<0b0, u6_2ImmPred, let isExtendable = 1, opExtentBits = 6, opExtendable = 3, CextOpcode = "ADD_MPY", InputType = "imm" in -def M4_mpyri_addr : T_AddMpy<0b1, u6ExtPred, +def M4_mpyri_addr : T_AddMpy<0b1, u32ImmPred, (ins IntRegs:$src1, IntRegs:$src3, u6Ext:$src2)>, ImmRegRel; // Rx=add(Ru,mpyi(Rx,Rs)) @@ -2447,17 +2500,6 @@ def M4_mpyrr_addr: MInst_acc <(outs IntRegs:$Rx), let Inst{20-16} = Rs; } -// Rd=add(##,mpyi(Rs,#U6)) -def : Pat <(add (mul (i32 IntRegs:$src2), u6ImmPred:$src3), - (HexagonCONST32 tglobaladdr:$src1)), - (i32 (M4_mpyri_addi tglobaladdr:$src1, IntRegs:$src2, - u6ImmPred:$src3))>; - -// Rd=add(##,mpyi(Rs,Rt)) -def : Pat <(add (mul (i32 IntRegs:$src2), (i32 IntRegs:$src3)), - (HexagonCONST32 tglobaladdr:$src1)), - (i32 (M4_mpyrr_addi tglobaladdr:$src1, IntRegs:$src2, - IntRegs:$src3))>; // Vector reduce multiply word by signed half (32x16) //Rdd=vrmpyweh(Rss,Rtt)[:<<1] @@ -2569,7 +2611,7 @@ class T_S4_ShiftOperate<string MnOp, string MnSh, SDNode Op, SDNode Sh, : MInst_acc<(outs IntRegs:$Rd), (ins u8Ext:$u8, IntRegs:$Rx, u5Imm:$U5), "$Rd = "#MnOp#"(#$u8, "#MnSh#"($Rx, #$U5))", [(set (i32 
IntRegs:$Rd), - (Op (Sh I32:$Rx, u5ImmPred:$U5), u8ExtPred:$u8))], + (Op (Sh I32:$Rx, u5ImmPred:$U5), u32ImmPred:$u8))], "$Rd = $Rx", Itin> { bits<5> Rd; @@ -2904,7 +2946,7 @@ let isExtendable = 1, opExtendable = 1, isExtentSigned = 0 in { // mem[bh](Rs+#u6) += #U5 //===----------------------------------------------------------------------===// -multiclass MemOpi_u5Pats <PatFrag ldOp, PatFrag stOp, PatLeaf ExtPred, +multiclass MemOpi_u5Pats <PatFrag ldOp, PatFrag stOp, PatLeaf ImmPred, InstHexagon MI, SDNode OpNode> { let AddedComplexity = 180 in def: Pat<(stOp (OpNode (ldOp IntRegs:$addr), u5ImmPred:$addend), @@ -2912,24 +2954,24 @@ multiclass MemOpi_u5Pats <PatFrag ldOp, PatFrag stOp, PatLeaf ExtPred, (MI IntRegs:$addr, 0, u5ImmPred:$addend)>; let AddedComplexity = 190 in - def: Pat<(stOp (OpNode (ldOp (add IntRegs:$base, ExtPred:$offset)), + def: Pat<(stOp (OpNode (ldOp (add IntRegs:$base, ImmPred:$offset)), u5ImmPred:$addend), - (add IntRegs:$base, ExtPred:$offset)), - (MI IntRegs:$base, ExtPred:$offset, u5ImmPred:$addend)>; + (add IntRegs:$base, ImmPred:$offset)), + (MI IntRegs:$base, ImmPred:$offset, u5ImmPred:$addend)>; } -multiclass MemOpi_u5ALUOp<PatFrag ldOp, PatFrag stOp, PatLeaf ExtPred, +multiclass MemOpi_u5ALUOp<PatFrag ldOp, PatFrag stOp, PatLeaf ImmPred, InstHexagon addMI, InstHexagon subMI> { - defm: MemOpi_u5Pats<ldOp, stOp, ExtPred, addMI, add>; - defm: MemOpi_u5Pats<ldOp, stOp, ExtPred, subMI, sub>; + defm: MemOpi_u5Pats<ldOp, stOp, ImmPred, addMI, add>; + defm: MemOpi_u5Pats<ldOp, stOp, ImmPred, subMI, sub>; } multiclass MemOpi_u5ExtType<PatFrag ldOpByte, PatFrag ldOpHalf > { // Half Word - defm: MemOpi_u5ALUOp <ldOpHalf, truncstorei16, u6_1ExtPred, + defm: MemOpi_u5ALUOp <ldOpHalf, truncstorei16, u31_1ImmPred, L4_iadd_memoph_io, L4_isub_memoph_io>; // Byte - defm: MemOpi_u5ALUOp <ldOpByte, truncstorei8, u6ExtPred, + defm: MemOpi_u5ALUOp <ldOpByte, truncstorei8, u32ImmPred, L4_iadd_memopb_io, L4_isub_memopb_io>; } @@ -2939,7 +2981,7 @@ let Predicates = [UseMEMOP] in { defm: MemOpi_u5ExtType<extloadi8, extloadi16>; // any extend // Word - defm: MemOpi_u5ALUOp <load, store, u6_2ExtPred, L4_iadd_memopw_io, + defm: MemOpi_u5ALUOp <load, store, u30_2ImmPred, L4_iadd_memopw_io, L4_isub_memopw_io>; } @@ -2950,7 +2992,7 @@ let Predicates = [UseMEMOP] in { // mem[bh](Rs+#u6) += #m5 //===----------------------------------------------------------------------===// -multiclass MemOpi_m5Pats <PatFrag ldOp, PatFrag stOp, PatLeaf extPred, +multiclass MemOpi_m5Pats <PatFrag ldOp, PatFrag stOp, PatLeaf ImmPred, PatLeaf immPred, SDNodeXForm xformFunc, InstHexagon MI> { let AddedComplexity = 190 in @@ -2958,18 +3000,18 @@ multiclass MemOpi_m5Pats <PatFrag ldOp, PatFrag stOp, PatLeaf extPred, (MI IntRegs:$addr, 0, (xformFunc immPred:$subend))>; let AddedComplexity = 195 in - def: Pat<(stOp (add (ldOp (add IntRegs:$base, extPred:$offset)), + def: Pat<(stOp (add (ldOp (add IntRegs:$base, ImmPred:$offset)), immPred:$subend), - (add IntRegs:$base, extPred:$offset)), - (MI IntRegs:$base, extPred:$offset, (xformFunc immPred:$subend))>; + (add IntRegs:$base, ImmPred:$offset)), + (MI IntRegs:$base, ImmPred:$offset, (xformFunc immPred:$subend))>; } multiclass MemOpi_m5ExtType<PatFrag ldOpByte, PatFrag ldOpHalf > { // Half Word - defm: MemOpi_m5Pats <ldOpHalf, truncstorei16, u6_1ExtPred, m5HImmPred, + defm: MemOpi_m5Pats <ldOpHalf, truncstorei16, u31_1ImmPred, m5HImmPred, MEMOPIMM_HALF, L4_isub_memoph_io>; // Byte - defm: MemOpi_m5Pats <ldOpByte, truncstorei8, u6ExtPred, m5BImmPred, + defm: 
MemOpi_m5Pats <ldOpByte, truncstorei8, u32ImmPred, m5BImmPred, MEMOPIMM_BYTE, L4_isub_memopb_io>; } @@ -2979,7 +3021,7 @@ let Predicates = [UseMEMOP] in { defm: MemOpi_m5ExtType<extloadi8, extloadi16>; // any extend // Word - defm: MemOpi_m5Pats <load, store, u6_2ExtPred, m5ImmPred, + defm: MemOpi_m5Pats <load, store, u30_2ImmPred, m5ImmPred, MEMOPIMM, L4_isub_memopw_io>; } @@ -3008,16 +3050,16 @@ multiclass MemOpi_bitPats <PatFrag ldOp, PatFrag stOp, PatLeaf immPred, multiclass MemOpi_bitExtType<PatFrag ldOpByte, PatFrag ldOpHalf> { // Byte - clrbit - defm: MemOpi_bitPats<ldOpByte, truncstorei8, Clr3ImmPred, u6ExtPred, + defm: MemOpi_bitPats<ldOpByte, truncstorei8, Clr3ImmPred, u32ImmPred, CLRMEMIMM_BYTE, L4_iand_memopb_io, and>; // Byte - setbit - defm: MemOpi_bitPats<ldOpByte, truncstorei8, Set3ImmPred, u6ExtPred, + defm: MemOpi_bitPats<ldOpByte, truncstorei8, Set3ImmPred, u32ImmPred, SETMEMIMM_BYTE, L4_ior_memopb_io, or>; // Half Word - clrbit - defm: MemOpi_bitPats<ldOpHalf, truncstorei16, Clr4ImmPred, u6_1ExtPred, + defm: MemOpi_bitPats<ldOpHalf, truncstorei16, Clr4ImmPred, u31_1ImmPred, CLRMEMIMM_SHORT, L4_iand_memoph_io, and>; // Half Word - setbit - defm: MemOpi_bitPats<ldOpHalf, truncstorei16, Set4ImmPred, u6_1ExtPred, + defm: MemOpi_bitPats<ldOpHalf, truncstorei16, Set4ImmPred, u31_1ImmPred, SETMEMIMM_SHORT, L4_ior_memoph_io, or>; } @@ -3030,9 +3072,9 @@ let Predicates = [UseMEMOP] in { // memw(Rs+#0) = [clrbit|setbit](#U5) // memw(Rs+#u6:2) = [clrbit|setbit](#U5) - defm: MemOpi_bitPats<load, store, Clr5ImmPred, u6_2ExtPred, CLRMEMIMM, + defm: MemOpi_bitPats<load, store, Clr5ImmPred, u30_2ImmPred, CLRMEMIMM, L4_iand_memopw_io, and>; - defm: MemOpi_bitPats<load, store, Set5ImmPred, u6_2ExtPred, SETMEMIMM, + defm: MemOpi_bitPats<load, store, Set5ImmPred, u30_2ImmPred, SETMEMIMM, L4_ior_memopw_io, or>; } @@ -3070,11 +3112,11 @@ multiclass MemOPr_ALUOp<PatFrag ldOp, PatFrag stOp, PatLeaf extPred, multiclass MemOPr_ExtType<PatFrag ldOpByte, PatFrag ldOpHalf > { // Half Word - defm: MemOPr_ALUOp <ldOpHalf, truncstorei16, u6_1ExtPred, + defm: MemOPr_ALUOp <ldOpHalf, truncstorei16, u31_1ImmPred, L4_add_memoph_io, L4_sub_memoph_io, L4_and_memoph_io, L4_or_memoph_io>; // Byte - defm: MemOPr_ALUOp <ldOpByte, truncstorei8, u6ExtPred, + defm: MemOPr_ALUOp <ldOpByte, truncstorei8, u32ImmPred, L4_add_memopb_io, L4_sub_memopb_io, L4_and_memopb_io, L4_or_memopb_io>; } @@ -3086,7 +3128,7 @@ let Predicates = [UseMEMOP] in { defm: MemOPr_ExtType<sextloadi8, sextloadi16>; // sign extend defm: MemOPr_ExtType<extloadi8, extloadi16>; // any extend // Word - defm: MemOPr_ALUOp <load, store, u6_2ExtPred, L4_add_memopw_io, + defm: MemOPr_ALUOp <load, store, u30_2ImmPred, L4_add_memopw_io, L4_sub_memopw_io, L4_and_memopw_io, L4_or_memopw_io>; } @@ -3110,23 +3152,23 @@ def C4_cmpneqi : T_CMP <"cmp.eq", 0b00, 1, s10Ext>; def C4_cmpltei : T_CMP <"cmp.gt", 0b01, 1, s10Ext>; def C4_cmplteui : T_CMP <"cmp.gtu", 0b10, 1, u9Ext>; -def : T_CMP_pat <C4_cmpneqi, setne, s10ExtPred>; -def : T_CMP_pat <C4_cmpltei, setle, s10ExtPred>; +def : T_CMP_pat <C4_cmpneqi, setne, s32ImmPred>; +def : T_CMP_pat <C4_cmpltei, setle, s32ImmPred>; def : T_CMP_pat <C4_cmplteui, setule, u9ImmPred>; // rs <= rt -> !(rs > rt). 
/* -def: Pat<(i1 (setle (i32 IntRegs:$src1), s10ExtPred:$src2)), - (C2_not (C2_cmpgti IntRegs:$src1, s10ExtPred:$src2))>; -// (C4_cmpltei IntRegs:$src1, s10ExtPred:$src2)>; +def: Pat<(i1 (setle (i32 IntRegs:$src1), s32ImmPred:$src2)), + (C2_not (C2_cmpgti IntRegs:$src1, s32ImmPred:$src2))>; +// (C4_cmpltei IntRegs:$src1, s32ImmPred:$src2)>; */ // Map cmplt(Rs, Imm) -> !cmpgt(Rs, Imm-1). -def: Pat<(i1 (setlt (i32 IntRegs:$src1), s8ExtPred:$src2)), - (C4_cmpltei IntRegs:$src1, (DEC_CONST_SIGNED s8ExtPred:$src2))>; +def: Pat<(i1 (setlt (i32 IntRegs:$src1), s32ImmPred:$src2)), + (C4_cmpltei IntRegs:$src1, (DEC_CONST_SIGNED s32ImmPred:$src2))>; // rs != rt -> !(rs == rt). -def: Pat<(i1 (setne (i32 IntRegs:$src1), s10ExtPred:$src2)), - (C4_cmpneqi IntRegs:$src1, s10ExtPred:$src2)>; +def: Pat<(i1 (setne (i32 IntRegs:$src1), s32ImmPred:$src2)), + (C4_cmpneqi IntRegs:$src1, s32ImmPred:$src2)>; // SDNode for converting immediate C to C-1. def DEC_CONST_BYTE : SDNodeXForm<imm, [{ @@ -3136,168 +3178,6 @@ def DEC_CONST_BYTE : SDNodeXForm<imm, [{ }]>; // For the sequence -// zext( seteq ( and(Rs, 255), u8)) -// Generate -// Pd=cmpb.eq(Rs, #u8) -// if (Pd.new) Rd=#1 -// if (!Pd.new) Rd=#0 -def : Pat <(i32 (zext (i1 (seteq (i32 (and (i32 IntRegs:$Rs), 255)), - u8ExtPred:$u8)))), - (i32 (TFR_condset_ii (i1 (A4_cmpbeqi (i32 IntRegs:$Rs), - (u8ExtPred:$u8))), - 1, 0))>; - -// For the sequence -// zext( setne ( and(Rs, 255), u8)) -// Generate -// Pd=cmpb.eq(Rs, #u8) -// if (Pd.new) Rd=#0 -// if (!Pd.new) Rd=#1 -def : Pat <(i32 (zext (i1 (setne (i32 (and (i32 IntRegs:$Rs), 255)), - u8ExtPred:$u8)))), - (i32 (TFR_condset_ii (i1 (A4_cmpbeqi (i32 IntRegs:$Rs), - (u8ExtPred:$u8))), - 0, 1))>; - -// For the sequence -// zext( seteq (Rs, and(Rt, 255))) -// Generate -// Pd=cmpb.eq(Rs, Rt) -// if (Pd.new) Rd=#1 -// if (!Pd.new) Rd=#0 -def : Pat <(i32 (zext (i1 (seteq (i32 IntRegs:$Rt), - (i32 (and (i32 IntRegs:$Rs), 255)))))), - (i32 (TFR_condset_ii (i1 (A4_cmpbeq (i32 IntRegs:$Rs), - (i32 IntRegs:$Rt))), - 1, 0))>; - -// For the sequence -// zext( setne (Rs, and(Rt, 255))) -// Generate -// Pd=cmpb.eq(Rs, Rt) -// if (Pd.new) Rd=#0 -// if (!Pd.new) Rd=#1 -def : Pat <(i32 (zext (i1 (setne (i32 IntRegs:$Rt), - (i32 (and (i32 IntRegs:$Rs), 255)))))), - (i32 (TFR_condset_ii (i1 (A4_cmpbeq (i32 IntRegs:$Rs), - (i32 IntRegs:$Rt))), - 0, 1))>; - -// For the sequence -// zext( setugt ( and(Rs, 255), u8)) -// Generate -// Pd=cmpb.gtu(Rs, #u8) -// if (Pd.new) Rd=#1 -// if (!Pd.new) Rd=#0 -def : Pat <(i32 (zext (i1 (setugt (i32 (and (i32 IntRegs:$Rs), 255)), - u8ExtPred:$u8)))), - (i32 (TFR_condset_ii (i1 (A4_cmpbgtui (i32 IntRegs:$Rs), - (u8ExtPred:$u8))), - 1, 0))>; - -// For the sequence -// zext( setugt ( and(Rs, 254), u8)) -// Generate -// Pd=cmpb.gtu(Rs, #u8) -// if (Pd.new) Rd=#1 -// if (!Pd.new) Rd=#0 -def : Pat <(i32 (zext (i1 (setugt (i32 (and (i32 IntRegs:$Rs), 254)), - u8ExtPred:$u8)))), - (i32 (TFR_condset_ii (i1 (A4_cmpbgtui (i32 IntRegs:$Rs), - (u8ExtPred:$u8))), - 1, 0))>; - -// For the sequence -// zext( setult ( Rs, Rt)) -// Generate -// Pd=cmp.ltu(Rs, Rt) -// if (Pd.new) Rd=#1 -// if (!Pd.new) Rd=#0 -// cmp.ltu(Rs, Rt) -> cmp.gtu(Rt, Rs) -def : Pat <(i32 (zext (i1 (setult (i32 IntRegs:$Rs), (i32 IntRegs:$Rt))))), - (i32 (TFR_condset_ii (i1 (C2_cmpgtu (i32 IntRegs:$Rt), - (i32 IntRegs:$Rs))), - 1, 0))>; - -// For the sequence -// zext( setlt ( Rs, Rt)) -// Generate -// Pd=cmp.lt(Rs, Rt) -// if (Pd.new) Rd=#1 -// if (!Pd.new) Rd=#0 -// cmp.lt(Rs, Rt) -> cmp.gt(Rt, Rs) -def : Pat <(i32 (zext (i1 (setlt (i32 
IntRegs:$Rs), (i32 IntRegs:$Rt))))), - (i32 (TFR_condset_ii (i1 (C2_cmpgt (i32 IntRegs:$Rt), - (i32 IntRegs:$Rs))), - 1, 0))>; - -// For the sequence -// zext( setugt ( Rs, Rt)) -// Generate -// Pd=cmp.gtu(Rs, Rt) -// if (Pd.new) Rd=#1 -// if (!Pd.new) Rd=#0 -def : Pat <(i32 (zext (i1 (setugt (i32 IntRegs:$Rs), (i32 IntRegs:$Rt))))), - (i32 (TFR_condset_ii (i1 (C2_cmpgtu (i32 IntRegs:$Rs), - (i32 IntRegs:$Rt))), - 1, 0))>; - -// This pattern interefers with coremark performance, not implementing at this -// time. -// For the sequence -// zext( setgt ( Rs, Rt)) -// Generate -// Pd=cmp.gt(Rs, Rt) -// if (Pd.new) Rd=#1 -// if (!Pd.new) Rd=#0 - -// For the sequence -// zext( setuge ( Rs, Rt)) -// Generate -// Pd=cmp.ltu(Rs, Rt) -// if (Pd.new) Rd=#0 -// if (!Pd.new) Rd=#1 -// cmp.ltu(Rs, Rt) -> cmp.gtu(Rt, Rs) -def : Pat <(i32 (zext (i1 (setuge (i32 IntRegs:$Rs), (i32 IntRegs:$Rt))))), - (i32 (TFR_condset_ii (i1 (C2_cmpgtu (i32 IntRegs:$Rt), - (i32 IntRegs:$Rs))), - 0, 1))>; - -// For the sequence -// zext( setge ( Rs, Rt)) -// Generate -// Pd=cmp.lt(Rs, Rt) -// if (Pd.new) Rd=#0 -// if (!Pd.new) Rd=#1 -// cmp.lt(Rs, Rt) -> cmp.gt(Rt, Rs) -def : Pat <(i32 (zext (i1 (setge (i32 IntRegs:$Rs), (i32 IntRegs:$Rt))))), - (i32 (TFR_condset_ii (i1 (C2_cmpgt (i32 IntRegs:$Rt), - (i32 IntRegs:$Rs))), - 0, 1))>; - -// For the sequence -// zext( setule ( Rs, Rt)) -// Generate -// Pd=cmp.gtu(Rs, Rt) -// if (Pd.new) Rd=#0 -// if (!Pd.new) Rd=#1 -def : Pat <(i32 (zext (i1 (setule (i32 IntRegs:$Rs), (i32 IntRegs:$Rt))))), - (i32 (TFR_condset_ii (i1 (C2_cmpgtu (i32 IntRegs:$Rs), - (i32 IntRegs:$Rt))), - 0, 1))>; - -// For the sequence -// zext( setle ( Rs, Rt)) -// Generate -// Pd=cmp.gt(Rs, Rt) -// if (Pd.new) Rd=#0 -// if (!Pd.new) Rd=#1 -def : Pat <(i32 (zext (i1 (setle (i32 IntRegs:$Rs), (i32 IntRegs:$Rt))))), - (i32 (TFR_condset_ii (i1 (C2_cmpgt (i32 IntRegs:$Rs), - (i32 IntRegs:$Rt))), - 0, 1))>; - -// For the sequence // zext( setult ( and(Rs, 255), u8)) // Use the isdigit transformation below @@ -3381,26 +3261,17 @@ defm L4_return: LD_MISC_L4_RETURN <"dealloc_return">, PredNewRel; // Restore registers and dealloc return function call. let isCall = 1, isBarrier = 1, isReturn = 1, isTerminator = 1, Defs = [R29, R30, R31, PC], isPredicable = 0, isAsmParserOnly = 1 in { - def RESTORE_DEALLOC_RET_JMP_V4 : JInst<(outs), - (ins calltarget:$dst), - "jump $dst", - []>; + def RESTORE_DEALLOC_RET_JMP_V4 : T_JMP<"">; } // Restore registers and dealloc frame before a tail call. let isCall = 1, Defs = [R29, R30, R31, PC], isAsmParserOnly = 1 in { - def RESTORE_DEALLOC_BEFORE_TAILCALL_V4 : JInst<(outs), - (ins calltarget:$dst), - "call $dst", - []>; + def RESTORE_DEALLOC_BEFORE_TAILCALL_V4 : T_Call<"">, PredRel; } // Save registers function call. 
let isCall = 1, Uses = [R29, R31], isAsmParserOnly = 1 in { - def SAVE_REGISTERS_CALL_V4 : JInst<(outs), - (ins calltarget:$dst), - "call $dst // Save_calle_saved_registers", - []>; + def SAVE_REGISTERS_CALL_V4 : T_Call<"">, PredRel; } //===----------------------------------------------------------------------===// @@ -3472,7 +3343,7 @@ class T_StoreAbs_Pred <string mnemonic, RegisterClass RC, bits<2> MajOp, //===----------------------------------------------------------------------===// class T_StoreAbs <string mnemonic, RegisterClass RC, Operand ImmOp, bits<2> MajOp, bit isHalf> - : T_StoreAbsGP <mnemonic, RC, ImmOp, MajOp, u0AlwaysExt, 1, isHalf>, + : T_StoreAbsGP <mnemonic, RC, ImmOp, MajOp, u32Imm, 1, isHalf>, AddrModeRel { string ImmOpStr = !cast<string>(ImmOp); let opExtentBits = !if (!eq(ImmOpStr, "u16_3Imm"), 19, @@ -3513,7 +3384,7 @@ multiclass ST_Abs<string mnemonic, string CextOp, RegisterClass RC, let hasSideEffects = 0, isPredicable = 1, mayStore = 1, isNVStore = 1, isNewValue = 1, opNewValue = 1 in class T_StoreAbsGP_NV <string mnemonic, Operand ImmOp, bits<2>MajOp, bit isAbs> - : NVInst_V4<(outs), (ins u0AlwaysExt:$addr, IntRegs:$src), + : NVInst_V4<(outs), (ins u32Imm:$addr, IntRegs:$src), mnemonic # !if(isAbs, "(##", "(#")#"$addr) = $src.new", [], "", V2LDST_tc_st_SLOT0> { bits<19> addr; @@ -3743,7 +3614,7 @@ class T_LoadAbsGP <string mnemonic, RegisterClass RC, Operand ImmOp, class T_LoadAbs <string mnemonic, RegisterClass RC, Operand ImmOp, bits<3> MajOp> - : T_LoadAbsGP <mnemonic, RC, ImmOp, MajOp, u0AlwaysExt, 1>, AddrModeRel { + : T_LoadAbsGP <mnemonic, RC, ImmOp, MajOp, u32Imm, 1>, AddrModeRel { string ImmOpStr = !cast<string>(ImmOp); let opExtentBits = !if (!eq(ImmOpStr, "u16_3Imm"), 19, @@ -3903,17 +3774,17 @@ def: Pat<(i64 (ctlz I64:$src1)), (Zext64 (S2_cl0p I64:$src1))>; def: Pat<(i64 (cttz I64:$src1)), (Zext64 (S2_ct0p I64:$src1))>; let AddedComplexity = 30 in { - def: Storea_pat<truncstorei8, I32, u0AlwaysExtPred, S2_storerbabs>; - def: Storea_pat<truncstorei16, I32, u0AlwaysExtPred, S2_storerhabs>; - def: Storea_pat<store, I32, u0AlwaysExtPred, S2_storeriabs>; + def: Storea_pat<truncstorei8, I32, u32ImmPred, S2_storerbabs>; + def: Storea_pat<truncstorei16, I32, u32ImmPred, S2_storerhabs>; + def: Storea_pat<store, I32, u32ImmPred, S2_storeriabs>; } let AddedComplexity = 30 in { - def: Loada_pat<load, i32, u0AlwaysExtPred, L4_loadri_abs>; - def: Loada_pat<sextloadi8, i32, u0AlwaysExtPred, L4_loadrb_abs>; - def: Loada_pat<zextloadi8, i32, u0AlwaysExtPred, L4_loadrub_abs>; - def: Loada_pat<sextloadi16, i32, u0AlwaysExtPred, L4_loadrh_abs>; - def: Loada_pat<zextloadi16, i32, u0AlwaysExtPred, L4_loadruh_abs>; + def: Loada_pat<load, i32, u32ImmPred, L4_loadri_abs>; + def: Loada_pat<sextloadi8, i32, u32ImmPred, L4_loadrb_abs>; + def: Loada_pat<zextloadi8, i32, u32ImmPred, L4_loadrub_abs>; + def: Loada_pat<sextloadi16, i32, u32ImmPred, L4_loadrh_abs>; + def: Loada_pat<zextloadi16, i32, u32ImmPred, L4_loadruh_abs>; } // Indexed store word - global address. 
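A note on the rename that runs through these hunks: predicates such as u6ExtPred and u0AlwaysExtPred, which modeled "small field, optionally constant-extended", give way to plain range predicates named uN_SImmPred / sN_SImmPred, meaning an N-bit value scaled by 2^S. The new predicates call llvm::isUInt / llvm::isShiftedUInt from llvm/Support/MathExtras.h; the sketch below uses hand-rolled equivalents (an assumption standing in for the LLVM helpers) to spell out what such a check accepts:

#include <cassert>
#include <cstdint>

// Hand-rolled equivalents of llvm::isUInt<N> and llvm::isShiftedUInt<N, S>,
// shown only to illustrate what a predicate like u30_2ImmPred accepts:
// a multiple of 4 whose value fits in 30 unsigned bits once scaled down.
template <unsigned N> bool isUIntN(int64_t v) {
  return v >= 0 && v < (int64_t(1) << N);
}
template <unsigned N, unsigned S> bool isShiftedUIntN(int64_t v) {
  // The low S bits must be zero and the whole value must fit in N+S bits.
  return v % (int64_t(1) << S) == 0 && isUIntN<N + S>(v);
}

int main() {
  assert(isShiftedUIntN<30, 2>(8));   // word-aligned: accepted
  assert(!isShiftedUIntN<30, 2>(6));  // not a multiple of 4: rejected
  assert(isShiftedUIntN<31, 1>(2));   // halfword-aligned: accepted
  assert(!isShiftedUIntN<31, 1>(3));  // odd: rejected
  return 0;
}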
@@ -4012,6 +3883,18 @@ def: Storea_pat<SwapSt<atomic_store_16>, I32, addrgp, S2_storerhabs>; def: Storea_pat<SwapSt<atomic_store_32>, I32, addrgp, S2_storeriabs>; def: Storea_pat<SwapSt<atomic_store_64>, I64, addrgp, S2_storerdabs>; +let Constraints = "@earlyclobber $dst" in +def Insert4 : PseudoM<(outs DoubleRegs:$dst), (ins IntRegs:$a, IntRegs:$b, + IntRegs:$c, IntRegs:$d), + ".error \"Should never try to emit Insert4\"", + [(set (i64 DoubleRegs:$dst), + (or (or (or (shl (i64 (zext (i32 (and (i32 IntRegs:$b), (i32 65535))))), + (i32 16)), + (i64 (zext (i32 (and (i32 IntRegs:$a), (i32 65535)))))), + (shl (i64 (anyext (i32 (and (i32 IntRegs:$c), (i32 65535))))), + (i32 32))), + (shl (i64 (anyext (i32 IntRegs:$d))), (i32 48))))]>; + //===----------------------------------------------------------------------===// // :raw for of boundscheck:hi:lo insns //===----------------------------------------------------------------------===// @@ -4116,7 +3999,7 @@ class CJInst_tstbit_R0<string px, bit np, string tnt> : InstHexagon<(outs), (ins IntRegs:$Rs, brtarget:$r9_2), ""#px#" = tstbit($Rs, #0); if (" #!if(np, "!","")#""#px#".new) jump:"#tnt#" $r9_2", - [], "", COMPOUND, TypeCOMPOUND> { + [], "", COMPOUND, TypeCOMPOUND>, OpcodeHexagon { bits<4> Rs; bits<11> r9_2; @@ -4162,7 +4045,7 @@ class CJInst_RR<string px, string op, bit np, string tnt> : InstHexagon<(outs), (ins IntRegs:$Rs, IntRegs:$Rt, brtarget:$r9_2), ""#px#" = cmp."#op#"($Rs, $Rt); if (" #!if(np, "!","")#""#px#".new) jump:"#tnt#" $r9_2", - [], "", COMPOUND, TypeCOMPOUND> { + [], "", COMPOUND, TypeCOMPOUND>, OpcodeHexagon { bits<4> Rs; bits<4> Rt; bits<11> r9_2; @@ -4216,7 +4099,7 @@ class CJInst_RU5<string px, string op, bit np, string tnt> : InstHexagon<(outs), (ins IntRegs:$Rs, u5Imm:$U5, brtarget:$r9_2), ""#px#" = cmp."#op#"($Rs, #$U5); if (" #!if(np, "!","")#""#px#".new) jump:"#tnt#" $r9_2", - [], "", COMPOUND, TypeCOMPOUND> { + [], "", COMPOUND, TypeCOMPOUND>, OpcodeHexagon { bits<4> Rs; bits<5> U5; bits<11> r9_2; @@ -4271,7 +4154,7 @@ class CJInst_Rn1<string px, string op, bit np, string tnt> : InstHexagon<(outs), (ins IntRegs:$Rs, brtarget:$r9_2), ""#px#" = cmp."#op#"($Rs,#-1); if (" #!if(np, "!","")#""#px#".new) jump:"#tnt#" $r9_2", - [], "", COMPOUND, TypeCOMPOUND> { + [], "", COMPOUND, TypeCOMPOUND>, OpcodeHexagon { bits<4> Rs; bits<11> r9_2; diff --git a/lib/Target/Hexagon/HexagonInstrInfoV5.td b/lib/Target/Hexagon/HexagonInstrInfoV5.td index 19b0935..337f4ea 100644 --- a/lib/Target/Hexagon/HexagonInstrInfoV5.td +++ b/lib/Target/Hexagon/HexagonInstrInfoV5.td @@ -139,11 +139,11 @@ def S5_popcountp : ALU64_rr<(outs IntRegs:$Rd), (ins DoubleRegs:$Rss), let Inst{20-16} = Rss; } -defm: Loadx_pat<load, f32, s11_2ExtPred, L2_loadri_io>; -defm: Loadx_pat<load, f64, s11_3ExtPred, L2_loadrd_io>; +defm: Loadx_pat<load, f32, s30_2ImmPred, L2_loadri_io>; +defm: Loadx_pat<load, f64, s29_3ImmPred, L2_loadrd_io>; -defm: Storex_pat<store, F32, s11_2ExtPred, S2_storeri_io>; -defm: Storex_pat<store, F64, s11_3ExtPred, S2_storerd_io>; +defm: Storex_pat<store, F32, s30_2ImmPred, S2_storeri_io>; +defm: Storex_pat<store, F64, s29_3ImmPred, S2_storerd_io>; def: Storex_simple_pat<store, F32, S2_storeri_io>; def: Storex_simple_pat<store, F64, S2_storerd_io>; diff --git a/lib/Target/Hexagon/HexagonInstrInfoVector.td b/lib/Target/Hexagon/HexagonInstrInfoVector.td index 6e67b6e..f4fb946 100644 --- a/lib/Target/Hexagon/HexagonInstrInfoVector.td +++ b/lib/Target/Hexagon/HexagonInstrInfoVector.td @@ -20,6 +20,34 @@ def V8I8: PatLeaf<(v8i8 DoubleRegs:$R)>; def 
V4I16: PatLeaf<(v4i16 DoubleRegs:$R)>; def V2I32: PatLeaf<(v2i32 DoubleRegs:$R)>; + +multiclass bitconvert_32<ValueType a, ValueType b> { + def : Pat <(b (bitconvert (a IntRegs:$src))), + (b IntRegs:$src)>; + def : Pat <(a (bitconvert (b IntRegs:$src))), + (a IntRegs:$src)>; +} + +multiclass bitconvert_64<ValueType a, ValueType b> { + def : Pat <(b (bitconvert (a DoubleRegs:$src))), + (b DoubleRegs:$src)>; + def : Pat <(a (bitconvert (b DoubleRegs:$src))), + (a DoubleRegs:$src)>; +} + +// Bit convert vector types. +defm : bitconvert_32<v4i8, i32>; +defm : bitconvert_32<v2i16, i32>; +defm : bitconvert_32<v2i16, v4i8>; + +defm : bitconvert_64<v8i8, i64>; +defm : bitconvert_64<v4i16, i64>; +defm : bitconvert_64<v2i32, i64>; +defm : bitconvert_64<v8i8, v4i16>; +defm : bitconvert_64<v8i8, v2i32>; +defm : bitconvert_64<v4i16, v2i32>; + + // Vector shift support. Vector shifting in Hexagon is rather different // from internal representation of LLVM. // LLVM assumes all shifts (in vector case) will have the form @@ -44,6 +72,12 @@ class vshift_v2i32<SDNode Op, string Str, bits<3>MajOp, bits<3>MinOp> let Inst{12-8} = src2; } +def : Pat<(v2i16 (add (v2i16 IntRegs:$src1), (v2i16 IntRegs:$src2))), + (A2_svaddh IntRegs:$src1, IntRegs:$src2)>; + +def : Pat<(v2i16 (sub (v2i16 IntRegs:$src1), (v2i16 IntRegs:$src2))), + (A2_svsubh IntRegs:$src1, IntRegs:$src2)>; + def S2_asr_i_vw : vshift_v2i32<sra, "vasrw", 0b010, 0b000>; def S2_lsr_i_vw : vshift_v2i32<srl, "vlsrw", 0b010, 0b001>; def S2_asl_i_vw : vshift_v2i32<shl, "vaslw", 0b010, 0b010>; @@ -52,6 +86,87 @@ def S2_asr_i_vh : vshift_v4i16<sra, "vasrh", 0b100, 0b000>; def S2_lsr_i_vh : vshift_v4i16<srl, "vlsrh", 0b100, 0b001>; def S2_asl_i_vh : vshift_v4i16<shl, "vaslh", 0b100, 0b010>; + +def HexagonVSPLATB: SDNode<"HexagonISD::VSPLATB", SDTUnaryOp>; +def HexagonVSPLATH: SDNode<"HexagonISD::VSPLATH", SDTUnaryOp>; + +// Replicate the low 8 bits of the 32-bit input register into each of the +// four bytes of the 32-bit destination register. +def: Pat<(v4i8 (HexagonVSPLATB I32:$Rs)), (S2_vsplatrb I32:$Rs)>; + +// Replicate the low 16 bits of the 32-bit input register into each of the +// four halfwords of the 64-bit destination register. 
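Both splat forms have a compact scalar model via the multiply-by-replicating-constant trick; the sketch below is illustrative only and not part of the patch:

#include <cassert>
#include <cstdint>

// Scalar model of the splats matched here: S2_vsplatrb copies the low byte
// of Rs into all four bytes of a 32-bit result, and S2_vsplatrh copies the
// low halfword into all four halfwords of a 64-bit result.
uint32_t splatrb(uint32_t rs) { return (rs & 0xFFu) * 0x01010101u; }
uint64_t splatrh(uint32_t rs) {
  return (uint64_t(rs) & 0xFFFFu) * 0x0001000100010001ull;
}

int main() {
  assert(splatrb(0x12345678u) == 0x78787878u);
  assert(splatrh(0xDEADBEEFu) == 0xBEEFBEEFBEEFBEEFull);
  return 0;
}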
+def: Pat<(v4i16 (HexagonVSPLATH I32:$Rs)), (S2_vsplatrh I32:$Rs)>; + + +class VArith_pat <InstHexagon MI, SDNode Op, PatFrag Type> + : Pat <(Op Type:$Rss, Type:$Rtt), + (MI Type:$Rss, Type:$Rtt)>; + +def: VArith_pat <A2_vaddub, add, V8I8>; +def: VArith_pat <A2_vaddh, add, V4I16>; +def: VArith_pat <A2_vaddw, add, V2I32>; +def: VArith_pat <A2_vsubub, sub, V8I8>; +def: VArith_pat <A2_vsubh, sub, V4I16>; +def: VArith_pat <A2_vsubw, sub, V2I32>; + +def: VArith_pat <A2_and, and, V2I16>; +def: VArith_pat <A2_xor, xor, V2I16>; +def: VArith_pat <A2_or, or, V2I16>; + +def: VArith_pat <A2_andp, and, V8I8>; +def: VArith_pat <A2_andp, and, V4I16>; +def: VArith_pat <A2_andp, and, V2I32>; +def: VArith_pat <A2_orp, or, V8I8>; +def: VArith_pat <A2_orp, or, V4I16>; +def: VArith_pat <A2_orp, or, V2I32>; +def: VArith_pat <A2_xorp, xor, V8I8>; +def: VArith_pat <A2_xorp, xor, V4I16>; +def: VArith_pat <A2_xorp, xor, V2I32>; + +def: Pat<(v2i32 (sra V2I32:$b, (i64 (HexagonCOMBINE (i32 u5ImmPred:$c), + (i32 u5ImmPred:$c))))), + (S2_asr_i_vw V2I32:$b, imm:$c)>; +def: Pat<(v2i32 (srl V2I32:$b, (i64 (HexagonCOMBINE (i32 u5ImmPred:$c), + (i32 u5ImmPred:$c))))), + (S2_lsr_i_vw V2I32:$b, imm:$c)>; +def: Pat<(v2i32 (shl V2I32:$b, (i64 (HexagonCOMBINE (i32 u5ImmPred:$c), + (i32 u5ImmPred:$c))))), + (S2_asl_i_vw V2I32:$b, imm:$c)>; + +def: Pat<(v4i16 (sra V4I16:$b, (v4i16 (HexagonVSPLATH (i32 (u4ImmPred:$c)))))), + (S2_asr_i_vh V4I16:$b, imm:$c)>; +def: Pat<(v4i16 (srl V4I16:$b, (v4i16 (HexagonVSPLATH (i32 (u4ImmPred:$c)))))), + (S2_lsr_i_vh V4I16:$b, imm:$c)>; +def: Pat<(v4i16 (shl V4I16:$b, (v4i16 (HexagonVSPLATH (i32 (u4ImmPred:$c)))))), + (S2_asl_i_vh V4I16:$b, imm:$c)>; + + +def SDTHexagon_v2i32_v2i32_i32 : SDTypeProfile<1, 2, + [SDTCisSameAs<0, 1>, SDTCisVT<0, v2i32>, SDTCisInt<2>]>; +def SDTHexagon_v4i16_v4i16_i32 : SDTypeProfile<1, 2, + [SDTCisSameAs<0, 1>, SDTCisVT<0, v4i16>, SDTCisInt<2>]>; + +def HexagonVSRAW: SDNode<"HexagonISD::VSRAW", SDTHexagon_v2i32_v2i32_i32>; +def HexagonVSRAH: SDNode<"HexagonISD::VSRAH", SDTHexagon_v4i16_v4i16_i32>; +def HexagonVSRLW: SDNode<"HexagonISD::VSRLW", SDTHexagon_v2i32_v2i32_i32>; +def HexagonVSRLH: SDNode<"HexagonISD::VSRLH", SDTHexagon_v4i16_v4i16_i32>; +def HexagonVSHLW: SDNode<"HexagonISD::VSHLW", SDTHexagon_v2i32_v2i32_i32>; +def HexagonVSHLH: SDNode<"HexagonISD::VSHLH", SDTHexagon_v4i16_v4i16_i32>; + +def: Pat<(v2i32 (HexagonVSRAW V2I32:$Rs, u5ImmPred:$u5)), + (S2_asr_i_vw V2I32:$Rs, imm:$u5)>; +def: Pat<(v4i16 (HexagonVSRAH V4I16:$Rs, u4ImmPred:$u4)), + (S2_asr_i_vh V4I16:$Rs, imm:$u4)>; +def: Pat<(v2i32 (HexagonVSRLW V2I32:$Rs, u5ImmPred:$u5)), + (S2_lsr_i_vw V2I32:$Rs, imm:$u5)>; +def: Pat<(v4i16 (HexagonVSRLH V4I16:$Rs, u4ImmPred:$u4)), + (S2_lsr_i_vh V4I16:$Rs, imm:$u4)>; +def: Pat<(v2i32 (HexagonVSHLW V2I32:$Rs, u5ImmPred:$u5)), + (S2_asl_i_vw V2I32:$Rs, imm:$u5)>; +def: Pat<(v4i16 (HexagonVSHLH V4I16:$Rs, u4ImmPred:$u4)), + (S2_asl_i_vh V4I16:$Rs, imm:$u4)>; + // Vector shift words by register def S2_asr_r_vw : T_S3op_shiftVect < "vasrw", 0b00, 0b00>; def S2_lsr_r_vw : T_S3op_shiftVect < "vlsrw", 0b00, 0b01>; @@ -63,3 +178,306 @@ def S2_asr_r_vh : T_S3op_shiftVect < "vasrh", 0b01, 0b00>; def S2_lsr_r_vh : T_S3op_shiftVect < "vlsrh", 0b01, 0b01>; def S2_asl_r_vh : T_S3op_shiftVect < "vaslh", 0b01, 0b10>; def S2_lsl_r_vh : T_S3op_shiftVect < "vlslh", 0b01, 0b11>; + +class vshift_rr_pat<InstHexagon MI, SDNode Op, PatFrag Value> + : Pat <(Op Value:$Rs, I32:$Rt), + (MI Value:$Rs, I32:$Rt)>; + +def: vshift_rr_pat <S2_asr_r_vw, HexagonVSRAW, V2I32>; +def: vshift_rr_pat 
<S2_asr_r_vh, HexagonVSRAH, V4I16>; +def: vshift_rr_pat <S2_lsr_r_vw, HexagonVSRLW, V2I32>; +def: vshift_rr_pat <S2_lsr_r_vh, HexagonVSRLH, V4I16>; +def: vshift_rr_pat <S2_asl_r_vw, HexagonVSHLW, V2I32>; +def: vshift_rr_pat <S2_asl_r_vh, HexagonVSHLH, V4I16>; + + +def SDTHexagonVecCompare_v8i8 : SDTypeProfile<1, 2, + [SDTCisSameAs<1, 2>, SDTCisVT<0, i1>, SDTCisVT<1, v8i8>]>; +def SDTHexagonVecCompare_v4i16 : SDTypeProfile<1, 2, + [SDTCisSameAs<1, 2>, SDTCisVT<0, i1>, SDTCisVT<1, v4i16>]>; +def SDTHexagonVecCompare_v2i32 : SDTypeProfile<1, 2, + [SDTCisSameAs<1, 2>, SDTCisVT<0, i1>, SDTCisVT<1, v2i32>]>; + +def HexagonVCMPBEQ: SDNode<"HexagonISD::VCMPBEQ", SDTHexagonVecCompare_v8i8>; +def HexagonVCMPBGT: SDNode<"HexagonISD::VCMPBGT", SDTHexagonVecCompare_v8i8>; +def HexagonVCMPBGTU: SDNode<"HexagonISD::VCMPBGTU", SDTHexagonVecCompare_v8i8>; +def HexagonVCMPHEQ: SDNode<"HexagonISD::VCMPHEQ", SDTHexagonVecCompare_v4i16>; +def HexagonVCMPHGT: SDNode<"HexagonISD::VCMPHGT", SDTHexagonVecCompare_v4i16>; +def HexagonVCMPHGTU: SDNode<"HexagonISD::VCMPHGTU", SDTHexagonVecCompare_v4i16>; +def HexagonVCMPWEQ: SDNode<"HexagonISD::VCMPWEQ", SDTHexagonVecCompare_v2i32>; +def HexagonVCMPWGT: SDNode<"HexagonISD::VCMPWGT", SDTHexagonVecCompare_v2i32>; +def HexagonVCMPWGTU: SDNode<"HexagonISD::VCMPWGTU", SDTHexagonVecCompare_v2i32>; + + +class vcmp_i1_pat<InstHexagon MI, SDNode Op, PatFrag Value> + : Pat <(i1 (Op Value:$Rs, Value:$Rt)), + (MI Value:$Rs, Value:$Rt)>; + +def: vcmp_i1_pat<A2_vcmpbeq, HexagonVCMPBEQ, V8I8>; +def: vcmp_i1_pat<A4_vcmpbgt, HexagonVCMPBGT, V8I8>; +def: vcmp_i1_pat<A2_vcmpbgtu, HexagonVCMPBGTU, V8I8>; + +def: vcmp_i1_pat<A2_vcmpheq, HexagonVCMPHEQ, V4I16>; +def: vcmp_i1_pat<A2_vcmphgt, HexagonVCMPHGT, V4I16>; +def: vcmp_i1_pat<A2_vcmphgtu, HexagonVCMPHGTU, V4I16>; + +def: vcmp_i1_pat<A2_vcmpweq, HexagonVCMPWEQ, V2I32>; +def: vcmp_i1_pat<A2_vcmpwgt, HexagonVCMPWGT, V2I32>; +def: vcmp_i1_pat<A2_vcmpwgtu, HexagonVCMPWGTU, V2I32>; + + +class vcmp_vi1_pat<InstHexagon MI, PatFrag Op, PatFrag InVal, ValueType OutTy> + : Pat <(OutTy (Op InVal:$Rs, InVal:$Rt)), + (MI InVal:$Rs, InVal:$Rt)>; + +def: vcmp_vi1_pat<A2_vcmpweq, seteq, V2I32, v2i1>; +def: vcmp_vi1_pat<A2_vcmpwgt, setgt, V2I32, v2i1>; +def: vcmp_vi1_pat<A2_vcmpwgtu, setugt, V2I32, v2i1>; + +def: vcmp_vi1_pat<A2_vcmpheq, seteq, V4I16, v4i1>; +def: vcmp_vi1_pat<A2_vcmphgt, setgt, V4I16, v4i1>; +def: vcmp_vi1_pat<A2_vcmphgtu, setugt, V4I16, v4i1>; + + +// Hexagon doesn't have a vector multiply with C semantics. +// Instead, generate a pseudo instruction that gets expanded into two +// scalar MPYI instructions. +// This is expanded by ExpandPostRAPseudos. +let isPseudo = 1 in +def VMULW : PseudoM<(outs DoubleRegs:$Rd), + (ins DoubleRegs:$Rs, DoubleRegs:$Rt), + ".error \"Should never try to emit VMULW\"", + [(set V2I32:$Rd, (mul V2I32:$Rs, V2I32:$Rt))]>; + +let isPseudo = 1 in +def VMULW_ACC : PseudoM<(outs DoubleRegs:$Rd), + (ins DoubleRegs:$Rx, DoubleRegs:$Rs, DoubleRegs:$Rt), + ".error \"Should never try to emit VMULW_ACC\"", + [(set V2I32:$Rd, (add V2I32:$Rx, (mul V2I32:$Rs, V2I32:$Rt)))], + "$Rd = $Rx">; + +// Adds two v4i8: Hexagon does not have an insn for this one, so we +// use the double add v8i8, and use only the low part of the result. +def: Pat<(v4i8 (add (v4i8 IntRegs:$Rs), (v4i8 IntRegs:$Rt))), + (LoReg (A2_vaddub (Zext64 $Rs), (Zext64 $Rt)))>; + +// Subtract two v4i8: Hexagon does not have an insn for this one, so we +// use the double sub v8i8, and use only the low part of the result. 
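The add pattern above and the subtract pattern below share one idiom: zero-extend the 32-bit register into a 64-bit pair (Zext64), run the eight-lane byte operation (A2_vaddub / A2_vsubub), and keep only the low word (LoReg). A stand-alone scalar model of the add case (illustrative only, not part of the patch):

#include <cassert>
#include <cstdint>

// Per-byte add with wraparound, modeling A2_vaddub on a 64-bit register.
uint64_t vaddub(uint64_t a, uint64_t b) {
  uint64_t r = 0;
  for (int i = 0; i < 8; ++i) {
    uint64_t s = ((a >> (8 * i)) & 0xFF) + ((b >> (8 * i)) & 0xFF);
    r |= (s & 0xFF) << (8 * i); // a carry never crosses into the next lane
  }
  return r;
}

// v4i8 add: Zext64 is a plain zero extension, LoReg a plain truncation.
uint32_t addv4i8(uint32_t a, uint32_t b) {
  return uint32_t(vaddub(a, b));
}

int main() {
  // 0xFF + 0x01 wraps to 0x00 inside its lane without disturbing the rest.
  assert(addv4i8(0x00FF0102u, 0x00010304u) == 0x00000406u);
  return 0;
}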
+def: Pat<(v4i8 (sub (v4i8 IntRegs:$Rs), (v4i8 IntRegs:$Rt))), + (LoReg (A2_vsubub (Zext64 $Rs), (Zext64 $Rt)))>; + +// +// No 32 bit vector mux. +// +def: Pat<(v4i8 (select I1:$Pu, V4I8:$Rs, V4I8:$Rt)), + (LoReg (C2_vmux I1:$Pu, (Zext64 $Rs), (Zext64 $Rt)))>; +def: Pat<(v2i16 (select I1:$Pu, V2I16:$Rs, V2I16:$Rt)), + (LoReg (C2_vmux I1:$Pu, (Zext64 $Rs), (Zext64 $Rt)))>; + +// +// 64-bit vector mux. +// +def: Pat<(v8i8 (vselect V8I1:$Pu, V8I8:$Rs, V8I8:$Rt)), + (C2_vmux V8I1:$Pu, V8I8:$Rs, V8I8:$Rt)>; +def: Pat<(v4i16 (vselect V4I1:$Pu, V4I16:$Rs, V4I16:$Rt)), + (C2_vmux V4I1:$Pu, V4I16:$Rs, V4I16:$Rt)>; +def: Pat<(v2i32 (vselect V2I1:$Pu, V2I32:$Rs, V2I32:$Rt)), + (C2_vmux V2I1:$Pu, V2I32:$Rs, V2I32:$Rt)>; + +// +// No 32 bit vector compare. +// +def: Pat<(i1 (seteq V4I8:$Rs, V4I8:$Rt)), + (A2_vcmpbeq (Zext64 $Rs), (Zext64 $Rt))>; +def: Pat<(i1 (setgt V4I8:$Rs, V4I8:$Rt)), + (A4_vcmpbgt (Zext64 $Rs), (Zext64 $Rt))>; +def: Pat<(i1 (setugt V4I8:$Rs, V4I8:$Rt)), + (A2_vcmpbgtu (Zext64 $Rs), (Zext64 $Rt))>; + +def: Pat<(i1 (seteq V2I16:$Rs, V2I16:$Rt)), + (A2_vcmpheq (Zext64 $Rs), (Zext64 $Rt))>; +def: Pat<(i1 (setgt V2I16:$Rs, V2I16:$Rt)), + (A2_vcmphgt (Zext64 $Rs), (Zext64 $Rt))>; +def: Pat<(i1 (setugt V2I16:$Rs, V2I16:$Rt)), + (A2_vcmphgtu (Zext64 $Rs), (Zext64 $Rt))>; + + +class InvertCmp_pat<InstHexagon InvMI, PatFrag CmpOp, PatFrag Value, + ValueType CmpTy> + : Pat<(CmpTy (CmpOp Value:$Rs, Value:$Rt)), + (InvMI Value:$Rt, Value:$Rs)>; + +// Map from a compare operation to the corresponding instruction with the +// order of operands reversed, e.g. x > y --> cmp.lt(y,x). +def: InvertCmp_pat<A4_vcmpbgt, setlt, V8I8, i1>; +def: InvertCmp_pat<A4_vcmpbgt, setlt, V8I8, v8i1>; +def: InvertCmp_pat<A2_vcmphgt, setlt, V4I16, i1>; +def: InvertCmp_pat<A2_vcmphgt, setlt, V4I16, v4i1>; +def: InvertCmp_pat<A2_vcmpwgt, setlt, V2I32, i1>; +def: InvertCmp_pat<A2_vcmpwgt, setlt, V2I32, v2i1>; + +def: InvertCmp_pat<A2_vcmpbgtu, setult, V8I8, i1>; +def: InvertCmp_pat<A2_vcmpbgtu, setult, V8I8, v8i1>; +def: InvertCmp_pat<A2_vcmphgtu, setult, V4I16, i1>; +def: InvertCmp_pat<A2_vcmphgtu, setult, V4I16, v4i1>; +def: InvertCmp_pat<A2_vcmpwgtu, setult, V2I32, i1>; +def: InvertCmp_pat<A2_vcmpwgtu, setult, V2I32, v2i1>; + +// Map from vcmpne(Rss) -> !vcmpew(Rss). +// rs != rt -> !(rs == rt). 
+def: Pat<(v2i1 (setne V2I32:$Rs, V2I32:$Rt)), + (C2_not (v2i1 (A2_vcmpweq V2I32:$Rs, V2I32:$Rt)))>; + + +// Truncate: from vector B copy all 'E'ven 'B'yte elements: +// A[0] = B[0]; A[1] = B[2]; A[2] = B[4]; A[3] = B[6]; +def: Pat<(v4i8 (trunc V4I16:$Rs)), + (S2_vtrunehb V4I16:$Rs)>; + +// Truncate: from vector B copy all 'O'dd 'B'yte elements: +// A[0] = B[1]; A[1] = B[3]; A[2] = B[5]; A[3] = B[7]; +// S2_vtrunohb + +// Truncate: from vectors B and C copy all 'E'ven 'H'alf-word elements: +// A[0] = B[0]; A[1] = B[2]; A[2] = C[0]; A[3] = C[2]; +// S2_vtruneh + +def: Pat<(v2i16 (trunc V2I32:$Rs)), + (LoReg (S2_packhl (HiReg $Rs), (LoReg $Rs)))>; + + +def HexagonVSXTBH : SDNode<"HexagonISD::VSXTBH", SDTUnaryOp>; +def HexagonVSXTBW : SDNode<"HexagonISD::VSXTBW", SDTUnaryOp>; + +def: Pat<(i64 (HexagonVSXTBH I32:$Rs)), (S2_vsxtbh I32:$Rs)>; +def: Pat<(i64 (HexagonVSXTBW I32:$Rs)), (S2_vsxthw I32:$Rs)>; + +def: Pat<(v4i16 (zext V4I8:$Rs)), (S2_vzxtbh V4I8:$Rs)>; +def: Pat<(v2i32 (zext V2I16:$Rs)), (S2_vzxthw V2I16:$Rs)>; +def: Pat<(v4i16 (anyext V4I8:$Rs)), (S2_vzxtbh V4I8:$Rs)>; +def: Pat<(v2i32 (anyext V2I16:$Rs)), (S2_vzxthw V2I16:$Rs)>; +def: Pat<(v4i16 (sext V4I8:$Rs)), (S2_vsxtbh V4I8:$Rs)>; +def: Pat<(v2i32 (sext V2I16:$Rs)), (S2_vsxthw V2I16:$Rs)>; + +// Sign extends a v2i8 into a v2i32. +def: Pat<(v2i32 (sext_inreg V2I32:$Rs, v2i8)), + (A2_combinew (A2_sxtb (HiReg $Rs)), (A2_sxtb (LoReg $Rs)))>; + +// Sign extends a v2i16 into a v2i32. +def: Pat<(v2i32 (sext_inreg V2I32:$Rs, v2i16)), + (A2_combinew (A2_sxth (HiReg $Rs)), (A2_sxth (LoReg $Rs)))>; + + +// Multiplies two v2i16 and returns a v2i32. We use the saturating +// multiply here, as Hexagon does not provide a non-saturating vector +// multiply, and saturation cannot affect a result that is held in +// double the precision of the operands. + +// Multiplies two v2i16 vectors: as Hexagon does not have a multiply +// with C semantics for this one, this pattern uses the halfword +// multiply vmpyh, which takes two v2i16 and returns a v2i32. The +// result is then truncated back into a v2i16, simulating the +// wrap-around semantics of C's unsigned multiply. +def vmpyh: OutPatFrag<(ops node:$Rs, node:$Rt), + (M2_vmpy2s_s0 (i32 $Rs), (i32 $Rt))>; + +def: Pat<(v2i16 (mul V2I16:$Rs, V2I16:$Rt)), + (LoReg (S2_vtrunewh (v2i32 (A2_combineii 0, 0)), + (v2i32 (vmpyh V2I16:$Rs, V2I16:$Rt))))>; + +// Multiplies two v4i16 vectors. +def: Pat<(v4i16 (mul V4I16:$Rs, V4I16:$Rt)), + (S2_vtrunewh (vmpyh (HiReg $Rs), (HiReg $Rt)), + (vmpyh (LoReg $Rs), (LoReg $Rt)))>; + +def VMPYB_no_V5: OutPatFrag<(ops node:$Rs, node:$Rt), + (S2_vtrunewh (vmpyh (HiReg (S2_vsxtbh $Rs)), (HiReg (S2_vsxtbh $Rt))), + (vmpyh (LoReg (S2_vsxtbh $Rs)), (LoReg (S2_vsxtbh $Rt))))>; + +// Multiplies two v4i8 vectors. +def: Pat<(v4i8 (mul V4I8:$Rs, V4I8:$Rt)), + (S2_vtrunehb (M5_vmpybsu V4I8:$Rs, V4I8:$Rt))>, + Requires<[HasV5T]>; + +def: Pat<(v4i8 (mul V4I8:$Rs, V4I8:$Rt)), + (S2_vtrunehb (VMPYB_no_V5 V4I8:$Rs, V4I8:$Rt))>; + +// Multiplies two v8i8 vectors. 
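Before the v8i8 multiply pattern announced by the comment above, here is a stand-alone scalar model of the trunc-of-widened-product idiom used for v2i16 (illustrative only, not part of the patch); the v8i8 patterns that follow apply the same idea per byte pair:

#include <cassert>
#include <cstdint>

// vmpyh widens each pair of i16 lanes to a full 32-bit product (which
// cannot saturate at shift 0, since |a*b| <= 2^30), and the vtrunewh step
// keeps the low 16 bits of each product: exactly C's wraparound multiply.
uint32_t mulv2i16(uint32_t a, uint32_t b) {
  uint32_t r = 0;
  for (int i = 0; i < 2; ++i) {
    int32_t p = int32_t(int16_t(a >> (16 * i))) * int16_t(b >> (16 * i));
    r |= (uint32_t(p) & 0xFFFFu) << (16 * i); // truncate back into the lane
  }
  return r;
}

int main() {
  // Lane 0: 0x7FFF * 2 wraps to 0xFFFE; lane 1: 3 * 4 = 0x000C.
  assert(mulv2i16(0x00037FFFu, 0x00040002u) == 0x000CFFFEu);
  return 0;
}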
+def: Pat<(v8i8 (mul V8I8:$Rs, V8I8:$Rt)), + (A2_combinew (S2_vtrunehb (M5_vmpybsu (HiReg $Rs), (HiReg $Rt))), + (S2_vtrunehb (M5_vmpybsu (LoReg $Rs), (LoReg $Rt))))>, + Requires<[HasV5T]>; + +def: Pat<(v8i8 (mul V8I8:$Rs, V8I8:$Rt)), + (A2_combinew (S2_vtrunehb (VMPYB_no_V5 (HiReg $Rs), (HiReg $Rt))), + (S2_vtrunehb (VMPYB_no_V5 (LoReg $Rs), (LoReg $Rt))))>; + + +class shuffler<SDNode Op, string Str> + : SInst<(outs DoubleRegs:$a), (ins DoubleRegs:$b, DoubleRegs:$c), + "$a = " # Str # "($b, $c)", + [(set (i64 DoubleRegs:$a), + (i64 (Op (i64 DoubleRegs:$b), (i64 DoubleRegs:$c))))], + "", S_3op_tc_1_SLOT23>; + +def SDTHexagonBinOp64 : SDTypeProfile<1, 2, + [SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisVT<0, i64>]>; + +def HexagonSHUFFEB: SDNode<"HexagonISD::SHUFFEB", SDTHexagonBinOp64>; +def HexagonSHUFFEH: SDNode<"HexagonISD::SHUFFEH", SDTHexagonBinOp64>; +def HexagonSHUFFOB: SDNode<"HexagonISD::SHUFFOB", SDTHexagonBinOp64>; +def HexagonSHUFFOH: SDNode<"HexagonISD::SHUFFOH", SDTHexagonBinOp64>; + +class ShufflePat<InstHexagon MI, SDNode Op> + : Pat<(i64 (Op DoubleRegs:$src1, DoubleRegs:$src2)), + (i64 (MI DoubleRegs:$src1, DoubleRegs:$src2))>; + +// Shuffles even bytes for i=0..3: A[2*i].b = C[2*i].b; A[2*i+1].b = B[2*i].b +def: ShufflePat<S2_shuffeb, HexagonSHUFFEB>; + +// Shuffles odd bytes for i=0..3: A[2*i].b = C[2*i+1].b; A[2*i+1].b = B[2*i+1].b +def: ShufflePat<S2_shuffob, HexagonSHUFFOB>; + +// Shuffles even half for i=0,1: A[2*i].h = C[2*i].h; A[2*i+1].h = B[2*i].h +def: ShufflePat<S2_shuffeh, HexagonSHUFFEH>; + +// Shuffles odd half for i=0,1: A[2*i].h = C[2*i+1].h; A[2*i+1].h = B[2*i+1].h +def: ShufflePat<S2_shuffoh, HexagonSHUFFOH>; + + +// Truncated store from v4i16 to v4i8. +def truncstorev4i8: PatFrag<(ops node:$val, node:$ptr), + (truncstore node:$val, node:$ptr), + [{ return cast<StoreSDNode>(N)->getMemoryVT() == MVT::v4i8; }]>; + +// Truncated store from v2i32 to v2i16. +def truncstorev2i16: PatFrag<(ops node:$val, node:$ptr), + (truncstore node:$val, node:$ptr), + [{ return cast<StoreSDNode>(N)->getMemoryVT() == MVT::v2i16; }]>; + +def: Pat<(truncstorev2i16 V2I32:$Rs, I32:$Rt), + (S2_storeri_io I32:$Rt, 0, (LoReg (S2_packhl (HiReg $Rs), + (LoReg $Rs))))>; + +def: Pat<(truncstorev4i8 V4I16:$Rs, I32:$Rt), + (S2_storeri_io I32:$Rt, 0, (S2_vtrunehb V4I16:$Rs))>; + + +// Zero and sign extended load from v2i8 into v2i16. 
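The PatFrags and patterns that follow fetch both bytes with a single 16-bit load and then widen each byte into its own lane. A scalar model (illustrative only; assumes little-endian byte order, as on Hexagon):

#include <cassert>
#include <cstdint>
#include <cstring>

// Model of the v2i8 extending loads: one 16-bit load (L2_loadruh_io /
// L2_loadrh_io), then per-byte zero or sign extension into 16-bit lanes
// (S2_vzxtbh / S2_vsxtbh), keeping the low half of the widened result.
uint32_t zextload_v2i8(const uint8_t *p) {
  uint16_t raw;
  std::memcpy(&raw, p, sizeof raw); // the single 16-bit load
  uint32_t lo = raw & 0xFFu;        // byte 0 -> lane 0
  uint32_t hi = (raw >> 8) & 0xFFu; // byte 1 -> lane 1
  return (hi << 16) | lo;
}
uint32_t sextload_v2i8(const uint8_t *p) {
  uint16_t raw;
  std::memcpy(&raw, p, sizeof raw);
  uint32_t lo = uint16_t(int8_t(raw & 0xFF));
  uint32_t hi = uint16_t(int8_t(raw >> 8));
  return (hi << 16) | lo;
}

int main() {
  const uint8_t mem[2] = {0x80, 0x01};
  assert(zextload_v2i8(mem) == 0x00010080u); // 0x80 stays 0x0080
  assert(sextload_v2i8(mem) == 0x0001FF80u); // 0x80 sign-extends to 0xFF80
  return 0;
}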
+def zextloadv2i8: PatFrag<(ops node:$ptr), (zextload node:$ptr), + [{ return cast<LoadSDNode>(N)->getMemoryVT() == MVT::v2i8; }]>; + +def sextloadv2i8: PatFrag<(ops node:$ptr), (sextload node:$ptr), + [{ return cast<LoadSDNode>(N)->getMemoryVT() == MVT::v2i8; }]>; + +def: Pat<(v2i16 (zextloadv2i8 I32:$Rs)), + (LoReg (v4i16 (S2_vzxtbh (L2_loadruh_io I32:$Rs, 0))))>; + +def: Pat<(v2i16 (sextloadv2i8 I32:$Rs)), + (LoReg (v4i16 (S2_vsxtbh (L2_loadrh_io I32:$Rs, 0))))>; + +def: Pat<(v2i32 (zextloadv2i8 I32:$Rs)), + (S2_vzxthw (LoReg (v4i16 (S2_vzxtbh (L2_loadruh_io I32:$Rs, 0)))))>; + +def: Pat<(v2i32 (sextloadv2i8 I32:$Rs)), + (S2_vsxthw (LoReg (v4i16 (S2_vsxtbh (L2_loadrh_io I32:$Rs, 0)))))>; diff --git a/lib/Target/Hexagon/HexagonIntrinsics.td b/lib/Target/Hexagon/HexagonIntrinsics.td index c0551e8..4275230 100644 --- a/lib/Target/Hexagon/HexagonIntrinsics.td +++ b/lib/Target/Hexagon/HexagonIntrinsics.td @@ -690,16 +690,15 @@ def: T_RR_pat<A2_combine_hl, int_hexagon_A2_combine_hl>; def: T_RR_pat<A2_combine_lh, int_hexagon_A2_combine_lh>; def: T_RR_pat<A2_combine_ll, int_hexagon_A2_combine_ll>; -def: T_II_pat<A2_combineii, int_hexagon_A2_combineii, s8ExtPred, s8ImmPred>; +def: T_II_pat<A2_combineii, int_hexagon_A2_combineii, s32ImmPred, s8ImmPred>; -def: Pat<(i32 (int_hexagon_C2_mux (I32:$Rp), (I32:$Rs), - (I32:$Rt))), +def: Pat<(i32 (int_hexagon_C2_mux (I32:$Rp), (I32:$Rs), (I32:$Rt))), (i32 (C2_mux (C2_tfrrp IntRegs:$Rp), IntRegs:$Rs, IntRegs:$Rt))>; // Mux -def : T_QRI_pat<C2_muxir, int_hexagon_C2_muxir, s8ExtPred>; -def : T_QIR_pat<C2_muxri, int_hexagon_C2_muxri, s8ExtPred>; -def : T_QII_pat<C2_muxii, int_hexagon_C2_muxii, s8ExtPred, s8ImmPred>; +def : T_QRI_pat<C2_muxir, int_hexagon_C2_muxir, s32ImmPred>; +def : T_QIR_pat<C2_muxri, int_hexagon_C2_muxri, s32ImmPred>; +def : T_QII_pat<C2_muxii, int_hexagon_C2_muxii, s32ImmPred, s8ImmPred>; // Shift halfword def : T_R_pat<A2_aslh, int_hexagon_A2_aslh>; @@ -720,17 +719,17 @@ def : T_RR_pat<C2_cmpeq, int_hexagon_C2_cmpeq>; def : T_RR_pat<C2_cmpgt, int_hexagon_C2_cmpgt>; def : T_RR_pat<C2_cmpgtu, int_hexagon_C2_cmpgtu>; -def : T_RI_pat<C2_cmpeqi, int_hexagon_C2_cmpeqi, s10ExtPred>; -def : T_RI_pat<C2_cmpgti, int_hexagon_C2_cmpgti, s10ExtPred>; -def : T_RI_pat<C2_cmpgtui, int_hexagon_C2_cmpgtui, u9ExtPred>; +def : T_RI_pat<C2_cmpeqi, int_hexagon_C2_cmpeqi, s32ImmPred>; +def : T_RI_pat<C2_cmpgti, int_hexagon_C2_cmpgti, s32ImmPred>; +def : T_RI_pat<C2_cmpgtui, int_hexagon_C2_cmpgtui, u32ImmPred>; -def : Pat <(i32 (int_hexagon_C2_cmpgei (I32:$src1), s8ExtPred:$src2)), +def : Pat <(i32 (int_hexagon_C2_cmpgei (I32:$src1), s32ImmPred:$src2)), (i32 (C2_cmpgti (I32:$src1), - (DEC_CONST_SIGNED s8ExtPred:$src2)))>; + (DEC_CONST_SIGNED s32ImmPred:$src2)))>; -def : Pat <(i32 (int_hexagon_C2_cmpgeui (I32:$src1), u8ExtPred:$src2)), +def : Pat <(i32 (int_hexagon_C2_cmpgeui (I32:$src1), u32ImmPred:$src2)), (i32 (C2_cmpgtui (I32:$src1), - (DEC_CONST_UNSIGNED u8ExtPred:$src2)))>; + (DEC_CONST_UNSIGNED u32ImmPred:$src2)))>; // The instruction, Pd=cmp.geu(Rs, #u8) -> Pd=cmp.eq(Rs,Rs) when #u8 == 0. 
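The zero special case exists because the DEC_CONST_UNSIGNED rewrite used just above (x >= c becomes x > c - 1) wraps around at c == 0, while x >= 0 is unconditionally true for unsigned x, which is what the cmp.eq(Rs,Rs) form encodes. A quick stand-alone check (illustrative only):

#include <cassert>
#include <cstdint>

// For unsigned x:  x >= c  <=>  x > c - 1  only when c != 0; at c == 0 the
// decrement wraps to UINT32_MAX, so that case needs its own pattern.
int main() {
  for (uint32_t x = 0; x < 300; ++x) {
    for (uint32_t c = 1; c < 300; ++c)
      assert((x >= c) == (x > c - 1)); // the DEC_CONST_UNSIGNED rewrite
    assert(x >= 0u); // always true: encoded as cmp.eq(Rs, Rs)
  }
  return 0;
}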
def : Pat <(i32 (int_hexagon_C2_cmpgeui (I32:$src1), 0)), @@ -1258,6 +1257,30 @@ def: Pat<(i32 (int_hexagon_S2_storew_locked (I32:$Rs), (I32:$Rt))), def: Pat<(i32 (int_hexagon_S4_stored_locked (I32:$Rs), (I64:$Rt))), (i32 (C2_tfrpr (S4_stored_locked (I32:$Rs), (I64:$Rt))))>; +/******************************************************************** +* ST +*********************************************************************/ + +class T_stb_pat <InstHexagon MI, Intrinsic IntID, PatLeaf Val> + : Pat<(IntID I32:$Rs, Val:$Rt, I32:$Ru), + (MI I32:$Rs, Val:$Rt, I32:$Ru)>; + +def : T_stb_pat <S2_storerh_pbr_pseudo, int_hexagon_brev_sth, I32>; +def : T_stb_pat <S2_storerb_pbr_pseudo, int_hexagon_brev_stb, I32>; +def : T_stb_pat <S2_storeri_pbr_pseudo, int_hexagon_brev_stw, I32>; +def : T_stb_pat <S2_storerf_pbr_pseudo, int_hexagon_brev_sthhi, I32>; +def : T_stb_pat <S2_storerd_pbr_pseudo, int_hexagon_brev_std, I64>; + +class T_stc_pat <InstHexagon MI, Intrinsic IntID, PatLeaf Imm, PatLeaf Val> + : Pat<(IntID I32:$Rs, Val:$Rt, I32:$Ru, Imm:$s), + (MI I32:$Rs, Val:$Rt, I32:$Ru, Imm:$s)>; + +def: T_stc_pat<S2_storerb_pci_pseudo, int_hexagon_circ_stb, s4_0ImmPred, I32>; +def: T_stc_pat<S2_storerh_pci_pseudo, int_hexagon_circ_sth, s4_1ImmPred, I32>; +def: T_stc_pat<S2_storeri_pci_pseudo, int_hexagon_circ_stw, s4_2ImmPred, I32>; +def: T_stc_pat<S2_storerd_pci_pseudo, int_hexagon_circ_std, s4_3ImmPred, I64>; +def: T_stc_pat<S2_storerf_pci_pseudo, int_hexagon_circ_sthhi, s4_1ImmPred, I32>; + include "HexagonIntrinsicsV3.td" include "HexagonIntrinsicsV4.td" include "HexagonIntrinsicsV5.td" diff --git a/lib/Target/Hexagon/HexagonIntrinsicsV4.td b/lib/Target/Hexagon/HexagonIntrinsicsV4.td index 8d068eb..c80a188 100644 --- a/lib/Target/Hexagon/HexagonIntrinsicsV4.td +++ b/lib/Target/Hexagon/HexagonIntrinsicsV4.td @@ -234,17 +234,17 @@ def: T_RR_pat<A4_orn, int_hexagon_A4_orn>; *********************************************************************/ // Combine Words Into Doublewords. 
-def: T_RI_pat<A4_combineri, int_hexagon_A4_combineri, s8ExtPred>; -def: T_IR_pat<A4_combineir, int_hexagon_A4_combineir, s8ExtPred>; +def: T_RI_pat<A4_combineri, int_hexagon_A4_combineri, s32ImmPred>; +def: T_IR_pat<A4_combineir, int_hexagon_A4_combineir, s32ImmPred>; /******************************************************************** * ALU32/PRED * *********************************************************************/ // Compare -def : T_RI_pat<C4_cmpneqi, int_hexagon_C4_cmpneqi, s10ExtPred>; -def : T_RI_pat<C4_cmpltei, int_hexagon_C4_cmpltei, s10ExtPred>; -def : T_RI_pat<C4_cmplteui, int_hexagon_C4_cmplteui, u9ExtPred>; +def : T_RI_pat<C4_cmpneqi, int_hexagon_C4_cmpneqi, s32ImmPred>; +def : T_RI_pat<C4_cmpltei, int_hexagon_C4_cmpltei, s32ImmPred>; +def : T_RI_pat<C4_cmplteui, int_hexagon_C4_cmplteui, u32ImmPred>; def: T_RR_pat<A4_rcmpeq, int_hexagon_A4_rcmpeq>; def: T_RR_pat<A4_rcmpneq, int_hexagon_A4_rcmpneq>; diff --git a/lib/Target/Hexagon/HexagonNewValueJump.cpp b/lib/Target/Hexagon/HexagonNewValueJump.cpp index 806d448..81af4db 100644 --- a/lib/Target/Hexagon/HexagonNewValueJump.cpp +++ b/lib/Target/Hexagon/HexagonNewValueJump.cpp @@ -40,6 +40,7 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetRegisterInfo.h" @@ -199,10 +200,7 @@ static bool commonChecksToProhibitNewValueJump(bool afterRA, // of registers by individual passes in the backend. At this time, // we don't know the scope of usage and definitions of these // instructions. - if (MII->getOpcode() == Hexagon::TFR_condset_ii || - MII->getOpcode() == Hexagon::TFR_condset_ri || - MII->getOpcode() == Hexagon::TFR_condset_ir || - MII->getOpcode() == Hexagon::LDriw_pred || + if (MII->getOpcode() == Hexagon::LDriw_pred || MII->getOpcode() == Hexagon::STriw_pred) return false; } diff --git a/lib/Target/Hexagon/HexagonOperands.td b/lib/Target/Hexagon/HexagonOperands.td index 318ca72..450f594 100644 --- a/lib/Target/Hexagon/HexagonOperands.td +++ b/lib/Target/Hexagon/HexagonOperands.td @@ -66,162 +66,131 @@ def nOneImm : Operand<i32>; // Immediate predicates // def s32ImmPred : PatLeaf<(i32 imm), [{ - // s32ImmPred predicate - True if the immediate fits in a 32-bit sign extended - // field. int64_t v = (int64_t)N->getSExtValue(); return isInt<32>(v); }]>; -def s32_24ImmPred : PatLeaf<(i32 imm), [{ - // s32_24ImmPred predicate - True if the immediate fits in a 32-bit sign - // extended field that is a multiple of 0x1000000. 
+def s32_0ImmPred : PatLeaf<(i32 imm), [{ + int64_t v = (int64_t)N->getSExtValue(); + return isInt<32>(v); +}]>; + +def s31_1ImmPred : PatLeaf<(i32 imm), [{ + int64_t v = (int64_t)N->getSExtValue(); + return isShiftedInt<31,1>(v); +}]>; + +def s30_2ImmPred : PatLeaf<(i32 imm), [{ + int64_t v = (int64_t)N->getSExtValue(); + return isShiftedInt<30,2>(v); +}]>; + +def s29_3ImmPred : PatLeaf<(i32 imm), [{ + int64_t v = (int64_t)N->getSExtValue(); + return isShiftedInt<29,3>(v); +}]>; + +def s22_10ImmPred : PatLeaf<(i32 imm), [{ + int64_t v = (int64_t)N->getSExtValue(); + return isShiftedInt<22,10>(v); +}]>; + +def s8_24ImmPred : PatLeaf<(i32 imm), [{ int64_t v = (int64_t)N->getSExtValue(); - return isShiftedInt<32,24>(v); + return isShiftedInt<8,24>(v); }]>; -def s32_16s8ImmPred : PatLeaf<(i32 imm), [{ - // s32_16s8ImmPred predicate - True if the immediate fits in a 32-bit sign - // extended field that is a multiple of 0x10000. +def s16_16ImmPred : PatLeaf<(i32 imm), [{ int64_t v = (int64_t)N->getSExtValue(); - return isShiftedInt<24,16>(v); + return isShiftedInt<16,16>(v); }]>; def s26_6ImmPred : PatLeaf<(i32 imm), [{ - // s26_6ImmPred predicate - True if the immediate fits in a 32-bit - // sign extended field. int64_t v = (int64_t)N->getSExtValue(); return isShiftedInt<26,6>(v); }]>; - def s16ImmPred : PatLeaf<(i32 imm), [{ - // s16ImmPred predicate - True if the immediate fits in a 16-bit sign extended - // field. int64_t v = (int64_t)N->getSExtValue(); return isInt<16>(v); }]>; - def s13ImmPred : PatLeaf<(i32 imm), [{ - // s13ImmPred predicate - True if the immediate fits in a 13-bit sign extended - // field. int64_t v = (int64_t)N->getSExtValue(); return isInt<13>(v); }]>; - def s12ImmPred : PatLeaf<(i32 imm), [{ - // s12ImmPred predicate - True if the immediate fits in a 12-bit - // sign extended field. int64_t v = (int64_t)N->getSExtValue(); return isInt<12>(v); }]>; def s11_0ImmPred : PatLeaf<(i32 imm), [{ - // s11_0ImmPred predicate - True if the immediate fits in a 11-bit - // sign extended field. int64_t v = (int64_t)N->getSExtValue(); return isInt<11>(v); }]>; - def s11_1ImmPred : PatLeaf<(i32 imm), [{ - // s11_1ImmPred predicate - True if the immediate fits in a 12-bit - // sign extended field and is a multiple of 2. int64_t v = (int64_t)N->getSExtValue(); return isShiftedInt<11,1>(v); }]>; - def s11_2ImmPred : PatLeaf<(i32 imm), [{ - // s11_2ImmPred predicate - True if the immediate fits in a 13-bit - // sign extended field and is a multiple of 4. int64_t v = (int64_t)N->getSExtValue(); return isShiftedInt<11,2>(v); }]>; - def s11_3ImmPred : PatLeaf<(i32 imm), [{ - // s11_3ImmPred predicate - True if the immediate fits in a 14-bit - // sign extended field and is a multiple of 8. int64_t v = (int64_t)N->getSExtValue(); return isShiftedInt<11,3>(v); }]>; - def s10ImmPred : PatLeaf<(i32 imm), [{ - // s10ImmPred predicate - True if the immediate fits in a 10-bit sign extended - // field. int64_t v = (int64_t)N->getSExtValue(); return isInt<10>(v); }]>; - def s9ImmPred : PatLeaf<(i32 imm), [{ - // s9ImmPred predicate - True if the immediate fits in a 9-bit sign extended - // field. int64_t v = (int64_t)N->getSExtValue(); return isInt<9>(v); }]>; def m9ImmPred : PatLeaf<(i32 imm), [{ - // m9ImmPred predicate - True if the immediate fits in a 9-bit magnitude - // field. The range of m9 is -255 to 255. 
int64_t v = (int64_t)N->getSExtValue(); return isInt<9>(v) && (v != -256); }]>; def s8ImmPred : PatLeaf<(i32 imm), [{ - // s8ImmPred predicate - True if the immediate fits in a 8-bit sign extended - // field. int64_t v = (int64_t)N->getSExtValue(); return isInt<8>(v); }]>; - def s8Imm64Pred : PatLeaf<(i64 imm), [{ - // s8ImmPred predicate - True if the immediate fits in a 8-bit sign extended - // field. int64_t v = (int64_t)N->getSExtValue(); return isInt<8>(v); }]>; - def s6ImmPred : PatLeaf<(i32 imm), [{ - // s6ImmPred predicate - True if the immediate fits in a 6-bit sign extended - // field. int64_t v = (int64_t)N->getSExtValue(); return isInt<6>(v); }]>; - def s4_0ImmPred : PatLeaf<(i32 imm), [{ - // s4_0ImmPred predicate - True if the immediate fits in a 4-bit sign extended - // field. int64_t v = (int64_t)N->getSExtValue(); return isInt<4>(v); }]>; - def s4_1ImmPred : PatLeaf<(i32 imm), [{ - // s4_1ImmPred predicate - True if the immediate fits in a 4-bit sign extended - // field of 2. int64_t v = (int64_t)N->getSExtValue(); return isShiftedInt<4,1>(v); }]>; - def s4_2ImmPred : PatLeaf<(i32 imm), [{ - // s4_2ImmPred predicate - True if the immediate fits in a 4-bit sign extended - // field that is a multiple of 4. int64_t v = (int64_t)N->getSExtValue(); return isShiftedInt<4,2>(v); }]>; - def s4_3ImmPred : PatLeaf<(i32 imm), [{ - // s4_3ImmPred predicate - True if the immediate fits in a 4-bit sign extended - // field that is a multiple of 8. int64_t v = (int64_t)N->getSExtValue(); return isShiftedInt<4,3>(v); }]>; @@ -233,56 +202,61 @@ def u64ImmPred : PatLeaf<(i64 imm), [{ }]>; def u32ImmPred : PatLeaf<(i32 imm), [{ - // u32ImmPred predicate - True if the immediate fits in a 32-bit field. int64_t v = (int64_t)N->getSExtValue(); return isUInt<32>(v); }]>; +def u32_0ImmPred : PatLeaf<(i32 imm), [{ + int64_t v = (int64_t)N->getSExtValue(); + return isUInt<32>(v); +}]>; + +def u31_1ImmPred : PatLeaf<(i32 imm), [{ + int64_t v = (int64_t)N->getSExtValue(); + return isShiftedUInt<31,1>(v); +}]>; + +def u30_2ImmPred : PatLeaf<(i32 imm), [{ + int64_t v = (int64_t)N->getSExtValue(); + return isShiftedUInt<30,2>(v); +}]>; + +def u29_3ImmPred : PatLeaf<(i32 imm), [{ + int64_t v = (int64_t)N->getSExtValue(); + return isShiftedUInt<29,3>(v); +}]>; + def u26_6ImmPred : PatLeaf<(i32 imm), [{ - // u26_6ImmPred - True if the immediate fits in a 32-bit field and - // is a multiple of 64. int64_t v = (int64_t)N->getSExtValue(); return isShiftedUInt<26,6>(v); }]>; def u16ImmPred : PatLeaf<(i32 imm), [{ - // u16ImmPred predicate - True if the immediate fits in a 16-bit unsigned - // field. int64_t v = (int64_t)N->getSExtValue(); return isUInt<16>(v); }]>; def u16_s8ImmPred : PatLeaf<(i32 imm), [{ - // u16_s8ImmPred predicate - True if the immediate fits in a 16-bit sign - // extended s8 field. int64_t v = (int64_t)N->getSExtValue(); return isShiftedUInt<16,8>(v); }]>; def u16_0ImmPred : PatLeaf<(i32 imm), [{ - // True if the immediate fits in a 16-bit unsigned field. int64_t v = (int64_t)N->getSExtValue(); return isUInt<16>(v); }]>; def u11_3ImmPred : PatLeaf<(i32 imm), [{ - // True if the immediate fits in a 14-bit unsigned field, and the lowest - // three bits are 0. int64_t v = (int64_t)N->getSExtValue(); return isShiftedUInt<11,3>(v); }]>; def u9ImmPred : PatLeaf<(i32 imm), [{ - // u9ImmPred predicate - True if the immediate fits in a 9-bit unsigned - // field. 
int64_t v = (int64_t)N->getSExtValue(); return isUInt<9>(v); }]>; - def u8ImmPred : PatLeaf<(i32 imm), [{ - // u8ImmPred predicate - True if the immediate fits in a 8-bit unsigned - // field. int64_t v = (int64_t)N->getSExtValue(); return isUInt<8>(v); }]>; @@ -294,81 +268,56 @@ def u7StrictPosImmPred : ImmLeaf<i32, [{ }]>; def u7ImmPred : PatLeaf<(i32 imm), [{ - // u7ImmPred predicate - True if the immediate fits in a 7-bit unsigned - // field. int64_t v = (int64_t)N->getSExtValue(); return isUInt<7>(v); }]>; - def u6ImmPred : PatLeaf<(i32 imm), [{ - // u6ImmPred predicate - True if the immediate fits in a 6-bit unsigned - // field. int64_t v = (int64_t)N->getSExtValue(); return isUInt<6>(v); }]>; def u6_0ImmPred : PatLeaf<(i32 imm), [{ - // u6_0ImmPred predicate - True if the immediate fits in a 6-bit unsigned - // field. Same as u6ImmPred. int64_t v = (int64_t)N->getSExtValue(); return isUInt<6>(v); }]>; def u6_1ImmPred : PatLeaf<(i32 imm), [{ - // u6_1ImmPred predicate - True if the immediate fits in a 7-bit unsigned - // field that is 1 bit alinged - multiple of 2. int64_t v = (int64_t)N->getSExtValue(); return isShiftedUInt<6,1>(v); }]>; def u6_2ImmPred : PatLeaf<(i32 imm), [{ - // u6_2ImmPred predicate - True if the immediate fits in a 8-bit unsigned - // field that is 2 bits alinged - multiple of 4. int64_t v = (int64_t)N->getSExtValue(); return isShiftedUInt<6,2>(v); }]>; def u6_3ImmPred : PatLeaf<(i32 imm), [{ - // u6_3ImmPred predicate - True if the immediate fits in a 9-bit unsigned - // field that is 3 bits alinged - multiple of 8. int64_t v = (int64_t)N->getSExtValue(); return isShiftedUInt<6,3>(v); }]>; def u5ImmPred : PatLeaf<(i32 imm), [{ - // u5ImmPred predicate - True if the immediate fits in a 5-bit unsigned - // field. int64_t v = (int64_t)N->getSExtValue(); return isUInt<5>(v); }]>; def u4ImmPred : PatLeaf<(i32 imm), [{ - // u4ImmPred predicate - True if the immediate fits in a 4-bit unsigned - // field. int64_t v = (int64_t)N->getSExtValue(); return isUInt<4>(v); }]>; def u3ImmPred : PatLeaf<(i32 imm), [{ - // u3ImmPred predicate - True if the immediate fits in a 3-bit unsigned - // field. int64_t v = (int64_t)N->getSExtValue(); return isUInt<3>(v); }]>; - def u2ImmPred : PatLeaf<(i32 imm), [{ - // u2ImmPred predicate - True if the immediate fits in a 2-bit unsigned - // field. int64_t v = (int64_t)N->getSExtValue(); return isUInt<2>(v); }]>; - def u1ImmPred : PatLeaf<(i1 imm), [{ - // u1ImmPred predicate - True if the immediate fits in a 1-bit unsigned - // field. int64_t v = (int64_t)N->getSExtValue(); return isUInt<1>(v); }]>; @@ -511,212 +460,6 @@ let PrintMethod = "printExtOperand" in { def u6_3Ext : Operand<i32>; } -let PrintMethod = "printImmOperand" in -def u0AlwaysExt : Operand<i32>; - -// Predicates for constant extendable operands -def s16ExtPred : PatLeaf<(i32 imm), [{ - int64_t v = (int64_t)N->getSExtValue(); - if (isInt<16>(v)) - return true; - - // Return true if extending this immediate is profitable and the value - // can fit in a 32-bit signed field. - return isConstExtProfitable(Node) && isInt<32>(v); -}]>; - -def s10ExtPred : PatLeaf<(i32 imm), [{ - int64_t v = (int64_t)N->getSExtValue(); - if (isInt<10>(v)) - return true; - - // Return true if extending this immediate is profitable and the value - // can fit in a 32-bit signed field. 
- return isConstExtProfitable(Node) && isInt<32>(v); -}]>; - -def s9ExtPred : PatLeaf<(i32 imm), [{ - int64_t v = (int64_t)N->getSExtValue(); - if (isInt<9>(v)) - return true; - - // Return true if extending this immediate is profitable and the value - // can fit in a 32-bit unsigned field. - return isConstExtProfitable(Node) && isInt<32>(v); -}]>; - -def s8ExtPred : PatLeaf<(i32 imm), [{ - int64_t v = (int64_t)N->getSExtValue(); - if (isInt<8>(v)) - return true; - - // Return true if extending this immediate is profitable and the value - // can fit in a 32-bit signed field. - return isConstExtProfitable(Node) && isInt<32>(v); -}]>; - -def s8_16ExtPred : PatLeaf<(i32 imm), [{ - int64_t v = (int64_t)N->getSExtValue(); - if (isInt<8>(v)) - return true; - - // Return true if extending this immediate is profitable and the value - // can't fit in a 16-bit signed field. This is required to avoid - // unnecessary constant extenders. - return isConstExtProfitable(Node) && !isInt<16>(v); -}]>; - -def s6ExtPred : PatLeaf<(i32 imm), [{ - int64_t v = (int64_t)N->getSExtValue(); - if (isInt<6>(v)) - return true; - - // Return true if extending this immediate is profitable and the value - // can fit in a 32-bit unsigned field. - return isConstExtProfitable(Node) && isInt<32>(v); -}]>; - -def s6_16ExtPred : PatLeaf<(i32 imm), [{ - int64_t v = (int64_t)N->getSExtValue(); - if (isInt<6>(v)) - return true; - - // Return true if extending this immediate is profitable and the value - // can't fit in a 16-bit signed field. This is required to avoid - // unnecessary constant extenders. - return isConstExtProfitable(Node) && !isInt<16>(v); -}]>; - -def s6_10ExtPred : PatLeaf<(i32 imm), [{ - int64_t v = (int64_t)N->getSExtValue(); - if (isInt<6>(v)) - return true; - - // Return true if extending this immediate is profitable and the value - // can't fit in a 10-bit signed field. This is required to avoid - // unnecessary constant extenders. - return isConstExtProfitable(Node) && !isInt<10>(v); -}]>; - -def s11_0ExtPred : PatLeaf<(i32 imm), [{ - int64_t v = (int64_t)N->getSExtValue(); - if (isInt<11>(v)) - return true; - - // Return true if extending this immediate is profitable and the value - // can fit in a 32-bit signed field. - return isConstExtProfitable(Node) && isInt<32>(v); -}]>; - -def s11_1ExtPred : PatLeaf<(i32 imm), [{ - int64_t v = (int64_t)N->getSExtValue(); - if (isInt<12>(v)) - return isShiftedInt<11,1>(v); - - // Return true if extending this immediate is profitable and the low 1 bit - // is zero (2-byte aligned). - return isConstExtProfitable(Node) && isInt<32>(v) && ((v % 2) == 0); -}]>; - -def s11_2ExtPred : PatLeaf<(i32 imm), [{ - int64_t v = (int64_t)N->getSExtValue(); - if (isInt<13>(v)) - return isShiftedInt<11,2>(v); - - // Return true if extending this immediate is profitable and the low 2-bits - // are zero (4-byte aligned). - return isConstExtProfitable(Node) && isInt<32>(v) && ((v % 4) == 0); -}]>; - -def s11_3ExtPred : PatLeaf<(i32 imm), [{ - int64_t v = (int64_t)N->getSExtValue(); - if (isInt<14>(v)) - return isShiftedInt<11,3>(v); - - // Return true if extending this immediate is profitable and the low 3-bits - // are zero (8-byte aligned). - return isConstExtProfitable(Node) && isInt<32>(v) && ((v % 8) == 0); -}]>; - -def u0AlwaysExtPred : PatLeaf<(i32 imm), [{ - // Predicate for an unsigned 32-bit value that always needs to be extended. 
- if (isConstExtProfitable(Node)) { - int64_t v = (int64_t)N->getSExtValue(); - return isUInt<32>(v); - } - return false; -}]>; - -def u6ExtPred : PatLeaf<(i32 imm), [{ - int64_t v = (int64_t)N->getSExtValue(); - if (isUInt<6>(v)) - return true; - - // Return true if extending this immediate is profitable and the value - // can fit in a 32-bit unsigned field. - return isConstExtProfitable(Node) && isUInt<32>(v); -}]>; - -def u7ExtPred : PatLeaf<(i32 imm), [{ - int64_t v = (int64_t)N->getSExtValue(); - if (isUInt<7>(v)) - return true; - - // Return true if extending this immediate is profitable and the value - // can fit in a 32-bit unsigned field. - return isConstExtProfitable(Node) && isUInt<32>(v); -}]>; - -def u8ExtPred : PatLeaf<(i32 imm), [{ - int64_t v = (int64_t)N->getSExtValue(); - if (isUInt<8>(v)) - return true; - - // Return true if extending this immediate is profitable and the value - // can fit in a 32-bit unsigned field. - return isConstExtProfitable(Node) && isUInt<32>(v); -}]>; - -def u9ExtPred : PatLeaf<(i32 imm), [{ - int64_t v = (int64_t)N->getSExtValue(); - if (isUInt<9>(v)) - return true; - - // Return true if extending this immediate is profitable and the value - // can fit in a 32-bit unsigned field. - return isConstExtProfitable(Node) && isUInt<32>(v); -}]>; - -def u6_1ExtPred : PatLeaf<(i32 imm), [{ - int64_t v = (int64_t)N->getSExtValue(); - if (isUInt<7>(v)) - return isShiftedUInt<6,1>(v); - - // Return true if extending this immediate is profitable and the value - // can fit in a 32-bit unsigned field. - return isConstExtProfitable(Node) && isUInt<32>(v) && ((v % 2) == 0); -}]>; - -def u6_2ExtPred : PatLeaf<(i32 imm), [{ - int64_t v = (int64_t)N->getSExtValue(); - if (isUInt<8>(v)) - return isShiftedUInt<6,2>(v); - - // Return true if extending this immediate is profitable and the value - // can fit in a 32-bit unsigned field. - return isConstExtProfitable(Node) && isUInt<32>(v) && ((v % 4) == 0); -}]>; - -def u6_3ExtPred : PatLeaf<(i32 imm), [{ - int64_t v = (int64_t)N->getSExtValue(); - if (isUInt<9>(v)) - return isShiftedUInt<6,3>(v); - - // Return true if extending this immediate is profitable and the value - // can fit in a 32-bit unsigned field. - return isConstExtProfitable(Node) && isUInt<32>(v) && ((v % 8) == 0); -}]>; - // This complex pattern exists only to create a machine instruction operand // of type "frame index". There doesn't seem to be a way to do that directly @@ -729,41 +472,8 @@ def AddrFI : ComplexPattern<i32, 1, "SelectAddrFI", [frameindex], []>; def AddrGA : ComplexPattern<i32, 1, "SelectAddrGA", [], []>; def AddrGP : ComplexPattern<i32, 1, "SelectAddrGP", [], []>; -// Addressing modes. - -def ADDRrr : ComplexPattern<i32, 2, "SelectADDRrr", [], []>; -def ADDRri : ComplexPattern<i32, 2, "SelectADDRri", [frameindex], []>; -def ADDRriS11_0 : ComplexPattern<i32, 2, "SelectADDRriS11_0", [frameindex], []>; -def ADDRriS11_1 : ComplexPattern<i32, 2, "SelectADDRriS11_1", [frameindex], []>; -def ADDRriS11_2 : ComplexPattern<i32, 2, "SelectADDRriS11_2", [frameindex], []>; -def ADDRriS11_3 : ComplexPattern<i32, 2, "SelectADDRriS11_3", [frameindex], []>; -def ADDRriU6_0 : ComplexPattern<i32, 2, "SelectADDRriU6_0", [frameindex], []>; -def ADDRriU6_1 : ComplexPattern<i32, 2, "SelectADDRriU6_1", [frameindex], []>; -def ADDRriU6_2 : ComplexPattern<i32, 2, "SelectADDRriU6_2", [frameindex], []>; - // Address operands. 
-def MEMrr : Operand<i32> { - let PrintMethod = "printMEMrrOperand"; - let MIOperandInfo = (ops IntRegs, IntRegs); -} - -def MEMri : Operand<i32> { - let PrintMethod = "printMEMriOperand"; - let MIOperandInfo = (ops IntRegs, IntRegs); -} - -def MEMri_s11_2 : Operand<i32>, - ComplexPattern<i32, 2, "SelectMEMriS11_2", []> { - let PrintMethod = "printMEMriOperand"; - let MIOperandInfo = (ops IntRegs, s11Imm); -} - -def FrameIndex : Operand<i32> { - let PrintMethod = "printFrameIndexOperand"; - let MIOperandInfo = (ops IntRegs, s11Imm); -} - let PrintMethod = "printGlobalOperand" in { def globaladdress : Operand<i32>; def globaladdressExt : Operand<i32>; diff --git a/lib/Target/Hexagon/HexagonPeephole.cpp b/lib/Target/Hexagon/HexagonPeephole.cpp index afd3a17..503bfdb 100644 --- a/lib/Target/Hexagon/HexagonPeephole.cpp +++ b/lib/Target/Hexagon/HexagonPeephole.cpp @@ -271,15 +271,8 @@ bool HexagonPeephole::runOnMachineFunction(MachineFunction &MF) { switch (Op) { case Hexagon::C2_mux: case Hexagon::C2_muxii: - case Hexagon::TFR_condset_ii: NewOp = Op; break; - case Hexagon::TFR_condset_ri: - NewOp = Hexagon::TFR_condset_ir; - break; - case Hexagon::TFR_condset_ir: - NewOp = Hexagon::TFR_condset_ri; - break; case Hexagon::C2_muxri: NewOp = Hexagon::C2_muxir; break; diff --git a/lib/Target/Hexagon/HexagonRegisterInfo.cpp b/lib/Target/Hexagon/HexagonRegisterInfo.cpp index 3df98d6..86eaee8 100644 --- a/lib/Target/Hexagon/HexagonRegisterInfo.cpp +++ b/lib/Target/Hexagon/HexagonRegisterInfo.cpp @@ -37,11 +37,8 @@ using namespace llvm; - -HexagonRegisterInfo::HexagonRegisterInfo(HexagonSubtarget &st) - : HexagonGenRegisterInfo(Hexagon::R31), - Subtarget(st) { -} +HexagonRegisterInfo::HexagonRegisterInfo() + : HexagonGenRegisterInfo(Hexagon::R31) {} const MCPhysReg * HexagonRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { @@ -51,7 +48,7 @@ HexagonRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { Hexagon::R24, Hexagon::R25, Hexagon::R26, Hexagon::R27, 0 }; - switch(Subtarget.getHexagonArchVersion()) { + switch (MF->getSubtarget<HexagonSubtarget>().getHexagonArchVersion()) { case HexagonSubtarget::V4: case HexagonSubtarget::V5: return CalleeSavedRegsV3; @@ -89,7 +86,7 @@ HexagonRegisterInfo::getCalleeSavedRegClasses(const MachineFunction *MF) const { &Hexagon::IntRegsRegClass, &Hexagon::IntRegsRegClass, }; - switch(Subtarget.getHexagonArchVersion()) { + switch (MF->getSubtarget<HexagonSubtarget>().getHexagonArchVersion()) { case HexagonSubtarget::V4: case HexagonSubtarget::V5: return CalleeSavedRegClassesV3; @@ -122,7 +119,9 @@ void HexagonRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, Offset -= 2 * Hexagon_WordSize; } - const unsigned FrameSize = MFI.getStackSize(); + unsigned FrameSize = MFI.getStackSize(); + if (MI.getOpcode() == Hexagon::TFR_FI) + MI.setDesc(TII.get(Hexagon::A2_addi)); if (!MFI.hasVarSizedObjects() && TII.isValidOffset(MI.getOpcode(), (FrameSize+Offset)) && diff --git a/lib/Target/Hexagon/HexagonRegisterInfo.h b/lib/Target/Hexagon/HexagonRegisterInfo.h index a83b502..dc6dd2a 100644 --- a/lib/Target/Hexagon/HexagonRegisterInfo.h +++ b/lib/Target/Hexagon/HexagonRegisterInfo.h @@ -37,19 +37,11 @@ #define HEXAGON_RESERVED_REG_2 Hexagon::R11 namespace llvm { - -class HexagonSubtarget; -class HexagonInstrInfo; -class Type; - struct HexagonRegisterInfo : public HexagonGenRegisterInfo { - HexagonSubtarget &Subtarget; - - HexagonRegisterInfo(HexagonSubtarget &st); + HexagonRegisterInfo(); /// Code Generation virtual methods... 
- const MCPhysReg * - getCalleeSavedRegs(const MachineFunction *MF = nullptr) const override; + const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF) const override; const TargetRegisterClass* const* getCalleeSavedRegClasses(const MachineFunction *MF = nullptr) const; diff --git a/lib/Target/Hexagon/HexagonSplitConst32AndConst64.cpp b/lib/Target/Hexagon/HexagonSplitConst32AndConst64.cpp index ce6a39a..1a4c7ae 100644 --- a/lib/Target/Hexagon/HexagonSplitConst32AndConst64.cpp +++ b/lib/Target/Hexagon/HexagonSplitConst32AndConst64.cpp @@ -71,6 +71,7 @@ bool HexagonSplitConst32AndConst64::runOnMachineFunction(MachineFunction &Fn) { return true; const TargetInstrInfo *TII = Fn.getSubtarget().getInstrInfo(); + const TargetRegisterInfo *TRI = Fn.getSubtarget().getRegisterInfo(); // Loop over all of the basic blocks for (MachineFunction::iterator MBBb = Fn.begin(), MBBe = Fn.end(); @@ -82,82 +83,78 @@ bool HexagonSplitConst32AndConst64::runOnMachineFunction(MachineFunction &Fn) { while (MII != MIE) { MachineInstr *MI = MII; int Opc = MI->getOpcode(); - if (Opc == Hexagon::CONST32_set) { + if (Opc == Hexagon::CONST32_set_jt) { int DestReg = MI->getOperand(0).getReg(); MachineOperand &Symbol = MI->getOperand (1); - - BuildMI (*MBB, MII, MI->getDebugLoc(), - TII->get(Hexagon::LO), DestReg).addOperand(Symbol); BuildMI (*MBB, MII, MI->getDebugLoc(), - TII->get(Hexagon::HI), DestReg).addOperand(Symbol); - // MBB->erase returns the iterator to the next instruction, which is the - // one we want to process next - MII = MBB->erase (MI); - continue; - } - else if (Opc == Hexagon::CONST32_set_jt) { - int DestReg = MI->getOperand(0).getReg(); - MachineOperand &Symbol = MI->getOperand (1); + TII->get(Hexagon::A2_tfrsi), DestReg).addOperand(Symbol); - BuildMI (*MBB, MII, MI->getDebugLoc(), - TII->get(Hexagon::LO_jt), DestReg).addOperand(Symbol); - BuildMI (*MBB, MII, MI->getDebugLoc(), - TII->get(Hexagon::HI_jt), DestReg).addOperand(Symbol); // MBB->erase returns the iterator to the next instruction, which is the // one we want to process next MII = MBB->erase (MI); continue; } - else if (Opc == Hexagon::CONST32_Label) { + else if (Opc == Hexagon::CONST32_Int_Real && + MI->getOperand(1).isBlockAddress()) { int DestReg = MI->getOperand(0).getReg(); MachineOperand &Symbol = MI->getOperand (1); BuildMI (*MBB, MII, MI->getDebugLoc(), - TII->get(Hexagon::LO_PIC), DestReg).addOperand(Symbol); + TII->get(Hexagon::LO), DestReg).addOperand(Symbol); BuildMI (*MBB, MII, MI->getDebugLoc(), - TII->get(Hexagon::HI_PIC), DestReg).addOperand(Symbol); + TII->get(Hexagon::HI), DestReg).addOperand(Symbol); // MBB->erase returns the iterator to the next instruction, which is the // one we want to process next MII = MBB->erase (MI); continue; } - else if (Opc == Hexagon::CONST32_Int_Real) { + + else if (Opc == Hexagon::CONST32_Int_Real || + Opc == Hexagon::CONST32_Float_Real) { int DestReg = MI->getOperand(0).getReg(); - int64_t ImmValue = MI->getOperand(1).getImm (); - BuildMI (*MBB, MII, MI->getDebugLoc(), - TII->get(Hexagon::LOi), DestReg).addImm(ImmValue); - BuildMI (*MBB, MII, MI->getDebugLoc(), - TII->get(Hexagon::HIi), DestReg).addImm(ImmValue); + // We have to convert an FP immediate into its corresponding integer + // representation + int64_t ImmValue; + if (Opc == Hexagon::CONST32_Float_Real) { + APFloat Val = MI->getOperand(1).getFPImm()->getValueAPF(); + ImmValue = *Val.bitcastToAPInt().getRawData(); + } + else + ImmValue = MI->getOperand(1).getImm(); + + BuildMI(*MBB, MII, MI->getDebugLoc(), + 
TII->get(Hexagon::A2_tfrsi), DestReg).addImm(ImmValue); MII = MBB->erase (MI); continue; } - else if (Opc == Hexagon::CONST64_Int_Real) { + else if (Opc == Hexagon::CONST64_Int_Real || + Opc == Hexagon::CONST64_Float_Real) { int DestReg = MI->getOperand(0).getReg(); - int64_t ImmValue = MI->getOperand(1).getImm (); - unsigned DestLo = Fn.getSubtarget().getRegisterInfo()->getSubReg( - DestReg, Hexagon::subreg_loreg); - unsigned DestHi = Fn.getSubtarget().getRegisterInfo()->getSubReg( - DestReg, Hexagon::subreg_hireg); + + // We have to convert an FP immediate into its corresponding integer + // representation + int64_t ImmValue; + if (Opc == Hexagon::CONST64_Float_Real) { + APFloat Val = MI->getOperand(1).getFPImm()->getValueAPF(); + ImmValue = *Val.bitcastToAPInt().getRawData(); + } + else + ImmValue = MI->getOperand(1).getImm(); + + unsigned DestLo = TRI->getSubReg(DestReg, Hexagon::subreg_loreg); + unsigned DestHi = TRI->getSubReg(DestReg, Hexagon::subreg_hireg); int32_t LowWord = (ImmValue & 0xFFFFFFFF); int32_t HighWord = (ImmValue >> 32) & 0xFFFFFFFF; - // Lower Registers Lower Half - BuildMI (*MBB, MII, MI->getDebugLoc(), - TII->get(Hexagon::LOi), DestLo).addImm(LowWord); - // Lower Registers Higher Half + BuildMI(*MBB, MII, MI->getDebugLoc(), + TII->get(Hexagon::A2_tfrsi), DestLo).addImm(LowWord); BuildMI (*MBB, MII, MI->getDebugLoc(), - TII->get(Hexagon::HIi), DestLo).addImm(LowWord); - // Higher Registers Lower Half - BuildMI (*MBB, MII, MI->getDebugLoc(), - TII->get(Hexagon::LOi), DestHi).addImm(HighWord); - // Higher Registers Higher Half. - BuildMI (*MBB, MII, MI->getDebugLoc(), - TII->get(Hexagon::HIi), DestHi).addImm(HighWord); + TII->get(Hexagon::A2_tfrsi), DestHi).addImm(HighWord); MII = MBB->erase (MI); continue; - } + } ++MII; } } diff --git a/lib/Target/Hexagon/HexagonSplitTFRCondSets.cpp b/lib/Target/Hexagon/HexagonSplitTFRCondSets.cpp deleted file mode 100644 index 8873bb9..0000000 --- a/lib/Target/Hexagon/HexagonSplitTFRCondSets.cpp +++ /dev/null @@ -1,172 +0,0 @@ -//===-- HexagonSplitTFRCondSets.cpp - split TFR condsets into xfers -------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -// -//===----------------------------------------------------------------------===// -// This pass tries to provide opportunities for better optimization of muxes. -// The default code generated for something like: flag = (a == b) ? 1 : 3; -// would be: -// -// {p0 = cmp.eq(r0,r1)} -// {r3 = mux(p0,#1,#3)} -// -// This requires two packets. If we use .new predicated immediate transfers, -// then we can do this in a single packet, e.g.: -// -// {p0 = cmp.eq(r0,r1) -// if (p0.new) r3 = #1 -// if (!p0.new) r3 = #3} -// -// Note that the conditional assignments are not generated in .new form here. -// We assume opptimisically that they will be formed later. 
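The deleted pass expanded the pseudo dest = p ? a : b into a pair of predicated transfers, one per predicate polarity; it can be dropped because the TFR_condset opcodes themselves are gone, as the HexagonPeephole hunk above suggests. A standalone sketch of the expansion's semantics for the immediate/immediate form:

    #include <cassert>

    // TFR_condset_ii semantics: two predicated moves replace one select.
    int tfrCondsetII(bool P, int Imm1, int Imm2) {
      int Dest = 0;
      if (P)  Dest = Imm1; // C2_cmoveit: transfer if predicate is true
      if (!P) Dest = Imm2; // C2_cmoveif: transfer if predicate is false
      return Dest;
    }

    int main() {
      assert(tfrCondsetII(true, 1, 3) == 1);
      assert(tfrCondsetII(false, 1, 3) == 3);
    }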
-// -//===----------------------------------------------------------------------===// - -#include "Hexagon.h" -#include "HexagonMachineFunctionInfo.h" -#include "HexagonSubtarget.h" -#include "HexagonTargetMachine.h" -#include "llvm/CodeGen/LatencyPriorityQueue.h" -#include "llvm/CodeGen/MachineDominators.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineLoopInfo.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/Passes.h" -#include "llvm/CodeGen/ScheduleHazardRecognizer.h" -#include "llvm/CodeGen/SchedulerRegistry.h" -#include "llvm/Support/Compiler.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/MathExtras.h" -#include "llvm/Target/TargetInstrInfo.h" -#include "llvm/Target/TargetMachine.h" -#include "llvm/Target/TargetRegisterInfo.h" - -using namespace llvm; - -#define DEBUG_TYPE "xfer" - -namespace llvm { - void initializeHexagonSplitTFRCondSetsPass(PassRegistry&); -} - - -namespace { - -class HexagonSplitTFRCondSets : public MachineFunctionPass { - public: - static char ID; - HexagonSplitTFRCondSets() : MachineFunctionPass(ID) { - initializeHexagonSplitTFRCondSetsPass(*PassRegistry::getPassRegistry()); - } - - const char *getPassName() const override { - return "Hexagon Split TFRCondSets"; - } - bool runOnMachineFunction(MachineFunction &Fn) override; -}; - - -char HexagonSplitTFRCondSets::ID = 0; - - -bool HexagonSplitTFRCondSets::runOnMachineFunction(MachineFunction &Fn) { - - const TargetInstrInfo *TII = Fn.getSubtarget().getInstrInfo(); - - // Loop over all of the basic blocks. - for (MachineFunction::iterator MBBb = Fn.begin(), MBBe = Fn.end(); - MBBb != MBBe; ++MBBb) { - MachineBasicBlock* MBB = MBBb; - // Traverse the basic block. - for (MachineBasicBlock::iterator MII = MBB->begin(); MII != MBB->end(); - ++MII) { - MachineInstr *MI = MII; - switch(MI->getOpcode()) { - case Hexagon::TFR_condset_ri: { - int DestReg = MI->getOperand(0).getReg(); - int SrcReg1 = MI->getOperand(2).getReg(); - - // Do not emit the predicated copy if the source and the destination - // is the same register. - if (DestReg != SrcReg1) { - BuildMI(*MBB, MII, MI->getDebugLoc(), - TII->get(Hexagon::A2_tfrt), DestReg). - addReg(MI->getOperand(1).getReg()).addReg(SrcReg1); - } - BuildMI(*MBB, MII, MI->getDebugLoc(), - TII->get(Hexagon::C2_cmoveif), DestReg). - addReg(MI->getOperand(1).getReg()). - addImm(MI->getOperand(3).getImm()); - - MII = MBB->erase(MI); - --MII; - break; - } - case Hexagon::TFR_condset_ir: { - int DestReg = MI->getOperand(0).getReg(); - int SrcReg2 = MI->getOperand(3).getReg(); - - BuildMI(*MBB, MII, MI->getDebugLoc(), - TII->get(Hexagon::C2_cmoveit), DestReg). - addReg(MI->getOperand(1).getReg()). - addImm(MI->getOperand(2).getImm()); - - // Do not emit the predicated copy if the source and - // the destination is the same register. - if (DestReg != SrcReg2) { - BuildMI(*MBB, MII, MI->getDebugLoc(), - TII->get(Hexagon::A2_tfrf), DestReg). 
- addReg(MI->getOperand(1).getReg()).addReg(SrcReg2); - } - MII = MBB->erase(MI); - --MII; - break; - } - case Hexagon::TFR_condset_ii: { - int DestReg = MI->getOperand(0).getReg(); - int SrcReg1 = MI->getOperand(1).getReg(); - - int Immed1 = MI->getOperand(2).getImm(); - int Immed2 = MI->getOperand(3).getImm(); - BuildMI(*MBB, MII, MI->getDebugLoc(), - TII->get(Hexagon::C2_cmoveit), - DestReg).addReg(SrcReg1).addImm(Immed1); - BuildMI(*MBB, MII, MI->getDebugLoc(), - TII->get(Hexagon::C2_cmoveif), - DestReg).addReg(SrcReg1).addImm(Immed2); - MII = MBB->erase(MI); - --MII; - break; - } - } - } - } - return true; -} - -} - -//===----------------------------------------------------------------------===// -// Public Constructor Functions -//===----------------------------------------------------------------------===// - -static void initializePassOnce(PassRegistry &Registry) { - const char *Name = "Hexagon Split TFRCondSets"; - PassInfo *PI = new PassInfo(Name, "hexagon-split-tfr", - &HexagonSplitTFRCondSets::ID, nullptr, false, - false); - Registry.registerPass(*PI, true); -} - -void llvm::initializeHexagonSplitTFRCondSetsPass(PassRegistry &Registry) { - CALL_ONCE_INITIALIZATION(initializePassOnce) -} - -FunctionPass *llvm::createHexagonSplitTFRCondSets() { - return new HexagonSplitTFRCondSets(); -} diff --git a/lib/Target/Hexagon/HexagonSubtarget.cpp b/lib/Target/Hexagon/HexagonSubtarget.cpp index 380f023..1717ae3 100644 --- a/lib/Target/Hexagon/HexagonSubtarget.cpp +++ b/lib/Target/Hexagon/HexagonSubtarget.cpp @@ -48,6 +48,10 @@ EnableIEEERndNear( cl::Hidden, cl::ZeroOrMore, cl::init(false), cl::desc("Generate non-chopped conversion from fp to int.")); +static cl::opt<bool> DisableHexagonMISched("disable-hexagon-misched", + cl::Hidden, cl::ZeroOrMore, cl::init(false), + cl::desc("Disable Hexagon MI Scheduling")); + HexagonSubtarget & HexagonSubtarget::initializeSubtargetDependencies(StringRef CPU, StringRef FS) { // If the programmer has not specified a Hexagon version, default to -mv4. @@ -91,3 +95,9 @@ HexagonSubtarget::HexagonSubtarget(StringRef TT, StringRef CPU, StringRef FS, // Pin the vtable to this file. void HexagonSubtarget::anchor() {} + +bool HexagonSubtarget::enableMachineScheduler() const { + if (DisableHexagonMISched.getNumOccurrences()) + return !DisableHexagonMISched; + return true; +} diff --git a/lib/Target/Hexagon/HexagonSubtarget.h b/lib/Target/Hexagon/HexagonSubtarget.h index 57de546..780567b 100644 --- a/lib/Target/Hexagon/HexagonSubtarget.h +++ b/lib/Target/Hexagon/HexagonSubtarget.h @@ -85,6 +85,11 @@ public: bool hasV5TOps() const { return getHexagonArchVersion() >= V5; } bool hasV5TOpsOnly() const { return getHexagonArchVersion() == V5; } bool modeIEEERndNear() const { return ModeIEEERndNear; } + bool enableMachineScheduler() const override; + // Always use the TargetLowering default scheduler. + // FIXME: This will use the vliw scheduler which is probably just hurting + // compiler time and will be removed eventually anyway. 
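The new enableMachineScheduler() encodes a three-way policy: on by default, but an explicit -disable-hexagon-misched (in either polarity) always wins. getNumOccurrences() is what distinguishes "left at the default" from "user set it to false", which a plain bool cannot do. A standalone sketch of the idea, with a hand-rolled option type standing in for llvm::cl::opt:

    #include <cassert>

    struct Opt {
      bool Value = false;
      int Occurrences = 0; // how many times the flag appeared on the line
      void set(bool V) { Value = V; ++Occurrences; }
    };

    bool enableMachineScheduler(const Opt &DisableMISched) {
      if (DisableMISched.Occurrences) // the user said something: obey it
        return !DisableMISched.Value;
      return true;                    // otherwise default to enabled
    }

    int main() {
      Opt O;
      assert(enableMachineScheduler(O));  // default: enabled
      O.set(true);
      assert(!enableMachineScheduler(O)); // -disable-hexagon-misched
      O.set(false);
      assert(enableMachineScheduler(O));  // ...-misched=false re-enables
    }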
+ bool enableMachineSchedDefaultSched() const override { return false; } const std::string &getCPUString () const { return CPUString; } diff --git a/lib/Target/Hexagon/HexagonTargetMachine.cpp b/lib/Target/Hexagon/HexagonTargetMachine.cpp index 64f75a3..48b0bc8 100644 --- a/lib/Target/Hexagon/HexagonTargetMachine.cpp +++ b/lib/Target/Hexagon/HexagonTargetMachine.cpp @@ -29,10 +29,6 @@ using namespace llvm; static cl:: opt<bool> DisableHardwareLoops("disable-hexagon-hwloops", cl::Hidden, cl::desc("Disable Hardware Loops for Hexagon target")); -static cl::opt<bool> DisableHexagonMISched("disable-hexagon-misched", - cl::Hidden, cl::ZeroOrMore, cl::init(false), - cl::desc("Disable Hexagon MI Scheduling")); - static cl::opt<bool> DisableHexagonCFGOpt("disable-hexagon-cfgopt", cl::Hidden, cl::ZeroOrMore, cl::init(false), cl::desc("Disable Hexagon CFG Optimization")); @@ -69,9 +65,10 @@ HexagonTargetMachine::HexagonTargetMachine(const Target &T, StringRef TT, const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL) - : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL), + : LLVMTargetMachine(T, "e-m:e-p:32:32-i1:32-i64:64-a:0-n32", TT, CPU, FS, + Options, RM, CM, OL), TLOF(make_unique<HexagonTargetObjectFile>()), - DL("e-m:e-p:32:32-i1:32-i64:64-a:0-n32"), Subtarget(TT, CPU, FS, *this) { + Subtarget(TT, CPU, FS, *this) { initAsmInfo(); } @@ -82,16 +79,7 @@ namespace { class HexagonPassConfig : public TargetPassConfig { public: HexagonPassConfig(HexagonTargetMachine *TM, PassManagerBase &PM) - : TargetPassConfig(TM, PM) { - // FIXME: Rather than calling enablePass(&MachineSchedulerID) below, define - // HexagonSubtarget::enableMachineScheduler() { return true; }. - // That will bypass the SelectionDAG VLIW scheduler, which is probably just - // hurting compile time and will be removed eventually anyway. - if (DisableHexagonMISched) - disablePass(&MachineSchedulerID); - else - enablePass(&MachineSchedulerID); - } + : TargetPassConfig(TM, PM) {} HexagonTargetMachine &getHexagonTargetMachine() const { return getTM<HexagonTargetMachine>(); @@ -159,9 +147,6 @@ void HexagonPassConfig::addPreEmitPass() { // Expand Spill code for predicate registers. addPass(createHexagonExpandPredSpillCode(), false); - // Split up TFRcondsets into conditional transfers. - addPass(createHexagonSplitTFRCondSets(), false); - // Create Packets. if (!NoOpt) { if (!DisableHardwareLoops) diff --git a/lib/Target/Hexagon/HexagonTargetMachine.h b/lib/Target/Hexagon/HexagonTargetMachine.h index e0b3a9b..5774f7e 100644 --- a/lib/Target/Hexagon/HexagonTargetMachine.h +++ b/lib/Target/Hexagon/HexagonTargetMachine.h @@ -24,7 +24,6 @@ class Module; class HexagonTargetMachine : public LLVMTargetMachine { std::unique_ptr<TargetLoweringObjectFile> TLOF; - const DataLayout DL; // Calculates type size & alignment. 
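The DataLayout member disappears below because the string now travels through the LLVMTargetMachine constructor, as the HexagonTargetMachine.cpp hunk above shows. For reference, the Hexagon string "e-m:e-p:32:32-i1:32-i64:64-a:0-n32" decodes roughly as follows under the LLVM datalayout grammar:

    e         little-endian
    m:e       ELF-style name mangling
    p:32:32   32-bit pointers with 32-bit alignment
    i1:32     i1 stored with 32-bit alignment
    i64:64    i64 aligned to 64 bits
    a:0       aggregates require no fixed ABI alignment
    n32       native integer width is 32 bits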
HexagonSubtarget Subtarget; public: @@ -33,8 +32,7 @@ public: Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL); ~HexagonTargetMachine() override; - const DataLayout *getDataLayout() const override { return &DL; } - const HexagonSubtarget *getSubtargetImpl() const override { + const HexagonSubtarget *getSubtargetImpl(const Function &) const override { return &Subtarget; } static unsigned getModuleMatchQuality(const Module &M); diff --git a/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp b/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp index c123640..4ca628e 100644 --- a/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp +++ b/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp @@ -389,7 +389,9 @@ static bool IsLoopN(MachineInstr *MI) { /// callee-saved register. static bool DoesModifyCalleeSavedReg(MachineInstr *MI, const TargetRegisterInfo *TRI) { - for (const MCPhysReg *CSR = TRI->getCalleeSavedRegs(); *CSR; ++CSR) { + for (const MCPhysReg *CSR = + TRI->getCalleeSavedRegs(MI->getParent()->getParent()); + *CSR; ++CSR) { unsigned CalleeSavedReg = *CSR; if (MI->modifiesRegister(CalleeSavedReg, TRI)) return true; @@ -401,10 +403,7 @@ static bool DoesModifyCalleeSavedReg(MachineInstr *MI, // or new-value store. bool HexagonPacketizerList::isNewifiable(MachineInstr* MI) { const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII; - if ( isCondInst(MI) || QII->mayBeNewStore(MI)) - return true; - else - return false; + return isCondInst(MI) || QII->mayBeNewStore(MI); } bool HexagonPacketizerList::isCondInst (MachineInstr* MI) { diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonELFObjectWriter.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonELFObjectWriter.cpp index 56c9dc7..4a3ac8c 100644 --- a/lib/Target/Hexagon/MCTargetDesc/HexagonELFObjectWriter.cpp +++ b/lib/Target/Hexagon/MCTargetDesc/HexagonELFObjectWriter.cpp @@ -11,6 +11,7 @@ #include "llvm/MC/MCAssembler.h" #include "llvm/MC/MCELFObjectWriter.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" #define DEBUG_TYPE "hexagon-elf-writer" diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.cpp index a5a09ba..eac7d6d 100644 --- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.cpp +++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.cpp @@ -49,9 +49,8 @@ void emitLittleEndian(uint64_t Binary, raw_ostream &OS) { } HexagonMCCodeEmitter::HexagonMCCodeEmitter(MCInstrInfo const &aMII, - MCSubtargetInfo const &aMST, MCContext &aMCT) - : MST(aMST), MCT(aMCT), MCII (aMII) {} + : MCT(aMCT), MCII(aMII) {} void HexagonMCCodeEmitter::EncodeInstruction(MCInst const &MI, raw_ostream &OS, SmallVectorImpl<MCFixup> &Fixups, @@ -75,15 +74,10 @@ HexagonMCCodeEmitter::getMachineOpValue(MCInst const &MI, MCOperand const &MO, llvm_unreachable("Only Immediates and Registers implemented right now"); } -MCSubtargetInfo const &HexagonMCCodeEmitter::getSubtargetInfo() const { - return MST; -} - MCCodeEmitter *llvm::createHexagonMCCodeEmitter(MCInstrInfo const &MII, MCRegisterInfo const &MRI, - MCSubtargetInfo const &MST, MCContext &MCT) { - return new HexagonMCCodeEmitter(MII, MST, MCT); + return new HexagonMCCodeEmitter(MII, MCT); } #include "HexagonGenMCCodeEmitter.inc" diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.h b/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.h index db1d707..768c10e 100644 --- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.h +++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.h @@ -26,13 +26,11 @@ 
namespace llvm { class HexagonMCCodeEmitter : public MCCodeEmitter { - MCSubtargetInfo const &MST; MCContext &MCT; MCInstrInfo const &MCII; public: - HexagonMCCodeEmitter(MCInstrInfo const &aMII, MCSubtargetInfo const &aMST, - MCContext &aMCT); + HexagonMCCodeEmitter(MCInstrInfo const &aMII, MCContext &aMCT); MCSubtargetInfo const &getSubtargetInfo() const; diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp index 09a305b..c63bf32 100644 --- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp +++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp @@ -47,15 +47,6 @@ static MCRegisterInfo *createHexagonMCRegisterInfo(StringRef TT) { return X; } -static MCStreamer * -createHexagonELFStreamer(MCContext &Context, MCAsmBackend &MAB, - raw_ostream &OS, MCCodeEmitter *CE, - bool RelaxAll) { - MCELFStreamer *ES = new MCELFStreamer(Context, MAB, OS, CE); - return ES; -} - - static MCSubtargetInfo * createHexagonMCSubtargetInfo(StringRef TT, StringRef CPU, StringRef FS) { MCSubtargetInfo *X = new MCSubtargetInfo(); @@ -75,16 +66,6 @@ static MCAsmInfo *createHexagonMCAsmInfo(const MCRegisterInfo &MRI, return MAI; } -static MCStreamer *createMCStreamer(Target const &T, StringRef TT, - MCContext &Context, MCAsmBackend &MAB, - raw_ostream &OS, MCCodeEmitter *Emitter, - MCSubtargetInfo const &STI, bool RelaxAll) { - MCStreamer *ES = createHexagonELFStreamer(Context, MAB, OS, Emitter, RelaxAll); - new MCTargetStreamer(*ES); - return ES; -} - - static MCCodeGenInfo *createHexagonMCCodeGenInfo(StringRef TT, Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL) { @@ -135,7 +116,4 @@ extern "C" void LLVMInitializeHexagonTargetMC() { // Register the asm backend TargetRegistry::RegisterMCAsmBackend(TheHexagonTarget, createHexagonAsmBackend); - - // Register the obj streamer - TargetRegistry::RegisterMCObjectStreamer(TheHexagonTarget, createMCStreamer); } diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.h b/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.h index f074b65..17072d9 100644 --- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.h +++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.h @@ -34,7 +34,6 @@ MCInstrInfo *createHexagonMCInstrInfo(); MCCodeEmitter *createHexagonMCCodeEmitter(MCInstrInfo const &MCII, MCRegisterInfo const &MRI, - MCSubtargetInfo const &MST, MCContext &MCT); MCAsmBackend *createHexagonAsmBackend(Target const &T, diff --git a/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.h b/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.h index 586f5d9..241f1d6 100644 --- a/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.h +++ b/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.h @@ -14,6 +14,8 @@ #ifndef LLVM_LIB_TARGET_MSP430_MCTARGETDESC_MSP430MCTARGETDESC_H #define LLVM_LIB_TARGET_MSP430_MCTARGETDESC_MSP430MCTARGETDESC_H +#include "llvm/Support/DataTypes.h" + namespace llvm { class Target; diff --git a/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp b/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp index 2f70cde..591ceb5 100644 --- a/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp +++ b/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp @@ -104,7 +104,7 @@ namespace { bool MatchWrapper(SDValue N, MSP430ISelAddressMode &AM); bool MatchAddressBase(SDValue N, MSP430ISelAddressMode &AM); - bool SelectInlineAsmMemoryOperand(const SDValue &Op, char ConstraintCode, + bool SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintID, std::vector<SDValue> &OutOps) override; // Include the 
pieces autogenerated from the target description. @@ -280,12 +280,12 @@ bool MSP430DAGToDAGISel::SelectAddr(SDValue N, } bool MSP430DAGToDAGISel:: -SelectInlineAsmMemoryOperand(const SDValue &Op, char ConstraintCode, +SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintID, std::vector<SDValue> &OutOps) { SDValue Op0, Op1; - switch (ConstraintCode) { + switch (ConstraintID) { default: return true; - case 'm': // memory + case InlineAsm::Constraint_m: // memory if (!SelectAddr(Op, Op0, Op1)) return true; break; diff --git a/lib/Target/MSP430/MSP430ISelLowering.h b/lib/Target/MSP430/MSP430ISelLowering.h index 9266c3b..68868b6 100644 --- a/lib/Target/MSP430/MSP430ISelLowering.h +++ b/lib/Target/MSP430/MSP430ISelLowering.h @@ -102,6 +102,12 @@ namespace llvm { const std::string &Constraint, MVT VT) const override; + unsigned getInlineAsmMemConstraint( + const std::string &ConstraintCode) const override { + // FIXME: Map different constraints differently. + return InlineAsm::Constraint_m; + } + /// isTruncateFree - Return true if it's free to truncate a value of type /// Ty1 to type Ty2. e.g. On msp430 it's free to truncate a i16 value in /// register R15W to i8 by referencing its sub-register R15B. diff --git a/lib/Target/MSP430/MSP430RegisterInfo.h b/lib/Target/MSP430/MSP430RegisterInfo.h index 3f88a69..0cfa4a4 100644 --- a/lib/Target/MSP430/MSP430RegisterInfo.h +++ b/lib/Target/MSP430/MSP430RegisterInfo.h @@ -26,8 +26,7 @@ public: MSP430RegisterInfo(); /// Code Generation virtual methods... - const MCPhysReg * - getCalleeSavedRegs(const MachineFunction *MF = nullptr) const override; + const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF) const override; BitVector getReservedRegs(const MachineFunction &MF) const override; const TargetRegisterClass* diff --git a/lib/Target/MSP430/MSP430Subtarget.cpp b/lib/Target/MSP430/MSP430Subtarget.cpp index 7468519..3dda3bf 100644 --- a/lib/Target/MSP430/MSP430Subtarget.cpp +++ b/lib/Target/MSP430/MSP430Subtarget.cpp @@ -25,7 +25,8 @@ using namespace llvm; void MSP430Subtarget::anchor() { } -MSP430Subtarget &MSP430Subtarget::initializeSubtargetDependencies(StringRef CPU, StringRef FS) { +MSP430Subtarget & +MSP430Subtarget::initializeSubtargetDependencies(StringRef CPU, StringRef FS) { ParseSubtargetFeatures("generic", FS); return *this; } diff --git a/lib/Target/MSP430/MSP430TargetMachine.cpp b/lib/Target/MSP430/MSP430TargetMachine.cpp index 348e672..d6cc4ae 100644 --- a/lib/Target/MSP430/MSP430TargetMachine.cpp +++ b/lib/Target/MSP430/MSP430TargetMachine.cpp @@ -30,10 +30,11 @@ MSP430TargetMachine::MSP430TargetMachine(const Target &T, StringRef TT, const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL) - : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL), + : LLVMTargetMachine(T, "e-m:e-p:16:16-i32:16:32-a:16-n8:16", TT, CPU, FS, + Options, RM, CM, OL), TLOF(make_unique<TargetLoweringObjectFileELF>()), // FIXME: Check DataLayout string. 
- DL("e-m:e-p:16:16-i32:16:32-a:16-n8:16"), Subtarget(TT, CPU, FS, *this) { + Subtarget(TT, CPU, FS, *this) { initAsmInfo(); } diff --git a/lib/Target/MSP430/MSP430TargetMachine.h b/lib/Target/MSP430/MSP430TargetMachine.h index c6a6a70..6ccd30d 100644 --- a/lib/Target/MSP430/MSP430TargetMachine.h +++ b/lib/Target/MSP430/MSP430TargetMachine.h @@ -25,7 +25,6 @@ namespace llvm { /// class MSP430TargetMachine : public LLVMTargetMachine { std::unique_ptr<TargetLoweringObjectFile> TLOF; - const DataLayout DL; // Calculates type size & alignment MSP430Subtarget Subtarget; public: @@ -35,8 +34,7 @@ public: CodeGenOpt::Level OL); ~MSP430TargetMachine() override; - const DataLayout *getDataLayout() const override { return &DL; } - const MSP430Subtarget *getSubtargetImpl() const override { + const MSP430Subtarget *getSubtargetImpl(const Function &F) const override { return &Subtarget; } TargetPassConfig *createPassConfig(PassManagerBase &PM) override; diff --git a/lib/Target/Mips/AsmParser/MipsAsmParser.cpp b/lib/Target/Mips/AsmParser/MipsAsmParser.cpp index 1040bf7..6401bc1 100644 --- a/lib/Target/Mips/AsmParser/MipsAsmParser.cpp +++ b/lib/Target/Mips/AsmParser/MipsAsmParser.cpp @@ -29,6 +29,7 @@ #include "llvm/Support/MathExtras.h" #include "llvm/Support/SourceMgr.h" #include "llvm/Support/TargetRegistry.h" +#include "llvm/Support/raw_ostream.h" #include <memory> using namespace llvm; @@ -53,7 +54,13 @@ public: } unsigned getATRegNum() const { return ATReg; } - bool setATReg(unsigned Reg); + bool setATReg(unsigned Reg) { + if (Reg > 31) + return false; + + ATReg = Reg; + return true; + } bool isReorder() const { return Reorder; } void setReorder() { Reorder = true; } @@ -193,6 +200,9 @@ class MipsAsmParser : public MCTargetAsmParser { bool expandLoadStoreMultiple(MCInst &Inst, SMLoc IDLoc, SmallVectorImpl<MCInst> &Instructions); + void createNop(bool hasShortDelaySlot, SMLoc IDLoc, + SmallVectorImpl<MCInst> &Instructions); + bool reportParseError(Twine ErrorMsg); bool reportParseError(SMLoc Loc, Twine ErrorMsg); @@ -236,6 +246,8 @@ class MipsAsmParser : public MCTargetAsmParser { bool parseFpABIValue(MipsABIFlagsSection::FpABIKind &FpABI, StringRef Directive); + bool parseInternalDirectiveReallowModule(); + MCSymbolRefExpr::VariantKind getVariantKind(StringRef Symbol); bool eatComma(StringRef ErrorStr); @@ -1365,22 +1377,11 @@ bool MipsAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc, } } + // If this instruction has a delay slot and .set reorder is active, + // emit a NOP after it. if (MCID.hasDelaySlot() && AssemblerOptions.back()->isReorder()) { - // If this instruction has a delay slot and .set reorder is active, - // emit a NOP after it. 
Instructions.push_back(Inst); - MCInst NopInst; - if (hasShortDelaySlot(Inst.getOpcode())) { - NopInst.setOpcode(Mips::MOVE16_MM); - NopInst.addOperand(MCOperand::CreateReg(Mips::ZERO)); - NopInst.addOperand(MCOperand::CreateReg(Mips::ZERO)); - } else { - NopInst.setOpcode(Mips::SLL); - NopInst.addOperand(MCOperand::CreateReg(Mips::ZERO)); - NopInst.addOperand(MCOperand::CreateReg(Mips::ZERO)); - NopInst.addOperand(MCOperand::CreateImm(0)); - } - Instructions.push_back(NopInst); + createNop(hasShortDelaySlot(Inst.getOpcode()), IDLoc, Instructions); return false; } @@ -1584,10 +1585,10 @@ bool MipsAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc, bool MipsAsmParser::needsExpansion(MCInst &Inst) { switch (Inst.getOpcode()) { - case Mips::LoadImm32Reg: - case Mips::LoadAddr32Imm: - case Mips::LoadAddr32Reg: - case Mips::LoadImm64Reg: + case Mips::LoadImm32: + case Mips::LoadImm64: + case Mips::LoadAddrImm32: + case Mips::LoadAddrReg32: case Mips::B_MM_Pseudo: case Mips::LWM_MM: case Mips::SWM_MM: @@ -1603,17 +1604,17 @@ bool MipsAsmParser::expandInstruction(MCInst &Inst, SMLoc IDLoc, SmallVectorImpl<MCInst> &Instructions) { switch (Inst.getOpcode()) { default: llvm_unreachable("unimplemented expansion"); - case Mips::LoadImm32Reg: + case Mips::LoadImm32: return expandLoadImm(Inst, IDLoc, Instructions); - case Mips::LoadImm64Reg: + case Mips::LoadImm64: if (!isGP64bit()) { Error(IDLoc, "instruction requires a 64-bit architecture"); return true; } return expandLoadImm(Inst, IDLoc, Instructions); - case Mips::LoadAddr32Imm: + case Mips::LoadAddrImm32: return expandLoadAddressImm(Inst, IDLoc, Instructions); - case Mips::LoadAddr32Reg: + case Mips::LoadAddrReg32: return expandLoadAddressReg(Inst, IDLoc, Instructions); case Mips::B_MM_Pseudo: return expandUncondBranchMMPseudo(Inst, IDLoc, Instructions); @@ -1982,14 +1983,10 @@ bool MipsAsmParser::expandUncondBranchMMPseudo( } Instructions.push_back(Inst); - if (AssemblerOptions.back()->isReorder()) { - // If .set reorder is active, emit a NOP after the branch instruction. - MCInst NopInst; - NopInst.setOpcode(Mips::MOVE16_MM); - NopInst.addOperand(MCOperand::CreateReg(Mips::ZERO)); - NopInst.addOperand(MCOperand::CreateReg(Mips::ZERO)); - Instructions.push_back(NopInst); - } + // If .set reorder is active, emit a NOP after the branch instruction. + if (AssemblerOptions.back()->isReorder()) + createNop(true, IDLoc, Instructions); + return false; } @@ -2132,6 +2129,22 @@ MipsAsmParser::expandLoadStoreMultiple(MCInst &Inst, SMLoc IDLoc, return false; } +void MipsAsmParser::createNop(bool hasShortDelaySlot, SMLoc IDLoc, + SmallVectorImpl<MCInst> &Instructions) { + MCInst NopInst; + if (hasShortDelaySlot) { + NopInst.setOpcode(Mips::MOVE16_MM); + NopInst.addOperand(MCOperand::CreateReg(Mips::ZERO)); + NopInst.addOperand(MCOperand::CreateReg(Mips::ZERO)); + } else { + NopInst.setOpcode(Mips::SLL); + NopInst.addOperand(MCOperand::CreateReg(Mips::ZERO)); + NopInst.addOperand(MCOperand::CreateReg(Mips::ZERO)); + NopInst.addOperand(MCOperand::CreateImm(0)); + } + Instructions.push_back(NopInst); +} + unsigned MipsAsmParser::checkTargetMatchPredicate(MCInst &Inst) { // As described by the Mips32r2 spec, the registers Rd and Rs for // jalr.hb must be different. 
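createNop() above picks between two encodings: a 16-bit microMIPS move for short delay slots, and the classic sll $zero, $zero, 0 otherwise. The latter is the canonical MIPS nop because an R-type SLL with all fields zero encodes as the all-zero word, as a standalone sketch shows:

    #include <cassert>
    #include <cstdint>

    // R-type: opcode(6)=0 | rs(5)=0 | rt(5) | rd(5) | shamt(5) | funct(6)=0
    uint32_t encodeSLL(unsigned Rd, unsigned Rt, unsigned Shamt) {
      return (Rt << 16) | (Rd << 11) | (Shamt << 6);
    }

    int main() {
      assert(encodeSLL(0, 0, 0) == 0x00000000); // nop is the zero word
      assert(encodeSLL(1, 1, 4) == 0x00010900); // sll $at, $at, 4 (sanity)
    }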
@@ -2370,14 +2383,6 @@ int MipsAsmParser::matchMSA128CtrlRegisterName(StringRef Name) { return CC; } -bool MipsAssemblerOptions::setATReg(unsigned Reg) { - if (Reg > 31) - return false; - - ATReg = Reg; - return true; -} - int MipsAsmParser::getATReg(SMLoc Loc) { int AT = AssemblerOptions.back()->getATRegNum(); if (AT == 0) @@ -4429,9 +4434,25 @@ bool MipsAsmParser::ParseDirective(AsmToken DirectiveID) { if (IDVal == ".module") return parseDirectiveModule(); + if (IDVal == ".llvm_internal_mips_reallow_module_directive") + return parseInternalDirectiveReallowModule(); + return true; } +bool MipsAsmParser::parseInternalDirectiveReallowModule() { + // If this is not the end of the statement, report an error. + if (getLexer().isNot(AsmToken::EndOfStatement)) { + reportParseError("unexpected token, expected end of statement"); + return false; + } + + getTargetStreamer().reallowModuleDirective(); + + getParser().Lex(); // Eat EndOfStatement token. + return false; +} + extern "C" void LLVMInitializeMipsAsmParser() { RegisterMCAsmParser<MipsAsmParser> X(TheMipsTarget); RegisterMCAsmParser<MipsAsmParser> Y(TheMipselTarget); diff --git a/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h b/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h index dd0e54c..243b73d 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h +++ b/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h @@ -32,10 +32,9 @@ class MipsAsmBackend : public MCAsmBackend { bool Is64Bit; // 32 or 64 bit words public: - MipsAsmBackend(const Target &T, Triple::OSType _OSType, bool _isLittle, - bool _is64Bit) - : MCAsmBackend(), OSType(_OSType), IsLittle(_isLittle), - Is64Bit(_is64Bit) {} + MipsAsmBackend(const Target &T, Triple::OSType OSType, bool IsLittle, + bool Is64Bit) + : MCAsmBackend(), OSType(OSType), IsLittle(IsLittle), Is64Bit(Is64Bit) {} MCObjectWriter *createObjectWriter(raw_ostream &OS) const override; diff --git a/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp b/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp index e14dc8d..a68bf16 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp +++ b/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp @@ -38,9 +38,9 @@ namespace { MipsELFObjectWriter::MipsELFObjectWriter(bool _is64Bit, uint8_t OSABI, bool _isN64, bool IsLittleEndian) - : MCELFObjectTargetWriter(_is64Bit, OSABI, ELF::EM_MIPS, - /*HasRelocationAddend*/ (_isN64) ? true : false, - /*IsN64*/ _isN64) {} + : MCELFObjectTargetWriter(_is64Bit, OSABI, ELF::EM_MIPS, + /*HasRelocationAddend*/ _isN64, + /*IsN64*/ _isN64) {} MipsELFObjectWriter::~MipsELFObjectWriter() {} @@ -54,9 +54,11 @@ unsigned MipsELFObjectWriter::GetRelocType(const MCValue &Target, switch (Kind) { default: llvm_unreachable("invalid fixup kind!"); + case Mips::fixup_Mips_32: case FK_Data_4: Type = ELF::R_MIPS_32; break; + case Mips::fixup_Mips_64: case FK_Data_8: Type = ELF::R_MIPS_64; break; @@ -262,12 +264,10 @@ MipsELFObjectWriter::needsRelocateWithSymbol(const MCSymbolData &SD, } } -MCObjectWriter *llvm::createMipsELFObjectWriter(raw_ostream &OS, - uint8_t OSABI, +MCObjectWriter *llvm::createMipsELFObjectWriter(raw_ostream &OS, uint8_t OSABI, bool IsLittleEndian, bool Is64Bit) { - MCELFObjectTargetWriter *MOTW = new MipsELFObjectWriter(Is64Bit, OSABI, - (Is64Bit) ? 
true : false, - IsLittleEndian); + MCELFObjectTargetWriter *MOTW = + new MipsELFObjectWriter(Is64Bit, OSABI, Is64Bit, IsLittleEndian); return createELFObjectWriter(MOTW, OS, IsLittleEndian); } diff --git a/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp b/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp index 18c4a20..93f60df 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp +++ b/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp @@ -69,11 +69,9 @@ void MipsELFStreamer::EmitMipsOptionRecords() { I->EmitMipsOptionRecord(); } -namespace llvm { -MCELFStreamer *createMipsELFStreamer(MCContext &Context, MCAsmBackend &MAB, - raw_ostream &OS, MCCodeEmitter *Emitter, - const MCSubtargetInfo &STI, - bool RelaxAll) { - return new MipsELFStreamer(Context, MAB, OS, Emitter, STI); -} +MCELFStreamer *llvm::createMipsELFStreamer(MCContext &Context, + MCAsmBackend &MAB, raw_ostream &OS, + MCCodeEmitter *Emitter, + bool RelaxAll) { + return new MipsELFStreamer(Context, MAB, OS, Emitter); } diff --git a/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h b/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h index bc76d8a..6b834c6 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h +++ b/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h @@ -34,7 +34,7 @@ class MipsELFStreamer : public MCELFStreamer { public: MipsELFStreamer(MCContext &Context, MCAsmBackend &MAB, raw_ostream &OS, - MCCodeEmitter *Emitter, const MCSubtargetInfo &STI) + MCCodeEmitter *Emitter) : MCELFStreamer(Context, MAB, OS, Emitter) { RegInfoRecord = new MipsRegInfoRecord(this, Context); @@ -69,6 +69,6 @@ public: MCELFStreamer *createMipsELFStreamer(MCContext &Context, MCAsmBackend &MAB, raw_ostream &OS, MCCodeEmitter *Emitter, - const MCSubtargetInfo &STI, bool RelaxAll); + bool RelaxAll); } // namespace llvm. #endif diff --git a/lib/Target/Mips/MCTargetDesc/MipsFixupKinds.h b/lib/Target/Mips/MCTargetDesc/MipsFixupKinds.h index fa8d6a6..e601963 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsFixupKinds.h +++ b/lib/Target/Mips/MCTargetDesc/MipsFixupKinds.h @@ -18,7 +18,7 @@ namespace Mips { // one can have multiple fixup types for a given relocation and thus need // to be uniquely named. // - // This table *must* be in the save order of + // This table *must* be in the same order of // MCFixupKindInfo Infos[Mips::NumTargetFixupKinds] // in MipsAsmBackend.cpp. 
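The comment corrected above ("save order" to "same order") guards a parallel-array invariant: fixup kinds are enum values used to index the MCFixupKindInfo table, so the two declarations must stay in lockstep. A standalone sketch of the invariant with a cheap compile-time check, using made-up fixup names rather than the real Mips kinds:

    #include <cassert>

    enum FixupKind { fixup_Hi16, fixup_Lo16, NumFixupKinds };

    struct FixupInfo { const char *Name; unsigned Bits; };

    const FixupInfo Infos[] = {
        {"fixup_Hi16", 16}, // must match FixupKind order exactly
        {"fixup_Lo16", 16},
    };

    static_assert(sizeof(Infos) / sizeof(Infos[0]) == NumFixupKinds,
                  "fixup info table out of sync with the enum");

    int main() {
      assert(Infos[fixup_Lo16].Name[6] == 'L'); // spot-check the pairing
    }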
// diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp b/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp index 8208725..1c2f2da 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp +++ b/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp @@ -35,14 +35,12 @@ namespace llvm { MCCodeEmitter *createMipsMCCodeEmitterEB(const MCInstrInfo &MCII, const MCRegisterInfo &MRI, - const MCSubtargetInfo &STI, MCContext &Ctx) { return new MipsMCCodeEmitter(MCII, Ctx, false); } MCCodeEmitter *createMipsMCCodeEmitterEL(const MCInstrInfo &MCII, const MCRegisterInfo &MRI, - const MCSubtargetInfo &STI, MCContext &Ctx) { return new MipsMCCodeEmitter(MCII, Ctx, true); } @@ -451,7 +449,7 @@ getSImm9AddiuspValue(const MCInst &MI, unsigned OpNo, } unsigned MipsMCCodeEmitter:: -getExprOpValue(const MCExpr *Expr,SmallVectorImpl<MCFixup> &Fixups, +getExprOpValue(const MCExpr *Expr, SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const { int64_t Res; @@ -500,6 +498,9 @@ getExprOpValue(const MCExpr *Expr,SmallVectorImpl<MCFixup> &Fixups, switch(cast<MCSymbolRefExpr>(Expr)->getKind()) { default: llvm_unreachable("Unknown fixup kind!"); break; + case MCSymbolRefExpr::VK_None: + FixupKind = Mips::fixup_Mips_32; // FIXME: This is ok for O32/N32 but not N64. + break; case MCSymbolRefExpr::VK_Mips_GPOFF_HI : FixupKind = Mips::fixup_Mips_GPOFF_HI; break; diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCNaCl.h b/lib/Target/Mips/MCTargetDesc/MipsMCNaCl.h index e756b47..e6b5be7 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsMCNaCl.h +++ b/lib/Target/Mips/MCTargetDesc/MipsMCNaCl.h @@ -25,7 +25,6 @@ bool baseRegNeedsLoadStoreMask(unsigned Reg); MCELFStreamer *createMipsNaClELFStreamer(MCContext &Context, MCAsmBackend &TAB, raw_ostream &OS, MCCodeEmitter *Emitter, - const MCSubtargetInfo &STI, bool RelaxAll); } diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp b/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp index 9b56067..6f3f37b 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp +++ b/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp @@ -106,96 +106,73 @@ static MCInstPrinter *createMipsMCInstPrinter(const Target &T, return new MipsInstPrinter(MAI, MII, MRI); } -static MCStreamer *createMCStreamer(const Target &T, StringRef TT, - MCContext &Context, MCAsmBackend &MAB, - raw_ostream &OS, MCCodeEmitter *Emitter, - const MCSubtargetInfo &STI, bool RelaxAll) { +static MCStreamer *createMCStreamer(const Triple &T, MCContext &Context, + MCAsmBackend &MAB, raw_ostream &OS, + MCCodeEmitter *Emitter, bool RelaxAll) { MCStreamer *S; - if (!Triple(TT).isOSNaCl()) - S = createMipsELFStreamer(Context, MAB, OS, Emitter, STI, RelaxAll); + if (!T.isOSNaCl()) + S = createMipsELFStreamer(Context, MAB, OS, Emitter, RelaxAll); else - S = createMipsNaClELFStreamer(Context, MAB, OS, Emitter, STI, RelaxAll); - new MipsTargetELFStreamer(*S, STI); + S = createMipsNaClELFStreamer(Context, MAB, OS, Emitter, RelaxAll); return S; } -static MCStreamer * -createMCAsmStreamer(MCContext &Ctx, formatted_raw_ostream &OS, - bool isVerboseAsm, bool useDwarfDirectory, - MCInstPrinter *InstPrint, MCCodeEmitter *CE, - MCAsmBackend *TAB, bool ShowInst) { - MCStreamer *S = llvm::createAsmStreamer( - Ctx, OS, isVerboseAsm, useDwarfDirectory, InstPrint, CE, TAB, ShowInst); - new MipsTargetAsmStreamer(*S, OS); - return S; +static MCTargetStreamer *createMipsAsmTargetStreamer(MCStreamer &S, + formatted_raw_ostream &OS, + MCInstPrinter *InstPrint, + bool isVerboseAsm) { + return new MipsTargetAsmStreamer(S, OS); 
} static MCTargetStreamer *createMipsNullTargetStreamer(MCStreamer &S) { return new MipsTargetStreamer(S); } +static MCTargetStreamer * +createMipsObjectTargetStreamer(MCStreamer &S, const MCSubtargetInfo &STI) { + return new MipsTargetELFStreamer(S, STI); +} + extern "C" void LLVMInitializeMipsTargetMC() { - // Register the MC asm info. - RegisterMCAsmInfoFn X(TheMipsTarget, createMipsMCAsmInfo); - RegisterMCAsmInfoFn Y(TheMipselTarget, createMipsMCAsmInfo); - RegisterMCAsmInfoFn A(TheMips64Target, createMipsMCAsmInfo); - RegisterMCAsmInfoFn B(TheMips64elTarget, createMipsMCAsmInfo); - - // Register the MC codegen info. - TargetRegistry::RegisterMCCodeGenInfo(TheMipsTarget, - createMipsMCCodeGenInfo); - TargetRegistry::RegisterMCCodeGenInfo(TheMipselTarget, - createMipsMCCodeGenInfo); - TargetRegistry::RegisterMCCodeGenInfo(TheMips64Target, - createMipsMCCodeGenInfo); - TargetRegistry::RegisterMCCodeGenInfo(TheMips64elTarget, - createMipsMCCodeGenInfo); - - // Register the MC instruction info. - TargetRegistry::RegisterMCInstrInfo(TheMipsTarget, createMipsMCInstrInfo); - TargetRegistry::RegisterMCInstrInfo(TheMipselTarget, createMipsMCInstrInfo); - TargetRegistry::RegisterMCInstrInfo(TheMips64Target, createMipsMCInstrInfo); - TargetRegistry::RegisterMCInstrInfo(TheMips64elTarget, - createMipsMCInstrInfo); - - // Register the MC register info. - TargetRegistry::RegisterMCRegInfo(TheMipsTarget, createMipsMCRegisterInfo); - TargetRegistry::RegisterMCRegInfo(TheMipselTarget, createMipsMCRegisterInfo); - TargetRegistry::RegisterMCRegInfo(TheMips64Target, createMipsMCRegisterInfo); - TargetRegistry::RegisterMCRegInfo(TheMips64elTarget, - createMipsMCRegisterInfo); + for (Target *T : {&TheMipsTarget, &TheMipselTarget, &TheMips64Target, + &TheMips64elTarget}) { + // Register the MC asm info. + RegisterMCAsmInfoFn X(*T, createMipsMCAsmInfo); + + // Register the MC codegen info. + TargetRegistry::RegisterMCCodeGenInfo(*T, createMipsMCCodeGenInfo); + + // Register the MC instruction info. + TargetRegistry::RegisterMCInstrInfo(*T, createMipsMCInstrInfo); + + // Register the MC register info. + TargetRegistry::RegisterMCRegInfo(*T, createMipsMCRegisterInfo); + + // Register the elf streamer. + TargetRegistry::RegisterELFStreamer(*T, createMCStreamer); + + // Register the asm target streamer. + TargetRegistry::RegisterAsmTargetStreamer(*T, createMipsAsmTargetStreamer); + + TargetRegistry::RegisterNullTargetStreamer(*T, + createMipsNullTargetStreamer); + + // Register the MC subtarget info. + TargetRegistry::RegisterMCSubtargetInfo(*T, createMipsMCSubtargetInfo); + + // Register the MCInstPrinter. + TargetRegistry::RegisterMCInstPrinter(*T, createMipsMCInstPrinter); + + TargetRegistry::RegisterObjectTargetStreamer( + *T, createMipsObjectTargetStreamer); + } // Register the MC Code Emitter - TargetRegistry::RegisterMCCodeEmitter(TheMipsTarget, - createMipsMCCodeEmitterEB); - TargetRegistry::RegisterMCCodeEmitter(TheMipselTarget, - createMipsMCCodeEmitterEL); - TargetRegistry::RegisterMCCodeEmitter(TheMips64Target, - createMipsMCCodeEmitterEB); - TargetRegistry::RegisterMCCodeEmitter(TheMips64elTarget, - createMipsMCCodeEmitterEL); - - // Register the object streamer. 
- TargetRegistry::RegisterMCObjectStreamer(TheMipsTarget, createMCStreamer); - TargetRegistry::RegisterMCObjectStreamer(TheMipselTarget, createMCStreamer); - TargetRegistry::RegisterMCObjectStreamer(TheMips64Target, createMCStreamer); - TargetRegistry::RegisterMCObjectStreamer(TheMips64elTarget, - createMCStreamer); - - // Register the asm streamer. - TargetRegistry::RegisterAsmStreamer(TheMipsTarget, createMCAsmStreamer); - TargetRegistry::RegisterAsmStreamer(TheMipselTarget, createMCAsmStreamer); - TargetRegistry::RegisterAsmStreamer(TheMips64Target, createMCAsmStreamer); - TargetRegistry::RegisterAsmStreamer(TheMips64elTarget, createMCAsmStreamer); - - TargetRegistry::RegisterNullTargetStreamer(TheMipsTarget, - createMipsNullTargetStreamer); - TargetRegistry::RegisterNullTargetStreamer(TheMipselTarget, - createMipsNullTargetStreamer); - TargetRegistry::RegisterNullTargetStreamer(TheMips64Target, - createMipsNullTargetStreamer); - TargetRegistry::RegisterNullTargetStreamer(TheMips64elTarget, - createMipsNullTargetStreamer); + for (Target *T : {&TheMipsTarget, &TheMips64Target}) + TargetRegistry::RegisterMCCodeEmitter(*T, createMipsMCCodeEmitterEB); + + for (Target *T : {&TheMipselTarget, &TheMips64elTarget}) + TargetRegistry::RegisterMCCodeEmitter(*T, createMipsMCCodeEmitterEL); // Register the asm backend. TargetRegistry::RegisterMCAsmBackend(TheMipsTarget, @@ -207,23 +184,4 @@ extern "C" void LLVMInitializeMipsTargetMC() { TargetRegistry::RegisterMCAsmBackend(TheMips64elTarget, createMipsAsmBackendEL64); - // Register the MC subtarget info. - TargetRegistry::RegisterMCSubtargetInfo(TheMipsTarget, - createMipsMCSubtargetInfo); - TargetRegistry::RegisterMCSubtargetInfo(TheMipselTarget, - createMipsMCSubtargetInfo); - TargetRegistry::RegisterMCSubtargetInfo(TheMips64Target, - createMipsMCSubtargetInfo); - TargetRegistry::RegisterMCSubtargetInfo(TheMips64elTarget, - createMipsMCSubtargetInfo); - - // Register the MCInstPrinter. 
- TargetRegistry::RegisterMCInstPrinter(TheMipsTarget, - createMipsMCInstPrinter); - TargetRegistry::RegisterMCInstPrinter(TheMipselTarget, - createMipsMCInstPrinter); - TargetRegistry::RegisterMCInstPrinter(TheMips64Target, - createMipsMCInstPrinter); - TargetRegistry::RegisterMCInstPrinter(TheMips64elTarget, - createMipsMCInstPrinter); } diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.h b/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.h index 9528b4e..92f394a 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.h +++ b/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.h @@ -35,11 +35,9 @@ extern Target TheMips64elTarget; MCCodeEmitter *createMipsMCCodeEmitterEB(const MCInstrInfo &MCII, const MCRegisterInfo &MRI, - const MCSubtargetInfo &STI, MCContext &Ctx); MCCodeEmitter *createMipsMCCodeEmitterEL(const MCInstrInfo &MCII, const MCRegisterInfo &MRI, - const MCSubtargetInfo &STI, MCContext &Ctx); MCAsmBackend *createMipsAsmBackendEB32(const Target &T, diff --git a/lib/Target/Mips/MCTargetDesc/MipsNaClELFStreamer.cpp b/lib/Target/Mips/MCTargetDesc/MipsNaClELFStreamer.cpp index 92b8455..1adfdf9 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsNaClELFStreamer.cpp +++ b/lib/Target/Mips/MCTargetDesc/MipsNaClELFStreamer.cpp @@ -37,8 +37,8 @@ const unsigned LoadStoreStackMaskReg = Mips::T7; class MipsNaClELFStreamer : public MipsELFStreamer { public: MipsNaClELFStreamer(MCContext &Context, MCAsmBackend &TAB, raw_ostream &OS, - MCCodeEmitter *Emitter, const MCSubtargetInfo &STI) - : MipsELFStreamer(Context, TAB, OS, Emitter, STI), PendingCall(false) {} + MCCodeEmitter *Emitter) + : MipsELFStreamer(Context, TAB, OS, Emitter), PendingCall(false) {} ~MipsNaClELFStreamer() {} @@ -254,10 +254,8 @@ bool baseRegNeedsLoadStoreMask(unsigned Reg) { MCELFStreamer *createMipsNaClELFStreamer(MCContext &Context, MCAsmBackend &TAB, raw_ostream &OS, MCCodeEmitter *Emitter, - const MCSubtargetInfo &STI, bool RelaxAll) { - MipsNaClELFStreamer *S = new MipsNaClELFStreamer(Context, TAB, OS, Emitter, - STI); + MipsNaClELFStreamer *S = new MipsNaClELFStreamer(Context, TAB, OS, Emitter); if (RelaxAll) S->getAssembler().setRelaxAll(true); diff --git a/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp b/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp index 64d7cab..5790a5c 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp +++ b/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp @@ -62,7 +62,7 @@ void MipsTargetStreamer::emitFMask(unsigned FPUBitmask, int FPUTopSavedRegOff) { void MipsTargetStreamer::emitDirectiveSetArch(StringRef Arch) { forbidModuleDirective(); } -void MipsTargetStreamer::emitDirectiveSetMips0() {} +void MipsTargetStreamer::emitDirectiveSetMips0() { forbidModuleDirective(); } void MipsTargetStreamer::emitDirectiveSetMips1() { forbidModuleDirective(); } void MipsTargetStreamer::emitDirectiveSetMips2() { forbidModuleDirective(); } void MipsTargetStreamer::emitDirectiveSetMips3() { forbidModuleDirective(); } @@ -78,8 +78,8 @@ void MipsTargetStreamer::emitDirectiveSetMips64R2() { forbidModuleDirective(); } void MipsTargetStreamer::emitDirectiveSetMips64R3() { forbidModuleDirective(); } void MipsTargetStreamer::emitDirectiveSetMips64R5() { forbidModuleDirective(); } void MipsTargetStreamer::emitDirectiveSetMips64R6() { forbidModuleDirective(); } -void MipsTargetStreamer::emitDirectiveSetPop() {} -void MipsTargetStreamer::emitDirectiveSetPush() {} +void MipsTargetStreamer::emitDirectiveSetPop() { forbidModuleDirective(); } +void MipsTargetStreamer::emitDirectiveSetPush() { 
forbidModuleDirective(); } void MipsTargetStreamer::emitDirectiveSetDsp() { forbidModuleDirective(); } void MipsTargetStreamer::emitDirectiveSetNoDsp() { forbidModuleDirective(); } void MipsTargetStreamer::emitDirectiveCpLoad(unsigned RegNo) {} @@ -91,6 +91,10 @@ void MipsTargetStreamer::emitDirectiveModuleOddSPReg(bool Enabled, if (!Enabled && !IsO32ABI) report_fatal_error("+nooddspreg is only valid for O32"); } +void MipsTargetStreamer::emitDirectiveSetFp( + MipsABIFlagsSection::FpABIKind Value) { + forbidModuleDirective(); +} MipsTargetAsmStreamer::MipsTargetAsmStreamer(MCStreamer &S, formatted_raw_ostream &OS) @@ -198,7 +202,10 @@ void MipsTargetAsmStreamer::emitDirectiveSetArch(StringRef Arch) { MipsTargetStreamer::emitDirectiveSetArch(Arch); } -void MipsTargetAsmStreamer::emitDirectiveSetMips0() { OS << "\t.set\tmips0\n"; } +void MipsTargetAsmStreamer::emitDirectiveSetMips0() { + OS << "\t.set\tmips0\n"; + MipsTargetStreamer::emitDirectiveSetMips0(); +} void MipsTargetAsmStreamer::emitDirectiveSetMips1() { OS << "\t.set\tmips1\n"; @@ -285,9 +292,15 @@ void MipsTargetAsmStreamer::emitDirectiveSetNoDsp() { MipsTargetStreamer::emitDirectiveSetNoDsp(); } -void MipsTargetAsmStreamer::emitDirectiveSetPop() { OS << "\t.set\tpop\n"; } +void MipsTargetAsmStreamer::emitDirectiveSetPop() { + OS << "\t.set\tpop\n"; + MipsTargetStreamer::emitDirectiveSetPop(); +} -void MipsTargetAsmStreamer::emitDirectiveSetPush() { OS << "\t.set\tpush\n"; } +void MipsTargetAsmStreamer::emitDirectiveSetPush() { + OS << "\t.set\tpush\n"; + MipsTargetStreamer::emitDirectiveSetPush(); +} // Print a 32 bit hex number with all numbers. static void printHex32(unsigned Value, raw_ostream &OS) { @@ -346,15 +359,13 @@ void MipsTargetAsmStreamer::emitDirectiveModuleFP( void MipsTargetAsmStreamer::emitDirectiveSetFp( MipsABIFlagsSection::FpABIKind Value) { + MipsTargetStreamer::emitDirectiveSetFp(Value); + StringRef ModuleValue; OS << "\t.set\tfp="; OS << ABIFlagsSection.getFpABIString(Value) << "\n"; } -void MipsTargetAsmStreamer::emitMipsAbiFlags() { - // No action required for text output. -} - void MipsTargetAsmStreamer::emitDirectiveModuleOddSPReg(bool Enabled, bool IsO32ABI) { MipsTargetStreamer::emitDirectiveModuleOddSPReg(Enabled, IsO32ABI); @@ -367,10 +378,7 @@ MipsTargetELFStreamer::MipsTargetELFStreamer(MCStreamer &S, const MCSubtargetInfo &STI) : MipsTargetStreamer(S), MicroMipsEnabled(false), STI(STI) { MCAssembler &MCA = getStreamer().getAssembler(); - Triple T(STI.getTargetTriple()); - Pic = (MCA.getContext().getObjectFileInfo()->getRelocM() == Reloc::PIC_) - ? 
true - : false; + Pic = MCA.getContext().getObjectFileInfo()->getRelocM() == Reloc::PIC_; uint64_t Features = STI.getFeatureBits(); diff --git a/lib/Target/Mips/MicroMipsInstrInfo.td b/lib/Target/Mips/MicroMipsInstrInfo.td index e20df2f..2aab739 100644 --- a/lib/Target/Mips/MicroMipsInstrInfo.td +++ b/lib/Target/Mips/MicroMipsInstrInfo.td @@ -642,8 +642,10 @@ let DecoderNamespace = "MicroMips", Predicates = [InMicroMips] in { LW_FM_MM<0xc>; /// Arithmetic Instructions (3-Operand, R-Type) - def ADDu_MM : MMRel, ArithLogicR<"addu", GPR32Opnd>, ADD_FM_MM<0, 0x150>; - def SUBu_MM : MMRel, ArithLogicR<"subu", GPR32Opnd>, ADD_FM_MM<0, 0x1d0>; + def ADDu_MM : MMRel, ArithLogicR<"addu", GPR32Opnd, 1, II_ADDU, add>, + ADD_FM_MM<0, 0x150>; + def SUBu_MM : MMRel, ArithLogicR<"subu", GPR32Opnd, 0, II_SUBU, sub>, + ADD_FM_MM<0, 0x1d0>; def MUL_MM : MMRel, ArithLogicR<"mul", GPR32Opnd>, ADD_FM_MM<0, 0x210>; def ADD_MM : MMRel, ArithLogicR<"add", GPR32Opnd>, ADD_FM_MM<0, 0x110>; def SUB_MM : MMRel, ArithLogicR<"sub", GPR32Opnd>, ADD_FM_MM<0, 0x190>; @@ -883,6 +885,8 @@ def : MipsPat<(i32 immSExt16:$imm), (ADDiu_MM ZERO, immSExt16:$imm)>; def : MipsPat<(i32 immZExt16:$imm), (ORi_MM ZERO, immZExt16:$imm)>; +def : MipsPat<(not GPR32:$in), + (NOR_MM GPR32Opnd:$in, ZERO)>; def : MipsPat<(add GPRMM16:$src, immSExtAddiur2:$imm), (ADDIUR2_MM GPRMM16:$src, immSExtAddiur2:$imm)>; diff --git a/lib/Target/Mips/Mips.h b/lib/Target/Mips/Mips.h index cb09c1a..671d7a8 100644 --- a/lib/Target/Mips/Mips.h +++ b/lib/Target/Mips/Mips.h @@ -20,8 +20,13 @@ namespace llvm { class MipsTargetMachine; + class ModulePass; class FunctionPass; + ModulePass *createMipsOs16Pass(MipsTargetMachine &TM); + ModulePass *createMips16HardFloatPass(MipsTargetMachine &TM); + + FunctionPass *createMipsModuleISelDagPass(MipsTargetMachine &TM); FunctionPass *createMipsOptimizePICCallPass(MipsTargetMachine &TM); FunctionPass *createMipsDelaySlotFillerPass(MipsTargetMachine &TM); FunctionPass *createMipsLongBranchPass(MipsTargetMachine &TM); diff --git a/lib/Target/Mips/Mips.td b/lib/Target/Mips/Mips.td index 01c548e..ca24741 100644 --- a/lib/Target/Mips/Mips.td +++ b/lib/Target/Mips/Mips.td @@ -58,22 +58,22 @@ def MipsInstrInfo : InstrInfo; //===----------------------------------------------------------------------===// def FeatureNoABICalls : SubtargetFeature<"noabicalls", "NoABICalls", "true", - "Disable SVR4-style position-independent code.">; + "Disable SVR4-style position-independent code">; def FeatureGP64Bit : SubtargetFeature<"gp64", "IsGP64bit", "true", - "General Purpose Registers are 64-bit wide.">; + "General Purpose Registers are 64-bit wide">; def FeatureFP64Bit : SubtargetFeature<"fp64", "IsFP64bit", "true", - "Support 64-bit FP registers.">; + "Support 64-bit FP registers">; def FeatureFPXX : SubtargetFeature<"fpxx", "IsFPXX", "true", - "Support for FPXX.">; + "Support for FPXX">; def FeatureNaN2008 : SubtargetFeature<"nan2008", "IsNaN2008bit", "true", - "IEEE 754-2008 NaN encoding.">; + "IEEE 754-2008 NaN encoding">; def FeatureSingleFloat : SubtargetFeature<"single-float", "IsSingleFloat", "true", "Only supports single precision float">; def FeatureNoOddSPReg : SubtargetFeature<"nooddspreg", "UseOddSPReg", "false", "Disable odd numbered single-precision " "registers">; def FeatureVFPU : SubtargetFeature<"vfpu", "HasVFPU", - "true", "Enable vector FPU instructions.">; + "true", "Enable vector FPU instructions">; def FeatureMips1 : SubtargetFeature<"mips1", "MipsArchVersion", "Mips1", "Mips I ISA Support [highly experimental]">; 
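Among the hunks above, the new MicroMips pattern maps a plain bitwise not onto NOR_MM. That rests on the identity nor(x, 0) = ~(x | 0) = ~x, which a standalone sketch can check exhaustively enough to trust:

    #include <cassert>
    #include <cstdint>
    #include <initializer_list>

    uint32_t nor(uint32_t A, uint32_t B) { return ~(A | B); }

    int main() {
      for (uint32_t X : {0u, 1u, 0xDEADBEEFu, 0xFFFFFFFFu})
        assert(nor(X, 0) == ~X); // not x == nor(x, zero)
    }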
def FeatureMips2 : SubtargetFeature<"mips2", "MipsArchVersion", "Mips2", diff --git a/lib/Target/Mips/Mips16HardFloat.cpp b/lib/Target/Mips/Mips16HardFloat.cpp index 32dc90a..893fc7c 100644 --- a/lib/Target/Mips/Mips16HardFloat.cpp +++ b/lib/Target/Mips/Mips16HardFloat.cpp @@ -11,7 +11,7 @@ // //===----------------------------------------------------------------------===// -#include "Mips16HardFloat.h" +#include "MipsTargetMachine.h" #include "llvm/IR/Module.h" #include "llvm/IR/Value.h" #include "llvm/Support/Debug.h" @@ -19,38 +19,51 @@ #include <algorithm> #include <string> -#define DEBUG_TYPE "mips16-hard-float" +using namespace llvm; -static void inlineAsmOut - (LLVMContext &C, StringRef AsmString, BasicBlock *BB ) { - std::vector<llvm::Type *> AsmArgTypes; - std::vector<llvm::Value*> AsmArgs; - llvm::FunctionType *AsmFTy = - llvm::FunctionType::get(Type::getVoidTy(C), - AsmArgTypes, false); - llvm::InlineAsm *IA = - llvm::InlineAsm::get(AsmFTy, AsmString, "", true, - /* IsAlignStack */ false, - llvm::InlineAsm::AD_ATT); - CallInst::Create(IA, AsmArgs, "", BB); -} +#define DEBUG_TYPE "mips16-hard-float" namespace { + class Mips16HardFloat : public ModulePass { + public: + static char ID; -class InlineAsmHelper { - LLVMContext &C; - BasicBlock *BB; -public: - InlineAsmHelper(LLVMContext &C_, BasicBlock *BB_) : - C(C_), BB(BB_) { - } + Mips16HardFloat(MipsTargetMachine &TM_) : ModulePass(ID), TM(TM_) {} - void Out(StringRef AsmString) { - inlineAsmOut(C, AsmString, BB); - } + const char *getPassName() const override { + return "MIPS16 Hard Float Pass"; + } -}; + bool runOnModule(Module &M) override; + + protected: + const MipsTargetMachine &TM; + }; + + class InlineAsmHelper { + LLVMContext &C; + BasicBlock *BB; + public: + InlineAsmHelper(LLVMContext &C_, BasicBlock *BB_) : + C(C_), BB(BB_) { + } + + void Out(StringRef AsmString) { + std::vector<llvm::Type *> AsmArgTypes; + std::vector<llvm::Value*> AsmArgs; + + llvm::FunctionType *AsmFTy = llvm::FunctionType::get(Type::getVoidTy(C), + AsmArgTypes, false); + llvm::InlineAsm *IA = llvm::InlineAsm::get(AsmFTy, AsmString, "", true, + /* IsAlignStack */ false, + llvm::InlineAsm::AD_ATT); + CallInst::Create(IA, AsmArgs, "", BB); + } + }; + + char Mips16HardFloat::ID = 0; } + // // Return types that matter for hard float are: // float, double, complex float, and complex double @@ -154,11 +167,11 @@ static bool needsFPStubFromParams(Function &F) { if (F.arg_size() >=1) { Type *ArgType = F.getFunctionType()->getParamType(0); switch (ArgType->getTypeID()) { - case Type::FloatTyID: - case Type::DoubleTyID: - return true; - default: - break; + case Type::FloatTyID: + case Type::DoubleTyID: + return true; + default: + break; } } return false; @@ -182,10 +195,8 @@ static bool needsFPHelperFromSig(Function &F) { // We swap between FP and Integer registers to allow Mips16 and Mips32 to // interoperate // - -static void swapFPIntParams - (FPParamVariant PV, Module *M, InlineAsmHelper &IAH, - bool LE, bool ToFP) { +static void swapFPIntParams(FPParamVariant PV, Module *M, InlineAsmHelper &IAH, + bool LE, bool ToFP) { //LLVMContext &Context = M->getContext(); std::string MI = ToFP? "mtc1 ": "mfc1 "; switch (PV) { @@ -242,6 +253,7 @@ static void swapFPIntParams return; } } + // // Make sure that we know we already need a stub for this function. 
// Having called needsFPHelperFromSig @@ -297,8 +309,8 @@ static void assureFPCallStub(Function &F, Module *M, break; case CFRet: if (LE) { - IAH.Out("mfc1 $$2,$$f0"); - IAH.Out("mfc1 $$3,$$f2"); + IAH.Out("mfc1 $$2,$$f0"); + IAH.Out("mfc1 $$3,$$f2"); } else { IAH.Out("mfc1 $$3,$$f0"); IAH.Out("mfc1 $$3,$$f2"); @@ -331,28 +343,27 @@ static void assureFPCallStub(Function &F, Module *M, // // Functions that are llvm intrinsics and don't need helpers. // -static const char *IntrinsicInline[] = - {"fabs", - "fabsf", - "llvm.ceil.f32", "llvm.ceil.f64", - "llvm.copysign.f32", "llvm.copysign.f64", - "llvm.cos.f32", "llvm.cos.f64", - "llvm.exp.f32", "llvm.exp.f64", - "llvm.exp2.f32", "llvm.exp2.f64", - "llvm.fabs.f32", "llvm.fabs.f64", - "llvm.floor.f32", "llvm.floor.f64", - "llvm.fma.f32", "llvm.fma.f64", - "llvm.log.f32", "llvm.log.f64", - "llvm.log10.f32", "llvm.log10.f64", - "llvm.nearbyint.f32", "llvm.nearbyint.f64", - "llvm.pow.f32", "llvm.pow.f64", - "llvm.powi.f32", "llvm.powi.f64", - "llvm.rint.f32", "llvm.rint.f64", - "llvm.round.f32", "llvm.round.f64", - "llvm.sin.f32", "llvm.sin.f64", - "llvm.sqrt.f32", "llvm.sqrt.f64", - "llvm.trunc.f32", "llvm.trunc.f64", - }; +static const char *IntrinsicInline[] = { + "fabs", "fabsf", + "llvm.ceil.f32", "llvm.ceil.f64", + "llvm.copysign.f32", "llvm.copysign.f64", + "llvm.cos.f32", "llvm.cos.f64", + "llvm.exp.f32", "llvm.exp.f64", + "llvm.exp2.f32", "llvm.exp2.f64", + "llvm.fabs.f32", "llvm.fabs.f64", + "llvm.floor.f32", "llvm.floor.f64", + "llvm.fma.f32", "llvm.fma.f64", + "llvm.log.f32", "llvm.log.f64", + "llvm.log10.f32", "llvm.log10.f64", + "llvm.nearbyint.f32", "llvm.nearbyint.f64", + "llvm.pow.f32", "llvm.pow.f64", + "llvm.powi.f32", "llvm.powi.f64", + "llvm.rint.f32", "llvm.rint.f64", + "llvm.round.f32", "llvm.round.f64", + "llvm.sin.f32", "llvm.sin.f64", + "llvm.sqrt.f32", "llvm.sqrt.f64", + "llvm.trunc.f32", "llvm.trunc.f64", +}; static bool isIntrinsicInline(Function *F) { return std::binary_search(std::begin(IntrinsicInline), @@ -384,9 +395,10 @@ static bool fixupFPReturnAndCall(Function &F, Module *M, Type *T = RVal->getType(); FPReturnVariant RV = whichFPReturnVariant(T); if (RV == NoFPRet) continue; - static const char* Helper[NoFPRet] = - {"__mips16_ret_sf", "__mips16_ret_df", "__mips16_ret_sc", - "__mips16_ret_dc"}; + static const char* Helper[NoFPRet] = { + "__mips16_ret_sf", "__mips16_ret_df", "__mips16_ret_sc", + "__mips16_ret_dc" + }; const char *Name = Helper[RV]; AttributeSet A; Value *Params[] = {RVal}; @@ -406,33 +418,33 @@ static bool fixupFPReturnAndCall(Function &F, Module *M, Value *F = (M->getOrInsertFunction(Name, A, MyVoid, T, nullptr)); CallInst::Create(F, Params, "", &Inst ); } else if (const CallInst *CI = dyn_cast<CallInst>(I)) { - const Value* V = CI->getCalledValue(); - const Type* T = nullptr; - if (V) T = V->getType(); - const PointerType *PFT=nullptr; - if (T) PFT = dyn_cast<PointerType>(T); - const FunctionType *FT=nullptr; - if (PFT) FT = dyn_cast<FunctionType>(PFT->getElementType()); - Function *F_ = CI->getCalledFunction(); - if (FT && needsFPReturnHelper(*FT) && - !(F_ && isIntrinsicInline(F_))) { + const Value* V = CI->getCalledValue(); + const Type* T = nullptr; + if (V) T = V->getType(); + const PointerType *PFT=nullptr; + if (T) PFT = dyn_cast<PointerType>(T); + const FunctionType *FT=nullptr; + if (PFT) FT = dyn_cast<FunctionType>(PFT->getElementType()); + Function *F_ = CI->getCalledFunction(); + if (FT && needsFPReturnHelper(*FT) && + !(F_ && isIntrinsicInline(F_))) { + Modified=true; + 
F.addFnAttr("saveS2"); + } + if (F_ && !isIntrinsicInline(F_)) { + // pic mode calls are handled by already defined + // helper functions + if (needsFPReturnHelper(*F_)) { Modified=true; F.addFnAttr("saveS2"); } - if (F_ && !isIntrinsicInline(F_)) { - // pic mode calls are handled by already defined - // helper functions - if (needsFPReturnHelper(*F_)) { + if (TM.getRelocationModel() != Reloc::PIC_ ) { + if (needsFPHelperFromSig(*F_)) { + assureFPCallStub(*F_, M, TM); Modified=true; - F.addFnAttr("saveS2"); - } - if (TM.getRelocationModel() != Reloc::PIC_ ) { - if (needsFPHelperFromSig(*F_)) { - assureFPCallStub(*F_, M, TM); - Modified=true; - } } } + } } } return Modified; @@ -489,7 +501,6 @@ static void removeUseSoftFloat(Function &F) { F.addAttributes(AttributeSet::FunctionIndex, A); } -namespace llvm { // // This pass only makes sense when the underlying chip has floating point but @@ -530,11 +541,7 @@ bool Mips16HardFloat::runOnModule(Module &M) { return Modified; } -char Mips16HardFloat::ID = 0; - -} -ModulePass *llvm::createMips16HardFloat(MipsTargetMachine &TM) { +ModulePass *llvm::createMips16HardFloatPass(MipsTargetMachine &TM) { return new Mips16HardFloat(TM); } - diff --git a/lib/Target/Mips/Mips16HardFloat.h b/lib/Target/Mips/Mips16HardFloat.h deleted file mode 100644 index 586cc25..0000000 --- a/lib/Target/Mips/Mips16HardFloat.h +++ /dev/null @@ -1,43 +0,0 @@ -//===---- Mips16HardFloat.h for Mips16 Hard Float --------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file defines a phase which implements part of the floating point -// interoperability between Mips16 and Mips32 code. 
-// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIB_TARGET_MIPS_MIPS16HARDFLOAT_H -#define LLVM_LIB_TARGET_MIPS_MIPS16HARDFLOAT_H - -#include "MCTargetDesc/MipsMCTargetDesc.h" -#include "MipsTargetMachine.h" -#include "llvm/Pass.h" -#include "llvm/Target/TargetMachine.h" - -using namespace llvm; - -namespace llvm { - -class Mips16HardFloat : public ModulePass { -public: - static char ID; - - Mips16HardFloat(MipsTargetMachine &TM_) : ModulePass(ID), TM(TM_) {} - - const char *getPassName() const override { return "MIPS16 Hard Float Pass"; } - bool runOnModule(Module &M) override; - -protected: - const MipsTargetMachine &TM; -}; - -ModulePass *createMips16HardFloat(MipsTargetMachine &TM); - -} -#endif diff --git a/lib/Target/Mips/Mips16InstrInfo.cpp b/lib/Target/Mips/Mips16InstrInfo.cpp index 976becc..00d4495 100644 --- a/lib/Target/Mips/Mips16InstrInfo.cpp +++ b/lib/Target/Mips/Mips16InstrInfo.cpp @@ -1,4 +1,3 @@ - //===-- Mips16InstrInfo.cpp - Mips16 Instruction Information --------------===// // // The LLVM Compiler Infrastructure @@ -25,6 +24,7 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/TargetRegistry.h" +#include "llvm/Support/raw_ostream.h" #include <cctype> using namespace llvm; @@ -32,7 +32,7 @@ using namespace llvm; #define DEBUG_TYPE "mips16-instrinfo" Mips16InstrInfo::Mips16InstrInfo(const MipsSubtarget &STI) - : MipsInstrInfo(STI, Mips::Bimm16), RI(STI) {} + : MipsInstrInfo(STI, Mips::Bimm16), RI() {} const MipsRegisterInfo &Mips16InstrInfo::getRegisterInfo() const { return RI; diff --git a/lib/Target/Mips/Mips16InstrInfo.h b/lib/Target/Mips/Mips16InstrInfo.h index e7d0c07..f9b7387 100644 --- a/lib/Target/Mips/Mips16InstrInfo.h +++ b/lib/Target/Mips/Mips16InstrInfo.h @@ -18,7 +18,7 @@ #include "MipsInstrInfo.h" namespace llvm { - +class MipsSubtarget; class Mips16InstrInfo : public MipsInstrInfo { const Mips16RegisterInfo RI; diff --git a/lib/Target/Mips/Mips16RegisterInfo.cpp b/lib/Target/Mips/Mips16RegisterInfo.cpp index c45acc4..ebd51d7 100644 --- a/lib/Target/Mips/Mips16RegisterInfo.cpp +++ b/lib/Target/Mips/Mips16RegisterInfo.cpp @@ -41,8 +41,7 @@ using namespace llvm; #define DEBUG_TYPE "mips16-registerinfo" -Mips16RegisterInfo::Mips16RegisterInfo(const MipsSubtarget &ST) - : MipsRegisterInfo(ST) {} +Mips16RegisterInfo::Mips16RegisterInfo() : MipsRegisterInfo() {} bool Mips16RegisterInfo::requiresRegisterScavenging (const MachineFunction &MF) const { @@ -65,7 +64,7 @@ bool Mips16RegisterInfo::saveScavengerRegister const TargetRegisterClass *RC, unsigned Reg) const { DebugLoc DL; - const TargetInstrInfo &TII = *Subtarget.getInstrInfo(); + const TargetInstrInfo &TII = *MBB.getParent()->getSubtarget().getInstrInfo(); TII.copyPhysReg(MBB, I, DL, Mips::T0, Reg, true); TII.copyPhysReg(MBB, UseMI, DL, Reg, Mips::T0, true); return true; @@ -106,7 +105,7 @@ void Mips16RegisterInfo::eliminateFI(MachineBasicBlock::iterator II, if (FrameIndex >= MinCSFI && FrameIndex <= MaxCSFI) FrameReg = Mips::SP; else { - const TargetFrameLowering *TFI = Subtarget.getFrameLowering(); + const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering(); if (TFI->hasFP(MF)) { FrameReg = Mips::S0; } @@ -140,7 +139,7 @@ void Mips16RegisterInfo::eliminateFI(MachineBasicBlock::iterator II, DebugLoc DL = II->getDebugLoc(); unsigned NewImm; const Mips16InstrInfo &TII = - *static_cast<const Mips16InstrInfo *>(Subtarget.getInstrInfo()); + *static_cast<const Mips16InstrInfo 
*>(MF.getSubtarget().getInstrInfo()); FrameReg = TII.loadImmediate(FrameReg, Offset, MBB, II, DL, NewImm); Offset = SignExtend64<16>(NewImm); IsKill = true; diff --git a/lib/Target/Mips/Mips16RegisterInfo.h b/lib/Target/Mips/Mips16RegisterInfo.h index 3cdf836..d67a79b 100644 --- a/lib/Target/Mips/Mips16RegisterInfo.h +++ b/lib/Target/Mips/Mips16RegisterInfo.h @@ -21,7 +21,7 @@ class Mips16InstrInfo; class Mips16RegisterInfo : public MipsRegisterInfo { public: - Mips16RegisterInfo(const MipsSubtarget &Subtarget); + Mips16RegisterInfo(); bool requiresRegisterScavenging(const MachineFunction &MF) const override; diff --git a/lib/Target/Mips/Mips64InstrInfo.td b/lib/Target/Mips/Mips64InstrInfo.td index 776e473..b1cb7f7 100644 --- a/lib/Target/Mips/Mips64InstrInfo.td +++ b/lib/Target/Mips/Mips64InstrInfo.td @@ -604,7 +604,7 @@ def : MipsInstAlias<"syncws", (SYNC 0x5), 0>; // Assembler Pseudo Instructions //===----------------------------------------------------------------------===// -class LoadImm64<string instr_asm, Operand Od, RegisterOperand RO> : +class LoadImmediate64<string instr_asm, Operand Od, RegisterOperand RO> : MipsAsmPseudoInst<(outs RO:$rt), (ins Od:$imm64), !strconcat(instr_asm, "\t$rt, $imm64")> ; -def LoadImm64Reg : LoadImm64<"dli", imm64, GPR64Opnd>; +def LoadImm64 : LoadImmediate64<"dli", imm64, GPR64Opnd>; diff --git a/lib/Target/Mips/MipsAsmPrinter.cpp b/lib/Target/Mips/MipsAsmPrinter.cpp index c662e13..1eb3b2c 100644 --- a/lib/Target/Mips/MipsAsmPrinter.cpp +++ b/lib/Target/Mips/MipsAsmPrinter.cpp @@ -252,6 +252,7 @@ void MipsAsmPrinter::printSavedRegsBitmask() { // Set the CPU and FPU Bitmasks const MachineFrameInfo *MFI = MF->getFrameInfo(); + const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo(); const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo(); // size of stack area to which FP callee-saved regs are saved. unsigned CPURegSize = Mips::GPR32RegClass.getSize(); @@ -267,8 +268,7 @@ void MipsAsmPrinter::printSavedRegsBitmask() { if (Mips::GPR32RegClass.contains(Reg)) break; - unsigned RegNum = - TM.getSubtargetImpl()->getRegisterInfo()->getEncodingValue(Reg); + unsigned RegNum = TRI->getEncodingValue(Reg); if (Mips::AFGR64RegClass.contains(Reg)) { FPUBitmask |= (3 << RegNum); CSFPRegsSize += AFGR64RegSize; @@ -283,8 +283,7 @@ void MipsAsmPrinter::printSavedRegsBitmask() { // Set CPU Bitmask. for (; i != e; ++i) { unsigned Reg = CSI[i].getReg(); - unsigned RegNum = - TM.getSubtargetImpl()->getRegisterInfo()->getEncodingValue(Reg); + unsigned RegNum = TRI->getEncodingValue(Reg); CPUBitmask |= (1 << RegNum); } @@ -309,7 +308,7 @@ void MipsAsmPrinter::printSavedRegsBitmask() { /// Frame Directive void MipsAsmPrinter::emitFrameDirective() { - const TargetRegisterInfo &RI = *TM.getSubtargetImpl()->getRegisterInfo(); + const TargetRegisterInfo &RI = *MF->getSubtarget().getRegisterInfo(); unsigned stackReg = RI.getFrameRegister(*MF); unsigned returnReg = RI.getRARegister(); @@ -438,7 +437,7 @@ bool MipsAsmPrinter::isBlockOnlyReachableByFallthrough(const MachineBasicBlock* // Print out an operand for an inline asm expression. bool MipsAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum, - unsigned AsmVariant,const char *ExtraCode, + unsigned AsmVariant, const char *ExtraCode, raw_ostream &O) { // Does this asm operand have a single letter operand modifier? 
if (ExtraCode && ExtraCode[0]) { @@ -540,18 +539,24 @@ bool MipsAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNum, unsigned AsmVariant, const char *ExtraCode, raw_ostream &O) { - int Offset = 0; + assert(OpNum + 1 < MI->getNumOperands() && "Insufficient operands"); + const MachineOperand &BaseMO = MI->getOperand(OpNum); + const MachineOperand &OffsetMO = MI->getOperand(OpNum + 1); + assert(BaseMO.isReg() && "Unexpected base pointer for inline asm memory operand."); + assert(OffsetMO.isImm() && "Unexpected offset for inline asm memory operand."); + int Offset = OffsetMO.getImm(); + // Currently we are expecting either no ExtraCode or 'D' if (ExtraCode) { if (ExtraCode[0] == 'D') - Offset = 4; + Offset += 4; else return true; // Unknown modifier. + // FIXME: M = high order bits + // FIXME: L = low order bits } - const MachineOperand &MO = MI->getOperand(OpNum); - assert(MO.isReg() && "unexpected inline asm memory operand"); - O << Offset << "($" << MipsInstPrinter::getRegisterName(MO.getReg()) << ")"; + O << Offset << "($" << MipsInstPrinter::getRegisterName(BaseMO.getReg()) << ")"; return false; } diff --git a/lib/Target/Mips/MipsCallingConv.td b/lib/Target/Mips/MipsCallingConv.td index abee185..dcd88f2 100644 --- a/lib/Target/Mips/MipsCallingConv.td +++ b/lib/Target/Mips/MipsCallingConv.td @@ -123,7 +123,7 @@ def CC_MipsN_SoftFloat : CallingConv<[ ]>; def CC_MipsN : CallingConv<[ - CCIfType<[i8, i16, i32], + CCIfType<[i8, i16, i32, i64], CCIfSubtargetNot<"isLittle()", CCIfInReg<CCPromoteToUpperBitsInType<i64>>>>, diff --git a/lib/Target/Mips/MipsDelaySlotFiller.cpp b/lib/Target/Mips/MipsDelaySlotFiller.cpp index ac03c0b..606964d 100644 --- a/lib/Target/Mips/MipsDelaySlotFiller.cpp +++ b/lib/Target/Mips/MipsDelaySlotFiller.cpp @@ -140,7 +140,7 @@ namespace { /// memory instruction can be moved to a delay slot. class MemDefsUses : public InspectMemInstr { public: - MemDefsUses(const MachineFrameInfo *MFI); + MemDefsUses(const DataLayout &DL, const MachineFrameInfo *MFI); private: typedef PointerUnion<const Value *, const PseudoSourceValue *> ValueType; @@ -158,6 +158,7 @@ namespace { const MachineFrameInfo *MFI; SmallPtrSet<ValueType, 4> Uses, Defs; + const DataLayout &DL; /// Flags indicating whether loads or stores with no underlying objects have /// been seen. @@ -212,8 +213,8 @@ namespace { /// moved to the delay slot. Returns true on success. template<typename IterTy> bool searchRange(MachineBasicBlock &MBB, IterTy Begin, IterTy End, - RegDefsUses &RegDU, InspectMemInstr &IM, - IterTy &Filler, Iter Slot) const; + RegDefsUses &RegDU, InspectMemInstr &IM, Iter Slot, + IterTy &Filler) const; /// This function searches in the backward direction for an instruction that /// can be moved to the delay slot. Returns true on success. 
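Note on the PrintAsmMemoryOperand rewrite in the MipsAsmPrinter.cpp hunk above: it now reads a real base-register/offset operand pair (asserting on their kinds) and folds the `D` modifier, which selects the second word of a doubleword memory operand, into the existing offset instead of overwriting it. A rough sketch of the resulting printing logic, with made-up values for illustration:

    #include <iostream>

    // Simplified model of the printed form "<offset>($<base>)"; the 'D'
    // inline-asm modifier advances the offset by 4 bytes to reach the
    // second 32-bit word of a doubleword in memory.
    static void printMemOperand(std::ostream &OS, const char *BaseReg,
                                int Offset, bool ModifierD) {
      if (ModifierD)
        Offset += 4;
      OS << Offset << "($" << BaseReg << ")";
    }

    int main() {
      printMemOperand(std::cout, "sp", 16, false); // prints "16($sp)"
      std::cout << '\n';
      printMemOperand(std::cout, "sp", 16, true);  // prints "20($sp)"
      std::cout << '\n';
      return 0;
    }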
@@ -320,7 +321,8 @@ void RegDefsUses::setCallerSaved(const MachineInstr &MI) { CallerSavedRegs.reset(Mips::ZERO); CallerSavedRegs.reset(Mips::ZERO_64); - for (const MCPhysReg *R = TRI.getCalleeSavedRegs(); *R; ++R) + for (const MCPhysReg *R = TRI.getCalleeSavedRegs(MI.getParent()->getParent()); + *R; ++R) for (MCRegAliasIterator AI(*R, &TRI, true); AI.isValid(); ++AI) CallerSavedRegs.reset(*AI); @@ -427,9 +429,9 @@ bool LoadFromStackOrConst::hasHazard_(const MachineInstr &MI) { return true; } -MemDefsUses::MemDefsUses(const MachineFrameInfo *MFI_) - : InspectMemInstr(false), MFI(MFI_), SeenNoObjLoad(false), - SeenNoObjStore(false) {} +MemDefsUses::MemDefsUses(const DataLayout &DL, const MachineFrameInfo *MFI_) + : InspectMemInstr(false), MFI(MFI_), DL(DL), SeenNoObjLoad(false), + SeenNoObjStore(false) {} bool MemDefsUses::hasHazard_(const MachineInstr &MI) { bool HasHazard = false; @@ -482,7 +484,7 @@ getUnderlyingObjects(const MachineInstr &MI, const Value *V = (*MI.memoperands_begin())->getValue(); SmallVector<Value *, 4> Objs; - GetUnderlyingObjects(const_cast<Value *>(V), Objs); + GetUnderlyingObjects(const_cast<Value *>(V), Objs, DL); for (SmallVectorImpl<Value *>::iterator I = Objs.begin(), E = Objs.end(); I != E; ++I) { @@ -639,8 +641,8 @@ FunctionPass *llvm::createMipsDelaySlotFillerPass(MipsTargetMachine &tm) { template<typename IterTy> bool Filler::searchRange(MachineBasicBlock &MBB, IterTy Begin, IterTy End, - RegDefsUses &RegDU, InspectMemInstr& IM, - IterTy &Filler, Iter Slot) const { + RegDefsUses &RegDU, InspectMemInstr& IM, Iter Slot, + IterTy &Filler) const { for (IterTy I = Begin; I != End; ++I) { // skip debug value if (I->isDebugValue()) @@ -688,13 +690,13 @@ bool Filler::searchBackward(MachineBasicBlock &MBB, Iter Slot) const { return false; RegDefsUses RegDU(*MBB.getParent()->getSubtarget().getRegisterInfo()); - MemDefsUses MemDU(MBB.getParent()->getFrameInfo()); + MemDefsUses MemDU(*TM.getDataLayout(), MBB.getParent()->getFrameInfo()); ReverseIter Filler; RegDU.init(*Slot); - if (!searchRange(MBB, ReverseIter(Slot), MBB.rend(), RegDU, MemDU, Filler, - Slot)) + if (!searchRange(MBB, ReverseIter(Slot), MBB.rend(), RegDU, MemDU, Slot, + Filler)) return false; MBB.splice(std::next(Slot), &MBB, std::next(Filler).base()); @@ -714,7 +716,7 @@ bool Filler::searchForward(MachineBasicBlock &MBB, Iter Slot) const { RegDU.setCallerSaved(*Slot); - if (!searchRange(MBB, std::next(Slot), MBB.end(), RegDU, NM, Filler, Slot)) + if (!searchRange(MBB, std::next(Slot), MBB.end(), RegDU, NM, Slot, Filler)) return false; MBB.splice(std::next(Slot), &MBB, Filler); @@ -754,11 +756,11 @@ bool Filler::searchSuccBBs(MachineBasicBlock &MBB, Iter Slot) const { IM.reset(new LoadFromStackOrConst()); } else { const MachineFrameInfo *MFI = MBB.getParent()->getFrameInfo(); - IM.reset(new MemDefsUses(MFI)); + IM.reset(new MemDefsUses(*TM.getDataLayout(), MFI)); } - if (!searchRange(MBB, SuccBB->begin(), SuccBB->end(), RegDU, *IM, Filler, - Slot)) + if (!searchRange(MBB, SuccBB->begin(), SuccBB->end(), RegDU, *IM, Slot, + Filler)) return false; insertDelayFiller(Filler, BrMap); diff --git a/lib/Target/Mips/MipsFastISel.cpp b/lib/Target/Mips/MipsFastISel.cpp index 7d69659..7de0081 100644 --- a/lib/Target/Mips/MipsFastISel.cpp +++ b/lib/Target/Mips/MipsFastISel.cpp @@ -89,6 +89,7 @@ class MipsFastISel final : public FastISel { private: // Selection routines. 
+ bool selectLogicalOp(const Instruction *I); bool selectLoad(const Instruction *I); bool selectStore(const Instruction *I); bool selectBranch(const Instruction *I); @@ -102,6 +103,7 @@ private: // Utility helper routines. bool isTypeLegal(Type *Ty, MVT &VT); + bool isTypeSupported(Type *Ty, MVT &VT); bool isLoadTypeLegal(Type *Ty, MVT &VT); bool computeAddress(const Value *Obj, Address &Addr); bool computeCallAddress(const Value *V, Address &Addr); @@ -129,6 +131,9 @@ private: unsigned getRegEnsuringSimpleIntegerWidening(const Value *, bool IsUnsigned); + unsigned emitLogicalOp(unsigned ISDOpc, MVT RetVT, const Value *LHS, + const Value *RHS); + unsigned materializeFP(const ConstantFP *CFP, MVT VT); unsigned materializeGV(const GlobalValue *GV, MVT VT); unsigned materializeInt(const Constant *C, MVT VT); @@ -210,6 +215,43 @@ CCAssignFn *MipsFastISel::CCAssignFnForCall(CallingConv::ID CC) const { return CC_MipsO32; } +unsigned MipsFastISel::emitLogicalOp(unsigned ISDOpc, MVT RetVT, + const Value *LHS, const Value *RHS) { + // Canonicalize immediates to the RHS first. + if (isa<ConstantInt>(LHS) && !isa<ConstantInt>(RHS)) + std::swap(LHS, RHS); + + unsigned Opc; + if (ISDOpc == ISD::AND) { + Opc = Mips::AND; + } else if (ISDOpc == ISD::OR) { + Opc = Mips::OR; + } else if (ISDOpc == ISD::XOR) { + Opc = Mips::XOR; + } else + llvm_unreachable("unexpected opcode"); + + unsigned LHSReg = getRegForValue(LHS); + unsigned ResultReg = createResultReg(&Mips::GPR32RegClass); + if (!ResultReg) + return 0; + + unsigned RHSReg; + if (!LHSReg) + return 0; + + if (const auto *C = dyn_cast<ConstantInt>(RHS)) + RHSReg = materializeInt(C, MVT::i32); + else + RHSReg = getRegForValue(RHS); + + if (!RHSReg) + return 0; + + emitInst(Opc, ResultReg).addReg(LHSReg).addReg(RHSReg); + return ResultReg; +} + unsigned MipsFastISel::materializeInt(const Constant *C, MVT VT) { if (VT != MVT::i32 && VT != MVT::i16 && VT != MVT::i8 && VT != MVT::i1) return 0; @@ -421,6 +463,21 @@ bool MipsFastISel::isTypeLegal(Type *Ty, MVT &VT) { return TLI.isTypeLegal(VT); } +bool MipsFastISel::isTypeSupported(Type *Ty, MVT &VT) { + if (Ty->isVectorTy()) + return false; + + if (isTypeLegal(Ty, VT)) + return true; + + // If this is a type than can be sign or zero-extended to a basic operation + // go ahead and accept it now. + if (VT == MVT::i1 || VT == MVT::i8 || VT == MVT::i16) + return true; + + return false; +} + bool MipsFastISel::isLoadTypeLegal(Type *Ty, MVT &VT) { if (isTypeLegal(Ty, VT)) return true; @@ -671,6 +728,33 @@ bool MipsFastISel::emitStore(MVT VT, unsigned SrcReg, Address &Addr, return false; } +bool MipsFastISel::selectLogicalOp(const Instruction *I) { + MVT VT; + if (!isTypeSupported(I->getType(), VT)) + return false; + + unsigned ResultReg; + switch (I->getOpcode()) { + default: + llvm_unreachable("Unexpected instruction."); + case Instruction::And: + ResultReg = emitLogicalOp(ISD::AND, VT, I->getOperand(0), I->getOperand(1)); + break; + case Instruction::Or: + ResultReg = emitLogicalOp(ISD::OR, VT, I->getOperand(0), I->getOperand(1)); + break; + case Instruction::Xor: + ResultReg = emitLogicalOp(ISD::XOR, VT, I->getOperand(0), I->getOperand(1)); + break; + } + + if (!ResultReg) + return false; + + updateValueMap(I, ResultReg); + return true; +} + bool MipsFastISel::selectLoad(const Instruction *I) { // Atomic loads need special handling. if (cast<LoadInst>(I)->isAtomic()) @@ -1083,7 +1167,7 @@ bool MipsFastISel::fastLowerCall(CallLoweringInfo &CLI) { // Add a register mask with the call-preserved registers. 
// Proper defs for return values will be added by setPhysRegsDeadExcept(). - MIB.addRegMask(TRI.getCallPreservedMask(CC)); + MIB.addRegMask(TRI.getCallPreservedMask(*FuncInfo.MF, CC)); CLI.Call = MIB; @@ -1312,6 +1396,10 @@ bool MipsFastISel::fastSelectInstruction(const Instruction *I) { return selectLoad(I); case Instruction::Store: return selectStore(I); + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: + return selectLogicalOp(I); case Instruction::Br: return selectBranch(I); case Instruction::Ret: @@ -1354,7 +1442,7 @@ unsigned MipsFastISel::getRegEnsuringSimpleIntegerWidening(const Value *V, void MipsFastISel::simplifyAddress(Address &Addr) { if (!isInt<16>(Addr.getOffset())) { unsigned TempReg = - materialize32BitInt(Addr.getOffset(), &Mips::GPR32RegClass); + materialize32BitInt(Addr.getOffset(), &Mips::GPR32RegClass); unsigned DestReg = createResultReg(&Mips::GPR32RegClass); emitInst(Mips::ADDu, DestReg).addReg(TempReg).addReg(Addr.getReg()); Addr.setReg(DestReg); diff --git a/lib/Target/Mips/MipsISelDAGToDAG.cpp b/lib/Target/Mips/MipsISelDAGToDAG.cpp index 21fc8ce..c78c329 100644 --- a/lib/Target/Mips/MipsISelDAGToDAG.cpp +++ b/lib/Target/Mips/MipsISelDAGToDAG.cpp @@ -230,9 +230,18 @@ SDNode* MipsDAGToDAGISel::Select(SDNode *Node) { } bool MipsDAGToDAGISel:: -SelectInlineAsmMemoryOperand(const SDValue &Op, char ConstraintCode, +SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintID, std::vector<SDValue> &OutOps) { - assert(ConstraintCode == 'm' && "unexpected asm memory constraint"); - OutOps.push_back(Op); - return false; + // All memory constraints can at least accept raw pointers. + switch(ConstraintID) { + default: + llvm_unreachable("Unexpected asm memory constraint"); + case InlineAsm::Constraint_i: + case InlineAsm::Constraint_m: + case InlineAsm::Constraint_R: + case InlineAsm::Constraint_ZC: + OutOps.push_back(Op); + return false; + } + return true; } diff --git a/lib/Target/Mips/MipsISelDAGToDAG.h b/lib/Target/Mips/MipsISelDAGToDAG.h index 6b72877..aec731e 100644 --- a/lib/Target/Mips/MipsISelDAGToDAG.h +++ b/lib/Target/Mips/MipsISelDAGToDAG.h @@ -125,7 +125,7 @@ private: virtual void processFunctionAfterISel(MachineFunction &MF) = 0; bool SelectInlineAsmMemoryOperand(const SDValue &Op, - char ConstraintCode, + unsigned ConstraintID, std::vector<SDValue> &OutOps) override; }; } diff --git a/lib/Target/Mips/MipsISelLowering.cpp b/lib/Target/Mips/MipsISelLowering.cpp index 9253b2e..e4bae03 100644 --- a/lib/Target/Mips/MipsISelLowering.cpp +++ b/lib/Target/Mips/MipsISelLowering.cpp @@ -617,6 +617,33 @@ static SDValue performSELECTCombine(SDNode *N, SelectionDAG &DAG, return SDValue(); } +static SDValue performCMovFPCombine(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const MipsSubtarget &Subtarget) { + if (DCI.isBeforeLegalizeOps()) + return SDValue(); + + SDValue ValueIfTrue = N->getOperand(0), ValueIfFalse = N->getOperand(2); + + ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(ValueIfFalse); + if (!FalseC || FalseC->getZExtValue()) + return SDValue(); + + // Since RHS (False) is 0, we swap the order of the True/False operands + // (obviously also inverting the condition) so that we can + // take advantage of conditional moves using the $0 register. + // Example: + // return (a != 0) ? x : 0; + // load $reg, x + // movz $reg, $0, a + unsigned Opc = (N->getOpcode() == MipsISD::CMovFP_T) ? 
MipsISD::CMovFP_F : + MipsISD::CMovFP_T; + + SDValue FCC = N->getOperand(1), Glue = N->getOperand(3); + return DAG.getNode(Opc, SDLoc(N), ValueIfFalse.getValueType(), + ValueIfFalse, FCC, ValueIfTrue, Glue); +} + static SDValue performANDCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const MipsSubtarget &Subtarget) { @@ -750,6 +777,9 @@ SDValue MipsTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) return performDivRemCombine(N, DAG, DCI, Subtarget); case ISD::SELECT: return performSELECTCombine(N, DAG, DCI, Subtarget); + case MipsISD::CMovFP_F: + case MipsISD::CMovFP_T: + return performCMovFPCombine(N, DAG, DCI, Subtarget); case ISD::AND: return performANDCombine(N, DAG, DCI, Subtarget); case ISD::OR: @@ -2451,7 +2481,8 @@ getOpndList(SmallVectorImpl<SDValue> &Ops, // Add a register mask operand representing the call-preserved registers. const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo(); - const uint32_t *Mask = TRI->getCallPreservedMask(CLI.CallConv); + const uint32_t *Mask = + TRI->getCallPreservedMask(CLI.DAG.getMachineFunction(), CLI.CallConv); assert(Mask && "Missing call preserved mask for calling convention"); if (Subtarget.inMips16HardFloat()) { if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(CLI.Callee)) { @@ -3001,6 +3032,15 @@ MipsTargetLowering::CanLowerReturn(CallingConv::ID CallConv, return CCInfo.CheckReturn(Outs, RetCC_Mips); } +bool +MipsTargetLowering::shouldSignExtendTypeInLibCall(EVT Type, bool IsSigned) const { + if (Subtarget.hasMips3() && Subtarget.abiUsesSoftFloat()) { + if (Type == MVT::i32) + return true; + } + return IsSigned; +} + SDValue MipsTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, @@ -3133,6 +3173,10 @@ getConstraintType(const std::string &Constraint) const return C_Memory; } } + + if (Constraint == "ZC") + return C_Memory; + return TargetLowering::getConstraintType(Constraint); } diff --git a/lib/Target/Mips/MipsISelLowering.h b/lib/Target/Mips/MipsISelLowering.h index 9f86a43..40b6661 100644 --- a/lib/Target/Mips/MipsISelLowering.h +++ b/lib/Target/Mips/MipsISelLowering.h @@ -475,6 +475,8 @@ namespace llvm { const SmallVectorImpl<SDValue> &OutVals, SDLoc dl, SelectionDAG &DAG) const override; + bool shouldSignExtendTypeInLibCall(EVT Type, bool IsSigned) const override; + // Inline asm support ConstraintType getConstraintType(const std::string &Constraint) const override; @@ -503,6 +505,15 @@ namespace llvm { std::vector<SDValue> &Ops, SelectionDAG &DAG) const override; + unsigned getInlineAsmMemConstraint( + const std::string &ConstraintCode) const override { + if (ConstraintCode == "R") + return InlineAsm::Constraint_R; + else if (ConstraintCode == "ZC") + return InlineAsm::Constraint_ZC; + return TargetLowering::getInlineAsmMemConstraint(ConstraintCode); + } + bool isLegalAddressingMode(const AddrMode &AM, Type *Ty) const override; bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override; diff --git a/lib/Target/Mips/MipsInstrInfo.h b/lib/Target/Mips/MipsInstrInfo.h index db149d4..7b2b289 100644 --- a/lib/Target/Mips/MipsInstrInfo.h +++ b/lib/Target/Mips/MipsInstrInfo.h @@ -29,7 +29,7 @@ #include "MipsGenInstrInfo.inc" namespace llvm { - +class MipsSubtarget; class MipsInstrInfo : public MipsGenInstrInfo { virtual void anchor(); protected: diff --git a/lib/Target/Mips/MipsInstrInfo.td b/lib/Target/Mips/MipsInstrInfo.td index 04a16b3..c937d2b 100644 --- a/lib/Target/Mips/MipsInstrInfo.td +++ b/lib/Target/Mips/MipsInstrInfo.td @@ 
-1000,7 +1000,7 @@ class ExtBase<string opstr, RegisterOperand RO, Operand PosOpnd, SDPatternOperator Op = null_frag>: InstSE<(outs RO:$rt), (ins RO:$rs, PosOpnd:$pos, size_ext:$size), !strconcat(opstr, " $rt, $rs, $pos, $size"), - [(set RO:$rt, (Op RO:$rs, imm:$pos, imm:$size))], NoItinerary, + [(set RO:$rt, (Op RO:$rs, imm:$pos, imm:$size))], II_EXT, FrmR, opstr>, ISA_MIPS32R2; class InsBase<string opstr, RegisterOperand RO, Operand PosOpnd, @@ -1008,7 +1008,7 @@ class InsBase<string opstr, RegisterOperand RO, Operand PosOpnd, InstSE<(outs RO:$rt), (ins RO:$rs, PosOpnd:$pos, size_ins:$size, RO:$src), !strconcat(opstr, " $rt, $rs, $pos, $size"), [(set RO:$rt, (Op RO:$rs, imm:$pos, imm:$size, RO:$src))], - NoItinerary, FrmR, opstr>, ISA_MIPS32R2 { + II_INS, FrmR, opstr>, ISA_MIPS32R2 { let Constraints = "$src = $rt"; } @@ -1140,12 +1140,13 @@ def XORi : MMRel, ArithLogicI<"xori", uimm16, GPR32Opnd, II_XORI, immZExt16, xor>, ADDI_FM<0xe>; def LUi : MMRel, LoadUpper<"lui", GPR32Opnd, uimm16>, LUI_FM; - +let AdditionalPredicates = [NotInMicroMips] in { /// Arithmetic Instructions (3-Operand, R-Type) def ADDu : MMRel, ArithLogicR<"addu", GPR32Opnd, 1, II_ADDU, add>, ADD_FM<0, 0x21>; def SUBu : MMRel, ArithLogicR<"subu", GPR32Opnd, 0, II_SUBU, sub>, ADD_FM<0, 0x23>; +} let Defs = [HI0, LO0] in def MUL : MMRel, ArithLogicR<"mul", GPR32Opnd, 1, II_MUL, mul>, ADD_FM<0x1c, 2>, ISA_MIPS32_NOT_32R6_64R6; @@ -1579,6 +1580,8 @@ def : MipsInstAlias<"sltu $rt, $rs, $imm", (SLTiu GPR32Opnd:$rt, GPR32Opnd:$rs, simm16:$imm), 0>; def : MipsInstAlias<"xor $rs, $rt, $imm", (XORi GPR32Opnd:$rs, GPR32Opnd:$rt, uimm16:$imm), 0>; +def : MipsInstAlias<"xor $rs, $imm", + (XORi GPR32Opnd:$rs, GPR32Opnd:$rs, uimm16:$imm), 0>; def : MipsInstAlias<"or $rs, $rt, $imm", (ORi GPR32Opnd:$rs, GPR32Opnd:$rt, uimm16:$imm), 0>; def : MipsInstAlias<"or $rs, $imm", @@ -1639,20 +1642,21 @@ def : MipsInstAlias<"sync", // Assembler Pseudo Instructions //===----------------------------------------------------------------------===// -class LoadImm32<string instr_asm, Operand Od, RegisterOperand RO> : +class LoadImmediate32<string instr_asm, Operand Od, RegisterOperand RO> : MipsAsmPseudoInst<(outs RO:$rt), (ins Od:$imm32), !strconcat(instr_asm, "\t$rt, $imm32")> ; -def LoadImm32Reg : LoadImm32<"li", uimm5, GPR32Opnd>; +def LoadImm32 : LoadImmediate32<"li", uimm5, GPR32Opnd>; -class LoadAddress<string instr_asm, Operand MemOpnd, RegisterOperand RO> : +class LoadAddressFromReg32<string instr_asm, Operand MemOpnd, + RegisterOperand RO> : MipsAsmPseudoInst<(outs RO:$rt), (ins MemOpnd:$addr), !strconcat(instr_asm, "\t$rt, $addr")> ; -def LoadAddr32Reg : LoadAddress<"la", mem, GPR32Opnd>; +def LoadAddrReg32 : LoadAddressFromReg32<"la", mem, GPR32Opnd>; -class LoadAddressImm<string instr_asm, Operand Od, RegisterOperand RO> : +class LoadAddressFromImm32<string instr_asm, Operand Od, RegisterOperand RO> : MipsAsmPseudoInst<(outs RO:$rt), (ins Od:$imm32), !strconcat(instr_asm, "\t$rt, $imm32")> ; -def LoadAddr32Imm : LoadAddressImm<"la", uimm5, GPR32Opnd>; +def LoadAddrImm32 : LoadAddressFromImm32<"la", uimm5, GPR32Opnd>; def JalTwoReg : MipsAsmPseudoInst<(outs GPR32Opnd:$rd), (ins GPR32Opnd:$rs), "jal\t$rd, $rs"> ; @@ -1761,9 +1765,11 @@ def : WrapperPat<tblockaddress, ADDiu, GPR32>; def : WrapperPat<tjumptable, ADDiu, GPR32>; def : WrapperPat<tglobaltlsaddr, ADDiu, GPR32>; +let AdditionalPredicates = [NotInMicroMips] in { // Mips does not have "not", so we expand our way def : MipsPat<(not GPR32:$in), (NOR GPR32Opnd:$in, ZERO)>; +} // 
extended loads def : MipsPat<(i32 (extloadi1 addr:$src)), (LBu addr:$src)>; diff --git a/lib/Target/Mips/MipsMCInstLower.cpp b/lib/Target/Mips/MipsMCInstLower.cpp index 821392e..5258181 100644 --- a/lib/Target/Mips/MipsMCInstLower.cpp +++ b/lib/Target/Mips/MipsMCInstLower.cpp @@ -22,6 +22,7 @@ #include "llvm/MC/MCContext.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInst.h" +#include "llvm/MC/MCStreamer.h" using namespace llvm; diff --git a/lib/Target/Mips/MipsMachineFunction.cpp b/lib/Target/Mips/MipsMachineFunction.cpp index 30b93dc..09e722d 100644 --- a/lib/Target/Mips/MipsMachineFunction.cpp +++ b/lib/Target/Mips/MipsMachineFunction.cpp @@ -79,14 +79,19 @@ unsigned MipsFunctionInfo::getGlobalBaseReg() { if (GlobalBaseReg) return GlobalBaseReg; + MipsSubtarget const &STI = + static_cast<const MipsSubtarget &>(MF.getSubtarget()); + const TargetRegisterClass *RC = - static_cast<const MipsSubtarget &>(MF.getSubtarget()).inMips16Mode() + STI.inMips16Mode() ? &Mips::CPU16RegsRegClass - : static_cast<const MipsTargetMachine &>(MF.getTarget()) - .getABI() - .IsN64() - ? &Mips::GPR64RegClass - : &Mips::GPR32RegClass; + : STI.inMicroMipsMode() + ? &Mips::GPRMM16RegClass + : static_cast<const MipsTargetMachine &>(MF.getTarget()) + .getABI() + .IsN64() + ? &Mips::GPR64RegClass + : &Mips::GPR32RegClass; return GlobalBaseReg = MF.getRegInfo().createVirtualRegister(RC); } diff --git a/lib/Target/Mips/MipsModuleISelDAGToDAG.cpp b/lib/Target/Mips/MipsModuleISelDAGToDAG.cpp index b011e8f..b18a673 100644 --- a/lib/Target/Mips/MipsModuleISelDAGToDAG.cpp +++ b/lib/Target/Mips/MipsModuleISelDAGToDAG.cpp @@ -8,15 +8,36 @@ // //===----------------------------------------------------------------------===// -#include "MipsISelDAGToDAG.h" -#include "MipsModuleISelDAGToDAG.h" -#include "llvm/Support/Casting.h" +#include "Mips.h" +#include "MipsTargetMachine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" +using namespace llvm; + #define DEBUG_TYPE "mips-isel" -namespace llvm { +namespace { + class MipsModuleDAGToDAGISel : public MachineFunctionPass { + public: + static char ID; + + explicit MipsModuleDAGToDAGISel(MipsTargetMachine &TM_) + : MachineFunctionPass(ID), TM(TM_) {} + + // Pass Name + const char *getPassName() const override { + return "MIPS DAG->DAG Pattern Instruction Selection"; + } + + bool runOnMachineFunction(MachineFunction &MF) override; + + protected: + MipsTargetMachine &TM; + }; + + char MipsModuleDAGToDAGISel::ID = 0; +} bool MipsModuleDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) { DEBUG(errs() << "In MipsModuleDAGToDAGISel::runMachineFunction\n"); @@ -24,13 +45,6 @@ bool MipsModuleDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) { return false; } -char MipsModuleDAGToDAGISel::ID = 0; - -} - - -llvm::FunctionPass *llvm::createMipsModuleISelDag(MipsTargetMachine &TM) { +llvm::FunctionPass *llvm::createMipsModuleISelDagPass(MipsTargetMachine &TM) { return new MipsModuleDAGToDAGISel(TM); } - - diff --git a/lib/Target/Mips/MipsModuleISelDAGToDAG.h b/lib/Target/Mips/MipsModuleISelDAGToDAG.h deleted file mode 100644 index 85bae47..0000000 --- a/lib/Target/Mips/MipsModuleISelDAGToDAG.h +++ /dev/null @@ -1,58 +0,0 @@ -//===---- MipsModuleISelDAGToDAG.h - Change Subtarget --------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
-// -//===----------------------------------------------------------------------===// -// -// This file defines a pass used to change the subtarget for the -// Mips Instruction selector. -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIB_TARGET_MIPS_MIPSMODULEISELDAGTODAG_H -#define LLVM_LIB_TARGET_MIPS_MIPSMODULEISELDAGTODAG_H - -#include "Mips.h" -#include "MipsSubtarget.h" -#include "MipsTargetMachine.h" -#include "llvm/CodeGen/SelectionDAGISel.h" - - -//===----------------------------------------------------------------------===// -// Instruction Selector Implementation -//===----------------------------------------------------------------------===// - -//===----------------------------------------------------------------------===// -// MipsModuleDAGToDAGISel - MIPS specific code to select MIPS machine -// instructions for SelectionDAG operations. -//===----------------------------------------------------------------------===// -namespace llvm { - -class MipsModuleDAGToDAGISel : public MachineFunctionPass { -public: - - static char ID; - - explicit MipsModuleDAGToDAGISel(MipsTargetMachine &TM_) - : MachineFunctionPass(ID), TM(TM_) {} - - // Pass Name - const char *getPassName() const override { - return "MIPS DAG->DAG Pattern Instruction Selection"; - } - - bool runOnMachineFunction(MachineFunction &MF) override; - -protected: - MipsTargetMachine &TM; -}; - -/// createMipsISelDag - This pass converts a legalized DAG into a -/// MIPS-specific DAG, ready for instruction scheduling. -FunctionPass *createMipsModuleISelDag(MipsTargetMachine &TM); -} - -#endif diff --git a/lib/Target/Mips/MipsOs16.cpp b/lib/Target/Mips/MipsOs16.cpp index 7aae964..b6cd791 100644 --- a/lib/Target/Mips/MipsOs16.cpp +++ b/lib/Target/Mips/MipsOs16.cpp @@ -11,14 +11,16 @@ // //===----------------------------------------------------------------------===// -#include "MipsOs16.h" +#include "llvm/IR/Instructions.h" +#include "Mips.h" #include "llvm/IR/Module.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" -#define DEBUG_TYPE "mips-os16" +using namespace llvm; +#define DEBUG_TYPE "mips-os16" static cl::opt<std::string> Mips32FunctionMask( "mips32-function-mask", @@ -27,70 +29,83 @@ static cl::opt<std::string> Mips32FunctionMask( cl::Hidden); namespace { + class MipsOs16 : public ModulePass { + public: + static char ID; + + MipsOs16() : ModulePass(ID) {} + + const char *getPassName() const override { + return "MIPS Os16 Optimization"; + } + + bool runOnModule(Module &M) override; + }; + + char MipsOs16::ID = 0; +} - // Figure out if we need float point based on the function signature. - // We need to move variables in and/or out of floating point - // registers because of the ABI - // - bool needsFPFromSig(Function &F) { - Type* RetType = F.getReturnType(); - switch (RetType->getTypeID()) { +// Figure out if we need float point based on the function signature. 
+// We need to move variables in and/or out of floating point +// registers because of the ABI +// +static bool needsFPFromSig(Function &F) { + Type* RetType = F.getReturnType(); + switch (RetType->getTypeID()) { + case Type::FloatTyID: + case Type::DoubleTyID: + return true; + default: + ; + } + if (F.arg_size() >=1) { + Argument &Arg = F.getArgumentList().front(); + switch (Arg.getType()->getTypeID()) { case Type::FloatTyID: case Type::DoubleTyID: return true; default: ; } - if (F.arg_size() >=1) { - Argument &Arg = F.getArgumentList().front(); - switch (Arg.getType()->getTypeID()) { - case Type::FloatTyID: - case Type::DoubleTyID: - return true; - default: - ; - } - } - return false; } + return false; +} - // Figure out if the function will need floating point operations - // - bool needsFP(Function &F) { - if (needsFPFromSig(F)) - return true; - for (Function::const_iterator BB = F.begin(), E = F.end(); BB != E; ++BB) - for (BasicBlock::const_iterator I = BB->begin(), E = BB->end(); +// Figure out if the function will need floating point operations +// +static bool needsFP(Function &F) { + if (needsFPFromSig(F)) + return true; + for (Function::const_iterator BB = F.begin(), E = F.end(); BB != E; ++BB) + for (BasicBlock::const_iterator I = BB->begin(), E = BB->end(); I != E; ++I) { - const Instruction &Inst = *I; - switch (Inst.getOpcode()) { - case Instruction::FAdd: - case Instruction::FSub: - case Instruction::FMul: - case Instruction::FDiv: - case Instruction::FRem: - case Instruction::FPToUI: - case Instruction::FPToSI: - case Instruction::UIToFP: - case Instruction::SIToFP: - case Instruction::FPTrunc: - case Instruction::FPExt: - case Instruction::FCmp: + const Instruction &Inst = *I; + switch (Inst.getOpcode()) { + case Instruction::FAdd: + case Instruction::FSub: + case Instruction::FMul: + case Instruction::FDiv: + case Instruction::FRem: + case Instruction::FPToUI: + case Instruction::FPToSI: + case Instruction::UIToFP: + case Instruction::SIToFP: + case Instruction::FPTrunc: + case Instruction::FPExt: + case Instruction::FCmp: + return true; + default: + ; + } + if (const CallInst *CI = dyn_cast<CallInst>(I)) { + DEBUG(dbgs() << "Working on call" << "\n"); + Function &F_ = *CI->getCalledFunction(); + if (needsFPFromSig(F_)) return true; - default: - ; - } - if (const CallInst *CI = dyn_cast<CallInst>(I)) { - DEBUG(dbgs() << "Working on call" << "\n"); - Function &F_ = *CI->getCalledFunction(); - if (needsFPFromSig(F_)) - return true; - } } - return false; - } + } + return false; } -namespace llvm { bool MipsOs16::runOnModule(Module &M) { @@ -136,12 +151,6 @@ bool MipsOs16::runOnModule(Module &M) { return modified; } -char MipsOs16::ID = 0; - -} - -ModulePass *llvm::createMipsOs16(MipsTargetMachine &TM) { +ModulePass *llvm::createMipsOs16Pass(MipsTargetMachine &TM) { return new MipsOs16; } - - diff --git a/lib/Target/Mips/MipsOs16.h b/lib/Target/Mips/MipsOs16.h deleted file mode 100644 index 77183ec..0000000 --- a/lib/Target/Mips/MipsOs16.h +++ /dev/null @@ -1,47 +0,0 @@ -//===---- MipsOs16.h for Mips Option -Os16 --------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file defines an optimization phase for the MIPS target. 
-// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIB_TARGET_MIPS_MIPSOS16_H -#define LLVM_LIB_TARGET_MIPS_MIPSOS16_H - -#include "MCTargetDesc/MipsMCTargetDesc.h" -#include "MipsTargetMachine.h" -#include "llvm/Pass.h" -#include "llvm/Target/TargetMachine.h" - -using namespace llvm; - -namespace llvm { - -class MipsOs16 : public ModulePass { - -public: - static char ID; - - MipsOs16() : ModulePass(ID) { - - } - - const char *getPassName() const override { - return "MIPS Os16 Optimization"; - } - - bool runOnModule(Module &M) override; - -}; - -ModulePass *createMipsOs16(MipsTargetMachine &TM); - -} - -#endif diff --git a/lib/Target/Mips/MipsRegisterInfo.cpp b/lib/Target/Mips/MipsRegisterInfo.cpp index 2110c03..0ea48b1 100644 --- a/lib/Target/Mips/MipsRegisterInfo.cpp +++ b/lib/Target/Mips/MipsRegisterInfo.cpp @@ -43,14 +43,14 @@ using namespace llvm; #define GET_REGINFO_TARGET_DESC #include "MipsGenRegisterInfo.inc" -MipsRegisterInfo::MipsRegisterInfo(const MipsSubtarget &ST) - : MipsGenRegisterInfo(Mips::RA), Subtarget(ST) {} +MipsRegisterInfo::MipsRegisterInfo() : MipsGenRegisterInfo(Mips::RA) {} unsigned MipsRegisterInfo::getPICCallReg() { return Mips::T9; } const TargetRegisterClass * MipsRegisterInfo::getPointerRegClass(const MachineFunction &MF, unsigned Kind) const { + const MipsSubtarget &Subtarget = MF.getSubtarget<MipsSubtarget>(); return Subtarget.isABI_N64() ? &Mips::GPR64RegClass : &Mips::GPR32RegClass; } @@ -63,7 +63,7 @@ MipsRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, case Mips::GPR32RegClassID: case Mips::GPR64RegClassID: case Mips::DSPRRegClassID: { - const TargetFrameLowering *TFI = Subtarget.getFrameLowering(); + const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering(); return 28 - TFI->hasFP(MF); } case Mips::FGR32RegClassID: @@ -82,6 +82,7 @@ MipsRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, /// Mips Callee Saved Registers const MCPhysReg * MipsRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { + const MipsSubtarget &Subtarget = MF->getSubtarget<MipsSubtarget>(); if (Subtarget.isSingleFloat()) return CSR_SingleFloatOnly_SaveList; @@ -100,8 +101,10 @@ MipsRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { return CSR_O32_SaveList; } -const uint32_t* -MipsRegisterInfo::getCallPreservedMask(CallingConv::ID) const { +const uint32_t * +MipsRegisterInfo::getCallPreservedMask(const MachineFunction &MF, + CallingConv::ID) const { + const MipsSubtarget &Subtarget = MF.getSubtarget<MipsSubtarget>(); if (Subtarget.isSingleFloat()) return CSR_SingleFloatOnly_RegMask; @@ -135,6 +138,7 @@ getReservedRegs(const MachineFunction &MF) const { }; BitVector Reserved(getNumRegs()); + const MipsSubtarget &Subtarget = MF.getSubtarget<MipsSubtarget>(); typedef TargetRegisterClass::const_iterator RegIter; for (unsigned I = 0; I < array_lengthof(ReservedGPR32); ++I) @@ -257,6 +261,7 @@ eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj, unsigned MipsRegisterInfo:: getFrameRegister(const MachineFunction &MF) const { + const MipsSubtarget &Subtarget = MF.getSubtarget<MipsSubtarget>(); const TargetFrameLowering *TFI = Subtarget.getFrameLowering(); bool IsN64 = static_cast<const MipsTargetMachine &>(MF.getTarget()).getABI().IsN64(); diff --git a/lib/Target/Mips/MipsRegisterInfo.h b/lib/Target/Mips/MipsRegisterInfo.h index 9ec4a38..031b93e 100644 --- a/lib/Target/Mips/MipsRegisterInfo.h +++ b/lib/Target/Mips/MipsRegisterInfo.h @@ -21,15 +21,9 @@ 
#include "MipsGenRegisterInfo.inc" namespace llvm { -class MipsSubtarget; -class Type; - class MipsRegisterInfo : public MipsGenRegisterInfo { -protected: - const MipsSubtarget &Subtarget; - public: - MipsRegisterInfo(const MipsSubtarget &Subtarget); + MipsRegisterInfo(); /// getRegisterNumbering - Given the enum value for some register, e.g. /// Mips::RA, return the number that it corresponds to (e.g. 31). @@ -47,9 +41,9 @@ public: unsigned getRegPressureLimit(const TargetRegisterClass *RC, MachineFunction &MF) const override; - const MCPhysReg * - getCalleeSavedRegs(const MachineFunction *MF = nullptr) const override; - const uint32_t *getCallPreservedMask(CallingConv::ID) const override; + const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF) const override; + const uint32_t *getCallPreservedMask(const MachineFunction &MF, + CallingConv::ID) const override; static const uint32_t *getMips16RetHelperMask(); BitVector getReservedRegs(const MachineFunction &MF) const override; diff --git a/lib/Target/Mips/MipsSEISelDAGToDAG.cpp b/lib/Target/Mips/MipsSEISelDAGToDAG.cpp index 0761ded..a598c3f 100644 --- a/lib/Target/Mips/MipsSEISelDAGToDAG.cpp +++ b/lib/Target/Mips/MipsSEISelDAGToDAG.cpp @@ -258,8 +258,12 @@ SDNode *MipsSEDAGToDAGISel::selectAddESubE(unsigned MOp, SDValue InFlag, CurDAG->getTargetConstant(Mips::sub_32, VT)); } - SDNode *AddCarry = CurDAG->getMachineNode(ADDuOp, DL, VT, - SDValue(Carry, 0), RHS); + // Generate a second addition only if we know that RHS is not a + // constant-zero node. + SDNode *AddCarry = Carry; + ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS); + if (!C || C->getZExtValue()) + AddCarry = CurDAG->getMachineNode(ADDuOp, DL, VT, SDValue(Carry, 0), RHS); return CurDAG->SelectNodeTo(Node, MOp, VT, MVT::Glue, LHS, SDValue(AddCarry, 0)); @@ -378,6 +382,17 @@ bool MipsSEDAGToDAGISel::selectIntAddr(SDValue Addr, SDValue &Base, selectAddrDefault(Addr, Base, Offset); } +bool MipsSEDAGToDAGISel::selectAddrRegImm9(SDValue Addr, SDValue &Base, + SDValue &Offset) const { + if (selectAddrFrameIndex(Addr, Base, Offset)) + return true; + + if (selectAddrFrameIndexOffset(Addr, Base, Offset, 9)) + return true; + + return false; +} + bool MipsSEDAGToDAGISel::selectAddrRegImm10(SDValue Addr, SDValue &Base, SDValue &Offset) const { if (selectAddrFrameIndex(Addr, Base, Offset)) @@ -401,6 +416,17 @@ bool MipsSEDAGToDAGISel::selectAddrRegImm12(SDValue Addr, SDValue &Base, return false; } +bool MipsSEDAGToDAGISel::selectAddrRegImm16(SDValue Addr, SDValue &Base, + SDValue &Offset) const { + if (selectAddrFrameIndex(Addr, Base, Offset)) + return true; + + if (selectAddrFrameIndexOffset(Addr, Base, Offset, 16)) + return true; + + return false; +} + bool MipsSEDAGToDAGISel::selectIntAddrMM(SDValue Addr, SDValue &Base, SDValue &Offset) const { return selectAddrRegImm12(Addr, Base, Offset) || @@ -912,6 +938,60 @@ std::pair<bool, SDNode*> MipsSEDAGToDAGISel::selectNode(SDNode *Node) { return std::make_pair(false, nullptr); } +bool MipsSEDAGToDAGISel:: +SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintID, + std::vector<SDValue> &OutOps) { + SDValue Base, Offset; + + switch(ConstraintID) { + default: + llvm_unreachable("Unexpected asm memory constraint"); + // All memory constraints can at least accept raw pointers. 
+ case InlineAsm::Constraint_i: + case InlineAsm::Constraint_R: + OutOps.push_back(Op); + OutOps.push_back(CurDAG->getTargetConstant(0, MVT::i32)); + return false; + case InlineAsm::Constraint_m: + if (selectAddrRegImm16(Op, Base, Offset)) { + OutOps.push_back(Base); + OutOps.push_back(Offset); + return false; + } + OutOps.push_back(Op); + OutOps.push_back(CurDAG->getTargetConstant(0, MVT::i32)); + return false; + case InlineAsm::Constraint_ZC: + // ZC matches whatever the pref, ll, and sc instructions can handle for the + // given subtarget. + if (Subtarget->inMicroMipsMode()) { + // On microMIPS, they can handle 12-bit offsets. + if (selectAddrRegImm12(Op, Base, Offset)) { + OutOps.push_back(Base); + OutOps.push_back(Offset); + return false; + } + } else if (Subtarget->hasMips32r6()) { + // On MIPS32r6/MIPS64r6, they can only handle 9-bit offsets. + if (selectAddrRegImm9(Op, Base, Offset)) { + OutOps.push_back(Base); + OutOps.push_back(Offset); + return false; + } + } else if (selectAddrRegImm16(Op, Base, Offset)) { + // Prior to MIPS32r6/MIPS64r6, they can handle 16-bit offsets. + OutOps.push_back(Base); + OutOps.push_back(Offset); + return false; + } + // In all cases, 0-bit offsets are acceptable. + OutOps.push_back(Op); + OutOps.push_back(CurDAG->getTargetConstant(0, MVT::i32)); + return false; + } + return true; +} + FunctionPass *llvm::createMipsSEISelDag(MipsTargetMachine &TM) { return new MipsSEDAGToDAGISel(TM); } diff --git a/lib/Target/Mips/MipsSEISelDAGToDAG.h b/lib/Target/Mips/MipsSEISelDAGToDAG.h index 2d24eb4..a11fcf4 100644 --- a/lib/Target/Mips/MipsSEISelDAGToDAG.h +++ b/lib/Target/Mips/MipsSEISelDAGToDAG.h @@ -56,12 +56,18 @@ private: bool selectIntAddr(SDValue Addr, SDValue &Base, SDValue &Offset) const override; + bool selectAddrRegImm9(SDValue Addr, SDValue &Base, + SDValue &Offset) const; + bool selectAddrRegImm10(SDValue Addr, SDValue &Base, SDValue &Offset) const; bool selectAddrRegImm12(SDValue Addr, SDValue &Base, SDValue &Offset) const; + bool selectAddrRegImm16(SDValue Addr, SDValue &Base, + SDValue &Offset) const; + bool selectIntAddrMM(SDValue Addr, SDValue &Base, SDValue &Offset) const override; @@ -111,6 +117,10 @@ private: // Insert instructions to initialize the global base register in the // first MBB of the function. void initGlobalBaseReg(MachineFunction &MF); + + bool SelectInlineAsmMemoryOperand(const SDValue &Op, + unsigned ConstraintID, + std::vector<SDValue> &OutOps) override; }; FunctionPass *createMipsSEISelDag(MipsTargetMachine &TM); diff --git a/lib/Target/Mips/MipsSEInstrInfo.cpp b/lib/Target/Mips/MipsSEInstrInfo.cpp index 74f291f..b992579 100644 --- a/lib/Target/Mips/MipsSEInstrInfo.cpp +++ b/lib/Target/Mips/MipsSEInstrInfo.cpp @@ -27,7 +27,7 @@ using namespace llvm; MipsSEInstrInfo::MipsSEInstrInfo(const MipsSubtarget &STI) : MipsInstrInfo(STI, STI.getRelocationModel() == Reloc::PIC_ ? 
Mips::B : Mips::J), - RI(STI) {} + RI() {} const MipsRegisterInfo &MipsSEInstrInfo::getRegisterInfo() const { return RI; diff --git a/lib/Target/Mips/MipsSERegisterInfo.cpp b/lib/Target/Mips/MipsSERegisterInfo.cpp index 55c6638..b89207e 100644 --- a/lib/Target/Mips/MipsSERegisterInfo.cpp +++ b/lib/Target/Mips/MipsSERegisterInfo.cpp @@ -18,6 +18,7 @@ #include "MipsMachineFunction.h" #include "MipsSEInstrInfo.h" #include "MipsSubtarget.h" +#include "MipsTargetMachine.h" #include "llvm/ADT/BitVector.h" #include "llvm/ADT/STLExtras.h" #include "llvm/CodeGen/MachineFrameInfo.h" @@ -41,8 +42,7 @@ using namespace llvm; #define DEBUG_TYPE "mips-reg-info" -MipsSERegisterInfo::MipsSERegisterInfo(const MipsSubtarget &ST) - : MipsRegisterInfo(ST) {} +MipsSERegisterInfo::MipsSERegisterInfo() : MipsRegisterInfo() {} bool MipsSERegisterInfo:: requiresRegisterScavenging(const MachineFunction &MF) const { @@ -110,6 +110,8 @@ void MipsSERegisterInfo::eliminateFI(MachineBasicBlock::iterator II, MachineFunction &MF = *MI.getParent()->getParent(); MachineFrameInfo *MFI = MF.getFrameInfo(); MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>(); + bool isN64 = + static_cast<const MipsTargetMachine &>(MF.getTarget()).getABI().IsN64(); const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo(); int MinCSFI = 0; @@ -132,7 +134,7 @@ void MipsSERegisterInfo::eliminateFI(MachineBasicBlock::iterator II, unsigned FrameReg; if ((FrameIndex >= MinCSFI && FrameIndex <= MaxCSFI) || EhDataRegFI) - FrameReg = Subtarget.isABI_N64() ? Mips::SP_64 : Mips::SP; + FrameReg = isN64 ? Mips::SP_64 : Mips::SP; else FrameReg = getFrameRegister(MF); @@ -165,9 +167,9 @@ void MipsSERegisterInfo::eliminateFI(MachineBasicBlock::iterator II, // (where n < 16) and doesn't, but does fit into 16-bits then use an ADDiu MachineBasicBlock &MBB = *MI.getParent(); DebugLoc DL = II->getDebugLoc(); - unsigned ADDiu = Subtarget.isABI_N64() ? Mips::DADDiu : Mips::ADDiu; + unsigned ADDiu = isN64 ? Mips::DADDiu : Mips::ADDiu; const TargetRegisterClass *RC = - Subtarget.isABI_N64() ? &Mips::GPR64RegClass : &Mips::GPR32RegClass; + isN64 ? &Mips::GPR64RegClass : &Mips::GPR32RegClass; MachineRegisterInfo &RegInfo = MBB.getParent()->getRegInfo(); unsigned Reg = RegInfo.createVirtualRegister(RC); const MipsSEInstrInfo &TII = @@ -183,7 +185,7 @@ void MipsSERegisterInfo::eliminateFI(MachineBasicBlock::iterator II, // instructions. MachineBasicBlock &MBB = *MI.getParent(); DebugLoc DL = II->getDebugLoc(); - unsigned ADDu = Subtarget.isABI_N64() ? Mips::DADDu : Mips::ADDu; + unsigned ADDu = isN64 ? 
Mips::DADDu : Mips::ADDu; unsigned NewImm = 0; const MipsSEInstrInfo &TII = *static_cast<const MipsSEInstrInfo *>( diff --git a/lib/Target/Mips/MipsSERegisterInfo.h b/lib/Target/Mips/MipsSERegisterInfo.h index 6b70d07..ebae190 100644 --- a/lib/Target/Mips/MipsSERegisterInfo.h +++ b/lib/Target/Mips/MipsSERegisterInfo.h @@ -22,7 +22,7 @@ class MipsSEInstrInfo; class MipsSERegisterInfo : public MipsRegisterInfo { public: - MipsSERegisterInfo(const MipsSubtarget &Subtarget); + MipsSERegisterInfo(); bool requiresRegisterScavenging(const MachineFunction &MF) const override; diff --git a/lib/Target/Mips/MipsSchedule.td b/lib/Target/Mips/MipsSchedule.td index ea98199..54b5d28 100644 --- a/lib/Target/Mips/MipsSchedule.td +++ b/lib/Target/Mips/MipsSchedule.td @@ -65,7 +65,9 @@ def II_DSRL32 : InstrItinClass; def II_DSRLV : InstrItinClass; def II_DSUBU : InstrItinClass; def II_DSUB : InstrItinClass; +def II_EXT : InstrItinClass; // Any EXT instruction def II_FLOOR : InstrItinClass; +def II_INS : InstrItinClass; // Any INS instruction def II_LB : InstrItinClass; def II_LBU : InstrItinClass; def II_LD : InstrItinClass; @@ -198,6 +200,8 @@ def MipsGenericItineraries : ProcessorItineraries<[ALU, IMULDIV], [], [ InstrItinData<II_DSUB , [InstrStage<1, [ALU]>]>, InstrItinData<II_DROTR , [InstrStage<1, [ALU]>]>, InstrItinData<II_DROTRV , [InstrStage<1, [ALU]>]>, + InstrItinData<II_EXT , [InstrStage<1, [ALU]>]>, + InstrItinData<II_INS , [InstrStage<1, [ALU]>]>, InstrItinData<II_LUI , [InstrStage<1, [ALU]>]>, InstrItinData<II_MOVF , [InstrStage<1, [ALU]>]>, InstrItinData<II_MOVN , [InstrStage<1, [ALU]>]>, diff --git a/lib/Target/Mips/MipsTargetMachine.cpp b/lib/Target/Mips/MipsTargetMachine.cpp index 86c8931..79f6617 100644 --- a/lib/Target/Mips/MipsTargetMachine.cpp +++ b/lib/Target/Mips/MipsTargetMachine.cpp @@ -14,14 +14,11 @@ #include "MipsTargetMachine.h" #include "Mips.h" #include "Mips16FrameLowering.h" -#include "Mips16HardFloat.h" #include "Mips16ISelDAGToDAG.h" #include "Mips16ISelLowering.h" #include "Mips16InstrInfo.h" #include "MipsFrameLowering.h" #include "MipsInstrInfo.h" -#include "MipsModuleISelDAGToDAG.h" -#include "MipsOs16.h" #include "MipsSEFrameLowering.h" #include "MipsSEISelDAGToDAG.h" #include "MipsSEISelLowering.h" @@ -34,6 +31,7 @@ #include "llvm/Support/TargetRegistry.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" + using namespace llvm; #define DEBUG_TYPE "mips" @@ -46,8 +44,12 @@ extern "C" void LLVMInitializeMipsTarget() { RegisterTargetMachine<MipselTargetMachine> B(TheMips64elTarget); } -static std::string computeDataLayout(bool isLittle, MipsABIInfo &ABI) { +static std::string computeDataLayout(StringRef TT, StringRef CPU, + const TargetOptions &Options, + bool isLittle) { std::string Ret = ""; + MipsABIInfo ABI = + MipsABIInfo::computeTargetABI(Triple(TT), CPU, Options.MCOptions); // There are both little and big endian mips. 
if (isLittle) @@ -86,11 +88,11 @@ MipsTargetMachine::MipsTargetMachine(const Target &T, StringRef TT, const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL, bool isLittle) - : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL), + : LLVMTargetMachine(T, computeDataLayout(TT, CPU, Options, isLittle), TT, + CPU, FS, Options, RM, CM, OL), isLittle(isLittle), TLOF(make_unique<MipsTargetObjectFile>()), ABI(MipsABIInfo::computeTargetABI(Triple(TT), CPU, Options.MCOptions)), - DL(computeDataLayout(isLittle, ABI)), Subtarget(nullptr), - DefaultSubtarget(TT, CPU, FS, isLittle, *this), + Subtarget(nullptr), DefaultSubtarget(TT, CPU, FS, isLittle, *this), NoMips16Subtarget(TT, CPU, FS.empty() ? "-mips16" : FS.str() + ",-mips16", isLittle, *this), Mips16Subtarget(TT, CPU, FS.empty() ? "+mips16" : FS.str() + ",+mips16", @@ -209,14 +211,14 @@ void MipsPassConfig::addIRPasses() { TargetPassConfig::addIRPasses(); addPass(createAtomicExpandPass(&getMipsTargetMachine())); if (getMipsSubtarget().os16()) - addPass(createMipsOs16(getMipsTargetMachine())); + addPass(createMipsOs16Pass(getMipsTargetMachine())); if (getMipsSubtarget().inMips16HardFloat()) - addPass(createMips16HardFloat(getMipsTargetMachine())); + addPass(createMips16HardFloatPass(getMipsTargetMachine())); } // Install an instruction selector pass using // the ISelDag to gen Mips code. bool MipsPassConfig::addInstSelector() { - addPass(createMipsModuleISelDag(getMipsTargetMachine())); + addPass(createMipsModuleISelDagPass(getMipsTargetMachine())); addPass(createMips16ISelDag(getMipsTargetMachine())); addPass(createMipsSEISelDag(getMipsTargetMachine())); return false; diff --git a/lib/Target/Mips/MipsTargetMachine.h b/lib/Target/Mips/MipsTargetMachine.h index afd0cea..5427d6a 100644 --- a/lib/Target/Mips/MipsTargetMachine.h +++ b/lib/Target/Mips/MipsTargetMachine.h @@ -31,7 +31,6 @@ class MipsTargetMachine : public LLVMTargetMachine { std::unique_ptr<TargetLoweringObjectFile> TLOF; // Selected ABI MipsABIInfo ABI; - const DataLayout DL; // Calculates type size & alignment MipsSubtarget *Subtarget; MipsSubtarget DefaultSubtarget; MipsSubtarget NoMips16Subtarget; @@ -47,8 +46,7 @@ public: TargetIRAnalysis getTargetIRAnalysis() override; - const DataLayout *getDataLayout() const override { return &DL; } - const MipsSubtarget *getSubtargetImpl() const override { + const MipsSubtarget *getSubtargetImpl() const { if (Subtarget) return Subtarget; return &DefaultSubtarget; diff --git a/lib/Target/Mips/MipsTargetObjectFile.cpp b/lib/Target/Mips/MipsTargetObjectFile.cpp index c07693e..723b63b 100644 --- a/lib/Target/Mips/MipsTargetObjectFile.cpp +++ b/lib/Target/Mips/MipsTargetObjectFile.cpp @@ -9,6 +9,7 @@ #include "MipsTargetObjectFile.h" #include "MipsSubtarget.h" +#include "MipsTargetMachine.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/GlobalVariable.h" @@ -44,7 +45,7 @@ void MipsTargetObjectFile::Initialize(MCContext &Ctx, const TargetMachine &TM){ SmallBSSSection = getContext().getELFSection(".sbss", ELF::SHT_NOBITS, ELF::SHF_WRITE | ELF::SHF_ALLOC); - this->TM = &TM; + this->TM = &static_cast<const MipsTargetMachine &>(TM); } // A address must be loaded from a small section if its size is less than the @@ -84,7 +85,8 @@ IsGlobalInSmallSection(const GlobalValue *GV, const TargetMachine &TM, bool MipsTargetObjectFile:: IsGlobalInSmallSectionImpl(const GlobalValue *GV, const TargetMachine &TM) const { - const MipsSubtarget &Subtarget = TM.getSubtarget<MipsSubtarget>(); + 
const MipsSubtarget &Subtarget = + *static_cast<const MipsTargetMachine &>(TM).getSubtargetImpl(); // Return if small section is not available. if (!Subtarget.useSmallSection()) @@ -127,9 +129,11 @@ SelectSectionForGlobal(const GlobalValue *GV, SectionKind Kind, /// Return true if this constant should be placed into small data section. bool MipsTargetObjectFile:: IsConstantInSmallSection(const Constant *CN, const TargetMachine &TM) const { - return ( - TM.getSubtarget<MipsSubtarget>().useSmallSection() && LocalSData && - IsInSmallSection(TM.getDataLayout()->getTypeAllocSize(CN->getType()))); + return (static_cast<const MipsTargetMachine &>(TM) + .getSubtargetImpl() + ->useSmallSection() && + LocalSData && IsInSmallSection(TM.getDataLayout()->getTypeAllocSize( + CN->getType()))); } const MCSection *MipsTargetObjectFile:: diff --git a/lib/Target/Mips/MipsTargetObjectFile.h b/lib/Target/Mips/MipsTargetObjectFile.h index 3a2b298..45ed9d0 100644 --- a/lib/Target/Mips/MipsTargetObjectFile.h +++ b/lib/Target/Mips/MipsTargetObjectFile.h @@ -13,11 +13,11 @@ #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" namespace llvm { - +class MipsTargetMachine; class MipsTargetObjectFile : public TargetLoweringObjectFileELF { const MCSection *SmallDataSection; const MCSection *SmallBSSSection; - const TargetMachine *TM; + const MipsTargetMachine *TM; public: void Initialize(MCContext &Ctx, const TargetMachine &TM) override; diff --git a/lib/Target/Mips/MipsTargetStreamer.h b/lib/Target/Mips/MipsTargetStreamer.h index b3b8296..1ff041d 100644 --- a/lib/Target/Mips/MipsTargetStreamer.h +++ b/lib/Target/Mips/MipsTargetStreamer.h @@ -92,9 +92,9 @@ public: } virtual void emitDirectiveModuleOddSPReg(bool Enabled, bool IsO32ABI); - virtual void emitDirectiveSetFp(MipsABIFlagsSection::FpABIKind Value){}; - virtual void emitMipsAbiFlags(){}; + virtual void emitDirectiveSetFp(MipsABIFlagsSection::FpABIKind Value); void forbidModuleDirective() { ModuleDirectiveAllowed = false; } + void reallowModuleDirective() { ModuleDirectiveAllowed = true; } bool isModuleDirectiveAllowed() { return ModuleDirectiveAllowed; } // This method enables template classes to set internal abi flags @@ -197,7 +197,6 @@ public: bool Is32BitABI) override; void emitDirectiveModuleOddSPReg(bool Enabled, bool IsO32ABI) override; void emitDirectiveSetFp(MipsABIFlagsSection::FpABIKind Value) override; - void emitMipsAbiFlags() override; }; // This part is for ELF object output @@ -240,7 +239,7 @@ public: // ABI Flags void emitDirectiveModuleOddSPReg(bool Enabled, bool IsO32ABI) override; - void emitMipsAbiFlags() override; + void emitMipsAbiFlags(); }; } #endif diff --git a/lib/Target/NVPTX/CMakeLists.txt b/lib/Target/NVPTX/CMakeLists.txt index 3a4a19d..cdd2f1f 100644 --- a/lib/Target/NVPTX/CMakeLists.txt +++ b/lib/Target/NVPTX/CMakeLists.txt @@ -29,7 +29,6 @@ set(NVPTXCodeGen_sources NVPTXTargetMachine.cpp NVPTXTargetTransformInfo.cpp NVPTXUtilities.cpp - NVPTXutil.cpp NVVMReflect.cpp ) diff --git a/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp b/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp index 11d737e..b9df3d1 100644 --- a/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp +++ b/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp @@ -39,6 +39,8 @@ NVPTXMCAsmInfo::NVPTXMCAsmInfo(StringRef TT) { InlineAsmEnd = " inline asm"; SupportsDebugInformation = CompileForDebugging; + // PTX does not allow .align on functions. 
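The flag cleared on the next line is the generic opt-out for this: a target whose assembler rejects ".align" on function labels suppresses it at the MCAsmInfo level. As a standalone illustration (ExampleMCAsmInfo is invented for this sketch; the real change simply adds the same assignment to the NVPTX constructor):

    #include "llvm/MC/MCAsmInfo.h"

    // PTX-like assemblers reject ".align" on functions, so tell the
    // generic printer not to emit function alignment at all.
    struct ExampleMCAsmInfo : llvm::MCAsmInfo {
      ExampleMCAsmInfo() { HasFunctionAlignment = false; }
    };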
+ HasFunctionAlignment = false; HasDotTypeDotSizeDirective = false; Data8bitsDirective = " .b8 "; diff --git a/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.cpp b/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.cpp index 158ca90..2b4d864 100644 --- a/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.cpp +++ b/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.cpp @@ -71,35 +71,23 @@ static MCInstPrinter *createNVPTXMCInstPrinter(const Target &T, // Force static initialization. extern "C" void LLVMInitializeNVPTXTargetMC() { - // Register the MC asm info. - RegisterMCAsmInfo<NVPTXMCAsmInfo> X(TheNVPTXTarget32); - RegisterMCAsmInfo<NVPTXMCAsmInfo> Y(TheNVPTXTarget64); - - // Register the MC codegen info. - TargetRegistry::RegisterMCCodeGenInfo(TheNVPTXTarget32, - createNVPTXMCCodeGenInfo); - TargetRegistry::RegisterMCCodeGenInfo(TheNVPTXTarget64, - createNVPTXMCCodeGenInfo); - - // Register the MC instruction info. - TargetRegistry::RegisterMCInstrInfo(TheNVPTXTarget32, createNVPTXMCInstrInfo); - TargetRegistry::RegisterMCInstrInfo(TheNVPTXTarget64, createNVPTXMCInstrInfo); - - // Register the MC register info. - TargetRegistry::RegisterMCRegInfo(TheNVPTXTarget32, - createNVPTXMCRegisterInfo); - TargetRegistry::RegisterMCRegInfo(TheNVPTXTarget64, - createNVPTXMCRegisterInfo); - - // Register the MC subtarget info. - TargetRegistry::RegisterMCSubtargetInfo(TheNVPTXTarget32, - createNVPTXMCSubtargetInfo); - TargetRegistry::RegisterMCSubtargetInfo(TheNVPTXTarget64, - createNVPTXMCSubtargetInfo); - - // Register the MCInstPrinter. - TargetRegistry::RegisterMCInstPrinter(TheNVPTXTarget32, - createNVPTXMCInstPrinter); - TargetRegistry::RegisterMCInstPrinter(TheNVPTXTarget64, - createNVPTXMCInstPrinter); + for (Target *T : {&TheNVPTXTarget32, &TheNVPTXTarget64}) { + // Register the MC asm info. + RegisterMCAsmInfo<NVPTXMCAsmInfo> X(*T); + + // Register the MC codegen info. + TargetRegistry::RegisterMCCodeGenInfo(*T, createNVPTXMCCodeGenInfo); + + // Register the MC instruction info. + TargetRegistry::RegisterMCInstrInfo(*T, createNVPTXMCInstrInfo); + + // Register the MC register info. + TargetRegistry::RegisterMCRegInfo(*T, createNVPTXMCRegisterInfo); + + // Register the MC subtarget info. + TargetRegistry::RegisterMCSubtargetInfo(*T, createNVPTXMCSubtargetInfo); + + // Register the MCInstPrinter. + TargetRegistry::RegisterMCInstPrinter(*T, createNVPTXMCInstPrinter); + } } diff --git a/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.h b/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.h index 98821d2..bfd5123 100644 --- a/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.h +++ b/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.h @@ -14,6 +14,8 @@ #ifndef LLVM_LIB_TARGET_NVPTX_MCTARGETDESC_NVPTXMCTARGETDESC_H #define LLVM_LIB_TARGET_NVPTX_MCTARGETDESC_NVPTXMCTARGETDESC_H +#include <stdint.h> + namespace llvm { class Target; diff --git a/lib/Target/NVPTX/NVPTXAllocaHoisting.cpp b/lib/Target/NVPTX/NVPTXAllocaHoisting.cpp index 1f37696..4f3ccf4 100644 --- a/lib/Target/NVPTX/NVPTXAllocaHoisting.cpp +++ b/lib/Target/NVPTX/NVPTXAllocaHoisting.cpp @@ -12,11 +12,33 @@ //===----------------------------------------------------------------------===// #include "NVPTXAllocaHoisting.h" +#include "llvm/CodeGen/MachineFunctionAnalysis.h" +#include "llvm/CodeGen/StackProtector.h" #include "llvm/IR/Constants.h" #include "llvm/IR/Function.h" #include "llvm/IR/Instructions.h" +using namespace llvm; -namespace llvm { +namespace { +// Hoisting the alloca instructions in the non-entry blocks to the entry +// block. 
+class NVPTXAllocaHoisting : public FunctionPass { +public: + static char ID; // Pass ID + NVPTXAllocaHoisting() : FunctionPass(ID) {} + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addPreserved<MachineFunctionAnalysis>(); + AU.addPreserved<StackProtector>(); + } + + const char *getPassName() const override { + return "NVPTX specific alloca hoisting"; + } + + bool runOnFunction(Function &function) override; +}; +} // namespace bool NVPTXAllocaHoisting::runOnFunction(Function &function) { bool functionModified = false; @@ -36,11 +58,15 @@ bool NVPTXAllocaHoisting::runOnFunction(Function &function) { return functionModified; } -char NVPTXAllocaHoisting::ID = 1; -static RegisterPass<NVPTXAllocaHoisting> -X("alloca-hoisting", "Hoisting alloca instructions in non-entry " - "blocks to the entry block"); +char NVPTXAllocaHoisting::ID = 0; + +namespace llvm { +void initializeNVPTXAllocaHoistingPass(PassRegistry &); +} -FunctionPass *createAllocaHoisting() { return new NVPTXAllocaHoisting(); } +INITIALIZE_PASS( + NVPTXAllocaHoisting, "alloca-hoisting", + "Hoisting alloca instructions in non-entry blocks to the entry block", + false, false) -} // end namespace llvm +FunctionPass *llvm::createAllocaHoisting() { return new NVPTXAllocaHoisting; } diff --git a/lib/Target/NVPTX/NVPTXAllocaHoisting.h b/lib/Target/NVPTX/NVPTXAllocaHoisting.h index c343980..7a6fc7d 100644 --- a/lib/Target/NVPTX/NVPTXAllocaHoisting.h +++ b/lib/Target/NVPTX/NVPTXAllocaHoisting.h @@ -14,38 +14,10 @@ #ifndef LLVM_LIB_TARGET_NVPTX_NVPTXALLOCAHOISTING_H #define LLVM_LIB_TARGET_NVPTX_NVPTXALLOCAHOISTING_H -#include "llvm/CodeGen/MachineFunctionAnalysis.h" -#include "llvm/CodeGen/StackProtector.h" -#include "llvm/IR/DataLayout.h" -#include "llvm/Pass.h" - namespace llvm { - class FunctionPass; -class Function; - -// Hoisting the alloca instructions in the non-entry blocks to the entry -// block. -class NVPTXAllocaHoisting : public FunctionPass { -public: - static char ID; // Pass ID - NVPTXAllocaHoisting() : FunctionPass(ID) {} - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired<DataLayoutPass>(); - AU.addPreserved<MachineFunctionAnalysis>(); - AU.addPreserved<StackProtector>(); - } - - const char *getPassName() const override { - return "NVPTX specific alloca hoisting"; - } - - bool runOnFunction(Function &function) override; -}; extern FunctionPass *createAllocaHoisting(); - } // end namespace llvm #endif diff --git a/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/lib/Target/NVPTX/NVPTXAsmPrinter.cpp index 833db04..cc58b07 100644 --- a/lib/Target/NVPTX/NVPTXAsmPrinter.cpp +++ b/lib/Target/NVPTX/NVPTXAsmPrinter.cpp @@ -504,8 +504,7 @@ void NVPTXAsmPrinter::EmitFunctionBodyEnd() { void NVPTXAsmPrinter::emitImplicitDef(const MachineInstr *MI) const { unsigned RegNo = MI->getOperand(0).getReg(); - const TargetRegisterInfo *TRI = nvptxSubtarget->getRegisterInfo(); - if (TRI->isVirtualRegister(RegNo)) { + if (TargetRegisterInfo::isVirtualRegister(RegNo)) { OutStreamer.AddComment(Twine("implicit-def: ") + getVirtualRegisterName(RegNo)); } else { @@ -522,15 +521,15 @@ void NVPTXAsmPrinter::emitKernelFunctionDirectives(const Function &F, // If none of reqntid* is specified, don't output reqntid directive. 
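The reqntid/maxntid blocks that follow feed the kernel launch-bounds directives in the emitted PTX: if any component of the metadata is present, the directive is emitted with the missing components defaulted to 1, along these lines (illustrative output, not a dump from a real kernel):

    .reqntid 256, 1, 1
    .maxntid 256, 1, 1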
unsigned reqntidx, reqntidy, reqntidz; bool specified = false; - if (llvm::getReqNTIDx(F, reqntidx) == false) + if (!llvm::getReqNTIDx(F, reqntidx)) reqntidx = 1; else specified = true; - if (llvm::getReqNTIDy(F, reqntidy) == false) + if (!llvm::getReqNTIDy(F, reqntidy)) reqntidy = 1; else specified = true; - if (llvm::getReqNTIDz(F, reqntidz) == false) + if (!llvm::getReqNTIDz(F, reqntidz)) reqntidz = 1; else specified = true; @@ -544,15 +543,15 @@ void NVPTXAsmPrinter::emitKernelFunctionDirectives(const Function &F, // If none of maxntid* is specified, don't output maxntid directive. unsigned maxntidx, maxntidy, maxntidz; specified = false; - if (llvm::getMaxNTIDx(F, maxntidx) == false) + if (!llvm::getMaxNTIDx(F, maxntidx)) maxntidx = 1; else specified = true; - if (llvm::getMaxNTIDy(F, maxntidy) == false) + if (!llvm::getMaxNTIDy(F, maxntidy)) maxntidy = 1; else specified = true; - if (llvm::getMaxNTIDz(F, maxntidz) == false) + if (!llvm::getMaxNTIDz(F, maxntidz)) maxntidz = 1; else specified = true; @@ -673,7 +672,7 @@ static bool usedInOneFunc(const User *U, Function const *&oneFunc) { } for (const User *UU : U->users()) - if (usedInOneFunc(UU, oneFunc) == false) + if (!usedInOneFunc(UU, oneFunc)) return false; return true; @@ -687,7 +686,7 @@ static bool usedInOneFunc(const User *U, Function const *&oneFunc) { * 3. Is the global variable referenced only in one function? */ static bool canDemoteGlobalVar(const GlobalVariable *gv, Function const *&f) { - if (gv->hasInternalLinkage() == false) + if (!gv->hasInternalLinkage()) return false; const PointerType *Pty = gv->getType(); if (Pty->getAddressSpace() != llvm::ADDRESS_SPACE_SHARED) @@ -696,7 +695,7 @@ static bool canDemoteGlobalVar(const GlobalVariable *gv, Function const *&f) { const Function *oneFunc = nullptr; bool flag = usedInOneFunc(gv, oneFunc); - if (flag == false) + if (!flag) return false; if (!oneFunc) return false; @@ -1472,7 +1471,7 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) { } } - if (PAL.hasAttribute(paramIndex + 1, Attribute::ByVal) == false) { + if (!PAL.hasAttribute(paramIndex + 1, Attribute::ByVal)) { if (Ty->isAggregateType() || Ty->isVectorTy()) { // Just print .param .align <a> .b8 .param[size]; // <a> = PAL.getparamalignment @@ -1788,7 +1787,7 @@ void NVPTXAsmPrinter::bufferLEByte(const Constant *CPV, int Bytes, break; } else if (const ConstantExpr *Cexpr = dyn_cast<ConstantExpr>(CPV)) { if (const ConstantInt *constInt = dyn_cast<ConstantInt>( - ConstantFoldConstantExpression(Cexpr, TD))) { + ConstantFoldConstantExpression(Cexpr, *TD))) { int int32 = (int)(constInt->getZExtValue()); ptr = (unsigned char *)&int32; aggBuffer->addBytes(ptr, 4, Bytes); @@ -1810,7 +1809,7 @@ void NVPTXAsmPrinter::bufferLEByte(const Constant *CPV, int Bytes, break; } else if (const ConstantExpr *Cexpr = dyn_cast<ConstantExpr>(CPV)) { if (const ConstantInt *constInt = dyn_cast<ConstantInt>( - ConstantFoldConstantExpression(Cexpr, TD))) { + ConstantFoldConstantExpression(Cexpr, *TD))) { long long int64 = (long long)(constInt->getZExtValue()); ptr = (unsigned char *)&int64; aggBuffer->addBytes(ptr, 8, Bytes); @@ -2085,13 +2084,6 @@ void NVPTXAsmPrinter::printMemOperand(const MachineInstr *MI, int opNum, } } - -// Force static initialization. 
-extern "C" void LLVMInitializeNVPTXBackendAsmPrinter() { - RegisterAsmPrinter<NVPTXAsmPrinter> X(TheNVPTXTarget32); - RegisterAsmPrinter<NVPTXAsmPrinter> Y(TheNVPTXTarget64); -} - void NVPTXAsmPrinter::emitSrcInText(StringRef filename, unsigned line) { std::stringstream temp; LineReader *reader = this->getReader(filename.str()); diff --git a/lib/Target/NVPTX/NVPTXAsmPrinter.h b/lib/Target/NVPTX/NVPTXAsmPrinter.h index 7e6b5e8..9b11e70 100644 --- a/lib/Target/NVPTX/NVPTXAsmPrinter.h +++ b/lib/Target/NVPTX/NVPTXAsmPrinter.h @@ -92,8 +92,8 @@ class LLVM_LIBRARY_VISIBILITY NVPTXAsmPrinter : public AsmPrinter { bool EmitGeneric; public: - AggBuffer(unsigned _size, raw_ostream &_O, NVPTXAsmPrinter &_AP) - : size(_size), buffer(_size), O(_O), AP(_AP) { + AggBuffer(unsigned size, raw_ostream &O, NVPTXAsmPrinter &AP) + : size(size), buffer(size), O(O), AP(AP) { curpos = 0; numSymbols = 0; EmitGeneric = AP.EmitGeneric; diff --git a/lib/Target/NVPTX/NVPTXFavorNonGenericAddrSpaces.cpp b/lib/Target/NVPTX/NVPTXFavorNonGenericAddrSpaces.cpp index f3a095d..6d7c99c 100644 --- a/lib/Target/NVPTX/NVPTXFavorNonGenericAddrSpaces.cpp +++ b/lib/Target/NVPTX/NVPTXFavorNonGenericAddrSpaces.cpp @@ -123,10 +123,9 @@ bool NVPTXFavorNonGenericAddrSpaces::hoistAddrSpaceCastFromGEP( // => // %0 = gep X, indices // %1 = addrspacecast %0 - GetElementPtrInst *NewGEPI = GetElementPtrInst::Create(Cast->getOperand(0), - Indices, - GEP->getName(), - GEPI); + GetElementPtrInst *NewGEPI = GetElementPtrInst::Create( + GEP->getSourceElementType(), Cast->getOperand(0), Indices, + GEP->getName(), GEPI); NewGEPI->setIsInBounds(GEP->isInBounds()); GEP->replaceAllUsesWith( new AddrSpaceCastInst(NewGEPI, GEP->getType(), "", GEPI)); diff --git a/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp b/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp index 86d134b..850c020 100644 --- a/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp +++ b/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp @@ -343,6 +343,7 @@ Value *GenericToNVVM::remapConstantExpr(Module *M, Function *F, ConstantExpr *C, // GetElementPtrConstantExpr return cast<GEPOperator>(C)->isInBounds() ? Builder.CreateGEP( + cast<GEPOperator>(C)->getSourceElementType(), NewOperands[0], makeArrayRef(&NewOperands[1], NumOperands - 1)) : Builder.CreateInBoundsGEP( diff --git a/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp index e01c780..52c5e1b 100644 --- a/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp +++ b/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp @@ -78,10 +78,7 @@ bool NVPTXDAGToDAGISel::usePrecSqrtF32() const { return UsePrecSqrtF32; } else { // Otherwise, use sqrt.approx if fast math is enabled - if (TM.Options.UnsafeFPMath) - return false; - else - return true; + return !TM.Options.UnsafeFPMath; } } @@ -5044,12 +5041,12 @@ bool NVPTXDAGToDAGISel::ChkMemSDNodeAddressSpace(SDNode *N, /// SelectInlineAsmMemoryOperand - Implement addressing mode selection for /// inline asm expressions. 
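This hunk and the PPCISelDAGToDAG hunk near the end of the section are two halves of one interface change: SelectInlineAsmMemoryOperand now receives a target-independent InlineAsm::Constraint_* enumerator instead of the raw constraint letter. The string-to-enumerator translation lives in a separate TargetLowering hook, and NVPTX's version (visible in the NVPTXISelLowering.h hunk below) is the minimal form, sketched here as a lone member function:

    // Sketch of the mapping hook: NVPTX folds every memory constraint to
    // Constraint_m for now; a richer target would switch on the string.
    unsigned getInlineAsmMemConstraint(
        const std::string &ConstraintCode) const override {
      return InlineAsm::Constraint_m;
    }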
bool NVPTXDAGToDAGISel::SelectInlineAsmMemoryOperand( - const SDValue &Op, char ConstraintCode, std::vector<SDValue> &OutOps) { + const SDValue &Op, unsigned ConstraintID, std::vector<SDValue> &OutOps) { SDValue Op0, Op1; - switch (ConstraintCode) { + switch (ConstraintID) { default: return true; - case 'm': // memory + case InlineAsm::Constraint_m: // memory if (SelectDirectAddr(Op, Op0)) { OutOps.push_back(Op0); OutOps.push_back(CurDAG->getTargetConstant(0, MVT::i32)); diff --git a/lib/Target/NVPTX/NVPTXISelDAGToDAG.h b/lib/Target/NVPTX/NVPTXISelDAGToDAG.h index ca432b5..6d845c9 100644 --- a/lib/Target/NVPTX/NVPTXISelDAGToDAG.h +++ b/lib/Target/NVPTX/NVPTXISelDAGToDAG.h @@ -48,7 +48,7 @@ public: const NVPTXSubtarget *Subtarget; bool SelectInlineAsmMemoryOperand(const SDValue &Op, - char ConstraintCode, + unsigned ConstraintID, std::vector<SDValue> &OutOps) override; private: // Include the pieces autogenerated from the target description. diff --git a/lib/Target/NVPTX/NVPTXISelLowering.cpp b/lib/Target/NVPTX/NVPTXISelLowering.cpp index 1dc81f7..ff74e6e 100644 --- a/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -930,7 +930,7 @@ NVPTXTargetLowering::getPrototype(Type *retTy, const ArgListTy &Args, } first = false; - if (Outs[OIdx].Flags.isByVal() == false) { + if (!Outs[OIdx].Flags.isByVal()) { if (Ty->isAggregateType() || Ty->isVectorTy()) { unsigned align = 0; const CallInst *CallI = cast<CallInst>(CS->getInstruction()); @@ -1075,7 +1075,7 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, EVT VT = Outs[OIdx].VT; Type *Ty = Args[i].Ty; - if (Outs[OIdx].Flags.isByVal() == false) { + if (!Outs[OIdx].Flags.isByVal()) { if (Ty->isAggregateType()) { // aggregate SmallVector<EVT, 16> vtparts; @@ -1459,7 +1459,7 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, ObjectVT) == NumElts && "Vector was not scalarized"); unsigned sz = EltVT.getSizeInBits(); - bool needTruncate = sz < 8 ? true : false; + bool needTruncate = sz < 8; if (NumElts == 1) { // Just a simple load @@ -1577,7 +1577,7 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, for (unsigned i = 0, e = Ins.size(); i != e; ++i) { unsigned sz = VTs[i].getSizeInBits(); unsigned AlignI = GreatestCommonDivisor64(RetAlign, Offsets[i]); - bool needTruncate = sz < 8 ? true : false; + bool needTruncate = sz < 8; if (VTs[i].isInteger() && (sz < 8)) sz = 8; @@ -1940,9 +1940,7 @@ NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const { } // Then any remaining arguments - for (unsigned i = 2, e = N->getNumOperands(); i != e; ++i) { - Ops.push_back(N->getOperand(i)); - } + Ops.append(N->op_begin() + 2, N->op_end()); SDValue NewSt = DAG.getMemIntrinsicNode( Opcode, DL, DAG.getVTList(MVT::Other), Ops, @@ -2118,7 +2116,7 @@ SDValue NVPTXTargetLowering::LowerFormalArguments( // to newly created nodes. The SDNodes for params have to // appear in the same order as their order of appearance // in the original function. "idx+1" holds that order. 
- if (PAL.hasAttribute(i + 1, Attribute::ByVal) == false) { + if (!PAL.hasAttribute(i + 1, Attribute::ByVal)) { if (Ty->isAggregateType()) { SmallVector<EVT, 16> vtparts; SmallVector<uint64_t, 16> offsets; @@ -4494,7 +4492,6 @@ NVPTXTargetObjectFile::~NVPTXTargetObjectFile() { delete DwarfLocSection; delete DwarfARangesSection; delete DwarfRangesSection; - delete DwarfMacroInfoSection; } const MCSection * diff --git a/lib/Target/NVPTX/NVPTXISelLowering.h b/lib/Target/NVPTX/NVPTXISelLowering.h index 1b4da2c..8594364 100644 --- a/lib/Target/NVPTX/NVPTXISelLowering.h +++ b/lib/Target/NVPTX/NVPTXISelLowering.h @@ -497,6 +497,12 @@ public: std::vector<SDValue> &Ops, SelectionDAG &DAG) const override; + unsigned getInlineAsmMemConstraint( + const std::string &ConstraintCode) const override { + // FIXME: Map different constraints differently. + return InlineAsm::Constraint_m; + } + const NVPTXTargetMachine *nvTM; // PTX always uses 32-bit shift amounts diff --git a/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp b/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp index f0c3663..578401a 100644 --- a/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp +++ b/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp @@ -12,6 +12,8 @@ //===----------------------------------------------------------------------===// #include "NVPTXLowerAggrCopies.h" +#include "llvm/CodeGen/MachineFunctionAnalysis.h" +#include "llvm/CodeGen/StackProtector.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/Function.h" @@ -22,10 +24,33 @@ #include "llvm/IR/Intrinsics.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" +#include "llvm/Support/Debug.h" + +#define DEBUG_TYPE "nvptx" using namespace llvm; -namespace llvm { FunctionPass *createLowerAggrCopies(); } +namespace { +// actual analysis class, which is a functionpass +struct NVPTXLowerAggrCopies : public FunctionPass { + static char ID; + + NVPTXLowerAggrCopies() : FunctionPass(ID) {} + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addPreserved<MachineFunctionAnalysis>(); + AU.addPreserved<StackProtector>(); + } + + bool runOnFunction(Function &F) override; + + static const unsigned MaxAggrCopySize = 128; + + const char *getPassName() const override { + return "Lower aggregate copies/intrinsics into loops"; + } +}; +} // namespace char NVPTXLowerAggrCopies::ID = 0; @@ -104,7 +129,7 @@ bool NVPTXLowerAggrCopies::runOnFunction(Function &F) { SmallVector<MemTransferInst *, 4> aggrMemcpys; SmallVector<MemSetInst *, 4> aggrMemsets; - const DataLayout *DL = &getAnalysis<DataLayoutPass>().getDataLayout(); + const DataLayout &DL = F.getParent()->getDataLayout(); LLVMContext &Context = F.getParent()->getContext(); // @@ -117,10 +142,10 @@ bool NVPTXLowerAggrCopies::runOnFunction(Function &F) { ++II) { if (LoadInst *load = dyn_cast<LoadInst>(II)) { - if (load->hasOneUse() == false) + if (!load->hasOneUse()) continue; - if (DL->getTypeStoreSize(load->getType()) < MaxAggrCopySize) + if (DL.getTypeStoreSize(load->getType()) < MaxAggrCopySize) continue; User *use = load->user_back(); @@ -166,7 +191,7 @@ bool NVPTXLowerAggrCopies::runOnFunction(Function &F) { StoreInst *store = dyn_cast<StoreInst>(*load->user_begin()); Value *srcAddr = load->getOperand(0); Value *dstAddr = store->getOperand(1); - unsigned numLoads = DL->getTypeStoreSize(load->getType()); + unsigned numLoads = DL.getTypeStoreSize(load->getType()); Value *len = ConstantInt::get(Type::getInt32Ty(Context), numLoads); convertTransferToLoop(store, srcAddr, dstAddr, len, load->isVolatile(), diff 
--git a/lib/Target/NVPTX/NVPTXLowerAggrCopies.h b/lib/Target/NVPTX/NVPTXLowerAggrCopies.h index da301d5..3c39f53 100644 --- a/lib/Target/NVPTX/NVPTXLowerAggrCopies.h +++ b/lib/Target/NVPTX/NVPTXLowerAggrCopies.h @@ -15,35 +15,10 @@ #ifndef LLVM_LIB_TARGET_NVPTX_NVPTXLOWERAGGRCOPIES_H #define LLVM_LIB_TARGET_NVPTX_NVPTXLOWERAGGRCOPIES_H -#include "llvm/CodeGen/MachineFunctionAnalysis.h" -#include "llvm/CodeGen/StackProtector.h" -#include "llvm/IR/DataLayout.h" -#include "llvm/Pass.h" - namespace llvm { +class FunctionPass; -// actual analysis class, which is a functionpass -struct NVPTXLowerAggrCopies : public FunctionPass { - static char ID; - - NVPTXLowerAggrCopies() : FunctionPass(ID) {} - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired<DataLayoutPass>(); - AU.addPreserved<MachineFunctionAnalysis>(); - AU.addPreserved<StackProtector>(); - } - - bool runOnFunction(Function &F) override; - - static const unsigned MaxAggrCopySize = 128; - - const char *getPassName() const override { - return "Lower aggregate copies/intrinsics into loops"; - } -}; - -extern FunctionPass *createLowerAggrCopies(); +FunctionPass *createLowerAggrCopies(); } #endif diff --git a/lib/Target/NVPTX/NVPTXLowerStructArgs.cpp b/lib/Target/NVPTX/NVPTXLowerStructArgs.cpp index 3149399..68dfbb7 100644 --- a/lib/Target/NVPTX/NVPTXLowerStructArgs.cpp +++ b/lib/Target/NVPTX/NVPTXLowerStructArgs.cpp @@ -35,7 +35,8 @@ namespace llvm { void initializeNVPTXLowerStructArgsPass(PassRegistry &); } -class LLVM_LIBRARY_VISIBILITY NVPTXLowerStructArgs : public FunctionPass { +namespace { +class NVPTXLowerStructArgs : public FunctionPass { bool runOnFunction(Function &F) override; void handleStructPtrArgs(Function &); @@ -48,6 +49,7 @@ public: return "Copy structure (byval *) arguments to stack"; } }; +} // namespace char NVPTXLowerStructArgs::ID = 1; diff --git a/lib/Target/NVPTX/NVPTXMCExpr.h b/lib/Target/NVPTX/NVPTXMCExpr.h index d39a394..f075b8b 100644 --- a/lib/Target/NVPTX/NVPTXMCExpr.h +++ b/lib/Target/NVPTX/NVPTXMCExpr.h @@ -29,8 +29,8 @@ private: const VariantKind Kind; const APFloat Flt; - explicit NVPTXFloatMCExpr(VariantKind _Kind, APFloat _Flt) - : Kind(_Kind), Flt(_Flt) {} + explicit NVPTXFloatMCExpr(VariantKind Kind, APFloat Flt) + : Kind(Kind), Flt(Flt) {} public: /// @name Construction diff --git a/lib/Target/NVPTX/NVPTXRegisterInfo.cpp b/lib/Target/NVPTX/NVPTXRegisterInfo.cpp index 5ca96e4..6e97f9ef 100644 --- a/lib/Target/NVPTX/NVPTXRegisterInfo.cpp +++ b/lib/Target/NVPTX/NVPTXRegisterInfo.cpp @@ -78,7 +78,7 @@ NVPTXRegisterInfo::NVPTXRegisterInfo() : NVPTXGenRegisterInfo(0) {} /// NVPTX Callee Saved Registers const MCPhysReg * -NVPTXRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { +NVPTXRegisterInfo::getCalleeSavedRegs(const MachineFunction *) const { static const MCPhysReg CalleeSavedRegs[] = { 0 }; return CalleeSavedRegs; } diff --git a/lib/Target/NVPTX/NVPTXRegisterInfo.h b/lib/Target/NVPTX/NVPTXRegisterInfo.h index 75b8f15..c310a9c 100644 --- a/lib/Target/NVPTX/NVPTXRegisterInfo.h +++ b/lib/Target/NVPTX/NVPTXRegisterInfo.h @@ -35,8 +35,7 @@ public: //------------------------------------------------------ // NVPTX callee saved registers - const MCPhysReg * - getCalleeSavedRegs(const MachineFunction *MF = nullptr) const override; + const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF) const override; BitVector getReservedRegs(const MachineFunction &MF) const override; diff --git a/lib/Target/NVPTX/NVPTXSection.h b/lib/Target/NVPTX/NVPTXSection.h index 
f1d3cb4..0d2627d 100644 --- a/lib/Target/NVPTX/NVPTXSection.h +++ b/lib/Target/NVPTX/NVPTXSection.h @@ -26,7 +26,7 @@ namespace llvm { class NVPTXSection : public MCSection { virtual void anchor(); public: - NVPTXSection(SectionVariant V, SectionKind K) : MCSection(V, K) {} + NVPTXSection(SectionVariant V, SectionKind K) : MCSection(V, K, nullptr) {} virtual ~NVPTXSection() {} /// Override this as NVPTX has its own way of printing switching @@ -36,11 +36,8 @@ public: const MCExpr *Subsection) const override {} /// Base address of PTX sections is zero. - bool isBaseAddressKnownZero() const override { return true; } bool UseCodeAlign() const override { return false; } bool isVirtualSection() const override { return false; } - std::string getLabelBeginName() const override { return ""; } - std::string getLabelEndName() const override { return ""; } }; } // end namespace llvm diff --git a/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/lib/Target/NVPTX/NVPTXTargetMachine.cpp index 1a267a6..1b6bc71 100644 --- a/lib/Target/NVPTX/NVPTXTargetMachine.cpp +++ b/lib/Target/NVPTX/NVPTXTargetMachine.cpp @@ -50,6 +50,7 @@ using namespace llvm; namespace llvm { void initializeNVVMReflectPass(PassRegistry&); void initializeGenericToNVVMPass(PassRegistry&); +void initializeNVPTXAllocaHoistingPass(PassRegistry &); void initializeNVPTXAssignValidGlobalNamesPass(PassRegistry&); void initializeNVPTXFavorNonGenericAddrSpacesPass(PassRegistry &); void initializeNVPTXLowerStructArgsPass(PassRegistry &); @@ -64,6 +65,7 @@ extern "C" void LLVMInitializeNVPTXTarget() { // but it's very NVPTX-specific. initializeNVVMReflectPass(*PassRegistry::getPassRegistry()); initializeGenericToNVVMPass(*PassRegistry::getPassRegistry()); + initializeNVPTXAllocaHoistingPass(*PassRegistry::getPassRegistry()); initializeNVPTXAssignValidGlobalNamesPass(*PassRegistry::getPassRegistry()); initializeNVPTXFavorNonGenericAddrSpacesPass( *PassRegistry::getPassRegistry()); @@ -86,9 +88,10 @@ NVPTXTargetMachine::NVPTXTargetMachine(const Target &T, StringRef TT, const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL, bool is64bit) - : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL), is64bit(is64bit), - TLOF(make_unique<NVPTXTargetObjectFile>()), - DL(computeDataLayout(is64bit)), Subtarget(TT, CPU, FS, *this) { + : LLVMTargetMachine(T, computeDataLayout(is64bit), TT, CPU, FS, Options, RM, + CM, OL), + is64bit(is64bit), TLOF(make_unique<NVPTXTargetObjectFile>()), + Subtarget(TT, CPU, FS, *this) { if (Triple(TT).getOS() == Triple::NVCL) drvInterface = NVPTX::NVCL; else @@ -183,8 +186,7 @@ void NVPTXPassConfig::addIRPasses() { } bool NVPTXPassConfig::addInstSelector() { - const NVPTXSubtarget &ST = - getTM<NVPTXTargetMachine>().getSubtarget<NVPTXSubtarget>(); + const NVPTXSubtarget &ST = *getTM<NVPTXTargetMachine>().getSubtargetImpl(); addPass(createLowerAggrCopies()); addPass(createAllocaHoisting()); diff --git a/lib/Target/NVPTX/NVPTXTargetMachine.h b/lib/Target/NVPTX/NVPTXTargetMachine.h index a81abfe..b8df5af 100644 --- a/lib/Target/NVPTX/NVPTXTargetMachine.h +++ b/lib/Target/NVPTX/NVPTXTargetMachine.h @@ -27,7 +27,6 @@ namespace llvm { class NVPTXTargetMachine : public LLVMTargetMachine { bool is64bit; std::unique_ptr<TargetLoweringObjectFile> TLOF; - const DataLayout DL; // Calculates type size & alignment NVPTX::DrvInterface drvInterface; NVPTXSubtarget Subtarget; @@ -40,8 +39,10 @@ public: CodeModel::Model CM, CodeGenOpt::Level OP, bool is64bit); ~NVPTXTargetMachine() override; - const DataLayout 
*getDataLayout() const override { return &DL; } - const NVPTXSubtarget *getSubtargetImpl() const override { return &Subtarget; } + const NVPTXSubtarget *getSubtargetImpl(const Function &) const override { + return &Subtarget; + } + const NVPTXSubtarget *getSubtargetImpl() const { return &Subtarget; } bool is64Bit() const { return is64bit; } NVPTX::DrvInterface getDrvInterface() const { return drvInterface; } ManagedStringPool *getManagedStrPool() const { diff --git a/lib/Target/NVPTX/NVPTXTargetObjectFile.h b/lib/Target/NVPTX/NVPTXTargetObjectFile.h index 00ceca5..5d9ab0d 100644 --- a/lib/Target/NVPTX/NVPTXTargetObjectFile.h +++ b/lib/Target/NVPTX/NVPTXTargetObjectFile.h @@ -41,7 +41,6 @@ public: DwarfLocSection = nullptr; DwarfARangesSection = nullptr; DwarfRangesSection = nullptr; - DwarfMacroInfoSection = nullptr; } virtual ~NVPTXTargetObjectFile(); @@ -83,8 +82,6 @@ public: new NVPTXSection(MCSection::SV_ELF, SectionKind::getMetadata()); DwarfRangesSection = new NVPTXSection(MCSection::SV_ELF, SectionKind::getMetadata()); - DwarfMacroInfoSection = - new NVPTXSection(MCSection::SV_ELF, SectionKind::getMetadata()); } const MCSection *getSectionForConstant(SectionKind Kind, diff --git a/lib/Target/NVPTX/NVPTXUtilities.cpp b/lib/Target/NVPTX/NVPTXUtilities.cpp index cf1feac..1f178af 100644 --- a/lib/Target/NVPTX/NVPTXUtilities.cpp +++ b/lib/Target/NVPTX/NVPTXUtilities.cpp @@ -293,12 +293,9 @@ bool llvm::isKernelFunction(const Function &F) { unsigned x = 0; bool retval = llvm::findOneNVVMAnnotation( &F, llvm::PropertyAnnotationNames[llvm::PROPERTY_ISKERNEL_FUNCTION], x); - if (retval == false) { + if (!retval) { // There is no NVVM metadata, check the calling convention - if (F.getCallingConv() == llvm::CallingConv::PTX_Kernel) - return true; - else - return false; + return F.getCallingConv() == llvm::CallingConv::PTX_Kernel; } return (x == 1); } @@ -307,7 +304,7 @@ bool llvm::getAlign(const Function &F, unsigned index, unsigned &align) { std::vector<unsigned> Vs; bool retval = llvm::findAllNVVMAnnotation( &F, llvm::PropertyAnnotationNames[llvm::PROPERTY_ALIGN], Vs); - if (retval == false) + if (!retval) return false; for (int i = 0, e = Vs.size(); i < e; i++) { unsigned v = Vs[i]; diff --git a/lib/Target/NVPTX/NVPTXutil.cpp b/lib/Target/NVPTX/NVPTXutil.cpp deleted file mode 100644 index 5f074b3..0000000 --- a/lib/Target/NVPTX/NVPTXutil.cpp +++ /dev/null @@ -1,90 +0,0 @@ -//===-- NVPTXutil.cpp - Functions exported to CodeGen --*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file contains the functions that can be used in CodeGen. 
-// -//===----------------------------------------------------------------------===// - -#include "NVPTXutil.h" -#include "NVPTX.h" - -using namespace llvm; - -namespace llvm { - -bool isParamLoad(const MachineInstr *MI) { - if ((MI->getOpcode() != NVPTX::LD_i32_avar) && - (MI->getOpcode() != NVPTX::LD_i64_avar)) - return false; - if (MI->getOperand(2).isImm() == false) - return false; - if (MI->getOperand(2).getImm() != NVPTX::PTXLdStInstCode::PARAM) - return false; - return true; -} - -#define DATA_MASK 0x7f -#define DIGIT_WIDTH 7 -#define MORE_BYTES 0x80 - -static int encode_leb128(uint64_t val, int *nbytes, char *space, int splen) { - char *a; - char *end = space + splen; - - a = space; - do { - unsigned char uc; - - if (a >= end) - return 1; - uc = val & DATA_MASK; - val >>= DIGIT_WIDTH; - if (val != 0) - uc |= MORE_BYTES; - *a = uc; - a++; - } while (val); - *nbytes = a - space; - return 0; -} - -#undef DATA_MASK -#undef DIGIT_WIDTH -#undef MORE_BYTES - -uint64_t encode_leb128(const char *str) { - union { - uint64_t x; - char a[8]; - } temp64; - - temp64.x = 0; - - for (unsigned i = 0, e = strlen(str); i != e; ++i) - temp64.a[i] = str[e - 1 - i]; - - char encoded[16]; - int nbytes; - - int retval = encode_leb128(temp64.x, &nbytes, encoded, 16); - - (void) retval; - assert(retval == 0 && "Encoding to leb128 failed"); - - assert(nbytes <= 8 && - "Cannot support register names with leb128 encoding > 8 bytes"); - - temp64.x = 0; - for (int i = 0; i < nbytes; ++i) - temp64.a[i] = encoded[i]; - - return temp64.x; -} - -} // end namespace llvm diff --git a/lib/Target/NVPTX/NVPTXutil.h b/lib/Target/NVPTX/NVPTXutil.h deleted file mode 100644 index 1915dac..0000000 --- a/lib/Target/NVPTX/NVPTXutil.h +++ /dev/null @@ -1,25 +0,0 @@ -//===-- NVPTXutil.h - Functions exported to CodeGen --*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file contains the functions that can be used in CodeGen. -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIB_TARGET_NVPTX_NVPTXUTIL_H -#define LLVM_LIB_TARGET_NVPTX_NVPTXUTIL_H - -#include "llvm/CodeGen/MachineFunction.h" -#include "llvm/CodeGen/MachineInstr.h" - -namespace llvm { -bool isParamLoad(const MachineInstr *); -uint64_t encode_leb128(const char *str); -} - -#endif diff --git a/lib/Target/NVPTX/NVVMReflect.cpp b/lib/Target/NVPTX/NVVMReflect.cpp index a8d6b95..5e375b7 100644 --- a/lib/Target/NVPTX/NVVMReflect.cpp +++ b/lib/Target/NVPTX/NVVMReflect.cpp @@ -29,6 +29,7 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_os_ostream.h" +#include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" #include <map> #include <sstream> @@ -137,6 +138,26 @@ bool NVVMReflect::handleFunction(Function *ReflectFunction) { // ConstantArray can be found successfully, see if it can be // found in VarMap. If so, replace the uses of CallInst with the // value found in VarMap. If not, replace the use with value 0. 
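Operationally, the fix that goes with the long comment added just below is a single extra peel: when the reflect argument turns out to be hidden behind a GlobalVariable (the CUDA 7.0 shape), look through its initializer before expecting the string constant. The essential lines, condensed from the hunk itself:

    // Normalize both CUDA IR shapes down to the ConstantDataSequential
    // that holds the __nvvm_reflect argument string.
    const Value *Operand = cast<Constant>(Sym)->getOperand(0);
    if (const GlobalVariable *GV = dyn_cast<GlobalVariable>(Operand))
      Operand = GV->getInitializer(); // CUDA 7.0: string sits behind a global
    StringRef Arg = cast<ConstantDataSequential>(Operand)->getAsString();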
+ + // IR for __nvvm_reflect calls differs between CUDA versions: + // CUDA 6.5 and earlier uses this sequence: + // %ptr = tail call i8* @llvm.nvvm.ptr.constant.to.gen.p0i8.p4i8 + // (i8 addrspace(4)* getelementptr inbounds + // ([8 x i8], [8 x i8] addrspace(4)* @str, i32 0, i32 0)) + // %reflect = tail call i32 @__nvvm_reflect(i8* %ptr) + // + // Value returned by Sym->getOperand(0) is a Constant with a + // ConstantDataSequential operand which can be converted to string and used + // for lookup. + // + // CUDA 7.0 does it slightly differently: + // %reflect = call i32 @__nvvm_reflect(i8* addrspacecast + // (i8 addrspace(1)* getelementptr inbounds + // ([8 x i8], [8 x i8] addrspace(1)* @str, i32 0, i32 0) to i8*)) + // + // In this case, we get a Constant with a GlobalVariable operand and we need + // to dig deeper to find its initializer with the string we'll use for lookup. + for (User *U : ReflectFunction->users()) { assert(isa<CallInst>(U) && "Only a call instruction can use _reflect"); CallInst *Reflect = cast<CallInst>(U); @@ -158,16 +179,23 @@ bool NVVMReflect::handleFunction(Function *ReflectFunction) { const Value *Sym = GEP->getOperand(0); assert(isa<Constant>(Sym) && "Format of _reflect function not recognized"); - const Constant *SymStr = cast<Constant>(Sym); + const Value *Operand = cast<Constant>(Sym)->getOperand(0); + if (const GlobalVariable *GV = dyn_cast<GlobalVariable>(Operand)) { + // For CUDA-7.0 style __nvvm_reflect calls we need to find operand's + // initializer. + assert(GV->hasInitializer() && + "Format of _reflect function not recognized"); + const Constant *Initializer = GV->getInitializer(); + Operand = Initializer; + } - assert(isa<ConstantDataSequential>(SymStr->getOperand(0)) && + assert(isa<ConstantDataSequential>(Operand) && "Format of _reflect function not recognized"); - - assert(cast<ConstantDataSequential>(SymStr->getOperand(0))->isCString() && + assert(cast<ConstantDataSequential>(Operand)->isCString() && "Format of _reflect function not recognized"); std::string ReflectArg = - cast<ConstantDataSequential>(SymStr->getOperand(0))->getAsString(); + cast<ConstantDataSequential>(Operand)->getAsString(); ReflectArg = ReflectArg.substr(0, ReflectArg.size() - 1); DEBUG(dbgs() << "Arg of _reflect : " << ReflectArg << "\n"); diff --git a/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp b/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp index bf00e73..99a1633 100644 --- a/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp +++ b/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp @@ -271,9 +271,9 @@ class PPCAsmParser : public MCTargetAsmParser { public: - PPCAsmParser(MCSubtargetInfo &_STI, MCAsmParser &_Parser, - const MCInstrInfo &_MII, const MCTargetOptions &Options) - : MCTargetAsmParser(), STI(_STI), MII(_MII) { + PPCAsmParser(MCSubtargetInfo &STI, MCAsmParser &, const MCInstrInfo &MII, + const MCTargetOptions &Options) + : MCTargetAsmParser(), STI(STI), MII(MII) { // Check for 64-bit vs. 32-bit pointer mode. 
Triple TheTriple(STI.getTargetTriple()); IsPPC64 = (TheTriple.getArch() == Triple::ppc64 || @@ -425,7 +425,9 @@ public: bool isToken() const override { return Kind == Token; } bool isImm() const override { return Kind == Immediate || Kind == Expression; } + bool isU1Imm() const { return Kind == Immediate && isUInt<1>(getImm()); } bool isU2Imm() const { return Kind == Immediate && isUInt<2>(getImm()); } + bool isU3Imm() const { return Kind == Immediate && isUInt<3>(getImm()); } bool isU4Imm() const { return Kind == Immediate && isUInt<4>(getImm()); } bool isU5Imm() const { return Kind == Immediate && isUInt<5>(getImm()); } bool isS5Imm() const { return Kind == Immediate && isInt<5>(getImm()); } diff --git a/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp b/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp index 0ed0723..a9f5fc7 100644 --- a/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp +++ b/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp @@ -189,6 +189,12 @@ static DecodeStatus DecodeCRRCRegisterClass(MCInst &Inst, uint64_t RegNo, return decodeRegisterClass(Inst, RegNo, CRRegs); } +static DecodeStatus DecodeCRRC0RegisterClass(MCInst &Inst, uint64_t RegNo, + uint64_t Address, + const void *Decoder) { + return decodeRegisterClass(Inst, RegNo, CRRegs); +} + static DecodeStatus DecodeCRBITRCRegisterClass(MCInst &Inst, uint64_t RegNo, uint64_t Address, const void *Decoder) { diff --git a/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp b/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp index c287fbe..311a4f2 100644 --- a/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp +++ b/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp @@ -214,6 +214,13 @@ void PPCInstPrinter::printPredicateOperand(const MCInst *MI, unsigned OpNo, printOperand(MI, OpNo+1, O); } +void PPCInstPrinter::printU1ImmOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + unsigned int Value = MI->getOperand(OpNo).getImm(); + assert(Value <= 1 && "Invalid u1imm argument!"); + O << (unsigned int)Value; +} + void PPCInstPrinter::printU2ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O) { unsigned int Value = MI->getOperand(OpNo).getImm(); @@ -221,6 +228,13 @@ void PPCInstPrinter::printU2ImmOperand(const MCInst *MI, unsigned OpNo, O << (unsigned int)Value; } +void PPCInstPrinter::printU3ImmOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + unsigned int Value = MI->getOperand(OpNo).getImm(); + assert(Value < 8 && "Invalid u3imm argument!"); + O << (unsigned int)Value; +} + void PPCInstPrinter::printU4ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O) { unsigned int Value = MI->getOperand(OpNo).getImm(); diff --git a/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.h b/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.h index 6ead19b..8718743 100644 --- a/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.h +++ b/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.h @@ -43,7 +43,9 @@ public: void printPredicateOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O, const char *Modifier = nullptr); + void printU1ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); void printU2ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printU3ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); void printU4ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); void printS5ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); void printU5ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp
b/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp index 2b4f2d8..d8fab5b 100644 --- a/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp +++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp @@ -45,6 +45,10 @@ PPCMCAsmInfoDarwin::PPCMCAsmInfoDarwin(bool is64Bit, const Triple& T) { void PPCELFMCAsmInfo::anchor() { } PPCELFMCAsmInfo::PPCELFMCAsmInfo(bool is64Bit, const Triple& T) { + // FIXME: This is not always needed. For example, it is not needed in the + // v2 abi. + NeedsLocalForSize = true; + if (is64Bit) { PointerSize = CalleeSaveStackSlotSize = 8; } diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp index 06d380e..b9f0afb 100644 --- a/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp +++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp @@ -14,6 +14,7 @@ #include "MCTargetDesc/PPCMCTargetDesc.h" #include "MCTargetDesc/PPCFixupKinds.h" #include "llvm/ADT/Statistic.h" +#include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCCodeEmitter.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCExpr.h" @@ -39,10 +40,10 @@ class PPCMCCodeEmitter : public MCCodeEmitter { bool IsLittleEndian; public: - PPCMCCodeEmitter(const MCInstrInfo &mcii, MCContext &ctx, bool isLittle) - : MCII(mcii), CTX(ctx), IsLittleEndian(isLittle) { - } - + PPCMCCodeEmitter(const MCInstrInfo &mcii, MCContext &ctx) + : MCII(mcii), CTX(ctx), + IsLittleEndian(ctx.getAsmInfo()->isLittleEndian()) {} + ~PPCMCCodeEmitter() {} unsigned getDirectBrEncoding(const MCInst &MI, unsigned OpNo, @@ -158,14 +159,11 @@ public: }; } // end anonymous namespace - + MCCodeEmitter *llvm::createPPCMCCodeEmitter(const MCInstrInfo &MCII, const MCRegisterInfo &MRI, - const MCSubtargetInfo &STI, MCContext &Ctx) { - Triple TT(STI.getTargetTriple()); - bool IsLittleEndian = TT.getArch() == Triple::ppc64le; - return new PPCMCCodeEmitter(MCII, Ctx, IsLittleEndian); + return new PPCMCCodeEmitter(MCII, Ctx); } unsigned PPCMCCodeEmitter:: diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.h b/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.h index f0a6bb9..1c840d9 100644 --- a/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.h +++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.h @@ -36,9 +36,8 @@ private: int64_t EvaluateAsInt64(int64_t Value) const; - explicit PPCMCExpr(VariantKind _Kind, const MCExpr *_Expr, - bool _IsDarwin) - : Kind(_Kind), Expr(_Expr), IsDarwin(_IsDarwin) {} + explicit PPCMCExpr(VariantKind Kind, const MCExpr *Expr, bool IsDarwin) + : Kind(Kind), Expr(Expr), IsDarwin(IsDarwin) {} public: /// @name Construction diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp index f2da389..2f7a768 100644 --- a/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp +++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp @@ -145,6 +145,7 @@ public: } void emitTCEntry(const MCSymbol &S) override { // Creates a R_PPC64_TOC relocation + Streamer.EmitValueToAlignment(8); Streamer.EmitSymbolValue(&S, 8); } void emitMachine(StringRef CPU) override { @@ -222,32 +223,19 @@ public: }; } -// This is duplicated code. Refactor this. 
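The deleted FIXME gets its wish in this hunk: rather than each target assembling complete object and asm streamers (the createMCStreamer/createMCAsmStreamer pair removed below), the generic streamers are now built by common code, and PPC contributes only the thin target-streamer layer through two small factories. Registration then shrinks to two calls per target, exactly as the loop at the end of the hunk does:

    // Registration sketch, done once per PPC target in the new loop.
    TargetRegistry::RegisterObjectTargetStreamer(*T, createObjectTargetStreamer);
    TargetRegistry::RegisterAsmTargetStreamer(*T, createAsmTargetStreamer);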
-static MCStreamer *createMCStreamer(const Target &T, StringRef TT, - MCContext &Ctx, MCAsmBackend &MAB, - raw_ostream &OS, MCCodeEmitter *Emitter, - const MCSubtargetInfo &STI, bool RelaxAll) { - if (Triple(TT).isOSDarwin()) { - MCStreamer *S = createMachOStreamer(Ctx, MAB, OS, Emitter, RelaxAll); - new PPCTargetMachOStreamer(*S); - return S; - } - - MCStreamer *S = createELFStreamer(Ctx, MAB, OS, Emitter, RelaxAll); - new PPCTargetELFStreamer(*S); - return S; +static MCTargetStreamer *createAsmTargetStreamer(MCStreamer &S, + formatted_raw_ostream &OS, + MCInstPrinter *InstPrint, + bool isVerboseAsm) { + return new PPCTargetAsmStreamer(S, OS); } -static MCStreamer * -createMCAsmStreamer(MCContext &Ctx, formatted_raw_ostream &OS, - bool isVerboseAsm, bool useDwarfDirectory, - MCInstPrinter *InstPrint, MCCodeEmitter *CE, - MCAsmBackend *TAB, bool ShowInst) { - - MCStreamer *S = llvm::createAsmStreamer( - Ctx, OS, isVerboseAsm, useDwarfDirectory, InstPrint, CE, TAB, ShowInst); - new PPCTargetAsmStreamer(*S, OS); - return S; +static MCTargetStreamer * +createObjectTargetStreamer(MCStreamer &S, const MCSubtargetInfo &STI) { + Triple TT(STI.getTargetTriple()); + if (TT.getObjectFormat() == Triple::ELF) + return new PPCTargetELFStreamer(S); + return new PPCTargetMachOStreamer(S); } static MCInstPrinter *createPPCMCInstPrinter(const Target &T, @@ -261,60 +249,36 @@ static MCInstPrinter *createPPCMCInstPrinter(const Target &T, } extern "C" void LLVMInitializePowerPCTargetMC() { - // Register the MC asm info. - RegisterMCAsmInfoFn C(ThePPC32Target, createPPCMCAsmInfo); - RegisterMCAsmInfoFn D(ThePPC64Target, createPPCMCAsmInfo); - RegisterMCAsmInfoFn E(ThePPC64LETarget, createPPCMCAsmInfo); - - // Register the MC codegen info. - TargetRegistry::RegisterMCCodeGenInfo(ThePPC32Target, createPPCMCCodeGenInfo); - TargetRegistry::RegisterMCCodeGenInfo(ThePPC64Target, createPPCMCCodeGenInfo); - TargetRegistry::RegisterMCCodeGenInfo(ThePPC64LETarget, - createPPCMCCodeGenInfo); - - // Register the MC instruction info. - TargetRegistry::RegisterMCInstrInfo(ThePPC32Target, createPPCMCInstrInfo); - TargetRegistry::RegisterMCInstrInfo(ThePPC64Target, createPPCMCInstrInfo); - TargetRegistry::RegisterMCInstrInfo(ThePPC64LETarget, - createPPCMCInstrInfo); - - // Register the MC register info. - TargetRegistry::RegisterMCRegInfo(ThePPC32Target, createPPCMCRegisterInfo); - TargetRegistry::RegisterMCRegInfo(ThePPC64Target, createPPCMCRegisterInfo); - TargetRegistry::RegisterMCRegInfo(ThePPC64LETarget, createPPCMCRegisterInfo); - - // Register the MC subtarget info. - TargetRegistry::RegisterMCSubtargetInfo(ThePPC32Target, - createPPCMCSubtargetInfo); - TargetRegistry::RegisterMCSubtargetInfo(ThePPC64Target, - createPPCMCSubtargetInfo); - TargetRegistry::RegisterMCSubtargetInfo(ThePPC64LETarget, - createPPCMCSubtargetInfo); - - // Register the MC Code Emitter - TargetRegistry::RegisterMCCodeEmitter(ThePPC32Target, createPPCMCCodeEmitter); - TargetRegistry::RegisterMCCodeEmitter(ThePPC64Target, createPPCMCCodeEmitter); - TargetRegistry::RegisterMCCodeEmitter(ThePPC64LETarget, - createPPCMCCodeEmitter); - + for (Target *T : {&ThePPC32Target, &ThePPC64Target, &ThePPC64LETarget}) { + // Register the MC asm info. + RegisterMCAsmInfoFn C(*T, createPPCMCAsmInfo); + + // Register the MC codegen info. + TargetRegistry::RegisterMCCodeGenInfo(*T, createPPCMCCodeGenInfo); + + // Register the MC instruction info. + TargetRegistry::RegisterMCInstrInfo(*T, createPPCMCInstrInfo); + + // Register the MC register info. 
+ TargetRegistry::RegisterMCRegInfo(*T, createPPCMCRegisterInfo); + + // Register the MC subtarget info. + TargetRegistry::RegisterMCSubtargetInfo(*T, createPPCMCSubtargetInfo); + + // Register the MC Code Emitter + TargetRegistry::RegisterMCCodeEmitter(*T, createPPCMCCodeEmitter); + // Register the asm backend. - TargetRegistry::RegisterMCAsmBackend(ThePPC32Target, createPPCAsmBackend); - TargetRegistry::RegisterMCAsmBackend(ThePPC64Target, createPPCAsmBackend); - TargetRegistry::RegisterMCAsmBackend(ThePPC64LETarget, createPPCAsmBackend); - - // Register the object streamer. - TargetRegistry::RegisterMCObjectStreamer(ThePPC32Target, createMCStreamer); - TargetRegistry::RegisterMCObjectStreamer(ThePPC64Target, createMCStreamer); - TargetRegistry::RegisterMCObjectStreamer(ThePPC64LETarget, createMCStreamer); - - // Register the asm streamer. - TargetRegistry::RegisterAsmStreamer(ThePPC32Target, createMCAsmStreamer); - TargetRegistry::RegisterAsmStreamer(ThePPC64Target, createMCAsmStreamer); - TargetRegistry::RegisterAsmStreamer(ThePPC64LETarget, createMCAsmStreamer); - - // Register the MCInstPrinter. - TargetRegistry::RegisterMCInstPrinter(ThePPC32Target, createPPCMCInstPrinter); - TargetRegistry::RegisterMCInstPrinter(ThePPC64Target, createPPCMCInstPrinter); - TargetRegistry::RegisterMCInstPrinter(ThePPC64LETarget, - createPPCMCInstPrinter); + TargetRegistry::RegisterMCAsmBackend(*T, createPPCAsmBackend); + + // Register the object target streamer. + TargetRegistry::RegisterObjectTargetStreamer(*T, + createObjectTargetStreamer); + + // Register the asm target streamer. + TargetRegistry::RegisterAsmTargetStreamer(*T, createAsmTargetStreamer); + + // Register the MCInstPrinter. + TargetRegistry::RegisterMCInstPrinter(*T, createPPCMCInstPrinter); + } } diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h b/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h index 68f7f7a..8b1e3b4 100644 --- a/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h +++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h @@ -34,10 +34,9 @@ class raw_ostream; extern Target ThePPC32Target; extern Target ThePPC64Target; extern Target ThePPC64LETarget; - + MCCodeEmitter *createPPCMCCodeEmitter(const MCInstrInfo &MCII, const MCRegisterInfo &MRI, - const MCSubtargetInfo &STI, MCContext &Ctx); MCAsmBackend *createPPCAsmBackend(const Target &T, const MCRegisterInfo &MRI, diff --git a/lib/Target/PowerPC/PPC.td b/lib/Target/PowerPC/PPC.td index f53add5..f175f6d 100644 --- a/lib/Target/PowerPC/PPC.td +++ b/lib/Target/PowerPC/PPC.td @@ -18,7 +18,7 @@ include "llvm/Target/Target.td" //===----------------------------------------------------------------------===// // PowerPC Subtarget features. 
// - + //===----------------------------------------------------------------------===// // CPU Directives // //===----------------------------------------------------------------------===// @@ -112,14 +112,21 @@ def FeatureVSX : SubtargetFeature<"vsx","HasVSX", "true", def FeatureP8Altivec : SubtargetFeature<"power8-altivec", "HasP8Altivec", "true", "Enable POWER8 Altivec instructions", [FeatureAltivec]>; +def FeatureP8Crypto : SubtargetFeature<"crypto", "HasP8Crypto", "true", + "Enable POWER8 Crypto instructions", + [FeatureP8Altivec]>; def FeatureP8Vector : SubtargetFeature<"power8-vector", "HasP8Vector", "true", "Enable POWER8 vector instructions", [FeatureVSX, FeatureP8Altivec]>; - +def FeaturePartwordAtomic : SubtargetFeature<"partword-atomics", + "HasPartwordAtomics", "true", + "Enable l[bh]arx and st[bh]cx.">; def FeatureInvariantFunctionDescriptors : SubtargetFeature<"invariant-function-descriptors", "HasInvariantFunctionDescriptors", "true", "Assume function descriptors are invariant">; +def FeatureHTM : SubtargetFeature<"htm", "HasHTM", "true", + "Enable Hardware Transactional Memory instructions">; def DeprecatedMFTB : SubtargetFeature<"", "DeprecatedMFTB", "true", "Treat mftb as deprecated">; @@ -256,11 +263,11 @@ def ProcessorFeatures { [DirectivePwr8, FeatureAltivec, FeatureP8Altivec, FeatureVSX, FeatureP8Vector, FeatureMFOCRF, FeatureFCPSGN, FeatureFSqrt, FeatureFRE, FeatureFRES, FeatureFRSQRTE, FeatureFRSQRTES, - FeatureRecipPrec, FeatureSTFIWX, FeatureLFIWAX, + FeatureRecipPrec, FeatureSTFIWX, FeatureLFIWAX, FeatureHTM, FeatureFPRND, FeatureFPCVT, FeatureISEL, - FeaturePOPCNTD, FeatureCMPB, FeatureLDBRX, + FeaturePOPCNTD, FeatureCMPB, FeatureLDBRX, FeatureP8Crypto, Feature64Bit /*, Feature64BitRegs */, FeatureICBT, - DeprecatedMFTB, DeprecatedDST]; + FeaturePartwordAtomic, DeprecatedMFTB, DeprecatedDST]; } def : ProcessorModel<"970", G5Model, @@ -339,7 +346,7 @@ def : ProcessorModel<"pwr7", P7Model, FeatureRecipPrec, FeatureSTFIWX, FeatureLFIWAX, FeatureFPRND, FeatureFPCVT, FeatureISEL, FeaturePOPCNTD, FeatureCMPB, FeatureLDBRX, - Feature64Bit /*, Feature64BitRegs */, + Feature64Bit /*, Feature64BitRegs */, FeaturePartwordAtomic, DeprecatedMFTB, DeprecatedDST]>; def : ProcessorModel<"pwr8", P8Model, ProcessorFeatures.Power8FeatureList>; def : Processor<"ppc", G3Itineraries, [Directive32]>; diff --git a/lib/Target/PowerPC/PPCAsmPrinter.cpp b/lib/Target/PowerPC/PPCAsmPrinter.cpp index 1327290..cd60906 100644 --- a/lib/Target/PowerPC/PPCAsmPrinter.cpp +++ b/lib/Target/PowerPC/PPCAsmPrinter.cpp @@ -69,12 +69,11 @@ namespace { protected: MapVector<MCSymbol*, MCSymbol*> TOC; const PPCSubtarget *Subtarget; - uint64_t TOCLabelID; StackMaps SM; public: explicit PPCAsmPrinter(TargetMachine &TM, std::unique_ptr<MCStreamer> Streamer) - : AsmPrinter(TM, std::move(Streamer)), TOCLabelID(0), SM(*this) {} + : AsmPrinter(TM, std::move(Streamer)), SM(*this) {} const char *getPassName() const override { return "PowerPC Assembly Printer"; @@ -321,17 +320,9 @@ bool PPCAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo, /// exists for it. If not, create one. Then return a symbol that references /// the TOC entry. MCSymbol *PPCAsmPrinter::lookUpOrCreateTOCEntry(MCSymbol *Sym) { - const DataLayout *DL = TM.getDataLayout(); MCSymbol *&TOCEntry = TOC[Sym]; - - // To avoid name clash check if the name already exists. 
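The TOC-entry helper in the hunk that follows is a nice shrink: the hand-rolled uniquing loop being deleted existed only to avoid name clashes, which createTempSymbol already guarantees through the MCContext. After the change the whole lookup is just:

    // Condensed from the new body of lookUpOrCreateTOCEntry.
    MCSymbol *&TOCEntry = TOC[Sym];
    if (!TOCEntry)
      TOCEntry = createTempSymbol("C"); // unique temp symbol per entry
    return TOCEntry;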
-  while (!TOCEntry) {
-    if (OutContext.LookupSymbol(Twine(DL->getPrivateGlobalPrefix()) +
-                                "C" + Twine(TOCLabelID++)) == nullptr) {
-      TOCEntry = GetTempSymbol("C", TOCLabelID);
-    }
-  }
-
+  if (!TOCEntry)
+    TOCEntry = createTempSymbol("C");
   return TOCEntry;
 }
 
@@ -1068,8 +1059,7 @@ void PPCLinuxAsmPrinter::EmitFunctionEntryLabel() {
   OutStreamer.SwitchSection(Section);
   OutStreamer.EmitLabel(CurrentFnSym);
   OutStreamer.EmitValueToAlignment(8);
-  MCSymbol *Symbol1 =
-    OutContext.GetOrCreateSymbol(".L." + Twine(CurrentFnSym->getName()));
+  MCSymbol *Symbol1 = CurrentFnSymForSize;
   // Generates a R_PPC64_ADDR64 (from FK_DATA_8) relocation for the function
   // entry point.
   OutStreamer.EmitValue(MCSymbolRefExpr::Create(Symbol1, OutContext),
@@ -1082,11 +1072,6 @@ void PPCLinuxAsmPrinter::EmitFunctionEntryLabel() {
   // Emit a null environment pointer.
   OutStreamer.EmitIntValue(0, 8 /* size */);
   OutStreamer.SwitchSection(Current.first, Current.second);
-
-  MCSymbol *RealFnSym = OutContext.GetOrCreateSymbol(
-      ".L." + Twine(CurrentFnSym->getName()));
-  OutStreamer.EmitLabel(RealFnSym);
-  CurrentFnSymForSize = RealFnSym;
 }
diff --git a/lib/Target/PowerPC/PPCCTRLoops.cpp b/lib/Target/PowerPC/PPCCTRLoops.cpp
index 5af8aab..c595f44 100644
--- a/lib/Target/PowerPC/PPCCTRLoops.cpp
+++ b/lib/Target/PowerPC/PPCCTRLoops.cpp
@@ -171,8 +171,7 @@ bool PPCCTRLoops::runOnFunction(Function &F) {
   LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
   SE = &getAnalysis<ScalarEvolution>();
   DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
-  DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
-  DL = DLP ? &DLP->getDataLayout() : nullptr;
+  DL = &F.getParent()->getDataLayout();
   auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
   LibInfo = TLIP ? &TLIP->getTLI() : nullptr;
 
@@ -533,7 +532,7 @@ bool PPCCTRLoops::convertToCTRLoop(Loop *L) {
   // selected branch.
   MadeChange = true;
 
-  SCEVExpander SCEVE(*SE, "loopcnt");
+  SCEVExpander SCEVE(*SE, Preheader->getModule()->getDataLayout(), "loopcnt");
   LLVMContext &C = SE->getContext();
   Type *CountType = TT.isArch64Bit() ? Type::getInt64Ty(C) :
                                        Type::getInt32Ty(C);
diff --git a/lib/Target/PowerPC/PPCFastISel.cpp b/lib/Target/PowerPC/PPCFastISel.cpp
index 54532b5..fbd7b6d 100644
--- a/lib/Target/PowerPC/PPCFastISel.cpp
+++ b/lib/Target/PowerPC/PPCFastISel.cpp
@@ -675,8 +675,18 @@ bool PPCFastISel::PPCEmitStore(MVT VT, unsigned SrcReg, Address &Addr) {
     case PPC::STFS: Opc = PPC::STFSX; break;
     case PPC::STFD: Opc = IsVSFRC ? PPC::STXSDX : PPC::STFDX; break;
     }
-    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc))
-      .addReg(SrcReg).addReg(Addr.Base.Reg).addReg(IndexReg);
+
+    auto MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc))
+      .addReg(SrcReg);
+
+    // If an index register is defined, use it in the store instruction;
+    // otherwise use X0 as the base, which makes the vector instructions
+    // use zero in the effective-address computation regardless of the
+    // register's contents.
+    if (IndexReg)
+      MIB.addReg(Addr.Base.Reg).addReg(IndexReg);
+    else
+      MIB.addReg(PPC::ZERO8).addReg(Addr.Base.Reg);
   }
 
   return true;
@@ -1532,7 +1542,7 @@ bool PPCFastISel::fastLowerCall(CallLoweringInfo &CLI) {
 
   // Add a register mask with the call-preserved registers.  Proper
   // defs for return values will be added by setPhysRegsDeadExcept().
- MIB.addRegMask(TRI.getCallPreservedMask(CC)); + MIB.addRegMask(TRI.getCallPreservedMask(*FuncInfo.MF, CC)); CLI.Call = MIB; diff --git a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp index b10e854..3ac8e94 100644 --- a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp +++ b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp @@ -186,20 +186,34 @@ namespace { /// register can be improved, but it is wrong to substitute Reg+Reg for /// Reg in an asm, because the load or store opcode would have to change. bool SelectInlineAsmMemoryOperand(const SDValue &Op, - char ConstraintCode, + unsigned ConstraintID, std::vector<SDValue> &OutOps) override { - // We need to make sure that this one operand does not end up in r0 - // (because we might end up lowering this as 0(%op)). - const TargetRegisterInfo *TRI = PPCSubTarget->getRegisterInfo(); - const TargetRegisterClass *TRC = TRI->getPointerRegClass(*MF, /*Kind=*/1); - SDValue RC = CurDAG->getTargetConstant(TRC->getID(), MVT::i32); - SDValue NewOp = - SDValue(CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS, - SDLoc(Op), Op.getValueType(), - Op, RC), 0); - - OutOps.push_back(NewOp); - return false; + + switch(ConstraintID) { + default: + errs() << "ConstraintID: " << ConstraintID << "\n"; + llvm_unreachable("Unexpected asm memory constraint"); + case InlineAsm::Constraint_es: + case InlineAsm::Constraint_i: + case InlineAsm::Constraint_m: + case InlineAsm::Constraint_o: + case InlineAsm::Constraint_Q: + case InlineAsm::Constraint_Z: + case InlineAsm::Constraint_Zy: + // We need to make sure that this one operand does not end up in r0 + // (because we might end up lowering this as 0(%op)). + const TargetRegisterInfo *TRI = PPCSubTarget->getRegisterInfo(); + const TargetRegisterClass *TRC = TRI->getPointerRegClass(*MF, /*Kind=*/1); + SDValue RC = CurDAG->getTargetConstant(TRC->getID(), MVT::i32); + SDValue NewOp = + SDValue(CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS, + SDLoc(Op), Op.getValueType(), + Op, RC), 0); + + OutOps.push_back(NewOp); + return false; + } + return true; } void InsertVRSaveCode(MachineFunction &MF); @@ -2105,7 +2119,7 @@ static unsigned getCRIdxForSetCC(ISD::CondCode CC, bool &Invert) { // getVCmpInst: return the vector compare instruction for the specified // vector type and condition code. Since this is for altivec specific code, -// only support the altivec types (v16i8, v8i16, v4i32, and v4f32). +// only support the altivec types (v16i8, v8i16, v4i32, v2i64, and v4f32). 
 static unsigned int getVCmpInst(MVT VecVT, ISD::CondCode CC, bool HasVSX,
                                 bool &Swap, bool &Negate) {
   Swap = false;
@@ -2184,6 +2198,8 @@ static unsigned int getVCmpInst(MVT VecVT, ISD::CondCode CC,
       return PPC::VCMPEQUH;
     else if (VecVT == MVT::v4i32)
       return PPC::VCMPEQUW;
+    else if (VecVT == MVT::v2i64)
+      return PPC::VCMPEQUD;
     break;
   case ISD::SETGT:
     if (VecVT == MVT::v16i8)
@@ -2192,6 +2208,8 @@ static unsigned int getVCmpInst(MVT VecVT, ISD::CondCode CC,
       return PPC::VCMPGTSH;
     else if (VecVT == MVT::v4i32)
       return PPC::VCMPGTSW;
+    else if (VecVT == MVT::v2i64)
+      return PPC::VCMPGTSD;
     break;
   case ISD::SETUGT:
     if (VecVT == MVT::v16i8)
@@ -2200,6 +2218,8 @@ static unsigned int getVCmpInst(MVT VecVT, ISD::CondCode CC,
       return PPC::VCMPGTUH;
     else if (VecVT == MVT::v4i32)
       return PPC::VCMPGTUW;
+    else if (VecVT == MVT::v2i64)
+      return PPC::VCMPGTUD;
     break;
   default:
     break;
diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp
index 147e94b..871531e 100644
--- a/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -516,7 +516,12 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
     setOperationAction(ISD::FSQRT, MVT::v4f32, Legal);
   }
 
-  setOperationAction(ISD::MUL, MVT::v4i32, Custom);
+
+  if (Subtarget.hasP8Altivec())
+    setOperationAction(ISD::MUL, MVT::v4i32, Legal);
+  else
+    setOperationAction(ISD::MUL, MVT::v4i32, Custom);
+
   setOperationAction(ISD::MUL, MVT::v8i16, Custom);
   setOperationAction(ISD::MUL, MVT::v16i8, Custom);
 
@@ -574,15 +579,24 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
     addRegisterClass(MVT::v4f32, &PPC::VSRCRegClass);
     addRegisterClass(MVT::v2f64, &PPC::VSRCRegClass);
 
-    // VSX v2i64 only supports non-arithmetic operations.
-    setOperationAction(ISD::ADD, MVT::v2i64, Expand);
-    setOperationAction(ISD::SUB, MVT::v2i64, Expand);
+    if (Subtarget.hasP8Altivec()) {
+      setOperationAction(ISD::SHL, MVT::v2i64, Legal);
+      setOperationAction(ISD::SRA, MVT::v2i64, Legal);
+      setOperationAction(ISD::SRL, MVT::v2i64, Legal);
 
-    setOperationAction(ISD::SHL, MVT::v2i64, Expand);
-    setOperationAction(ISD::SRA, MVT::v2i64, Expand);
-    setOperationAction(ISD::SRL, MVT::v2i64, Expand);
+      setOperationAction(ISD::SETCC, MVT::v2i64, Legal);
+    }
+    else {
+      setOperationAction(ISD::SHL, MVT::v2i64, Expand);
+      setOperationAction(ISD::SRA, MVT::v2i64, Expand);
+      setOperationAction(ISD::SRL, MVT::v2i64, Expand);
 
-    setOperationAction(ISD::SETCC, MVT::v2i64, Custom);
+      setOperationAction(ISD::SETCC, MVT::v2i64, Custom);
+
+      // VSX v2i64 only supports non-arithmetic operations.
+      setOperationAction(ISD::ADD, MVT::v2i64, Expand);
+      setOperationAction(ISD::SUB, MVT::v2i64, Expand);
+    }
 
     setOperationAction(ISD::LOAD, MVT::v2i64, Promote);
     AddPromotedToType (ISD::LOAD, MVT::v2i64, MVT::v2f64);
@@ -892,6 +906,13 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
     MaxStoresPerMemcpyOptSize = 8;
     MaxStoresPerMemmove = 32;
     MaxStoresPerMemmoveOptSize = 8;
+  } else if (Subtarget.getDarwinDirective() == PPC::DIR_A2) {
+    // The A2 also benefits from (very) aggressive inlining of memcpy and
+    // friends. The overhead of a function call, even when warm, can be
+    // over one hundred cycles.
+ MaxStoresPerMemset = 128; + MaxStoresPerMemcpy = 128; + MaxStoresPerMemmove = 128; } } @@ -981,8 +1002,6 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const { case PPCISD::STBRX: return "PPCISD::STBRX"; case PPCISD::LFIWAX: return "PPCISD::LFIWAX"; case PPCISD::LFIWZX: return "PPCISD::LFIWZX"; - case PPCISD::LARX: return "PPCISD::LARX"; - case PPCISD::STCX: return "PPCISD::STCX"; case PPCISD::COND_BRANCH: return "PPCISD::COND_BRANCH"; case PPCISD::BDNZ: return "PPCISD::BDNZ"; case PPCISD::BDZ: return "PPCISD::BDZ"; @@ -1384,17 +1403,10 @@ SDValue PPC::get_VSPLTI_elt(SDNode *N, unsigned ByteSize, SelectionDAG &DAG) { // immediate field for would be zero, and we prefer to use vxor for it. if (ValSizeInBytes < ByteSize) return SDValue(); - // If the element value is larger than the splat value, cut it in half and - // check to see if the two halves are equal. Continue doing this until we - // get to ByteSize. This allows us to handle 0x01010101 as 0x01. - while (ValSizeInBytes > ByteSize) { - ValSizeInBytes >>= 1; - - // If the top half equals the bottom half, we're still ok. - if (((Value >> (ValSizeInBytes*8)) & ((1 << (8*ValSizeInBytes))-1)) != - (Value & ((1 << (8*ValSizeInBytes))-1))) - return SDValue(); - } + // If the element value is larger than the splat value, check if it consists + // of a repeated bit pattern of size ByteSize. + if (!APInt(ValSizeInBytes * 8, Value).isSplat(ByteSize * 8)) + return SDValue(); // Properly sign extend the value. int MaskVal = SignExtend32(Value, ByteSize * 8); @@ -2436,27 +2448,16 @@ bool llvm::CC_PPC32_SVR4_Custom_AlignFPArgRegs(unsigned &ValNo, MVT &ValVT, return false; } -/// GetFPR - Get the set of FP registers that should be allocated for arguments, +/// FPR - The set of FP registers that should be allocated for arguments, /// on Darwin. -static const MCPhysReg *GetFPR() { - static const MCPhysReg FPR[] = { - PPC::F1, PPC::F2, PPC::F3, PPC::F4, PPC::F5, PPC::F6, PPC::F7, - PPC::F8, PPC::F9, PPC::F10, PPC::F11, PPC::F12, PPC::F13 - }; +static const MCPhysReg FPR[] = {PPC::F1, PPC::F2, PPC::F3, PPC::F4, PPC::F5, + PPC::F6, PPC::F7, PPC::F8, PPC::F9, PPC::F10, + PPC::F11, PPC::F12, PPC::F13}; - return FPR; -} - -/// GetQFPR - Get the set of QPX registers that should be allocated for -/// arguments. -static const MCPhysReg *GetQFPR() { - static const MCPhysReg QFPR[] = { - PPC::QF1, PPC::QF2, PPC::QF3, PPC::QF4, PPC::QF5, PPC::QF6, PPC::QF7, - PPC::QF8, PPC::QF9, PPC::QF10, PPC::QF11, PPC::QF12, PPC::QF13 - }; - - return QFPR; -} +/// QFPR - The set of QPX registers that should be allocated for arguments. +static const MCPhysReg QFPR[] = { + PPC::QF1, PPC::QF2, PPC::QF3, PPC::QF4, PPC::QF5, PPC::QF6, PPC::QF7, + PPC::QF8, PPC::QF9, PPC::QF10, PPC::QF11, PPC::QF12, PPC::QF13}; /// CalculateStackSlotSize - Calculates the size reserved for this argument on /// the stack. 
@@ -2880,9 +2881,6 @@ PPCTargetLowering::LowerFormalArguments_64SVR4( PPC::X3, PPC::X4, PPC::X5, PPC::X6, PPC::X7, PPC::X8, PPC::X9, PPC::X10, }; - - static const MCPhysReg *FPR = GetFPR(); - static const MCPhysReg VR[] = { PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8, PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13 @@ -2892,8 +2890,6 @@ PPCTargetLowering::LowerFormalArguments_64SVR4( PPC::VSH9, PPC::VSH10, PPC::VSH11, PPC::VSH12, PPC::VSH13 }; - static const MCPhysReg *QFPR = GetQFPR(); - const unsigned Num_GPR_Regs = array_lengthof(GPR); const unsigned Num_FPR_Regs = 13; const unsigned Num_VR_Regs = array_lengthof(VR); @@ -3291,9 +3287,6 @@ PPCTargetLowering::LowerFormalArguments_Darwin( PPC::X3, PPC::X4, PPC::X5, PPC::X6, PPC::X7, PPC::X8, PPC::X9, PPC::X10, }; - - static const MCPhysReg *FPR = GetFPR(); - static const MCPhysReg VR[] = { PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8, PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13 @@ -4187,7 +4180,8 @@ PPCTargetLowering::FinishCall(CallingConv::ID CallConv, SDLoc dl, // Add a register mask operand representing the call-preserved registers. const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo(); - const uint32_t *Mask = TRI->getCallPreservedMask(CallConv); + const uint32_t *Mask = + TRI->getCallPreservedMask(DAG.getMachineFunction(), CallConv); assert(Mask && "Missing call preserved mask for calling convention"); Ops.push_back(DAG.getRegisterMask(Mask)); @@ -4582,8 +4576,6 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee, PPC::X3, PPC::X4, PPC::X5, PPC::X6, PPC::X7, PPC::X8, PPC::X9, PPC::X10, }; - static const MCPhysReg *FPR = GetFPR(); - static const MCPhysReg VR[] = { PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8, PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13 @@ -4593,8 +4585,6 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee, PPC::VSH9, PPC::VSH10, PPC::VSH11, PPC::VSH12, PPC::VSH13 }; - static const MCPhysReg *QFPR = GetQFPR(); - const unsigned NumGPRs = array_lengthof(GPR); const unsigned NumFPRs = 13; const unsigned NumVRs = array_lengthof(VR); @@ -5280,8 +5270,6 @@ PPCTargetLowering::LowerCall_Darwin(SDValue Chain, SDValue Callee, PPC::X3, PPC::X4, PPC::X5, PPC::X6, PPC::X7, PPC::X8, PPC::X9, PPC::X10, }; - static const MCPhysReg *FPR = GetFPR(); - static const MCPhysReg VR[] = { PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8, PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13 @@ -6418,7 +6406,7 @@ static SDValue BuildSplatI(int Val, unsigned SplatSize, EVT VT, SelectionDAG &DAG, SDLoc dl) { assert(Val >= -16 && Val <= 15 && "vsplti is out of range!"); - static const EVT VTys[] = { // canonical VT to use for each size. + static const MVT VTys[] = { // canonical VT to use for each size. MVT::v16i8, MVT::v8i16, MVT::Other, MVT::v4i32 }; @@ -7045,7 +7033,7 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, /// altivec comparison. If it is, return true and fill in Opc/isDot with /// information about the intrinsic. 
static bool getAltivecCompareInfo(SDValue Intrin, int &CompareOpc, - bool &isDot) { + bool &isDot, const PPCSubtarget &Subtarget) { unsigned IntrinsicID = cast<ConstantSDNode>(Intrin.getOperand(0))->getZExtValue(); CompareOpc = -1; @@ -7058,29 +7046,83 @@ static bool getAltivecCompareInfo(SDValue Intrin, int &CompareOpc, case Intrinsic::ppc_altivec_vcmpequb_p: CompareOpc = 6; isDot = 1; break; case Intrinsic::ppc_altivec_vcmpequh_p: CompareOpc = 70; isDot = 1; break; case Intrinsic::ppc_altivec_vcmpequw_p: CompareOpc = 134; isDot = 1; break; + case Intrinsic::ppc_altivec_vcmpequd_p: + if (Subtarget.hasP8Altivec()) { + CompareOpc = 199; + isDot = 1; + } + else + return false; + + break; case Intrinsic::ppc_altivec_vcmpgefp_p: CompareOpc = 454; isDot = 1; break; case Intrinsic::ppc_altivec_vcmpgtfp_p: CompareOpc = 710; isDot = 1; break; case Intrinsic::ppc_altivec_vcmpgtsb_p: CompareOpc = 774; isDot = 1; break; case Intrinsic::ppc_altivec_vcmpgtsh_p: CompareOpc = 838; isDot = 1; break; case Intrinsic::ppc_altivec_vcmpgtsw_p: CompareOpc = 902; isDot = 1; break; + case Intrinsic::ppc_altivec_vcmpgtsd_p: + if (Subtarget.hasP8Altivec()) { + CompareOpc = 967; + isDot = 1; + } + else + return false; + + break; case Intrinsic::ppc_altivec_vcmpgtub_p: CompareOpc = 518; isDot = 1; break; case Intrinsic::ppc_altivec_vcmpgtuh_p: CompareOpc = 582; isDot = 1; break; case Intrinsic::ppc_altivec_vcmpgtuw_p: CompareOpc = 646; isDot = 1; break; + case Intrinsic::ppc_altivec_vcmpgtud_p: + if (Subtarget.hasP8Altivec()) { + CompareOpc = 711; + isDot = 1; + } + else + return false; + break; + // Normal Comparisons. case Intrinsic::ppc_altivec_vcmpbfp: CompareOpc = 966; isDot = 0; break; case Intrinsic::ppc_altivec_vcmpeqfp: CompareOpc = 198; isDot = 0; break; case Intrinsic::ppc_altivec_vcmpequb: CompareOpc = 6; isDot = 0; break; case Intrinsic::ppc_altivec_vcmpequh: CompareOpc = 70; isDot = 0; break; case Intrinsic::ppc_altivec_vcmpequw: CompareOpc = 134; isDot = 0; break; + case Intrinsic::ppc_altivec_vcmpequd: + if (Subtarget.hasP8Altivec()) { + CompareOpc = 199; + isDot = 0; + } + else + return false; + + break; case Intrinsic::ppc_altivec_vcmpgefp: CompareOpc = 454; isDot = 0; break; case Intrinsic::ppc_altivec_vcmpgtfp: CompareOpc = 710; isDot = 0; break; case Intrinsic::ppc_altivec_vcmpgtsb: CompareOpc = 774; isDot = 0; break; case Intrinsic::ppc_altivec_vcmpgtsh: CompareOpc = 838; isDot = 0; break; case Intrinsic::ppc_altivec_vcmpgtsw: CompareOpc = 902; isDot = 0; break; + case Intrinsic::ppc_altivec_vcmpgtsd: + if (Subtarget.hasP8Altivec()) { + CompareOpc = 967; + isDot = 0; + } + else + return false; + + break; case Intrinsic::ppc_altivec_vcmpgtub: CompareOpc = 518; isDot = 0; break; case Intrinsic::ppc_altivec_vcmpgtuh: CompareOpc = 582; isDot = 0; break; case Intrinsic::ppc_altivec_vcmpgtuw: CompareOpc = 646; isDot = 0; break; + case Intrinsic::ppc_altivec_vcmpgtud: + if (Subtarget.hasP8Altivec()) { + CompareOpc = 711; + isDot = 0; + } + else + return false; + + break; } return true; } @@ -7094,7 +7136,7 @@ SDValue PPCTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SDLoc dl(Op); int CompareOpc; bool isDot; - if (!getAltivecCompareInfo(Op, CompareOpc, isDot)) + if (!getAltivecCompareInfo(Op, CompareOpc, isDot, Subtarget)) return SDValue(); // Don't custom lower most intrinsics. // If this is a non-dot comparison, make the VCMP node and we are done. 
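Each new v2i64 case in getAltivecCompareInfo repeats the same guard: bail out unless the subtarget has POWER8 Altivec, since the doubleword compares (vcmpequd and friends) only exist there. The shape of that logic, factored into a hypothetical helper purely to make the pattern explicit (not something the patch does):

// Sketch only: a compact equivalent of the repeated P8-Altivec gating.
static bool setCmpIfP8(const PPCSubtarget &Subtarget, int Opc, bool Dot,
                       int &CompareOpc, bool &isDot) {
  if (!Subtarget.hasP8Altivec())
    return false; // v2i64 vector compares require POWER8 Altivec.
  CompareOpc = Opc;
  isDot = Dot;
  return true;
}

// The vcmpequd_p case would then read:
//   case Intrinsic::ppc_altivec_vcmpequd_p:
//     return setCmpIfP8(Subtarget, 199, true, CompareOpc, isDot);

The patch keeps the switch-case form instead, which stays closer to the existing word-sized cases.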
@@ -7738,10 +7780,36 @@ Instruction* PPCTargetLowering::emitTrailingFence(IRBuilder<> &Builder,
 
 MachineBasicBlock *
 PPCTargetLowering::EmitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB,
-                                    bool is64bit, unsigned BinOpcode) const {
+                                    unsigned AtomicSize,
+                                    unsigned BinOpcode) const {
   // This also handles ATOMIC_SWAP, indicated by BinOpcode==0.
   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
 
+  auto LoadMnemonic = PPC::LDARX;
+  auto StoreMnemonic = PPC::STDCX;
+  switch (AtomicSize) {
+  default:
+    llvm_unreachable("Unexpected size of atomic entity");
+  case 1:
+    LoadMnemonic = PPC::LBARX;
+    StoreMnemonic = PPC::STBCX;
+    assert(Subtarget.hasPartwordAtomics() && "No support for partword atomics.");
+    break;
+  case 2:
+    LoadMnemonic = PPC::LHARX;
+    StoreMnemonic = PPC::STHCX;
+    assert(Subtarget.hasPartwordAtomics() && "No support for partword atomics.");
+    break;
+  case 4:
+    LoadMnemonic = PPC::LWARX;
+    StoreMnemonic = PPC::STWCX;
+    break;
+  case 8:
+    LoadMnemonic = PPC::LDARX;
+    StoreMnemonic = PPC::STDCX;
+    break;
+  }
+
   const BasicBlock *LLVM_BB = BB->getBasicBlock();
   MachineFunction *F = BB->getParent();
   MachineFunction::iterator It = BB;
@@ -7763,7 +7831,7 @@ PPCTargetLowering::EmitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB,
   MachineRegisterInfo &RegInfo = F->getRegInfo();
   unsigned TmpReg = (!BinOpcode) ? incr :
-    RegInfo.createVirtualRegister( is64bit ? &PPC::G8RCRegClass
+    RegInfo.createVirtualRegister( AtomicSize == 8 ? &PPC::G8RCRegClass
                                            : &PPC::GPRCRegClass);
 
   //  thisMBB:
@@ -7778,11 +7846,11 @@ PPCTargetLowering::EmitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB,
   //   bne- loopMBB
   //   fallthrough --> exitMBB
   BB = loopMBB;
-  BuildMI(BB, dl, TII->get(is64bit ? PPC::LDARX : PPC::LWARX), dest)
+  BuildMI(BB, dl, TII->get(LoadMnemonic), dest)
     .addReg(ptrA).addReg(ptrB);
   if (BinOpcode)
     BuildMI(BB, dl, TII->get(BinOpcode), TmpReg).addReg(incr).addReg(dest);
-  BuildMI(BB, dl, TII->get(is64bit ? PPC::STDCX : PPC::STWCX))
+  BuildMI(BB, dl, TII->get(StoreMnemonic))
    .addReg(TmpReg).addReg(ptrA).addReg(ptrB);
   BuildMI(BB, dl, TII->get(PPC::BCC))
     .addImm(PPC::PRED_NE).addReg(PPC::CR0).addMBB(loopMBB);
@@ -7800,6 +7868,10 @@ PPCTargetLowering::EmitPartwordAtomicBinary(MachineInstr *MI,
                                             MachineBasicBlock *BB,
                                             bool is8bit, // operation
                                             unsigned BinOpcode) const {
+  // If we support part-word atomic mnemonics, just use them
+  if (Subtarget.hasPartwordAtomics())
+    return EmitAtomicBinary(MI, BB, is8bit ? 1 : 2, BinOpcode);
+
   // This also handles ATOMIC_SWAP, indicated by BinOpcode==0.
const TargetInstrInfo *TII = Subtarget.getInstrInfo(); // In 64 bit mode we have to use 64 bits for addresses, even though the @@ -8365,68 +8437,96 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, else if (MI->getOpcode() == PPC::ATOMIC_LOAD_ADD_I16) BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::ADD4); else if (MI->getOpcode() == PPC::ATOMIC_LOAD_ADD_I32) - BB = EmitAtomicBinary(MI, BB, false, PPC::ADD4); + BB = EmitAtomicBinary(MI, BB, 4, PPC::ADD4); else if (MI->getOpcode() == PPC::ATOMIC_LOAD_ADD_I64) - BB = EmitAtomicBinary(MI, BB, true, PPC::ADD8); + BB = EmitAtomicBinary(MI, BB, 8, PPC::ADD8); else if (MI->getOpcode() == PPC::ATOMIC_LOAD_AND_I8) BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::AND); else if (MI->getOpcode() == PPC::ATOMIC_LOAD_AND_I16) BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::AND); else if (MI->getOpcode() == PPC::ATOMIC_LOAD_AND_I32) - BB = EmitAtomicBinary(MI, BB, false, PPC::AND); + BB = EmitAtomicBinary(MI, BB, 4, PPC::AND); else if (MI->getOpcode() == PPC::ATOMIC_LOAD_AND_I64) - BB = EmitAtomicBinary(MI, BB, true, PPC::AND8); + BB = EmitAtomicBinary(MI, BB, 8, PPC::AND8); else if (MI->getOpcode() == PPC::ATOMIC_LOAD_OR_I8) BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::OR); else if (MI->getOpcode() == PPC::ATOMIC_LOAD_OR_I16) BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::OR); else if (MI->getOpcode() == PPC::ATOMIC_LOAD_OR_I32) - BB = EmitAtomicBinary(MI, BB, false, PPC::OR); + BB = EmitAtomicBinary(MI, BB, 4, PPC::OR); else if (MI->getOpcode() == PPC::ATOMIC_LOAD_OR_I64) - BB = EmitAtomicBinary(MI, BB, true, PPC::OR8); + BB = EmitAtomicBinary(MI, BB, 8, PPC::OR8); else if (MI->getOpcode() == PPC::ATOMIC_LOAD_XOR_I8) BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::XOR); else if (MI->getOpcode() == PPC::ATOMIC_LOAD_XOR_I16) BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::XOR); else if (MI->getOpcode() == PPC::ATOMIC_LOAD_XOR_I32) - BB = EmitAtomicBinary(MI, BB, false, PPC::XOR); + BB = EmitAtomicBinary(MI, BB, 4, PPC::XOR); else if (MI->getOpcode() == PPC::ATOMIC_LOAD_XOR_I64) - BB = EmitAtomicBinary(MI, BB, true, PPC::XOR8); + BB = EmitAtomicBinary(MI, BB, 8, PPC::XOR8); else if (MI->getOpcode() == PPC::ATOMIC_LOAD_NAND_I8) BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::NAND); else if (MI->getOpcode() == PPC::ATOMIC_LOAD_NAND_I16) BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::NAND); else if (MI->getOpcode() == PPC::ATOMIC_LOAD_NAND_I32) - BB = EmitAtomicBinary(MI, BB, false, PPC::NAND); + BB = EmitAtomicBinary(MI, BB, 4, PPC::NAND); else if (MI->getOpcode() == PPC::ATOMIC_LOAD_NAND_I64) - BB = EmitAtomicBinary(MI, BB, true, PPC::NAND8); + BB = EmitAtomicBinary(MI, BB, 8, PPC::NAND8); else if (MI->getOpcode() == PPC::ATOMIC_LOAD_SUB_I8) BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::SUBF); else if (MI->getOpcode() == PPC::ATOMIC_LOAD_SUB_I16) BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::SUBF); else if (MI->getOpcode() == PPC::ATOMIC_LOAD_SUB_I32) - BB = EmitAtomicBinary(MI, BB, false, PPC::SUBF); + BB = EmitAtomicBinary(MI, BB, 4, PPC::SUBF); else if (MI->getOpcode() == PPC::ATOMIC_LOAD_SUB_I64) - BB = EmitAtomicBinary(MI, BB, true, PPC::SUBF8); + BB = EmitAtomicBinary(MI, BB, 8, PPC::SUBF8); else if (MI->getOpcode() == PPC::ATOMIC_SWAP_I8) BB = EmitPartwordAtomicBinary(MI, BB, true, 0); else if (MI->getOpcode() == PPC::ATOMIC_SWAP_I16) BB = EmitPartwordAtomicBinary(MI, BB, false, 0); else if (MI->getOpcode() == PPC::ATOMIC_SWAP_I32) - BB = EmitAtomicBinary(MI, BB, false, 0); + BB = EmitAtomicBinary(MI, BB, 4, 
0);
   else if (MI->getOpcode() == PPC::ATOMIC_SWAP_I64)
-    BB = EmitAtomicBinary(MI, BB, true, 0);
+    BB = EmitAtomicBinary(MI, BB, 8, 0);
 
   else if (MI->getOpcode() == PPC::ATOMIC_CMP_SWAP_I32 ||
-           MI->getOpcode() == PPC::ATOMIC_CMP_SWAP_I64) {
+           MI->getOpcode() == PPC::ATOMIC_CMP_SWAP_I64 ||
+           (Subtarget.hasPartwordAtomics() &&
+            MI->getOpcode() == PPC::ATOMIC_CMP_SWAP_I8) ||
+           (Subtarget.hasPartwordAtomics() &&
+            MI->getOpcode() == PPC::ATOMIC_CMP_SWAP_I16)) {
     bool is64bit = MI->getOpcode() == PPC::ATOMIC_CMP_SWAP_I64;
 
+    auto LoadMnemonic = PPC::LDARX;
+    auto StoreMnemonic = PPC::STDCX;
+    switch(MI->getOpcode()) {
+    default:
+      llvm_unreachable("Compare and swap of unknown size");
+    case PPC::ATOMIC_CMP_SWAP_I8:
+      LoadMnemonic = PPC::LBARX;
+      StoreMnemonic = PPC::STBCX;
+      assert(Subtarget.hasPartwordAtomics() && "No support for partword atomics.");
+      break;
+    case PPC::ATOMIC_CMP_SWAP_I16:
+      LoadMnemonic = PPC::LHARX;
+      StoreMnemonic = PPC::STHCX;
+      assert(Subtarget.hasPartwordAtomics() && "No support for partword atomics.");
+      break;
+    case PPC::ATOMIC_CMP_SWAP_I32:
+      LoadMnemonic = PPC::LWARX;
+      StoreMnemonic = PPC::STWCX;
+      break;
+    case PPC::ATOMIC_CMP_SWAP_I64:
+      LoadMnemonic = PPC::LDARX;
+      StoreMnemonic = PPC::STDCX;
+      break;
+    }
     unsigned dest = MI->getOperand(0).getReg();
     unsigned ptrA = MI->getOperand(1).getReg();
     unsigned ptrB = MI->getOperand(2).getReg();
@@ -8452,18 +8552,18 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
     BB->addSuccessor(loop1MBB);
 
     // loop1MBB:
-    //   l[wd]arx dest, ptr
+    //   l[bhwd]arx dest, ptr
     //   cmp[wd] dest, oldval
     //   bne- midMBB
     // loop2MBB:
-    //   st[wd]cx. newval, ptr
+    //   st[bhwd]cx. newval, ptr
     //   bne- loopMBB
     //   b exitBB
     // midMBB:
-    //   st[wd]cx. dest, ptr
+    //   st[bhwd]cx. dest, ptr
     // exitBB:
     BB = loop1MBB;
-    BuildMI(BB, dl, TII->get(is64bit ? PPC::LDARX : PPC::LWARX), dest)
+    BuildMI(BB, dl, TII->get(LoadMnemonic), dest)
       .addReg(ptrA).addReg(ptrB);
     BuildMI(BB, dl, TII->get(is64bit ? PPC::CMPD : PPC::CMPW), PPC::CR0)
       .addReg(oldval).addReg(dest);
@@ -8473,7 +8573,7 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
     BB->addSuccessor(midMBB);
 
     BB = loop2MBB;
-    BuildMI(BB, dl, TII->get(is64bit ? PPC::STDCX : PPC::STWCX))
+    BuildMI(BB, dl, TII->get(StoreMnemonic))
       .addReg(newval).addReg(ptrA).addReg(ptrB);
     BuildMI(BB, dl, TII->get(PPC::BCC))
       .addImm(PPC::PRED_NE).addReg(PPC::CR0).addMBB(loop1MBB);
@@ -8482,7 +8582,7 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
     BB->addSuccessor(exitMBB);
 
     BB = midMBB;
-    BuildMI(BB, dl, TII->get(is64bit ? PPC::STDCX : PPC::STWCX))
+    BuildMI(BB, dl, TII->get(StoreMnemonic))
       .addReg(dest).addReg(ptrA).addReg(ptrB);
     BB->addSuccessor(exitMBB);
 
@@ -8682,6 +8782,12 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
     BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY),
             MI->getOperand(0).getReg())
       .addReg(isEQ ?
PPC::CR0EQ : PPC::CR0GT); + } else if (MI->getOpcode() == PPC::TCHECK_RET) { + DebugLoc Dl = MI->getDebugLoc(); + MachineRegisterInfo &RegInfo = F->getRegInfo(); + unsigned CRReg = RegInfo.createVirtualRegister(&PPC::CRRCRegClass); + BuildMI(*BB, MI, Dl, TII->get(PPC::TCHECK), CRReg); + return BB; } else { llvm_unreachable("Unexpected instr type to insert"); } @@ -10184,7 +10290,7 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, if (LHS.getOpcode() == ISD::INTRINSIC_WO_CHAIN && isa<ConstantSDNode>(RHS) && (CC == ISD::SETEQ || CC == ISD::SETNE) && - getAltivecCompareInfo(LHS, CompareOpc, isDot)) { + getAltivecCompareInfo(LHS, CompareOpc, isDot, Subtarget)) { assert(isDot && "Can't compare against a vector result!"); // If this is a comparison against something other than 0/1, then we know @@ -10297,14 +10403,17 @@ void PPCTargetLowering::computeKnownBitsForTargetNode(const SDValue Op, case Intrinsic::ppc_altivec_vcmpequb_p: case Intrinsic::ppc_altivec_vcmpequh_p: case Intrinsic::ppc_altivec_vcmpequw_p: + case Intrinsic::ppc_altivec_vcmpequd_p: case Intrinsic::ppc_altivec_vcmpgefp_p: case Intrinsic::ppc_altivec_vcmpgtfp_p: case Intrinsic::ppc_altivec_vcmpgtsb_p: case Intrinsic::ppc_altivec_vcmpgtsh_p: case Intrinsic::ppc_altivec_vcmpgtsw_p: + case Intrinsic::ppc_altivec_vcmpgtsd_p: case Intrinsic::ppc_altivec_vcmpgtub_p: case Intrinsic::ppc_altivec_vcmpgtuh_p: case Intrinsic::ppc_altivec_vcmpgtuw_p: + case Intrinsic::ppc_altivec_vcmpgtud_p: KnownZero = ~1U; // All bits but the low one are known to be zero. break; } @@ -10914,11 +11023,27 @@ EVT PPCTargetLowering::getOptimalMemOpType(uint64_t Size, bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc, MachineFunction &MF) const { + const Function *F = MF.getFunction(); + // When expanding a memset, require at least two QPX instructions to cover + // the cost of loading the value to be stored from the constant pool. + if (Subtarget.hasQPX() && Size >= 32 && (!IsMemset || Size >= 64) && + (!SrcAlign || SrcAlign >= 32) && (!DstAlign || DstAlign >= 32) && + !F->hasFnAttribute(Attribute::NoImplicitFloat)) { + return MVT::v4f64; + } + + // We should use Altivec/VSX loads and stores when available. For unaligned + // addresses, unaligned VSX loads are only fast starting with the P8. + if (Subtarget.hasAltivec() && Size >= 16 && + (((!SrcAlign || SrcAlign >= 16) && (!DstAlign || DstAlign >= 16)) || + ((IsMemset && Subtarget.hasVSX()) || Subtarget.hasP8Vector()))) + return MVT::v4i32; + if (Subtarget.isPPC64()) { return MVT::i64; - } else { - return MVT::i32; } + + return MVT::i32; } /// \brief Returns true if it is beneficial to convert a load of a constant diff --git a/lib/Target/PowerPC/PPCISelLowering.h b/lib/Target/PowerPC/PPCISelLowering.h index 04afe88..8afd7ef 100644 --- a/lib/Target/PowerPC/PPCISelLowering.h +++ b/lib/Target/PowerPC/PPCISelLowering.h @@ -166,14 +166,6 @@ namespace llvm { /// F8RC = MFFS - This moves the FPSCR (not modeled) into the register. MFFS, - /// LARX = This corresponds to PPC l{w|d}arx instrcution: load and - /// reserve indexed. This is used to implement atomic operations. - LARX, - - /// STCX = This corresponds to PPC stcx. instrcution: store conditional - /// indexed. This is used to implement atomic operations. - STCX, - /// TC_RETURN - A tail call return. 
/// operand #0 chain /// operand #1 callee (register or absolute) @@ -489,7 +481,8 @@ namespace llvm { EmitInstrWithCustomInserter(MachineInstr *MI, MachineBasicBlock *MBB) const override; MachineBasicBlock *EmitAtomicBinary(MachineInstr *MI, - MachineBasicBlock *MBB, bool is64Bit, + MachineBasicBlock *MBB, + unsigned AtomicSize, unsigned BinOpcode) const; MachineBasicBlock *EmitPartwordAtomicBinary(MachineInstr *MI, MachineBasicBlock *MBB, @@ -526,6 +519,21 @@ namespace llvm { std::vector<SDValue> &Ops, SelectionDAG &DAG) const override; + unsigned getInlineAsmMemConstraint( + const std::string &ConstraintCode) const override { + if (ConstraintCode == "es") + return InlineAsm::Constraint_es; + else if (ConstraintCode == "o") + return InlineAsm::Constraint_o; + else if (ConstraintCode == "Q") + return InlineAsm::Constraint_Q; + else if (ConstraintCode == "Z") + return InlineAsm::Constraint_Z; + else if (ConstraintCode == "Zy") + return InlineAsm::Constraint_Zy; + return TargetLowering::getInlineAsmMemConstraint(ConstraintCode); + } + /// isLegalAddressingMode - Return true if the addressing mode represented /// by AM is legal for this target, for a load/store of the specified type. bool isLegalAddressingMode(const AddrMode &AM, Type *Ty) const override; diff --git a/lib/Target/PowerPC/PPCInstr64Bit.td b/lib/Target/PowerPC/PPCInstr64Bit.td index 69c0d7d..183d088 100644 --- a/lib/Target/PowerPC/PPCInstr64Bit.td +++ b/lib/Target/PowerPC/PPCInstr64Bit.td @@ -235,15 +235,19 @@ let usesCustomInserter = 1 in { } // Instructions to support atomic operations +let mayLoad = 1, hasSideEffects = 0 in { def LDARX : XForm_1<31, 84, (outs g8rc:$rD), (ins memrr:$ptr), - "ldarx $rD, $ptr", IIC_LdStLDARX, - [(set i64:$rD, (PPClarx xoaddr:$ptr))]>; + "ldarx $rD, $ptr", IIC_LdStLDARX, []>; + +// Instruction to support lock versions of atomics +// (EH=1 - see Power ISA 2.07 Book II 4.4.2) +def LDARXL : XForm_1<31, 84, (outs g8rc:$rD), (ins memrr:$ptr), + "ldarx $rD, $ptr, 1", IIC_LdStLDARX, []>, isDOT; +} -let Defs = [CR0] in +let Defs = [CR0], mayStore = 1, hasSideEffects = 0 in def STDCX : XForm_1<31, 214, (outs), (ins g8rc:$rS, memrr:$dst), - "stdcx. $rS, $dst", IIC_LdStSTDCX, - [(PPCstcx i64:$rS, xoaddr:$dst)]>, - isDOT; + "stdcx. $rS, $dst", IIC_LdStSTDCX, []>, isDOT; let Interpretation64Bit = 1, isCodeGenOnly = 1 in { let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [RM] in @@ -325,6 +329,12 @@ let hasSideEffects = 1, isBarrier = 1, usesCustomInserter = 1 in { Requires<[In64BitMode]>; } +def MFSPR8 : XFXForm_1<31, 339, (outs g8rc:$RT), (ins i32imm:$SPR), + "mfspr $RT, $SPR", IIC_SprMFSPR>; +def MTSPR8 : XFXForm_1<31, 467, (outs), (ins i32imm:$SPR, g8rc:$RT), + "mtspr $SPR, $RT", IIC_SprMTSPR>; + + //===----------------------------------------------------------------------===// // 64-bit SPR manipulation instrs. @@ -696,7 +706,7 @@ def ISEL8 : AForm_4<31, 15, // Sign extending loads. -let canFoldAsLoad = 1, PPC970_Unit = 2 in { +let PPC970_Unit = 2 in { let Interpretation64Bit = 1, isCodeGenOnly = 1 in def LHA8: DForm_1<42, (outs g8rc:$rD), (ins memri:$src), "lha $rD, $src", IIC_LdStLHA, @@ -752,7 +762,7 @@ def LWAUX : XForm_1<31, 373, (outs g8rc:$rD, ptr_rc_nor0:$ea_result), let Interpretation64Bit = 1, isCodeGenOnly = 1 in { // Zero extending loads. 
-let canFoldAsLoad = 1, PPC970_Unit = 2 in { +let PPC970_Unit = 2 in { def LBZ8 : DForm_1<34, (outs g8rc:$rD), (ins memri:$src), "lbz $rD, $src", IIC_LdStLoad, [(set i64:$rD, (zextloadi8 iaddr:$src))]>; @@ -810,7 +820,7 @@ def LWZUX8 : XForm_1<31, 55, (outs g8rc:$rD, ptr_rc_nor0:$ea_result), // Full 8-byte loads. -let canFoldAsLoad = 1, PPC970_Unit = 2 in { +let PPC970_Unit = 2 in { def LD : DSForm_1<58, 0, (outs g8rc:$rD), (ins memrix:$src), "ld $rD, $src", IIC_LdStLD, [(set i64:$rD, (aligned4load ixaddr:$src))]>, isPPC64; diff --git a/lib/Target/PowerPC/PPCInstrAltivec.td b/lib/Target/PowerPC/PPCInstrAltivec.td index f6acd6e..123808b 100644 --- a/lib/Target/PowerPC/PPCInstrAltivec.td +++ b/lib/Target/PowerPC/PPCInstrAltivec.td @@ -269,6 +269,16 @@ class VX2_Int_Ty2<bits<11> xo, string opc, Intrinsic IntID, ValueType OutTy, !strconcat(opc, " $vD, $vB"), IIC_VecFP, [(set OutTy:$vD, (IntID InTy:$vB))]>; +class VXBX_Int_Ty<bits<11> xo, string opc, Intrinsic IntID, ValueType Ty> + : VXForm_BX<xo, (outs vrrc:$vD), (ins vrrc:$vA), + !strconcat(opc, " $vD, $vA"), IIC_VecFP, + [(set Ty:$vD, (IntID Ty:$vA))]>; + +class VXCR_Int_Ty<bits<11> xo, string opc, Intrinsic IntID, ValueType Ty> + : VXForm_CR<xo, (outs vrrc:$vD), (ins vrrc:$vA, u1imm:$ST, u4imm:$SIX), + !strconcat(opc, " $vD, $vA, $ST, $SIX"), IIC_VecFP, + [(set Ty:$vD, (IntID Ty:$vA, imm:$ST, imm:$SIX))]>; + //===----------------------------------------------------------------------===// // Instruction Definitions. @@ -342,7 +352,7 @@ def MTVSCR : VXForm_5<1604, (outs), (ins vrrc:$vB), "mtvscr $vB", IIC_LdStLoad, [(int_ppc_altivec_mtvscr v4i32:$vB)]>; -let canFoldAsLoad = 1, PPC970_Unit = 2 in { // Loads. +let PPC970_Unit = 2 in { // Loads. def LVEBX: XForm_1<31, 7, (outs vrrc:$vD), (ins memrr:$src), "lvebx $vD, $src", IIC_LdStLoad, [(set v16i8:$vD, (int_ppc_altivec_lvebx xoaddr:$src))]>; @@ -750,7 +760,7 @@ def VCMPGTSW : VCMP <902, "vcmpgtsw $vD, $vA, $vB" , v4i32>; def VCMPGTSWo : VCMPo<902, "vcmpgtsw. $vD, $vA, $vB", v4i32>; def VCMPGTUW : VCMP <646, "vcmpgtuw $vD, $vA, $vB" , v4i32>; def VCMPGTUWo : VCMPo<646, "vcmpgtuw. 
$vD, $vA, $vB", v4i32>; - + let isCodeGenOnly = 1 in { def V_SET0B : VXForm_setzero<1220, (outs vrrc:$vD), (ins), "vxor $vD, $vD, $vD", IIC_VecFP, @@ -939,8 +949,50 @@ def : Pat<(v4f32 (fnearbyint v4f32:$vA)), } // end HasAltivec def HasP8Altivec : Predicate<"PPCSubTarget->hasP8Altivec()">; +def HasP8Crypto : Predicate<"PPCSubTarget->hasP8Crypto()">; let Predicates = [HasP8Altivec] in { +let isCommutable = 1 in { +def VMULESW : VX1_Int_Ty2<904, "vmulesw", int_ppc_altivec_vmulesw, + v2i64, v4i32>; +def VMULEUW : VX1_Int_Ty2<648, "vmuleuw", int_ppc_altivec_vmuleuw, + v2i64, v4i32>; +def VMULOSW : VX1_Int_Ty2<392, "vmulosw", int_ppc_altivec_vmulosw, + v2i64, v4i32>; +def VMULOUW : VX1_Int_Ty2<136, "vmulouw", int_ppc_altivec_vmulouw, + v2i64, v4i32>; +def VMULUWM : VXForm_1<137, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), + "vmuluwm $vD, $vA, $vB", IIC_VecGeneral, + [(set v4i32:$vD, (mul v4i32:$vA, v4i32:$vB))]>; +def VMAXSD : VX1_Int_Ty<450, "vmaxsd", int_ppc_altivec_vmaxsd, v2i64>; +def VMAXUD : VX1_Int_Ty<194, "vmaxud", int_ppc_altivec_vmaxud, v2i64>; +def VMINSD : VX1_Int_Ty<962, "vminsd", int_ppc_altivec_vminsd, v2i64>; +def VMINUD : VX1_Int_Ty<706, "vminud", int_ppc_altivec_vminud, v2i64>; +} // isCommutable + +// Vector shifts +def VRLD : VX1_Int_Ty<196, "vrld", int_ppc_altivec_vrld, v2i64>; +def VSLD : VXForm_1<1476, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), + "vsld $vD, $vA, $vB", IIC_VecGeneral, + [(set v2i64:$vD, (shl v2i64:$vA, v2i64:$vB))]>; +def VSRD : VXForm_1<1732, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), + "vsrd $vD, $vA, $vB", IIC_VecGeneral, + [(set v2i64:$vD, (srl v2i64:$vA, v2i64:$vB))]>; +def VSRAD : VXForm_1<964, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), + "vsrad $vD, $vA, $vB", IIC_VecGeneral, + [(set v2i64:$vD, (sra v2i64:$vA, v2i64:$vB))]>; + +// Vector Integer Arithmetic Instructions +let isCommutable = 1 in { +def VADDUDM : VXForm_1<192, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), + "vaddudm $vD, $vA, $vB", IIC_VecGeneral, + [(set v2i64:$vD, (add v2i64:$vA, v2i64:$vB))]>; +} // isCommutable + +def VSUBUDM : VXForm_1<1216, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), + "vsubudm $vD, $vA, $vB", IIC_VecGeneral, + [(set v2i64:$vD, (sub v2i64:$vA, v2i64:$vB))]>; + // Count Leading Zeros def VCLZB : VXForm_2<1794, (outs vrrc:$vD), (ins vrrc:$vB), "vclzb $vD, $vB", IIC_VecGeneral, @@ -992,4 +1044,42 @@ def VORC : VXForm_1<1348, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), "vorc $vD, $vA, $vB", IIC_VecGeneral, [(set v4i32:$vD, (or v4i32:$vA, (vnot_ppc v4i32:$vB)))]>; + +// i64 element comparisons. +def VCMPEQUD : VCMP <199, "vcmpequd $vD, $vA, $vB" , v2i64>; +def VCMPEQUDo : VCMPo<199, "vcmpequd. $vD, $vA, $vB", v2i64>; +def VCMPGTSD : VCMP <967, "vcmpgtsd $vD, $vA, $vB" , v2i64>; +def VCMPGTSDo : VCMPo<967, "vcmpgtsd. $vD, $vA, $vB", v2i64>; +def VCMPGTUD : VCMP <711, "vcmpgtud $vD, $vA, $vB" , v2i64>; +def VCMPGTUDo : VCMPo<711, "vcmpgtud. 
$vD, $vA, $vB", v2i64>; + +// The cryptography instructions that do not require Category:Vector.Crypto +def VPMSUMB : VX1_Int_Ty<1032, "vpmsumb", + int_ppc_altivec_crypto_vpmsumb, v16i8>; +def VPMSUMH : VX1_Int_Ty<1096, "vpmsumh", + int_ppc_altivec_crypto_vpmsumh, v8i16>; +def VPMSUMW : VX1_Int_Ty<1160, "vpmsumw", + int_ppc_altivec_crypto_vpmsumw, v4i32>; +def VPMSUMD : VX1_Int_Ty<1224, "vpmsumd", + int_ppc_altivec_crypto_vpmsumd, v2i64>; +def VPERMXOR : VA1a_Int_Ty<45, "vpermxor", + int_ppc_altivec_crypto_vpermxor, v16i8>; + } // end HasP8Altivec + +// Crypto instructions (from builtins) +let Predicates = [HasP8Crypto] in { +def VSHASIGMAW : VXCR_Int_Ty<1666, "vshasigmaw", + int_ppc_altivec_crypto_vshasigmaw, v4i32>; +def VSHASIGMAD : VXCR_Int_Ty<1730, "vshasigmad", + int_ppc_altivec_crypto_vshasigmad, v2i64>; +def VCIPHER : VX1_Int_Ty<1288, "vcipher", int_ppc_altivec_crypto_vcipher, + v2i64>; +def VCIPHERLAST : VX1_Int_Ty<1289, "vcipherlast", + int_ppc_altivec_crypto_vcipherlast, v2i64>; +def VNCIPHER : VX1_Int_Ty<1352, "vncipher", + int_ppc_altivec_crypto_vncipher, v2i64>; +def VNCIPHERLAST : VX1_Int_Ty<1353, "vncipherlast", + int_ppc_altivec_crypto_vncipherlast, v2i64>; +def VSBOX : VXBX_Int_Ty<1480, "vsbox", int_ppc_altivec_crypto_vsbox, v2i64>; +} // HasP8Crypto diff --git a/lib/Target/PowerPC/PPCInstrFormats.td b/lib/Target/PowerPC/PPCInstrFormats.td index 506a2d0..b7a7a1f 100644 --- a/lib/Target/PowerPC/PPCInstrFormats.td +++ b/lib/Target/PowerPC/PPCInstrFormats.td @@ -693,6 +693,60 @@ class XForm_16b<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, let A = 0; } +class XForm_htm0<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, + string asmstr, InstrItinClass itin, list<dag> pattern> + : I<opcode, OOL, IOL, asmstr, itin> { + bit R; + + bit RC = 1; + + let Inst{6-9} = 0; + let Inst{10} = R; + let Inst{11-20} = 0; + let Inst{21-30} = xo; + let Inst{31} = RC; +} + +class XForm_htm1<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, + string asmstr, InstrItinClass itin, list<dag> pattern> + : I<opcode, OOL, IOL, asmstr, itin> { + bit A; + + bit RC = 1; + + let Inst{6} = A; + let Inst{7-20} = 0; + let Inst{21-30} = xo; + let Inst{31} = RC; +} + +class XForm_htm2<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : I<opcode, OOL, IOL, asmstr, itin> { + bit L; + + bit RC = 0; // set by isDOT + + let Inst{7-9} = 0; + let Inst{10} = L; + let Inst{11-20} = 0; + let Inst{21-30} = xo; + let Inst{31} = RC; +} + +class XForm_htm3<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : I<opcode, OOL, IOL, asmstr, itin> { + bits<3> BF; + + bit RC = 0; + + let Inst{6-8} = BF; + let Inst{9-20} = 0; + let Inst{21-30} = xo; + let Inst{31} = RC; +} + // XX*-Form (VSX) class XX1Form<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, InstrItinClass itin, list<dag> pattern> @@ -1470,6 +1524,39 @@ class VXForm_5<bits<11> xo, dag OOL, dag IOL, string asmstr, let Inst{21-31} = xo; } +/// VXForm_CR - VX crypto instructions with "VRT, VRA, ST, SIX" +class VXForm_CR<bits<11> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : I<4, OOL, IOL, asmstr, itin> { + bits<5> VD; + bits<5> VA; + bits<1> ST; + bits<4> SIX; + + let Pattern = pattern; + + let Inst{6-10} = VD; + let Inst{11-15} = VA; + let Inst{16} = ST; + let Inst{17-20} = SIX; + let Inst{21-31} = xo; +} + +/// VXForm_BX - VX crypto instructions with "VRT, VRA, 0 - like vsbox" +class VXForm_BX<bits<11> 
xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : I<4, OOL, IOL, asmstr, itin> { + bits<5> VD; + bits<5> VA; + + let Pattern = pattern; + + let Inst{6-10} = VD; + let Inst{11-15} = VA; + let Inst{16-20} = 0; + let Inst{21-31} = xo; +} + // E-4 VXR-Form class VXRForm_1<bits<10> xo, dag OOL, dag IOL, string asmstr, InstrItinClass itin, list<dag> pattern> diff --git a/lib/Target/PowerPC/PPCInstrHTM.td b/lib/Target/PowerPC/PPCInstrHTM.td new file mode 100644 index 0000000..20e6a62 --- /dev/null +++ b/lib/Target/PowerPC/PPCInstrHTM.td @@ -0,0 +1,172 @@ +//===-- PPCInstrHTM.td - The PowerPC Hardware Transactional Memory -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes the Hardware Transactional Memory extension to the +// PowerPC instruction set. +// +//===----------------------------------------------------------------------===// + + + +def HasHTM : Predicate<"PPCSubTarget->hasHTM()">; + +def HTM_get_imm : SDNodeXForm<imm, [{ + return getI32Imm (N->getZExtValue()); +}]>; + +let hasSideEffects = 1, usesCustomInserter = 1 in { +def TCHECK_RET : Pseudo<(outs crrc:$out), (ins), "#TCHECK_RET", []>; +} + + +let Predicates = [HasHTM] in { + +def TBEGIN : XForm_htm0 <31, 654, + (outs crrc0:$ret), (ins u1imm:$R), "tbegin. $R", IIC_SprMTSPR, []>; + +def TEND : XForm_htm1 <31, 686, + (outs crrc0:$ret), (ins u1imm:$A), "tend. $A", IIC_SprMTSPR, []>; + +def TABORT : XForm_base_r3xo <31, 910, + (outs crrc0:$ret), (ins gprc:$A), "tabort. $A", IIC_SprMTSPR, + []>, isDOT { + let RST = 0; + let B = 0; +} + +def TABORTWC : XForm_base_r3xo <31, 782, + (outs crrc0:$ret), (ins u5imm:$RTS, gprc:$A, gprc:$B), + "tabortwc. $RTS, $A, $B", IIC_SprMTSPR, []>, + isDOT; + +def TABORTWCI : XForm_base_r3xo <31, 846, + (outs crrc0:$ret), (ins u5imm:$RTS, gprc:$A, u5imm:$B), + "tabortwci. $RTS, $A, $B", IIC_SprMTSPR, []>, + isDOT; + +def TABORTDC : XForm_base_r3xo <31, 814, + (outs crrc0:$ret), (ins u5imm:$RTS, gprc:$A, gprc:$B), + "tabortdc. $RTS, $A, $B", IIC_SprMTSPR, []>, + isDOT; + +def TABORTDCI : XForm_base_r3xo <31, 878, + (outs crrc0:$ret), (ins u5imm:$RTS, gprc:$A, u5imm:$B), + "tabortdci. $RTS, $A, $B", IIC_SprMTSPR, []>, + isDOT; + +def TSR : XForm_htm2 <31, 750, + (outs crrc0:$ret), (ins u1imm:$L), "tsr. $L", IIC_SprMTSPR, []>, + isDOT; + +def TCHECK : XForm_htm3 <31, 718, + (outs), (ins crrc:$BF), "tcheck $BF", IIC_SprMTSPR, []>; + + +def TRECLAIM : XForm_base_r3xo <31, 942, + (outs crrc:$ret), (ins gprc:$A), "treclaim. $A", + IIC_SprMTSPR, []>, + isDOT { + let RST = 0; + let B = 0; +} + +def TRECHKPT : XForm_base_r3xo <31, 1006, + (outs crrc:$ret), (ins), "trechkpt.", IIC_SprMTSPR, []>, + isDOT { + let RST = 0; + let A = 0; + let B = 0; +} + +// Builtins + +// All HTM instructions, with the exception of tcheck, set CR0 with the +// value of the MSR Transaction State (TS) bits that exist before the +// instruction is executed. For tbegin., the EQ bit in CR0 can be used +// to determine whether the transaction was successfully started (0) or +// failed (1). We use an XORI pattern to 'flip' the bit to match the +// tbegin builtin API which defines a return value of 1 as success. 
+ +def : Pat<(int_ppc_tbegin i32:$R), + (XORI + (EXTRACT_SUBREG ( + TBEGIN (HTM_get_imm imm:$R)), sub_eq), + 1)>; + +def : Pat<(int_ppc_tend i32:$R), + (TEND (HTM_get_imm imm:$R))>; + + +def : Pat<(int_ppc_tabort i32:$R), + (TABORT $R)>; + +def : Pat<(int_ppc_tabortwc i32:$TO, i32:$RA, i32:$RB), + (TABORTWC (HTM_get_imm imm:$TO), $RA, $RB)>; + +def : Pat<(int_ppc_tabortwci i32:$TO, i32:$RA, i32:$SI), + (TABORTWCI (HTM_get_imm imm:$TO), $RA, (HTM_get_imm imm:$SI))>; + +def : Pat<(int_ppc_tabortdc i32:$TO, i32:$RA, i32:$RB), + (TABORTDC (HTM_get_imm imm:$TO), $RA, $RB)>; + +def : Pat<(int_ppc_tabortdci i32:$TO, i32:$RA, i32:$SI), + (TABORTDCI (HTM_get_imm imm:$TO), $RA, (HTM_get_imm imm:$SI))>; + +def : Pat<(int_ppc_tcheck), + (TCHECK_RET)>; + +def : Pat<(int_ppc_treclaim i32:$RA), + (TRECLAIM $RA)>; + +def : Pat<(int_ppc_trechkpt), + (TRECHKPT)>; + +def : Pat<(int_ppc_tsr i32:$L), + (TSR (HTM_get_imm imm:$L))>; + +def : Pat<(int_ppc_get_texasr), + (MFSPR8 130)>; + +def : Pat<(int_ppc_get_texasru), + (MFSPR8 131)>; + +def : Pat<(int_ppc_get_tfhar), + (MFSPR8 128)>; + +def : Pat<(int_ppc_get_tfiar), + (MFSPR8 129)>; + + +def : Pat<(int_ppc_set_texasr i64:$V), + (MTSPR8 130, $V)>; + +def : Pat<(int_ppc_set_texasru i64:$V), + (MTSPR8 131, $V)>; + +def : Pat<(int_ppc_set_tfhar i64:$V), + (MTSPR8 128, $V)>; + +def : Pat<(int_ppc_set_tfiar i64:$V), + (MTSPR8 129, $V)>; + + +// Extended mnemonics +def : Pat<(int_ppc_tendall), + (TEND 1)>; + +def : Pat<(int_ppc_tresume), + (TSR 1)>; + +def : Pat<(int_ppc_tsuspend), + (TSR 0)>; + +def : Pat<(i64 (int_ppc_ttest)), + (RLDICL (i64 (COPY (TABORTWCI 0, ZERO, 0))), 36, 28)>; + +} // [HasHTM] diff --git a/lib/Target/PowerPC/PPCInstrInfo.cpp b/lib/Target/PowerPC/PPCInstrInfo.cpp index fe9474a..c9c2949 100644 --- a/lib/Target/PowerPC/PPCInstrInfo.cpp +++ b/lib/Target/PowerPC/PPCInstrInfo.cpp @@ -61,7 +61,7 @@ void PPCInstrInfo::anchor() {} PPCInstrInfo::PPCInstrInfo(PPCSubtarget &STI) : PPCGenInstrInfo(PPC::ADJCALLSTACKDOWN, PPC::ADJCALLSTACKUP), - Subtarget(STI), RI(STI) {} + Subtarget(STI), RI(STI.getTargetMachine()) {} /// CreateTargetHazardRecognizer - Return the hazard recognizer to use for /// this target when scheduling the DAG. 
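Returning briefly to the int_ppc_tbegin pattern in PPCInstrHTM.td above: the XORI exists because hardware and builtin disagree on polarity — tbegin. leaves CR0.EQ at 0 on a successful start, while the builtin contract is 1 on success. A usage sketch of the resulting behavior, assuming GCC or Clang with -mhtm on a POWER8 (the function itself is invented for illustration):

// Increment Counter inside a hardware transaction; returns 1 on commit.
int tryTransactionalIncrement(volatile int &Counter) {
  if (__builtin_tbegin(0)) { // 1 = transaction started (the flipped EQ bit)
    ++Counter;               // body executes transactionally
    __builtin_tend(0);       // commit
    return 1;
  }
  return 0; // failed to start; real code would fall back to a lock
}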
@@ -113,9 +113,8 @@ int PPCInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
   const MachineOperand &DefMO = DefMI->getOperand(DefIdx);
   unsigned Reg = DefMO.getReg();
 
-  const TargetRegisterInfo *TRI = &getRegisterInfo();
   bool IsRegCR;
-  if (TRI->isVirtualRegister(Reg)) {
+  if (TargetRegisterInfo::isVirtualRegister(Reg)) {
     const MachineRegisterInfo *MRI =
       &DefMI->getParent()->getParent()->getRegInfo();
     IsRegCR = MRI->getRegClass(Reg)->hasSuperClassEq(&PPC::CRRCRegClass) ||
@@ -697,6 +696,33 @@ void PPCInstrInfo::insertSelect(MachineBasicBlock &MBB,
             .addReg(Cond[1].getReg(), 0, SubIdx);
 }
 
+static unsigned getCRBitValue(unsigned CRBit) {
+  unsigned Ret = 4;
+  if (CRBit == PPC::CR0LT || CRBit == PPC::CR1LT ||
+      CRBit == PPC::CR2LT || CRBit == PPC::CR3LT ||
+      CRBit == PPC::CR4LT || CRBit == PPC::CR5LT ||
+      CRBit == PPC::CR6LT || CRBit == PPC::CR7LT)
+    Ret = 3;
+  if (CRBit == PPC::CR0GT || CRBit == PPC::CR1GT ||
+      CRBit == PPC::CR2GT || CRBit == PPC::CR3GT ||
+      CRBit == PPC::CR4GT || CRBit == PPC::CR5GT ||
+      CRBit == PPC::CR6GT || CRBit == PPC::CR7GT)
+    Ret = 2;
+  if (CRBit == PPC::CR0EQ || CRBit == PPC::CR1EQ ||
+      CRBit == PPC::CR2EQ || CRBit == PPC::CR3EQ ||
+      CRBit == PPC::CR4EQ || CRBit == PPC::CR5EQ ||
+      CRBit == PPC::CR6EQ || CRBit == PPC::CR7EQ)
+    Ret = 1;
+  if (CRBit == PPC::CR0UN || CRBit == PPC::CR1UN ||
+      CRBit == PPC::CR2UN || CRBit == PPC::CR3UN ||
+      CRBit == PPC::CR4UN || CRBit == PPC::CR5UN ||
+      CRBit == PPC::CR6UN || CRBit == PPC::CR7UN)
+    Ret = 0;
+
+  assert(Ret != 4 && "Invalid CR bit register");
+  return Ret;
+}
+
 void PPCInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
                                MachineBasicBlock::iterator I, DebugLoc DL,
                                unsigned DestReg, unsigned SrcReg,
@@ -742,6 +768,32 @@ void PPCInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
     SrcReg = SuperReg;
   }
 
+  // Different class register copy
+  if (PPC::CRBITRCRegClass.contains(SrcReg) &&
+      PPC::GPRCRegClass.contains(DestReg)) {
+    unsigned CRReg = getCRFromCRBit(SrcReg);
+    BuildMI(MBB, I, DL, get(PPC::MFOCRF), DestReg)
+      .addReg(CRReg, getKillRegState(KillSrc));
+    // Rotate the CR bit in the CR fields to be the least significant bit and
+    // then mask with 0x1 (MB = ME = 31).
+    BuildMI(MBB, I, DL, get(PPC::RLWINM), DestReg)
+      .addReg(DestReg, RegState::Kill)
+      .addImm(TRI->getEncodingValue(CRReg) * 4 + (4 - getCRBitValue(SrcReg)))
+      .addImm(31)
+      .addImm(31);
+    return;
+  } else if (PPC::CRRCRegClass.contains(SrcReg) &&
+             PPC::G8RCRegClass.contains(DestReg)) {
+    BuildMI(MBB, I, DL, get(PPC::MFOCRF8), DestReg)
+      .addReg(SrcReg, getKillRegState(KillSrc));
+    return;
+  } else if (PPC::CRRCRegClass.contains(SrcReg) &&
+             PPC::GPRCRegClass.contains(DestReg)) {
+    BuildMI(MBB, I, DL, get(PPC::MFOCRF), DestReg)
+      .addReg(SrcReg, getKillRegState(KillSrc));
+    return;
+  }
+
   unsigned Opc;
   if (PPC::GPRCRegClass.contains(DestReg, SrcReg))
     Opc = PPC::OR;
diff --git a/lib/Target/PowerPC/PPCInstrInfo.h b/lib/Target/PowerPC/PPCInstrInfo.h
index 4add6f9..7fd076a 100644
--- a/lib/Target/PowerPC/PPCInstrInfo.h
+++ b/lib/Target/PowerPC/PPCInstrInfo.h
@@ -63,7 +63,7 @@ enum PPC970_Unit {
 };
 } // end namespace PPCII
-
+class PPCSubtarget;
 class PPCInstrInfo : public PPCGenInstrInfo {
   PPCSubtarget &Subtarget;
   const PPCRegisterInfo RI;
diff --git a/lib/Target/PowerPC/PPCInstrInfo.td b/lib/Target/PowerPC/PPCInstrInfo.td
index 1a045b1..5eff156 100644
--- a/lib/Target/PowerPC/PPCInstrInfo.td
+++ b/lib/Target/PowerPC/PPCInstrInfo.td
@@ -46,13 +46,6 @@ def SDT_PPCstbrx : SDTypeProfile<0, 3, [
   SDTCisInt<0>, SDTCisPtrTy<1>, SDTCisVT<2, OtherVT>
 ]>;
 
-def SDT_PPClarx : SDTypeProfile<1, 1, [
-  SDTCisInt<0>, SDTCisPtrTy<1>
-]>;
-def SDT_PPCstcx : SDTypeProfile<0, 2, [
-  SDTCisInt<0>, SDTCisPtrTy<1>
-]>;
-
 def SDT_PPCTC_ret : SDTypeProfile<0, 2, [
   SDTCisPtrTy<0>, SDTCisVT<1, i32>
 ]>;
@@ -225,12 +218,6 @@ def PPCcr6set   : SDNode<"PPCISD::CR6SET", SDTNone,
 def PPCcr6unset : SDNode<"PPCISD::CR6UNSET", SDTNone,
                          [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
 
-// Instructions to support atomic operations
-def PPClarx : SDNode<"PPCISD::LARX", SDT_PPClarx,
-                     [SDNPHasChain, SDNPMayLoad]>;
-def PPCstcx : SDNode<"PPCISD::STCX", SDT_PPCstcx,
-                     [SDNPHasChain, SDNPMayStore]>;
-
 // Instructions to support dynamic alloca.
def SDTDynOp : SDTypeProfile<1, 2, []>; def PPCdynalloc : SDNode<"PPCISD::DYNALLOC", SDTDynOp, [SDNPHasChain]>; @@ -445,6 +432,18 @@ def PPCRegCRRCAsmOperand : AsmOperandClass { def crrc : RegisterOperand<CRRC> { let ParserMatchClass = PPCRegCRRCAsmOperand; } +def crrc0 : RegisterOperand<CRRC0> { + let ParserMatchClass = PPCRegCRRCAsmOperand; +} + +def PPCU1ImmAsmOperand : AsmOperandClass { + let Name = "U1Imm"; let PredicateMethod = "isU1Imm"; + let RenderMethod = "addImmOperands"; +} +def u1imm : Operand<i32> { + let PrintMethod = "printU1ImmOperand"; + let ParserMatchClass = PPCU1ImmAsmOperand; +} def PPCU2ImmAsmOperand : AsmOperandClass { let Name = "U2Imm"; let PredicateMethod = "isU2Imm"; @@ -455,6 +454,15 @@ def u2imm : Operand<i32> { let ParserMatchClass = PPCU2ImmAsmOperand; } +def PPCU3ImmAsmOperand : AsmOperandClass { + let Name = "U3Imm"; let PredicateMethod = "isU3Imm"; + let RenderMethod = "addImmOperands"; +} +def u3imm : Operand<i32> { + let PrintMethod = "printU3ImmOperand"; + let ParserMatchClass = PPCU3ImmAsmOperand; +} + def PPCU4ImmAsmOperand : AsmOperandClass { let Name = "U4Imm"; let PredicateMethod = "isU4Imm"; let RenderMethod = "addImmOperands"; @@ -715,7 +723,7 @@ def IsPPC6xx : Predicate<"PPCSubTarget->isPPC6xx()">; def IsE500 : Predicate<"PPCSubTarget->isE500()">; def HasSPE : Predicate<"PPCSubTarget->HasSPE()">; def HasICBT : Predicate<"PPCSubTarget->hasICBT()">; - +def HasPartwordAtomics : Predicate<"PPCSubTarget->hasPartwordAtomics()">; def NoNaNsFPMath : Predicate<"TM.Options.NoNaNsFPMath">; def NaNsFPMath : Predicate<"!TM.Options.NoNaNsFPMath">; @@ -1446,15 +1454,44 @@ let usesCustomInserter = 1 in { } // Instructions to support atomic operations +let mayLoad = 1, hasSideEffects = 0 in { +def LBARX : XForm_1<31, 52, (outs gprc:$rD), (ins memrr:$src), + "lbarx $rD, $src", IIC_LdStLWARX, []>, + Requires<[HasPartwordAtomics]>; + +def LHARX : XForm_1<31, 116, (outs gprc:$rD), (ins memrr:$src), + "lharx $rD, $src", IIC_LdStLWARX, []>, + Requires<[HasPartwordAtomics]>; + def LWARX : XForm_1<31, 20, (outs gprc:$rD), (ins memrr:$src), - "lwarx $rD, $src", IIC_LdStLWARX, - [(set i32:$rD, (PPClarx xoaddr:$src))]>; + "lwarx $rD, $src", IIC_LdStLWARX, []>; + +// Instructions to support lock versions of atomics +// (EH=1 - see Power ISA 2.07 Book II 4.4.2) +def LBARXL : XForm_1<31, 52, (outs gprc:$rD), (ins memrr:$src), + "lbarx $rD, $src, 1", IIC_LdStLWARX, []>, isDOT, + Requires<[HasPartwordAtomics]>; + +def LHARXL : XForm_1<31, 116, (outs gprc:$rD), (ins memrr:$src), + "lharx $rD, $src, 1", IIC_LdStLWARX, []>, isDOT, + Requires<[HasPartwordAtomics]>; + +def LWARXL : XForm_1<31, 20, (outs gprc:$rD), (ins memrr:$src), + "lwarx $rD, $src, 1", IIC_LdStLWARX, []>, isDOT; +} + +let Defs = [CR0], mayStore = 1, hasSideEffects = 0 in { +def STBCX : XForm_1<31, 694, (outs), (ins gprc:$rS, memrr:$dst), + "stbcx. $rS, $dst", IIC_LdStSTWCX, []>, + isDOT, Requires<[HasPartwordAtomics]>; + +def STHCX : XForm_1<31, 726, (outs), (ins gprc:$rS, memrr:$dst), + "sthcx. $rS, $dst", IIC_LdStSTWCX, []>, + isDOT, Requires<[HasPartwordAtomics]>; -let Defs = [CR0] in def STWCX : XForm_1<31, 150, (outs), (ins gprc:$rS, memrr:$dst), - "stwcx. $rS, $dst", IIC_LdStSTWCX, - [(PPCstcx i32:$rS, xoaddr:$dst)]>, - isDOT; + "stwcx. 
$rS, $dst", IIC_LdStSTWCX, []>, isDOT; +} let isTerminator = 1, isBarrier = 1, hasCtrlDep = 1 in def TRAP : XForm_24<31, 4, (outs), (ins), "trap", IIC_LdStLoad, [(trap)]>; @@ -1473,7 +1510,7 @@ def TD : XForm_1<31, 68, (outs), (ins u5imm:$to, g8rc:$rA, g8rc:$rB), // // Unindexed (r+i) Loads. -let canFoldAsLoad = 1, PPC970_Unit = 2 in { +let PPC970_Unit = 2 in { def LBZ : DForm_1<34, (outs gprc:$rD), (ins memri:$src), "lbz $rD, $src", IIC_LdStLoad, [(set i32:$rD, (zextloadi8 iaddr:$src))]>; @@ -1570,7 +1607,7 @@ def LFDUX : XForm_1<31, 631, (outs f8rc:$rD, ptr_rc_nor0:$ea_result), // Indexed (r+r) Loads. // -let canFoldAsLoad = 1, PPC970_Unit = 2 in { +let PPC970_Unit = 2 in { def LBZX : XForm_1<31, 87, (outs gprc:$rD), (ins memrr:$src), "lbzx $rD, $src", IIC_LdStLoad, [(set i32:$rD, (zextloadi8 xaddr:$src))]>; @@ -2683,6 +2720,7 @@ include "PPCInstrSPE.td" include "PPCInstr64Bit.td" include "PPCInstrVSX.td" include "PPCInstrQPX.td" +include "PPCInstrHTM.td" def crnot : OutPatFrag<(ops node:$in), (CRNOR $in, $in)>; diff --git a/lib/Target/PowerPC/PPCInstrQPX.td b/lib/Target/PowerPC/PPCInstrQPX.td index c984d46..5c66b42 100644 --- a/lib/Target/PowerPC/PPCInstrQPX.td +++ b/lib/Target/PowerPC/PPCInstrQPX.td @@ -501,7 +501,7 @@ let Uses = [RM] in { "qvflogical $FRT, $FRA, $FRB, $tttt", IIC_VecPerm, []>; // Load indexed instructions - let mayLoad = 1, canFoldAsLoad = 1 in { + let mayLoad = 1 in { def QVLFDX : XForm_1<31, 583, (outs qfrc:$FRT), (ins memrr:$src), "qvlfdx $FRT, $src", IIC_LdStLFD, diff --git a/lib/Target/PowerPC/PPCInstrVSX.td b/lib/Target/PowerPC/PPCInstrVSX.td index d6cb3a0..ec04da4 100644 --- a/lib/Target/PowerPC/PPCInstrVSX.td +++ b/lib/Target/PowerPC/PPCInstrVSX.td @@ -66,7 +66,7 @@ let hasSideEffects = 0 in { // VSX instructions don't have side effects. 
let Uses = [RM] in { // Load indexed instructions - let mayLoad = 1, canFoldAsLoad = 1 in { + let mayLoad = 1 in { def LXSDX : XX1Form<31, 588, (outs vsfrc:$XT), (ins memrr:$src), "lxsdx $XT, $src", IIC_LdStLFD, diff --git a/lib/Target/PowerPC/PPCLoopDataPrefetch.cpp b/lib/Target/PowerPC/PPCLoopDataPrefetch.cpp index efd2d92..005bcaf 100644 --- a/lib/Target/PowerPC/PPCLoopDataPrefetch.cpp +++ b/lib/Target/PowerPC/PPCLoopDataPrefetch.cpp @@ -104,7 +104,7 @@ FunctionPass *llvm::createPPCLoopDataPrefetchPass() { return new PPCLoopDataPref bool PPCLoopDataPrefetch::runOnFunction(Function &F) { LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); SE = &getAnalysis<ScalarEvolution>(); - DL = F.getParent()->getDataLayout(); + DL = &F.getParent()->getDataLayout(); AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); @@ -192,7 +192,7 @@ bool PPCLoopDataPrefetch::runOnLoop(Loop *L) { const SCEV *PtrDiff = SE->getMinusSCEV(LSCEVAddRec, K->second); if (const SCEVConstant *ConstPtrDiff = dyn_cast<SCEVConstant>(PtrDiff)) { - int64_t PD = abs64(ConstPtrDiff->getValue()->getSExtValue()); + int64_t PD = std::abs(ConstPtrDiff->getValue()->getSExtValue()); if (PD < (int64_t) CacheLineSize) { DupPref = true; break; @@ -211,7 +211,7 @@ bool PPCLoopDataPrefetch::runOnLoop(Loop *L) { PrefLoads.push_back(std::make_pair(MemI, LSCEVAddRec)); Type *I8Ptr = Type::getInt8PtrTy((*I)->getContext(), PtrAddrSpace); - SCEVExpander SCEVE(*SE, "prefaddr"); + SCEVExpander SCEVE(*SE, J->getModule()->getDataLayout(), "prefaddr"); Value *PrefPtrValue = SCEVE.expandCodeFor(NextLSCEV, I8Ptr, MemI); IRBuilder<> Builder(MemI); diff --git a/lib/Target/PowerPC/PPCLoopPreIncPrep.cpp b/lib/Target/PowerPC/PPCLoopPreIncPrep.cpp index df65227..092a4ef 100644 --- a/lib/Target/PowerPC/PPCLoopPreIncPrep.cpp +++ b/lib/Target/PowerPC/PPCLoopPreIncPrep.cpp @@ -36,6 +36,7 @@ #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" #include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Module.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Transforms/Scalar.h" @@ -84,7 +85,6 @@ namespace { PPCTargetMachine *TM; LoopInfo *LI; ScalarEvolution *SE; - const DataLayout *DL; }; } @@ -141,9 +141,6 @@ bool PPCLoopPreIncPrep::runOnFunction(Function &F) { LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); SE = &getAnalysis<ScalarEvolution>(); - DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); - DL = DLP ? &DLP->getDataLayout() : 0; - bool MadeChange = false; for (LoopInfo::iterator I = LI->begin(), E = LI->end(); @@ -158,9 +155,6 @@ bool PPCLoopPreIncPrep::runOnFunction(Function &F) { bool PPCLoopPreIncPrep::runOnLoop(Loop *L) { bool MadeChange = false; - if (!DL) - return MadeChange; - // Only prep. the inner-most loop if (!L->empty()) return MadeChange; @@ -261,6 +255,7 @@ bool PPCLoopPreIncPrep::runOnLoop(Loop *L) { Value *BasePtr = GetPointerOperand(MemI); assert(BasePtr && "No pointer operand"); + Type *I8Ty = Type::getInt8Ty(MemI->getParent()->getContext()); Type *I8PtrTy = Type::getInt8PtrTy(MemI->getParent()->getContext(), BasePtr->getType()->getPointerAddressSpace()); @@ -280,7 +275,7 @@ bool PPCLoopPreIncPrep::runOnLoop(Loop *L) { MemI->hasName() ? 
MemI->getName() + ".phi" : "", Header->getFirstNonPHI()); - SCEVExpander SCEVE(*SE, "pistart"); + SCEVExpander SCEVE(*SE, Header->getModule()->getDataLayout(), "pistart"); Value *BasePtrStart = SCEVE.expandCodeFor(BasePtrStartSCEV, I8PtrTy, LoopPredecessor->getTerminator()); @@ -295,8 +290,8 @@ bool PPCLoopPreIncPrep::runOnLoop(Loop *L) { } Instruction *InsPoint = Header->getFirstInsertionPt(); - GetElementPtrInst *PtrInc = - GetElementPtrInst::Create(NewPHI, BasePtrIncSCEV->getValue(), + GetElementPtrInst *PtrInc = GetElementPtrInst::Create( + I8Ty, NewPHI, BasePtrIncSCEV->getValue(), MemI->hasName() ? MemI->getName() + ".inc" : "", InsPoint); PtrInc->setIsInBounds(IsPtrInBounds(BasePtr)); for (pred_iterator PI = pred_begin(Header), PE = pred_end(Header); @@ -341,9 +336,9 @@ bool PPCLoopPreIncPrep::runOnLoop(Loop *L) { PtrIP = PtrIP->getParent()->getFirstInsertionPt(); else if (!PtrIP) PtrIP = I->second; - - GetElementPtrInst *NewPtr = - GetElementPtrInst::Create(PtrInc, Diff->getValue(), + + GetElementPtrInst *NewPtr = GetElementPtrInst::Create( + I8Ty, PtrInc, Diff->getValue(), I->second->hasName() ? I->second->getName() + ".off" : "", PtrIP); if (!PtrIP) NewPtr->insertAfter(cast<Instruction>(PtrInc)); diff --git a/lib/Target/PowerPC/PPCMCInstLower.cpp b/lib/Target/PowerPC/PPCMCInstLower.cpp index 819738b..0965cb3 100644 --- a/lib/Target/PowerPC/PPCMCInstLower.cpp +++ b/lib/Target/PowerPC/PPCMCInstLower.cpp @@ -184,6 +184,9 @@ void llvm::LowerPPCMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI, llvm_unreachable("unknown operand type"); case MachineOperand::MO_Register: assert(!MO.getSubReg() && "Subregs should be eliminated!"); + assert(MO.getReg() > PPC::NoRegister && + MO.getReg() < PPC::NUM_TARGET_REGS && + "Invalid register for this target!"); MCOp = MCOperand::CreateReg(MO.getReg()); break; case MachineOperand::MO_Immediate: diff --git a/lib/Target/PowerPC/PPCRegisterInfo.cpp b/lib/Target/PowerPC/PPCRegisterInfo.cpp index c9a9684..0e568d3 100644 --- a/lib/Target/PowerPC/PPCRegisterInfo.cpp +++ b/lib/Target/PowerPC/PPCRegisterInfo.cpp @@ -18,6 +18,7 @@ #include "PPCInstrBuilder.h" #include "PPCMachineFunctionInfo.h" #include "PPCSubtarget.h" +#include "PPCTargetMachine.h" #include "llvm/ADT/BitVector.h" #include "llvm/ADT/STLExtras.h" #include "llvm/CodeGen/MachineFrameInfo.h" @@ -56,11 +57,11 @@ static cl::opt<bool> AlwaysBasePointer("ppc-always-use-base-pointer", cl::Hidden, cl::init(false), cl::desc("Force the use of a base pointer in every function")); -PPCRegisterInfo::PPCRegisterInfo(const PPCSubtarget &ST) - : PPCGenRegisterInfo(ST.isPPC64() ? PPC::LR8 : PPC::LR, - ST.isPPC64() ? 0 : 1, - ST.isPPC64() ? 0 : 1), - Subtarget(ST) { +PPCRegisterInfo::PPCRegisterInfo(const PPCTargetMachine &TM) + : PPCGenRegisterInfo(TM.isPPC64() ? PPC::LR8 : PPC::LR, + TM.isPPC64() ? 0 : 1, + TM.isPPC64() ? 0 : 1), + TM(TM) { ImmToIdxMap[PPC::LD] = PPC::LDX; ImmToIdxMap[PPC::STD] = PPC::STDX; ImmToIdxMap[PPC::LBZ] = PPC::LBZX; ImmToIdxMap[PPC::STB] = PPC::STBX; ImmToIdxMap[PPC::LHZ] = PPC::LHZX; ImmToIdxMap[PPC::LHA] = PPC::LHAX; @@ -87,18 +88,19 @@ PPCRegisterInfo::getPointerRegClass(const MachineFunction &MF, unsigned Kind) // Note that PPCInstrInfo::FoldImmediate also directly uses this Kind value // when it checks for ZERO folding. 
if (Kind == 1) { - if (Subtarget.isPPC64()) + if (TM.isPPC64()) return &PPC::G8RC_NOX0RegClass; return &PPC::GPRC_NOR0RegClass; } - if (Subtarget.isPPC64()) + if (TM.isPPC64()) return &PPC::G8RCRegClass; return &PPC::GPRCRegClass; } const MCPhysReg* PPCRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { + const PPCSubtarget &Subtarget = MF->getSubtarget<PPCSubtarget>(); if (MF->getFunction()->getCallingConv() == CallingConv::AnyReg) { if (Subtarget.hasVSX()) return CSR_64_AllRegs_VSX_SaveList; @@ -108,28 +110,28 @@ PPCRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { } if (Subtarget.isDarwinABI()) - return Subtarget.isPPC64() ? (Subtarget.hasAltivec() ? - CSR_Darwin64_Altivec_SaveList : - CSR_Darwin64_SaveList) : - (Subtarget.hasAltivec() ? - CSR_Darwin32_Altivec_SaveList : - CSR_Darwin32_SaveList); + return TM.isPPC64() + ? (Subtarget.hasAltivec() ? CSR_Darwin64_Altivec_SaveList + : CSR_Darwin64_SaveList) + : (Subtarget.hasAltivec() ? CSR_Darwin32_Altivec_SaveList + : CSR_Darwin32_SaveList); // On PPC64, we might need to save r2 (but only if it is not reserved). bool SaveR2 = MF->getRegInfo().isAllocatable(PPC::X2); - return Subtarget.isPPC64() ? (Subtarget.hasAltivec() ? - (SaveR2 ? CSR_SVR464_R2_Altivec_SaveList : - CSR_SVR464_Altivec_SaveList) : - (SaveR2 ? CSR_SVR464_R2_SaveList : - CSR_SVR464_SaveList)) : - (Subtarget.hasAltivec() ? - CSR_SVR432_Altivec_SaveList : - CSR_SVR432_SaveList); + return TM.isPPC64() + ? (Subtarget.hasAltivec() + ? (SaveR2 ? CSR_SVR464_R2_Altivec_SaveList + : CSR_SVR464_Altivec_SaveList) + : (SaveR2 ? CSR_SVR464_R2_SaveList : CSR_SVR464_SaveList)) + : (Subtarget.hasAltivec() ? CSR_SVR432_Altivec_SaveList + : CSR_SVR432_SaveList); } -const uint32_t* -PPCRegisterInfo::getCallPreservedMask(CallingConv::ID CC) const { +const uint32_t * +PPCRegisterInfo::getCallPreservedMask(const MachineFunction &MF, + CallingConv::ID CC) const { + const PPCSubtarget &Subtarget = MF.getSubtarget<PPCSubtarget>(); if (CC == CallingConv::AnyReg) { if (Subtarget.hasVSX()) return CSR_64_AllRegs_VSX_RegMask; @@ -139,19 +141,15 @@ PPCRegisterInfo::getCallPreservedMask(CallingConv::ID CC) const { } if (Subtarget.isDarwinABI()) - return Subtarget.isPPC64() ? (Subtarget.hasAltivec() ? - CSR_Darwin64_Altivec_RegMask : - CSR_Darwin64_RegMask) : - (Subtarget.hasAltivec() ? - CSR_Darwin32_Altivec_RegMask : - CSR_Darwin32_RegMask); - - return Subtarget.isPPC64() ? (Subtarget.hasAltivec() ? - CSR_SVR464_Altivec_RegMask : - CSR_SVR464_RegMask) : - (Subtarget.hasAltivec() ? - CSR_SVR432_Altivec_RegMask : - CSR_SVR432_RegMask); + return TM.isPPC64() ? (Subtarget.hasAltivec() ? CSR_Darwin64_Altivec_RegMask + : CSR_Darwin64_RegMask) + : (Subtarget.hasAltivec() ? CSR_Darwin32_Altivec_RegMask + : CSR_Darwin32_RegMask); + + return TM.isPPC64() ? (Subtarget.hasAltivec() ? CSR_SVR464_Altivec_RegMask + : CSR_SVR464_RegMask) + : (Subtarget.hasAltivec() ? 
CSR_SVR432_Altivec_RegMask + : CSR_SVR432_RegMask); } const uint32_t* @@ -160,15 +158,13 @@ PPCRegisterInfo::getNoPreservedMask() const { } void PPCRegisterInfo::adjustStackMapLiveOutMask(uint32_t *Mask) const { - unsigned PseudoRegs[] = { PPC::ZERO, PPC::ZERO8, PPC::RM }; - for (unsigned i = 0, ie = array_lengthof(PseudoRegs); i != ie; ++i) { - unsigned Reg = PseudoRegs[i]; - Mask[Reg / 32] &= ~(1u << (Reg % 32)); - } + for (unsigned PseudoReg : {PPC::ZERO, PPC::ZERO8, PPC::RM}) + Mask[PseudoReg / 32] &= ~(1u << (PseudoReg % 32)); } BitVector PPCRegisterInfo::getReservedRegs(const MachineFunction &MF) const { BitVector Reserved(getNumRegs()); + const PPCSubtarget &Subtarget = MF.getSubtarget<PPCSubtarget>(); const PPCFrameLowering *PPCFI = static_cast<const PPCFrameLowering *>(Subtarget.getFrameLowering()); @@ -207,7 +203,7 @@ BitVector PPCRegisterInfo::getReservedRegs(const MachineFunction &MF) const { } // On PPC64, r13 is the thread pointer. Never allocate this register. - if (Subtarget.isPPC64()) { + if (TM.isPPC64()) { Reserved.set(PPC::R13); Reserved.set(PPC::X1); @@ -238,15 +234,15 @@ BitVector PPCRegisterInfo::getReservedRegs(const MachineFunction &MF) const { Reserved.set(PPC::R31); if (hasBasePointer(MF)) { - if (Subtarget.isSVR4ABI() && !Subtarget.isPPC64() && - MF.getTarget().getRelocationModel() == Reloc::PIC_) + if (Subtarget.isSVR4ABI() && !TM.isPPC64() && + TM.getRelocationModel() == Reloc::PIC_) Reserved.set(PPC::R29); else Reserved.set(PPC::R30); } - if (Subtarget.isSVR4ABI() && !Subtarget.isPPC64() && - MF.getTarget().getRelocationModel() == Reloc::PIC_) + if (Subtarget.isSVR4ABI() && !TM.isPPC64() && + TM.getRelocationModel() == Reloc::PIC_) Reserved.set(PPC::R30); // Reserve Altivec registers when Altivec is unavailable. @@ -260,6 +256,7 @@ BitVector PPCRegisterInfo::getReservedRegs(const MachineFunction &MF) const { unsigned PPCRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, MachineFunction &MF) const { + const PPCSubtarget &Subtarget = MF.getSubtarget<PPCSubtarget>(); const TargetFrameLowering *TFI = Subtarget.getFrameLowering(); const unsigned DefaultSafety = 1; @@ -291,8 +288,10 @@ unsigned PPCRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, } } -const TargetRegisterClass *PPCRegisterInfo::getLargestLegalSuperClass( - const TargetRegisterClass *RC) const { +const TargetRegisterClass * +PPCRegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC, + const MachineFunction &MF) const { + const PPCSubtarget &Subtarget = MF.getSubtarget<PPCSubtarget>(); if (Subtarget.hasVSX()) { // With VSX, we can inflate various sub-register classes to the full VSX // register set. @@ -303,7 +302,7 @@ const TargetRegisterClass *PPCRegisterInfo::getLargestLegalSuperClass( return &PPC::VSRCRegClass; } - return TargetRegisterInfo::getLargestLegalSuperClass(RC); + return TargetRegisterInfo::getLargestLegalSuperClass(RC, MF); } //===----------------------------------------------------------------------===// @@ -326,10 +325,11 @@ void PPCRegisterInfo::lowerDynamicAlloc(MachineBasicBlock::iterator II) const { MachineFunction &MF = *MBB.getParent(); // Get the frame info. MachineFrameInfo *MFI = MF.getFrameInfo(); + const PPCSubtarget &Subtarget = MF.getSubtarget<PPCSubtarget>(); // Get the instruction info. const TargetInstrInfo &TII = *Subtarget.getInstrInfo(); // Determine whether 64-bit pointers are used. - bool LP64 = Subtarget.isPPC64(); + bool LP64 = TM.isPPC64(); DebugLoc dl = MI.getDebugLoc(); // Get the maximum call stack size. 
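A minimal standalone sketch of the bit indexing used by the rewritten
adjustStackMapLiveOutMask above: the live-out mask packs one bit per register
into 32-bit words, so register N lives in word N / 32 at bit N % 32. The
register number 67 below is made up purely for illustration:

    #include <cassert>
    #include <cstdint>

    // Clear register Reg's bit in a packed live-out mask.
    static void clearRegFromMask(uint32_t *Mask, unsigned Reg) {
      Mask[Reg / 32] &= ~(1u << (Reg % 32));
    }

    int main() {
      uint32_t Mask[4] = {~0u, ~0u, ~0u, ~0u}; // 128 registers, all live
      clearRegFromMask(Mask, 67);              // word 2, bit 3
      assert(Mask[2] == ~(1u << 3));
      return 0;
    }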
@@ -443,10 +443,11 @@ void PPCRegisterInfo::lowerCRSpilling(MachineBasicBlock::iterator II, // Get the instruction's basic block. MachineBasicBlock &MBB = *MI.getParent(); MachineFunction &MF = *MBB.getParent(); + const PPCSubtarget &Subtarget = MF.getSubtarget<PPCSubtarget>(); const TargetInstrInfo &TII = *Subtarget.getInstrInfo(); DebugLoc dl = MI.getDebugLoc(); - bool LP64 = Subtarget.isPPC64(); + bool LP64 = TM.isPPC64(); const TargetRegisterClass *G8RC = &PPC::G8RCRegClass; const TargetRegisterClass *GPRC = &PPC::GPRCRegClass; @@ -487,10 +488,11 @@ void PPCRegisterInfo::lowerCRRestore(MachineBasicBlock::iterator II, // Get the instruction's basic block. MachineBasicBlock &MBB = *MI.getParent(); MachineFunction &MF = *MBB.getParent(); + const PPCSubtarget &Subtarget = MF.getSubtarget<PPCSubtarget>(); const TargetInstrInfo &TII = *Subtarget.getInstrInfo(); DebugLoc dl = MI.getDebugLoc(); - bool LP64 = Subtarget.isPPC64(); + bool LP64 = TM.isPPC64(); const TargetRegisterClass *G8RC = &PPC::G8RCRegClass; const TargetRegisterClass *GPRC = &PPC::GPRCRegClass; @@ -522,37 +524,6 @@ void PPCRegisterInfo::lowerCRRestore(MachineBasicBlock::iterator II, MBB.erase(II); } -static unsigned getCRFromCRBit(unsigned SrcReg) { - unsigned Reg = 0; - if (SrcReg == PPC::CR0LT || SrcReg == PPC::CR0GT || - SrcReg == PPC::CR0EQ || SrcReg == PPC::CR0UN) - Reg = PPC::CR0; - else if (SrcReg == PPC::CR1LT || SrcReg == PPC::CR1GT || - SrcReg == PPC::CR1EQ || SrcReg == PPC::CR1UN) - Reg = PPC::CR1; - else if (SrcReg == PPC::CR2LT || SrcReg == PPC::CR2GT || - SrcReg == PPC::CR2EQ || SrcReg == PPC::CR2UN) - Reg = PPC::CR2; - else if (SrcReg == PPC::CR3LT || SrcReg == PPC::CR3GT || - SrcReg == PPC::CR3EQ || SrcReg == PPC::CR3UN) - Reg = PPC::CR3; - else if (SrcReg == PPC::CR4LT || SrcReg == PPC::CR4GT || - SrcReg == PPC::CR4EQ || SrcReg == PPC::CR4UN) - Reg = PPC::CR4; - else if (SrcReg == PPC::CR5LT || SrcReg == PPC::CR5GT || - SrcReg == PPC::CR5EQ || SrcReg == PPC::CR5UN) - Reg = PPC::CR5; - else if (SrcReg == PPC::CR6LT || SrcReg == PPC::CR6GT || - SrcReg == PPC::CR6EQ || SrcReg == PPC::CR6UN) - Reg = PPC::CR6; - else if (SrcReg == PPC::CR7LT || SrcReg == PPC::CR7GT || - SrcReg == PPC::CR7EQ || SrcReg == PPC::CR7UN) - Reg = PPC::CR7; - - assert(Reg != 0 && "Invalid CR bit register"); - return Reg; -} - void PPCRegisterInfo::lowerCRBitSpilling(MachineBasicBlock::iterator II, unsigned FrameIndex) const { // Get the instruction. @@ -560,10 +531,11 @@ void PPCRegisterInfo::lowerCRBitSpilling(MachineBasicBlock::iterator II, // Get the instruction's basic block. MachineBasicBlock &MBB = *MI.getParent(); MachineFunction &MF = *MBB.getParent(); + const PPCSubtarget &Subtarget = MF.getSubtarget<PPCSubtarget>(); const TargetInstrInfo &TII = *Subtarget.getInstrInfo(); DebugLoc dl = MI.getDebugLoc(); - bool LP64 = Subtarget.isPPC64(); + bool LP64 = TM.isPPC64(); const TargetRegisterClass *G8RC = &PPC::G8RCRegClass; const TargetRegisterClass *GPRC = &PPC::GPRCRegClass; @@ -603,10 +575,11 @@ void PPCRegisterInfo::lowerCRBitRestore(MachineBasicBlock::iterator II, // Get the instruction's basic block. 
MachineBasicBlock &MBB = *MI.getParent(); MachineFunction &MF = *MBB.getParent(); + const PPCSubtarget &Subtarget = MF.getSubtarget<PPCSubtarget>(); const TargetInstrInfo &TII = *Subtarget.getInstrInfo(); DebugLoc dl = MI.getDebugLoc(); - bool LP64 = Subtarget.isPPC64(); + bool LP64 = TM.isPPC64(); const TargetRegisterClass *G8RC = &PPC::G8RCRegClass; const TargetRegisterClass *GPRC = &PPC::GPRCRegClass; @@ -650,6 +623,7 @@ void PPCRegisterInfo::lowerVRSAVESpilling(MachineBasicBlock::iterator II, // Get the instruction's basic block. MachineBasicBlock &MBB = *MI.getParent(); MachineFunction &MF = *MBB.getParent(); + const PPCSubtarget &Subtarget = MF.getSubtarget<PPCSubtarget>(); const TargetInstrInfo &TII = *Subtarget.getInstrInfo(); DebugLoc dl = MI.getDebugLoc(); @@ -675,6 +649,7 @@ void PPCRegisterInfo::lowerVRSAVERestore(MachineBasicBlock::iterator II, // Get the instruction's basic block. MachineBasicBlock &MBB = *MI.getParent(); MachineFunction &MF = *MBB.getParent(); + const PPCSubtarget &Subtarget = MF.getSubtarget<PPCSubtarget>(); const TargetInstrInfo &TII = *Subtarget.getInstrInfo(); DebugLoc dl = MI.getDebugLoc(); @@ -697,14 +672,14 @@ void PPCRegisterInfo::lowerVRSAVERestore(MachineBasicBlock::iterator II, bool PPCRegisterInfo::hasReservedSpillSlot(const MachineFunction &MF, unsigned Reg, int &FrameIdx) const { - + const PPCSubtarget &Subtarget = MF.getSubtarget<PPCSubtarget>(); // For the nonvolatile condition registers (CR2, CR3, CR4) in an SVR4 // ABI, return true to prevent allocating an additional frame slot. // For 64-bit, the CR save area is at SP+8; the value of FrameIdx = 0 // is arbitrary and will be subsequently ignored. For 32-bit, we have // previously created the stack slot if needed, so return its FrameIdx. if (Subtarget.isSVR4ABI() && PPC::CR2 <= Reg && Reg <= PPC::CR4) { - if (Subtarget.isPPC64()) + if (TM.isPPC64()) FrameIdx = 0; else { const PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>(); @@ -757,6 +732,7 @@ PPCRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, MachineBasicBlock &MBB = *MI.getParent(); // Get the basic block's function. MachineFunction &MF = *MBB.getParent(); + const PPCSubtarget &Subtarget = MF.getSubtarget<PPCSubtarget>(); // Get the instruction info. const TargetInstrInfo &TII = *Subtarget.getInstrInfo(); // Get the frame info. @@ -847,7 +823,7 @@ PPCRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, // The offset doesn't fit into a single register, scavenge one to build the // offset in. - bool is64Bit = Subtarget.isPPC64(); + bool is64Bit = TM.isPPC64(); const TargetRegisterClass *G8RC = &PPC::G8RCRegClass; const TargetRegisterClass *GPRC = &PPC::GPRCRegClass; const TargetRegisterClass *RC = is64Bit ? G8RC : GPRC; @@ -885,23 +861,25 @@ PPCRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, } unsigned PPCRegisterInfo::getFrameRegister(const MachineFunction &MF) const { + const PPCSubtarget &Subtarget = MF.getSubtarget<PPCSubtarget>(); const TargetFrameLowering *TFI = Subtarget.getFrameLowering(); - if (!Subtarget.isPPC64()) + if (!TM.isPPC64()) return TFI->hasFP(MF) ? PPC::R31 : PPC::R1; else return TFI->hasFP(MF) ? 
PPC::X31 : PPC::X1; } unsigned PPCRegisterInfo::getBaseRegister(const MachineFunction &MF) const { + const PPCSubtarget &Subtarget = MF.getSubtarget<PPCSubtarget>(); if (!hasBasePointer(MF)) return getFrameRegister(MF); - if (Subtarget.isPPC64()) + if (TM.isPPC64()) return PPC::X30; if (Subtarget.isSVR4ABI() && - MF.getTarget().getRelocationModel() == Reloc::PIC_) + TM.getRelocationModel() == Reloc::PIC_) return PPC::R29; return PPC::R30; @@ -927,6 +905,7 @@ bool PPCRegisterInfo::canRealignStack(const MachineFunction &MF) const { } bool PPCRegisterInfo::needsStackRealignment(const MachineFunction &MF) const { + const PPCSubtarget &Subtarget = MF.getSubtarget<PPCSubtarget>(); const MachineFrameInfo *MFI = MF.getFrameInfo(); const Function *F = MF.getFunction(); unsigned StackAlign = Subtarget.getFrameLowering()->getStackAlignment(); @@ -964,7 +943,7 @@ needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const { MachineBasicBlock &MBB = *MI->getParent(); MachineFunction &MF = *MBB.getParent(); - + const PPCSubtarget &Subtarget = MF.getSubtarget<PPCSubtarget>(); const PPCFrameLowering *PPCFI = static_cast<const PPCFrameLowering *>(Subtarget.getFrameLowering()); unsigned StackEst = @@ -983,7 +962,7 @@ needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const { // The frame pointer will point to the end of the stack, so estimate the // offset as the difference between the object offset and the FP location. - return !isFrameOffsetLegal(MI, Offset); + return !isFrameOffsetLegal(MI, getBaseRegister(MF), Offset); } /// Insert defining instruction(s) for BaseReg to @@ -992,7 +971,7 @@ void PPCRegisterInfo:: materializeFrameBaseRegister(MachineBasicBlock *MBB, unsigned BaseReg, int FrameIdx, int64_t Offset) const { - unsigned ADDriOpc = Subtarget.isPPC64() ? PPC::ADDI8 : PPC::ADDI; + unsigned ADDriOpc = TM.isPPC64() ? 
PPC::ADDI8 : PPC::ADDI; MachineBasicBlock::iterator Ins = MBB->begin(); DebugLoc DL; // Defaults to "unknown" @@ -1000,6 +979,7 @@ materializeFrameBaseRegister(MachineBasicBlock *MBB, DL = Ins->getDebugLoc(); const MachineFunction &MF = *MBB->getParent(); + const PPCSubtarget &Subtarget = MF.getSubtarget<PPCSubtarget>(); const TargetInstrInfo &TII = *Subtarget.getInstrInfo(); const MCInstrDesc &MCID = TII.get(ADDriOpc); MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); @@ -1025,6 +1005,7 @@ void PPCRegisterInfo::resolveFrameIndex(MachineInstr &MI, unsigned BaseReg, MachineBasicBlock &MBB = *MI.getParent(); MachineFunction &MF = *MBB.getParent(); + const PPCSubtarget &Subtarget = MF.getSubtarget<PPCSubtarget>(); const TargetInstrInfo &TII = *Subtarget.getInstrInfo(); const MCInstrDesc &MCID = MI.getDesc(); MachineRegisterInfo &MRI = MF.getRegInfo(); @@ -1033,6 +1014,7 @@ void PPCRegisterInfo::resolveFrameIndex(MachineInstr &MI, unsigned BaseReg, } bool PPCRegisterInfo::isFrameOffsetLegal(const MachineInstr *MI, + unsigned BaseReg, int64_t Offset) const { unsigned FIOperandNum = 0; while (!MI->getOperand(FIOperandNum).isFI()) { diff --git a/lib/Target/PowerPC/PPCRegisterInfo.h b/lib/Target/PowerPC/PPCRegisterInfo.h index 4c2ef90..d304e1d 100644 --- a/lib/Target/PowerPC/PPCRegisterInfo.h +++ b/lib/Target/PowerPC/PPCRegisterInfo.h @@ -22,15 +22,44 @@ #include "PPCGenRegisterInfo.inc" namespace llvm { -class PPCSubtarget; -class TargetInstrInfo; -class Type; + +inline static unsigned getCRFromCRBit(unsigned SrcReg) { + unsigned Reg = 0; + if (SrcReg == PPC::CR0LT || SrcReg == PPC::CR0GT || + SrcReg == PPC::CR0EQ || SrcReg == PPC::CR0UN) + Reg = PPC::CR0; + else if (SrcReg == PPC::CR1LT || SrcReg == PPC::CR1GT || + SrcReg == PPC::CR1EQ || SrcReg == PPC::CR1UN) + Reg = PPC::CR1; + else if (SrcReg == PPC::CR2LT || SrcReg == PPC::CR2GT || + SrcReg == PPC::CR2EQ || SrcReg == PPC::CR2UN) + Reg = PPC::CR2; + else if (SrcReg == PPC::CR3LT || SrcReg == PPC::CR3GT || + SrcReg == PPC::CR3EQ || SrcReg == PPC::CR3UN) + Reg = PPC::CR3; + else if (SrcReg == PPC::CR4LT || SrcReg == PPC::CR4GT || + SrcReg == PPC::CR4EQ || SrcReg == PPC::CR4UN) + Reg = PPC::CR4; + else if (SrcReg == PPC::CR5LT || SrcReg == PPC::CR5GT || + SrcReg == PPC::CR5EQ || SrcReg == PPC::CR5UN) + Reg = PPC::CR5; + else if (SrcReg == PPC::CR6LT || SrcReg == PPC::CR6GT || + SrcReg == PPC::CR6EQ || SrcReg == PPC::CR6UN) + Reg = PPC::CR6; + else if (SrcReg == PPC::CR7LT || SrcReg == PPC::CR7GT || + SrcReg == PPC::CR7EQ || SrcReg == PPC::CR7UN) + Reg = PPC::CR7; + + assert(Reg != 0 && "Invalid CR bit register"); + return Reg; +} + class PPCRegisterInfo : public PPCGenRegisterInfo { DenseMap<unsigned, unsigned> ImmToIdxMap; - const PPCSubtarget &Subtarget; + const PPCTargetMachine &TM; public: - PPCRegisterInfo(const PPCSubtarget &SubTarget); + PPCRegisterInfo(const PPCTargetMachine &TM); /// getPointerRegClass - Return the register class to use to hold pointers. /// This is used for addressing modes. @@ -40,13 +69,14 @@ public: unsigned getRegPressureLimit(const TargetRegisterClass *RC, MachineFunction &MF) const override; - const TargetRegisterClass* - getLargestLegalSuperClass(const TargetRegisterClass *RC) const override; + const TargetRegisterClass * + getLargestLegalSuperClass(const TargetRegisterClass *RC, + const MachineFunction &MF) const override; /// Code Generation virtual methods... 
- const MCPhysReg * - getCalleeSavedRegs(const MachineFunction* MF =nullptr) const override; - const uint32_t *getCallPreservedMask(CallingConv::ID CC) const override; + const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF) const override; + const uint32_t *getCallPreservedMask(const MachineFunction &MF, + CallingConv::ID CC) const override; const uint32_t *getNoPreservedMask() const; void adjustStackMapLiveOutMask(uint32_t *Mask) const override; @@ -97,7 +127,7 @@ public: int64_t Offset) const override; void resolveFrameIndex(MachineInstr &MI, unsigned BaseReg, int64_t Offset) const override; - bool isFrameOffsetLegal(const MachineInstr *MI, + bool isFrameOffsetLegal(const MachineInstr *MI, unsigned BaseReg, int64_t Offset) const override; // Debug information queries. diff --git a/lib/Target/PowerPC/PPCRegisterInfo.td b/lib/Target/PowerPC/PPCRegisterInfo.td index 9a7df96..6ca68ed 100644 --- a/lib/Target/PowerPC/PPCRegisterInfo.td +++ b/lib/Target/PowerPC/PPCRegisterInfo.td @@ -341,6 +341,8 @@ def CRBITRC : RegisterClass<"PPC", [i1], 32, def CRRC : RegisterClass<"PPC", [i32], 32, (add CR0, CR1, CR5, CR6, CR7, CR2, CR3, CR4)>; +def CRRC0 : RegisterClass<"PPC", [i32], 32, (add CR0)>; + // The CTR registers are not allocatable because they're used by the // decrement-and-branch instructions, and thus need to stay live across // multiple basic blocks. diff --git a/lib/Target/PowerPC/PPCSchedule.td b/lib/Target/PowerPC/PPCSchedule.td index 2f3a1f9..d0954a1 100644 --- a/lib/Target/PowerPC/PPCSchedule.td +++ b/lib/Target/PowerPC/PPCSchedule.td @@ -124,401 +124,3 @@ include "PPCScheduleP8.td" include "PPCScheduleA2.td" include "PPCScheduleE500mc.td" include "PPCScheduleE5500.td" - -//===----------------------------------------------------------------------===// -// Instruction to itinerary class map - When add new opcodes to the supported -// set, refer to the following table to determine which itinerary class the -// opcode belongs. -// -// opcode itinerary class -// ====== =============== -// add IIC_IntSimple -// addc IIC_IntGeneral -// adde IIC_IntGeneral -// addi IIC_IntSimple -// addic IIC_IntGeneral -// addic. IIC_IntGeneral -// addis IIC_IntSimple -// addme IIC_IntGeneral -// addze IIC_IntGeneral -// and IIC_IntSimple -// andc IIC_IntSimple -// andi. IIC_IntGeneral -// andis. 
IIC_IntGeneral -// b IIC_BrB -// bc IIC_BrB -// bcctr IIC_BrB -// bclr IIC_BrB -// cmp IIC_IntCompare -// cmpi IIC_IntCompare -// cmpl IIC_IntCompare -// cmpli IIC_IntCompare -// cntlzd IIC_IntRotateD -// cntlzw IIC_IntGeneral -// crand IIC_BrCR -// crandc IIC_BrCR -// creqv IIC_BrCR -// crnand IIC_BrCR -// crnor IIC_BrCR -// cror IIC_BrCR -// crorc IIC_BrCR -// crxor IIC_BrCR -// dcba IIC_LdStDCBA -// dcbf IIC_LdStDCBF -// dcbi IIC_LdStDCBI -// dcbst IIC_LdStDCBF -// dcbt IIC_LdStLoad -// dcbtst IIC_LdStLoad -// dcbz IIC_LdStDCBF -// divd IIC_IntDivD -// divdu IIC_IntDivD -// divw IIC_IntDivW -// divwu IIC_IntDivW -// dss IIC_LdStDSS -// dst IIC_LdStDSS -// dstst IIC_LdStDSS -// eciwx IIC_LdStLoad -// ecowx IIC_LdStLoad -// eieio IIC_LdStLoad -// eqv IIC_IntSimple -// extsb IIC_IntSimple -// extsh IIC_IntSimple -// extsw IIC_IntSimple -// fabs IIC_FPGeneral -// fadd IIC_FPAddSub -// fadds IIC_FPGeneral -// fcfid IIC_FPGeneral -// fcmpo IIC_FPCompare -// fcmpu IIC_FPCompare -// fctid IIC_FPGeneral -// fctidz IIC_FPGeneral -// fctiw IIC_FPGeneral -// fctiwz IIC_FPGeneral -// fdiv IIC_FPDivD -// fdivs IIC_FPDivS -// fmadd IIC_FPFused -// fmadds IIC_FPGeneral -// fmr IIC_FPGeneral -// fmsub IIC_FPFused -// fmsubs IIC_FPGeneral -// fmul IIC_FPFused -// fmuls IIC_FPGeneral -// fnabs IIC_FPGeneral -// fneg IIC_FPGeneral -// fnmadd IIC_FPFused -// fnmadds IIC_FPGeneral -// fnmsub IIC_FPFused -// fnmsubs IIC_FPGeneral -// fres IIC_FPRes -// frsp IIC_FPGeneral -// frsqrte IIC_FPGeneral -// fsel IIC_FPGeneral -// fsqrt IIC_FPSqrtD -// fsqrts IIC_FPSqrtS -// fsub IIC_FPAddSub -// fsubs IIC_FPGeneral -// icbi IIC_LdStICBI -// isel IIC_IntISEL -// isync IIC_SprISYNC -// lbz IIC_LdStLoad -// lbzu IIC_LdStLoadUpd -// lbzux IIC_LdStLoadUpdX -// lbzx IIC_LdStLoad -// ld IIC_LdStLD -// ldarx IIC_LdStLDARX -// ldu IIC_LdStLDU -// ldux IIC_LdStLDUX -// ldx IIC_LdStLD -// lfd IIC_LdStLFD -// lfdu IIC_LdStLFDU -// lfdux IIC_LdStLFDUX -// lfdx IIC_LdStLFD -// lfs IIC_LdStLFD -// lfsu IIC_LdStLFDU -// lfsux IIC_LdStLFDUX -// lfsx IIC_LdStLFD -// lha IIC_LdStLHA -// lhau IIC_LdStLHAU -// lhaux IIC_LdStLHAUX -// lhax IIC_LdStLHA -// lhbrx IIC_LdStLoad -// lhz IIC_LdStLoad -// lhzu IIC_LdStLoadUpd -// lhzux IIC_LdStLoadUpdX -// lhzx IIC_LdStLoad -// lmw IIC_LdStLMW -// lswi IIC_LdStLMW -// lswx IIC_LdStLMW -// lvebx IIC_LdStLVecX -// lvehx IIC_LdStLVecX -// lvewx IIC_LdStLVecX -// lvsl IIC_LdStLVecX -// lvsr IIC_LdStLVecX -// lvx IIC_LdStLVecX -// lvxl IIC_LdStLVecX -// lwa IIC_LdStLWA -// lwarx IIC_LdStLWARX -// lwaux IIC_LdStLHAUX -// lwax IIC_LdStLHA -// lwbrx IIC_LdStLoad -// lwz IIC_LdStLoad -// lwzu IIC_LdStLoadUpd -// lwzux IIC_LdStLoadUpdX -// lwzx IIC_LdStLoad -// mcrf IIC_BrMCR -// mcrfs IIC_FPGeneral -// mcrxr IIC_BrMCRX -// mfcr IIC_SprMFCR -// mffs IIC_IntMFFS -// mfmsr IIC_SprMFMSR -// mfspr IIC_SprMFSPR -// mfsr IIC_SprMFSR -// mfsrin IIC_SprMFSR -// mftb IIC_SprMFTB -// mfvscr IIC_IntMFVSCR -// mtcrf IIC_BrMCRX -// mtfsb0 IIC_IntMTFSB0 -// mtfsb1 IIC_IntMTFSB0 -// mtfsf IIC_IntMTFSB0 -// mtfsfi IIC_IntMTFSB0 -// mtmsr IIC_SprMTMSR -// mtmsrd IIC_LdStLD -// mtspr IIC_SprMTSPR -// mtsr IIC_SprMTSR -// mtsrd IIC_IntMTSRD -// mtsrdin IIC_IntMTSRD -// mtsrin IIC_SprMTSRIN -// mtvscr IIC_IntMFVSCR -// mulhd IIC_IntMulHD -// mulhdu IIC_IntMulHD -// mulhw IIC_IntMulHW -// mulhwu IIC_IntMulHWU -// mulld IIC_IntMulHD -// mulli IIC_IntMulLI -// mullw IIC_IntMulHW -// nand IIC_IntSimple -// neg IIC_IntSimple -// nor IIC_IntSimple -// or IIC_IntSimple -// orc IIC_IntSimple -// ori IIC_IntSimple -// oris 
IIC_IntSimple -// rfi IIC_SprRFI -// rfid IIC_IntRFID -// rldcl IIC_IntRotateD -// rldcr IIC_IntRotateD -// rldic IIC_IntRotateDI -// rldicl IIC_IntRotateDI -// rldicr IIC_IntRotateDI -// rldimi IIC_IntRotateDI -// rlwimi IIC_IntRotate -// rlwinm IIC_IntGeneral -// rlwnm IIC_IntGeneral -// sc IIC_SprSC -// slbia IIC_LdStSLBIA -// slbie IIC_LdStSLBIE -// sld IIC_IntRotateD -// slw IIC_IntGeneral -// srad IIC_IntRotateD -// sradi IIC_IntRotateDI -// sraw IIC_IntShift -// srawi IIC_IntShift -// srd IIC_IntRotateD -// srw IIC_IntGeneral -// stb IIC_LdStStore -// stbu IIC_LdStStoreUpd -// stbux IIC_LdStStoreUpd -// stbx IIC_LdStStore -// std IIC_LdStSTD -// stdcx. IIC_LdStSTDCX -// stdu IIC_LdStSTDU -// stdux IIC_LdStSTDUX -// stdx IIC_LdStSTD -// stfd IIC_LdStSTFD -// stfdu IIC_LdStSTFDU -// stfdux IIC_LdStSTFDU -// stfdx IIC_LdStSTFD -// stfiwx IIC_LdStSTFD -// stfs IIC_LdStSTFD -// stfsu IIC_LdStSTFDU -// stfsux IIC_LdStSTFDU -// stfsx IIC_LdStSTFD -// sth IIC_LdStStore -// sthbrx IIC_LdStStore -// sthu IIC_LdStStoreUpd -// sthux IIC_LdStStoreUpd -// sthx IIC_LdStStore -// stmw IIC_LdStLMW -// stswi IIC_LdStLMW -// stswx IIC_LdStLMW -// stvebx IIC_LdStSTVEBX -// stvehx IIC_LdStSTVEBX -// stvewx IIC_LdStSTVEBX -// stvx IIC_LdStSTVEBX -// stvxl IIC_LdStSTVEBX -// stw IIC_LdStStore -// stwbrx IIC_LdStStore -// stwcx. IIC_LdStSTWCX -// stwu IIC_LdStStoreUpd -// stwux IIC_LdStStoreUpd -// stwx IIC_LdStStore -// subf IIC_IntGeneral -// subfc IIC_IntGeneral -// subfe IIC_IntGeneral -// subfic IIC_IntGeneral -// subfme IIC_IntGeneral -// subfze IIC_IntGeneral -// sync IIC_LdStSync -// td IIC_IntTrapD -// tdi IIC_IntTrapD -// tlbia IIC_LdStSLBIA -// tlbie IIC_LdStDCBF -// tlbsync IIC_SprTLBSYNC -// tw IIC_IntTrapW -// twi IIC_IntTrapW -// vaddcuw IIC_VecGeneral -// vaddfp IIC_VecFP -// vaddsbs IIC_VecGeneral -// vaddshs IIC_VecGeneral -// vaddsws IIC_VecGeneral -// vaddubm IIC_VecGeneral -// vaddubs IIC_VecGeneral -// vadduhm IIC_VecGeneral -// vadduhs IIC_VecGeneral -// vadduwm IIC_VecGeneral -// vadduws IIC_VecGeneral -// vand IIC_VecGeneral -// vandc IIC_VecGeneral -// vavgsb IIC_VecGeneral -// vavgsh IIC_VecGeneral -// vavgsw IIC_VecGeneral -// vavgub IIC_VecGeneral -// vavguh IIC_VecGeneral -// vavguw IIC_VecGeneral -// vcfsx IIC_VecFP -// vcfux IIC_VecFP -// vcmpbfp IIC_VecFPCompare -// vcmpeqfp IIC_VecFPCompare -// vcmpequb IIC_VecGeneral -// vcmpequh IIC_VecGeneral -// vcmpequw IIC_VecGeneral -// vcmpgefp IIC_VecFPCompare -// vcmpgtfp IIC_VecFPCompare -// vcmpgtsb IIC_VecGeneral -// vcmpgtsh IIC_VecGeneral -// vcmpgtsw IIC_VecGeneral -// vcmpgtub IIC_VecGeneral -// vcmpgtuh IIC_VecGeneral -// vcmpgtuw IIC_VecGeneral -// vctsxs IIC_VecFP -// vctuxs IIC_VecFP -// vexptefp IIC_VecFP -// vlogefp IIC_VecFP -// vmaddfp IIC_VecFP -// vmaxfp IIC_VecFPCompare -// vmaxsb IIC_VecGeneral -// vmaxsh IIC_VecGeneral -// vmaxsw IIC_VecGeneral -// vmaxub IIC_VecGeneral -// vmaxuh IIC_VecGeneral -// vmaxuw IIC_VecGeneral -// vmhaddshs IIC_VecComplex -// vmhraddshs IIC_VecComplex -// vminfp IIC_VecFPCompare -// vminsb IIC_VecGeneral -// vminsh IIC_VecGeneral -// vminsw IIC_VecGeneral -// vminub IIC_VecGeneral -// vminuh IIC_VecGeneral -// vminuw IIC_VecGeneral -// vmladduhm IIC_VecComplex -// vmrghb IIC_VecPerm -// vmrghh IIC_VecPerm -// vmrghw IIC_VecPerm -// vmrglb IIC_VecPerm -// vmrglh IIC_VecPerm -// vmrglw IIC_VecPerm -// vmsubfp IIC_VecFP -// vmsummbm IIC_VecComplex -// vmsumshm IIC_VecComplex -// vmsumshs IIC_VecComplex -// vmsumubm IIC_VecComplex -// vmsumuhm IIC_VecComplex -// vmsumuhs IIC_VecComplex 
-// vmulesb IIC_VecComplex -// vmulesh IIC_VecComplex -// vmuleub IIC_VecComplex -// vmuleuh IIC_VecComplex -// vmulosb IIC_VecComplex -// vmulosh IIC_VecComplex -// vmuloub IIC_VecComplex -// vmulouh IIC_VecComplex -// vnor IIC_VecGeneral -// vor IIC_VecGeneral -// vperm IIC_VecPerm -// vpkpx IIC_VecPerm -// vpkshss IIC_VecPerm -// vpkshus IIC_VecPerm -// vpkswss IIC_VecPerm -// vpkswus IIC_VecPerm -// vpkuhum IIC_VecPerm -// vpkuhus IIC_VecPerm -// vpkuwum IIC_VecPerm -// vpkuwus IIC_VecPerm -// vrefp IIC_VecFPRound -// vrfim IIC_VecFPRound -// vrfin IIC_VecFPRound -// vrfip IIC_VecFPRound -// vrfiz IIC_VecFPRound -// vrlb IIC_VecGeneral -// vrlh IIC_VecGeneral -// vrlw IIC_VecGeneral -// vrsqrtefp IIC_VecFP -// vsel IIC_VecGeneral -// vsl IIC_VecVSL -// vslb IIC_VecGeneral -// vsldoi IIC_VecPerm -// vslh IIC_VecGeneral -// vslo IIC_VecPerm -// vslw IIC_VecGeneral -// vspltb IIC_VecPerm -// vsplth IIC_VecPerm -// vspltisb IIC_VecPerm -// vspltish IIC_VecPerm -// vspltisw IIC_VecPerm -// vspltw IIC_VecPerm -// vsr IIC_VecVSR -// vsrab IIC_VecGeneral -// vsrah IIC_VecGeneral -// vsraw IIC_VecGeneral -// vsrb IIC_VecGeneral -// vsrh IIC_VecGeneral -// vsro IIC_VecPerm -// vsrw IIC_VecGeneral -// vsubcuw IIC_VecGeneral -// vsubfp IIC_VecFP -// vsubsbs IIC_VecGeneral -// vsubshs IIC_VecGeneral -// vsubsws IIC_VecGeneral -// vsububm IIC_VecGeneral -// vsububs IIC_VecGeneral -// vsubuhm IIC_VecGeneral -// vsubuhs IIC_VecGeneral -// vsubuwm IIC_VecGeneral -// vsubuws IIC_VecGeneral -// vsum2sws IIC_VecComplex -// vsum4sbs IIC_VecComplex -// vsum4shs IIC_VecComplex -// vsum4ubs IIC_VecComplex -// vsumsws IIC_VecComplex -// vupkhpx IIC_VecPerm -// vupkhsb IIC_VecPerm -// vupkhsh IIC_VecPerm -// vupklpx IIC_VecPerm -// vupklsb IIC_VecPerm -// vupklsh IIC_VecPerm -// vxor IIC_VecGeneral -// xor IIC_IntSimple -// xori IIC_IntSimple -// xoris IIC_IntSimple -// diff --git a/lib/Target/PowerPC/PPCSubtarget.cpp b/lib/Target/PowerPC/PPCSubtarget.cpp index c91428d..ed88803 100644 --- a/lib/Target/PowerPC/PPCSubtarget.cpp +++ b/lib/Target/PowerPC/PPCSubtarget.cpp @@ -69,6 +69,7 @@ void PPCSubtarget::initializeEnvironment() { HasVSX = false; HasP8Vector = false; HasP8Altivec = false; + HasP8Crypto = false; HasFCPSGN = false; HasFSQRT = false; HasFRE = false; @@ -94,7 +95,9 @@ void PPCSubtarget::initializeEnvironment() { HasLazyResolverStubs = false; HasICBT = false; HasInvariantFunctionDescriptors = false; + HasPartwordAtomics = false; IsQPXStackUnaligned = false; + HasHTM = false; } void PPCSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) { diff --git a/lib/Target/PowerPC/PPCSubtarget.h b/lib/Target/PowerPC/PPCSubtarget.h index 247a96d..b4c1bb1 100644 --- a/lib/Target/PowerPC/PPCSubtarget.h +++ b/lib/Target/PowerPC/PPCSubtarget.h @@ -90,6 +90,7 @@ protected: bool HasVSX; bool HasP8Vector; bool HasP8Altivec; + bool HasP8Crypto; bool HasFCPSGN; bool HasFSQRT; bool HasFRE, HasFRES, HasFRSQRTE, HasFRSQRTES; @@ -113,6 +114,8 @@ protected: bool IsLittleEndian; bool HasICBT; bool HasInvariantFunctionDescriptors; + bool HasPartwordAtomics; + bool HasHTM; /// When targeting QPX running a stock PPC64 Linux kernel where the stack /// alignment has not been changed, we need to keep the 16-byte alignment @@ -218,6 +221,7 @@ public: bool hasVSX() const { return HasVSX; } bool hasP8Vector() const { return HasP8Vector; } bool hasP8Altivec() const { return HasP8Altivec; } + bool hasP8Crypto() const { return HasP8Crypto; } bool hasMFOCRF() const { return HasMFOCRF; } bool hasISEL() const { return HasISEL; } bool 
hasPOPCNTD() const { return HasPOPCNTD; } @@ -234,6 +238,7 @@ public: bool hasInvariantFunctionDescriptors() const { return HasInvariantFunctionDescriptors; } + bool hasPartwordAtomics() const { return HasPartwordAtomics; } bool isQPXStackUnaligned() const { return IsQPXStackUnaligned; } unsigned getPlatformStackAlignment() const { @@ -242,6 +247,7 @@ public: return 16; } + bool hasHTM() const { return HasHTM; } const Triple &getTargetTriple() const { return TargetTriple; } diff --git a/lib/Target/PowerPC/PPCTargetMachine.cpp b/lib/Target/PowerPC/PPCTargetMachine.cpp index b219e93..7267529 100644 --- a/lib/Target/PowerPC/PPCTargetMachine.cpp +++ b/lib/Target/PowerPC/PPCTargetMachine.cpp @@ -160,11 +160,10 @@ PPCTargetMachine::PPCTargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS, const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL) - : LLVMTargetMachine(T, TT, CPU, computeFSAdditions(FS, OL, TT), Options, RM, - CM, OL), + : LLVMTargetMachine(T, getDataLayoutString(Triple(TT)), TT, CPU, + computeFSAdditions(FS, OL, TT), Options, RM, CM, OL), TLOF(createTLOF(Triple(getTargetTriple()))), - TargetABI(computeTargetABI(Triple(TT), Options)), - DL(getDataLayoutString(Triple(TT))), Subtarget(TT, CPU, TargetFS, *this) { + TargetABI(computeTargetABI(Triple(TT), Options)) { initAsmInfo(); } @@ -208,7 +207,15 @@ PPCTargetMachine::getSubtargetImpl(const Function &F) const { // creation will depend on the TM and the code generation flags on the // function that reside in TargetOptions. resetTargetOptions(F); - I = llvm::make_unique<PPCSubtarget>(TargetTriple, CPU, FS, *this); + I = llvm::make_unique<PPCSubtarget>( + TargetTriple, CPU, + // FIXME: It would be good to have the subtarget additions here + // not necessary. Anything that turns them on/off (overrides) ends + // up being put at the end of the feature string, but the defaults + // shouldn't require adding them. Fixing this means pulling Feature64Bit + // out of most of the target cpus in the .td file and making it set only + // as part of initialization via the TargetTriple. 
+ computeFSAdditions(FS, getOptLevel(), getTargetTriple()), *this); } return I.get(); } diff --git a/lib/Target/PowerPC/PPCTargetMachine.h b/lib/Target/PowerPC/PPCTargetMachine.h index 6508484..7a49058 100644 --- a/lib/Target/PowerPC/PPCTargetMachine.h +++ b/lib/Target/PowerPC/PPCTargetMachine.h @@ -29,10 +29,6 @@ public: private: std::unique_ptr<TargetLoweringObjectFile> TLOF; PPCABI TargetABI; - // Calculates type size & alignment - const DataLayout DL; - PPCSubtarget Subtarget; - mutable StringMap<std::unique_ptr<PPCSubtarget>> SubtargetMap; public: @@ -42,8 +38,6 @@ public: ~PPCTargetMachine() override; - const DataLayout *getDataLayout() const override { return &DL; } - const PPCSubtarget *getSubtargetImpl() const override { return &Subtarget; } const PPCSubtarget *getSubtargetImpl(const Function &F) const override; // Pass Pipeline Configuration diff --git a/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/lib/Target/PowerPC/PPCTargetTransformInfo.cpp index 073bbb0..b46acd4 100644 --- a/lib/Target/PowerPC/PPCTargetTransformInfo.cpp +++ b/lib/Target/PowerPC/PPCTargetTransformInfo.cpp @@ -192,6 +192,10 @@ void PPCTTIImpl::getUnrollingPreferences(Loop *L, BaseT::getUnrollingPreferences(L, UP); } +bool PPCTTIImpl::enableAggressiveInterleaving(bool LoopHasReductions) { + return LoopHasReductions; +} + unsigned PPCTTIImpl::getNumberOfRegisters(bool Vector) { if (Vector && !ST->hasAltivec() && !ST->hasQPX()) return 0; diff --git a/lib/Target/PowerPC/PPCTargetTransformInfo.h b/lib/Target/PowerPC/PPCTargetTransformInfo.h index cef7079..21acea1 100644 --- a/lib/Target/PowerPC/PPCTargetTransformInfo.h +++ b/lib/Target/PowerPC/PPCTargetTransformInfo.h @@ -78,6 +78,7 @@ public: /// \name Vector TTI Implementations /// @{ + bool enableAggressiveInterleaving(bool LoopHasReductions); unsigned getNumberOfRegisters(bool Vector); unsigned getRegisterBitWidth(bool Vector); unsigned getMaxInterleaveFactor(); diff --git a/lib/Target/PowerPC/README.txt b/lib/Target/PowerPC/README.txt index 4132b04..dfe988f 100644 --- a/lib/Target/PowerPC/README.txt +++ b/lib/Target/PowerPC/README.txt @@ -621,3 +621,10 @@ void foo() { bar(x); __asm__("" ::: "cr2"); } + +//===----------------------------------------------------------------------===// + +Instruction fusion was introduced in ISA 2.06 and more opportunities added in +ISA 2.07. LLVM needs to add infrastructure to recognize fusion opportunities +and force instruction pairs to be scheduled together. + diff --git a/lib/Target/PowerPC/README_ALTIVEC.txt b/lib/Target/PowerPC/README_ALTIVEC.txt index 1e4c6fb..43d87d3 100644 --- a/lib/Target/PowerPC/README_ALTIVEC.txt +++ b/lib/Target/PowerPC/README_ALTIVEC.txt @@ -209,3 +209,107 @@ vector float f(vector float a, vector float b) { return b; } +//===----------------------------------------------------------------------===// + +We should do a little better with eliminating dead stores. 
+The stores to the stack are dead since %a and %b are not needed.
+
+; Function Attrs: nounwind
+define <16 x i8> @test_vpmsumb() #0 {
+  entry:
+  %a = alloca <16 x i8>, align 16
+  %b = alloca <16 x i8>, align 16
+  store <16 x i8> <i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16>, <16 x i8>* %a, align 16
+  store <16 x i8> <i8 113, i8 114, i8 115, i8 116, i8 117, i8 118, i8 119, i8 120, i8 121, i8 122, i8 123, i8 124, i8 125, i8 126, i8 127, i8 112>, <16 x i8>* %b, align 16
+  %0 = load <16 x i8>* %a, align 16
+  %1 = load <16 x i8>* %b, align 16
+  %2 = call <16 x i8> @llvm.ppc.altivec.crypto.vpmsumb(<16 x i8> %0, <16 x i8> %1)
+  ret <16 x i8> %2
+}
+
+
+; Function Attrs: nounwind readnone
+declare <16 x i8> @llvm.ppc.altivec.crypto.vpmsumb(<16 x i8>, <16 x i8>) #1
+
+
+Produces the following code with -mtriple=powerpc64-unknown-linux-gnu:
+# BB#0:                                 # %entry
+    addis 3, 2, .LCPI0_0@toc@ha
+    addis 4, 2, .LCPI0_1@toc@ha
+    addi 3, 3, .LCPI0_0@toc@l
+    addi 4, 4, .LCPI0_1@toc@l
+    lxvw4x 0, 0, 3
+    addi 3, 1, -16
+    lxvw4x 35, 0, 4
+    stxvw4x 0, 0, 3
+    ori 2, 2, 0
+    lxvw4x 34, 0, 3
+    addi 3, 1, -32
+    stxvw4x 35, 0, 3
+    vpmsumb 2, 2, 3
+    blr
+    .long 0
+    .quad 0
+
+The two stxvw4x instructions are not needed.
+With -mtriple=powerpc64le-unknown-linux-gnu, the associated permutes
+are present too.
+
+//===----------------------------------------------------------------------===//
+
+The following example is found in test/CodeGen/PowerPC/vec_add_sub_doubleword.ll:
+
+define <2 x i64> @increment_by_val(<2 x i64> %x, i64 %val) nounwind {
+  %tmpvec = insertelement <2 x i64> <i64 0, i64 0>, i64 %val, i32 0
+  %tmpvec2 = insertelement <2 x i64> %tmpvec, i64 %val, i32 1
+  %result = add <2 x i64> %x, %tmpvec2
+  ret <2 x i64> %result
+}
+
+This will generate the following instruction sequence:
+    std 5, -8(1)
+    std 5, -16(1)
+    addi 3, 1, -16
+    ori 2, 2, 0
+    lxvd2x 35, 0, 3
+    vaddudm 2, 2, 3
+    blr
+
+This will almost certainly cause a load-hit-store hazard.
+Since val is a value parameter, it should not need to be saved onto
+the stack, unless it's being done to set up the vector register. Instead,
+it would be better to splat the value into a vector register, and then
+remove the (dead) stores to the stack.
+
+//===----------------------------------------------------------------------===//
+
+At the moment we always generate a lxsdx in preference to lfd, or stxsdx in
+preference to stfd. When we have a reg-immediate addressing mode, this is a
+poor choice, since we have to load the address into an index register. This
+should be fixed for P7/P8.
+
+//===----------------------------------------------------------------------===//
+
+Right now, ShuffleKind 0 is supported only on BE, and ShuffleKind 2 only on LE.
+However, we could actually support both kinds on either endianness, if we check
+for the appropriate shufflevector pattern for each case ... this would cause
+some additional shufflevectors to be recognized and implemented via the
+"swapped" form.
+
+//===----------------------------------------------------------------------===//
+
+There is a utility program called PerfectShuffle that generates a table of the
+shortest instruction sequence for implementing a shufflevector operation on
+PowerPC. However, this was designed for big-endian code generation. We could
+modify this program to create a little endian version of the table. The table
+is used in PPCISelLowering.cpp, PPCTargetLowering::LowerVECTOR_SHUFFLE().
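+
+//===----------------------------------------------------------------------===//
+
+A sketch of the source-level splat idiom suggested for increment_by_val above,
+assuming a POWER8 target and the altivec.h intrinsics vec_splats and vec_add;
+this is an illustration, not committed code:
+
+#include <altivec.h>
+
+// Compile with e.g. -mcpu=power8 -maltivec. vec_splats copies val into both
+// doubleword lanes, and the lane-wise add should select vaddudm directly,
+// avoiding the stack round trip and the load-hit-store hazard.
+vector unsigned long long increment_by_val(vector unsigned long long x,
+                                           unsigned long long val) {
+  return vec_add(x, vec_splats(val));
+}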
+
+//===----------------------------------------------------------------------===//
+
+Opportunities to use instructions from PPCInstrVSX.td during code gen
+  - Conversion instructions (Sections 7.6.1.5 and 7.6.1.6 of ISA 2.07)
+  - Scalar comparisons (xscmpodp and xscmpudp)
+  - Min and max (xsmaxdp, xsmindp, xvmaxdp, xvmindp, xvmaxsp, xvminsp)
+
+Related to this: we currently do not generate the lxvw4x instruction for either
+v4f32 or v4i32, probably because adding a dag pattern to the recognizer requires
+a single target type. This should probably be addressed in the PPCISelDAGToDAG logic.
diff --git a/lib/Target/R600/AMDGPU.td b/lib/Target/R600/AMDGPU.td
index a7d48b3..e5d5ce2 100644
--- a/lib/Target/R600/AMDGPU.td
+++ b/lib/Target/R600/AMDGPU.td
@@ -103,6 +103,11 @@ def FeatureVGPRSpilling : SubtargetFeature<"vgpr-spilling",
         "true",
         "Enable spilling of VGPRs to scratch memory">;
 
+def FeatureSGPRInitBug : SubtargetFeature<"sgpr-init-bug",
+        "SGPRInitBug",
+        "true",
+        "VI SGPR initialization bug requiring a fixed SGPR allocation size">;
+
 class SubtargetFeatureFetchLimit <string Value> :
                           SubtargetFeature <"fetch"#Value,
         "TexVTXClauseSize",
diff --git a/lib/Target/R600/AMDGPUAsmPrinter.cpp b/lib/Target/R600/AMDGPUAsmPrinter.cpp
index 92bc314..d911014 100644
--- a/lib/Target/R600/AMDGPUAsmPrinter.cpp
+++ b/lib/Target/R600/AMDGPUAsmPrinter.cpp
@@ -105,8 +105,6 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
 
   SetupMachineFunction(MF);
 
-  EmitFunctionHeader();
-
   MCContext &Context = getObjFileLowering().getContext();
   const MCSectionELF *ConfigSection =
       Context.getELFSection(".AMDGPU.config", ELF::SHT_PROGBITS, 0);
@@ -129,7 +127,6 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
     HexLines.clear();
     DisasmLineMaxLen = 0;
 
-    OutStreamer.SwitchSection(getObjFileLowering().getTextSection());
     EmitFunctionBody();
 
   if (isVerbose()) {
@@ -339,6 +336,13 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
   ProgInfo.NumVGPR = MaxVGPR + 1;
   ProgInfo.NumSGPR = MaxSGPR + 1;
 
+  if (STM.hasSGPRInitBug()) {
+    if (ProgInfo.NumSGPR > AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG)
+      llvm_unreachable("Too many SGPRs used with the SGPR init bug");
+
+    ProgInfo.NumSGPR = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG;
+  }
+
   ProgInfo.VGPRBlocks = (ProgInfo.NumVGPR - 1) / 4;
   ProgInfo.SGPRBlocks = (ProgInfo.NumSGPR - 1) / 8;
   // Set the value to initialize FP_ROUND and FP_DENORM parts of the mode
diff --git a/lib/Target/R600/AMDGPUISelDAGToDAG.cpp b/lib/Target/R600/AMDGPUISelDAGToDAG.cpp
index b5ab703..7341cd9 100644
--- a/lib/Target/R600/AMDGPUISelDAGToDAG.cpp
+++ b/lib/Target/R600/AMDGPUISelDAGToDAG.cpp
@@ -95,7 +95,8 @@ private:
                          SDValue &Idxen, SDValue &Addr64, SDValue &GLC,
                          SDValue &SLC, SDValue &TFE) const;
   bool SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, SDValue &VAddr,
-                         SDValue &SOffset, SDValue &Offset) const;
+                         SDValue &SOffset, SDValue &Offset, SDValue &GLC,
+                         SDValue &SLC, SDValue &TFE) const;
   bool SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
                          SDValue &VAddr, SDValue &SOffset, SDValue &Offset,
                          SDValue &SLC) const;
@@ -120,6 +121,11 @@ private:
   SDNode *SelectADD_SUB_I64(SDNode *N);
   SDNode *SelectDIV_SCALE(SDNode *N);
 
+  SDNode *getS_BFE(unsigned Opcode, SDLoc DL, SDValue Val,
+                   uint32_t Offset, uint32_t Width);
+  SDNode *SelectS_BFEFromShifts(SDNode *N);
+  SDNode *SelectS_BFE(SDNode *N);
+
   // Include the pieces autogenerated from the target description.
#include "AMDGPUGenDAGISel.inc" }; @@ -519,21 +525,11 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) { bool Signed = Opc == AMDGPUISD::BFE_I32; - // Transformation function, pack the offset and width of a BFE into - // the format expected by the S_BFE_I32 / S_BFE_U32. In the second - // source, bits [5:0] contain the offset and bits [22:16] the width. - uint32_t OffsetVal = Offset->getZExtValue(); uint32_t WidthVal = Width->getZExtValue(); - uint32_t PackedVal = OffsetVal | WidthVal << 16; - - SDValue PackedOffsetWidth = CurDAG->getTargetConstant(PackedVal, MVT::i32); - return CurDAG->getMachineNode(Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32, - SDLoc(N), - MVT::i32, - N->getOperand(0), - PackedOffsetWidth); + return getS_BFE(Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32, SDLoc(N), + N->getOperand(0), OffsetVal, WidthVal); } case AMDGPUISD::DIV_SCALE: { @@ -547,6 +543,14 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) { } case ISD::ADDRSPACECAST: return SelectAddrSpaceCast(N); + case ISD::AND: + case ISD::SRL: + case ISD::SRA: + if (N->getValueType(0) != MVT::i32 || + Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS) + break; + + return SelectS_BFE(N); } return SelectCode(N); @@ -966,8 +970,9 @@ void AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr, bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, SDValue &VAddr, SDValue &SOffset, - SDValue &Offset) const { - SDValue Ptr, Offen, Idxen, Addr64, GLC, SLC, TFE; + SDValue &Offset, SDValue &GLC, + SDValue &SLC, SDValue &TFE) const { + SDValue Ptr, Offen, Idxen, Addr64; SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64, GLC, SLC, TFE); @@ -991,8 +996,9 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, SDValue &Offset, SDValue &SLC) const { SLC = CurDAG->getTargetConstant(0, MVT::i1); + SDValue GLC, TFE; - return SelectMUBUFAddr64(Addr, SRsrc, VAddr, SOffset, Offset); + return SelectMUBUFAddr64(Addr, SRsrc, VAddr, SOffset, Offset, GLC, SLC, TFE); } bool AMDGPUDAGToDAGISel::SelectMUBUFScratch(SDValue Addr, SDValue &Rsrc, @@ -1147,6 +1153,95 @@ SDNode *AMDGPUDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) { return CurDAG->getNode(ISD::BITCAST, DL, DestVT, Src).getNode(); } +SDNode *AMDGPUDAGToDAGISel::getS_BFE(unsigned Opcode, SDLoc DL, SDValue Val, + uint32_t Offset, uint32_t Width) { + // Transformation function, pack the offset and width of a BFE into + // the format expected by the S_BFE_I32 / S_BFE_U32. In the second + // source, bits [5:0] contain the offset and bits [22:16] the width. + uint32_t PackedVal = Offset | (Width << 16); + SDValue PackedConst = CurDAG->getTargetConstant(PackedVal, MVT::i32); + + return CurDAG->getMachineNode(Opcode, DL, MVT::i32, Val, PackedConst); +} + +SDNode *AMDGPUDAGToDAGISel::SelectS_BFEFromShifts(SDNode *N) { + // "(a << b) srl c)" ---> "BFE_U32 a, (c-b), (32-c) + // "(a << b) sra c)" ---> "BFE_I32 a, (c-b), (32-c) + // Predicate: 0 < b <= c < 32 + + const SDValue &Shl = N->getOperand(0); + ConstantSDNode *B = dyn_cast<ConstantSDNode>(Shl->getOperand(1)); + ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1)); + + if (B && C) { + uint32_t BVal = B->getZExtValue(); + uint32_t CVal = C->getZExtValue(); + + if (0 < BVal && BVal <= CVal && CVal < 32) { + bool Signed = N->getOpcode() == ISD::SRA; + unsigned Opcode = Signed ? 
AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32; + + return getS_BFE(Opcode, SDLoc(N), Shl.getOperand(0), + CVal - BVal, 32 - CVal); + } + } + return SelectCode(N); +} + +SDNode *AMDGPUDAGToDAGISel::SelectS_BFE(SDNode *N) { + switch (N->getOpcode()) { + case ISD::AND: + if (N->getOperand(0).getOpcode() == ISD::SRL) { + // "(a srl b) & mask" ---> "BFE_U32 a, b, popcount(mask)" + // Predicate: isMask(mask) + const SDValue &Srl = N->getOperand(0); + ConstantSDNode *Shift = dyn_cast<ConstantSDNode>(Srl.getOperand(1)); + ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(N->getOperand(1)); + + if (Shift && Mask) { + uint32_t ShiftVal = Shift->getZExtValue(); + uint32_t MaskVal = Mask->getZExtValue(); + + if (isMask_32(MaskVal)) { + uint32_t WidthVal = countPopulation(MaskVal); + + return getS_BFE(AMDGPU::S_BFE_U32, SDLoc(N), Srl.getOperand(0), + ShiftVal, WidthVal); + } + } + } + break; + case ISD::SRL: + if (N->getOperand(0).getOpcode() == ISD::AND) { + // "(a & mask) srl b)" ---> "BFE_U32 a, b, popcount(mask >> b)" + // Predicate: isMask(mask >> b) + const SDValue &And = N->getOperand(0); + ConstantSDNode *Shift = dyn_cast<ConstantSDNode>(N->getOperand(1)); + ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(And->getOperand(1)); + + if (Shift && Mask) { + uint32_t ShiftVal = Shift->getZExtValue(); + uint32_t MaskVal = Mask->getZExtValue() >> ShiftVal; + + if (isMask_32(MaskVal)) { + uint32_t WidthVal = countPopulation(MaskVal); + + return getS_BFE(AMDGPU::S_BFE_U32, SDLoc(N), And.getOperand(0), + ShiftVal, WidthVal); + } + } + } else if (N->getOperand(0).getOpcode() == ISD::SHL) + return SelectS_BFEFromShifts(N); + break; + case ISD::SRA: + if (N->getOperand(0).getOpcode() == ISD::SHL) + return SelectS_BFEFromShifts(N); + break; + } + + return SelectCode(N); +} + bool AMDGPUDAGToDAGISel::SelectVOP3Mods(SDValue In, SDValue &Src, SDValue &SrcMods) const { diff --git a/lib/Target/R600/AMDGPUISelLowering.cpp b/lib/Target/R600/AMDGPUISelLowering.cpp index 4707279..62a33fa 100644 --- a/lib/Target/R600/AMDGPUISelLowering.cpp +++ b/lib/Target/R600/AMDGPUISelLowering.cpp @@ -885,9 +885,6 @@ SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, return LowerIntrinsicIABS(Op, DAG); case AMDGPUIntrinsic::AMDGPU_lrp: return LowerIntrinsicLRP(Op, DAG); - case AMDGPUIntrinsic::AMDGPU_fract: - case AMDGPUIntrinsic::AMDIL_fraction: // Legacy name. - return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1)); case AMDGPUIntrinsic::AMDGPU_clamp: case AMDGPUIntrinsic::AMDIL_clamp: // Legacy name. 
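A minimal sketch of the S_BFE operand packing used by getS_BFE above: per the
comment in the code, bits [5:0] of the second source hold the field offset and
bits [22:16] the width. The emulation below and the sample mask/shift values
are illustrative only, not hardware-verified semantics:

    #include <cassert>
    #include <cstdint>

    // Emulate S_BFE_U32: extract Width bits of Val starting at Offset,
    // with both fields packed into a single 32-bit operand.
    static uint32_t s_bfe_u32(uint32_t Val, uint32_t Packed) {
      uint32_t Offset = Packed & 0x3f;
      uint32_t Width = (Packed >> 16) & 0x7f;
      if (Width == 0)
        return 0;
      uint32_t FieldMask = (Width < 32) ? ((1u << Width) - 1) : ~0u;
      return (Val >> Offset) & FieldMask;
    }

    int main() {
      // "(a srl 3) & 0xff" becomes BFE_U32 a, 3, popcount(0xff) = 8,
      // packed as 3 | (8 << 16).
      uint32_t A = 0xdeadbeef;
      assert(s_bfe_u32(A, 3u | (8u << 16)) == ((A >> 3) & 0xff));
      return 0;
    }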
diff --git a/lib/Target/R600/AMDGPUInstrInfo.cpp b/lib/Target/R600/AMDGPUInstrInfo.cpp index f4de2d6..f0f10ca 100644 --- a/lib/Target/R600/AMDGPUInstrInfo.cpp +++ b/lib/Target/R600/AMDGPUInstrInfo.cpp @@ -31,7 +31,7 @@ using namespace llvm; void AMDGPUInstrInfo::anchor() {} AMDGPUInstrInfo::AMDGPUInstrInfo(const AMDGPUSubtarget &st) - : AMDGPUGenInstrInfo(-1,-1), RI(st), ST(st) { } + : AMDGPUGenInstrInfo(-1, -1), ST(st) {} const AMDGPURegisterInfo &AMDGPUInstrInfo::getRegisterInfo() const { return RI; @@ -152,26 +152,22 @@ bool AMDGPUInstrInfo::expandPostRAPseudo (MachineBasicBlock::iterator MI) const return true; } - -MachineInstr * -AMDGPUInstrInfo::foldMemoryOperandImpl(MachineFunction &MF, - MachineInstr *MI, - const SmallVectorImpl<unsigned> &Ops, - int FrameIndex) const { +MachineInstr *AMDGPUInstrInfo::foldMemoryOperandImpl(MachineFunction &MF, + MachineInstr *MI, + ArrayRef<unsigned> Ops, + int FrameIndex) const { // TODO: Implement this function return nullptr; } -MachineInstr* -AMDGPUInstrInfo::foldMemoryOperandImpl(MachineFunction &MF, - MachineInstr *MI, - const SmallVectorImpl<unsigned> &Ops, - MachineInstr *LoadMI) const { +MachineInstr * +AMDGPUInstrInfo::foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI, + ArrayRef<unsigned> Ops, + MachineInstr *LoadMI) const { // TODO: Implement this function return nullptr; } -bool -AMDGPUInstrInfo::canFoldMemoryOperand(const MachineInstr *MI, - const SmallVectorImpl<unsigned> &Ops) const { +bool AMDGPUInstrInfo::canFoldMemoryOperand(const MachineInstr *MI, + ArrayRef<unsigned> Ops) const { // TODO: Implement this function return false; } @@ -360,8 +356,8 @@ static enum SISubtarget AMDGPUSubtargetToSISubtarget(unsigned Gen) { } int AMDGPUInstrInfo::pseudoToMCOpcode(int Opcode) const { - int MCOp = AMDGPU::getMCOpcode(Opcode, - AMDGPUSubtargetToSISubtarget(RI.ST.getGeneration())); + int MCOp = AMDGPU::getMCOpcode( + Opcode, AMDGPUSubtargetToSISubtarget(ST.getGeneration())); // -1 means that Opcode is already a native instruction. if (MCOp == -1) diff --git a/lib/Target/R600/AMDGPUInstrInfo.h b/lib/Target/R600/AMDGPUInstrInfo.h index 202183c..07042b5 100644 --- a/lib/Target/R600/AMDGPUInstrInfo.h +++ b/lib/Target/R600/AMDGPUInstrInfo.h @@ -85,14 +85,13 @@ public: const TargetRegisterInfo *TRI) const override; protected: - MachineInstr *foldMemoryOperandImpl(MachineFunction &MF, - MachineInstr *MI, - const SmallVectorImpl<unsigned> &Ops, + MachineInstr *foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI, + ArrayRef<unsigned> Ops, int FrameIndex) const override; - MachineInstr *foldMemoryOperandImpl(MachineFunction &MF, - MachineInstr *MI, - const SmallVectorImpl<unsigned> &Ops, + MachineInstr *foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI, + ArrayRef<unsigned> Ops, MachineInstr *LoadMI) const override; + public: /// \returns the smallest register index that will be accessed by an indirect /// read or write or -1 if indirect addressing is not used by this program. 
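The SmallVectorImpl-to-ArrayRef migration in these signatures relies on
ArrayRef being a non-owning (pointer, length) view over contiguous memory, so
one signature accepts SmallVector, std::vector, or a braced list alike. A
small sketch of the call-site flexibility this buys, assuming only LLVM's ADT
headers (the sumOps helper is made up for illustration):

    #include "llvm/ADT/ArrayRef.h"
    #include "llvm/ADT/SmallVector.h"
    #include <vector>

    // One signature, many argument containers.
    static unsigned sumOps(llvm::ArrayRef<unsigned> Ops) {
      unsigned Sum = 0;
      for (unsigned Op : Ops)
        Sum += Op;
      return Sum;
    }

    int main() {
      llvm::SmallVector<unsigned, 4> SV;
      SV.push_back(1);
      SV.push_back(2);
      std::vector<unsigned> V;
      V.push_back(3);
      return sumOps(SV) + sumOps(V) + sumOps({4u, 5u}) == 15 ? 0 : 1;
    }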
@@ -103,7 +102,7 @@ public: int getIndirectIndexEnd(const MachineFunction &MF) const; bool canFoldMemoryOperand(const MachineInstr *MI, - const SmallVectorImpl<unsigned> &Ops) const override; + ArrayRef<unsigned> Ops) const override; bool unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI, unsigned Reg, bool UnfoldLoad, bool UnfoldStore, SmallVectorImpl<MachineInstr *> &NewMIs) const override; diff --git a/lib/Target/R600/AMDGPUInstructions.td b/lib/Target/R600/AMDGPUInstructions.td index 849b241..4d08201 100644 --- a/lib/Target/R600/AMDGPUInstructions.td +++ b/lib/Target/R600/AMDGPUInstructions.td @@ -578,22 +578,20 @@ class SHA256MaPattern <Instruction BFI_INT, Instruction XOR> : Pat < // Bitfield extract patterns -/* - -XXX: The BFE pattern is not working correctly because the XForm is not being -applied. +def IMMZeroBasedBitfieldMask : PatLeaf <(imm), [{ + return isMask_32(N->getZExtValue()); +}]>; -def legalshift32 : ImmLeaf <i32, [{return Imm >=0 && Imm < 32;}]>; -def bfemask : PatLeaf <(imm), [{return isMask_32(N->getZExtValue());}], - SDNodeXForm<imm, [{ return CurDAG->getTargetConstant(countTrailingOnes(N->getZExtValue()), MVT::i32);}]>>; +def IMMPopCount : SDNodeXForm<imm, [{ + return CurDAG->getTargetConstant(countPopulation(N->getZExtValue()), + MVT::i32); +}]>; -class BFEPattern <Instruction BFE> : Pat < - (and (srl i32:$x, legalshift32:$y), bfemask:$z), - (BFE $x, $y, $z) +class BFEPattern <Instruction BFE, Instruction MOV> : Pat < + (i32 (and (i32 (srl i32:$src, i32:$rshift)), IMMZeroBasedBitfieldMask:$mask)), + (BFE $src, $rshift, (MOV (i32 (IMMPopCount $mask)))) >; -*/ - // rotr pattern class ROTRPattern <Instruction BIT_ALIGN> : Pat < (rotr i32:$src0, i32:$src1), diff --git a/lib/Target/R600/AMDGPUIntrinsics.td b/lib/Target/R600/AMDGPUIntrinsics.td index eee9c29..ab489cd 100644 --- a/lib/Target/R600/AMDGPUIntrinsics.td +++ b/lib/Target/R600/AMDGPUIntrinsics.td @@ -68,6 +68,7 @@ let TargetPrefix = "AMDGPU", isTarget = 1 in { def int_AMDGPU_bfe_u32 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_AMDGPU_bfm : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_AMDGPU_brev : Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_flbit_i32 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>; def int_AMDGPU_barrier_local : Intrinsic<[], [], []>; def int_AMDGPU_barrier_global : Intrinsic<[], [], []>; } diff --git a/lib/Target/R600/AMDGPUPromoteAlloca.cpp b/lib/Target/R600/AMDGPUPromoteAlloca.cpp index b81fef4..175dcd8 100644 --- a/lib/Target/R600/AMDGPUPromoteAlloca.cpp +++ b/lib/Target/R600/AMDGPUPromoteAlloca.cpp @@ -18,6 +18,7 @@ #include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstVisitor.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" #define DEBUG_TYPE "amdgpu-promote-alloca" @@ -87,7 +88,7 @@ bool AMDGPUPromoteAlloca::runOnFunction(Function &F) { continue; if (Use->getParent()->getParent() == &F) LocalMemAvailable -= - Mod->getDataLayout()->getTypeAllocSize(GVTy->getElementType()); + Mod->getDataLayout().getTypeAllocSize(GVTy->getElementType()); } } } @@ -276,8 +277,8 @@ void AMDGPUPromoteAlloca::visitAlloca(AllocaInst &I) { // value from the reqd_work_group_size function attribute if it is // available. 
unsigned WorkGroupSize = 256; - int AllocaSize = WorkGroupSize * - Mod->getDataLayout()->getTypeAllocSize(AllocaTy); + int AllocaSize = + WorkGroupSize * Mod->getDataLayout().getTypeAllocSize(AllocaTy); if (AllocaSize > LocalMemAvailable) { DEBUG(dbgs() << " Not enough local memory to promote alloca.\n"); @@ -294,9 +295,9 @@ void AMDGPUPromoteAlloca::visitAlloca(AllocaInst &I) { DEBUG(dbgs() << "Promoting alloca to local memory\n"); LocalMemAvailable -= AllocaSize; + Type *GVTy = ArrayType::get(I.getAllocatedType(), 256); GlobalVariable *GV = new GlobalVariable( - *Mod, ArrayType::get(I.getAllocatedType(), 256), false, - GlobalValue::ExternalLinkage, 0, I.getName(), 0, + *Mod, GVTy, false, GlobalValue::ExternalLinkage, 0, I.getName(), 0, GlobalVariable::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS); FunctionType *FTy = FunctionType::get( @@ -332,7 +333,7 @@ void AMDGPUPromoteAlloca::visitAlloca(AllocaInst &I) { Indices.push_back(Constant::getNullValue(Type::getInt32Ty(Mod->getContext()))); Indices.push_back(TID); - Value *Offset = Builder.CreateGEP(GV, Indices); + Value *Offset = Builder.CreateGEP(GVTy, GV, Indices); I.mutateType(Offset->getType()); I.replaceAllUsesWith(Offset); I.eraseFromParent(); diff --git a/lib/Target/R600/AMDGPURegisterInfo.cpp b/lib/Target/R600/AMDGPURegisterInfo.cpp index 57b054b..3ca0eca 100644 --- a/lib/Target/R600/AMDGPURegisterInfo.cpp +++ b/lib/Target/R600/AMDGPURegisterInfo.cpp @@ -17,10 +17,7 @@ using namespace llvm; -AMDGPURegisterInfo::AMDGPURegisterInfo(const AMDGPUSubtarget &st) -: AMDGPUGenRegisterInfo(0), - ST(st) - { } +AMDGPURegisterInfo::AMDGPURegisterInfo() : AMDGPUGenRegisterInfo(0) {} //===----------------------------------------------------------------------===// // Function handling callbacks - Functions are a seldom used feature of GPUS, so diff --git a/lib/Target/R600/AMDGPURegisterInfo.h b/lib/Target/R600/AMDGPURegisterInfo.h index f27576a..cfd800b 100644 --- a/lib/Target/R600/AMDGPURegisterInfo.h +++ b/lib/Target/R600/AMDGPURegisterInfo.h @@ -30,9 +30,8 @@ class TargetInstrInfo; struct AMDGPURegisterInfo : public AMDGPUGenRegisterInfo { static const MCPhysReg CalleeSavedReg; - const AMDGPUSubtarget &ST; - AMDGPURegisterInfo(const AMDGPUSubtarget &st); + AMDGPURegisterInfo(); BitVector getReservedRegs(const MachineFunction &MF) const override { assert(!"Unimplemented"); return BitVector(); diff --git a/lib/Target/R600/AMDGPUSubtarget.cpp b/lib/Target/R600/AMDGPUSubtarget.cpp index 70c8525..0ead652 100644 --- a/lib/Target/R600/AMDGPUSubtarget.cpp +++ b/lib/Target/R600/AMDGPUSubtarget.cpp @@ -70,7 +70,7 @@ AMDGPUSubtarget::AMDGPUSubtarget(StringRef TT, StringRef GPU, StringRef FS, CaymanISA(false), FlatAddressSpace(false), EnableIRStructurizer(true), EnablePromoteAlloca(false), EnableIfCvt(true), EnableLoadStoreOpt(false), WavefrontSize(0), CFALUBug(false), LocalMemorySize(0), - EnableVGPRSpilling(false), + EnableVGPRSpilling(false), SGPRInitBug(false), FrameLowering(TargetFrameLowering::StackGrowsUp, 64 * 16, // Maximum stack alignment (long16) 0), diff --git a/lib/Target/R600/AMDGPUSubtarget.h b/lib/Target/R600/AMDGPUSubtarget.h index 1b0122c..403a3e4 100644 --- a/lib/Target/R600/AMDGPUSubtarget.h +++ b/lib/Target/R600/AMDGPUSubtarget.h @@ -44,6 +44,10 @@ public: VOLCANIC_ISLANDS, }; + enum { + FIXED_SGPR_COUNT_FOR_INIT_BUG = 80 + }; + private: std::string DevName; bool Is64bit; @@ -66,6 +70,7 @@ private: bool CFALUBug; int LocalMemorySize; bool EnableVGPRSpilling; + bool SGPRInitBug; AMDGPUFrameLowering FrameLowering; 
std::unique_ptr<AMDGPUTargetLowering> TLInfo; @@ -206,6 +211,10 @@ public: return LocalMemorySize; } + bool hasSGPRInitBug() const { + return SGPRInitBug; + } + unsigned getAmdKernelCodeChipID() const; bool enableMachineScheduler() const override { diff --git a/lib/Target/R600/AMDGPUTargetMachine.cpp b/lib/Target/R600/AMDGPUTargetMachine.cpp index a862f3c..cb95835 100644 --- a/lib/Target/R600/AMDGPUTargetMachine.cpp +++ b/lib/Target/R600/AMDGPUTargetMachine.cpp @@ -71,10 +71,10 @@ AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, StringRef TT, TargetOptions Options, Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OptLevel) - : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OptLevel), - DL(computeDataLayout(TT)), - TLOF(new TargetLoweringObjectFileELF()), - Subtarget(TT, CPU, FS, *this), IntrinsicInfo() { + : LLVMTargetMachine(T, computeDataLayout(TT), TT, CPU, FS, Options, RM, CM, + OptLevel), + TLOF(new TargetLoweringObjectFileELF()), Subtarget(TT, CPU, FS, *this), + IntrinsicInfo() { setRequiresStructuredCFG(true); initAsmInfo(); } @@ -118,7 +118,7 @@ public: ScheduleDAGInstrs * createMachineScheduler(MachineSchedContext *C) const override { - const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(); + const AMDGPUSubtarget &ST = *getAMDGPUTargetMachine().getSubtargetImpl(); if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) return createR600MachineScheduler(C); return nullptr; @@ -174,7 +174,7 @@ void AMDGPUPassConfig::addIRPasses() { } void AMDGPUPassConfig::addCodeGenPrepare() { - const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(); + const AMDGPUSubtarget &ST = *getAMDGPUTargetMachine().getSubtargetImpl(); if (ST.isPromoteAllocaEnabled()) { addPass(createAMDGPUPromoteAlloca(ST)); addPass(createSROAPass()); @@ -184,7 +184,7 @@ void AMDGPUPassConfig::addCodeGenPrepare() { bool AMDGPUPassConfig::addPreISel() { - const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(); + const AMDGPUSubtarget &ST = *getAMDGPUTargetMachine().getSubtargetImpl(); addPass(createFlattenCFGPass()); if (ST.IsIRStructurizerEnabled()) addPass(createStructurizeCFGPass()); @@ -211,7 +211,7 @@ void R600PassConfig::addPreRegAlloc() { } void R600PassConfig::addPreSched2() { - const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(); + const AMDGPUSubtarget &ST = *getAMDGPUTargetMachine().getSubtargetImpl(); addPass(createR600EmitClauseMarkers(), false); if (ST.isIfCvtEnabled()) addPass(&IfConverterID, false); @@ -251,15 +251,15 @@ bool GCNPassConfig::addInstSelector() { } void GCNPassConfig::addPreRegAlloc() { - const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(); + const AMDGPUSubtarget &ST = *getAMDGPUTargetMachine().getSubtargetImpl(); if (getOptLevel() > CodeGenOpt::None && ST.loadStoreOptEnabled()) { - // Don't do this with no optimizations since it throws away debug info by - // merging nonadjacent loads. + // Don't do this with no optimizations since it throws away debug info by + // merging nonadjacent loads. - // This should be run after scheduling, but before register allocation. It - // also need extra copies to the address operand to be eliminated. - initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry()); - insertPass(&MachineSchedulerID, &SILoadStoreOptimizerID); + // This should be run after scheduling, but before register allocation. It + // also need extra copies to the address operand to be eliminated. 
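The insertPass call that follows pins exactly that ordering by anchoring the load/store optimizer to the machine scheduler's slot in the pipeline. A toy, fully hypothetical model of anchored insertion (this is not TargetPassConfig's implementation; all names and layout here are invented):

#include <iostream>
#include <map>
#include <string>
#include <vector>

// Toy model: a pipeline runs passes in a fixed order, and
// insertPass(anchor, pass) schedules `pass` immediately after `anchor`.
struct ToyPassConfig {
  std::multimap<std::string, std::string> InsertedAfter;

  void insertPass(const std::string &Anchor, const std::string &Pass) {
    InsertedAfter.insert({Anchor, Pass});
  }

  void run(const std::vector<std::string> &Order) {
    for (const std::string &P : Order) {
      std::cout << "running " << P << "\n";
      auto Range = InsertedAfter.equal_range(P);
      for (auto It = Range.first; It != Range.second; ++It)
        std::cout << "running " << It->second << " (inserted)\n";
    }
  }
};

int main() {
  ToyPassConfig PC;
  // Mirrors the intent above: the load/store optimizer runs right after
  // machine scheduling and therefore before register allocation.
  PC.insertPass("machine-scheduler", "si-load-store-opt");
  PC.run({"machine-scheduler", "regalloc"});
  return 0;
}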
+ initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry()); + insertPass(&MachineSchedulerID, &SILoadStoreOptimizerID); } addPass(createSIShrinkInstructionsPass(), false); addPass(createSIFixSGPRLiveRangesPass(), false); diff --git a/lib/Target/R600/AMDGPUTargetMachine.h b/lib/Target/R600/AMDGPUTargetMachine.h index a691536..785c119 100644 --- a/lib/Target/R600/AMDGPUTargetMachine.h +++ b/lib/Target/R600/AMDGPUTargetMachine.h @@ -30,7 +30,6 @@ namespace llvm { class AMDGPUTargetMachine : public LLVMTargetMachine { private: - const DataLayout DL; protected: TargetLoweringObjectFile *TLOF; @@ -42,12 +41,9 @@ public: StringRef CPU, TargetOptions Options, Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL); ~AMDGPUTargetMachine(); - // FIXME: This is currently broken, the DataLayout needs to move to - // the target machine. - const DataLayout *getDataLayout() const override { - return &DL; - } - const AMDGPUSubtarget *getSubtargetImpl() const override { + + const AMDGPUSubtarget *getSubtargetImpl() const { return &Subtarget; } + const AMDGPUSubtarget *getSubtargetImpl(const Function &) const override { return &Subtarget; } const AMDGPUIntrinsicInfo *getIntrinsicInfo() const override { diff --git a/lib/Target/R600/AMDGPUTargetTransformInfo.cpp b/lib/Target/R600/AMDGPUTargetTransformInfo.cpp index 68f4600..96edc41 100644 --- a/lib/Target/R600/AMDGPUTargetTransformInfo.cpp +++ b/lib/Target/R600/AMDGPUTargetTransformInfo.cpp @@ -20,6 +20,7 @@ #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/CodeGen/BasicTTIImpl.h" +#include "llvm/IR/Module.h" #include "llvm/Support/Debug.h" #include "llvm/Target/CostTable.h" #include "llvm/Target/TargetLowering.h" @@ -36,13 +37,15 @@ void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, // TODO: Do we want runtime unrolling? for (const BasicBlock *BB : L->getBlocks()) { + const DataLayout &DL = BB->getModule()->getDataLayout(); for (const Instruction &I : *BB) { const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&I); if (!GEP || GEP->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) continue; const Value *Ptr = GEP->getPointerOperand(); - const AllocaInst *Alloca = dyn_cast<AllocaInst>(GetUnderlyingObject(Ptr)); + const AllocaInst *Alloca = + dyn_cast<AllocaInst>(GetUnderlyingObject(Ptr, DL)); if (Alloca) { // We want to do whatever we can to limit the number of alloca // instructions that make it through to the code generator. 
allocas diff --git a/lib/Target/R600/AMDILCFGStructurizer.cpp b/lib/Target/R600/AMDILCFGStructurizer.cpp index ee6e8ec..ee6551b 100644 --- a/lib/Target/R600/AMDILCFGStructurizer.cpp +++ b/lib/Target/R600/AMDILCFGStructurizer.cpp @@ -10,8 +10,8 @@ #include "AMDGPU.h" #include "AMDGPUInstrInfo.h" -#include "R600InstrInfo.h" #include "AMDGPUSubtarget.h" +#include "R600InstrInfo.h" #include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/SCCIterator.h" #include "llvm/ADT/SmallVector.h" @@ -30,6 +30,7 @@ #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetMachine.h" +#include <deque> using namespace llvm; @@ -165,6 +166,7 @@ public: TRI = &TII->getRegisterInfo(); DEBUG(MF.dump();); OrderedBlks.clear(); + Visited.clear(); FuncRep = &MF; MLI = &getAnalysis<MachineLoopInfo>(); DEBUG(dbgs() << "LoopInfo:\n"; PrintLoopinfo(*MLI);); @@ -621,7 +623,7 @@ DebugLoc AMDGPUCFGStructurizer::getLastDebugLocInBB(MachineBasicBlock *MBB) { for (MachineBasicBlock::iterator It = MBB->begin(); It != MBB->end(); ++It) { MachineInstr *instr = &(*It); - if (instr->getDebugLoc().isUnknown() == false) + if (!instr->getDebugLoc().isUnknown()) DL = instr->getDebugLoc(); } return DL; @@ -1075,21 +1077,19 @@ int AMDGPUCFGStructurizer::ifPatternMatch(MachineBasicBlock *MBB) { } int AMDGPUCFGStructurizer::loopendPatternMatch() { - std::vector<MachineLoop *> NestedLoops; - for (MachineLoopInfo::iterator It = MLI->begin(), E = MLI->end(); It != E; - ++It) - for (MachineLoop *ML : depth_first(*It)) - NestedLoops.push_back(ML); + std::deque<MachineLoop *> NestedLoops; + for (auto &It: *MLI) + for (MachineLoop *ML : depth_first(It)) + NestedLoops.push_front(ML); if (NestedLoops.size() == 0) return 0; - // Process nested loop outside->inside, so "continue" to a outside loop won't - // be mistaken as "break" of the current loop. + // Process nested loop outside->inside (we did push_front), + // so "continue" to a outside loop won't be mistaken as "break" + // of the current loop. int Num = 0; - for (std::vector<MachineLoop *>::reverse_iterator It = NestedLoops.rbegin(), - E = NestedLoops.rend(); It != E; ++It) { - MachineLoop *ExaminedLoop = *It; + for (MachineLoop *ExaminedLoop : NestedLoops) { if (ExaminedLoop->getNumBlocks() == 0 || Visited[ExaminedLoop]) continue; DEBUG(dbgs() << "Processing:\n"; ExaminedLoop->dump();); @@ -1611,7 +1611,7 @@ void AMDGPUCFGStructurizer::settleLoopcontBlock(MachineBasicBlock *ContingMBB, bool UseContinueLogical = ((&*ContingMBB->rbegin()) == MI); - if (UseContinueLogical == false) { + if (!UseContinueLogical) { int BranchOpcode = TrueBranch == ContMBB ? 
getBranchNzeroOpcode(OldOpcode) : getBranchZeroOpcode(OldOpcode); diff --git a/lib/Target/R600/AsmParser/AMDGPUAsmParser.cpp b/lib/Target/R600/AsmParser/AMDGPUAsmParser.cpp index 3b4ba1a..49f0f23 100644 --- a/lib/Target/R600/AsmParser/AMDGPUAsmParser.cpp +++ b/lib/Target/R600/AsmParser/AMDGPUAsmParser.cpp @@ -46,10 +46,9 @@ class AMDGPUAsmParser : public MCTargetAsmParser { /// } public: - AMDGPUAsmParser(MCSubtargetInfo &_STI, MCAsmParser &_Parser, - const MCInstrInfo &_MII, - const MCTargetOptions &Options) - : MCTargetAsmParser(), STI(_STI), Parser(_Parser) { + AMDGPUAsmParser(MCSubtargetInfo &STI, MCAsmParser &Parser, + const MCInstrInfo &MII, const MCTargetOptions &Options) + : MCTargetAsmParser(), STI(STI), Parser(Parser) { setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits())); } bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) override; diff --git a/lib/Target/R600/EvergreenInstructions.td b/lib/Target/R600/EvergreenInstructions.td index 9f9472c..5560146 100644 --- a/lib/Target/R600/EvergreenInstructions.td +++ b/lib/Target/R600/EvergreenInstructions.td @@ -287,9 +287,8 @@ def BFE_INT_eg : R600_3OP <0x5, "BFE_INT", VecALU >; -// XXX: This pattern is broken, disabling for now. See comment in -// AMDGPUInstructions.td for more info. -// def : BFEPattern <BFE_UINT_eg>; +def : BFEPattern <BFE_UINT_eg, MOV_IMM_I32>; + def BFI_INT_eg : R600_3OP <0x06, "BFI_INT", [(set i32:$dst, (AMDGPUbfi i32:$src0, i32:$src1, i32:$src2))], VecALU diff --git a/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp b/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp index b66ed10..d62fd3f 100644 --- a/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp +++ b/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp @@ -99,6 +99,12 @@ void AMDGPUInstPrinter::printDSOffset1(const MCInst *MI, unsigned OpNo, printU8ImmDecOperand(MI, OpNo, O); } +void AMDGPUInstPrinter::printGDS(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + if (MI->getOperand(OpNo).getImm()) + O << " gds"; +} + void AMDGPUInstPrinter::printGLC(const MCInst *MI, unsigned OpNo, raw_ostream &O) { if (MI->getOperand(OpNo).getImm()) @@ -208,6 +214,16 @@ void AMDGPUInstPrinter::printRegOperand(unsigned reg, raw_ostream &O) { O << Type << '[' << RegIdx << ':' << (RegIdx + NumRegs - 1) << ']'; } +void AMDGPUInstPrinter::printVOPDst(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + if (MII.get(MI->getOpcode()).TSFlags & SIInstrFlags::VOP3) + O << "_e64 "; + else + O << "_e32 "; + + printOperand(MI, OpNo, O); +} + void AMDGPUInstPrinter::printImmediate32(uint32_t Imm, raw_ostream &O) { int32_t SImm = static_cast<int32_t>(Imm); if (SImm >= -16 && SImm <= 64) { diff --git a/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h b/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h index 1d43c7a..5289718 100644 --- a/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h +++ b/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h @@ -44,10 +44,12 @@ private: void printDSOffset(const MCInst *MI, unsigned OpNo, raw_ostream &O); void printDSOffset0(const MCInst *MI, unsigned OpNo, raw_ostream &O); void printDSOffset1(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printGDS(const MCInst *MI, unsigned OpNo, raw_ostream &O); void printGLC(const MCInst *MI, unsigned OpNo, raw_ostream &O); void printSLC(const MCInst *MI, unsigned OpNo, raw_ostream &O); void printTFE(const MCInst *MI, unsigned OpNo, raw_ostream &O); void printRegOperand(unsigned RegNo, raw_ostream &O); + void printVOPDst(const MCInst *MI, unsigned OpNo, raw_ostream &O); void 
printImmediate32(uint32_t I, raw_ostream &O); void printImmediate64(uint64_t I, raw_ostream &O); void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.cpp b/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.cpp index 83403ba..fb2deef 100644 --- a/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.cpp +++ b/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.cpp @@ -17,6 +17,7 @@ #include "InstPrinter/AMDGPUInstPrinter.h" #include "SIDefines.h" #include "llvm/MC/MCCodeGenInfo.h" +#include "llvm/MC/MCContext.h" #include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCRegisterInfo.h" #include "llvm/MC/MCStreamer.h" @@ -72,50 +73,19 @@ static MCInstPrinter *createAMDGPUMCInstPrinter(const Target &T, return new AMDGPUInstPrinter(MAI, MII, MRI); } -static MCCodeEmitter *createAMDGPUMCCodeEmitter(const MCInstrInfo &MCII, - const MCRegisterInfo &MRI, - const MCSubtargetInfo &STI, - MCContext &Ctx) { - if (STI.getFeatureBits() & AMDGPU::Feature64BitPtr) { - return createSIMCCodeEmitter(MCII, MRI, STI, Ctx); - } else { - return createR600MCCodeEmitter(MCII, MRI, STI); - } -} - -static MCStreamer *createMCStreamer(const Target &T, StringRef TT, - MCContext &Ctx, MCAsmBackend &MAB, - raw_ostream &_OS, MCCodeEmitter *_Emitter, - const MCSubtargetInfo &STI, bool RelaxAll) { - return createELFStreamer(Ctx, MAB, _OS, _Emitter, false); -} - extern "C" void LLVMInitializeR600TargetMC() { + for (Target *T : {&TheAMDGPUTarget, &TheGCNTarget}) { + RegisterMCAsmInfo<AMDGPUMCAsmInfo> X(*T); + + TargetRegistry::RegisterMCCodeGenInfo(*T, createAMDGPUMCCodeGenInfo); + TargetRegistry::RegisterMCInstrInfo(*T, createAMDGPUMCInstrInfo); + TargetRegistry::RegisterMCRegInfo(*T, createAMDGPUMCRegisterInfo); + TargetRegistry::RegisterMCSubtargetInfo(*T, createAMDGPUMCSubtargetInfo); + TargetRegistry::RegisterMCInstPrinter(*T, createAMDGPUMCInstPrinter); + TargetRegistry::RegisterMCAsmBackend(*T, createAMDGPUAsmBackend); + } - RegisterMCAsmInfo<AMDGPUMCAsmInfo> Y(TheAMDGPUTarget); - RegisterMCAsmInfo<AMDGPUMCAsmInfo> Z(TheGCNTarget); - - TargetRegistry::RegisterMCCodeGenInfo(TheAMDGPUTarget, createAMDGPUMCCodeGenInfo); - TargetRegistry::RegisterMCCodeGenInfo(TheGCNTarget, createAMDGPUMCCodeGenInfo); - - TargetRegistry::RegisterMCInstrInfo(TheAMDGPUTarget, createAMDGPUMCInstrInfo); - TargetRegistry::RegisterMCInstrInfo(TheGCNTarget, createAMDGPUMCInstrInfo); - - TargetRegistry::RegisterMCRegInfo(TheAMDGPUTarget, createAMDGPUMCRegisterInfo); - TargetRegistry::RegisterMCRegInfo(TheGCNTarget, createAMDGPUMCRegisterInfo); - - TargetRegistry::RegisterMCSubtargetInfo(TheAMDGPUTarget, createAMDGPUMCSubtargetInfo); - TargetRegistry::RegisterMCSubtargetInfo(TheGCNTarget, createAMDGPUMCSubtargetInfo); - - TargetRegistry::RegisterMCInstPrinter(TheAMDGPUTarget, createAMDGPUMCInstPrinter); - TargetRegistry::RegisterMCInstPrinter(TheGCNTarget, createAMDGPUMCInstPrinter); - - TargetRegistry::RegisterMCCodeEmitter(TheAMDGPUTarget, createAMDGPUMCCodeEmitter); - TargetRegistry::RegisterMCCodeEmitter(TheGCNTarget, createAMDGPUMCCodeEmitter); - - TargetRegistry::RegisterMCAsmBackend(TheAMDGPUTarget, createAMDGPUAsmBackend); - TargetRegistry::RegisterMCAsmBackend(TheGCNTarget, createAMDGPUAsmBackend); - - TargetRegistry::RegisterMCObjectStreamer(TheAMDGPUTarget, createMCStreamer); - TargetRegistry::RegisterMCObjectStreamer(TheGCNTarget, createMCStreamer); + TargetRegistry::RegisterMCCodeEmitter(TheAMDGPUTarget, + createR600MCCodeEmitter); + 
TargetRegistry::RegisterMCCodeEmitter(TheGCNTarget, createSIMCCodeEmitter); } diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.h b/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.h index bc8cd53..23f0196 100644 --- a/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.h +++ b/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.h @@ -16,6 +16,7 @@ #ifndef LLVM_LIB_TARGET_R600_MCTARGETDESC_AMDGPUMCTARGETDESC_H #define LLVM_LIB_TARGET_R600_MCTARGETDESC_AMDGPUMCTARGETDESC_H +#include "llvm/Support/DataTypes.h" #include "llvm/ADT/StringRef.h" namespace llvm { @@ -34,11 +35,10 @@ extern Target TheGCNTarget; MCCodeEmitter *createR600MCCodeEmitter(const MCInstrInfo &MCII, const MCRegisterInfo &MRI, - const MCSubtargetInfo &STI); + MCContext &Ctx); MCCodeEmitter *createSIMCCodeEmitter(const MCInstrInfo &MCII, const MCRegisterInfo &MRI, - const MCSubtargetInfo &STI, MCContext &Ctx); MCAsmBackend *createAMDGPUAsmBackend(const Target &T, const MCRegisterInfo &MRI, diff --git a/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp b/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp index 8a555ff..fa25f59 100644 --- a/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp +++ b/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp @@ -81,8 +81,8 @@ enum FCInstr { }; MCCodeEmitter *llvm::createR600MCCodeEmitter(const MCInstrInfo &MCII, - const MCRegisterInfo &MRI, - const MCSubtargetInfo &STI) { + const MCRegisterInfo &MRI, + MCContext &Ctx) { return new R600MCCodeEmitter(MCII, MRI); } diff --git a/lib/Target/R600/MCTargetDesc/SIMCCodeEmitter.cpp b/lib/Target/R600/MCTargetDesc/SIMCCodeEmitter.cpp index 7e23772..760aa37 100644 --- a/lib/Target/R600/MCTargetDesc/SIMCCodeEmitter.cpp +++ b/lib/Target/R600/MCTargetDesc/SIMCCodeEmitter.cpp @@ -72,7 +72,6 @@ public: MCCodeEmitter *llvm::createSIMCCodeEmitter(const MCInstrInfo &MCII, const MCRegisterInfo &MRI, - const MCSubtargetInfo &STI, MCContext &Ctx) { return new SIMCCodeEmitter(MCII, MRI, Ctx); } diff --git a/lib/Target/R600/Processors.td b/lib/Target/R600/Processors.td index fb5aa61..82c6d13 100644 --- a/lib/Target/R600/Processors.td +++ b/lib/Target/R600/Processors.td @@ -119,8 +119,12 @@ def : ProcessorModel<"mullins", SIQuarterSpeedModel, [FeatureSeaIslands]>; // Volcanic Islands //===----------------------------------------------------------------------===// -def : ProcessorModel<"tonga", SIQuarterSpeedModel, [FeatureVolcanicIslands]>; +def : ProcessorModel<"tonga", SIQuarterSpeedModel, + [FeatureVolcanicIslands, FeatureSGPRInitBug] +>; -def : ProcessorModel<"iceland", SIQuarterSpeedModel, [FeatureVolcanicIslands]>; +def : ProcessorModel<"iceland", SIQuarterSpeedModel, + [FeatureVolcanicIslands, FeatureSGPRInitBug] +>; def : ProcessorModel<"carrizo", SIQuarterSpeedModel, [FeatureVolcanicIslands]>; diff --git a/lib/Target/R600/R600ClauseMergePass.cpp b/lib/Target/R600/R600ClauseMergePass.cpp index f07be00..3cb9021 100644 --- a/lib/Target/R600/R600ClauseMergePass.cpp +++ b/lib/Target/R600/R600ClauseMergePass.cpp @@ -14,11 +14,11 @@ //===----------------------------------------------------------------------===// #include "AMDGPU.h" +#include "AMDGPUSubtarget.h" #include "R600Defines.h" #include "R600InstrInfo.h" #include "R600MachineFunctionInfo.h" #include "R600RegisterInfo.h" -#include "AMDGPUSubtarget.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" diff --git a/lib/Target/R600/R600ISelLowering.cpp b/lib/Target/R600/R600ISelLowering.cpp index c738611..a34e2dc 100644 
--- a/lib/Target/R600/R600ISelLowering.cpp +++ b/lib/Target/R600/R600ISelLowering.cpp @@ -837,6 +837,10 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const case Intrinsic::AMDGPU_rsq: // XXX - I'm assuming SI's RSQ_LEGACY matches R600's behavior. return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1)); + + case AMDGPUIntrinsic::AMDGPU_fract: + case AMDGPUIntrinsic::AMDIL_fraction: // Legacy name. + return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1)); } // break out of case ISD::INTRINSIC_WO_CHAIN in switch(Op.getOpcode()) break; @@ -1479,8 +1483,8 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const // Lower loads constant address space global variable loads if (LoadNode->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS && - isa<GlobalVariable>( - GetUnderlyingObject(LoadNode->getMemOperand()->getValue()))) { + isa<GlobalVariable>(GetUnderlyingObject( + LoadNode->getMemOperand()->getValue(), *getDataLayout()))) { SDValue Ptr = DAG.getZExtOrTrunc(LoadNode->getBasePtr(), DL, getPointerTy(AMDGPUAS::PRIVATE_ADDRESS)); @@ -1867,7 +1871,7 @@ SDValue R600TargetLowering::PerformDAGCombine(SDNode *N, SelectCC.getOperand(0), // LHS SelectCC.getOperand(1), // RHS DAG.getConstant(-1, MVT::i32), // True - DAG.getConstant(0, MVT::i32), // Flase + DAG.getConstant(0, MVT::i32), // False SelectCC.getOperand(4)); // CC break; diff --git a/lib/Target/R600/R600InstrInfo.cpp b/lib/Target/R600/R600InstrInfo.cpp index 653fd0d..5f0bdf3 100644 --- a/lib/Target/R600/R600InstrInfo.cpp +++ b/lib/Target/R600/R600InstrInfo.cpp @@ -29,9 +29,7 @@ using namespace llvm; #include "AMDGPUGenDFAPacketizer.inc" R600InstrInfo::R600InstrInfo(const AMDGPUSubtarget &st) - : AMDGPUInstrInfo(st), - RI(st) - { } + : AMDGPUInstrInfo(st), RI() {} const R600RegisterInfo &R600InstrInfo::getRegisterInfo() const { return RI; @@ -268,9 +266,8 @@ int R600InstrInfo::getSrcIdx(unsigned Opcode, unsigned SrcNum) const { return getOperandIdx(Opcode, OpTable[SrcNum]); } -#define SRC_SEL_ROWS 11 int R600InstrInfo::getSelIdx(unsigned Opcode, unsigned SrcIdx) const { - static const unsigned SrcSelTable[SRC_SEL_ROWS][2] = { + static const unsigned SrcSelTable[][2] = { {AMDGPU::OpName::src0, AMDGPU::OpName::src0_sel}, {AMDGPU::OpName::src1, AMDGPU::OpName::src1_sel}, {AMDGPU::OpName::src2, AMDGPU::OpName::src2_sel}, @@ -284,14 +281,13 @@ int R600InstrInfo::getSelIdx(unsigned Opcode, unsigned SrcIdx) const { {AMDGPU::OpName::src1_W, AMDGPU::OpName::src1_sel_W} }; - for (unsigned i = 0; i < SRC_SEL_ROWS; ++i) { - if (getOperandIdx(Opcode, SrcSelTable[i][0]) == (int)SrcIdx) { - return getOperandIdx(Opcode, SrcSelTable[i][1]); + for (const auto &Row : SrcSelTable) { + if (getOperandIdx(Opcode, Row[0]) == (int)SrcIdx) { + return getOperandIdx(Opcode, Row[1]); } } return -1; } -#undef SRC_SEL_ROWS SmallVector<std::pair<MachineOperand *, int64_t>, 3> R600InstrInfo::getSrcs(MachineInstr *MI) const { diff --git a/lib/Target/R600/R600OptimizeVectorRegisters.cpp b/lib/Target/R600/R600OptimizeVectorRegisters.cpp index 742c0e0..0c06ccc 100644 --- a/lib/Target/R600/R600OptimizeVectorRegisters.cpp +++ b/lib/Target/R600/R600OptimizeVectorRegisters.cpp @@ -27,10 +27,9 @@ /// to reduce MOV count. 
//===----------------------------------------------------------------------===// -#include "llvm/Support/Debug.h" #include "AMDGPU.h" -#include "R600InstrInfo.h" #include "AMDGPUSubtarget.h" +#include "R600InstrInfo.h" #include "llvm/CodeGen/DFAPacketizer.h" #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFunctionPass.h" @@ -38,6 +37,7 @@ #include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/Passes.h" +#include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" using namespace llvm; diff --git a/lib/Target/R600/R600RegisterInfo.cpp b/lib/Target/R600/R600RegisterInfo.cpp index dc95675..fb0359c 100644 --- a/lib/Target/R600/R600RegisterInfo.cpp +++ b/lib/Target/R600/R600RegisterInfo.cpp @@ -20,14 +20,16 @@ using namespace llvm; -R600RegisterInfo::R600RegisterInfo(const AMDGPUSubtarget &st) -: AMDGPURegisterInfo(st) - { RCW.RegWeight = 0; RCW.WeightLimit = 0;} +R600RegisterInfo::R600RegisterInfo() : AMDGPURegisterInfo() { + RCW.RegWeight = 0; + RCW.WeightLimit = 0; +} BitVector R600RegisterInfo::getReservedRegs(const MachineFunction &MF) const { BitVector Reserved(getNumRegs()); - const R600InstrInfo *TII = static_cast<const R600InstrInfo*>(ST.getInstrInfo()); + const R600InstrInfo *TII = + static_cast<const R600InstrInfo *>(MF.getSubtarget().getInstrInfo()); Reserved.set(AMDGPU::ZERO); Reserved.set(AMDGPU::HALF); diff --git a/lib/Target/R600/R600RegisterInfo.h b/lib/Target/R600/R600RegisterInfo.h index f1a8a41..9713e60 100644 --- a/lib/Target/R600/R600RegisterInfo.h +++ b/lib/Target/R600/R600RegisterInfo.h @@ -24,7 +24,7 @@ class AMDGPUSubtarget; struct R600RegisterInfo : public AMDGPURegisterInfo { RegClassWeight RCW; - R600RegisterInfo(const AMDGPUSubtarget &st); + R600RegisterInfo(); BitVector getReservedRegs(const MachineFunction &MF) const override; diff --git a/lib/Target/R600/SIFixSGPRLiveRanges.cpp b/lib/Target/R600/SIFixSGPRLiveRanges.cpp index f34c375..0c54446 100644 --- a/lib/Target/R600/SIFixSGPRLiveRanges.cpp +++ b/lib/Target/R600/SIFixSGPRLiveRanges.cpp @@ -54,6 +54,7 @@ #include "llvm/CodeGen/MachinePostDominators.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetMachine.h" using namespace llvm; diff --git a/lib/Target/R600/SIFoldOperands.cpp b/lib/Target/R600/SIFoldOperands.cpp index ae4b05d..7ba5a6d 100644 --- a/lib/Target/R600/SIFoldOperands.cpp +++ b/lib/Target/R600/SIFoldOperands.cpp @@ -17,9 +17,10 @@ #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/IR/LLVMContext.h" #include "llvm/IR/Function.h" +#include "llvm/IR/LLVMContext.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetMachine.h" #define DEBUG_TYPE "si-fold-operands" diff --git a/lib/Target/R600/SIISelLowering.cpp b/lib/Target/R600/SIISelLowering.cpp index 7d794b8..bd0c3c2 100644 --- a/lib/Target/R600/SIISelLowering.cpp +++ b/lib/Target/R600/SIISelLowering.cpp @@ -172,16 +172,12 @@ SITargetLowering::SITargetLowering(TargetMachine &TM, setOperationAction(ISD::UDIV, MVT::i64, Expand); setOperationAction(ISD::UREM, MVT::i64, Expand); - // We only support LOAD/STORE and vector manipulation ops for vectors - // with > 4 elements. 
- MVT VecTypes[] = { - MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32 - }; - setOperationAction(ISD::SELECT_CC, MVT::i1, Expand); setOperationAction(ISD::SELECT, MVT::i1, Promote); - for (MVT VT : VecTypes) { + // We only support LOAD/STORE and vector manipulation ops for vectors + // with > 4 elements. + for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32}) { for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) { switch(Op) { case ISD::LOAD: @@ -206,10 +202,10 @@ SITargetLowering::SITargetLowering(TargetMachine &TM, if (Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) { setOperationAction(ISD::FTRUNC, MVT::f64, Legal); setOperationAction(ISD::FCEIL, MVT::f64, Legal); - setOperationAction(ISD::FFLOOR, MVT::f64, Legal); setOperationAction(ISD::FRINT, MVT::f64, Legal); } + setOperationAction(ISD::FFLOOR, MVT::f64, Legal); setOperationAction(ISD::FDIV, MVT::f32, Custom); setOperationAction(ISD::FDIV, MVT::f64, Custom); @@ -932,6 +928,12 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); + + case AMDGPUIntrinsic::AMDGPU_fract: + case AMDGPUIntrinsic::AMDIL_fraction: // Legacy name. + return DAG.getNode(ISD::FSUB, DL, VT, Op.getOperand(1), + DAG.getNode(ISD::FFLOOR, DL, VT, Op.getOperand(1))); + default: return AMDGPUTargetLowering::LowerOperation(Op, DAG); } @@ -1346,6 +1348,35 @@ SDValue SITargetLowering::performUCharToFloatCombine(SDNode *N, return SDValue(); } +/// \brief Return true if the given offset Size in bytes can be folded into +/// the immediate offsets of a memory instruction for the given address space. +static bool canFoldOffset(unsigned OffsetSize, unsigned AS, + const AMDGPUSubtarget &STI) { + switch (AS) { + case AMDGPUAS::GLOBAL_ADDRESS: { + // MUBUF instructions a 12-bit offset in bytes. + return isUInt<12>(OffsetSize); + } + case AMDGPUAS::CONSTANT_ADDRESS: { + // SMRD instructions have an 8-bit offset in dwords on SI and + // a 20-bit offset in bytes on VI. + if (STI.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) + return isUInt<20>(OffsetSize); + else + return (OffsetSize % 4 == 0) && isUInt<8>(OffsetSize / 4); + } + case AMDGPUAS::LOCAL_ADDRESS: + case AMDGPUAS::REGION_ADDRESS: { + // The single offset versions have a 16-bit offset in bytes. + return isUInt<16>(OffsetSize); + } + case AMDGPUAS::PRIVATE_ADDRESS: + // Indirect register addressing does not use any offsets. + default: + return 0; + } +} + // (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2) // This is a variant of @@ -1377,13 +1408,10 @@ SDValue SITargetLowering::performSHLPtrCombine(SDNode *N, if (!CAdd) return SDValue(); - const SIInstrInfo *TII = - static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo()); - // If the resulting offset is too large, we can't fold it into the addressing // mode offset. 
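As a numeric cross-check of the limits the new static canFoldOffset helper encodes, here is a standalone sketch; fitsUInt is a hand-rolled stand-in for llvm::isUInt, and the sample offsets are taken from the comments in the helper above:

#include <cassert>
#include <cstdint>

// Stand-in for llvm::isUInt<N>: does X fit in N unsigned bits?
template <unsigned N> static bool fitsUInt(uint64_t X) {
  static_assert(N < 64, "sketch only handles N < 64");
  return X < (UINT64_C(1) << N);
}

int main() {
  // MUBUF (global): a 12-bit offset in bytes.
  assert(fitsUInt<12>(4095) && !fitsUInt<12>(4096));
  // SMRD on SI (constant): an 8-bit offset in dwords, so byte offsets
  // must be dword-aligned and at most 255 * 4 = 1020.
  uint64_t Off = 1020;
  assert(Off % 4 == 0 && fitsUInt<8>(Off / 4));
  assert(!fitsUInt<8>(1024 / 4)); // 256 dwords is out of range.
  // SMRD on VI (constant): a 20-bit offset in bytes.
  assert(fitsUInt<20>((1u << 20) - 1));
  // DS (local/region): a 16-bit offset in bytes.
  assert(fitsUInt<16>(65535) && !fitsUInt<16>(65536));
  return 0;
}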
APInt Offset = CAdd->getAPIntValue() << CN1->getAPIntValue(); - if (!TII->canFoldOffset(Offset.getZExtValue(), AddrSpace)) + if (!canFoldOffset(Offset.getZExtValue(), AddrSpace, *Subtarget)) return SDValue(); SelectionDAG &DAG = DCI.DAG; @@ -1595,6 +1623,7 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, case AMDGPUISD::UMAX: case AMDGPUISD::UMIN: { if (DCI.getDAGCombineLevel() >= AfterLegalizeDAG && + N->getValueType(0) != MVT::f64 && getTargetMachine().getOptLevel() > CodeGenOpt::None) return performMin3Max3Combine(N, DCI); break; diff --git a/lib/Target/R600/SIInsertWaits.cpp b/lib/Target/R600/SIInsertWaits.cpp index 50f20ac..90a37f1 100644 --- a/lib/Target/R600/SIInsertWaits.cpp +++ b/lib/Target/R600/SIInsertWaits.cpp @@ -259,7 +259,8 @@ void SIInsertWaits::pushInstruction(MachineBasicBlock &MBB, return; } - if (TRI->ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { + if (MBB.getParent()->getSubtarget<AMDGPUSubtarget>().getGeneration() >= + AMDGPUSubtarget::VOLCANIC_ISLANDS) { // Any occurence of consecutive VMEM or SMEM instructions forms a VMEM // or SMEM clause, respectively. // @@ -412,7 +413,8 @@ Counters SIInsertWaits::handleOperands(MachineInstr &MI) { void SIInsertWaits::handleSendMsg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I) { - if (TRI->ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS) + if (MBB.getParent()->getSubtarget<AMDGPUSubtarget>().getGeneration() < + AMDGPUSubtarget::VOLCANIC_ISLANDS) return; // There must be "S_NOP 0" between an instruction writing M0 and S_SENDMSG. diff --git a/lib/Target/R600/SIInstrFormats.td b/lib/Target/R600/SIInstrFormats.td index c90c741..4167590 100644 --- a/lib/Target/R600/SIInstrFormats.td +++ b/lib/Target/R600/SIInstrFormats.td @@ -83,6 +83,9 @@ class Enc64 { int Size = 8; } +class VOPDstOperand <RegisterClass rc> : RegisterOperand <rc, "printVOPDst">; +def VOPDstVCC : VOPDstOperand <VCCReg>; + let Uses = [EXEC] in { class VOPAnyCommon <dag outs, dag ins, string asm, list<dag> pattern> : @@ -96,7 +99,7 @@ class VOPAnyCommon <dag outs, dag ins, string asm, list<dag> pattern> : } class VOPCCommon <dag ins, string asm, list<dag> pattern> : - VOPAnyCommon <(outs VCCReg:$dst), ins, asm, pattern> { + VOPAnyCommon <(outs VOPDstVCC:$dst), ins, asm, pattern> { let DisableEncoding = "$dst"; let VOPC = 1; @@ -577,6 +580,12 @@ class DS <dag outs, dag ins, string asm, list<dag> pattern> : let DS = 1; let UseNamedOperandTable = 1; let DisableEncoding = "$m0"; + + // Most instruction load and store data, so set this as the default. + let mayLoad = 1; + let mayStore = 1; + + let hasSideEffects = 0; let SchedRW = [WriteLDS]; } diff --git a/lib/Target/R600/SIInstrInfo.cpp b/lib/Target/R600/SIInstrInfo.cpp index 4f1e5ad..ba98ad7 100644 --- a/lib/Target/R600/SIInstrInfo.cpp +++ b/lib/Target/R600/SIInstrInfo.cpp @@ -28,7 +28,7 @@ using namespace llvm; SIInstrInfo::SIInstrInfo(const AMDGPUSubtarget &st) - : AMDGPUInstrInfo(st), RI(st) {} + : AMDGPUInstrInfo(st), RI() {} //===----------------------------------------------------------------------===// // TargetInstrInfo callbacks @@ -120,12 +120,20 @@ bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1, if (Load0->getOperand(0) != Load1->getOperand(0)) return false; + const ConstantSDNode *Load0Offset = + dyn_cast<ConstantSDNode>(Load0->getOperand(1)); + const ConstantSDNode *Load1Offset = + dyn_cast<ConstantSDNode>(Load1->getOperand(1)); + + if (!Load0Offset || !Load1Offset) + return false; + // Check chain. 
if (findChainOperand(Load0) != findChainOperand(Load1)) return false; - Offset0 = cast<ConstantSDNode>(Load0->getOperand(1))->getZExtValue(); - Offset1 = cast<ConstantSDNode>(Load1->getOperand(1))->getZExtValue(); + Offset0 = Load0Offset->getZExtValue(); + Offset1 = Load1Offset->getZExtValue(); return true; } @@ -418,7 +426,9 @@ SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, } } -unsigned SIInstrInfo::commuteOpcode(unsigned Opcode) const { +unsigned SIInstrInfo::commuteOpcode(const MachineInstr &MI) const { + const unsigned Opcode = MI.getOpcode(); + int NewOpc; // Try to map original to commuted opcode @@ -583,10 +593,7 @@ unsigned SIInstrInfo::calculateLDSSpillAddress(MachineBasicBlock &MBB, unsigned TIDIGZReg = TRI->getPreloadedValue(*MF, SIRegisterInfo::TIDIG_Z); unsigned InputPtrReg = TRI->getPreloadedValue(*MF, SIRegisterInfo::INPUT_PTR); - static const unsigned TIDIGRegs[3] = { - TIDIGXReg, TIDIGYReg, TIDIGZReg - }; - for (unsigned Reg : TIDIGRegs) { + for (unsigned Reg : {TIDIGXReg, TIDIGYReg, TIDIGZReg}) { if (!Entry.isLiveIn(Reg)) Entry.addLiveIn(Reg); } @@ -720,6 +727,26 @@ bool SIInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { MI->eraseFromParent(); break; } + + case AMDGPU::V_CNDMASK_B64_PSEUDO: { + unsigned Dst = MI->getOperand(0).getReg(); + unsigned DstLo = RI.getSubReg(Dst, AMDGPU::sub0); + unsigned DstHi = RI.getSubReg(Dst, AMDGPU::sub1); + unsigned Src0 = MI->getOperand(1).getReg(); + unsigned Src1 = MI->getOperand(2).getReg(); + const MachineOperand &SrcCond = MI->getOperand(3); + + BuildMI(MBB, MI, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstLo) + .addReg(RI.getSubReg(Src0, AMDGPU::sub0)) + .addReg(RI.getSubReg(Src1, AMDGPU::sub0)) + .addOperand(SrcCond); + BuildMI(MBB, MI, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstHi) + .addReg(RI.getSubReg(Src0, AMDGPU::sub1)) + .addReg(RI.getSubReg(Src1, AMDGPU::sub1)) + .addOperand(SrcCond); + MI->eraseFromParent(); + break; + } } return true; } @@ -792,7 +819,7 @@ MachineInstr *SIInstrInfo::commuteInstruction(MachineInstr *MI, } if (MI) - MI->setDesc(get(commuteOpcode(MI->getOpcode()))); + MI->setDesc(get(commuteOpcode(*MI))); return MI; } @@ -1172,32 +1199,6 @@ bool SIInstrInfo::isImmOperandLegal(const MachineInstr *MI, unsigned OpNo, return RI.opCanUseInlineConstant(OpInfo.OperandType); } -bool SIInstrInfo::canFoldOffset(unsigned OffsetSize, unsigned AS) const { - switch (AS) { - case AMDGPUAS::GLOBAL_ADDRESS: { - // MUBUF instructions a 12-bit offset in bytes. - return isUInt<12>(OffsetSize); - } - case AMDGPUAS::CONSTANT_ADDRESS: { - // SMRD instructions have an 8-bit offset in dwords on SI and - // a 20-bit offset in bytes on VI. - if (RI.ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) - return isUInt<20>(OffsetSize); - else - return (OffsetSize % 4 == 0) && isUInt<8>(OffsetSize / 4); - } - case AMDGPUAS::LOCAL_ADDRESS: - case AMDGPUAS::REGION_ADDRESS: { - // The single offset versions have a 16-bit offset in bytes. - return isUInt<16>(OffsetSize); - } - case AMDGPUAS::PRIVATE_ADDRESS: - // Indirect register addressing does not use any offsets. 
- default: - return 0; - } -} - bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const { int Op32 = AMDGPU::getVOPe32(Opcode); if (Op32 == -1) @@ -1405,6 +1406,7 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) { case AMDGPU::S_SEXT_I32_I16: return AMDGPU::V_BFE_I32; case AMDGPU::S_BFE_U32: return AMDGPU::V_BFE_U32; case AMDGPU::S_BFE_I32: return AMDGPU::V_BFE_I32; + case AMDGPU::S_BFM_B32: return AMDGPU::V_BFM_B32_e64; case AMDGPU::S_BREV_B32: return AMDGPU::V_BFREV_B32_e32; case AMDGPU::S_NOT_B32: return AMDGPU::V_NOT_B32_e32; case AMDGPU::S_NOT_B64: return AMDGPU::V_NOT_B32_e32; @@ -1423,6 +1425,7 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) { case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64; case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32; case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32; + case AMDGPU::S_FLBIT_I32: return AMDGPU::V_FFBH_I32_e64; } } @@ -1865,12 +1868,15 @@ void SIInstrInfo::legalizeOperands(MachineInstr *MI) const { MachineInstr *Addr64 = BuildMI(MBB, MI, MI->getDebugLoc(), get(Addr64Opcode)) .addOperand(*VData) - .addOperand(*SRsrc) .addReg(AMDGPU::NoRegister) // Dummy value for vaddr. // This will be replaced later // with the new value of vaddr. + .addOperand(*SRsrc) .addOperand(*SOffset) - .addOperand(*Offset); + .addOperand(*Offset) + .addImm(0) // glc + .addImm(0) // slc + .addImm(0); // tfe MI->removeFromParent(); MI = Addr64; @@ -1914,14 +1920,20 @@ void SIInstrInfo::splitSMRD(MachineInstr *MI, // The SMRD has an 8-bit offset in dwords on SI and a 20-bit offset in bytes // on VI. + + bool IsKill = SBase->isKill(); if (OffOp) { - bool isVI = RI.ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS; + bool isVI = + MBB->getParent()->getSubtarget<AMDGPUSubtarget>().getGeneration() >= + AMDGPUSubtarget::VOLCANIC_ISLANDS; unsigned OffScale = isVI ? 1 : 4; // Handle the _IMM variant unsigned LoOffset = OffOp->getImm() * OffScale; unsigned HiOffset = LoOffset + HalfSize; Lo = BuildMI(*MBB, MI, DL, get(HalfImmOp), RegLo) - .addOperand(*SBase) + // Use addReg instead of addOperand + // to make sure kill flag is cleared. + .addReg(SBase->getReg(), 0, SBase->getSubReg()) .addImm(LoOffset / OffScale); if (!isUInt<20>(HiOffset) || (!isVI && !isUInt<8>(HiOffset / OffScale))) { @@ -1930,25 +1942,28 @@ void SIInstrInfo::splitSMRD(MachineInstr *MI, BuildMI(*MBB, MI, DL, get(AMDGPU::S_MOV_B32), OffsetSGPR) .addImm(HiOffset); // The offset in register is in bytes. 
Hi = BuildMI(*MBB, MI, DL, get(HalfSGPROp), RegHi) - .addOperand(*SBase) + .addReg(SBase->getReg(), getKillRegState(IsKill), + SBase->getSubReg()) .addReg(OffsetSGPR); } else { Hi = BuildMI(*MBB, MI, DL, get(HalfImmOp), RegHi) - .addOperand(*SBase) + .addReg(SBase->getReg(), getKillRegState(IsKill), + SBase->getSubReg()) .addImm(HiOffset / OffScale); } } else { // Handle the _SGPR variant MachineOperand *SOff = getNamedOperand(*MI, AMDGPU::OpName::soff); Lo = BuildMI(*MBB, MI, DL, get(HalfSGPROp), RegLo) - .addOperand(*SBase) + .addReg(SBase->getReg(), 0, SBase->getSubReg()) .addOperand(*SOff); unsigned OffsetSGPR = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); BuildMI(*MBB, MI, DL, get(AMDGPU::S_ADD_I32), OffsetSGPR) .addOperand(*SOff) .addImm(HalfSize); Hi = BuildMI(*MBB, MI, DL, get(HalfSGPROp)) - .addOperand(*SBase) + .addReg(SBase->getReg(), getKillRegState(IsKill), + SBase->getSubReg()) .addReg(OffsetSGPR); } @@ -2003,7 +2018,8 @@ void SIInstrInfo::moveSMRDToVALU(MachineInstr *MI, MachineRegisterInfo &MRI) con // SMRD instructions take a dword offsets on SI and byte offset on VI // and MUBUF instructions always take a byte offset. ImmOffset = MI->getOperand(2).getImm(); - if (RI.ST.getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS) + if (MBB->getParent()->getSubtarget<AMDGPUSubtarget>().getGeneration() <= + AMDGPUSubtarget::SEA_ISLANDS) ImmOffset <<= 2; RegOffset = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); @@ -2043,13 +2059,15 @@ void SIInstrInfo::moveSMRDToVALU(MachineInstr *MI, MachineRegisterInfo &MRI) con .addImm(AMDGPU::sub3); MI->setDesc(get(NewOpcode)); if (MI->getOperand(2).isReg()) { - MI->getOperand(2).setReg(MI->getOperand(1).getReg()); + MI->getOperand(2).setReg(SRsrc); } else { - MI->getOperand(2).ChangeToRegister(MI->getOperand(1).getReg(), false); + MI->getOperand(2).ChangeToRegister(SRsrc, false); } - MI->getOperand(1).setReg(SRsrc); MI->addOperand(*MBB->getParent(), MachineOperand::CreateImm(0)); MI->addOperand(*MBB->getParent(), MachineOperand::CreateImm(ImmOffset)); + MI->addOperand(*MBB->getParent(), MachineOperand::CreateImm(0)); // glc + MI->addOperand(*MBB->getParent(), MachineOperand::CreateImm(0)); // slc + MI->addOperand(*MBB->getParent(), MachineOperand::CreateImm(0)); // tfe const TargetRegisterClass *NewDstRC = RI.getRegClass(get(NewOpcode).OpInfo[0].RegClass); diff --git a/lib/Target/R600/SIInstrInfo.h b/lib/Target/R600/SIInstrInfo.h index 12dc3f3..a9aa99f 100644 --- a/lib/Target/R600/SIInstrInfo.h +++ b/lib/Target/R600/SIInstrInfo.h @@ -114,7 +114,7 @@ public: // register. If there is no hardware instruction that can store to \p // DstRC, then AMDGPU::COPY is returned. unsigned getMovOpcode(const TargetRegisterClass *DstRC) const; - unsigned commuteOpcode(unsigned Opcode) const; + unsigned commuteOpcode(const MachineInstr &MI) const; MachineInstr *commuteInstruction(MachineInstr *MI, bool NewMI = false) const override; @@ -218,10 +218,6 @@ public: bool isImmOperandLegal(const MachineInstr *MI, unsigned OpNo, const MachineOperand &MO) const; - /// \brief Return true if the given offset Size in bytes can be folded into - /// the immediate offsets of a memory instruction for the given address space. - bool canFoldOffset(unsigned OffsetSize, unsigned AS) const; - /// \brief Return true if this 64-bit VALU instruction has a 32-bit encoding. /// This function will return false if you pass it a 32-bit instruction. 
bool hasVALU32BitEncoding(unsigned Opcode) const; diff --git a/lib/Target/R600/SIInstrInfo.td b/lib/Target/R600/SIInstrInfo.td index e2747dc..d603ecb 100644 --- a/lib/Target/R600/SIInstrInfo.td +++ b/lib/Target/R600/SIInstrInfo.td @@ -264,6 +264,9 @@ def ds_offset0 : Operand<i8> { def ds_offset1 : Operand<i8> { let PrintMethod = "printDSOffset1"; } +def gds : Operand <i1> { + let PrintMethod = "printGDS"; +} def glc : Operand <i1> { let PrintMethod = "printGLC"; } @@ -284,6 +287,8 @@ def ClampMod : Operand <i1> { } // End OperandType = "OPERAND_IMMEDIATE" +def VOPDstS64 : VOPDstOperand <SReg_64>; + //===----------------------------------------------------------------------===// // Complex patterns //===----------------------------------------------------------------------===// @@ -292,7 +297,7 @@ def DS1Addr1Offset : ComplexPattern<i32, 2, "SelectDS1Addr1Offset">; def DS64Bit4ByteAligned : ComplexPattern<i32, 3, "SelectDS64Bit4ByteAligned">; def MUBUFAddr32 : ComplexPattern<i64, 9, "SelectMUBUFAddr32">; -def MUBUFAddr64 : ComplexPattern<i64, 4, "SelectMUBUFAddr64">; +def MUBUFAddr64 : ComplexPattern<i64, 7, "SelectMUBUFAddr64">; def MUBUFAddr64Atomic : ComplexPattern<i64, 5, "SelectMUBUFAddr64">; def MUBUFScratch : ComplexPattern<i64, 4, "SelectMUBUFScratch">; def MUBUFOffset : ComplexPattern<i64, 6, "SelectMUBUFOffset">; @@ -315,6 +320,7 @@ def SIOperand { def SRCMODS { int NONE = 0; + int NEG = 1; } def DSTCLAMP { @@ -516,7 +522,7 @@ multiclass SOP2_64_32 <sop2 op, string opName, list<dag> pattern> : SOP2_m < class SOPC_Helper <bits<7> op, RegisterOperand rc, ValueType vt, string opName, PatLeaf cond> : SOPC < op, (outs SCCReg:$dst), (ins rc:$src0, rc:$src1), - opName#" $dst, $src0, $src1", []>; + opName#" $src0, $src1", []>; class SOPC_32<bits<7> op, string opName, PatLeaf cond = COND_NULL> : SOPC_Helper<op, SSrc_32, i32, opName, cond>; @@ -637,9 +643,9 @@ class getNumSrcArgs<ValueType Src1, ValueType Src2> { // Returns the register class to use for the destination of VOP[123C] // instructions for the given VT. 
class getVALUDstForVT<ValueType VT> { - RegisterClass ret = !if(!eq(VT.Size, 32), VGPR_32, - !if(!eq(VT.Size, 64), VReg_64, - SReg_64)); // else VT == i1 + RegisterOperand ret = !if(!eq(VT.Size, 32), VOPDstOperand<VGPR_32>, + !if(!eq(VT.Size, 64), VOPDstOperand<VReg_64>, + VOPDstOperand<SReg_64>)); // else VT == i1 } // Returns the register class to use for source 0 of VOP[12C] @@ -717,7 +723,7 @@ class getIns64 <RegisterOperand Src0RC, RegisterOperand Src1RC, class getAsm32 <int NumSrcArgs> { string src1 = ", $src1"; string src2 = ", $src2"; - string ret = " $dst, $src0"# + string ret = "$dst, $src0"# !if(!eq(NumSrcArgs, 1), "", src1)# !if(!eq(NumSrcArgs, 3), src2, ""); } @@ -733,7 +739,7 @@ class getAsm64 <int NumSrcArgs, bit HasModifiers> { string ret = !if(!eq(HasModifiers, 0), getAsm32<NumSrcArgs>.ret, - " $dst, "#src0#src1#src2#"$clamp"#"$omod"); + "$dst, "#src0#src1#src2#"$clamp"#"$omod"); } @@ -745,7 +751,7 @@ class VOPProfile <list<ValueType> _ArgVT> { field ValueType Src0VT = ArgVT[1]; field ValueType Src1VT = ArgVT[2]; field ValueType Src2VT = ArgVT[3]; - field RegisterClass DstRC = getVALUDstForVT<DstVT>.ret; + field RegisterOperand DstRC = getVALUDstForVT<DstVT>.ret; field RegisterOperand Src0RC32 = getVOPSrc0ForVT<Src0VT>.ret; field RegisterClass Src1RC32 = getVOPSrc1ForVT<Src1VT>.ret; field RegisterOperand Src0RC64 = getVOP3SrcForVT<Src0VT>.ret; @@ -761,7 +767,7 @@ class VOPProfile <list<ValueType> _ArgVT> { field dag Ins64 = getIns64<Src0RC64, Src1RC64, Src2RC64, NumSrcArgs, HasModifiers>.ret; - field string Asm32 = "_e32"#getAsm32<NumSrcArgs>.ret; + field string Asm32 = getAsm32<NumSrcArgs>.ret; field string Asm64 = getAsm64<NumSrcArgs, HasModifiers>.ret; } @@ -788,22 +794,27 @@ def VOP_I32_I32_I32_VCC : VOPProfile <[i32, i32, i32, untyped]> { def VOP_I1_F32_I32 : VOPProfile <[i1, f32, i32, untyped]> { let Ins64 = (ins InputModsNoDefault:$src0_modifiers, Src0RC64:$src0, Src1RC64:$src1); - let Asm64 = " $dst, $src0_modifiers, $src1"; + let Asm64 = "$dst, $src0_modifiers, $src1"; } def VOP_I1_F64_I32 : VOPProfile <[i1, f64, i32, untyped]> { let Ins64 = (ins InputModsNoDefault:$src0_modifiers, Src0RC64:$src0, Src1RC64:$src1); - let Asm64 = " $dst, $src0_modifiers, $src1"; + let Asm64 = "$dst, $src0_modifiers, $src1"; } def VOP_I64_I64_I32 : VOPProfile <[i64, i64, i32, untyped]>; def VOP_I64_I32_I64 : VOPProfile <[i64, i32, i64, untyped]>; def VOP_I64_I64_I64 : VOPProfile <[i64, i64, i64, untyped]>; +def VOP_CNDMASK : VOPProfile <[i32, i32, i32, untyped]> { + let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1, VCCReg:$src2); + let Ins64 = (ins Src0RC64:$src0, Src1RC64:$src1, SSrc_64:$src2); + let Asm64 = "$dst, $src0, $src1, $src2"; +} def VOP_F32_F32_F32_F32 : VOPProfile <[f32, f32, f32, f32]>; def VOP_MADK : VOPProfile <[f32, f32, f32, f32]> { field dag Ins = (ins VCSrc_32:$src0, VGPR_32:$vsrc1, u32imm:$src2); - field string Asm = " $dst, $src0, $vsrc1, $src2"; + field string Asm = "$dst, $src0, $vsrc1, $src2"; } def VOP_F64_F64_F64_F64 : VOPProfile <[f64, f64, f64, f64]>; def VOP_I32_I32_I32_I32 : VOPProfile <[i32, i32, i32, i32]>; @@ -835,23 +846,28 @@ class VOP1_Pseudo <dag outs, dag ins, list<dag> pattern, string opName> : field bits<9> src0; } +class VOP1_Real_si <string opName, vop1 op, dag outs, dag ins, string asm> : + VOP1<op.SI, outs, ins, asm, []>, + SIMCInstr <opName#"_e32", SISubtarget.SI>; + +class VOP1_Real_vi <string opName, vop1 op, dag outs, dag ins, string asm> : + VOP1<op.VI, outs, ins, asm, []>, + SIMCInstr <opName#"_e32", SISubtarget.VI>; + multiclass 
VOP1_m <vop1 op, dag outs, dag ins, string asm, list<dag> pattern, string opName> { def "" : VOP1_Pseudo <outs, ins, pattern, opName>; - def _si : VOP1<op.SI, outs, ins, asm, []>, - SIMCInstr <opName#"_e32", SISubtarget.SI>; - def _vi : VOP1<op.VI, outs, ins, asm, []>, - SIMCInstr <opName#"_e32", SISubtarget.VI>; + def _si : VOP1_Real_si <opName, op, outs, ins, asm>; + + def _vi : VOP1_Real_vi <opName, op, outs, ins, asm>; } multiclass VOP1SI_m <vop1 op, dag outs, dag ins, string asm, list<dag> pattern, string opName> { def "" : VOP1_Pseudo <outs, ins, pattern, opName>; - def _si : VOP1<op.SI, outs, ins, asm, []>, - SIMCInstr <opName#"_e32", SISubtarget.SI>; - // No VI instruction. This class is for SI only. + def _si : VOP1_Real_si <opName, op, outs, ins, asm>; } class VOP2_Pseudo <dag outs, dag ins, list<dag> pattern, string opName> : @@ -862,13 +878,20 @@ class VOP2_Pseudo <dag outs, dag ins, list<dag> pattern, string opName> : let isCodeGenOnly = 1; } +class VOP2_Real_si <string opName, vop2 op, dag outs, dag ins, string asm> : + VOP2 <op.SI, outs, ins, opName#asm, []>, + SIMCInstr <opName#"_e32", SISubtarget.SI>; + +class VOP2_Real_vi <string opName, vop2 op, dag outs, dag ins, string asm> : + VOP2 <op.SI, outs, ins, opName#asm, []>, + SIMCInstr <opName#"_e32", SISubtarget.VI>; + multiclass VOP2SI_m <vop2 op, dag outs, dag ins, string asm, list<dag> pattern, string opName, string revOp> { def "" : VOP2_Pseudo <outs, ins, pattern, opName>, VOP2_REV<revOp#"_e32", !eq(revOp, opName)>; - def _si : VOP2 <op.SI, outs, ins, opName#asm, []>, - SIMCInstr <opName#"_e32", SISubtarget.SI>; + def _si : VOP2_Real_si <opName, op, outs, ins, asm>; } multiclass VOP2_m <vop2 op, dag outs, dag ins, string asm, list<dag> pattern, @@ -876,10 +899,10 @@ multiclass VOP2_m <vop2 op, dag outs, dag ins, string asm, list<dag> pattern, def "" : VOP2_Pseudo <outs, ins, pattern, opName>, VOP2_REV<revOp#"_e32", !eq(revOp, opName)>; - def _si : VOP2 <op.SI, outs, ins, opName#asm, []>, - SIMCInstr <opName#"_e32", SISubtarget.SI>; - def _vi : VOP2 <op.VI, outs, ins, opName#asm, []>, - SIMCInstr <opName#"_e32", SISubtarget.VI>; + def _si : VOP2_Real_si <opName, op, outs, ins, asm>; + + def _vi : VOP2_Real_vi <opName, op, outs, ins, asm>; + } class VOP3DisableFields <bit HasSrc1, bit HasSrc2, bit HasModifiers> { @@ -1047,9 +1070,10 @@ multiclass VOP3b_3_m <vop op, dag outs, dag ins, string asm, multiclass VOP3_C_m <vop op, dag outs, dag ins, string asm, list<dag> pattern, string opName, - bit HasMods, bit defExec> { + bit HasMods, bit defExec, string revOp> { - def "" : VOP3_Pseudo <outs, ins, pattern, opName>; + def "" : VOP3_Pseudo <outs, ins, pattern, opName>, + VOP2_REV<revOp#"_e64", !eq(revOp, opName)>; def _si : VOP3_Real_si <op.SI3, outs, ins, asm, opName>, VOP3DisableFields<1, 0, HasMods> { @@ -1086,7 +1110,7 @@ multiclass VOP1_Helper <vop1 op, string opName, dag outs, defm _e32 : VOP1_m <op, outs, ins32, opName#asm32, pat32, opName>; - defm _e64 : VOP3_1_m <op, outs, ins64, opName#"_e64"#asm64, pat64, opName, HasMods>; + defm _e64 : VOP3_1_m <op, outs, ins64, opName#asm64, pat64, opName, HasMods>; } multiclass VOP1Inst <vop1 op, string opName, VOPProfile P, @@ -1121,7 +1145,7 @@ multiclass VOP2_Helper <vop2 op, string opName, dag outs, defm _e32 : VOP2_m <op, outs, ins32, asm32, pat32, opName, revOp>; defm _e64 : VOP3_2_m <op, - outs, ins64, opName#"_e64"#asm64, pat64, opName, revOp, HasMods + outs, ins64, opName#asm64, pat64, opName, revOp, HasMods >; } @@ -1145,7 +1169,7 @@ multiclass VOP2InstSI <vop2 op, 
                        string opName, VOPProfile P,
                        string revOp = opName> {
   defm _e32 : VOP2SI_m <op, P.Outs, P.Ins32, P.Asm32, [], opName, revOp>;

-  defm _e64 : VOP3SI_2_m <op, P.Outs, P.Ins64, opName#"_e64"#P.Asm64,
+  defm _e64 : VOP3SI_2_m <op, P.Outs, P.Ins64, opName#P.Asm64,
     !if(P.HasModifiers,
         [(set P.DstVT:$dst,
              (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers,
@@ -1163,7 +1187,7 @@ multiclass VOP2b_Helper <vop2 op, string opName, dag outs,
   defm _e32 : VOP2_m <op, outs, ins32, asm32, pat32, opName, revOp>;

   defm _e64 : VOP3b_2_m <op,
-    outs, ins64, opName#"_e64"#asm64, pat64, opName, revOp, HasMods
+    outs, ins64, opName#asm64, pat64, opName, revOp, HasMods
   >;
 }

@@ -1189,7 +1213,7 @@ multiclass VOP2_VI3_Helper <vop23 op, string opName, dag outs,
                             string revOp, bit HasMods> {
   defm _e32 : VOP2SI_m <op, outs, ins32, asm32, pat32, opName, revOp>;

-  defm _e64 : VOP3_2_m <op, outs, ins64, opName#"_e64"#asm64, pat64, opName,
+  defm _e64 : VOP3_2_m <op, outs, ins64, opName#asm64, pat64, opName,
                         revOp, HasMods>;
 }

@@ -1235,28 +1259,30 @@ class VOPC_Pseudo <dag outs, dag ins, list<dag> pattern, string opName> :
 }

 multiclass VOPC_m <vopc op, dag outs, dag ins, string asm, list<dag> pattern,
-                   string opName, bit DefExec> {
+                   string opName, bit DefExec, string revOpName = ""> {
   def "" : VOPC_Pseudo <outs, ins, pattern, opName>;

   def _si : VOPC<op.SI, ins, asm, []>,
             SIMCInstr <opName#"_e32", SISubtarget.SI> {
     let Defs = !if(DefExec, [EXEC], []);
+    let hasSideEffects = DefExec;
   }

   def _vi : VOPC<op.VI, ins, asm, []>,
             SIMCInstr <opName#"_e32", SISubtarget.VI> {
     let Defs = !if(DefExec, [EXEC], []);
+    let hasSideEffects = DefExec;
   }
 }

 multiclass VOPC_Helper <vopc op, string opName,
                         dag ins32, string asm32, list<dag> pat32,
                         dag out64, dag ins64, string asm64, list<dag> pat64,
-                        bit HasMods, bit DefExec> {
+                        bit HasMods, bit DefExec, string revOp> {
   defm _e32 : VOPC_m <op, (outs), ins32, opName#asm32, pat32, opName, DefExec>;

-  defm _e64 : VOP3_C_m <op, out64, ins64, opName#"_e64"#asm64, pat64,
-                        opName, HasMods, DefExec>;
+  defm _e64 : VOP3_C_m <op, out64, ins64, opName#asm64, pat64,
+                        opName, HasMods, DefExec, revOp>;
 }

 // Special case for class instructions which only have modifiers on
@@ -1264,20 +1290,21 @@ multiclass VOPC_Helper <vopc op, string opName,
 multiclass VOPC_Class_Helper <vopc op, string opName,
                               dag ins32, string asm32, list<dag> pat32,
                               dag out64, dag ins64, string asm64, list<dag> pat64,
-                              bit HasMods, bit DefExec> {
+                              bit HasMods, bit DefExec, string revOp> {
   defm _e32 : VOPC_m <op, (outs), ins32, opName#asm32, pat32, opName, DefExec>;

-  defm _e64 : VOP3_C_m <op, out64, ins64, opName#"_e64"#asm64, pat64,
-                        opName, HasMods, DefExec>,
+  defm _e64 : VOP3_C_m <op, out64, ins64, opName#asm64, pat64,
+                        opName, HasMods, DefExec, revOp>,
               VOP3DisableModFields<1, 0, 0>;
 }

 multiclass VOPCInst <vopc op, string opName,
                      VOPProfile P, PatLeaf cond = COND_NULL,
+                     string revOp = opName,
                      bit DefExec = 0> : VOPC_Helper <
   op, opName,
   P.Ins32, P.Asm32, [],
-  (outs SReg_64:$dst), P.Ins64, P.Asm64,
+  (outs VOPDstS64:$dst), P.Ins64, P.Asm64,
   !if(P.HasModifiers,
       [(set i1:$dst,
           (setcc (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers,
@@ -1285,54 +1312,55 @@ multiclass VOPCInst <vopc op, string opName,
                  (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers)),
                  cond))],
       [(set i1:$dst, (setcc P.Src0VT:$src0, P.Src1VT:$src1, cond))]),
-  P.HasModifiers, DefExec
+  P.HasModifiers, DefExec, revOp
 >;

 multiclass VOPCClassInst <vopc op, string opName, VOPProfile P,
                           bit DefExec = 0> : VOPC_Class_Helper <
   op, opName,
   P.Ins32, P.Asm32, [],
-  (outs SReg_64:$dst), P.Ins64, P.Asm64,
+  (outs VOPDstS64:$dst), P.Ins64, P.Asm64,
   !if(P.HasModifiers,
       [(set i1:$dst,
           (AMDGPUfp_class (P.Src0VT (VOP3Mods0Clamp0OMod P.Src0VT:$src0, i32:$src0_modifiers)),
                           P.Src1VT:$src1))],
       [(set i1:$dst, (AMDGPUfp_class P.Src0VT:$src0, P.Src1VT:$src1))]),
-  P.HasModifiers, DefExec
+  P.HasModifiers, DefExec, opName
 >;

-multiclass VOPC_F32 <vopc op, string opName, PatLeaf cond = COND_NULL> :
-  VOPCInst <op, opName, VOP_F32_F32_F32, cond>;
+multiclass VOPC_F32 <vopc op, string opName, PatLeaf cond = COND_NULL, string revOp = opName> :
+  VOPCInst <op, opName, VOP_F32_F32_F32, cond, revOp>;

-multiclass VOPC_F64 <vopc op, string opName, PatLeaf cond = COND_NULL> :
-  VOPCInst <op, opName, VOP_F64_F64_F64, cond>;
+multiclass VOPC_F64 <vopc op, string opName, PatLeaf cond = COND_NULL, string revOp = opName> :
+  VOPCInst <op, opName, VOP_F64_F64_F64, cond, revOp>;

-multiclass VOPC_I32 <vopc op, string opName, PatLeaf cond = COND_NULL> :
-  VOPCInst <op, opName, VOP_I32_I32_I32, cond>;
+multiclass VOPC_I32 <vopc op, string opName, PatLeaf cond = COND_NULL, string revOp = opName> :
+  VOPCInst <op, opName, VOP_I32_I32_I32, cond, revOp>;

-multiclass VOPC_I64 <vopc op, string opName, PatLeaf cond = COND_NULL> :
-  VOPCInst <op, opName, VOP_I64_I64_I64, cond>;
+multiclass VOPC_I64 <vopc op, string opName, PatLeaf cond = COND_NULL, string revOp = opName> :
+  VOPCInst <op, opName, VOP_I64_I64_I64, cond, revOp>;

 multiclass VOPCX <vopc op, string opName, VOPProfile P,
-                  PatLeaf cond = COND_NULL>
-  : VOPCInst <op, opName, P, cond, 1>;
+                  PatLeaf cond = COND_NULL,
+                  string revOp = "">
+  : VOPCInst <op, opName, P, cond, revOp, 1>;

-multiclass VOPCX_F32 <vopc op, string opName, PatLeaf cond = COND_NULL> :
-  VOPCX <op, opName, VOP_F32_F32_F32, cond>;
+multiclass VOPCX_F32 <vopc op, string opName, string revOp = opName> :
+  VOPCX <op, opName, VOP_F32_F32_F32, COND_NULL, revOp>;

-multiclass VOPCX_F64 <vopc op, string opName, PatLeaf cond = COND_NULL> :
-  VOPCX <op, opName, VOP_F64_F64_F64, cond>;
+multiclass VOPCX_F64 <vopc op, string opName, string revOp = opName> :
+  VOPCX <op, opName, VOP_F64_F64_F64, COND_NULL, revOp>;

-multiclass VOPCX_I32 <vopc op, string opName, PatLeaf cond = COND_NULL> :
-  VOPCX <op, opName, VOP_I32_I32_I32, cond>;
+multiclass VOPCX_I32 <vopc op, string opName, string revOp = opName> :
+  VOPCX <op, opName, VOP_I32_I32_I32, COND_NULL, revOp>;

-multiclass VOPCX_I64 <vopc op, string opName, PatLeaf cond = COND_NULL> :
-  VOPCX <op, opName, VOP_I64_I64_I64, cond>;
+multiclass VOPCX_I64 <vopc op, string opName, string revOp = opName> :
+  VOPCX <op, opName, VOP_I64_I64_I64, COND_NULL, revOp>;

 multiclass VOP3_Helper <vop3 op, string opName, dag outs, dag ins, string asm,
                         list<dag> pat, int NumSrcArgs, bit HasMods> : VOP3_m <
-  op, outs, ins, opName#asm, pat, opName, NumSrcArgs, HasMods
+  op, outs, ins, opName#" "#asm, pat, opName, NumSrcArgs, HasMods
 >;

 multiclass VOPC_CLASS_F32 <vopc op, string opName> :
@@ -1349,7 +1377,7 @@ multiclass VOPCX_CLASS_F64 <vopc op, string opName> :
 multiclass VOP3Inst <vop3 op, string opName, VOPProfile P,
                      SDPatternOperator node = null_frag> : VOP3_Helper <
-  op, opName, P.Outs, P.Ins64, P.Asm64,
+  op, opName, (outs P.DstRC.RegClass:$dst), P.Ins64, P.Asm64,
   !if(!eq(P.NumSrcArgs, 3),
     !if(P.HasModifiers,
         [(set P.DstVT:$dst,
@@ -1381,7 +1409,7 @@ multiclass VOP3_VCC_Inst <vop3 op, string opName,
                           VOPProfile P,
                           SDPatternOperator node = null_frag> : VOP3_Helper <
   op, opName,
-  P.Outs,
+  (outs P.DstRC.RegClass:$dst),
   (ins InputModsNoDefault:$src0_modifiers, P.Src0RC64:$src0,
        InputModsNoDefault:$src1_modifiers, P.Src1RC64:$src1,
        InputModsNoDefault:$src2_modifiers, P.Src2RC64:$src2,
@@ -1483,10 +1511,8 @@ class DS_Real_vi <bits<8> op, string opName, dag outs, dag ins, string asm> :
   DSe_vi <op>,
   SIMCInstr <opName, SISubtarget.VI>;

-class DS_1A_Real_si <bits<8> op, string opName, dag outs, dag ins, string asm> :
-  DS <outs, ins, asm, []>,
-  DSe <op>,
-  SIMCInstr <opName, SISubtarget.SI> {
+class DS_Off16_Real_si <bits<8> op, string opName, dag outs, dag ins, string asm> :
+  DS_Real_si <op, opName, outs, ins, asm> {

   // Single load interpret the 2 i8imm operands as a single i16 offset.
   bits<16> offset;
@@ -1494,10 +1520,8 @@ class DS_1A_Real_si <bits<8> op, string opName, dag outs, dag ins, string asm> :
   let offset1 = offset{15-8};
 }

-class DS_1A_Real_vi <bits<8> op, string opName, dag outs, dag ins, string asm> :
-  DS <outs, ins, asm, []>,
-  DSe_vi <op>,
-  SIMCInstr <opName, SISubtarget.VI> {
+class DS_Off16_Real_vi <bits<8> op, string opName, dag outs, dag ins, string asm> :
+  DS_Real_vi <op, opName, outs, ins, asm> {

   // Single load interpret the 2 i8imm operands as a single i16 offset.
   bits<16> offset;
@@ -1505,180 +1529,168 @@ class DS_1A_Real_vi <bits<8> op, string opName, dag outs, dag ins, string asm> :
   let offset1 = offset{15-8};
 }

-multiclass DS_1A_Load_m <bits<8> op, string opName, dag outs, dag ins, string asm,
-                         list<dag> pat> {
-  let hasSideEffects = 0, mayLoad = 1, mayStore = 0 in {
-    def "" : DS_Pseudo <opName, outs, ins, pat>;
+multiclass DS_1A_RET <bits<8> op, string opName, RegisterClass rc,
+  dag outs = (outs rc:$vdst),
+  dag ins = (ins VGPR_32:$addr, ds_offset:$offset, gds:$gds, M0Reg:$m0),
+  string asm = opName#" $vdst, $addr"#"$offset$gds"> {

-    let data0 = 0, data1 = 0 in {
-      def _si : DS_1A_Real_si <op, opName, outs, ins, asm>;
-      def _vi : DS_1A_Real_vi <op, opName, outs, ins, asm>;
-    }
+  def "" : DS_Pseudo <opName, outs, ins, []>;
+
+  let data0 = 0, data1 = 0 in {
+    def _si : DS_Off16_Real_si <op, opName, outs, ins, asm>;
+    def _vi : DS_Off16_Real_vi <op, opName, outs, ins, asm>;
   }
 }

-multiclass DS_Load_Helper <bits<8> op, string asm, RegisterClass regClass>
-  : DS_1A_Load_m <
-      op,
-      asm,
-      (outs regClass:$vdst),
-      (ins i1imm:$gds, VGPR_32:$addr, ds_offset:$offset, M0Reg:$m0),
-      asm#" $vdst, $addr"#"$offset",
-      []>;
-
-multiclass DS_Load2_m <bits<8> op, string opName, dag outs, dag ins, string asm,
-                       list<dag> pat> {
-  let hasSideEffects = 0, mayLoad = 1, mayStore = 0 in {
-    def "" : DS_Pseudo <opName, outs, ins, pat>;
-
-    let data0 = 0, data1 = 0 in {
-      def _si : DS_Real_si <op, opName, outs, ins, asm>;
-      def _vi : DS_Real_vi <op, opName, outs, ins, asm>;
-    }
+multiclass DS_1A_Off8_RET <bits<8> op, string opName, RegisterClass rc,
+  dag outs = (outs rc:$vdst),
+  dag ins = (ins VGPR_32:$addr, ds_offset0:$offset0, ds_offset1:$offset1,
+             gds:$gds, M0Reg:$m0),
+  string asm = opName#" $vdst, $addr"#"$offset0"#"$offset1$gds"> {
+
+  def "" : DS_Pseudo <opName, outs, ins, []>;
+
+  let data0 = 0, data1 = 0 in {
+    def _si : DS_Real_si <op, opName, outs, ins, asm>;
+    def _vi : DS_Real_vi <op, opName, outs, ins, asm>;
   }
 }

-multiclass DS_Load2_Helper <bits<8> op, string asm, RegisterClass regClass>
-  : DS_Load2_m <
-      op,
-      asm,
-      (outs regClass:$vdst),
-      (ins i1imm:$gds, VGPR_32:$addr, ds_offset0:$offset0, ds_offset1:$offset1,
-           M0Reg:$m0),
-      asm#" $vdst, $addr"#"$offset0"#"$offset1",
-      []>;
-
-multiclass DS_1A_Store_m <bits<8> op, string opName, dag outs, dag ins,
-                          string asm, list<dag> pat> {
-  let hasSideEffects = 0, mayLoad = 0, mayStore = 1 in {
-    def "" : DS_Pseudo <opName, outs, ins, pat>;
-
-    let data1 = 0, vdst = 0 in {
-      def _si : DS_1A_Real_si <op, opName, outs, ins, asm>;
-      def _vi : DS_1A_Real_vi <op, opName, outs, ins, asm>;
-    }
+multiclass DS_1A1D_NORET <bits<8> op, string opName, RegisterClass rc,
+  dag outs = (outs),
+  dag ins = (ins VGPR_32:$addr, rc:$data0, ds_offset:$offset, gds:$gds,
+             M0Reg:$m0),
+  string asm = opName#" $addr, $data0"#"$offset$gds"> {
+
+  def "" : DS_Pseudo <opName, outs, ins, []>,
+           AtomicNoRet<opName, 0>;
+
+  let data1 = 0, vdst = 0 in {
+    def _si : DS_Off16_Real_si <op, opName, outs, ins, asm>;
+    def _vi : DS_Off16_Real_vi <op, opName, outs, ins, asm>;
   }
 }

-multiclass DS_Store_Helper <bits<8> op, string asm, RegisterClass regClass>
-  : DS_1A_Store_m <
-      op,
-      asm,
-      (outs),
-      (ins i1imm:$gds, VGPR_32:$addr, regClass:$data0, ds_offset:$offset, M0Reg:$m0),
-      asm#" $addr, $data0"#"$offset",
-      []>;
-
-multiclass DS_Store_m <bits<8> op, string opName, dag outs, dag ins,
-                       string asm, list<dag> pat> {
-  let hasSideEffects = 0, mayLoad = 0, mayStore = 1 in {
-    def "" : DS_Pseudo <opName, outs, ins, pat>;
-
-    let vdst = 0 in {
-      def _si : DS_Real_si <op, opName, outs, ins, asm>;
-      def _vi : DS_Real_vi <op, opName, outs, ins, asm>;
-    }
+multiclass DS_1A1D_Off8_NORET <bits<8> op, string opName, RegisterClass rc,
+  dag outs = (outs),
+  dag ins = (ins VGPR_32:$addr, rc:$data0, rc:$data1,
+             ds_offset0:$offset0, ds_offset1:$offset1, gds:$gds, M0Reg:$m0),
+  string asm = opName#" $addr, $data0, $data1"#"$offset0"#"$offset1"#"$gds"> {
+
+  def "" : DS_Pseudo <opName, outs, ins, []>;
+
+  let vdst = 0 in {
+    def _si : DS_Real_si <op, opName, outs, ins, asm>;
+    def _vi : DS_Real_vi <op, opName, outs, ins, asm>;
   }
 }

-multiclass DS_Store2_Helper <bits<8> op, string asm, RegisterClass regClass>
-  : DS_Store_m <
-      op,
-      asm,
-      (outs),
-      (ins i1imm:$gds, VGPR_32:$addr, regClass:$data0, regClass:$data1,
-           ds_offset0:$offset0, ds_offset1:$offset1, M0Reg:$m0),
-      asm#" $addr, $data0, $data1"#"$offset0"#"$offset1",
-      []>;
-
-// 1 address, 1 data.
-multiclass DS_1A1D_RET_m <bits<8> op, string opName, dag outs, dag ins,
-                          string asm, list<dag> pat, string noRetOp> {
-  let mayLoad = 1, mayStore = 1,
-    hasPostISelHook = 1 // Adjusted to no return version.
-  in {
-    def "" : DS_Pseudo <opName, outs, ins, pat>,
-             AtomicNoRet<noRetOp, 1>;
-
-    let data1 = 0 in {
-      def _si : DS_1A_Real_si <op, opName, outs, ins, asm>;
-      def _vi : DS_1A_Real_vi <op, opName, outs, ins, asm>;
-    }
+multiclass DS_1A1D_RET <bits<8> op, string opName, RegisterClass rc,
+  string noRetOp = "",
+  dag outs = (outs rc:$vdst),
+  dag ins = (ins VGPR_32:$addr, rc:$data0, ds_offset:$offset, gds:$gds,
+             M0Reg:$m0),
+  string asm = opName#" $vdst, $addr, $data0"#"$offset$gds"> {
+
+  def "" : DS_Pseudo <opName, outs, ins, []>,
+           AtomicNoRet<noRetOp, 1>;
+
+  let data1 = 0 in {
+    def _si : DS_Off16_Real_si <op, opName, outs, ins, asm>;
+    def _vi : DS_Off16_Real_vi <op, opName, outs, ins, asm>;
   }
 }

-multiclass DS_1A1D_RET <bits<8> op, string asm, RegisterClass rc,
-                        string noRetOp = ""> : DS_1A1D_RET_m <
-  op, asm,
-  (outs rc:$vdst),
-  (ins i1imm:$gds, VGPR_32:$addr, rc:$data0, ds_offset:$offset, M0Reg:$m0),
-  asm#" $vdst, $addr, $data0"#"$offset", [], noRetOp>;
-
-// 1 address, 2 data.
-multiclass DS_1A2D_RET_m <bits<8> op, string opName, dag outs, dag ins,
-                          string asm, list<dag> pat, string noRetOp> {
-  let mayLoad = 1, mayStore = 1,
-    hasPostISelHook = 1 // Adjusted to no return version.
-  in {
-    def "" : DS_Pseudo <opName, outs, ins, pat>,
-             AtomicNoRet<noRetOp, 1>;
-
-    def _si : DS_1A_Real_si <op, opName, outs, ins, asm>;
-    def _vi : DS_1A_Real_vi <op, opName, outs, ins, asm>;
-  }
+multiclass DS_1A2D_RET_m <bits<8> op, string opName, RegisterClass rc,
+  string noRetOp = "", dag ins,
+  dag outs = (outs rc:$vdst),
+  string asm = opName#" $vdst, $addr, $data0, $data1"#"$offset"#"$gds"> {
+
+  def "" : DS_Pseudo <opName, outs, ins, []>,
+           AtomicNoRet<noRetOp, 1>;
+
+  def _si : DS_Off16_Real_si <op, opName, outs, ins, asm>;
+  def _vi : DS_Off16_Real_vi <op, opName, outs, ins, asm>;
 }

 multiclass DS_1A2D_RET <bits<8> op, string asm, RegisterClass rc,
-                        string noRetOp = ""> : DS_1A2D_RET_m <
-  op, asm,
-  (outs rc:$vdst),
-  (ins i1imm:$gds, VGPR_32:$addr, rc:$data0, rc:$data1, ds_offset:$offset, M0Reg:$m0),
-  asm#" $vdst, $addr, $data0, $data1"#"$offset",
-  [], noRetOp>;
-
-// 1 address, 2 data.
-multiclass DS_1A2D_NORET_m <bits<8> op, string opName, dag outs, dag ins,
-                            string asm, list<dag> pat, string noRetOp> {
-  let mayLoad = 1, mayStore = 1 in {
-    def "" : DS_Pseudo <opName, outs, ins, pat>,
-             AtomicNoRet<noRetOp, 0>;
+                        string noRetOp = "", RegisterClass src = rc> :
+  DS_1A2D_RET_m <op, asm, rc, noRetOp,
+                 (ins VGPR_32:$addr, src:$data0, src:$data1,
+                      ds_offset:$offset, gds:$gds, M0Reg:$m0)
+>;

-    let vdst = 0 in {
-      def _si : DS_1A_Real_si <op, opName, outs, ins, asm>;
-      def _vi : DS_1A_Real_vi <op, opName, outs, ins, asm>;
-    }
+multiclass DS_1A2D_NORET <bits<8> op, string opName, RegisterClass rc,
+  string noRetOp = opName,
+  dag outs = (outs),
+  dag ins = (ins VGPR_32:$addr, rc:$data0, rc:$data1,
+             ds_offset:$offset, gds:$gds, M0Reg:$m0),
+  string asm = opName#" $addr, $data0, $data1"#"$offset"#"$gds"> {
+
+  def "" : DS_Pseudo <opName, outs, ins, []>,
+           AtomicNoRet<noRetOp, 0>;
+
+  let vdst = 0 in {
+    def _si : DS_Off16_Real_si <op, opName, outs, ins, asm>;
+    def _vi : DS_Off16_Real_vi <op, opName, outs, ins, asm>;
   }
 }

-multiclass DS_1A2D_NORET <bits<8> op, string asm, RegisterClass rc,
-                          string noRetOp = asm> : DS_1A2D_NORET_m <
-  op, asm,
-  (outs),
-  (ins i1imm:$gds, VGPR_32:$addr, rc:$data0, rc:$data1, ds_offset:$offset, M0Reg:$m0),
-  asm#" $addr, $data0, $data1"#"$offset",
-  [], noRetOp>;
+multiclass DS_0A_RET <bits<8> op, string opName,
+  dag outs = (outs VGPR_32:$vdst),
+  dag ins = (ins ds_offset:$offset, gds:$gds, M0Reg:$m0),
+  string asm = opName#" $vdst"#"$offset"#"$gds"> {

-// 1 address, 1 data.
-multiclass DS_1A1D_NORET_m <bits<8> op, string opName, dag outs, dag ins,
-                            string asm, list<dag> pat, string noRetOp> {
   let mayLoad = 1, mayStore = 1 in {
-    def "" : DS_Pseudo <opName, outs, ins, pat>,
-             AtomicNoRet<noRetOp, 0>;
+    def "" : DS_Pseudo <opName, outs, ins, []>;

-    let data1 = 0, vdst = 0 in {
-      def _si : DS_1A_Real_si <op, opName, outs, ins, asm>;
-      def _vi : DS_1A_Real_vi <op, opName, outs, ins, asm>;
-    }
-  }
+    let addr = 0, data0 = 0, data1 = 0 in {
+      def _si : DS_Off16_Real_si <op, opName, outs, ins, asm>;
+      def _vi : DS_Off16_Real_vi <op, opName, outs, ins, asm>;
+    } // end addr = 0, data0 = 0, data1 = 0
+  } // end mayLoad = 1, mayStore = 1
 }

-multiclass DS_1A1D_NORET <bits<8> op, string asm, RegisterClass rc,
-                          string noRetOp = asm> : DS_1A1D_NORET_m <
-  op, asm,
-  (outs),
-  (ins i1imm:$gds, VGPR_32:$addr, rc:$data0, ds_offset:$offset, M0Reg:$m0),
-  asm#" $addr, $data0"#"$offset",
-  [], noRetOp>;
+multiclass DS_1A_RET_GDS <bits<8> op, string opName,
+  dag outs = (outs VGPR_32:$vdst),
+  dag ins = (ins VGPR_32:$addr, ds_offset:$offset, M0Reg:$m0),
+  string asm = opName#" $vdst, $addr"#"$offset gds"> {
+
+  def "" : DS_Pseudo <opName, outs, ins, []>;
+
+  let data0 = 0, data1 = 0, gds = 1 in {
+    def _si : DS_Off16_Real_si <op, opName, outs, ins, asm>;
+    def _vi : DS_Off16_Real_vi <op, opName, outs, ins, asm>;
+  } // end data0 = 0, data1 = 0, gds = 1
+}
+
+multiclass DS_1A_GDS <bits<8> op, string opName,
+  dag outs = (outs),
+  dag ins = (ins VGPR_32:$addr, M0Reg:$m0),
+  string asm = opName#" $addr gds"> {
+
+  def "" : DS_Pseudo <opName, outs, ins, []>;
+
+  let vdst = 0, data0 = 0, data1 = 0, offset0 = 0, offset1 = 0, gds = 1 in {
+    def _si : DS_Real_si <op, opName, outs, ins, asm>;
+    def _vi : DS_Real_vi <op, opName, outs, ins, asm>;
+  } // end vdst = 0, data = 0, data1 = 0, gds = 1
+}
+
+multiclass DS_1A <bits<8> op, string opName,
+  dag outs = (outs),
+  dag ins = (ins VGPR_32:$addr, ds_offset:$offset, M0Reg:$m0, gds:$gds),
+  string asm = opName#" $addr"#"$offset"#"$gds"> {
+
+  let mayLoad = 1, mayStore = 1 in {
+    def "" : DS_Pseudo <opName, outs, ins, []>;
+
+    let vdst = 0, data0 = 0, data1 = 0 in {
+      def _si : DS_Off16_Real_si <op, opName, outs, ins, asm>;
+      def _vi : DS_Off16_Real_vi <op, opName, outs, ins, asm>;
+    } // let vdst = 0, data0 = 0, data1 = 0
+  } // end mayLoad = 1, mayStore = 1
+}

 //===----------------------------------------------------------------------===//
 // MTBUF classes
@@ -1861,14 +1873,14 @@ multiclass MUBUF_Atomic <mubuf op, string name, RegisterClass rc,
     defm _ADDR64 : MUBUFAtomicAddr64_m <
       op, name#"_addr64", (outs),
       (ins rc:$vdata, SReg_128:$srsrc, VReg_64:$vaddr,
-           mbuf_offset:$offset, SCSrc_32:$soffset, slc:$slc),
+           SCSrc_32:$soffset, mbuf_offset:$offset, slc:$slc),
       name#" $vdata, $vaddr, $srsrc, $soffset addr64"#"$offset"#"$slc", [], 0
     >;

     defm _OFFSET : MUBUFAtomicOffset_m <
       op, name#"_offset", (outs),
-      (ins rc:$vdata, SReg_128:$srsrc, mbuf_offset:$offset,
-           SCSrc_32:$soffset, slc:$slc),
+      (ins rc:$vdata, SReg_128:$srsrc, SCSrc_32:$soffset, mbuf_offset:$offset,
+           slc:$slc),
       name#" $vdata, $srsrc, $soffset"#"$offset"#"$slc", [], 0
     >;
   } // glc = 0
@@ -1880,7 +1892,7 @@ multiclass MUBUF_Atomic <mubuf op, string name, RegisterClass rc,
     defm _RTN_ADDR64 : MUBUFAtomicAddr64_m <
       op, name#"_rtn_addr64", (outs rc:$vdata),
       (ins rc:$vdata_in, SReg_128:$srsrc, VReg_64:$vaddr,
-           mbuf_offset:$offset, SSrc_32:$soffset, slc:$slc),
+           SCSrc_32:$soffset, mbuf_offset:$offset, slc:$slc),
      name#" $vdata, $vaddr, $srsrc, $soffset addr64"#"$offset"#" glc"#"$slc",
      [(set vt:$vdata,
       (atomic (MUBUFAddr64Atomic v4i32:$srsrc, i64:$vaddr, i32:$soffset,
@@ -1889,8 +1901,8 @@ multiclass MUBUF_Atomic <mubuf op, string name, RegisterClass rc,

     defm _RTN_OFFSET : MUBUFAtomicOffset_m <
       op, name#"_rtn_offset", (outs rc:$vdata),
-      (ins rc:$vdata_in, SReg_128:$srsrc, mbuf_offset:$offset,
-           SCSrc_32:$soffset, slc:$slc),
+      (ins rc:$vdata_in, SReg_128:$srsrc, SCSrc_32:$soffset,
+           mbuf_offset:$offset, slc:$slc),
       name#" $vdata, $srsrc, $soffset"#"$offset"#" glc $slc",
       [(set vt:$vdata,
        (atomic (MUBUFOffsetAtomic v4i32:$srsrc, i32:$soffset, i16:$offset,
@@ -1909,9 +1921,8 @@ multiclass MUBUF_Load_Helper <mubuf op, string name, RegisterClass regClass,
   let mayLoad = 1, mayStore = 0 in {
     let offen = 0, idxen = 0, vaddr = 0 in {
       defm _OFFSET : MUBUF_m <op, name#"_offset", (outs regClass:$vdata),
-                              (ins SReg_128:$srsrc,
-                              mbuf_offset:$offset, SCSrc_32:$soffset, glc:$glc,
-                              slc:$slc, tfe:$tfe),
+                              (ins SReg_128:$srsrc, SCSrc_32:$soffset,
+                              mbuf_offset:$offset, glc:$glc, slc:$slc, tfe:$tfe),
                              name#" $vdata, $srsrc, $soffset"#"$offset"#"$glc"#"$slc"#"$tfe",
                              [(set load_vt:$vdata, (ld (MUBUFOffset v4i32:$srsrc,
                                                        i32:$soffset, i16:$offset,
@@ -1920,7 +1931,7 @@ multiclass MUBUF_Load_Helper <mubuf op, string name, RegisterClass regClass,

     let offen = 1, idxen = 0 in {
       defm _OFFEN : MUBUF_m <op, name#"_offen", (outs regClass:$vdata),
-                             (ins SReg_128:$srsrc, VGPR_32:$vaddr,
+                             (ins VGPR_32:$vaddr, SReg_128:$srsrc,
                              SCSrc_32:$soffset, mbuf_offset:$offset, glc:$glc, slc:$slc, tfe:$tfe),
                              name#" $vdata, $vaddr, $srsrc, $soffset offen"#"$offset"#"$glc"#"$slc"#"$tfe", []>;
     }
@@ -1928,45 +1939,48 @@ multiclass MUBUF_Load_Helper <mubuf op, string name, RegisterClass regClass,

     let offen = 0, idxen = 1 in {
       defm _IDXEN : MUBUF_m <op, name#"_idxen", (outs regClass:$vdata),
-                             (ins SReg_128:$srsrc, VGPR_32:$vaddr,
-                             mbuf_offset:$offset, SCSrc_32:$soffset, glc:$glc,
+                             (ins VGPR_32:$vaddr, SReg_128:$srsrc,
+                             SCSrc_32:$soffset, mbuf_offset:$offset, glc:$glc,
                              slc:$slc, tfe:$tfe),
                              name#" $vdata, $vaddr, $srsrc, $soffset idxen"#"$offset"#"$glc"#"$slc"#"$tfe", []>;
     }

     let offen = 1, idxen = 1 in {
       defm _BOTHEN : MUBUF_m <op, name#"_bothen", (outs regClass:$vdata),
-                              (ins SReg_128:$srsrc, VReg_64:$vaddr,
-                              SCSrc_32:$soffset, mbuf_offset:$offset, glc:$glc, slc:$slc, tfe:$tfe),
+                              (ins VReg_64:$vaddr, SReg_128:$srsrc, SCSrc_32:$soffset,
+                              mbuf_offset:$offset, glc:$glc, slc:$slc, tfe:$tfe),
                              name#" $vdata, $vaddr, $srsrc, $soffset idxen offen"#"$offset"#"$glc"#"$slc"#"$tfe", []>;
     }

-    let offen = 0, idxen = 0, glc = 0, slc = 0, tfe = 0 in {
+    let offen = 0, idxen = 0 in {
       defm _ADDR64 : MUBUFAddr64_m <op, name#"_addr64", (outs regClass:$vdata),
-                                    (ins SReg_128:$srsrc, VReg_64:$vaddr,
-                                    SCSrc_32:$soffset, mbuf_offset:$offset),
-                                    name#" $vdata, $vaddr, $srsrc, $soffset addr64"#"$offset",
+                                    (ins VReg_64:$vaddr, SReg_128:$srsrc,
+                                    SCSrc_32:$soffset, mbuf_offset:$offset,
+                                    glc:$glc, slc:$slc, tfe:$tfe),
+                                    name#" $vdata, $vaddr, $srsrc, $soffset addr64"#"$offset"#
+                                    "$glc"#"$slc"#"$tfe",
                                     [(set load_vt:$vdata, (ld (MUBUFAddr64 v4i32:$srsrc,
                                                                i64:$vaddr, i32:$soffset,
-                                                               i16:$offset)))]>;
+                                                               i16:$offset, i1:$glc, i1:$slc,
+                                                               i1:$tfe)))]>;
     }
   }
 }

 multiclass MUBUF_Store_Helper <mubuf op, string name, RegisterClass vdataClass,
-                          ValueType store_vt, SDPatternOperator st> {
+                          ValueType store_vt = i32, SDPatternOperator st = null_frag> {
   let mayLoad = 0, mayStore = 1 in {
     defm : MUBUF_m <op, name, (outs),
-                    (ins vdataClass:$vdata, SReg_128:$srsrc, VGPR_32:$vaddr, SCSrc_32:$soffset,
+                    (ins vdataClass:$vdata, VGPR_32:$vaddr, SReg_128:$srsrc,
+                    SCSrc_32:$soffset,
                     mbuf_offset:$offset, offen:$offen, idxen:$idxen, glc:$glc,
                     slc:$slc, tfe:$tfe),
                     name#" $vdata, $vaddr, $srsrc, $soffset"#"$offen"#"$idxen"#"$offset"#
-                    "$glc"#"$slc"#"$tfe", []>;
+                         "$glc"#"$slc"#"$tfe", []>;

     let offen = 0, idxen = 0, vaddr = 0 in {
       defm _OFFSET : MUBUF_m <op, name#"_offset",(outs),
-                              (ins vdataClass:$vdata, SReg_128:$srsrc, mbuf_offset:$offset,
-                              SCSrc_32:$soffset, glc:$glc, slc:$slc, tfe:$tfe),
+                              (ins vdataClass:$vdata, SReg_128:$srsrc, SCSrc_32:$soffset,
+                              mbuf_offset:$offset, glc:$glc, slc:$slc, tfe:$tfe),
                               name#" $vdata, $srsrc, $soffset"#"$offset"#"$glc"#"$slc"#"$tfe",
                               [(st store_vt:$vdata, (MUBUFOffset v4i32:$srsrc, i32:$soffset,
                                                      i16:$offset, i1:$glc, i1:$slc, i1:$tfe))]>;
@@ -1974,21 +1988,40 @@ multiclass MUBUF_Store_Helper <mubuf op, string name, RegisterClass vdataClass,

     let offen = 1, idxen = 0 in {
       defm _OFFEN : MUBUF_m <op, name#"_offen", (outs),
-                             (ins vdataClass:$vdata, SReg_128:$srsrc, VGPR_32:$vaddr, SCSrc_32:$soffset,
-                             mbuf_offset:$offset, glc:$glc, slc:$slc, tfe:$tfe),
+                             (ins vdataClass:$vdata, VGPR_32:$vaddr, SReg_128:$srsrc,
+                             SCSrc_32:$soffset, mbuf_offset:$offset, glc:$glc,
+                             slc:$slc, tfe:$tfe),
                              name#" $vdata, $vaddr, $srsrc, $soffset offen"#"$offset"#
                              "$glc"#"$slc"#"$tfe", []>;
     } // end offen = 1, idxen = 0

-    let offen = 0, idxen = 0, glc = 0, slc = 0, tfe = 0 in {
+    let offen = 0, idxen = 1 in {
+      defm _IDXEN : MUBUF_m <op, name#"_idxen", (outs),
+                             (ins vdataClass:$vdata, VGPR_32:$vaddr, SReg_128:$srsrc,
+                             SCSrc_32:$soffset, mbuf_offset:$offset, glc:$glc,
+                             slc:$slc, tfe:$tfe),
+                             name#" $vdata, $vaddr, $srsrc, $soffset idxen"#"$offset"#"$glc"#"$slc"#"$tfe", []>;
+    }
+
+    let offen = 1, idxen = 1 in {
+      defm _BOTHEN : MUBUF_m <op, name#"_bothen", (outs),
+                              (ins vdataClass:$vdata, VReg_64:$vaddr, SReg_128:$srsrc, SCSrc_32:$soffset,
+                              mbuf_offset:$offset, glc:$glc, slc:$slc, tfe:$tfe),
+                              name#" $vdata, $vaddr, $srsrc, $soffset idxen offen"#"$offset"#"$glc"#"$slc"#"$tfe", []>;
+    }
+
+    let offen = 0, idxen = 0 in {
       defm _ADDR64 : MUBUFAddr64_m <op, name#"_addr64", (outs),
-                                    (ins vdataClass:$vdata, SReg_128:$srsrc,
-                                    VReg_64:$vaddr, SCSrc_32:$soffset,
-                                    mbuf_offset:$offset),
-                                    name#" $vdata, $vaddr, $srsrc, $soffset addr64"#"$offset",
+                                    (ins vdataClass:$vdata, VReg_64:$vaddr, SReg_128:$srsrc,
+                                    SCSrc_32:$soffset,
+                                    mbuf_offset:$offset, glc:$glc, slc:$slc,
+                                    tfe:$tfe),
+                                    name#" $vdata, $vaddr, $srsrc, $soffset addr64"#
+                                    "$offset"#"$glc"#"$slc"#"$tfe",
                                     [(st store_vt:$vdata, (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr,
-                                                           i32:$soffset, i16:$offset))]>;
+                                                           i32:$soffset, i16:$offset,
+                                                           i1:$glc, i1:$slc, i1:$tfe))]>;
     }
   } // End mayLoad = 0, mayStore = 1
 }
@@ -2182,15 +2215,6 @@ def getVOPe32 : InstrMapping {
   let ValueCols = [["4"]];
 }

-// Maps an original opcode to its commuted version
-def getCommuteRev : InstrMapping {
-  let FilterClass = "VOP2_REV";
-  let RowFields = ["RevOp"];
-  let ColFields = ["IsOrig"];
-  let KeyCol = ["1"];
-  let ValueCols = [["0"]];
-}
-
 def getMaskedMIMGOp : InstrMapping {
   let FilterClass = "MIMG_Mask";
   let RowFields = ["Op"];
@@ -2208,6 +2232,33 @@ def getCommuteOrig : InstrMapping {
   let ValueCols = [["1"]];
 }

+// Maps an original opcode to its commuted version
+def getCommuteRev : InstrMapping {
+  let FilterClass = "VOP2_REV";
+  let RowFields = ["RevOp"];
+  let ColFields = ["IsOrig"];
+  let KeyCol = ["1"];
+  let ValueCols = [["0"]];
+}
+
+def getCommuteCmpOrig : InstrMapping {
+  let FilterClass = "VOP2_REV";
+  let RowFields = ["RevOp"];
+  let ColFields = ["IsOrig"];
+  let KeyCol = ["0"];
+  let ValueCols = [["1"]];
+}
+
+// Maps an original opcode to its commuted version
+def getCommuteCmpRev : InstrMapping {
+  let FilterClass = "VOP2_REV";
+  let RowFields = ["RevOp"];
+  let ColFields = ["IsOrig"];
+  let KeyCol = ["1"];
+  let ValueCols = [["0"]];
+}
+
+
 def getMCOpcodeGen : InstrMapping {
   let FilterClass = "SIMCInstr";
   let RowFields = ["PseudoInstr"];
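Aside on the commute tables added above: TableGen lowers each InstrMapping into a generated integer lookup function in the AMDGPU namespace (emitted into AMDGPUGenInstrInfo.inc), keyed here by the VOP2_REV filter class. The following is a minimal, illustrative C++ sketch of how a pass can combine the orig/rev tables when commuting an instruction; the helper name is hypothetical and the exact call sites lie outside this diff:

    // Illustrative only: map an opcode to its commuted ("rev") form using the
    // TableGen-generated tables, falling back to the opcode itself.
    static int commutedOpcode(uint16_t Opcode) {
      // Original form -> reverse form, e.g. v_cmp_lt_f32 -> v_cmp_gt_f32.
      int NewOpc = AMDGPU::getCommuteRev(Opcode);
      if (NewOpc != -1)
        return NewOpc;
      // Reverse form back to the original.
      NewOpc = AMDGPU::getCommuteOrig(Opcode);
      if (NewOpc != -1)
        return NewOpc;
      return Opcode;
    }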
diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td
index 4f72e99..95b2470 100644
--- a/lib/Target/R600/SIInstructions.td
+++ b/lib/Target/R600/SIInstructions.td
@@ -28,6 +28,8 @@ def SendMsgImm : Operand<i32> {
 def isGCN : Predicate<"Subtarget->getGeneration() "
                       ">= AMDGPUSubtarget::SOUTHERN_ISLANDS">;
+def isSI : Predicate<"Subtarget->getGeneration() "
+                     "== AMDGPUSubtarget::SOUTHERN_ISLANDS">;
 def isSICI : Predicate<
   "Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS ||"
   "Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS"
@@ -153,7 +155,9 @@ defm S_FLBIT_I32_B32 : SOP1_32 <sop1<0x15, 0x12>, "s_flbit_i32_b32",
 >;

 defm S_FLBIT_I32_B64 : SOP1_32_64 <sop1<0x16, 0x13>, "s_flbit_i32_b64", []>;
-defm S_FLBIT_I32 : SOP1_32 <sop1<0x17, 0x14>, "s_flbit_i32", []>;
+defm S_FLBIT_I32 : SOP1_32 <sop1<0x17, 0x14>, "s_flbit_i32",
+  [(set i32:$dst, (int_AMDGPU_flbit_i32 i32:$src0))]
+>;
 defm S_FLBIT_I32_I64 : SOP1_32_64 <sop1<0x18, 0x15>, "s_flbit_i32_i64", []>;
 defm S_SEXT_I32_I8 : SOP1_32 <sop1<0x19, 0x16>, "s_sext_i32_i8",
   [(set i32:$dst, (sext_inreg i32:$src0, i8))]
@@ -304,7 +308,8 @@ defm S_ASHR_I64 : SOP2_64_32 <sop2<0x23, 0x21>, "s_ashr_i64",
 >;
 } // End Defs = [SCC]

-defm S_BFM_B32 : SOP2_32 <sop2<0x24, 0x22>, "s_bfm_b32", []>;
+defm S_BFM_B32 : SOP2_32 <sop2<0x24, 0x22>, "s_bfm_b32",
+  [(set i32:$dst, (AMDGPUbfm i32:$src0, i32:$src1))]>;
 defm S_BFM_B64 : SOP2_64 <sop2<0x25, 0x23>, "s_bfm_b64", []>;
 defm S_MUL_I32 : SOP2_32 <sop2<0x26, 0x24>, "s_mul_i32",
   [(set i32:$dst, (mul i32:$src0, i32:$src1))]
@@ -505,31 +510,30 @@ def S_TTRACEDATA : SOPP <0x00000016, (ins), "s_ttracedata"> {
 // VOPC Instructions
 //===----------------------------------------------------------------------===//

-let isCompare = 1 in {
+let isCompare = 1, isCommutable = 1 in {

 defm V_CMP_F_F32 : VOPC_F32 <vopc<0x0, 0x40>, "v_cmp_f_f32">;
-defm V_CMP_LT_F32 : VOPC_F32 <vopc<0x1, 0x41>, "v_cmp_lt_f32", COND_OLT>;
+defm V_CMP_LT_F32 : VOPC_F32 <vopc<0x1, 0x41>, "v_cmp_lt_f32", COND_OLT, "v_cmp_gt_f32">;
 defm V_CMP_EQ_F32 : VOPC_F32 <vopc<0x2, 0x42>, "v_cmp_eq_f32", COND_OEQ>;
-defm V_CMP_LE_F32 : VOPC_F32 <vopc<0x3, 0x43>, "v_cmp_le_f32", COND_OLE>;
+defm V_CMP_LE_F32 : VOPC_F32 <vopc<0x3, 0x43>, "v_cmp_le_f32", COND_OLE, "v_cmp_ge_f32">;
 defm V_CMP_GT_F32 : VOPC_F32 <vopc<0x4, 0x44>, "v_cmp_gt_f32", COND_OGT>;
 defm V_CMP_LG_F32 : VOPC_F32 <vopc<0x5, 0x45>, "v_cmp_lg_f32", COND_ONE>;
 defm V_CMP_GE_F32 : VOPC_F32 <vopc<0x6, 0x46>, "v_cmp_ge_f32", COND_OGE>;
 defm V_CMP_O_F32 : VOPC_F32 <vopc<0x7, 0x47>, "v_cmp_o_f32", COND_O>;
 defm V_CMP_U_F32 : VOPC_F32 <vopc<0x8, 0x48>, "v_cmp_u_f32", COND_UO>;
-defm V_CMP_NGE_F32 : VOPC_F32 <vopc<0x9, 0x49>, "v_cmp_nge_f32", COND_ULT>;
+defm V_CMP_NGE_F32 : VOPC_F32 <vopc<0x9, 0x49>, "v_cmp_nge_f32", COND_ULT, "v_cmp_nle_f32">;
 defm V_CMP_NLG_F32 : VOPC_F32 <vopc<0xa, 0x4a>, "v_cmp_nlg_f32", COND_UEQ>;
-defm V_CMP_NGT_F32 : VOPC_F32 <vopc<0xb, 0x4b>, "v_cmp_ngt_f32", COND_ULE>;
+defm V_CMP_NGT_F32 : VOPC_F32 <vopc<0xb, 0x4b>, "v_cmp_ngt_f32", COND_ULE, "v_cmp_nlt_f32">;
 defm V_CMP_NLE_F32 : VOPC_F32 <vopc<0xc, 0x4c>, "v_cmp_nle_f32", COND_UGT>;
 defm V_CMP_NEQ_F32 : VOPC_F32 <vopc<0xd, 0x4d>, "v_cmp_neq_f32", COND_UNE>;
 defm V_CMP_NLT_F32 : VOPC_F32 <vopc<0xe, 0x4e>, "v_cmp_nlt_f32", COND_UGE>;
 defm V_CMP_TRU_F32 : VOPC_F32 <vopc<0xf, 0x4f>, "v_cmp_tru_f32">;

-let hasSideEffects = 1 in {

 defm V_CMPX_F_F32 : VOPCX_F32 <vopc<0x10, 0x50>, "v_cmpx_f_f32">;
-defm V_CMPX_LT_F32 : VOPCX_F32 <vopc<0x11, 0x51>, "v_cmpx_lt_f32">;
+defm V_CMPX_LT_F32 : VOPCX_F32 <vopc<0x11, 0x51>, "v_cmpx_lt_f32", "v_cmpx_gt_f32">;
 defm V_CMPX_EQ_F32 : VOPCX_F32 <vopc<0x12, 0x52>, "v_cmpx_eq_f32">;
-defm V_CMPX_LE_F32 : VOPCX_F32 <vopc<0x13, 0x53>, "v_cmpx_le_f32">;
+defm V_CMPX_LE_F32 : VOPCX_F32 <vopc<0x13, 0x53>, "v_cmpx_le_f32", "v_cmpx_ge_f32">;
 defm V_CMPX_GT_F32 : VOPCX_F32 <vopc<0x14, 0x54>, "v_cmpx_gt_f32">;
 defm V_CMPX_LG_F32 : VOPCX_F32 <vopc<0x15, 0x55>, "v_cmpx_lg_f32">;
 defm V_CMPX_GE_F32 : VOPCX_F32 <vopc<0x16, 0x56>, "v_cmpx_ge_f32">;
@@ -543,233 +547,207 @@ defm V_CMPX_NEQ_F32 : VOPCX_F32 <vopc<0x1d, 0x5d>, "v_cmpx_neq_f32">;
 defm V_CMPX_NLT_F32 : VOPCX_F32 <vopc<0x1e, 0x5e>, "v_cmpx_nlt_f32">;
 defm V_CMPX_TRU_F32 : VOPCX_F32 <vopc<0x1f, 0x5f>, "v_cmpx_tru_f32">;

-} // End hasSideEffects = 1

 defm V_CMP_F_F64 : VOPC_F64 <vopc<0x20, 0x60>, "v_cmp_f_f64">;
-defm V_CMP_LT_F64 : VOPC_F64 <vopc<0x21, 0x61>, "v_cmp_lt_f64", COND_OLT>;
+defm V_CMP_LT_F64 : VOPC_F64 <vopc<0x21, 0x61>, "v_cmp_lt_f64", COND_OLT, "v_cmp_gt_f64">;
 defm V_CMP_EQ_F64 : VOPC_F64 <vopc<0x22, 0x62>, "v_cmp_eq_f64", COND_OEQ>;
-defm V_CMP_LE_F64 : VOPC_F64 <vopc<0x23, 0x63>, "v_cmp_le_f64", COND_OLE>;
+defm V_CMP_LE_F64 : VOPC_F64 <vopc<0x23, 0x63>, "v_cmp_le_f64", COND_OLE, "v_cmp_ge_f64">;
 defm V_CMP_GT_F64 : VOPC_F64 <vopc<0x24, 0x64>, "v_cmp_gt_f64", COND_OGT>;
 defm V_CMP_LG_F64 : VOPC_F64 <vopc<0x25, 0x65>, "v_cmp_lg_f64", COND_ONE>;
 defm V_CMP_GE_F64 : VOPC_F64 <vopc<0x26, 0x66>, "v_cmp_ge_f64", COND_OGE>;
 defm V_CMP_O_F64 : VOPC_F64 <vopc<0x27, 0x67>, "v_cmp_o_f64", COND_O>;
 defm V_CMP_U_F64 : VOPC_F64 <vopc<0x28, 0x68>, "v_cmp_u_f64", COND_UO>;
-defm V_CMP_NGE_F64 : VOPC_F64 <vopc<0x29, 0x69>, "v_cmp_nge_f64", COND_ULT>;
+defm V_CMP_NGE_F64 : VOPC_F64 <vopc<0x29, 0x69>, "v_cmp_nge_f64", COND_ULT, "v_cmp_nle_f64">;
 defm V_CMP_NLG_F64 : VOPC_F64 <vopc<0x2a, 0x6a>, "v_cmp_nlg_f64", COND_UEQ>;
-defm V_CMP_NGT_F64 : VOPC_F64 <vopc<0x2b, 0x6b>, "v_cmp_ngt_f64", COND_ULE>;
+defm V_CMP_NGT_F64 : VOPC_F64 <vopc<0x2b, 0x6b>, "v_cmp_ngt_f64", COND_ULE, "v_cmp_nlt_f64">;
 defm V_CMP_NLE_F64 : VOPC_F64 <vopc<0x2c, 0x6c>, "v_cmp_nle_f64", COND_UGT>;
 defm V_CMP_NEQ_F64 : VOPC_F64 <vopc<0x2d, 0x6d>, "v_cmp_neq_f64", COND_UNE>;
 defm V_CMP_NLT_F64 : VOPC_F64 <vopc<0x2e, 0x6e>, "v_cmp_nlt_f64", COND_UGE>;
 defm V_CMP_TRU_F64 : VOPC_F64 <vopc<0x2f, 0x6f>, "v_cmp_tru_f64">;

-let hasSideEffects = 1 in {

 defm V_CMPX_F_F64 : VOPCX_F64 <vopc<0x30, 0x70>, "v_cmpx_f_f64">;
-defm V_CMPX_LT_F64 : VOPCX_F64 <vopc<0x31, 0x71>, "v_cmpx_lt_f64">;
+defm V_CMPX_LT_F64 : VOPCX_F64 <vopc<0x31, 0x71>, "v_cmpx_lt_f64", "v_cmpx_gt_f64">;
 defm V_CMPX_EQ_F64 : VOPCX_F64 <vopc<0x32, 0x72>, "v_cmpx_eq_f64">;
-defm V_CMPX_LE_F64 : VOPCX_F64 <vopc<0x33, 0x73>, "v_cmpx_le_f64">;
+defm V_CMPX_LE_F64 : VOPCX_F64 <vopc<0x33, 0x73>, "v_cmpx_le_f64", "v_cmpx_ge_f64">;
 defm V_CMPX_GT_F64 : VOPCX_F64 <vopc<0x34, 0x74>, "v_cmpx_gt_f64">;
 defm V_CMPX_LG_F64 : VOPCX_F64 <vopc<0x35, 0x75>, "v_cmpx_lg_f64">;
 defm V_CMPX_GE_F64 : VOPCX_F64 <vopc<0x36, 0x76>, "v_cmpx_ge_f64">;
 defm V_CMPX_O_F64 : VOPCX_F64 <vopc<0x37, 0x77>, "v_cmpx_o_f64">;
 defm V_CMPX_U_F64 : VOPCX_F64 <vopc<0x38, 0x78>, "v_cmpx_u_f64">;
-defm V_CMPX_NGE_F64 : VOPCX_F64 <vopc<0x39, 0x79>, "v_cmpx_nge_f64">;
+defm V_CMPX_NGE_F64 : VOPCX_F64 <vopc<0x39, 0x79>, "v_cmpx_nge_f64", "v_cmpx_nle_f64">;
 defm V_CMPX_NLG_F64 : VOPCX_F64 <vopc<0x3a, 0x7a>, "v_cmpx_nlg_f64">;
-defm V_CMPX_NGT_F64 : VOPCX_F64 <vopc<0x3b, 0x7b>, "v_cmpx_ngt_f64">;
+defm V_CMPX_NGT_F64 : VOPCX_F64 <vopc<0x3b, 0x7b>, "v_cmpx_ngt_f64", "v_cmpx_nlt_f64">;
 defm V_CMPX_NLE_F64 : VOPCX_F64 <vopc<0x3c, 0x7c>, "v_cmpx_nle_f64">;
 defm V_CMPX_NEQ_F64 : VOPCX_F64 <vopc<0x3d, 0x7d>, "v_cmpx_neq_f64">;
 defm V_CMPX_NLT_F64 : VOPCX_F64 <vopc<0x3e, 0x7e>, "v_cmpx_nlt_f64">;
 defm V_CMPX_TRU_F64 : VOPCX_F64 <vopc<0x3f, 0x7f>, "v_cmpx_tru_f64">;

-} // End hasSideEffects = 1

 let SubtargetPredicate = isSICI in {

 defm V_CMPS_F_F32 : VOPC_F32 <vopc<0x40>, "v_cmps_f_f32">;
-defm V_CMPS_LT_F32 : VOPC_F32 <vopc<0x41>, "v_cmps_lt_f32">;
+defm V_CMPS_LT_F32 : VOPC_F32 <vopc<0x41>, "v_cmps_lt_f32", COND_NULL, "v_cmps_gt_f32">;
 defm V_CMPS_EQ_F32 : VOPC_F32 <vopc<0x42>, "v_cmps_eq_f32">;
-defm V_CMPS_LE_F32 : VOPC_F32 <vopc<0x43>, "v_cmps_le_f32">;
+defm V_CMPS_LE_F32 : VOPC_F32 <vopc<0x43>, "v_cmps_le_f32", COND_NULL, "v_cmps_ge_f32">;
 defm V_CMPS_GT_F32 : VOPC_F32 <vopc<0x44>, "v_cmps_gt_f32">;
 defm V_CMPS_LG_F32 : VOPC_F32 <vopc<0x45>, "v_cmps_lg_f32">;
 defm V_CMPS_GE_F32 : VOPC_F32 <vopc<0x46>, "v_cmps_ge_f32">;
 defm V_CMPS_O_F32 : VOPC_F32 <vopc<0x47>, "v_cmps_o_f32">;
 defm V_CMPS_U_F32 : VOPC_F32 <vopc<0x48>, "v_cmps_u_f32">;
-defm V_CMPS_NGE_F32 : VOPC_F32 <vopc<0x49>, "v_cmps_nge_f32">;
+defm V_CMPS_NGE_F32 : VOPC_F32 <vopc<0x49>, "v_cmps_nge_f32", COND_NULL, "v_cmps_nle_f32">;
 defm V_CMPS_NLG_F32 : VOPC_F32 <vopc<0x4a>, "v_cmps_nlg_f32">;
-defm V_CMPS_NGT_F32 : VOPC_F32 <vopc<0x4b>, "v_cmps_ngt_f32">;
+defm V_CMPS_NGT_F32 : VOPC_F32 <vopc<0x4b>, "v_cmps_ngt_f32", COND_NULL, "v_cmps_nlt_f32">;
 defm V_CMPS_NLE_F32 : VOPC_F32 <vopc<0x4c>, "v_cmps_nle_f32">;
 defm V_CMPS_NEQ_F32 : VOPC_F32 <vopc<0x4d>, "v_cmps_neq_f32">;
 defm V_CMPS_NLT_F32 : VOPC_F32 <vopc<0x4e>, "v_cmps_nlt_f32">;
 defm V_CMPS_TRU_F32 : VOPC_F32 <vopc<0x4f>, "v_cmps_tru_f32">;

-let hasSideEffects = 1 in {

 defm V_CMPSX_F_F32 : VOPCX_F32 <vopc<0x50>, "v_cmpsx_f_f32">;
-defm V_CMPSX_LT_F32 : VOPCX_F32 <vopc<0x51>, "v_cmpsx_lt_f32">;
+defm V_CMPSX_LT_F32 : VOPCX_F32 <vopc<0x51>, "v_cmpsx_lt_f32", "v_cmpsx_gt_f32">;
 defm V_CMPSX_EQ_F32 : VOPCX_F32 <vopc<0x52>, "v_cmpsx_eq_f32">;
-defm V_CMPSX_LE_F32 : VOPCX_F32 <vopc<0x53>, "v_cmpsx_le_f32">;
+defm V_CMPSX_LE_F32 : VOPCX_F32 <vopc<0x53>, "v_cmpsx_le_f32", "v_cmpsx_ge_f32">;
 defm V_CMPSX_GT_F32 : VOPCX_F32 <vopc<0x54>, "v_cmpsx_gt_f32">;
 defm V_CMPSX_LG_F32 : VOPCX_F32 <vopc<0x55>, "v_cmpsx_lg_f32">;
 defm V_CMPSX_GE_F32 : VOPCX_F32 <vopc<0x56>, "v_cmpsx_ge_f32">;
 defm V_CMPSX_O_F32 : VOPCX_F32 <vopc<0x57>, "v_cmpsx_o_f32">;
 defm V_CMPSX_U_F32 : VOPCX_F32 <vopc<0x58>, "v_cmpsx_u_f32">;
-defm V_CMPSX_NGE_F32 : VOPCX_F32 <vopc<0x59>, "v_cmpsx_nge_f32">;
+defm V_CMPSX_NGE_F32 : VOPCX_F32 <vopc<0x59>, "v_cmpsx_nge_f32", "v_cmpsx_nle_f32">;
 defm V_CMPSX_NLG_F32 : VOPCX_F32 <vopc<0x5a>, "v_cmpsx_nlg_f32">;
-defm V_CMPSX_NGT_F32 : VOPCX_F32 <vopc<0x5b>, "v_cmpsx_ngt_f32">;
+defm V_CMPSX_NGT_F32 : VOPCX_F32 <vopc<0x5b>, "v_cmpsx_ngt_f32", "v_cmpsx_nlt_f32">;
 defm V_CMPSX_NLE_F32 : VOPCX_F32 <vopc<0x5c>, "v_cmpsx_nle_f32">;
 defm V_CMPSX_NEQ_F32 : VOPCX_F32 <vopc<0x5d>, "v_cmpsx_neq_f32">;
 defm V_CMPSX_NLT_F32 : VOPCX_F32 <vopc<0x5e>, "v_cmpsx_nlt_f32">;
 defm V_CMPSX_TRU_F32 : VOPCX_F32 <vopc<0x5f>, "v_cmpsx_tru_f32">;

-} // End hasSideEffects = 1

 defm V_CMPS_F_F64 : VOPC_F64 <vopc<0x60>, "v_cmps_f_f64">;
-defm V_CMPS_LT_F64 : VOPC_F64 <vopc<0x61>, "v_cmps_lt_f64">;
"v_cmps_lt_f64">; +defm V_CMPS_LT_F64 : VOPC_F64 <vopc<0x61>, "v_cmps_lt_f64", COND_NULL, "v_cmps_gt_f64">; defm V_CMPS_EQ_F64 : VOPC_F64 <vopc<0x62>, "v_cmps_eq_f64">; -defm V_CMPS_LE_F64 : VOPC_F64 <vopc<0x63>, "v_cmps_le_f64">; +defm V_CMPS_LE_F64 : VOPC_F64 <vopc<0x63>, "v_cmps_le_f64", COND_NULL, "v_cmps_ge_f64">; defm V_CMPS_GT_F64 : VOPC_F64 <vopc<0x64>, "v_cmps_gt_f64">; defm V_CMPS_LG_F64 : VOPC_F64 <vopc<0x65>, "v_cmps_lg_f64">; defm V_CMPS_GE_F64 : VOPC_F64 <vopc<0x66>, "v_cmps_ge_f64">; defm V_CMPS_O_F64 : VOPC_F64 <vopc<0x67>, "v_cmps_o_f64">; defm V_CMPS_U_F64 : VOPC_F64 <vopc<0x68>, "v_cmps_u_f64">; -defm V_CMPS_NGE_F64 : VOPC_F64 <vopc<0x69>, "v_cmps_nge_f64">; +defm V_CMPS_NGE_F64 : VOPC_F64 <vopc<0x69>, "v_cmps_nge_f64", COND_NULL, "v_cmps_nle_f64">; defm V_CMPS_NLG_F64 : VOPC_F64 <vopc<0x6a>, "v_cmps_nlg_f64">; -defm V_CMPS_NGT_F64 : VOPC_F64 <vopc<0x6b>, "v_cmps_ngt_f64">; +defm V_CMPS_NGT_F64 : VOPC_F64 <vopc<0x6b>, "v_cmps_ngt_f64", COND_NULL, "v_cmps_nlt_f64">; defm V_CMPS_NLE_F64 : VOPC_F64 <vopc<0x6c>, "v_cmps_nle_f64">; defm V_CMPS_NEQ_F64 : VOPC_F64 <vopc<0x6d>, "v_cmps_neq_f64">; defm V_CMPS_NLT_F64 : VOPC_F64 <vopc<0x6e>, "v_cmps_nlt_f64">; defm V_CMPS_TRU_F64 : VOPC_F64 <vopc<0x6f>, "v_cmps_tru_f64">; -let hasSideEffects = 1, Defs = [EXEC] in { - -defm V_CMPSX_F_F64 : VOPC_F64 <vopc<0x70>, "v_cmpsx_f_f64">; -defm V_CMPSX_LT_F64 : VOPC_F64 <vopc<0x71>, "v_cmpsx_lt_f64">; -defm V_CMPSX_EQ_F64 : VOPC_F64 <vopc<0x72>, "v_cmpsx_eq_f64">; -defm V_CMPSX_LE_F64 : VOPC_F64 <vopc<0x73>, "v_cmpsx_le_f64">; -defm V_CMPSX_GT_F64 : VOPC_F64 <vopc<0x74>, "v_cmpsx_gt_f64">; -defm V_CMPSX_LG_F64 : VOPC_F64 <vopc<0x75>, "v_cmpsx_lg_f64">; -defm V_CMPSX_GE_F64 : VOPC_F64 <vopc<0x76>, "v_cmpsx_ge_f64">; -defm V_CMPSX_O_F64 : VOPC_F64 <vopc<0x77>, "v_cmpsx_o_f64">; -defm V_CMPSX_U_F64 : VOPC_F64 <vopc<0x78>, "v_cmpsx_u_f64">; -defm V_CMPSX_NGE_F64 : VOPC_F64 <vopc<0x79>, "v_cmpsx_nge_f64">; -defm V_CMPSX_NLG_F64 : VOPC_F64 <vopc<0x7a>, "v_cmpsx_nlg_f64">; -defm V_CMPSX_NGT_F64 : VOPC_F64 <vopc<0x7b>, "v_cmpsx_ngt_f64">; -defm V_CMPSX_NLE_F64 : VOPC_F64 <vopc<0x7c>, "v_cmpsx_nle_f64">; -defm V_CMPSX_NEQ_F64 : VOPC_F64 <vopc<0x7d>, "v_cmpsx_neq_f64">; -defm V_CMPSX_NLT_F64 : VOPC_F64 <vopc<0x7e>, "v_cmpsx_nlt_f64">; -defm V_CMPSX_TRU_F64 : VOPC_F64 <vopc<0x7f>, "v_cmpsx_tru_f64">; - -} // End hasSideEffects = 1, Defs = [EXEC] + +defm V_CMPSX_F_F64 : VOPCX_F64 <vopc<0x70>, "v_cmpsx_f_f64">; +defm V_CMPSX_LT_F64 : VOPCX_F64 <vopc<0x71>, "v_cmpsx_lt_f64", "v_cmpsx_gt_f64">; +defm V_CMPSX_EQ_F64 : VOPCX_F64 <vopc<0x72>, "v_cmpsx_eq_f64">; +defm V_CMPSX_LE_F64 : VOPCX_F64 <vopc<0x73>, "v_cmpsx_le_f64", "v_cmpsx_ge_f64">; +defm V_CMPSX_GT_F64 : VOPCX_F64 <vopc<0x74>, "v_cmpsx_gt_f64">; +defm V_CMPSX_LG_F64 : VOPCX_F64 <vopc<0x75>, "v_cmpsx_lg_f64">; +defm V_CMPSX_GE_F64 : VOPCX_F64 <vopc<0x76>, "v_cmpsx_ge_f64">; +defm V_CMPSX_O_F64 : VOPCX_F64 <vopc<0x77>, "v_cmpsx_o_f64">; +defm V_CMPSX_U_F64 : VOPCX_F64 <vopc<0x78>, "v_cmpsx_u_f64">; +defm V_CMPSX_NGE_F64 : VOPCX_F64 <vopc<0x79>, "v_cmpsx_nge_f64", "v_cmpsx_nle_f64">; +defm V_CMPSX_NLG_F64 : VOPCX_F64 <vopc<0x7a>, "v_cmpsx_nlg_f64">; +defm V_CMPSX_NGT_F64 : VOPCX_F64 <vopc<0x7b>, "v_cmpsx_ngt_f64", "v_cmpsx_nlt_f64">; +defm V_CMPSX_NLE_F64 : VOPCX_F64 <vopc<0x7c>, "v_cmpsx_nle_f64">; +defm V_CMPSX_NEQ_F64 : VOPCX_F64 <vopc<0x7d>, "v_cmpsx_neq_f64">; +defm V_CMPSX_NLT_F64 : VOPCX_F64 <vopc<0x7e>, "v_cmpsx_nlt_f64">; +defm V_CMPSX_TRU_F64 : VOPCX_F64 <vopc<0x7f>, "v_cmpsx_tru_f64">; } // End SubtargetPredicate = isSICI defm V_CMP_F_I32 : 
-defm V_CMP_LT_I32 : VOPC_I32 <vopc<0x81, 0xc1>, "v_cmp_lt_i32", COND_SLT>;
+defm V_CMP_LT_I32 : VOPC_I32 <vopc<0x81, 0xc1>, "v_cmp_lt_i32", COND_SLT, "v_cmp_gt_i32">;
 defm V_CMP_EQ_I32 : VOPC_I32 <vopc<0x82, 0xc2>, "v_cmp_eq_i32", COND_EQ>;
-defm V_CMP_LE_I32 : VOPC_I32 <vopc<0x83, 0xc3>, "v_cmp_le_i32", COND_SLE>;
+defm V_CMP_LE_I32 : VOPC_I32 <vopc<0x83, 0xc3>, "v_cmp_le_i32", COND_SLE, "v_cmp_ge_i32">;
 defm V_CMP_GT_I32 : VOPC_I32 <vopc<0x84, 0xc4>, "v_cmp_gt_i32", COND_SGT>;
 defm V_CMP_NE_I32 : VOPC_I32 <vopc<0x85, 0xc5>, "v_cmp_ne_i32", COND_NE>;
 defm V_CMP_GE_I32 : VOPC_I32 <vopc<0x86, 0xc6>, "v_cmp_ge_i32", COND_SGE>;
 defm V_CMP_T_I32 : VOPC_I32 <vopc<0x87, 0xc7>, "v_cmp_t_i32">;

-let hasSideEffects = 1 in {

 defm V_CMPX_F_I32 : VOPCX_I32 <vopc<0x90, 0xd0>, "v_cmpx_f_i32">;
-defm V_CMPX_LT_I32 : VOPCX_I32 <vopc<0x91, 0xd1>, "v_cmpx_lt_i32">;
+defm V_CMPX_LT_I32 : VOPCX_I32 <vopc<0x91, 0xd1>, "v_cmpx_lt_i32", "v_cmpx_gt_i32">;
 defm V_CMPX_EQ_I32 : VOPCX_I32 <vopc<0x92, 0xd2>, "v_cmpx_eq_i32">;
-defm V_CMPX_LE_I32 : VOPCX_I32 <vopc<0x93, 0xd3>, "v_cmpx_le_i32">;
+defm V_CMPX_LE_I32 : VOPCX_I32 <vopc<0x93, 0xd3>, "v_cmpx_le_i32", "v_cmpx_ge_i32">;
 defm V_CMPX_GT_I32 : VOPCX_I32 <vopc<0x94, 0xd4>, "v_cmpx_gt_i32">;
 defm V_CMPX_NE_I32 : VOPCX_I32 <vopc<0x95, 0xd5>, "v_cmpx_ne_i32">;
 defm V_CMPX_GE_I32 : VOPCX_I32 <vopc<0x96, 0xd6>, "v_cmpx_ge_i32">;
 defm V_CMPX_T_I32 : VOPCX_I32 <vopc<0x97, 0xd7>, "v_cmpx_t_i32">;

-} // End hasSideEffects = 1

 defm V_CMP_F_I64 : VOPC_I64 <vopc<0xa0, 0xe0>, "v_cmp_f_i64">;
-defm V_CMP_LT_I64 : VOPC_I64 <vopc<0xa1, 0xe1>, "v_cmp_lt_i64", COND_SLT>;
+defm V_CMP_LT_I64 : VOPC_I64 <vopc<0xa1, 0xe1>, "v_cmp_lt_i64", COND_SLT, "v_cmp_gt_i64">;
 defm V_CMP_EQ_I64 : VOPC_I64 <vopc<0xa2, 0xe2>, "v_cmp_eq_i64", COND_EQ>;
-defm V_CMP_LE_I64 : VOPC_I64 <vopc<0xa3, 0xe3>, "v_cmp_le_i64", COND_SLE>;
+defm V_CMP_LE_I64 : VOPC_I64 <vopc<0xa3, 0xe3>, "v_cmp_le_i64", COND_SLE, "v_cmp_ge_i64">;
 defm V_CMP_GT_I64 : VOPC_I64 <vopc<0xa4, 0xe4>, "v_cmp_gt_i64", COND_SGT>;
 defm V_CMP_NE_I64 : VOPC_I64 <vopc<0xa5, 0xe5>, "v_cmp_ne_i64", COND_NE>;
 defm V_CMP_GE_I64 : VOPC_I64 <vopc<0xa6, 0xe6>, "v_cmp_ge_i64", COND_SGE>;
 defm V_CMP_T_I64 : VOPC_I64 <vopc<0xa7, 0xe7>, "v_cmp_t_i64">;

-let hasSideEffects = 1 in {

 defm V_CMPX_F_I64 : VOPCX_I64 <vopc<0xb0, 0xf0>, "v_cmpx_f_i64">;
-defm V_CMPX_LT_I64 : VOPCX_I64 <vopc<0xb1, 0xf1>, "v_cmpx_lt_i64">;
+defm V_CMPX_LT_I64 : VOPCX_I64 <vopc<0xb1, 0xf1>, "v_cmpx_lt_i64", "v_cmpx_gt_i64">;
 defm V_CMPX_EQ_I64 : VOPCX_I64 <vopc<0xb2, 0xf2>, "v_cmpx_eq_i64">;
-defm V_CMPX_LE_I64 : VOPCX_I64 <vopc<0xb3, 0xf3>, "v_cmpx_le_i64">;
+defm V_CMPX_LE_I64 : VOPCX_I64 <vopc<0xb3, 0xf3>, "v_cmpx_le_i64", "v_cmpx_ge_i64">;
 defm V_CMPX_GT_I64 : VOPCX_I64 <vopc<0xb4, 0xf4>, "v_cmpx_gt_i64">;
 defm V_CMPX_NE_I64 : VOPCX_I64 <vopc<0xb5, 0xf5>, "v_cmpx_ne_i64">;
 defm V_CMPX_GE_I64 : VOPCX_I64 <vopc<0xb6, 0xf6>, "v_cmpx_ge_i64">;
 defm V_CMPX_T_I64 : VOPCX_I64 <vopc<0xb7, 0xf7>, "v_cmpx_t_i64">;

-} // End hasSideEffects = 1

 defm V_CMP_F_U32 : VOPC_I32 <vopc<0xc0, 0xc8>, "v_cmp_f_u32">;
-defm V_CMP_LT_U32 : VOPC_I32 <vopc<0xc1, 0xc9>, "v_cmp_lt_u32", COND_ULT>;
+defm V_CMP_LT_U32 : VOPC_I32 <vopc<0xc1, 0xc9>, "v_cmp_lt_u32", COND_ULT, "v_cmp_gt_u32">;
 defm V_CMP_EQ_U32 : VOPC_I32 <vopc<0xc2, 0xca>, "v_cmp_eq_u32", COND_EQ>;
-defm V_CMP_LE_U32 : VOPC_I32 <vopc<0xc3, 0xcb>, "v_cmp_le_u32", COND_ULE>;
+defm V_CMP_LE_U32 : VOPC_I32 <vopc<0xc3, 0xcb>, "v_cmp_le_u32", COND_ULE, "v_cmp_ge_u32">;
 defm V_CMP_GT_U32 : VOPC_I32 <vopc<0xc4, 0xcc>, "v_cmp_gt_u32", COND_UGT>;
 defm V_CMP_NE_U32 : VOPC_I32 <vopc<0xc5, 0xcd>, "v_cmp_ne_u32", COND_NE>;
 defm V_CMP_GE_U32 : VOPC_I32 <vopc<0xc6, 0xce>, "v_cmp_ge_u32", COND_UGE>;
 defm V_CMP_T_U32 : VOPC_I32 <vopc<0xc7, 0xcf>, "v_cmp_t_u32">;

-let hasSideEffects = 1 in {

 defm V_CMPX_F_U32 : VOPCX_I32 <vopc<0xd0, 0xd8>, "v_cmpx_f_u32">;
-defm V_CMPX_LT_U32 : VOPCX_I32 <vopc<0xd1, 0xd9>, "v_cmpx_lt_u32">;
+defm V_CMPX_LT_U32 : VOPCX_I32 <vopc<0xd1, 0xd9>, "v_cmpx_lt_u32", "v_cmpx_gt_u32">;
 defm V_CMPX_EQ_U32 : VOPCX_I32 <vopc<0xd2, 0xda>, "v_cmpx_eq_u32">;
-defm V_CMPX_LE_U32 : VOPCX_I32 <vopc<0xd3, 0xdb>, "v_cmpx_le_u32">;
+defm V_CMPX_LE_U32 : VOPCX_I32 <vopc<0xd3, 0xdb>, "v_cmpx_le_u32", "v_cmpx_ge_u32">;
 defm V_CMPX_GT_U32 : VOPCX_I32 <vopc<0xd4, 0xdc>, "v_cmpx_gt_u32">;
 defm V_CMPX_NE_U32 : VOPCX_I32 <vopc<0xd5, 0xdd>, "v_cmpx_ne_u32">;
 defm V_CMPX_GE_U32 : VOPCX_I32 <vopc<0xd6, 0xde>, "v_cmpx_ge_u32">;
 defm V_CMPX_T_U32 : VOPCX_I32 <vopc<0xd7, 0xdf>, "v_cmpx_t_u32">;

-} // End hasSideEffects = 1

 defm V_CMP_F_U64 : VOPC_I64 <vopc<0xe0, 0xe8>, "v_cmp_f_u64">;
-defm V_CMP_LT_U64 : VOPC_I64 <vopc<0xe1, 0xe9>, "v_cmp_lt_u64", COND_ULT>;
+defm V_CMP_LT_U64 : VOPC_I64 <vopc<0xe1, 0xe9>, "v_cmp_lt_u64", COND_ULT, "v_cmp_gt_u64">;
 defm V_CMP_EQ_U64 : VOPC_I64 <vopc<0xe2, 0xea>, "v_cmp_eq_u64", COND_EQ>;
-defm V_CMP_LE_U64 : VOPC_I64 <vopc<0xe3, 0xeb>, "v_cmp_le_u64", COND_ULE>;
+defm V_CMP_LE_U64 : VOPC_I64 <vopc<0xe3, 0xeb>, "v_cmp_le_u64", COND_ULE, "v_cmp_ge_u64">;
 defm V_CMP_GT_U64 : VOPC_I64 <vopc<0xe4, 0xec>, "v_cmp_gt_u64", COND_UGT>;
 defm V_CMP_NE_U64 : VOPC_I64 <vopc<0xe5, 0xed>, "v_cmp_ne_u64", COND_NE>;
 defm V_CMP_GE_U64 : VOPC_I64 <vopc<0xe6, 0xee>, "v_cmp_ge_u64", COND_UGE>;
 defm V_CMP_T_U64 : VOPC_I64 <vopc<0xe7, 0xef>, "v_cmp_t_u64">;

-let hasSideEffects = 1 in {
-
 defm V_CMPX_F_U64 : VOPCX_I64 <vopc<0xf0, 0xf8>, "v_cmpx_f_u64">;
-defm V_CMPX_LT_U64 : VOPCX_I64 <vopc<0xf1, 0xf9>, "v_cmpx_lt_u64">;
+defm V_CMPX_LT_U64 : VOPCX_I64 <vopc<0xf1, 0xf9>, "v_cmpx_lt_u64", "v_cmpx_gt_u64">;
 defm V_CMPX_EQ_U64 : VOPCX_I64 <vopc<0xf2, 0xfa>, "v_cmpx_eq_u64">;
-defm V_CMPX_LE_U64 : VOPCX_I64 <vopc<0xf3, 0xfb>, "v_cmpx_le_u64">;
+defm V_CMPX_LE_U64 : VOPCX_I64 <vopc<0xf3, 0xfb>, "v_cmpx_le_u64", "v_cmpx_ge_u64">;
 defm V_CMPX_GT_U64 : VOPCX_I64 <vopc<0xf4, 0xfc>, "v_cmpx_gt_u64">;
 defm V_CMPX_NE_U64 : VOPCX_I64 <vopc<0xf5, 0xfd>, "v_cmpx_ne_u64">;
 defm V_CMPX_GE_U64 : VOPCX_I64 <vopc<0xf6, 0xfe>, "v_cmpx_ge_u64">;
 defm V_CMPX_T_U64 : VOPCX_I64 <vopc<0xf7, 0xff>, "v_cmpx_t_u64">;

-} // End hasSideEffects = 1
+} // End isCompare = 1, isCommutable = 1

 defm V_CMP_CLASS_F32 : VOPC_CLASS_F32 <vopc<0x88, 0x10>, "v_cmp_class_f32">;
-
-let hasSideEffects = 1 in {
 defm V_CMPX_CLASS_F32 : VOPCX_CLASS_F32 <vopc<0x98, 0x11>, "v_cmpx_class_f32">;
-} // End hasSideEffects = 1
-
 defm V_CMP_CLASS_F64 : VOPC_CLASS_F64 <vopc<0xa8, 0x12>, "v_cmp_class_f64">;
-
-let hasSideEffects = 1 in {
 defm V_CMPX_CLASS_F64 : VOPCX_CLASS_F64 <vopc<0xb8, 0x13>, "v_cmpx_class_f64">;
-} // End hasSideEffects = 1
-
-} // End isCompare = 1

 //===----------------------------------------------------------------------===//
 // DS Instructions
 //===----------------------------------------------------------------------===//
-
 defm DS_ADD_U32 : DS_1A1D_NORET <0x0, "ds_add_u32", VGPR_32>;
 defm DS_SUB_U32 : DS_1A1D_NORET <0x1, "ds_sub_u32", VGPR_32>;
 defm DS_RSUB_U32 : DS_1A1D_NORET <0x2, "ds_rsub_u32", VGPR_32>;
@@ -782,12 +760,26 @@ defm DS_MAX_U32 : DS_1A1D_NORET <0x8, "ds_max_u32", VGPR_32>;
 defm DS_AND_B32 : DS_1A1D_NORET <0x9, "ds_and_b32", VGPR_32>;
<0x9, "ds_and_b32", VGPR_32>; defm DS_OR_B32 : DS_1A1D_NORET <0xa, "ds_or_b32", VGPR_32>; defm DS_XOR_B32 : DS_1A1D_NORET <0xb, "ds_xor_b32", VGPR_32>; -defm DS_MSKOR_B32 : DS_1A1D_NORET <0xc, "ds_mskor_b32", VGPR_32>; +defm DS_MSKOR_B32 : DS_1A2D_NORET <0xc, "ds_mskor_b32", VGPR_32>; +let mayLoad = 0 in { +defm DS_WRITE_B32 : DS_1A1D_NORET <0xd, "ds_write_b32", VGPR_32>; +defm DS_WRITE2_B32 : DS_1A1D_Off8_NORET <0xe, "ds_write2_b32", VGPR_32>; +defm DS_WRITE2ST64_B32 : DS_1A1D_Off8_NORET <0xf, "ds_write2st64_b32", VGPR_32>; +} defm DS_CMPST_B32 : DS_1A2D_NORET <0x10, "ds_cmpst_b32", VGPR_32>; defm DS_CMPST_F32 : DS_1A2D_NORET <0x11, "ds_cmpst_f32", VGPR_32>; -defm DS_MIN_F32 : DS_1A1D_NORET <0x12, "ds_min_f32", VGPR_32>; -defm DS_MAX_F32 : DS_1A1D_NORET <0x13, "ds_max_f32", VGPR_32>; - +defm DS_MIN_F32 : DS_1A2D_NORET <0x12, "ds_min_f32", VGPR_32>; +defm DS_MAX_F32 : DS_1A2D_NORET <0x13, "ds_max_f32", VGPR_32>; + +defm DS_GWS_INIT : DS_1A_GDS <0x19, "ds_gws_init">; +defm DS_GWS_SEMA_V : DS_1A_GDS <0x1a, "ds_gws_sema_v">; +defm DS_GWS_SEMA_BR : DS_1A_GDS <0x1b, "ds_gws_sema_br">; +defm DS_GWS_SEMA_P : DS_1A_GDS <0x1c, "ds_gws_sema_p">; +defm DS_GWS_BARRIER : DS_1A_GDS <0x1d, "ds_gws_barrier">; +let mayLoad = 0 in { +defm DS_WRITE_B8 : DS_1A1D_NORET <0x1e, "ds_write_b8", VGPR_32>; +defm DS_WRITE_B16 : DS_1A1D_NORET <0x1f, "ds_write_b16", VGPR_32>; +} defm DS_ADD_RTN_U32 : DS_1A1D_RET <0x20, "ds_add_rtn_u32", VGPR_32, "ds_add_u32">; defm DS_SUB_RTN_U32 : DS_1A1D_RET <0x21, "ds_sub_rtn_u32", VGPR_32, "ds_sub_u32">; defm DS_RSUB_RTN_U32 : DS_1A1D_RET <0x22, "ds_rsub_rtn_u32", VGPR_32, "ds_rsub_u32">; @@ -800,20 +792,34 @@ defm DS_MAX_RTN_U32 : DS_1A1D_RET <0x28, "ds_max_rtn_u32", VGPR_32, "ds_max_u32" defm DS_AND_RTN_B32 : DS_1A1D_RET <0x29, "ds_and_rtn_b32", VGPR_32, "ds_and_b32">; defm DS_OR_RTN_B32 : DS_1A1D_RET <0x2a, "ds_or_rtn_b32", VGPR_32, "ds_or_b32">; defm DS_XOR_RTN_B32 : DS_1A1D_RET <0x2b, "ds_xor_rtn_b32", VGPR_32, "ds_xor_b32">; -defm DS_MSKOR_RTN_B32 : DS_1A1D_RET <0x2c, "ds_mskor_rtn_b32", VGPR_32, "ds_mskor_b32">; +defm DS_MSKOR_RTN_B32 : DS_1A2D_RET <0x2c, "ds_mskor_rtn_b32", VGPR_32, "ds_mskor_b32">; defm DS_WRXCHG_RTN_B32 : DS_1A1D_RET <0x2d, "ds_wrxchg_rtn_b32", VGPR_32>; -//def DS_WRXCHG2_RTN_B32 : DS_2A0D_RET <0x2e, "ds_wrxchg2_rtn_b32", VGPR_32, "ds_wrxchg2_b32">; -//def DS_WRXCHG2ST64_RTN_B32 : DS_2A0D_RET <0x2f, "ds_wrxchg2_rtn_b32", VGPR_32, "ds_wrxchg2st64_b32">; +defm DS_WRXCHG2_RTN_B32 : DS_1A2D_RET < + 0x2e, "ds_wrxchg2_rtn_b32", VReg_64, "", VGPR_32 +>; +defm DS_WRXCHG2ST64_RTN_B32 : DS_1A2D_RET < + 0x2f, "ds_wrxchg2st64_rtn_b32", VReg_64, "", VGPR_32 +>; defm DS_CMPST_RTN_B32 : DS_1A2D_RET <0x30, "ds_cmpst_rtn_b32", VGPR_32, "ds_cmpst_b32">; defm DS_CMPST_RTN_F32 : DS_1A2D_RET <0x31, "ds_cmpst_rtn_f32", VGPR_32, "ds_cmpst_f32">; -defm DS_MIN_RTN_F32 : DS_1A1D_RET <0x32, "ds_min_rtn_f32", VGPR_32, "ds_min_f32">; -defm DS_MAX_RTN_F32 : DS_1A1D_RET <0x33, "ds_max_rtn_f32", VGPR_32, "ds_max_f32">; - +defm DS_MIN_RTN_F32 : DS_1A2D_RET <0x32, "ds_min_rtn_f32", VGPR_32, "ds_min_f32">; +defm DS_MAX_RTN_F32 : DS_1A2D_RET <0x33, "ds_max_rtn_f32", VGPR_32, "ds_max_f32">; let SubtargetPredicate = isCI in { defm DS_WRAP_RTN_F32 : DS_1A1D_RET <0x34, "ds_wrap_rtn_f32", VGPR_32, "ds_wrap_f32">; } // End isCI - - +defm DS_SWIZZLE_B32 : DS_1A_RET <0x35, "ds_swizzle_b32", VGPR_32>; +let mayStore = 0 in { +defm DS_READ_B32 : DS_1A_RET <0x36, "ds_read_b32", VGPR_32>; +defm DS_READ2_B32 : DS_1A_Off8_RET <0x37, "ds_read2_b32", VReg_64>; +defm DS_READ2ST64_B32 : DS_1A_Off8_RET <0x38, 
"ds_read2st64_b32", VReg_64>; +defm DS_READ_I8 : DS_1A_RET <0x39, "ds_read_i8", VGPR_32>; +defm DS_READ_U8 : DS_1A_RET <0x3a, "ds_read_u8", VGPR_32>; +defm DS_READ_I16 : DS_1A_RET <0x3b, "ds_read_i16", VGPR_32>; +defm DS_READ_U16 : DS_1A_RET <0x3c, "ds_read_u16", VGPR_32>; +} +defm DS_CONSUME : DS_0A_RET <0x3d, "ds_consume">; +defm DS_APPEND : DS_0A_RET <0x3e, "ds_append">; +defm DS_ORDERED_COUNT : DS_1A_RET_GDS <0x3f, "ds_ordered_count">; defm DS_ADD_U64 : DS_1A1D_NORET <0x40, "ds_add_u64", VReg_64>; defm DS_SUB_U64 : DS_1A1D_NORET <0x41, "ds_sub_u64", VReg_64>; defm DS_RSUB_U64 : DS_1A1D_NORET <0x42, "ds_rsub_u64", VReg_64>; @@ -826,7 +832,12 @@ defm DS_MAX_U64 : DS_1A1D_NORET <0x48, "ds_max_u64", VReg_64>; defm DS_AND_B64 : DS_1A1D_NORET <0x49, "ds_and_b64", VReg_64>; defm DS_OR_B64 : DS_1A1D_NORET <0x4a, "ds_or_b64", VReg_64>; defm DS_XOR_B64 : DS_1A1D_NORET <0x4b, "ds_xor_b64", VReg_64>; -defm DS_MSKOR_B64 : DS_1A1D_NORET <0x4c, "ds_mskor_b64", VReg_64>; +defm DS_MSKOR_B64 : DS_1A2D_NORET <0x4c, "ds_mskor_b64", VReg_64>; +let mayLoad = 0 in { +defm DS_WRITE_B64 : DS_1A1D_NORET <0x4d, "ds_write_b64", VReg_64>; +defm DS_WRITE2_B64 : DS_1A1D_Off8_NORET <0x4E, "ds_write2_b64", VReg_64>; +defm DS_WRITE2ST64_B64 : DS_1A1D_Off8_NORET <0x4f, "ds_write2st64_b64", VReg_64>; +} defm DS_CMPST_B64 : DS_1A2D_NORET <0x50, "ds_cmpst_b64", VReg_64>; defm DS_CMPST_F64 : DS_1A2D_NORET <0x51, "ds_cmpst_f64", VReg_64>; defm DS_MIN_F64 : DS_1A1D_NORET <0x52, "ds_min_f64", VReg_64>; @@ -844,57 +855,88 @@ defm DS_MAX_RTN_U64 : DS_1A1D_RET <0x68, "ds_max_rtn_u64", VReg_64, "ds_max_u64" defm DS_AND_RTN_B64 : DS_1A1D_RET <0x69, "ds_and_rtn_b64", VReg_64, "ds_and_b64">; defm DS_OR_RTN_B64 : DS_1A1D_RET <0x6a, "ds_or_rtn_b64", VReg_64, "ds_or_b64">; defm DS_XOR_RTN_B64 : DS_1A1D_RET <0x6b, "ds_xor_rtn_b64", VReg_64, "ds_xor_b64">; -defm DS_MSKOR_RTN_B64 : DS_1A1D_RET <0x6c, "ds_mskor_rtn_b64", VReg_64, "ds_mskor_b64">; +defm DS_MSKOR_RTN_B64 : DS_1A2D_RET <0x6c, "ds_mskor_rtn_b64", VReg_64, "ds_mskor_b64">; defm DS_WRXCHG_RTN_B64 : DS_1A1D_RET <0x6d, "ds_wrxchg_rtn_b64", VReg_64, "ds_wrxchg_b64">; -//def DS_WRXCHG2_RTN_B64 : DS_2A0D_RET <0x6e, "ds_wrxchg2_rtn_b64", VReg_64, "ds_wrxchg2_b64">; -//def DS_WRXCHG2ST64_RTN_B64 : DS_2A0D_RET <0x6f, "ds_wrxchg2_rtn_b64", VReg_64, "ds_wrxchg2st64_b64">; +defm DS_WRXCHG2_RTN_B64 : DS_1A2D_RET <0x6e, "ds_wrxchg2_rtn_b64", VReg_128, "ds_wrxchg2_b64", VReg_64>; +defm DS_WRXCHG2ST64_RTN_B64 : DS_1A2D_RET <0x6f, "ds_wrxchg2st64_rtn_b64", VReg_128, "ds_wrxchg2st64_b64", VReg_64>; defm DS_CMPST_RTN_B64 : DS_1A2D_RET <0x70, "ds_cmpst_rtn_b64", VReg_64, "ds_cmpst_b64">; defm DS_CMPST_RTN_F64 : DS_1A2D_RET <0x71, "ds_cmpst_rtn_f64", VReg_64, "ds_cmpst_f64">; defm DS_MIN_RTN_F64 : DS_1A1D_RET <0x72, "ds_min_rtn_f64", VReg_64, "ds_min_f64">; defm DS_MAX_RTN_F64 : DS_1A1D_RET <0x73, "ds_max_rtn_f64", VReg_64, "ds_max_f64">; +let mayStore = 0 in { +defm DS_READ_B64 : DS_1A_RET <0x76, "ds_read_b64", VReg_64>; +defm DS_READ2_B64 : DS_1A_Off8_RET <0x77, "ds_read2_b64", VReg_128>; +defm DS_READ2ST64_B64 : DS_1A_Off8_RET <0x78, "ds_read2st64_b64", VReg_128>; +} + +defm DS_ADD_SRC2_U32 : DS_1A <0x80, "ds_add_src2_u32">; +defm DS_SUB_SRC2_U32 : DS_1A <0x81, "ds_sub_src2_u32">; +defm DS_RSUB_SRC2_U32 : DS_1A <0x82, "ds_rsub_src2_u32">; +defm DS_INC_SRC2_U32 : DS_1A <0x83, "ds_inc_src2_u32">; +defm DS_DEC_SRC2_U32 : DS_1A <0x84, "ds_dec_src2_u32">; +defm DS_MIN_SRC2_I32 : DS_1A <0x85, "ds_min_src2_i32">; +defm DS_MAX_SRC2_I32 : DS_1A <0x86, "ds_max_src2_i32">; +defm DS_MIN_SRC2_U32 : DS_1A 
<0x87, "ds_min_src2_u32">; +defm DS_MAX_SRC2_U32 : DS_1A <0x88, "ds_max_src2_u32">; +defm DS_AND_SRC2_B32 : DS_1A <0x89, "ds_and_src_b32">; +defm DS_OR_SRC2_B32 : DS_1A <0x8a, "ds_or_src2_b32">; +defm DS_XOR_SRC2_B32 : DS_1A <0x8b, "ds_xor_src2_b32">; +defm DS_WRITE_SRC2_B32 : DS_1A <0x8c, "ds_write_src2_b32">; + +defm DS_MIN_SRC2_F32 : DS_1A <0x92, "ds_min_src2_f32">; +defm DS_MAX_SRC2_F32 : DS_1A <0x93, "ds_max_src2_f32">; + +defm DS_ADD_SRC2_U64 : DS_1A <0xc0, "ds_add_src2_u64">; +defm DS_SUB_SRC2_U64 : DS_1A <0xc1, "ds_sub_src2_u64">; +defm DS_RSUB_SRC2_U64 : DS_1A <0xc2, "ds_rsub_src2_u64">; +defm DS_INC_SRC2_U64 : DS_1A <0xc3, "ds_inc_src2_u64">; +defm DS_DEC_SRC2_U64 : DS_1A <0xc4, "ds_dec_src2_u64">; +defm DS_MIN_SRC2_I64 : DS_1A <0xc5, "ds_min_src2_i64">; +defm DS_MAX_SRC2_I64 : DS_1A <0xc6, "ds_max_src2_i64">; +defm DS_MIN_SRC2_U64 : DS_1A <0xc7, "ds_min_src2_u64">; +defm DS_MAX_SRC2_U64 : DS_1A <0xc8, "ds_max_src2_u64">; +defm DS_AND_SRC2_B64 : DS_1A <0xc9, "ds_and_src2_b64">; +defm DS_OR_SRC2_B64 : DS_1A <0xca, "ds_or_src2_b64">; +defm DS_XOR_SRC2_B64 : DS_1A <0xcb, "ds_xor_src2_b64">; +defm DS_WRITE_SRC2_B64 : DS_1A <0xcc, "ds_write_src2_b64">; + +defm DS_MIN_SRC2_F64 : DS_1A <0xd2, "ds_min_src2_f64">; +defm DS_MAX_SRC2_F64 : DS_1A <0xd3, "ds_max_src2_f64">; + //let SubtargetPredicate = isCI in { // DS_CONDXCHG32_RTN_B64 // DS_CONDXCHG32_RTN_B128 //} // End isCI -// TODO: _SRC2_* forms - -defm DS_WRITE_B32 : DS_Store_Helper <0x0000000d, "ds_write_b32", VGPR_32>; -defm DS_WRITE_B8 : DS_Store_Helper <0x00000001e, "ds_write_b8", VGPR_32>; -defm DS_WRITE_B16 : DS_Store_Helper <0x00000001f, "ds_write_b16", VGPR_32>; -defm DS_WRITE_B64 : DS_Store_Helper <0x00000004d, "ds_write_b64", VReg_64>; - -defm DS_READ_B32 : DS_Load_Helper <0x00000036, "ds_read_b32", VGPR_32>; -defm DS_READ_I8 : DS_Load_Helper <0x00000039, "ds_read_i8", VGPR_32>; -defm DS_READ_U8 : DS_Load_Helper <0x0000003a, "ds_read_u8", VGPR_32>; -defm DS_READ_I16 : DS_Load_Helper <0x0000003b, "ds_read_i16", VGPR_32>; -defm DS_READ_U16 : DS_Load_Helper <0x0000003c, "ds_read_u16", VGPR_32>; -defm DS_READ_B64 : DS_Load_Helper <0x00000076, "ds_read_b64", VReg_64>; - -// 2 forms. 
-defm DS_WRITE2_B32 : DS_Store2_Helper <0x0000000E, "ds_write2_b32", VGPR_32>; -defm DS_WRITE2ST64_B32 : DS_Store2_Helper <0x0000000F, "ds_write2st64_b32", VGPR_32>; -defm DS_WRITE2_B64 : DS_Store2_Helper <0x0000004E, "ds_write2_b64", VReg_64>; -defm DS_WRITE2ST64_B64 : DS_Store2_Helper <0x0000004F, "ds_write2st64_b64", VReg_64>; - -defm DS_READ2_B32 : DS_Load2_Helper <0x00000037, "ds_read2_b32", VReg_64>; -defm DS_READ2ST64_B32 : DS_Load2_Helper <0x00000038, "ds_read2st64_b32", VReg_64>; -defm DS_READ2_B64 : DS_Load2_Helper <0x00000075, "ds_read2_b64", VReg_128>; -defm DS_READ2ST64_B64 : DS_Load2_Helper <0x00000076, "ds_read2st64_b64", VReg_128>; - //===----------------------------------------------------------------------===// // MUBUF Instructions //===----------------------------------------------------------------------===// -//def BUFFER_LOAD_FORMAT_X : MUBUF_ <mubuf<0x00>, "buffer_load_format_x", []>; -//def BUFFER_LOAD_FORMAT_XY : MUBUF_ <mubuf<0x01>, "buffer_load_format_xy", []>; -//def BUFFER_LOAD_FORMAT_XYZ : MUBUF_ <mubuf<0x02>, "buffer_load_format_xyz", []>; -defm BUFFER_LOAD_FORMAT_XYZW : MUBUF_Load_Helper <mubuf<0x03>, "buffer_load_format_xyzw", VReg_128>; -//def BUFFER_STORE_FORMAT_X : MUBUF_ <mubuf<0x04>, "buffer_store_format_x", []>; -//def BUFFER_STORE_FORMAT_XY : MUBUF_ <mubuf<0x05>, "buffer_store_format_xy", []>; -//def BUFFER_STORE_FORMAT_XYZ : MUBUF_ <mubuf<0x06>, "buffer_store_format_xyz", []>; -//def BUFFER_STORE_FORMAT_XYZW : MUBUF_ <mubuf<0x07>, "buffer_store_format_xyzw", []>; +defm BUFFER_LOAD_FORMAT_X : MUBUF_Load_Helper < + mubuf<0x00>, "buffer_load_format_x", VGPR_32 +>; +defm BUFFER_LOAD_FORMAT_XY : MUBUF_Load_Helper < + mubuf<0x01>, "buffer_load_format_xy", VReg_64 +>; +defm BUFFER_LOAD_FORMAT_XYZ : MUBUF_Load_Helper < + mubuf<0x02>, "buffer_load_format_xyz", VReg_96 +>; +defm BUFFER_LOAD_FORMAT_XYZW : MUBUF_Load_Helper < + mubuf<0x03>, "buffer_load_format_xyzw", VReg_128 +>; +defm BUFFER_STORE_FORMAT_X : MUBUF_Store_Helper < + mubuf<0x04>, "buffer_store_format_x", VGPR_32 +>; +defm BUFFER_STORE_FORMAT_XY : MUBUF_Store_Helper < + mubuf<0x05>, "buffer_store_format_xy", VReg_64 +>; +defm BUFFER_STORE_FORMAT_XYZ : MUBUF_Store_Helper < + mubuf<0x06>, "buffer_store_format_xyz", VReg_96 +>; +defm BUFFER_STORE_FORMAT_XYZW : MUBUF_Store_Helper < + mubuf<0x07>, "buffer_store_format_xyzw", VReg_128 +>; defm BUFFER_LOAD_UBYTE : MUBUF_Load_Helper < mubuf<0x08, 0x10>, "buffer_load_ubyte", VGPR_32, i32, az_extloadi8_global >; @@ -1418,13 +1460,17 @@ defm V_INTERP_MOV_F32 : VINTRP_m < // VOP2 Instructions //===----------------------------------------------------------------------===// -defm V_CNDMASK_B32_e64 : VOP3_m_nomods <vop3<0x100>, (outs VGPR_32:$dst), - (ins VSrc_32:$src0, VSrc_32:$src1, SSrc_64:$src2), - "v_cndmask_b32_e64 $dst, $src0, $src1, $src2", - [(set i32:$dst, (select i1:$src2, i32:$src1, i32:$src0))], - "v_cndmask_b32_e64", 3 ->; +multiclass V_CNDMASK <vop2 op, string name> { + defm _e32 : VOP2_m < + op, VOP_CNDMASK.Outs, VOP_CNDMASK.Ins32, VOP_CNDMASK.Asm32, [], + name, name>; + + defm _e64 : VOP3_m < + op, VOP_CNDMASK.Outs, VOP_CNDMASK.Ins64, + name#!cast<string>(VOP_CNDMASK.Asm64), [], name, 3>; +} +defm V_CNDMASK_B32 : V_CNDMASK<vop2<0x0>, "v_cndmask_b32">; let isCommutable = 1 in { defm V_ADD_F32 : VOP2Inst <vop2<0x3, 0x1>, "v_add_f32", @@ -1568,8 +1614,8 @@ defm V_MAC_LEGACY_F32 : VOP2_VI3_Inst <vop23<0x6, 0x28e>, "v_mac_legacy_f32", >; } // End isCommutable = 1 -defm V_BFM_B32 : VOP2_VI3_Inst <vop23<0x1e, 0x293>, "v_bfm_b32", VOP_I32_I32_I32, - 
AMDGPUbfm +defm V_BFM_B32 : VOP2_VI3_Inst <vop23<0x1e, 0x293>, "v_bfm_b32", + VOP_I32_I32_I32 >; defm V_BCNT_U32_B32 : VOP2_VI3_Inst <vop23<0x22, 0x28b>, "v_bcnt_u32_b32", VOP_I32_I32_I32 @@ -1638,14 +1684,12 @@ defm V_CUBEMA_F32 : VOP3Inst <vop3<0x147, 0x1c7>, "v_cubema_f32", VOP_F32_F32_F32_F32 >; -let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in { defm V_BFE_U32 : VOP3Inst <vop3<0x148, 0x1c8>, "v_bfe_u32", VOP_I32_I32_I32_I32, AMDGPUbfe_u32 >; defm V_BFE_I32 : VOP3Inst <vop3<0x149, 0x1c9>, "v_bfe_i32", VOP_I32_I32_I32_I32, AMDGPUbfe_i32 >; -} defm V_BFI_B32 : VOP3Inst <vop3<0x14a, 0x1ca>, "v_bfi_b32", VOP_I32_I32_I32_I32, AMDGPUbfi @@ -1833,6 +1877,11 @@ defm V_ASHRREV_I64 : VOP3Inst <vop3<0, 0x291>, "v_ashrrev_i64", //===----------------------------------------------------------------------===// let isCodeGenOnly = 1, isPseudo = 1 in { +// For use in patterns +def V_CNDMASK_B64_PSEUDO : VOP3Common <(outs VReg_64:$dst), + (ins VSrc_64:$src0, VSrc_64:$src1, SSrc_64:$src2), "", [] +>; + let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in { // 64-bit vector move instruction. This is mainly used by the SIFoldOperands // pass to enable folding of inline immediates. @@ -2049,7 +2098,7 @@ def : Pat < /* int_SI_vs_load_input */ def : Pat< (SIload_input v4i32:$tlst, imm:$attr_offset, i32:$buf_idx_vgpr), - (BUFFER_LOAD_FORMAT_XYZW_IDXEN $tlst, $buf_idx_vgpr, imm:$attr_offset, 0, 0, 0, 0) + (BUFFER_LOAD_FORMAT_XYZW_IDXEN $buf_idx_vgpr, $tlst, 0, imm:$attr_offset, 0, 0, 0) >; /* int_SI_export */ @@ -2196,6 +2245,11 @@ def : Pat < (V_BCNT_U32_B32_e64 $popcnt, $val) >; +def : Pat < + (i32 (select i1:$src0, i32:$src1, i32:$src2)), + (V_CNDMASK_B32_e64 $src2, $src1, $src0) +>; + /********** ======================= **********/ /********** Image sampling patterns **********/ /********** ======================= **********/ @@ -2738,7 +2792,7 @@ def : Ext32Pat <anyext>; // Offset in an 32Bit VGPR def : Pat < (SIload_constant v4i32:$sbase, i32:$voff), - (BUFFER_LOAD_DWORD_OFFEN $sbase, $voff, 0, 0, 0, 0, 0) + (BUFFER_LOAD_DWORD_OFFEN $voff, $sbase, 0, 0, 0, 0, 0) >; // The multiplication scales from [0,1] to the unsigned integer range @@ -2781,7 +2835,7 @@ def : ROTRPattern <V_ALIGNBIT_B32>; class DSReadPat <DS inst, ValueType vt, PatFrag frag> : Pat < (vt (frag (DS1Addr1Offset i32:$ptr, i32:$offset))), - (inst (i1 0), $ptr, (as_i16imm $offset), (S_MOV_B32 -1)) + (inst $ptr, (as_i16imm $offset), (i1 0), (S_MOV_B32 -1)) >; def : DSReadPat <DS_READ_I8, i32, sextloadi8_local>; @@ -2799,12 +2853,12 @@ def : DSReadPat <DS_READ_B64, v2i32, local_load_aligned8bytes>; def : Pat < (v2i32 (local_load (DS64Bit4ByteAligned i32:$ptr, i8:$offset0, i8:$offset1))), - (DS_READ2_B32 (i1 0), $ptr, $offset0, $offset1, (S_MOV_B32 -1)) + (DS_READ2_B32 $ptr, $offset0, $offset1, (i1 0), (S_MOV_B32 -1)) >; class DSWritePat <DS inst, ValueType vt, PatFrag frag> : Pat < (frag vt:$value, (DS1Addr1Offset i32:$ptr, i32:$offset)), - (inst (i1 0), $ptr, $value, (as_i16imm $offset), (S_MOV_B32 -1)) + (inst $ptr, $value, (as_i16imm $offset), (i1 0), (S_MOV_B32 -1)) >; def : DSWritePat <DS_WRITE_B8, i32, truncstorei8_local>; @@ -2819,14 +2873,14 @@ def : DSWritePat <DS_WRITE_B64, v2i32, local_store_aligned8bytes>; def : Pat < (local_store v2i32:$value, (DS64Bit4ByteAligned i32:$ptr, i8:$offset0, i8:$offset1)), - (DS_WRITE2_B32 (i1 0), $ptr, (EXTRACT_SUBREG $value, sub0), - (EXTRACT_SUBREG $value, sub1), $offset0, $offset1, - (S_MOV_B32 -1)) + (DS_WRITE2_B32 $ptr, (EXTRACT_SUBREG $value, sub0), + (EXTRACT_SUBREG $value, sub1), $offset0, 
$offset1, + (i1 0), (S_MOV_B32 -1)) >; class DSAtomicRetPat<DS inst, ValueType vt, PatFrag frag> : Pat < (frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$value), - (inst (i1 0), $ptr, $value, (as_i16imm $offset), (S_MOV_B32 -1)) + (inst $ptr, $value, (as_i16imm $offset), (i1 0), (S_MOV_B32 -1)) >; // Special case of DSAtomicRetPat for add / sub 1 -> inc / dec @@ -2842,13 +2896,13 @@ class DSAtomicRetPat<DS inst, ValueType vt, PatFrag frag> : Pat < class DSAtomicIncRetPat<DS inst, ValueType vt, Instruction LoadImm, PatFrag frag> : Pat < (frag (DS1Addr1Offset i32:$ptr, i32:$offset), (vt 1)), - (inst (i1 0), $ptr, (LoadImm (vt -1)), (as_i16imm $offset), (S_MOV_B32 -1)) + (inst $ptr, (LoadImm (vt -1)), (as_i16imm $offset), (i1 0), (S_MOV_B32 -1)) >; class DSAtomicCmpXChg <DS inst, ValueType vt, PatFrag frag> : Pat < (frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$cmp, vt:$swap), - (inst (i1 0), $ptr, $cmp, $swap, (as_i16imm $offset), (S_MOV_B32 -1)) + (inst $ptr, $cmp, $swap, (as_i16imm $offset), (i1 0), (S_MOV_B32 -1)) >; @@ -2898,8 +2952,9 @@ def : DSAtomicCmpXChg<DS_CMPST_RTN_B64, i64, atomic_cmp_swap_64_local>; multiclass MUBUFLoad_Pattern <MUBUF Instr_ADDR64, ValueType vt, PatFrag constant_ld> { def : Pat < - (vt (constant_ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset))), - (Instr_ADDR64 $srsrc, $vaddr, $soffset, $offset) + (vt (constant_ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, + i16:$offset, i1:$glc, i1:$slc, i1:$tfe))), + (Instr_ADDR64 $vaddr, $srsrc, $soffset, $offset, $glc, $slc, $tfe) >; } @@ -2916,7 +2971,7 @@ defm : MUBUFLoad_Pattern <BUFFER_LOAD_DWORDX4_ADDR64, v4i32, constant_load>; class MUBUFScratchLoadPat <MUBUF Instr, ValueType vt, PatFrag ld> : Pat < (vt (ld (MUBUFScratch v4i32:$srsrc, i32:$vaddr, i32:$soffset, u16imm:$offset))), - (Instr $srsrc, $vaddr, $soffset, $offset, 0, 0, 0) + (Instr $vaddr, $srsrc, $soffset, $offset, 0, 0, 0) >; def : MUBUFScratchLoadPat <BUFFER_LOAD_SBYTE_OFFEN, i32, sextloadi8_private>; @@ -2935,7 +2990,7 @@ multiclass MUBUF_Load_Dword <ValueType vt, MUBUF offset, MUBUF offen, MUBUF idxe (vt (int_SI_buffer_load_dword v4i32:$rsrc, (i32 imm), i32:$soffset, imm:$offset, 0, 0, imm:$glc, imm:$slc, imm:$tfe)), - (offset $rsrc, (as_i16imm $offset), $soffset, (as_i1imm $glc), + (offset $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $glc), (as_i1imm $slc), (as_i1imm $tfe)) >; @@ -2943,7 +2998,7 @@ multiclass MUBUF_Load_Dword <ValueType vt, MUBUF offset, MUBUF offen, MUBUF idxe (vt (int_SI_buffer_load_dword v4i32:$rsrc, i32:$vaddr, i32:$soffset, imm:$offset, 1, 0, imm:$glc, imm:$slc, imm:$tfe)), - (offen $rsrc, $vaddr, $soffset, (as_i16imm $offset), (as_i1imm $glc), (as_i1imm $slc), + (offen $vaddr, $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $glc), (as_i1imm $slc), (as_i1imm $tfe)) >; @@ -2951,7 +3006,7 @@ multiclass MUBUF_Load_Dword <ValueType vt, MUBUF offset, MUBUF offen, MUBUF idxe (vt (int_SI_buffer_load_dword v4i32:$rsrc, i32:$vaddr, i32:$soffset, imm:$offset, 0, 1, imm:$glc, imm:$slc, imm:$tfe)), - (idxen $rsrc, $vaddr, (as_i16imm $offset), $soffset, (as_i1imm $glc), + (idxen $vaddr, $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $glc), (as_i1imm $slc), (as_i1imm $tfe)) >; @@ -2959,7 +3014,7 @@ multiclass MUBUF_Load_Dword <ValueType vt, MUBUF offset, MUBUF offen, MUBUF idxe (vt (int_SI_buffer_load_dword v4i32:$rsrc, v2i32:$vaddr, i32:$soffset, imm:$offset, 1, 1, imm:$glc, imm:$slc, imm:$tfe)), - (bothen $rsrc, $vaddr, $soffset, (as_i16imm $offset), (as_i1imm $glc), (as_i1imm $slc), + (bothen $vaddr, 
$rsrc, $soffset, (as_i16imm $offset), (as_i1imm $glc), (as_i1imm $slc), (as_i1imm $tfe)) >; } @@ -2974,7 +3029,7 @@ defm : MUBUF_Load_Dword <v4i32, BUFFER_LOAD_DWORDX4_OFFSET, BUFFER_LOAD_DWORDX4_ class MUBUFScratchStorePat <MUBUF Instr, ValueType vt, PatFrag st> : Pat < (st vt:$value, (MUBUFScratch v4i32:$srsrc, i32:$vaddr, i32:$soffset, u16imm:$offset)), - (Instr $value, $srsrc, $vaddr, $soffset, $offset, 0, 0, 0) + (Instr $value, $vaddr, $srsrc, $soffset, $offset, 0, 0, 0) >; def : MUBUFScratchStorePat <BUFFER_STORE_BYTE_OFFEN, i32, truncstorei8_private>; @@ -3104,26 +3159,26 @@ multiclass SI_INDIRECT_Pattern <ValueType vt, ValueType eltvt, SI_INDIRECT_DST I // 1. Extract with offset def : Pat< - (vector_extract vt:$vec, (add i32:$idx, imm:$off)), - (eltvt (SI_INDIRECT_SRC (IMPLICIT_DEF), $vec, $idx, imm:$off)) + (eltvt (vector_extract vt:$vec, (add i32:$idx, imm:$off))), + (SI_INDIRECT_SRC $vec, $idx, imm:$off) >; // 2. Extract without offset def : Pat< - (vector_extract vt:$vec, i32:$idx), - (eltvt (SI_INDIRECT_SRC (IMPLICIT_DEF), $vec, $idx, 0)) + (eltvt (vector_extract vt:$vec, i32:$idx)), + (SI_INDIRECT_SRC $vec, $idx, 0) >; // 3. Insert with offset def : Pat< (vector_insert vt:$vec, eltvt:$val, (add i32:$idx, imm:$off)), - (IndDst (IMPLICIT_DEF), $vec, $idx, imm:$off, $val) + (IndDst $vec, $idx, imm:$off, $val) >; // 4. Insert without offset def : Pat< (vector_insert vt:$vec, eltvt:$val, i32:$idx), - (IndDst (IMPLICIT_DEF), $vec, $idx, 0, $val) + (IndDst $vec, $idx, 0, $val) >; } @@ -3269,6 +3324,89 @@ def : Pat < (V_CNDMASK_B32_e64 $src0, $src1, $src2) >; +multiclass BFMPatterns <ValueType vt, InstSI BFM, InstSI MOV> { + def : Pat < + (vt (shl (vt (add (vt (shl 1, vt:$a)), -1)), vt:$b)), + (BFM $a, $b) + >; + + def : Pat < + (vt (add (vt (shl 1, vt:$a)), -1)), + (BFM $a, (MOV 0)) + >; +} + +defm : BFMPatterns <i32, S_BFM_B32, S_MOV_B32>; +// FIXME: defm : BFMPatterns <i64, S_BFM_B64, S_MOV_B64>; + +def : BFEPattern <V_BFE_U32, S_MOV_B32>; + +//===----------------------------------------------------------------------===// +// Fract Patterns +//===----------------------------------------------------------------------===// + +let Predicates = [isSI] in { + +// V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) is +// used instead. However, SI doesn't have V_FLOOR_F64, so the most efficient +// way to implement it is using V_FRACT_F64. +// The workaround for the V_FRACT bug is: +// fract(x) = isnan(x) ? 
x : min(V_FRACT(x), 0.99999999999999999) + +// Convert (x + (-floor(x)) to fract(x) +def : Pat < + (f64 (fadd (f64 (VOP3Mods f64:$x, i32:$mods)), + (f64 (fneg (f64 (ffloor (f64 (VOP3Mods f64:$x, i32:$mods)))))))), + (V_CNDMASK_B64_PSEUDO + $x, + (V_MIN_F64 + SRCMODS.NONE, + (V_FRACT_F64_e64 $mods, $x, DSTCLAMP.NONE, DSTOMOD.NONE), + SRCMODS.NONE, + (V_MOV_B64_PSEUDO 0x3fefffffffffffff), + DSTCLAMP.NONE, DSTOMOD.NONE), + (V_CMP_CLASS_F64_e64 SRCMODS.NONE, $x, 3/*NaN*/)) +>; + +// Convert floor(x) to (x - fract(x)) +def : Pat < + (f64 (ffloor (f64 (VOP3Mods f64:$x, i32:$mods)))), + (V_ADD_F64 + $mods, + $x, + SRCMODS.NEG, + (V_CNDMASK_B64_PSEUDO + $x, + (V_MIN_F64 + SRCMODS.NONE, + (V_FRACT_F64_e64 $mods, $x, DSTCLAMP.NONE, DSTOMOD.NONE), + SRCMODS.NONE, + (V_MOV_B64_PSEUDO 0x3fefffffffffffff), + DSTCLAMP.NONE, DSTOMOD.NONE), + (V_CMP_CLASS_F64_e64 SRCMODS.NONE, $x, 3/*NaN*/)), + DSTCLAMP.NONE, DSTOMOD.NONE) +>; + +} // End Predicates = [isSI] + +let Predicates = [isCI] in { + +// Convert (x - floor(x)) to fract(x) +def : Pat < + (f32 (fsub (f32 (VOP3Mods f32:$x, i32:$mods)), + (f32 (ffloor (f32 (VOP3Mods f32:$x, i32:$mods)))))), + (V_FRACT_F32_e64 $mods, $x, DSTCLAMP.NONE, DSTOMOD.NONE) +>; + +// Convert (x + (-floor(x))) to fract(x) +def : Pat < + (f64 (fadd (f64 (VOP3Mods f64:$x, i32:$mods)), + (f64 (fneg (f64 (ffloor (f64 (VOP3Mods f64:$x, i32:$mods)))))))), + (V_FRACT_F64_e64 $mods, $x, DSTCLAMP.NONE, DSTOMOD.NONE) +>; + +} // End Predicates = [isCI] + //============================================================================// // Miscellaneous Optimization Patterns //============================================================================// diff --git a/lib/Target/R600/SILoadStoreOptimizer.cpp b/lib/Target/R600/SILoadStoreOptimizer.cpp index 46630d0..a927ad8 100644 --- a/lib/Target/R600/SILoadStoreOptimizer.cpp +++ b/lib/Target/R600/SILoadStoreOptimizer.cpp @@ -45,6 +45,7 @@ #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetMachine.h" using namespace llvm; @@ -249,10 +250,10 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair( DebugLoc DL = I->getDebugLoc(); MachineInstrBuilder Read2 = BuildMI(*MBB, I, DL, Read2Desc, DestReg) - .addImm(0) // gds .addOperand(*AddrReg) // addr .addImm(NewOffset0) // offset0 .addImm(NewOffset1) // offset1 + .addImm(0) // gds .addOperand(*M0Reg) // M0 .addMemOperand(*I->memoperands_begin()) .addMemOperand(*Paired->memoperands_begin()); @@ -332,12 +333,12 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair( MachineInstrBuilder Write2 = BuildMI(*MBB, I, DL, Write2Desc) - .addImm(0) // gds .addOperand(*Addr) // addr .addOperand(*Data0) // data0 .addOperand(*Data1) // data1 .addImm(NewOffset0) // offset0 .addImm(NewOffset1) // offset1 + .addImm(0) // gds .addOperand(*M0Reg) // m0 .addMemOperand(*I->memoperands_begin()) .addMemOperand(*Paired->memoperands_begin()); diff --git a/lib/Target/R600/SIRegisterInfo.cpp b/lib/Target/R600/SIRegisterInfo.cpp index 9224e14..13a8974 100644 --- a/lib/Target/R600/SIRegisterInfo.cpp +++ b/lib/Target/R600/SIRegisterInfo.cpp @@ -14,7 +14,6 @@ #include "SIRegisterInfo.h" -#include "AMDGPUSubtarget.h" #include "SIInstrInfo.h" #include "SIMachineFunctionInfo.h" #include "llvm/CodeGen/MachineFrameInfo.h" @@ -25,9 +24,7 @@ using namespace llvm; -SIRegisterInfo::SIRegisterInfo(const AMDGPUSubtarget &st) -: AMDGPURegisterInfo(st) - { } 
+SIRegisterInfo::SIRegisterInfo() : AMDGPURegisterInfo() {} BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { BitVector Reserved(getNumRegs()); @@ -47,14 +44,34 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { Reserved.set(AMDGPU::VGPR255); Reserved.set(AMDGPU::VGPR254); + // Tonga and Iceland can only allocate a fixed number of SGPRs due + // to a hw bug. + if (MF.getSubtarget<AMDGPUSubtarget>().hasSGPRInitBug()) { + unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs(); + // Reserve some SGPRs for FLAT_SCRATCH and VCC (4 SGPRs). + // Assume XNACK_MASK is unused. + unsigned Limit = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG - 4; + + for (unsigned i = Limit; i < NumSGPRs; ++i) { + unsigned Reg = AMDGPU::SGPR_32RegClass.getRegister(i); + MCRegAliasIterator R = MCRegAliasIterator(Reg, this, true); + + for (; R.isValid(); ++R) + Reserved.set(*R); + } + } + return Reserved; } -unsigned SIRegisterInfo::getRegPressureSetLimit(unsigned Idx) const { +unsigned SIRegisterInfo::getRegPressureSetLimit(const MachineFunction &MF, + unsigned Idx) const { + const AMDGPUSubtarget &STI = MF.getSubtarget<AMDGPUSubtarget>(); // FIXME: We should adjust the max number of waves based on LDS size. - unsigned SGPRLimit = getNumSGPRsAllowed(ST.getMaxWavesPerCU()); - unsigned VGPRLimit = getNumVGPRsAllowed(ST.getMaxWavesPerCU()); + unsigned SGPRLimit = getNumSGPRsAllowed(STI.getGeneration(), + STI.getMaxWavesPerCU()); + unsigned VGPRLimit = getNumVGPRsAllowed(STI.getMaxWavesPerCU()); for (regclass_iterator I = regclass_begin(), E = regclass_end(); I != E; ++I) { @@ -125,9 +142,10 @@ void SIRegisterInfo::buildScratchLoadStore(MachineBasicBlock::iterator MI, int64_t Offset, RegScavenger *RS) const { - const SIInstrInfo *TII = static_cast<const SIInstrInfo*>(ST.getInstrInfo()); MachineBasicBlock *MBB = MI->getParent(); const MachineFunction *MF = MI->getParent()->getParent(); + const SIInstrInfo *TII = + static_cast<const SIInstrInfo *>(MF->getSubtarget().getInstrInfo()); LLVMContext &Ctx = MF->getFunction()->getContext(); DebugLoc DL = MI->getDebugLoc(); bool IsLoad = TII->get(LoadStoreOp).mayLoad(); @@ -162,8 +180,8 @@ void SIRegisterInfo::buildScratchLoadStore(MachineBasicBlock::iterator MI, BuildMI(*MBB, MI, DL, TII->get(LoadStoreOp)) .addReg(SubReg, getDefRegState(IsLoad)) .addReg(ScratchRsrcReg, getKillRegState(IsKill)) - .addImm(Offset) .addReg(SOffset) + .addImm(Offset) .addImm(0) // glc .addImm(0) // slc .addImm(0) // tfe @@ -178,7 +196,8 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, MachineBasicBlock *MBB = MI->getParent(); SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); MachineFrameInfo *FrameInfo = MF->getFrameInfo(); - const SIInstrInfo *TII = static_cast<const SIInstrInfo*>(ST.getInstrInfo()); + const SIInstrInfo *TII = + static_cast<const SIInstrInfo *>(MF->getSubtarget().getInstrInfo()); DebugLoc DL = MI->getDebugLoc(); MachineOperand &FIOp = MI->getOperand(FIOperandNum); @@ -249,7 +268,22 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, .addReg(SubReg); } } - TII->insertNOPs(MI, 3); + + // TODO: only do this when it is needed + switch (MF->getSubtarget<AMDGPUSubtarget>().getGeneration()) { + case AMDGPUSubtarget::SOUTHERN_ISLANDS: + // "VALU writes SGPR" -> "SMRD reads that SGPR" needs "S_NOP 3" on SI + TII->insertNOPs(MI, 3); + break; + case AMDGPUSubtarget::SEA_ISLANDS: + break; + default: // VOLCANIC_ISLANDS and later + // "VALU writes SGPR -> VMEM reads that 
SGPR" needs "S_NOP 4" on VI + // and later. This also applies to VALUs which write VCC, but we're + // unlikely to see VMEM use VCC. + TII->insertNOPs(MI, 4); + } + MI->eraseFromParent(); break; } @@ -494,14 +528,24 @@ unsigned SIRegisterInfo::getNumVGPRsAllowed(unsigned WaveCount) const { } } -unsigned SIRegisterInfo::getNumSGPRsAllowed(unsigned WaveCount) const { - switch(WaveCount) { - case 10: return 48; - case 9: return 56; - case 8: return 64; - case 7: return 72; - case 6: return 80; - case 5: return 96; - default: return 103; +unsigned SIRegisterInfo::getNumSGPRsAllowed(AMDGPUSubtarget::Generation gen, + unsigned WaveCount) const { + if (gen >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { + switch (WaveCount) { + case 10: return 80; + case 9: return 80; + case 8: return 96; + default: return 102; + } + } else { + switch(WaveCount) { + case 10: return 48; + case 9: return 56; + case 8: return 64; + case 7: return 72; + case 6: return 80; + case 5: return 96; + default: return 103; + } } } diff --git a/lib/Target/R600/SIRegisterInfo.h b/lib/Target/R600/SIRegisterInfo.h index d908ffd..bfdb67c 100644 --- a/lib/Target/R600/SIRegisterInfo.h +++ b/lib/Target/R600/SIRegisterInfo.h @@ -17,17 +17,19 @@ #define LLVM_LIB_TARGET_R600_SIREGISTERINFO_H #include "AMDGPURegisterInfo.h" +#include "AMDGPUSubtarget.h" #include "llvm/Support/Debug.h" namespace llvm { struct SIRegisterInfo : public AMDGPURegisterInfo { - SIRegisterInfo(const AMDGPUSubtarget &st); + SIRegisterInfo(); BitVector getReservedRegs(const MachineFunction &MF) const override; - unsigned getRegPressureSetLimit(unsigned Idx) const override; + unsigned getRegPressureSetLimit(const MachineFunction &MF, + unsigned Idx) const override; bool requiresRegisterScavenging(const MachineFunction &Fn) const override; @@ -111,7 +113,8 @@ struct SIRegisterInfo : public AMDGPURegisterInfo { /// \brief Give the maximum number of SGPRs that can be used by \p WaveCount /// concurrent waves. 
- unsigned getNumSGPRsAllowed(unsigned WaveCount) const;
+ unsigned getNumSGPRsAllowed(AMDGPUSubtarget::Generation gen,
+                             unsigned WaveCount) const;

 unsigned findUnusedRegister(const MachineRegisterInfo &MRI,
                             const TargetRegisterClass *RC) const;
diff --git a/lib/Target/R600/SIRegisterInfo.td b/lib/Target/R600/SIRegisterInfo.td
index 8b25e95..7bb5dc2 100644
--- a/lib/Target/R600/SIRegisterInfo.td
+++ b/lib/Target/R600/SIRegisterInfo.td
@@ -256,10 +256,3 @@ def VSrc_64 : RegImmOperand<VS_64>;

 def VCSrc_32 : RegInlineOperand<VS_32>;
 def VCSrc_64 : RegInlineOperand<VS_64>;
-
-//===----------------------------------------------------------------------===//
-// SGPR and VGPR register classes
-//===----------------------------------------------------------------------===//
-
-def VSrc_128 : RegisterClass<"AMDGPU", [v4i32, v4f32], 128,
-  (add VReg_128, SReg_128)>;
diff --git a/lib/Target/R600/SIShrinkInstructions.cpp b/lib/Target/R600/SIShrinkInstructions.cpp
index 97bbd78..51e72cd 100644
--- a/lib/Target/R600/SIShrinkInstructions.cpp
+++ b/lib/Target/R600/SIShrinkInstructions.cpp
@@ -18,9 +18,10 @@
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/IR/Constants.h"
-#include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Function.h"
+#include "llvm/IR/LLVMContext.h"
 #include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetMachine.h"
 #define DEBUG_TYPE "si-shrink-instructions"
@@ -88,6 +89,11 @@ static bool canShrink(MachineInstr &MI, const SIInstrInfo *TII,
 const MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
 // Can't shrink instruction with three operands.
+ // FIXME: v_cndmask_b32 has 3 operands and is shrinkable, but we need to add
+ // a special case for it. It can only be shrunk if the third operand
+ // is vcc. We should handle this the same way we handle vopc, by adding
+ // a register allocation hint pre-regalloc and then doing the shrinking
+ // post-regalloc.
 if (Src2)
   return false;
diff --git a/lib/Target/README.txt b/lib/Target/README.txt
index 0fa56e6..282d923 100644
--- a/lib/Target/README.txt
+++ b/lib/Target/README.txt
@@ -1268,7 +1268,8 @@ int foo (void) {
 ..
 else if (strchr ("<>", *intel_parser.op_string)
-Those should be turned into a switch.
+Those should be turned into a switch. SimplifyLibCalls only gets the second
+case.
 //===---------------------------------------------------------------------===//
@@ -1843,44 +1844,6 @@ we remove checking in code like
 //===---------------------------------------------------------------------===//
-This code (from Benchmarks/Dhrystone/dry.c):
-
-define i32 @Func1(i32, i32) nounwind readnone optsize ssp {
-entry:
-  %sext = shl i32 %0, 24
-  %conv = ashr i32 %sext, 24
-  %sext6 = shl i32 %1, 24
-  %conv4 = ashr i32 %sext6, 24
-  %cmp = icmp eq i32 %conv, %conv4
-  %. = select i1 %cmp, i32 10000, i32 0
-  ret i32 %.
-}
-
-Should be simplified into something like:
-
-define i32 @Func1(i32, i32) nounwind readnone optsize ssp {
-entry:
-  %sext = shl i32 %0, 24
-  %conv = and i32 %sext, 0xFF000000
-  %sext6 = shl i32 %1, 24
-  %conv4 = and i32 %sext6, 0xFF000000
-  %cmp = icmp eq i32 %conv, %conv4
-  %. = select i1 %cmp, i32 10000, i32 0
-  ret i32 %.
-}
-
-and then to:
-
-define i32 @Func1(i32, i32) nounwind readnone optsize ssp {
-entry:
-  %conv = and i32 %0, 0xFF
-  %conv4 = and i32 %1, 0xFF
-  %cmp = icmp eq i32 %conv, %conv4
-  %. = select i1 %cmp, i32 10000, i32 0
-  ret i32 %.
-} -//===---------------------------------------------------------------------===// - clang -O3 currently compiles this code int g(unsigned int a) { diff --git a/lib/Target/Sparc/MCTargetDesc/SparcMCCodeEmitter.cpp b/lib/Target/Sparc/MCTargetDesc/SparcMCCodeEmitter.cpp index 5128843..598856f 100644 --- a/lib/Target/Sparc/MCTargetDesc/SparcMCCodeEmitter.cpp +++ b/lib/Target/Sparc/MCTargetDesc/SparcMCCodeEmitter.cpp @@ -74,7 +74,6 @@ public: MCCodeEmitter *llvm::createSparcMCCodeEmitter(const MCInstrInfo &MCII, const MCRegisterInfo &MRI, - const MCSubtargetInfo &STI, MCContext &Ctx) { return new SparcMCCodeEmitter(Ctx); } diff --git a/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.h b/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.h index f72c6c4..3a6f508 100644 --- a/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.h +++ b/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.h @@ -62,8 +62,8 @@ private: const VariantKind Kind; const MCExpr *Expr; - explicit SparcMCExpr(VariantKind _Kind, const MCExpr *_Expr) - : Kind(_Kind), Expr(_Expr) {} + explicit SparcMCExpr(VariantKind Kind, const MCExpr *Expr) + : Kind(Kind), Expr(Expr) {} public: /// @name Construction diff --git a/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.cpp b/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.cpp index 3cc4314..630ed1b 100644 --- a/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.cpp +++ b/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.cpp @@ -122,25 +122,16 @@ static MCCodeGenInfo *createSparcV9MCCodeGenInfo(StringRef TT, Reloc::Model RM, return X; } -static MCStreamer *createMCStreamer(const Target &T, StringRef TT, - MCContext &Context, MCAsmBackend &MAB, - raw_ostream &OS, MCCodeEmitter *Emitter, - const MCSubtargetInfo &STI, bool RelaxAll) { - MCStreamer *S = createELFStreamer(Context, MAB, OS, Emitter, RelaxAll); - new SparcTargetELFStreamer(*S); - return S; +static MCTargetStreamer * +createObjectTargetStreamer(MCStreamer &S, const MCSubtargetInfo &STI) { + return new SparcTargetELFStreamer(S); } -static MCStreamer * -createMCAsmStreamer(MCContext &Ctx, formatted_raw_ostream &OS, - bool isVerboseAsm, bool useDwarfDirectory, - MCInstPrinter *InstPrint, MCCodeEmitter *CE, - MCAsmBackend *TAB, bool ShowInst) { - - MCStreamer *S = llvm::createAsmStreamer( - Ctx, OS, isVerboseAsm, useDwarfDirectory, InstPrint, CE, TAB, ShowInst); - new SparcTargetAsmStreamer(*S, OS); - return S; +static MCTargetStreamer *createTargetAsmStreamer(MCStreamer &S, + formatted_raw_ostream &OS, + MCInstPrinter *InstPrint, + bool isVerboseAsm) { + return new SparcTargetAsmStreamer(S, OS); } static MCInstPrinter *createSparcMCInstPrinter(const Target &T, @@ -157,54 +148,37 @@ extern "C" void LLVMInitializeSparcTargetMC() { RegisterMCAsmInfoFn X(TheSparcTarget, createSparcMCAsmInfo); RegisterMCAsmInfoFn Y(TheSparcV9Target, createSparcV9MCAsmInfo); + for (Target *T : {&TheSparcTarget, &TheSparcV9Target}) { + // Register the MC instruction info. + TargetRegistry::RegisterMCInstrInfo(*T, createSparcMCInstrInfo); + + // Register the MC register info. + TargetRegistry::RegisterMCRegInfo(*T, createSparcMCRegisterInfo); + + // Register the MC subtarget info. + TargetRegistry::RegisterMCSubtargetInfo(*T, createSparcMCSubtargetInfo); + + // Register the MC Code Emitter. + TargetRegistry::RegisterMCCodeEmitter(*T, createSparcMCCodeEmitter); + + // Register the asm backend. + TargetRegistry::RegisterMCAsmBackend(*T, createSparcAsmBackend); + + // Register the object target streamer. 
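+ // (The SparcTargetELFStreamer returned here wraps whatever object
+ // streamer the registry constructs, replacing the hand-rolled
+ // createMCStreamer hook removed above.)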
+ TargetRegistry::RegisterObjectTargetStreamer(*T, + createObjectTargetStreamer); + + // Register the asm streamer. + TargetRegistry::RegisterAsmTargetStreamer(*T, createTargetAsmStreamer); + + // Register the MCInstPrinter + TargetRegistry::RegisterMCInstPrinter(*T, createSparcMCInstPrinter); + } + // Register the MC codegen info. TargetRegistry::RegisterMCCodeGenInfo(TheSparcTarget, createSparcMCCodeGenInfo); TargetRegistry::RegisterMCCodeGenInfo(TheSparcV9Target, createSparcV9MCCodeGenInfo); - // Register the MC instruction info. - TargetRegistry::RegisterMCInstrInfo(TheSparcTarget, createSparcMCInstrInfo); - TargetRegistry::RegisterMCInstrInfo(TheSparcV9Target, createSparcMCInstrInfo); - - // Register the MC register info. - TargetRegistry::RegisterMCRegInfo(TheSparcTarget, createSparcMCRegisterInfo); - TargetRegistry::RegisterMCRegInfo(TheSparcV9Target, - createSparcMCRegisterInfo); - - // Register the MC subtarget info. - TargetRegistry::RegisterMCSubtargetInfo(TheSparcTarget, - createSparcMCSubtargetInfo); - TargetRegistry::RegisterMCSubtargetInfo(TheSparcV9Target, - createSparcMCSubtargetInfo); - - // Register the MC Code Emitter. - TargetRegistry::RegisterMCCodeEmitter(TheSparcTarget, - createSparcMCCodeEmitter); - TargetRegistry::RegisterMCCodeEmitter(TheSparcV9Target, - createSparcMCCodeEmitter); - - //Register the asm backend. - TargetRegistry::RegisterMCAsmBackend(TheSparcTarget, - createSparcAsmBackend); - TargetRegistry::RegisterMCAsmBackend(TheSparcV9Target, - createSparcAsmBackend); - - // Register the object streamer. - TargetRegistry::RegisterMCObjectStreamer(TheSparcTarget, - createMCStreamer); - TargetRegistry::RegisterMCObjectStreamer(TheSparcV9Target, - createMCStreamer); - - // Register the asm streamer. - TargetRegistry::RegisterAsmStreamer(TheSparcTarget, - createMCAsmStreamer); - TargetRegistry::RegisterAsmStreamer(TheSparcV9Target, - createMCAsmStreamer); - - // Register the MCInstPrinter - TargetRegistry::RegisterMCInstPrinter(TheSparcTarget, - createSparcMCInstPrinter); - TargetRegistry::RegisterMCInstPrinter(TheSparcV9Target, - createSparcMCInstPrinter); } diff --git a/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.h b/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.h index c31943d..d2ec991 100644 --- a/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.h +++ b/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.h @@ -33,7 +33,6 @@ extern Target TheSparcV9Target; MCCodeEmitter *createSparcMCCodeEmitter(const MCInstrInfo &MCII, const MCRegisterInfo &MRI, - const MCSubtargetInfo &STI, MCContext &Ctx); MCAsmBackend *createSparcAsmBackend(const Target &T, const MCRegisterInfo &MRI, diff --git a/lib/Target/Sparc/SparcISelDAGToDAG.cpp b/lib/Target/Sparc/SparcISelDAGToDAG.cpp index 9f03b04..1cf5ccf 100644 --- a/lib/Target/Sparc/SparcISelDAGToDAG.cpp +++ b/lib/Target/Sparc/SparcISelDAGToDAG.cpp @@ -50,7 +50,7 @@ public: /// SelectInlineAsmMemoryOperand - Implement addressing mode selection for /// inline asm expressions. bool SelectInlineAsmMemoryOperand(const SDValue &Op, - char ConstraintCode, + unsigned ConstraintID, std::vector<SDValue> &OutOps) override; const char *getPassName() const override { @@ -195,12 +195,13 @@ SDNode *SparcDAGToDAGISel::Select(SDNode *N) { /// inline asm expressions. 
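/// By convention a true result means the constraint could not be handled
/// (note the default case below); on success the selected address operands
/// are appended to \p OutOps and false is returned.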
bool SparcDAGToDAGISel::SelectInlineAsmMemoryOperand(const SDValue &Op, - char ConstraintCode, + unsigned ConstraintID, std::vector<SDValue> &OutOps) { SDValue Op0, Op1; - switch (ConstraintCode) { + switch (ConstraintID) { default: return true; - case 'm': // memory + case InlineAsm::Constraint_i: + case InlineAsm::Constraint_m: // memory if (!SelectADDRrr(Op, Op0, Op1)) SelectADDRri(Op, Op0, Op1); break; diff --git a/lib/Target/Sparc/SparcISelLowering.cpp b/lib/Target/Sparc/SparcISelLowering.cpp index 6774977..c8b0570 100644 --- a/lib/Target/Sparc/SparcISelLowering.cpp +++ b/lib/Target/Sparc/SparcISelLowering.cpp @@ -915,9 +915,10 @@ SparcTargetLowering::LowerCall_32(TargetLowering::CallLoweringInfo &CLI, // Add a register mask operand representing the call-preserved registers. const SparcRegisterInfo *TRI = Subtarget->getRegisterInfo(); - const uint32_t *Mask = ((hasReturnsTwice) - ? TRI->getRTCallPreservedMask(CallConv) - : TRI->getCallPreservedMask(CallConv)); + const uint32_t *Mask = + ((hasReturnsTwice) + ? TRI->getRTCallPreservedMask(CallConv) + : TRI->getCallPreservedMask(DAG.getMachineFunction(), CallConv)); assert(Mask && "Missing call preserved mask for calling convention"); Ops.push_back(DAG.getRegisterMask(Mask)); @@ -1229,7 +1230,8 @@ SparcTargetLowering::LowerCall_64(TargetLowering::CallLoweringInfo &CLI, const SparcRegisterInfo *TRI = Subtarget->getRegisterInfo(); const uint32_t *Mask = ((hasReturnsTwice) ? TRI->getRTCallPreservedMask(CLI.CallConv) - : TRI->getCallPreservedMask(CLI.CallConv)); + : TRI->getCallPreservedMask(DAG.getMachineFunction(), + CLI.CallConv)); assert(Mask && "Missing call preserved mask for calling convention"); Ops.push_back(DAG.getRegisterMask(Mask)); @@ -1904,8 +1906,8 @@ SDValue SparcTargetLowering::LowerGlobalTLSAddress(SDValue Op, Ops.push_back(Callee); Ops.push_back(Symbol); Ops.push_back(DAG.getRegister(SP::O0, PtrVT)); - const uint32_t *Mask = - Subtarget->getRegisterInfo()->getCallPreservedMask(CallingConv::C); + const uint32_t *Mask = Subtarget->getRegisterInfo()->getCallPreservedMask( + DAG.getMachineFunction(), CallingConv::C); assert(Mask && "Missing call preserved mask for calling convention"); Ops.push_back(DAG.getRegisterMask(Mask)); Ops.push_back(InFlag); diff --git a/lib/Target/Sparc/SparcInstrInfo.cpp b/lib/Target/Sparc/SparcInstrInfo.cpp index 8b2e6bc..4b70f16 100644 --- a/lib/Target/Sparc/SparcInstrInfo.cpp +++ b/lib/Target/Sparc/SparcInstrInfo.cpp @@ -33,9 +33,8 @@ using namespace llvm; void SparcInstrInfo::anchor() {} SparcInstrInfo::SparcInstrInfo(SparcSubtarget &ST) - : SparcGenInstrInfo(SP::ADJCALLSTACKDOWN, SP::ADJCALLSTACKUP), - RI(ST), Subtarget(ST) { -} + : SparcGenInstrInfo(SP::ADJCALLSTACKDOWN, SP::ADJCALLSTACKUP), RI(), + Subtarget(ST) {} /// isLoadFromStackSlot - If the specified machine instruction is a direct /// load from a stack slot, return the virtual or physical register number of diff --git a/lib/Target/Sparc/SparcInstrInfo.h b/lib/Target/Sparc/SparcInstrInfo.h index fe93ed7..6e08418 100644 --- a/lib/Target/Sparc/SparcInstrInfo.h +++ b/lib/Target/Sparc/SparcInstrInfo.h @@ -22,6 +22,8 @@ namespace llvm { +class SparcSubtarget; + /// SPII - This namespace holds all of the target specific flags that /// instruction info tracks. 
/// diff --git a/lib/Target/Sparc/SparcRegisterInfo.cpp b/lib/Target/Sparc/SparcRegisterInfo.cpp index 3cca98f..9667bc0 100644 --- a/lib/Target/Sparc/SparcRegisterInfo.cpp +++ b/lib/Target/Sparc/SparcRegisterInfo.cpp @@ -34,17 +34,16 @@ static cl::opt<bool> ReserveAppRegisters("sparc-reserve-app-registers", cl::Hidden, cl::init(false), cl::desc("Reserve application registers (%g2-%g4)")); -SparcRegisterInfo::SparcRegisterInfo(SparcSubtarget &st) - : SparcGenRegisterInfo(SP::O7), Subtarget(st) { -} +SparcRegisterInfo::SparcRegisterInfo() : SparcGenRegisterInfo(SP::O7) {} const MCPhysReg* SparcRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { return CSR_SaveList; } -const uint32_t* -SparcRegisterInfo::getCallPreservedMask(CallingConv::ID CC) const { +const uint32_t * +SparcRegisterInfo::getCallPreservedMask(const MachineFunction &MF, + CallingConv::ID CC) const { return CSR_RegMask; } @@ -55,6 +54,7 @@ SparcRegisterInfo::getRTCallPreservedMask(CallingConv::ID CC) const { BitVector SparcRegisterInfo::getReservedRegs(const MachineFunction &MF) const { BitVector Reserved(getNumRegs()); + const SparcSubtarget &Subtarget = MF.getSubtarget<SparcSubtarget>(); // FIXME: G1 reserved for now for large imm generation by frame code. Reserved.set(SP::G1); @@ -89,6 +89,7 @@ BitVector SparcRegisterInfo::getReservedRegs(const MachineFunction &MF) const { const TargetRegisterClass* SparcRegisterInfo::getPointerRegClass(const MachineFunction &MF, unsigned Kind) const { + const SparcSubtarget &Subtarget = MF.getSubtarget<SparcSubtarget>(); return Subtarget.is64Bit() ? &SP::I64RegsRegClass : &SP::IntRegsRegClass; } @@ -160,6 +161,7 @@ SparcRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, // Addressable stack objects are accessed using neg. offsets from %fp MachineFunction &MF = *MI.getParent()->getParent(); + const SparcSubtarget &Subtarget = MF.getSubtarget<SparcSubtarget>(); int64_t Offset = MF.getFrameInfo()->getObjectOffset(FrameIndex) + MI.getOperand(FIOperandNum + 1).getImm() + Subtarget.getStackPointerBias(); @@ -174,7 +176,7 @@ SparcRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, if (!Subtarget.isV9() || !Subtarget.hasHardQuad()) { if (MI.getOpcode() == SP::STQFri) { - const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); + const TargetInstrInfo &TII = *Subtarget.getInstrInfo(); unsigned SrcReg = MI.getOperand(2).getReg(); unsigned SrcEvenReg = getSubReg(SrcReg, SP::sub_even64); unsigned SrcOddReg = getSubReg(SrcReg, SP::sub_odd64); @@ -186,7 +188,7 @@ SparcRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, MI.getOperand(2).setReg(SrcOddReg); Offset += 8; } else if (MI.getOpcode() == SP::LDQFri) { - const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); + const TargetInstrInfo &TII = *Subtarget.getInstrInfo(); unsigned DestReg = MI.getOperand(0).getReg(); unsigned DestEvenReg = getSubReg(DestReg, SP::sub_even64); unsigned DestOddReg = getSubReg(DestReg, SP::sub_odd64); diff --git a/lib/Target/Sparc/SparcRegisterInfo.h b/lib/Target/Sparc/SparcRegisterInfo.h index 63567b0..764a894 100644 --- a/lib/Target/Sparc/SparcRegisterInfo.h +++ b/lib/Target/Sparc/SparcRegisterInfo.h @@ -20,20 +20,13 @@ #include "SparcGenRegisterInfo.inc" namespace llvm { - -class SparcSubtarget; -class TargetInstrInfo; -class Type; - struct SparcRegisterInfo : public SparcGenRegisterInfo { - SparcSubtarget &Subtarget; - - SparcRegisterInfo(SparcSubtarget &st); + SparcRegisterInfo(); /// Code Generation virtual methods... 
- const MCPhysReg * - getCalleeSavedRegs(const MachineFunction *MF =nullptr) const override; - const uint32_t* getCallPreservedMask(CallingConv::ID CC) const override; + const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF) const override; + const uint32_t *getCallPreservedMask(const MachineFunction &MF, + CallingConv::ID CC) const override; const uint32_t* getRTCallPreservedMask(CallingConv::ID CC) const; diff --git a/lib/Target/Sparc/SparcTargetMachine.cpp b/lib/Target/Sparc/SparcTargetMachine.cpp index 1c423dc..6979a17 100644 --- a/lib/Target/Sparc/SparcTargetMachine.cpp +++ b/lib/Target/Sparc/SparcTargetMachine.cpp @@ -56,12 +56,11 @@ SparcTargetMachine::SparcTargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS, const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, - CodeGenOpt::Level OL, - bool is64bit) - : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL), - TLOF(make_unique<SparcELFTargetObjectFile>()), - DL(computeDataLayout(is64bit)), - Subtarget(TT, CPU, FS, *this, is64bit) { + CodeGenOpt::Level OL, bool is64bit) + : LLVMTargetMachine(T, computeDataLayout(is64bit), TT, CPU, FS, Options, RM, + CM, OL), + TLOF(make_unique<SparcELFTargetObjectFile>()), + Subtarget(TT, CPU, FS, *this, is64bit) { initAsmInfo(); } diff --git a/lib/Target/Sparc/SparcTargetMachine.h b/lib/Target/Sparc/SparcTargetMachine.h index 4f93980..30a8ebf 100644 --- a/lib/Target/Sparc/SparcTargetMachine.h +++ b/lib/Target/Sparc/SparcTargetMachine.h @@ -22,7 +22,6 @@ namespace llvm { class SparcTargetMachine : public LLVMTargetMachine { std::unique_ptr<TargetLoweringObjectFile> TLOF; - const DataLayout DL; SparcSubtarget Subtarget; public: SparcTargetMachine(const Target &T, StringRef TT, @@ -31,8 +30,9 @@ public: CodeGenOpt::Level OL, bool is64bit); ~SparcTargetMachine() override; - const DataLayout *getDataLayout() const override { return &DL; } - const SparcSubtarget *getSubtargetImpl() const override { return &Subtarget; } + const SparcSubtarget *getSubtargetImpl(const Function &) const override { + return &Subtarget; + } // Pass Pipeline Configuration TargetPassConfig *createPassConfig(PassManagerBase &PM) override; diff --git a/lib/Target/SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp b/lib/Target/SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp index d9bb916..40dc48e 100644 --- a/lib/Target/SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp +++ b/lib/Target/SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp @@ -110,7 +110,6 @@ private: MCCodeEmitter *llvm::createSystemZMCCodeEmitter(const MCInstrInfo &MCII, const MCRegisterInfo &MRI, - const MCSubtargetInfo &MCSTI, MCContext &Ctx) { return new SystemZMCCodeEmitter(MCII, Ctx); } diff --git a/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp b/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp index 6e82b6d..ffd05a9 100644 --- a/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp +++ b/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp @@ -181,14 +181,6 @@ static MCInstPrinter *createSystemZMCInstPrinter(const Target &T, return new SystemZInstPrinter(MAI, MII, MRI); } -static MCStreamer * -createSystemZMCObjectStreamer(const Target &T, StringRef TT, MCContext &Ctx, - MCAsmBackend &MAB, raw_ostream &OS, - MCCodeEmitter *Emitter, - const MCSubtargetInfo &STI, bool RelaxAll) { - return createELFStreamer(Ctx, MAB, OS, Emitter, RelaxAll); -} - extern "C" void LLVMInitializeSystemZTargetMC() { // Register the MCAsmInfo. 
TargetRegistry::RegisterMCAsmInfo(TheSystemZTarget, @@ -221,8 +213,4 @@ extern "C" void LLVMInitializeSystemZTargetMC() { // Register the MCInstPrinter. TargetRegistry::RegisterMCInstPrinter(TheSystemZTarget, createSystemZMCInstPrinter); - - // Register the MCObjectStreamer; - TargetRegistry::RegisterMCObjectStreamer(TheSystemZTarget, - createSystemZMCObjectStreamer); } diff --git a/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.h b/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.h index 5eb6526..962c950 100644 --- a/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.h +++ b/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.h @@ -71,7 +71,6 @@ inline unsigned getRegAsGRH32(unsigned Reg) { MCCodeEmitter *createSystemZMCCodeEmitter(const MCInstrInfo &MCII, const MCRegisterInfo &MRI, - const MCSubtargetInfo &STI, MCContext &Ctx); MCAsmBackend *createSystemZMCAsmBackend(const Target &T, diff --git a/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp b/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp index b8b0db9..a52aa25 100644 --- a/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp +++ b/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp @@ -328,7 +328,7 @@ public: // Override SelectionDAGISel. SDNode *Select(SDNode *Node) override; - bool SelectInlineAsmMemoryOperand(const SDValue &Op, char ConstraintCode, + bool SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintID, std::vector<SDValue> &OutOps) override; // Include the pieces autogenerated from the target description. @@ -1129,18 +1129,29 @@ SDNode *SystemZDAGToDAGISel::Select(SDNode *Node) { bool SystemZDAGToDAGISel:: SelectInlineAsmMemoryOperand(const SDValue &Op, - char ConstraintCode, + unsigned ConstraintID, std::vector<SDValue> &OutOps) { - assert(ConstraintCode == 'm' && "Unexpected constraint code"); - // Accept addresses with short displacements, which are compatible - // with Q, R, S and T. But keep the index operand for future expansion. - SDValue Base, Disp, Index; - if (!selectBDXAddr(SystemZAddressingMode::FormBD, - SystemZAddressingMode::Disp12Only, - Op, Base, Disp, Index)) - return true; - OutOps.push_back(Base); - OutOps.push_back(Disp); - OutOps.push_back(Index); - return false; + switch(ConstraintID) { + default: + llvm_unreachable("Unexpected asm memory constraint"); + case InlineAsm::Constraint_i: + case InlineAsm::Constraint_m: + case InlineAsm::Constraint_Q: + case InlineAsm::Constraint_R: + case InlineAsm::Constraint_S: + case InlineAsm::Constraint_T: + // Accept addresses with short displacements, which are compatible + // with Q, R, S and T. But keep the index operand for future expansion. + SDValue Base, Disp, Index; + if (selectBDXAddr(SystemZAddressingMode::FormBD, + SystemZAddressingMode::Disp12Only, + Op, Base, Disp, Index)) { + OutOps.push_back(Base); + OutOps.push_back(Disp); + OutOps.push_back(Index); + return false; + } + break; + } + return true; } diff --git a/lib/Target/SystemZ/SystemZISelLowering.cpp b/lib/Target/SystemZ/SystemZISelLowering.cpp index e96398d..0ca8bcd 100644 --- a/lib/Target/SystemZ/SystemZISelLowering.cpp +++ b/lib/Target/SystemZ/SystemZISelLowering.cpp @@ -920,7 +920,7 @@ SystemZTargetLowering::LowerCall(CallLoweringInfo &CLI, // Add a register mask operand representing the call-preserved registers. 
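// (getCallPreservedMask now also receives the MachineFunction, so a target
// could in principle vary the mask per function; SystemZ still returns the
// same CSR_SystemZ_RegMask for every call, as the RegisterInfo hunk below
// shows.)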
const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo(); - const uint32_t *Mask = TRI->getCallPreservedMask(CallConv); + const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv); assert(Mask && "Missing call preserved mask for calling convention"); Ops.push_back(DAG.getRegisterMask(Mask)); @@ -1858,7 +1858,8 @@ SDValue SystemZTargetLowering::lowerTLSGetOffset(GlobalAddressSDNode *Node, // Add a register mask operand representing the call-preserved registers. const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo(); - const uint32_t *Mask = TRI->getCallPreservedMask(CallingConv::C); + const uint32_t *Mask = + TRI->getCallPreservedMask(DAG.getMachineFunction(), CallingConv::C); assert(Mask && "Missing call preserved mask for calling convention"); Ops.push_back(DAG.getRegisterMask(Mask)); diff --git a/lib/Target/SystemZ/SystemZISelLowering.h b/lib/Target/SystemZ/SystemZISelLowering.h index a2b10b0..23c62c9 100644 --- a/lib/Target/SystemZ/SystemZISelLowering.h +++ b/lib/Target/SystemZ/SystemZISelLowering.h @@ -233,6 +233,26 @@ public: std::string &Constraint, std::vector<SDValue> &Ops, SelectionDAG &DAG) const override; + + unsigned getInlineAsmMemConstraint( + const std::string &ConstraintCode) const override { + if (ConstraintCode.size() == 1) { + switch(ConstraintCode[0]) { + default: + break; + case 'Q': + return InlineAsm::Constraint_Q; + case 'R': + return InlineAsm::Constraint_R; + case 'S': + return InlineAsm::Constraint_S; + case 'T': + return InlineAsm::Constraint_T; + } + } + return TargetLowering::getInlineAsmMemConstraint(ConstraintCode); + } + MachineBasicBlock *EmitInstrWithCustomInserter(MachineInstr *MI, MachineBasicBlock *BB) const override; diff --git a/lib/Target/SystemZ/SystemZInstrInfo.cpp b/lib/Target/SystemZ/SystemZInstrInfo.cpp index 8488ec8..5128993 100644 --- a/lib/Target/SystemZ/SystemZInstrInfo.cpp +++ b/lib/Target/SystemZ/SystemZInstrInfo.cpp @@ -743,11 +743,10 @@ SystemZInstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, return nullptr; } -MachineInstr * -SystemZInstrInfo::foldMemoryOperandImpl(MachineFunction &MF, - MachineInstr *MI, - const SmallVectorImpl<unsigned> &Ops, - int FrameIndex) const { +MachineInstr *SystemZInstrInfo::foldMemoryOperandImpl(MachineFunction &MF, + MachineInstr *MI, + ArrayRef<unsigned> Ops, + int FrameIndex) const { const MachineFrameInfo *MFI = MF.getFrameInfo(); unsigned Size = MFI->getObjectSize(FrameIndex); unsigned Opcode = MI->getOpcode(); @@ -862,9 +861,9 @@ SystemZInstrInfo::foldMemoryOperandImpl(MachineFunction &MF, } MachineInstr * -SystemZInstrInfo::foldMemoryOperandImpl(MachineFunction &MF, MachineInstr* MI, - const SmallVectorImpl<unsigned> &Ops, - MachineInstr* LoadMI) const { +SystemZInstrInfo::foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI, + ArrayRef<unsigned> Ops, + MachineInstr *LoadMI) const { return nullptr; } diff --git a/lib/Target/SystemZ/SystemZInstrInfo.h b/lib/Target/SystemZ/SystemZInstrInfo.h index e711f89..b55810b 100644 --- a/lib/Target/SystemZ/SystemZInstrInfo.h +++ b/lib/Target/SystemZ/SystemZInstrInfo.h @@ -186,11 +186,11 @@ public: MachineBasicBlock::iterator &MBBI, LiveVariables *LV) const override; MachineInstr *foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI, - const SmallVectorImpl<unsigned> &Ops, + ArrayRef<unsigned> Ops, int FrameIndex) const override; - MachineInstr *foldMemoryOperandImpl(MachineFunction &MF, MachineInstr* MI, - const SmallVectorImpl<unsigned> &Ops, - MachineInstr* LoadMI) const override; + MachineInstr 
*foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI, + ArrayRef<unsigned> Ops, + MachineInstr *LoadMI) const override; bool expandPostRAPseudo(MachineBasicBlock::iterator MBBI) const override; bool ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const override; diff --git a/lib/Target/SystemZ/SystemZRegisterInfo.cpp b/lib/Target/SystemZ/SystemZRegisterInfo.cpp index 64f5eeb..7cabea9 100644 --- a/lib/Target/SystemZ/SystemZRegisterInfo.cpp +++ b/lib/Target/SystemZ/SystemZRegisterInfo.cpp @@ -28,7 +28,8 @@ SystemZRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { } const uint32_t * -SystemZRegisterInfo::getCallPreservedMask(CallingConv::ID CC) const { +SystemZRegisterInfo::getCallPreservedMask(const MachineFunction &MF, + CallingConv::ID CC) const { return CSR_SystemZ_RegMask; } diff --git a/lib/Target/SystemZ/SystemZRegisterInfo.h b/lib/Target/SystemZ/SystemZRegisterInfo.h index 212fe91..a0db5a9 100644 --- a/lib/Target/SystemZ/SystemZRegisterInfo.h +++ b/lib/Target/SystemZ/SystemZRegisterInfo.h @@ -43,9 +43,9 @@ public: bool trackLivenessAfterRegAlloc(const MachineFunction &MF) const override { return true; } - const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF = nullptr) const - override; - const uint32_t *getCallPreservedMask(CallingConv::ID CC) const override; + const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF) const override; + const uint32_t *getCallPreservedMask(const MachineFunction &MF, + CallingConv::ID CC) const override; BitVector getReservedRegs(const MachineFunction &MF) const override; void eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj, unsigned FIOperandNum, diff --git a/lib/Target/SystemZ/SystemZTargetMachine.cpp b/lib/Target/SystemZ/SystemZTargetMachine.cpp index 73198b1..86baccb 100644 --- a/lib/Target/SystemZ/SystemZTargetMachine.cpp +++ b/lib/Target/SystemZ/SystemZTargetMachine.cpp @@ -25,12 +25,12 @@ SystemZTargetMachine::SystemZTargetMachine(const Target &T, StringRef TT, const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL) - : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL), + // Make sure that global data has at least 16 bits of alignment by + // default, so that we can refer to it using LARL. We don't have any + // special requirements for stack variables though. + : LLVMTargetMachine(T, "E-m:e-i1:8:16-i8:8:16-i64:64-f128:64-a:8:16-n32:64", + TT, CPU, FS, Options, RM, CM, OL), TLOF(make_unique<TargetLoweringObjectFileELF>()), - // Make sure that global data has at least 16 bits of alignment by - // default, so that we can refer to it using LARL. We don't have any - // special requirements for stack variables though. - DL("E-m:e-i1:8:16-i8:8:16-i64:64-f128:64-a:8:16-n32:64"), Subtarget(TT, CPU, FS, *this) { initAsmInfo(); } diff --git a/lib/Target/SystemZ/SystemZTargetMachine.h b/lib/Target/SystemZ/SystemZTargetMachine.h index 52ccc5a..181b926 100644 --- a/lib/Target/SystemZ/SystemZTargetMachine.h +++ b/lib/Target/SystemZ/SystemZTargetMachine.h @@ -24,7 +24,6 @@ class TargetFrameLowering; class SystemZTargetMachine : public LLVMTargetMachine { std::unique_ptr<TargetLoweringObjectFile> TLOF; - const DataLayout DL; SystemZSubtarget Subtarget; public: @@ -34,9 +33,8 @@ public: CodeGenOpt::Level OL); ~SystemZTargetMachine() override; - // Override TargetMachine. 
- const DataLayout *getDataLayout() const override { return &DL; } - const SystemZSubtarget *getSubtargetImpl() const override { + const SystemZSubtarget *getSubtargetImpl() const { return &Subtarget; } + const SystemZSubtarget *getSubtargetImpl(const Function &) const override { return &Subtarget; } // Override LLVMTargetMachine diff --git a/lib/Target/Target.cpp b/lib/Target/Target.cpp index 5b7953d..1b74e8c 100644 --- a/lib/Target/Target.cpp +++ b/lib/Target/Target.cpp @@ -34,7 +34,6 @@ inline LLVMTargetLibraryInfoRef wrap(const TargetLibraryInfoImpl *P) { } void llvm::initializeTarget(PassRegistry &Registry) { - initializeDataLayoutPassPass(Registry); initializeTargetLibraryInfoWrapperPassPass(Registry); initializeTargetTransformInfoWrapperPassPass(Registry); } @@ -48,9 +47,6 @@ LLVMTargetDataRef LLVMCreateTargetData(const char *StringRep) { } void LLVMAddTargetData(LLVMTargetDataRef TD, LLVMPassManagerRef PM) { - // The DataLayoutPass must now be in sync with the module. Unfortunatelly we - // cannot enforce that from the C api. - unwrap(PM)->add(new DataLayoutPass()); } void LLVMAddTargetLibraryInfo(LLVMTargetLibraryInfoRef TLI, diff --git a/lib/Target/TargetLoweringObjectFile.cpp b/lib/Target/TargetLoweringObjectFile.cpp index faa6fbe..75100fb 100644 --- a/lib/Target/TargetLoweringObjectFile.cpp +++ b/lib/Target/TargetLoweringObjectFile.cpp @@ -343,3 +343,9 @@ const MCExpr *TargetLoweringObjectFile::getDebugThreadLocalSymbol(const MCSymbol // null return could mean 'no location' & we should just do that here. return MCSymbolRefExpr::Create(Sym, *Ctx); } + +void TargetLoweringObjectFile::getNameWithPrefix( + SmallVectorImpl<char> &OutName, const GlobalValue *GV, + bool CannotUsePrivateLabel, Mangler &Mang, const TargetMachine &TM) const { + Mang.getNameWithPrefix(OutName, GV, CannotUsePrivateLabel); +} diff --git a/lib/Target/TargetMachine.cpp b/lib/Target/TargetMachine.cpp index 307e93c..dd07f81 100644 --- a/lib/Target/TargetMachine.cpp +++ b/lib/Target/TargetMachine.cpp @@ -22,6 +22,7 @@ #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCCodeGenInfo.h" #include "llvm/MC/MCContext.h" +#include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCSectionMachO.h" #include "llvm/MC/MCTargetOptions.h" #include "llvm/MC/SectionKind.h" @@ -36,18 +37,20 @@ using namespace llvm; // TargetMachine Class // -TargetMachine::TargetMachine(const Target &T, +TargetMachine::TargetMachine(const Target &T, StringRef DataLayoutString, StringRef TT, StringRef CPU, StringRef FS, const TargetOptions &Options) - : TheTarget(T), TargetTriple(TT), TargetCPU(CPU), TargetFS(FS), - CodeGenInfo(nullptr), AsmInfo(nullptr), - RequireStructuredCFG(false), - Options(Options) { -} + : TheTarget(T), DL(DataLayoutString), TargetTriple(TT), TargetCPU(CPU), + TargetFS(FS), CodeGenInfo(nullptr), AsmInfo(nullptr), MRI(nullptr), + MII(nullptr), STI(nullptr), RequireStructuredCFG(false), + Options(Options) {} TargetMachine::~TargetMachine() { delete CodeGenInfo; delete AsmInfo; + delete MRI; + delete MII; + delete STI; } /// \brief Reset the target options based on the function's attributes. 
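With the DataLayout now constructed and owned by TargetMachine itself, a
backend conversion follows the same shape as the Sparc and SystemZ
constructors above: drop the per-target "const DataLayout DL" member and
hand the layout string to the base class. A minimal sketch of the pattern
(HypotheticalTargetMachine and its layout string are illustrative
assumptions, not part of this patch):

HypotheticalTargetMachine::HypotheticalTargetMachine(
    const Target &T, StringRef TT, StringRef CPU, StringRef FS,
    const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM,
    CodeGenOpt::Level OL)
    // The layout string is forwarded through LLVMTargetMachine to the
    // TargetMachine constructor shown above; there is no DL field left
    // to initialize.
    : LLVMTargetMachine(T, "e-m:e-i64:64-n32:64", TT, CPU, FS, Options, RM,
                        CM, OL),
      Subtarget(TT, CPU, FS, *this) {
  initAsmInfo();
}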
@@ -177,7 +180,7 @@ void TargetMachine::getNameWithPrefix(SmallVectorImpl<char> &Name, const TargetLoweringObjectFile *TLOF = getObjFileLowering(); const MCSection *TheSection = TLOF->SectionForGlobal(GV, GVKind, Mang, *this); bool CannotUsePrivateLabel = !canUsePrivateLabel(*AsmInfo, *TheSection); - Mang.getNameWithPrefix(Name, GV, CannotUsePrivateLabel); + TLOF->getNameWithPrefix(Name, GV, CannotUsePrivateLabel, Mang, *this); } MCSymbol *TargetMachine::getSymbol(const GlobalValue *GV, Mangler &Mang) const { diff --git a/lib/Target/TargetMachineC.cpp b/lib/Target/TargetMachineC.cpp index c7838a9..236cb1b 100644 --- a/lib/Target/TargetMachineC.cpp +++ b/lib/Target/TargetMachineC.cpp @@ -198,8 +198,7 @@ static LLVMBool LLVMTargetMachineEmit(LLVMTargetMachineRef T, LLVMModuleRef M, *ErrorMessage = strdup(error.c_str()); return true; } - Mod->setDataLayout(td); - pass.add(new DataLayoutPass()); + Mod->setDataLayout(*td); TargetMachine::CodeGenFileType ft; switch (codegen) { diff --git a/lib/Target/TargetSubtargetInfo.cpp b/lib/Target/TargetSubtargetInfo.cpp index 10597a8..b2bb59e 100644 --- a/lib/Target/TargetSubtargetInfo.cpp +++ b/lib/Target/TargetSubtargetInfo.cpp @@ -23,22 +23,6 @@ TargetSubtargetInfo::TargetSubtargetInfo() {} TargetSubtargetInfo::~TargetSubtargetInfo() {} -// Temporary option to compare overall performance change when moving from the -// SD scheduler to the MachineScheduler pass pipeline. This is convenient for -// benchmarking during the transition from SD to MI scheduling. Once armv7 makes -// the switch, it should go away. The normal way to enable/disable the -// MachineScheduling pass itself is by using -enable-misched. For targets that -// already use MI sched (via MySubTarget::enableMachineScheduler()) -// -misched-bench=false negates the subtarget hook. 
-static cl::opt<bool> BenchMachineSched("misched-bench", cl::Hidden, - cl::desc("Migrate from the target's default SD scheduler to MI scheduler")); - -bool TargetSubtargetInfo::useMachineScheduler() const { - if (BenchMachineSched.getNumOccurrences()) - return BenchMachineSched; - return enableMachineScheduler(); -} - bool TargetSubtargetInfo::enableAtomicExpand() const { return true; } @@ -47,6 +31,10 @@ bool TargetSubtargetInfo::enableMachineScheduler() const { return false; } +bool TargetSubtargetInfo::enableJoinGlobalCopies() const { + return enableMachineScheduler(); +} + bool TargetSubtargetInfo::enableRALocalReassignment( CodeGenOpt::Level OptLevel) const { return true; diff --git a/lib/Target/X86/Android.mk b/lib/Target/X86/Android.mk index 08646d0..7194dd3 100644 --- a/lib/Target/X86/Android.mk +++ b/lib/Target/X86/Android.mk @@ -1,8 +1,10 @@ LOCAL_PATH := $(call my-dir) x86_codegen_TBLGEN_TABLES := \ + X86GenAsmMatcher.inc \ X86GenAsmWriter.inc \ X86GenAsmWriter1.inc \ + X86GenDisassemblerTables.inc \ X86GenRegisterInfo.inc \ X86GenInstrInfo.inc \ X86GenDAGISel.inc \ diff --git a/lib/Target/X86/AsmParser/X86AsmParser.cpp b/lib/Target/X86/AsmParser/X86AsmParser.cpp index 0b6fb52..c24805a 100644 --- a/lib/Target/X86/AsmParser/X86AsmParser.cpp +++ b/lib/Target/X86/AsmParser/X86AsmParser.cpp @@ -11,6 +11,7 @@ #include "X86AsmInstrumentation.h" #include "X86AsmParserCommon.h" #include "X86Operand.h" +#include "X86ISelLowering.h" #include "llvm/ADT/APFloat.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallString.h" @@ -664,6 +665,7 @@ private: ParseIntelSegmentOverride(unsigned SegReg, SMLoc Start, unsigned Size); std::unique_ptr<X86Operand> ParseIntelMemOperand(int64_t ImmDisp, SMLoc StartLoc, unsigned Size); + std::unique_ptr<X86Operand> ParseRoundingModeOp(SMLoc Start, SMLoc End); bool ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End); std::unique_ptr<X86Operand> ParseIntelBracExpression(unsigned SegReg, SMLoc Start, @@ -1407,6 +1409,35 @@ X86AsmParser::ParseIntelSegmentOverride(unsigned SegReg, SMLoc Start, /*Scale=*/1, Start, End, Size, Identifier, Info); } +//ParseRoundingModeOp - Parse AVX-512 rounding mode operand +std::unique_ptr<X86Operand> +X86AsmParser::ParseRoundingModeOp(SMLoc Start, SMLoc End) { + MCAsmParser &Parser = getParser(); + const AsmToken &Tok = Parser.getTok(); + consumeToken(); // Eat "{" + if (Tok.getIdentifier().startswith("r")){ + int rndMode = StringSwitch<int>(Tok.getIdentifier()) + .Case("rn", X86::STATIC_ROUNDING::TO_NEAREST_INT) + .Case("rd", X86::STATIC_ROUNDING::TO_NEG_INF) + .Case("ru", X86::STATIC_ROUNDING::TO_POS_INF) + .Case("rz", X86::STATIC_ROUNDING::TO_ZERO) + .Default(-1); + if (-1 == rndMode) + return ErrorOperand(Tok.getLoc(), "Invalid rounding mode."); + Parser.Lex(); // Eat "r*" of r*-sae + if (!getLexer().is(AsmToken::Minus)) + return ErrorOperand(Tok.getLoc(), "Expected - at this point"); + Parser.Lex(); // Eat "-" + Parser.Lex(); // Eat the sae + if (!getLexer().is(AsmToken::RCurly)) + return ErrorOperand(Tok.getLoc(), "Expected } at this point"); + Parser.Lex(); // Eat "}" + const MCExpr *RndModeOp = + MCConstantExpr::Create(rndMode, Parser.getContext()); + return X86Operand::CreateImm(RndModeOp, Start, End); + } + return ErrorOperand(Tok.getLoc(), "unknown token in expression"); +} /// ParseIntelMemOperand - Parse intel style memory operand. 
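Aside on ParseRoundingModeOp above: it accepts the four AVX-512 static-rounding operands ({rn-sae}, {rd-sae}, {ru-sae}, {rz-sae}) and folds them into a constant immediate, so an operand such as "{rn-sae}" in, e.g., "vaddss xmm1, xmm2, xmm3, {rn-sae}" can be matched through the LCurly handling added further down. A standalone sketch of the suffix-to-immediate mapping it performs; the numeric values follow the MXCSR rounding-control encoding, which X86::STATIC_ROUNDING is assumed here to mirror:

  #include "llvm/ADT/StringRef.h"
  #include "llvm/ADT/StringSwitch.h"

  // Maps the "r*" piece of "{r*-sae}" to the immediate the parser emits;
  // -1 signals an invalid rounding mode.
  static int roundingModeImm(llvm::StringRef Suffix) {
    return llvm::StringSwitch<int>(Suffix)
        .Case("rn", 0) // to nearest (even)  - TO_NEAREST_INT
        .Case("rd", 1) // toward -infinity   - TO_NEG_INF
        .Case("ru", 2) // toward +infinity   - TO_POS_INF
        .Case("rz", 3) // toward zero        - TO_ZERO
        .Default(-1);
  }

The ParseIntelMemOperand definition introduced by the doc comment just above continues below.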
std::unique_ptr<X86Operand> X86AsmParser::ParseIntelMemOperand(int64_t ImmDisp, SMLoc Start, @@ -1656,6 +1687,11 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseIntelOperand() { return ParseIntelMemOperand(Imm, Start, Size); } + // rounding mode token + if (STI.getFeatureBits() & X86::FeatureAVX512 && + getLexer().is(AsmToken::LCurly)) + return ParseRoundingModeOp(Start, End); + // Register. unsigned RegNo = 0; if (!ParseRegister(RegNo, Start, End)) { @@ -1708,6 +1744,12 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseATTOperand() { return nullptr; return X86Operand::CreateImm(Val, Start, End); } + case AsmToken::LCurly:{ + SMLoc Start = Parser.getTok().getLoc(), End; + if (STI.getFeatureBits() & X86::FeatureAVX512) + return ParseRoundingModeOp(Start, End); + return ErrorOperand(Start, "unknown token in expression"); + } } } diff --git a/lib/Target/X86/AsmParser/X86Operand.h b/lib/Target/X86/AsmParser/X86Operand.h index d67e119..94dbedb 100644 --- a/lib/Target/X86/AsmParser/X86Operand.h +++ b/lib/Target/X86/AsmParser/X86Operand.h @@ -260,6 +260,9 @@ struct X86Operand : public MCParsedAsmOperand { return Kind == Memory && !getMemSegReg() && !getMemBaseReg() && !getMemIndexReg() && getMemScale() == 1; } + bool isAVX512RC() const{ + return isImm(); + } bool isAbsMem16() const { return isAbsMem() && Mem.ModeSize == 16; @@ -394,7 +397,10 @@ struct X86Operand : public MCParsedAsmOperand { RegNo = getGR32FromGR64(RegNo); Inst.addOperand(MCOperand::CreateReg(RegNo)); } - + void addAVX512RCOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + addExpr(Inst, getImm()); + } void addImmOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); addExpr(Inst, getImm()); diff --git a/lib/Target/X86/Disassembler/X86Disassembler.cpp b/lib/Target/X86/Disassembler/X86Disassembler.cpp index 99fb1ab..e8c5475 100644 --- a/lib/Target/X86/Disassembler/X86Disassembler.cpp +++ b/lib/Target/X86/Disassembler/X86Disassembler.cpp @@ -378,26 +378,28 @@ static void translateImmediate(MCInst &mcInst, uint64_t immediate, unsigned NewOpc; switch (mcInst.getOpcode()) { default: llvm_unreachable("unexpected opcode"); - case X86::VCMPPDrmi: NewOpc = X86::VCMPPDrmi_alt; break; - case X86::VCMPPDrri: NewOpc = X86::VCMPPDrri_alt; break; - case X86::VCMPPSrmi: NewOpc = X86::VCMPPSrmi_alt; break; - case X86::VCMPPSrri: NewOpc = X86::VCMPPSrri_alt; break; - case X86::VCMPSDrm: NewOpc = X86::VCMPSDrm_alt; break; - case X86::VCMPSDrr: NewOpc = X86::VCMPSDrr_alt; break; - case X86::VCMPSSrm: NewOpc = X86::VCMPSSrm_alt; break; - case X86::VCMPSSrr: NewOpc = X86::VCMPSSrr_alt; break; - case X86::VCMPPDYrmi: NewOpc = X86::VCMPPDYrmi_alt; break; - case X86::VCMPPDYrri: NewOpc = X86::VCMPPDYrri_alt; break; - case X86::VCMPPSYrmi: NewOpc = X86::VCMPPSYrmi_alt; break; - case X86::VCMPPSYrri: NewOpc = X86::VCMPPSYrri_alt; break; - case X86::VCMPPDZrmi: NewOpc = X86::VCMPPDZrmi_alt; break; - case X86::VCMPPDZrri: NewOpc = X86::VCMPPDZrri_alt; break; - case X86::VCMPPSZrmi: NewOpc = X86::VCMPPSZrmi_alt; break; - case X86::VCMPPSZrri: NewOpc = X86::VCMPPSZrri_alt; break; - case X86::VCMPSDZrm: NewOpc = X86::VCMPSDZrmi_alt; break; - case X86::VCMPSDZrr: NewOpc = X86::VCMPSDZrri_alt; break; - case X86::VCMPSSZrm: NewOpc = X86::VCMPSSZrmi_alt; break; - case X86::VCMPSSZrr: NewOpc = X86::VCMPSSZrri_alt; break; + case X86::VCMPPDrmi: NewOpc = X86::VCMPPDrmi_alt; break; + case X86::VCMPPDrri: NewOpc = X86::VCMPPDrri_alt; break; + case X86::VCMPPSrmi: NewOpc = 
X86::VCMPPSrmi_alt; break; + case X86::VCMPPSrri: NewOpc = X86::VCMPPSrri_alt; break; + case X86::VCMPSDrm: NewOpc = X86::VCMPSDrm_alt; break; + case X86::VCMPSDrr: NewOpc = X86::VCMPSDrr_alt; break; + case X86::VCMPSSrm: NewOpc = X86::VCMPSSrm_alt; break; + case X86::VCMPSSrr: NewOpc = X86::VCMPSSrr_alt; break; + case X86::VCMPPDYrmi: NewOpc = X86::VCMPPDYrmi_alt; break; + case X86::VCMPPDYrri: NewOpc = X86::VCMPPDYrri_alt; break; + case X86::VCMPPSYrmi: NewOpc = X86::VCMPPSYrmi_alt; break; + case X86::VCMPPSYrri: NewOpc = X86::VCMPPSYrri_alt; break; + case X86::VCMPPDZrmi: NewOpc = X86::VCMPPDZrmi_alt; break; + case X86::VCMPPDZrri: NewOpc = X86::VCMPPDZrri_alt; break; + case X86::VCMPPDZrrib: NewOpc = X86::VCMPPDZrrib_alt; break; + case X86::VCMPPSZrmi: NewOpc = X86::VCMPPSZrmi_alt; break; + case X86::VCMPPSZrri: NewOpc = X86::VCMPPSZrri_alt; break; + case X86::VCMPPSZrrib: NewOpc = X86::VCMPPSZrrib_alt; break; + case X86::VCMPSDZrm: NewOpc = X86::VCMPSDZrmi_alt; break; + case X86::VCMPSDZrr: NewOpc = X86::VCMPSDZrri_alt; break; + case X86::VCMPSSZrm: NewOpc = X86::VCMPSSZrmi_alt; break; + case X86::VCMPSSZrr: NewOpc = X86::VCMPSSZrri_alt; break; } // Switch opcode to the one that doesn't get special printing. mcInst.setOpcode(NewOpc); diff --git a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp index 619a0d4..7c9e012 100644 --- a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp +++ b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp @@ -310,11 +310,8 @@ static bool isPrefixAtLocation(struct InternalInstruction* insn, uint8_t prefix, uint64_t location) { - if (insn->prefixPresent[prefix] == 1 && - insn->prefixLocations[prefix] == location) - return true; - else - return false; + return insn->prefixPresent[prefix] == 1 && + insn->prefixLocations[prefix] == location; } /* @@ -1458,6 +1455,8 @@ static int readModRM(struct InternalInstruction* insn) { case TYPE_VK1: \ case TYPE_VK8: \ case TYPE_VK16: \ + if (index > 7) \ + *valid = 0; \ return prefix##_K0 + index; \ case TYPE_MM64: \ return prefix##_MM0 + (index & 0x7); \ diff --git a/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h b/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h index 70c6042..9e65050 100644 --- a/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h +++ b/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h @@ -485,18 +485,6 @@ struct OperandSpecifier { uint8_t type; }; -// Indicates where the opcode modifier (if any) is to be found. Extended -// opcodes with AddRegFrm have the opcode modifier in the ModR/M byte. -#define MODIFIER_TYPES \ - ENUM_ENTRY(MODIFIER_NONE) - -#define ENUM_ENTRY(n) n, -enum ModifierType { - MODIFIER_TYPES - MODIFIER_max -}; -#undef ENUM_ENTRY - static const unsigned X86_MAX_OPERANDS = 6; /// Decoding mode for the Intel disassembler. 16-bit, 32-bit, and 64-bit mode diff --git a/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp index 719b761..a400d46 100644 --- a/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp +++ b/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp @@ -76,8 +76,8 @@ class X86AsmBackend : public MCAsmBackend { bool HasNopl; const uint64_t MaxNopLength; public: - X86AsmBackend(const Target &T, StringRef _CPU) - : MCAsmBackend(), CPU(_CPU), MaxNopLength(_CPU == "slm" ? 7 : 15) { + X86AsmBackend(const Target &T, StringRef CPU) + : MCAsmBackend(), CPU(CPU), MaxNopLength(CPU == "slm" ? 
7 : 15) { HasNopl = CPU != "generic" && CPU != "i386" && CPU != "i486" && CPU != "i586" && CPU != "pentium" && CPU != "pentium-mmx" && CPU != "i686" && CPU != "k6" && CPU != "k6-2" && CPU != "k6-3" && @@ -351,8 +351,8 @@ namespace { class ELFX86AsmBackend : public X86AsmBackend { public: uint8_t OSABI; - ELFX86AsmBackend(const Target &T, uint8_t _OSABI, StringRef CPU) - : X86AsmBackend(T, CPU), OSABI(_OSABI) {} + ELFX86AsmBackend(const Target &T, uint8_t OSABI, StringRef CPU) + : X86AsmBackend(T, CPU), OSABI(OSABI) {} }; class ELFX86_32AsmBackend : public ELFX86AsmBackend { diff --git a/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp b/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp index e8b0b4c..76a9d2b 100644 --- a/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp +++ b/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp @@ -38,231 +38,214 @@ X86ELFObjectWriter::X86ELFObjectWriter(bool IsELF64, uint8_t OSABI, X86ELFObjectWriter::~X86ELFObjectWriter() {} -unsigned X86ELFObjectWriter::GetRelocType(const MCValue &Target, - const MCFixup &Fixup, - bool IsPCRel) const { - // determine the type of the relocation +enum X86_64RelType { RT64_64, RT64_32, RT64_32S, RT64_16, RT64_8 }; - MCSymbolRefExpr::VariantKind Modifier = Target.getAccessVariant(); - unsigned Type; - if (getEMachine() == ELF::EM_X86_64) { - if (IsPCRel) { - switch ((unsigned)Fixup.getKind()) { - default: llvm_unreachable("invalid fixup kind!"); - - case FK_Data_8: Type = ELF::R_X86_64_PC64; break; - case FK_Data_4: Type = ELF::R_X86_64_PC32; break; - case FK_Data_2: Type = ELF::R_X86_64_PC16; break; - case FK_Data_1: Type = ELF::R_X86_64_PC8; break; +static X86_64RelType getType64(unsigned Kind, + MCSymbolRefExpr::VariantKind &Modifier, + bool &IsPCRel) { + switch (Kind) { + default: + llvm_unreachable("Unimplemented"); + case X86::reloc_global_offset_table8: + Modifier = MCSymbolRefExpr::VK_GOT; + IsPCRel = true; + return RT64_64; + case FK_Data_8: + return RT64_64; + case X86::reloc_signed_4byte: + if (Modifier == MCSymbolRefExpr::VK_None && !IsPCRel) + return RT64_32S; + return RT64_32; + case X86::reloc_global_offset_table: + Modifier = MCSymbolRefExpr::VK_GOT; + IsPCRel = true; + return RT64_32; + case FK_Data_4: + case FK_PCRel_4: + case X86::reloc_riprel_4byte: + case X86::reloc_riprel_4byte_movq_load: + return RT64_32; + case FK_Data_2: + return RT64_16; + case FK_PCRel_1: + case FK_Data_1: + return RT64_8; + } +} - case FK_PCRel_8: - assert(Modifier == MCSymbolRefExpr::VK_None); - Type = ELF::R_X86_64_PC64; - break; - case X86::reloc_signed_4byte: - case X86::reloc_riprel_4byte_movq_load: - case X86::reloc_riprel_4byte: - case FK_PCRel_4: - switch (Modifier) { - default: - llvm_unreachable("Unimplemented"); - case MCSymbolRefExpr::VK_None: - Type = ELF::R_X86_64_PC32; - break; - case MCSymbolRefExpr::VK_PLT: - Type = ELF::R_X86_64_PLT32; - break; - case MCSymbolRefExpr::VK_GOTPCREL: - Type = ELF::R_X86_64_GOTPCREL; - break; - case MCSymbolRefExpr::VK_GOTTPOFF: - Type = ELF::R_X86_64_GOTTPOFF; - break; - case MCSymbolRefExpr::VK_TLSGD: - Type = ELF::R_X86_64_TLSGD; - break; - case MCSymbolRefExpr::VK_TLSLD: - Type = ELF::R_X86_64_TLSLD; - break; - } - break; - case FK_PCRel_2: - assert(Modifier == MCSymbolRefExpr::VK_None); - Type = ELF::R_X86_64_PC16; - break; - case FK_PCRel_1: - assert(Modifier == MCSymbolRefExpr::VK_None); - Type = ELF::R_X86_64_PC8; - break; - } - } else { - switch ((unsigned)Fixup.getKind()) { - default: llvm_unreachable("invalid fixup kind!"); - case X86::reloc_global_offset_table8: - Type 
= ELF::R_X86_64_GOTPC64; - break; - case X86::reloc_global_offset_table: - Type = ELF::R_X86_64_GOTPC32; - break; - case FK_Data_8: - switch (Modifier) { - default: - llvm_unreachable("Unimplemented"); - case MCSymbolRefExpr::VK_None: - Type = ELF::R_X86_64_64; - break; - case MCSymbolRefExpr::VK_GOT: - Type = ELF::R_X86_64_GOT64; - break; - case MCSymbolRefExpr::VK_GOTOFF: - Type = ELF::R_X86_64_GOTOFF64; - break; - case MCSymbolRefExpr::VK_TPOFF: - Type = ELF::R_X86_64_TPOFF64; - break; - case MCSymbolRefExpr::VK_DTPOFF: - Type = ELF::R_X86_64_DTPOFF64; - break; - } - break; - case X86::reloc_signed_4byte: - switch (Modifier) { - default: - llvm_unreachable("Unimplemented"); - case MCSymbolRefExpr::VK_None: - Type = ELF::R_X86_64_32S; - break; - case MCSymbolRefExpr::VK_GOT: - Type = ELF::R_X86_64_GOT32; - break; - case MCSymbolRefExpr::VK_GOTPCREL: - Type = ELF::R_X86_64_GOTPCREL; - break; - case MCSymbolRefExpr::VK_TPOFF: - Type = ELF::R_X86_64_TPOFF32; - break; - case MCSymbolRefExpr::VK_DTPOFF: - Type = ELF::R_X86_64_DTPOFF32; - break; - } - break; - case FK_Data_4: - Type = ELF::R_X86_64_32; - break; - case FK_Data_2: Type = ELF::R_X86_64_16; break; - case FK_PCRel_1: - case FK_Data_1: Type = ELF::R_X86_64_8; break; - } +static unsigned getRelocType64(MCSymbolRefExpr::VariantKind Modifier, + X86_64RelType Type, bool IsPCRel) { + switch (Modifier) { + default: + llvm_unreachable("Unimplemented"); + case MCSymbolRefExpr::VK_None: + switch (Type) { + case RT64_64: + return IsPCRel ? ELF::R_X86_64_PC64 : ELF::R_X86_64_64; + case RT64_32: + return IsPCRel ? ELF::R_X86_64_PC32 : ELF::R_X86_64_32; + case RT64_32S: + return ELF::R_X86_64_32S; + case RT64_16: + return IsPCRel ? ELF::R_X86_64_PC16 : ELF::R_X86_64_16; + case RT64_8: + return IsPCRel ? ELF::R_X86_64_PC8 : ELF::R_X86_64_8; } - } else if (getEMachine() == ELF::EM_386) { - if (IsPCRel) { - switch ((unsigned)Fixup.getKind()) { - default: llvm_unreachable("invalid fixup kind!"); - - case X86::reloc_global_offset_table: - Type = ELF::R_386_GOTPC; - break; - - case FK_PCRel_1: - case FK_Data_1: - switch (Modifier) { - default: - llvm_unreachable("Unimplemented"); - case MCSymbolRefExpr::VK_None: - Type = ELF::R_386_PC8; - break; - } - break; - - case FK_PCRel_2: - case FK_Data_2: - switch (Modifier) { - default: - llvm_unreachable("Unimplemented"); - case MCSymbolRefExpr::VK_None: - Type = ELF::R_386_PC16; - break; - } - break; + case MCSymbolRefExpr::VK_GOT: + switch (Type) { + case RT64_64: + return IsPCRel ? ELF::R_X86_64_GOTPC64 : ELF::R_X86_64_GOT64; + case RT64_32: + return IsPCRel ? 
ELF::R_X86_64_GOTPC32 : ELF::R_X86_64_GOT32; + case RT64_32S: + case RT64_16: + case RT64_8: + llvm_unreachable("Unimplemented"); + } + case MCSymbolRefExpr::VK_GOTOFF: + assert(Type == RT64_64); + assert(!IsPCRel); + return ELF::R_X86_64_GOTOFF64; + case MCSymbolRefExpr::VK_TPOFF: + assert(!IsPCRel); + switch (Type) { + case RT64_64: + return ELF::R_X86_64_TPOFF64; + case RT64_32: + return ELF::R_X86_64_TPOFF32; + case RT64_32S: + case RT64_16: + case RT64_8: + llvm_unreachable("Unimplemented"); + } + case MCSymbolRefExpr::VK_DTPOFF: + assert(!IsPCRel); + switch (Type) { + case RT64_64: + return ELF::R_X86_64_DTPOFF64; + case RT64_32: + return ELF::R_X86_64_DTPOFF32; + case RT64_32S: + case RT64_16: + case RT64_8: + llvm_unreachable("Unimplemented"); + } + case MCSymbolRefExpr::VK_SIZE: + assert(!IsPCRel); + switch (Type) { + case RT64_64: + return ELF::R_X86_64_SIZE64; + case RT64_32: + return ELF::R_X86_64_SIZE32; + case RT64_32S: + case RT64_16: + case RT64_8: + llvm_unreachable("Unimplemented"); + } + case MCSymbolRefExpr::VK_TLSGD: + assert(Type == RT64_32); + return ELF::R_X86_64_TLSGD; + case MCSymbolRefExpr::VK_GOTTPOFF: + assert(Type == RT64_32); + return ELF::R_X86_64_GOTTPOFF; + case MCSymbolRefExpr::VK_TLSLD: + assert(Type == RT64_32); + return ELF::R_X86_64_TLSLD; + case MCSymbolRefExpr::VK_PLT: + assert(Type == RT64_32); + return ELF::R_X86_64_PLT32; + case MCSymbolRefExpr::VK_GOTPCREL: + assert(Type == RT64_32); + return ELF::R_X86_64_GOTPCREL; + } +} - case X86::reloc_signed_4byte: - case FK_PCRel_4: - case FK_Data_4: - switch (Modifier) { - default: - llvm_unreachable("Unimplemented"); - case MCSymbolRefExpr::VK_None: - Type = ELF::R_386_PC32; - break; - case MCSymbolRefExpr::VK_PLT: - Type = ELF::R_386_PLT32; - break; - } - break; - } - } else { - switch ((unsigned)Fixup.getKind()) { - default: llvm_unreachable("invalid fixup kind!"); +enum X86_32RelType { RT32_32, RT32_16, RT32_8 }; - case X86::reloc_global_offset_table: - Type = ELF::R_386_GOTPC; - break; +static X86_32RelType getType32(X86_64RelType T) { + switch (T) { + case RT64_64: + llvm_unreachable("Unimplemented"); + case RT64_32: + case RT64_32S: + return RT32_32; + case RT64_16: + return RT32_16; + case RT64_8: + return RT32_8; + } + llvm_unreachable("unexpected relocation type!"); +} - // FIXME: Should we avoid selecting reloc_signed_4byte in 32 bit mode - // instead? 
- case X86::reloc_signed_4byte: - case FK_PCRel_4: - case FK_Data_4: - switch (Modifier) { - default: - llvm_unreachable("Unimplemented"); - case MCSymbolRefExpr::VK_None: - Type = ELF::R_386_32; - break; - case MCSymbolRefExpr::VK_GOT: - Type = ELF::R_386_GOT32; - break; - case MCSymbolRefExpr::VK_PLT: - Type = ELF::R_386_PLT32; - break; - case MCSymbolRefExpr::VK_GOTOFF: - Type = ELF::R_386_GOTOFF; - break; - case MCSymbolRefExpr::VK_TLSGD: - Type = ELF::R_386_TLS_GD; - break; - case MCSymbolRefExpr::VK_TPOFF: - Type = ELF::R_386_TLS_LE_32; - break; - case MCSymbolRefExpr::VK_INDNTPOFF: - Type = ELF::R_386_TLS_IE; - break; - case MCSymbolRefExpr::VK_NTPOFF: - Type = ELF::R_386_TLS_LE; - break; - case MCSymbolRefExpr::VK_GOTNTPOFF: - Type = ELF::R_386_TLS_GOTIE; - break; - case MCSymbolRefExpr::VK_TLSLDM: - Type = ELF::R_386_TLS_LDM; - break; - case MCSymbolRefExpr::VK_DTPOFF: - Type = ELF::R_386_TLS_LDO_32; - break; - case MCSymbolRefExpr::VK_GOTTPOFF: - Type = ELF::R_386_TLS_IE_32; - break; - } - break; - case FK_Data_2: Type = ELF::R_386_16; break; - case FK_PCRel_1: - case FK_Data_1: Type = ELF::R_386_8; break; - } +static unsigned getRelocType32(MCSymbolRefExpr::VariantKind Modifier, + X86_32RelType Type, bool IsPCRel) { + switch (Modifier) { + default: + llvm_unreachable("Unimplemented"); + case MCSymbolRefExpr::VK_None: + switch (Type) { + case RT32_32: + return IsPCRel ? ELF::R_386_PC32 : ELF::R_386_32; + case RT32_16: + return IsPCRel ? ELF::R_386_PC16 : ELF::R_386_16; + case RT32_8: + return IsPCRel ? ELF::R_386_PC8 : ELF::R_386_8; } - } else - llvm_unreachable("Unsupported ELF machine type."); + case MCSymbolRefExpr::VK_GOT: + assert(Type == RT32_32); + return IsPCRel ? ELF::R_386_GOTPC : ELF::R_386_GOT32; + case MCSymbolRefExpr::VK_GOTOFF: + assert(Type == RT32_32); + assert(!IsPCRel); + return ELF::R_386_GOTOFF; + case MCSymbolRefExpr::VK_TPOFF: + assert(Type == RT32_32); + assert(!IsPCRel); + return ELF::R_386_TLS_LE_32; + case MCSymbolRefExpr::VK_DTPOFF: + assert(Type == RT32_32); + assert(!IsPCRel); + return ELF::R_386_TLS_LDO_32; + case MCSymbolRefExpr::VK_TLSGD: + assert(Type == RT32_32); + assert(!IsPCRel); + return ELF::R_386_TLS_GD; + case MCSymbolRefExpr::VK_GOTTPOFF: + assert(Type == RT32_32); + assert(!IsPCRel); + return ELF::R_386_TLS_IE_32; + case MCSymbolRefExpr::VK_PLT: + assert(Type == RT32_32); + return ELF::R_386_PLT32; + case MCSymbolRefExpr::VK_INDNTPOFF: + assert(Type == RT32_32); + assert(!IsPCRel); + return ELF::R_386_TLS_IE; + case MCSymbolRefExpr::VK_NTPOFF: + assert(Type == RT32_32); + assert(!IsPCRel); + return ELF::R_386_TLS_LE; + case MCSymbolRefExpr::VK_GOTNTPOFF: + assert(Type == RT32_32); + assert(!IsPCRel); + return ELF::R_386_TLS_GOTIE; + case MCSymbolRefExpr::VK_TLSLDM: + assert(Type == RT32_32); + assert(!IsPCRel); + return ELF::R_386_TLS_LDM; + } +} + +unsigned X86ELFObjectWriter::GetRelocType(const MCValue &Target, + const MCFixup &Fixup, + bool IsPCRel) const { + MCSymbolRefExpr::VariantKind Modifier = Target.getAccessVariant(); + X86_64RelType Type = getType64(Fixup.getKind(), Modifier, IsPCRel); + if (getEMachine() == ELF::EM_X86_64) + return getRelocType64(Modifier, Type, IsPCRel); - return Type; + assert(getEMachine() == ELF::EM_386 && "Unsupported ELF machine type."); + return getRelocType32(Modifier, getType32(Type), IsPCRel); } MCObjectWriter *llvm::createX86ELFObjectWriter(raw_ostream &OS, diff --git a/lib/Target/X86/MCTargetDesc/X86ELFRelocationInfo.cpp b/lib/Target/X86/MCTargetDesc/X86ELFRelocationInfo.cpp index 
b679316..10b83f4 100644 --- a/lib/Target/X86/MCTargetDesc/X86ELFRelocationInfo.cpp +++ b/lib/Target/X86/MCTargetDesc/X86ELFRelocationInfo.cpp @@ -36,7 +36,7 @@ public: MCSymbol *Sym = Ctx.GetOrCreateSymbol(SymName); // FIXME: check that the value is actually the same. - if (Sym->isVariable() == false) + if (!Sym->isVariable()) Sym->setVariableValue(MCConstantExpr::Create(SymAddr, Ctx)); const MCExpr *Expr = nullptr; diff --git a/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp b/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp index 3ad8ab1..9b98a3e 100644 --- a/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp +++ b/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp @@ -168,10 +168,8 @@ public: } // end anonymous namespace - MCCodeEmitter *llvm::createX86MCCodeEmitter(const MCInstrInfo &MCII, const MCRegisterInfo &MRI, - const MCSubtargetInfo &STI, MCContext &Ctx) { return new X86MCCodeEmitter(MCII, Ctx); } diff --git a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp index 0e7b4e5..0946326 100644 --- a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp +++ b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp @@ -55,143 +55,6 @@ std::string X86_MC::ParseX86Triple(StringRef TT) { return FS; } -/// GetCpuIDAndInfo - Execute the specified cpuid and return the 4 values in the -/// specified arguments. If we can't run cpuid on the host, return true. -bool X86_MC::GetCpuIDAndInfo(unsigned value, unsigned *rEAX, - unsigned *rEBX, unsigned *rECX, unsigned *rEDX) { -#if defined(__x86_64__) || defined(_M_AMD64) || defined (_M_X64) - #if defined(__GNUC__) - // gcc doesn't know cpuid would clobber ebx/rbx. Preseve it manually. - asm ("movq\t%%rbx, %%rsi\n\t" - "cpuid\n\t" - "xchgq\t%%rbx, %%rsi\n\t" - : "=a" (*rEAX), - "=S" (*rEBX), - "=c" (*rECX), - "=d" (*rEDX) - : "a" (value)); - return false; - #elif defined(_MSC_VER) - int registers[4]; - __cpuid(registers, value); - *rEAX = registers[0]; - *rEBX = registers[1]; - *rECX = registers[2]; - *rEDX = registers[3]; - return false; - #else - return true; - #endif -#elif defined(i386) || defined(__i386__) || defined(__x86__) || defined(_M_IX86) - #if defined(__GNUC__) - asm ("movl\t%%ebx, %%esi\n\t" - "cpuid\n\t" - "xchgl\t%%ebx, %%esi\n\t" - : "=a" (*rEAX), - "=S" (*rEBX), - "=c" (*rECX), - "=d" (*rEDX) - : "a" (value)); - return false; - #elif defined(_MSC_VER) - __asm { - mov eax,value - cpuid - mov esi,rEAX - mov dword ptr [esi],eax - mov esi,rEBX - mov dword ptr [esi],ebx - mov esi,rECX - mov dword ptr [esi],ecx - mov esi,rEDX - mov dword ptr [esi],edx - } - return false; - #else - return true; - #endif -#else - return true; -#endif -} - -/// GetCpuIDAndInfoEx - Execute the specified cpuid with subleaf and return the -/// 4 values in the specified arguments. If we can't run cpuid on the host, -/// return true. -bool X86_MC::GetCpuIDAndInfoEx(unsigned value, unsigned subleaf, unsigned *rEAX, - unsigned *rEBX, unsigned *rECX, unsigned *rEDX) { -#if defined(__x86_64__) || defined(_M_AMD64) || defined (_M_X64) - #if defined(__GNUC__) - // gcc desn't know cpuid would clobber ebx/rbx. Preseve it manually. 
- asm ("movq\t%%rbx, %%rsi\n\t" - "cpuid\n\t" - "xchgq\t%%rbx, %%rsi\n\t" - : "=a" (*rEAX), - "=S" (*rEBX), - "=c" (*rECX), - "=d" (*rEDX) - : "a" (value), - "c" (subleaf)); - return false; - #elif defined(_MSC_VER) - int registers[4]; - __cpuidex(registers, value, subleaf); - *rEAX = registers[0]; - *rEBX = registers[1]; - *rECX = registers[2]; - *rEDX = registers[3]; - return false; - #else - return true; - #endif -#elif defined(i386) || defined(__i386__) || defined(__x86__) || defined(_M_IX86) - #if defined(__GNUC__) - asm ("movl\t%%ebx, %%esi\n\t" - "cpuid\n\t" - "xchgl\t%%ebx, %%esi\n\t" - : "=a" (*rEAX), - "=S" (*rEBX), - "=c" (*rECX), - "=d" (*rEDX) - : "a" (value), - "c" (subleaf)); - return false; - #elif defined(_MSC_VER) - __asm { - mov eax,value - mov ecx,subleaf - cpuid - mov esi,rEAX - mov dword ptr [esi],eax - mov esi,rEBX - mov dword ptr [esi],ebx - mov esi,rECX - mov dword ptr [esi],ecx - mov esi,rEDX - mov dword ptr [esi],edx - } - return false; - #else - return true; - #endif -#else - return true; -#endif -} - -void X86_MC::DetectFamilyModel(unsigned EAX, unsigned &Family, - unsigned &Model) { - Family = (EAX >> 8) & 0xf; // Bits 8 - 11 - Model = (EAX >> 4) & 0xf; // Bits 4 - 7 - if (Family == 6 || Family == 0xf) { - if (Family == 0xf) - // Examine extended family ID if family ID is F. - Family += (EAX >> 20) & 0xff; // Bits 20 - 27 - // Examine extended model ID if family ID is 6 or F. - Model += ((EAX >> 16) & 0xf) << 4; // Bits 16 - 19 - } -} - unsigned X86_MC::getDwarfRegFlavour(Triple TT, bool isEH) { if (TT.getArch() == Triple::x86_64) return DWARFFlavour::X86_64; @@ -344,24 +207,6 @@ static MCCodeGenInfo *createX86MCCodeGenInfo(StringRef TT, Reloc::Model RM, return X; } -static MCStreamer *createMCStreamer(const Target &T, StringRef TT, - MCContext &Ctx, MCAsmBackend &MAB, - raw_ostream &_OS, MCCodeEmitter *_Emitter, - const MCSubtargetInfo &STI, bool RelaxAll) { - Triple TheTriple(TT); - - switch (TheTriple.getObjectFormat()) { - default: llvm_unreachable("unsupported object format"); - case Triple::MachO: - return createMachOStreamer(Ctx, MAB, _OS, _Emitter, RelaxAll); - case Triple::COFF: - assert(TheTriple.isOSWindows() && "only Windows COFF is supported"); - return createX86WinCOFFStreamer(Ctx, MAB, _Emitter, _OS, RelaxAll); - case Triple::ELF: - return createELFStreamer(Ctx, MAB, _OS, _Emitter, RelaxAll); - } -} - static MCInstPrinter *createX86MCInstPrinter(const Target &T, unsigned SyntaxVariant, const MCAsmInfo &MAI, @@ -392,61 +237,42 @@ static MCInstrAnalysis *createX86MCInstrAnalysis(const MCInstrInfo *Info) { // Force static initialization. extern "C" void LLVMInitializeX86TargetMC() { - // Register the MC asm info. - RegisterMCAsmInfoFn A(TheX86_32Target, createX86MCAsmInfo); - RegisterMCAsmInfoFn B(TheX86_64Target, createX86MCAsmInfo); - - // Register the MC codegen info. - RegisterMCCodeGenInfoFn C(TheX86_32Target, createX86MCCodeGenInfo); - RegisterMCCodeGenInfoFn D(TheX86_64Target, createX86MCCodeGenInfo); - - // Register the MC instruction info. - TargetRegistry::RegisterMCInstrInfo(TheX86_32Target, createX86MCInstrInfo); - TargetRegistry::RegisterMCInstrInfo(TheX86_64Target, createX86MCInstrInfo); - - // Register the MC register info. - TargetRegistry::RegisterMCRegInfo(TheX86_32Target, createX86MCRegisterInfo); - TargetRegistry::RegisterMCRegInfo(TheX86_64Target, createX86MCRegisterInfo); - - // Register the MC subtarget info. 
- TargetRegistry::RegisterMCSubtargetInfo(TheX86_32Target, - X86_MC::createX86MCSubtargetInfo); - TargetRegistry::RegisterMCSubtargetInfo(TheX86_64Target, - X86_MC::createX86MCSubtargetInfo); - - // Register the MC instruction analyzer. - TargetRegistry::RegisterMCInstrAnalysis(TheX86_32Target, - createX86MCInstrAnalysis); - TargetRegistry::RegisterMCInstrAnalysis(TheX86_64Target, - createX86MCInstrAnalysis); - - // Register the code emitter. - TargetRegistry::RegisterMCCodeEmitter(TheX86_32Target, - createX86MCCodeEmitter); - TargetRegistry::RegisterMCCodeEmitter(TheX86_64Target, - createX86MCCodeEmitter); + for (Target *T : {&TheX86_32Target, &TheX86_64Target}) { + // Register the MC asm info. + RegisterMCAsmInfoFn X(*T, createX86MCAsmInfo); + + // Register the MC codegen info. + RegisterMCCodeGenInfoFn Y(*T, createX86MCCodeGenInfo); + + // Register the MC instruction info. + TargetRegistry::RegisterMCInstrInfo(*T, createX86MCInstrInfo); + + // Register the MC register info. + TargetRegistry::RegisterMCRegInfo(*T, createX86MCRegisterInfo); + + // Register the MC subtarget info. + TargetRegistry::RegisterMCSubtargetInfo(*T, + X86_MC::createX86MCSubtargetInfo); + + // Register the MC instruction analyzer. + TargetRegistry::RegisterMCInstrAnalysis(*T, createX86MCInstrAnalysis); + + // Register the code emitter. + TargetRegistry::RegisterMCCodeEmitter(*T, createX86MCCodeEmitter); + + // Register the object streamer. + TargetRegistry::RegisterCOFFStreamer(*T, createX86WinCOFFStreamer); + + // Register the MCInstPrinter. + TargetRegistry::RegisterMCInstPrinter(*T, createX86MCInstPrinter); + + // Register the MC relocation info. + TargetRegistry::RegisterMCRelocationInfo(*T, createX86MCRelocationInfo); + } // Register the asm backend. TargetRegistry::RegisterMCAsmBackend(TheX86_32Target, createX86_32AsmBackend); TargetRegistry::RegisterMCAsmBackend(TheX86_64Target, createX86_64AsmBackend); - - // Register the object streamer. - TargetRegistry::RegisterMCObjectStreamer(TheX86_32Target, - createMCStreamer); - TargetRegistry::RegisterMCObjectStreamer(TheX86_64Target, - createMCStreamer); - - // Register the MCInstPrinter. - TargetRegistry::RegisterMCInstPrinter(TheX86_32Target, - createX86MCInstPrinter); - TargetRegistry::RegisterMCInstPrinter(TheX86_64Target, - createX86MCInstPrinter); - - // Register the MC relocation info. - TargetRegistry::RegisterMCRelocationInfo(TheX86_32Target, - createX86MCRelocationInfo); - TargetRegistry::RegisterMCRelocationInfo(TheX86_64Target, - createX86MCRelocationInfo); } diff --git a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h index d8320b9..6f50f11 100644 --- a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h +++ b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h @@ -53,18 +53,6 @@ namespace N86 { namespace X86_MC { std::string ParseX86Triple(StringRef TT); - /// GetCpuIDAndInfo - Execute the specified cpuid and return the 4 values in - /// the specified arguments. If we can't run cpuid on the host, return true. - bool GetCpuIDAndInfo(unsigned value, unsigned *rEAX, - unsigned *rEBX, unsigned *rECX, unsigned *rEDX); - /// GetCpuIDAndInfoEx - Execute the specified cpuid with subleaf and return - /// the 4 values in the specified arguments. If we can't run cpuid on the - /// host, return true. 
- bool GetCpuIDAndInfoEx(unsigned value, unsigned subleaf, unsigned *rEAX, - unsigned *rEBX, unsigned *rECX, unsigned *rEDX); - - void DetectFamilyModel(unsigned EAX, unsigned &Family, unsigned &Model); - unsigned getDwarfRegFlavour(Triple TT, bool isEH); void InitLLVM2SEHRegisterMapping(MCRegisterInfo *MRI); @@ -78,7 +66,6 @@ namespace X86_MC { MCCodeEmitter *createX86MCCodeEmitter(const MCInstrInfo &MCII, const MCRegisterInfo &MRI, - const MCSubtargetInfo &STI, MCContext &Ctx); MCAsmBackend *createX86_32AsmBackend(const Target &T, const MCRegisterInfo &MRI, @@ -86,12 +73,12 @@ MCAsmBackend *createX86_32AsmBackend(const Target &T, const MCRegisterInfo &MRI, MCAsmBackend *createX86_64AsmBackend(const Target &T, const MCRegisterInfo &MRI, StringRef TT, StringRef CPU); -/// createX86WinCOFFStreamer - Construct an X86 Windows COFF machine code -/// streamer which will generate PE/COFF format object files. +/// Construct an X86 Windows COFF machine code streamer which will generate +/// PE/COFF format object files. /// /// Takes ownership of \p AB and \p CE. MCStreamer *createX86WinCOFFStreamer(MCContext &C, MCAsmBackend &AB, - MCCodeEmitter *CE, raw_ostream &OS, + raw_ostream &OS, MCCodeEmitter *CE, bool RelaxAll); /// createX86MachObjectWriter - Construct an X86 Mach-O object writer. diff --git a/lib/Target/X86/MCTargetDesc/X86MachORelocationInfo.cpp b/lib/Target/X86/MCTargetDesc/X86MachORelocationInfo.cpp index 3b81d53..81749fc 100644 --- a/lib/Target/X86/MCTargetDesc/X86MachORelocationInfo.cpp +++ b/lib/Target/X86/MCTargetDesc/X86MachORelocationInfo.cpp @@ -38,7 +38,7 @@ public: MCSymbol *Sym = Ctx.GetOrCreateSymbol(SymName); // FIXME: check that the value is actually the same. - if (Sym->isVariable() == false) + if (!Sym->isVariable()) Sym->setVariableValue(MCConstantExpr::Create(SymAddr, Ctx)); const MCExpr *Expr = nullptr; @@ -93,7 +93,7 @@ public: RSymI->getName(RSymName); MCSymbol *RSym = Ctx.GetOrCreateSymbol(RSymName); - if (RSym->isVariable() == false) + if (!RSym->isVariable()) RSym->setVariableValue(MCConstantExpr::Create(RSymAddr, Ctx)); const MCExpr *RHS = MCSymbolRefExpr::Create(RSym, Ctx); diff --git a/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp b/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp index 5f1596c..5690efe 100644 --- a/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp +++ b/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp @@ -48,13 +48,11 @@ void X86WinCOFFStreamer::FinishImpl() { } } -namespace llvm { -MCStreamer *createX86WinCOFFStreamer(MCContext &C, MCAsmBackend &AB, - MCCodeEmitter *CE, raw_ostream &OS, - bool RelaxAll) { +MCStreamer *llvm::createX86WinCOFFStreamer(MCContext &C, MCAsmBackend &AB, + raw_ostream &OS, MCCodeEmitter *CE, + bool RelaxAll) { X86WinCOFFStreamer *S = new X86WinCOFFStreamer(C, AB, CE, OS); S->getAssembler().setRelaxAll(RelaxAll); return S; } -} diff --git a/lib/Target/X86/README-SSE.txt b/lib/Target/X86/README-SSE.txt index 71329b0..e6896e8 100644 --- a/lib/Target/X86/README-SSE.txt +++ b/lib/Target/X86/README-SSE.txt @@ -93,36 +93,6 @@ The pattern isel got this one right. 
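Aside on the X86WinCOFFStreamer.cpp hunk above: replacing the namespace llvm { ... } wrapper with a definition qualified as llvm::createX86WinCOFFStreamer is a safety improvement, not just a style change. A qualified definition must match a previously declared signature, so a parameter reordering like this patch's raw_ostream/MCCodeEmitter swap cannot silently leave header and implementation out of sync. A self-contained sketch with hypothetical names (makeWidget is not part of the patch):

  namespace llvm { int makeWidget(int Size, bool Fast); } // the "header"

  // Qualified definition: the compiler checks it against the declaration.
  int llvm::makeWidget(int Size, bool Fast) { return Fast ? 2 * Size : Size; }

  // int llvm::makeWidget(bool Fast, int Size) { ... }
  //   ^ hard error: no such declaration exists in namespace llvm.
  // namespace llvm { int makeWidget(bool Fast, int Size) { ... } }
  //   ^ compiles silently, introducing an unintended overload.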
//===---------------------------------------------------------------------===// -SSE should implement 'select_cc' using 'emulated conditional moves' that use -pcmp/pand/pandn/por to do a selection instead of a conditional branch: - -double %X(double %Y, double %Z, double %A, double %B) { - %C = setlt double %A, %B - %z = fadd double %Z, 0.0 ;; select operand is not a load - %D = select bool %C, double %Y, double %z - ret double %D -} - -We currently emit: - -_X: - subl $12, %esp - xorpd %xmm0, %xmm0 - addsd 24(%esp), %xmm0 - movsd 32(%esp), %xmm1 - movsd 16(%esp), %xmm2 - ucomisd 40(%esp), %xmm1 - jb LBB_X_2 -LBB_X_1: - movsd %xmm0, %xmm2 -LBB_X_2: - movsd %xmm2, (%esp) - fldl (%esp) - addl $12, %esp - ret - -//===---------------------------------------------------------------------===// - Lower memcpy / memset to a series of SSE 128 bit move instructions when it's feasible. @@ -787,25 +757,6 @@ cheaper to do fld1 than load from a constant pool for example, so //===---------------------------------------------------------------------===// -The X86 backend should be able to if-convert SSE comparisons like "ucomisd" to -"cmpsd". For example, this code: - -double d1(double x) { return x == x ? x : x + x; } - -Compiles into: - -_d1: - ucomisd %xmm0, %xmm0 - jnp LBB1_2 - addsd %xmm0, %xmm0 - ret -LBB1_2: - ret - -Also, the 'ret's should be shared. This is PR6032. - -//===---------------------------------------------------------------------===// - These should compile into the same code (PR6214): Perhaps instcombine should canonicalize the former into the later? @@ -858,35 +809,6 @@ doing a shuffle from v[1] to v[0] then a float store. //===---------------------------------------------------------------------===// -On SSE4 machines, we compile this code: - -define <2 x float> @test2(<2 x float> %Q, <2 x float> %R, - <2 x float> *%P) nounwind { - %Z = fadd <2 x float> %Q, %R - - store <2 x float> %Z, <2 x float> *%P - ret <2 x float> %Z -} - -into: - -_test2: ## @test2 -## BB#0: - insertps $0, %xmm2, %xmm2 - insertps $16, %xmm3, %xmm2 - insertps $0, %xmm0, %xmm3 - insertps $16, %xmm1, %xmm3 - addps %xmm2, %xmm3 - movq %xmm3, (%rdi) - movaps %xmm3, %xmm0 - pshufd $1, %xmm3, %xmm1 - ## kill: XMM1<def> XMM1<kill> - ret - -The insertps's of $0 are pointless complex copies. - -//===---------------------------------------------------------------------===// - [UNSAFE FP] void foo(double, double, double); diff --git a/lib/Target/X86/X86AsmPrinter.cpp b/lib/Target/X86/X86AsmPrinter.cpp index bb0b9ce..f6033a7 100644 --- a/lib/Target/X86/X86AsmPrinter.cpp +++ b/lib/Target/X86/X86AsmPrinter.cpp @@ -63,9 +63,6 @@ bool X86AsmPrinter::runOnMachineFunction(MachineFunction &MF) { OutStreamer.EndCOFFSymbolDef(); } - // Have common code print out the function header with linkage info etc. - EmitFunctionHeader(); - // Emit the rest of the function body. 
EmitFunctionBody(); diff --git a/lib/Target/X86/X86FastISel.cpp b/lib/Target/X86/X86FastISel.cpp index a17f052..cba140f 100644 --- a/lib/Target/X86/X86FastISel.cpp +++ b/lib/Target/X86/X86FastISel.cpp @@ -84,7 +84,7 @@ private: bool X86FastEmitCompare(const Value *LHS, const Value *RHS, EVT VT, DebugLoc DL); bool X86FastEmitLoad(EVT VT, const X86AddressMode &AM, MachineMemOperand *MMO, - unsigned &ResultReg); + unsigned &ResultReg, unsigned Alignment = 1); bool X86FastEmitStore(EVT VT, const Value *Val, const X86AddressMode &AM, MachineMemOperand *MMO = nullptr, bool Aligned = false); @@ -327,7 +327,8 @@ bool X86FastISel::isTypeLegal(Type *Ty, MVT &VT, bool AllowI1) { /// The address is either pre-computed, i.e. Ptr, or a GlobalAddress, i.e. GV. /// Return true and the result register by reference if it is possible. bool X86FastISel::X86FastEmitLoad(EVT VT, const X86AddressMode &AM, - MachineMemOperand *MMO, unsigned &ResultReg) { + MachineMemOperand *MMO, unsigned &ResultReg, + unsigned Alignment) { // Get opcode and regclass of the output for the given load instruction. unsigned Opc = 0; const TargetRegisterClass *RC = nullptr; @@ -372,6 +373,30 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, const X86AddressMode &AM, case MVT::f80: // No f80 support yet. return false; + case MVT::v4f32: + if (Alignment >= 16) + Opc = Subtarget->hasAVX() ? X86::VMOVAPSrm : X86::MOVAPSrm; + else + Opc = Subtarget->hasAVX() ? X86::VMOVUPSrm : X86::MOVUPSrm; + RC = &X86::VR128RegClass; + break; + case MVT::v2f64: + if (Alignment >= 16) + Opc = Subtarget->hasAVX() ? X86::VMOVAPDrm : X86::MOVAPDrm; + else + Opc = Subtarget->hasAVX() ? X86::VMOVUPDrm : X86::MOVUPDrm; + RC = &X86::VR128RegClass; + break; + case MVT::v4i32: + case MVT::v2i64: + case MVT::v8i16: + case MVT::v16i8: + if (Alignment >= 16) + Opc = Subtarget->hasAVX() ? X86::VMOVDQArm : X86::MOVDQArm; + else + Opc = Subtarget->hasAVX() ? X86::VMOVDQUrm : X86::MOVDQUrm; + RC = &X86::VR128RegClass; + break; } ResultReg = createResultReg(RC); @@ -1068,8 +1093,14 @@ bool X86FastISel::X86SelectLoad(const Instruction *I) { if (!X86SelectAddress(Ptr, AM)) return false; + unsigned Alignment = LI->getAlignment(); + unsigned ABIAlignment = DL.getABITypeAlignment(LI->getType()); + if (Alignment == 0) // Ensure that codegen never sees alignment 0 + Alignment = ABIAlignment; + unsigned ResultReg = 0; - if (!X86FastEmitLoad(VT, AM, createMachineMemOperandFor(LI), ResultReg)) + if (!X86FastEmitLoad(VT, AM, createMachineMemOperandFor(LI), ResultReg, + Alignment)) return false; updateValueMap(I, ResultReg); @@ -1094,20 +1125,30 @@ static unsigned X86ChooseCmpOpcode(EVT VT, const X86Subtarget *Subtarget) { } } -/// X86ChooseCmpImmediateOpcode - If we have a comparison with RHS as the RHS -/// of the comparison, return an opcode that works for the compare (e.g. -/// CMP32ri) otherwise return 0. +/// If we have a comparison with RHS as the RHS of the comparison, return an +/// opcode that works for the compare (e.g. CMP32ri) otherwise return 0. static unsigned X86ChooseCmpImmediateOpcode(EVT VT, const ConstantInt *RHSC) { + int64_t Val = RHSC->getSExtValue(); switch (VT.getSimpleVT().SimpleTy) { // Otherwise, we can't fold the immediate into this comparison. 
- default: return 0; - case MVT::i8: return X86::CMP8ri; - case MVT::i16: return X86::CMP16ri; - case MVT::i32: return X86::CMP32ri; + default: + return 0; + case MVT::i8: + return X86::CMP8ri; + case MVT::i16: + if (isInt<8>(Val)) + return X86::CMP16ri8; + return X86::CMP16ri; + case MVT::i32: + if (isInt<8>(Val)) + return X86::CMP32ri8; + return X86::CMP32ri; case MVT::i64: + if (isInt<8>(Val)) + return X86::CMP64ri8; // 64-bit comparisons are only valid if the immediate fits in a 32-bit sext // field. - if ((int)RHSC->getSExtValue() == RHSC->getSExtValue()) + if (isInt<32>(Val)) return X86::CMP64ri32; return 0; } @@ -1810,11 +1851,11 @@ bool X86FastISel::X86FastEmitCMoveSelect(MVT RetVT, const Instruction *I) { return true; } -/// \brief Emit SSE instructions to lower the select. +/// \brief Emit SSE or AVX instructions to lower the select. /// /// Try to use SSE1/SSE2 instructions to simulate a select without branches. /// This lowers fp selects into a CMP/AND/ANDN/OR sequence when the necessary -/// SSE instructions are available. +/// SSE instructions are available. If AVX is available, try to use a VBLENDV. bool X86FastISel::X86FastEmitSSESelect(MVT RetVT, const Instruction *I) { // Optimize conditions coming from a compare if both instructions are in the // same basic block (values defined in other basic blocks may not have @@ -1850,19 +1891,17 @@ bool X86FastISel::X86FastEmitSSESelect(MVT RetVT, const Instruction *I) { if (NeedSwap) std::swap(CmpLHS, CmpRHS); - static unsigned OpcTable[2][2][4] = { - { { X86::CMPSSrr, X86::FsANDPSrr, X86::FsANDNPSrr, X86::FsORPSrr }, - { X86::VCMPSSrr, X86::VFsANDPSrr, X86::VFsANDNPSrr, X86::VFsORPSrr } }, - { { X86::CMPSDrr, X86::FsANDPDrr, X86::FsANDNPDrr, X86::FsORPDrr }, - { X86::VCMPSDrr, X86::VFsANDPDrr, X86::VFsANDNPDrr, X86::VFsORPDrr } } + // Choose the SSE instruction sequence based on data type (float or double). + static unsigned OpcTable[2][4] = { + { X86::CMPSSrr, X86::FsANDPSrr, X86::FsANDNPSrr, X86::FsORPSrr }, + { X86::CMPSDrr, X86::FsANDPDrr, X86::FsANDNPDrr, X86::FsORPDrr } }; - bool HasAVX = Subtarget->hasAVX(); unsigned *Opc = nullptr; switch (RetVT.SimpleTy) { default: return false; - case MVT::f32: Opc = &OpcTable[0][HasAVX][0]; break; - case MVT::f64: Opc = &OpcTable[1][HasAVX][0]; break; + case MVT::f32: Opc = &OpcTable[0][0]; break; + case MVT::f64: Opc = &OpcTable[1][0]; break; } const Value *LHS = I->getOperand(1); @@ -1884,14 +1923,33 @@ bool X86FastISel::X86FastEmitSSESelect(MVT RetVT, const Instruction *I) { return false; const TargetRegisterClass *RC = TLI.getRegClassFor(RetVT); - unsigned CmpReg = fastEmitInst_rri(Opc[0], RC, CmpLHSReg, CmpLHSIsKill, - CmpRHSReg, CmpRHSIsKill, CC); - unsigned AndReg = fastEmitInst_rr(Opc[1], RC, CmpReg, /*IsKill=*/false, - LHSReg, LHSIsKill); - unsigned AndNReg = fastEmitInst_rr(Opc[2], RC, CmpReg, /*IsKill=*/true, - RHSReg, RHSIsKill); - unsigned ResultReg = fastEmitInst_rr(Opc[3], RC, AndNReg, /*IsKill=*/true, - AndReg, /*IsKill=*/true); + unsigned ResultReg; + + if (Subtarget->hasAVX()) { + // If we have AVX, create 1 blendv instead of 3 logic instructions. + // Blendv was introduced with SSE 4.1, but the 2 register form implicitly + // uses XMM0 as the selection register. That may need just as many + // instructions as the AND/ANDN/OR sequence due to register moves, so + // don't bother. + unsigned CmpOpcode = + (RetVT.SimpleTy == MVT::f32) ? X86::VCMPSSrr : X86::VCMPSDrr; + unsigned BlendOpcode = + (RetVT.SimpleTy == MVT::f32) ? 
X86::VBLENDVPSrr : X86::VBLENDVPDrr; + + unsigned CmpReg = fastEmitInst_rri(CmpOpcode, RC, CmpLHSReg, CmpLHSIsKill, + CmpRHSReg, CmpRHSIsKill, CC); + ResultReg = fastEmitInst_rrr(BlendOpcode, RC, RHSReg, RHSIsKill, + LHSReg, LHSIsKill, CmpReg, true); + } else { + unsigned CmpReg = fastEmitInst_rri(Opc[0], RC, CmpLHSReg, CmpLHSIsKill, + CmpRHSReg, CmpRHSIsKill, CC); + unsigned AndReg = fastEmitInst_rr(Opc[1], RC, CmpReg, /*IsKill=*/false, + LHSReg, LHSIsKill); + unsigned AndNReg = fastEmitInst_rr(Opc[2], RC, CmpReg, /*IsKill=*/true, + RHSReg, RHSIsKill); + ResultReg = fastEmitInst_rr(Opc[3], RC, AndNReg, /*IsKill=*/true, + AndReg, /*IsKill=*/true); + } updateValueMap(I, ResultReg); return true; } @@ -2015,38 +2073,30 @@ bool X86FastISel::X86SelectSIToFP(const Instruction *I) { if (OpReg == 0) return false; - bool HasAVX = Subtarget->hasAVX(); const TargetRegisterClass *RC = nullptr; unsigned Opcode; - if (I->getType()->isDoubleTy() && X86ScalarSSEf64) { + if (I->getType()->isDoubleTy()) { // sitofp int -> double - Opcode = HasAVX ? X86::VCVTSI2SDrr : X86::CVTSI2SDrr; + Opcode = X86::VCVTSI2SDrr; RC = &X86::FR64RegClass; - } else if (I->getType()->isFloatTy() && X86ScalarSSEf32) { + } else if (I->getType()->isFloatTy()) { // sitofp int -> float - Opcode = HasAVX ? X86::VCVTSI2SSrr : X86::CVTSI2SSrr; + Opcode = X86::VCVTSI2SSrr; RC = &X86::FR32RegClass; } else return false; + // The target-independent selection algorithm in FastISel already knows how + // to select a SINT_TO_FP if the target is SSE but not AVX. This code is only + // reachable if the subtarget has AVX. + assert(Subtarget->hasAVX() && "Expected a subtarget with AVX!"); - unsigned ImplicitDefReg = 0; - if (HasAVX) { - ImplicitDefReg = createResultReg(RC); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, - TII.get(TargetOpcode::IMPLICIT_DEF), ImplicitDefReg); - } - - const MCInstrDesc &II = TII.get(Opcode); - OpReg = constrainOperandRegClass(II, OpReg, (HasAVX ? 2 : 1)); - - unsigned ResultReg = createResultReg(RC); - MachineInstrBuilder MIB; - MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, ResultReg); - if (ImplicitDefReg) - MIB.addReg(ImplicitDefReg, RegState::Kill); - MIB.addReg(OpReg); + unsigned ImplicitDefReg = createResultReg(RC); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::IMPLICIT_DEF), ImplicitDefReg); + unsigned ResultReg = + fastEmitInst_rr(Opcode, RC, ImplicitDefReg, true, OpReg, false); updateValueMap(I, ResultReg); return true; } @@ -3053,7 +3103,7 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) { // Add a register mask operand representing the call-preserved registers. // Proper defs for return values will be added by setPhysRegsDeadExcept(). - MIB.addRegMask(TRI.getCallPreservedMask(CC)); + MIB.addRegMask(TRI.getCallPreservedMask(*FuncInfo.MF, CC)); // Add an implicit use GOT pointer in EBX. 
if (Subtarget->isPICStyleGOT()) diff --git a/lib/Target/X86/X86FloatingPoint.cpp b/lib/Target/X86/X86FloatingPoint.cpp index c8e5f64..3b0bd03 100644 --- a/lib/Target/X86/X86FloatingPoint.cpp +++ b/lib/Target/X86/X86FloatingPoint.cpp @@ -32,10 +32,10 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/CodeGen/EdgeBundles.h" +#include "llvm/CodeGen/LivePhysRegs.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/LivePhysRegs.h" #include "llvm/CodeGen/Passes.h" #include "llvm/IR/InlineAsm.h" #include "llvm/Support/Debug.h" @@ -300,7 +300,7 @@ bool FPS::runOnMachineFunction(MachineFunction &MF) { // function. If it is all integer, there is nothing for us to do! bool FPIsUsed = false; - assert(X86::FP6 == X86::FP0+6 && "Register enums aren't sorted right!"); + static_assert(X86::FP6 == X86::FP0+6, "Register enums aren't sorted right!"); for (unsigned i = 0; i <= 6; ++i) if (MF.getRegInfo().isPhysRegUsed(X86::FP0+i)) { FPIsUsed = true; @@ -438,7 +438,7 @@ bool FPS::processBasicBlock(MachineFunction &MF, MachineBasicBlock &BB) { // Rewind to first instruction newly inserted. while (Start != BB.begin() && std::prev(Start) != PrevI) --Start; dbgs() << "Inserted instructions:\n\t"; - Start->print(dbgs(), &MF.getTarget()); + Start->print(dbgs()); while (++Start != std::next(I)) {} } dumpStack(); diff --git a/lib/Target/X86/X86FrameLowering.cpp b/lib/Target/X86/X86FrameLowering.cpp index cead099..1d2c73c 100644 --- a/lib/Target/X86/X86FrameLowering.cpp +++ b/lib/Target/X86/X86FrameLowering.cpp @@ -581,7 +581,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const { bool Is64Bit = STI.is64Bit(); // standard x86_64 and NaCl use 64-bit frame/stack pointers, x32 - 32-bit. const bool Uses64BitFramePtr = STI.isTarget64BitLP64() || STI.isTargetNaCl64(); - bool IsWin64 = STI.isTargetWin64(); + bool IsWin64 = STI.isCallingConvWin64(Fn->getCallingConv()); // Not necessarily synonymous with IsWin64. bool IsWinEH = MF.getTarget().getMCAsmInfo()->usesWindowsCFI(); bool NeedsWinEH = IsWinEH && Fn->needsUnwindTableEntry(); diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp index 8d50ae1..fb12ce5 100644 --- a/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -228,7 +228,7 @@ namespace { /// SelectInlineAsmMemoryOperand - Implement addressing mode selection for /// inline asm expressions. 
bool SelectInlineAsmMemoryOperand(const SDValue &Op, - char ConstraintCode, + unsigned ConstraintID, std::vector<SDValue> &OutOps) override; void EmitSpecialCodeForMain(); @@ -1004,6 +1004,15 @@ bool X86DAGToDAGISel::MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM, switch (N.getOpcode()) { default: break; + case ISD::FRAME_ALLOC_RECOVER: { + if (!AM.hasSymbolicDisplacement()) + if (const auto *ESNode = dyn_cast<ExternalSymbolSDNode>(N.getOperand(0))) + if (ESNode->getOpcode() == ISD::TargetExternalSymbol) { + AM.ES = ESNode->getSymbol(); + return false; + } + break; + } case ISD::Constant: { uint64_t Val = cast<ConstantSDNode>(N)->getSExtValue(); if (!FoldOffsetIntoAddress(Val, AM)) @@ -2805,14 +2814,14 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { } bool X86DAGToDAGISel:: -SelectInlineAsmMemoryOperand(const SDValue &Op, char ConstraintCode, +SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintID, std::vector<SDValue> &OutOps) { SDValue Op0, Op1, Op2, Op3, Op4; - switch (ConstraintCode) { - case 'o': // offsetable ?? - case 'v': // not offsetable ?? + switch (ConstraintID) { + case InlineAsm::Constraint_o: // offsetable ?? + case InlineAsm::Constraint_v: // not offsetable ?? default: return true; - case 'm': // memory + case InlineAsm::Constraint_m: // memory if (!SelectAddr(nullptr, Op, Op0, Op1, Op2, Op3, Op4)) return true; break; diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 6866be7..8b92e70 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -25,7 +25,6 @@ #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringSwitch.h" -#include "llvm/ADT/VariadicFunction.h" #include "llvm/CodeGen/IntrinsicLowering.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" @@ -77,119 +76,6 @@ static cl::opt<int> ReciprocalEstimateRefinementSteps( static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1, SDValue V2); -static SDValue ExtractSubVector(SDValue Vec, unsigned IdxVal, - SelectionDAG &DAG, SDLoc dl, - unsigned vectorWidth) { - assert((vectorWidth == 128 || vectorWidth == 256) && - "Unsupported vector width"); - EVT VT = Vec.getValueType(); - EVT ElVT = VT.getVectorElementType(); - unsigned Factor = VT.getSizeInBits()/vectorWidth; - EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT, - VT.getVectorNumElements()/Factor); - - // Extract from UNDEF is UNDEF. - if (Vec.getOpcode() == ISD::UNDEF) - return DAG.getUNDEF(ResultVT); - - // Extract the relevant vectorWidth bits. Generate an EXTRACT_SUBVECTOR - unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits(); - - // This is the index of the first element of the vectorWidth-bit chunk - // we want. - unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits()) / vectorWidth) - * ElemsPerChunk); - - // If the input is a buildvector just emit a smaller one. - if (Vec.getOpcode() == ISD::BUILD_VECTOR) - return DAG.getNode(ISD::BUILD_VECTOR, dl, ResultVT, - makeArrayRef(Vec->op_begin() + NormalizedIdxVal, - ElemsPerChunk)); - - SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal); - return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx); -} - -/// Generate a DAG to grab 128-bits from a vector > 128 bits. This -/// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128 -/// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4 -/// instructions or a simple subregister reference. Idx is an index in the -/// 128 bits we want. 
It need not be aligned to a 128-bit boundary. That makes -/// lowering EXTRACT_VECTOR_ELT operations easier. -static SDValue Extract128BitVector(SDValue Vec, unsigned IdxVal, - SelectionDAG &DAG, SDLoc dl) { - assert((Vec.getValueType().is256BitVector() || - Vec.getValueType().is512BitVector()) && "Unexpected vector size!"); - return ExtractSubVector(Vec, IdxVal, DAG, dl, 128); -} - -/// Generate a DAG to grab 256-bits from a 512-bit vector. -static SDValue Extract256BitVector(SDValue Vec, unsigned IdxVal, - SelectionDAG &DAG, SDLoc dl) { - assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!"); - return ExtractSubVector(Vec, IdxVal, DAG, dl, 256); -} - -static SDValue InsertSubVector(SDValue Result, SDValue Vec, - unsigned IdxVal, SelectionDAG &DAG, - SDLoc dl, unsigned vectorWidth) { - assert((vectorWidth == 128 || vectorWidth == 256) && - "Unsupported vector width"); - // Inserting UNDEF is Result - if (Vec.getOpcode() == ISD::UNDEF) - return Result; - EVT VT = Vec.getValueType(); - EVT ElVT = VT.getVectorElementType(); - EVT ResultVT = Result.getValueType(); - - // Insert the relevant vectorWidth bits. - unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits(); - - // This is the index of the first element of the vectorWidth-bit chunk - // we want. - unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits())/vectorWidth) - * ElemsPerChunk); - - SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal); - return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx); -} - -/// Generate a DAG to put 128-bits into a vector > 128 bits. This -/// sets things up to match to an AVX VINSERTF128/VINSERTI128 or -/// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a -/// simple superregister reference. Idx is an index in the 128 bits -/// we want. It need not be aligned to a 128-bit boundary. That makes -/// lowering INSERT_VECTOR_ELT operations easier. -static SDValue Insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal, - SelectionDAG &DAG,SDLoc dl) { - assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!"); - return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 128); -} - -static SDValue Insert256BitVector(SDValue Result, SDValue Vec, unsigned IdxVal, - SelectionDAG &DAG, SDLoc dl) { - assert(Vec.getValueType().is256BitVector() && "Unexpected vector size!"); - return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 256); -} - -/// Concat two 128-bit vectors into a 256 bit vector using VINSERTF128 -/// instructions. This is used because creating CONCAT_VECTOR nodes of -/// BUILD_VECTORS returns a larger BUILD_VECTOR while we're trying to lower -/// large BUILD_VECTORS. -static SDValue Concat128BitVectors(SDValue V1, SDValue V2, EVT VT, - unsigned NumElems, SelectionDAG &DAG, - SDLoc dl) { - SDValue V = Insert128BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl); - return Insert128BitVector(V, V2, NumElems/2, DAG, dl); -} - -static SDValue Concat256BitVectors(SDValue V1, SDValue V2, EVT VT, - unsigned NumElems, SelectionDAG &DAG, - SDLoc dl) { - SDValue V = Insert256BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl); - return Insert256BitVector(V, V2, NumElems/2, DAG, dl); -} - X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, const X86Subtarget &STI) : TargetLowering(TM), Subtarget(&STI) { @@ -871,35 +757,16 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, // MMX-sized vectors (other than x86mmx) are expected to be expanded // into smaller operations. 
- setOperationAction(ISD::MULHS, MVT::v8i8, Expand); - setOperationAction(ISD::MULHS, MVT::v4i16, Expand); - setOperationAction(ISD::MULHS, MVT::v2i32, Expand); - setOperationAction(ISD::MULHS, MVT::v1i64, Expand); - setOperationAction(ISD::AND, MVT::v8i8, Expand); - setOperationAction(ISD::AND, MVT::v4i16, Expand); - setOperationAction(ISD::AND, MVT::v2i32, Expand); - setOperationAction(ISD::AND, MVT::v1i64, Expand); - setOperationAction(ISD::OR, MVT::v8i8, Expand); - setOperationAction(ISD::OR, MVT::v4i16, Expand); - setOperationAction(ISD::OR, MVT::v2i32, Expand); - setOperationAction(ISD::OR, MVT::v1i64, Expand); - setOperationAction(ISD::XOR, MVT::v8i8, Expand); - setOperationAction(ISD::XOR, MVT::v4i16, Expand); - setOperationAction(ISD::XOR, MVT::v2i32, Expand); - setOperationAction(ISD::XOR, MVT::v1i64, Expand); - setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i8, Expand); - setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i16, Expand); - setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2i32, Expand); - setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v1i64, Expand); + for (MVT MMXTy : {MVT::v8i8, MVT::v4i16, MVT::v2i32, MVT::v1i64}) { + setOperationAction(ISD::MULHS, MMXTy, Expand); + setOperationAction(ISD::AND, MMXTy, Expand); + setOperationAction(ISD::OR, MMXTy, Expand); + setOperationAction(ISD::XOR, MMXTy, Expand); + setOperationAction(ISD::SCALAR_TO_VECTOR, MMXTy, Expand); + setOperationAction(ISD::SELECT, MMXTy, Expand); + setOperationAction(ISD::BITCAST, MMXTy, Expand); + } setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v1i64, Expand); - setOperationAction(ISD::SELECT, MVT::v8i8, Expand); - setOperationAction(ISD::SELECT, MVT::v4i16, Expand); - setOperationAction(ISD::SELECT, MVT::v2i32, Expand); - setOperationAction(ISD::SELECT, MVT::v1i64, Expand); - setOperationAction(ISD::BITCAST, MVT::v8i8, Expand); - setOperationAction(ISD::BITCAST, MVT::v4i16, Expand); - setOperationAction(ISD::BITCAST, MVT::v2i32, Expand); - setOperationAction(ISD::BITCAST, MVT::v1i64, Expand); if (!TM.Options.UseSoftFloat && Subtarget->hasSSE1()) { addRegisterClass(MVT::v4f32, &X86::VR128RegClass); @@ -1065,27 +932,13 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, } if (!TM.Options.UseSoftFloat && Subtarget->hasSSE41()) { - setOperationAction(ISD::FFLOOR, MVT::f32, Legal); - setOperationAction(ISD::FCEIL, MVT::f32, Legal); - setOperationAction(ISD::FTRUNC, MVT::f32, Legal); - setOperationAction(ISD::FRINT, MVT::f32, Legal); - setOperationAction(ISD::FNEARBYINT, MVT::f32, Legal); - setOperationAction(ISD::FFLOOR, MVT::f64, Legal); - setOperationAction(ISD::FCEIL, MVT::f64, Legal); - setOperationAction(ISD::FTRUNC, MVT::f64, Legal); - setOperationAction(ISD::FRINT, MVT::f64, Legal); - setOperationAction(ISD::FNEARBYINT, MVT::f64, Legal); - - setOperationAction(ISD::FFLOOR, MVT::v4f32, Legal); - setOperationAction(ISD::FCEIL, MVT::v4f32, Legal); - setOperationAction(ISD::FTRUNC, MVT::v4f32, Legal); - setOperationAction(ISD::FRINT, MVT::v4f32, Legal); - setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Legal); - setOperationAction(ISD::FFLOOR, MVT::v2f64, Legal); - setOperationAction(ISD::FCEIL, MVT::v2f64, Legal); - setOperationAction(ISD::FTRUNC, MVT::v2f64, Legal); - setOperationAction(ISD::FRINT, MVT::v2f64, Legal); - setOperationAction(ISD::FNEARBYINT, MVT::v2f64, Legal); + for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) { + setOperationAction(ISD::FFLOOR, RoundedTy, Legal); + setOperationAction(ISD::FCEIL, RoundedTy, Legal); + setOperationAction(ISD::FTRUNC, 
RoundedTy, Legal); + setOperationAction(ISD::FRINT, RoundedTy, Legal); + setOperationAction(ISD::FNEARBYINT, RoundedTy, Legal); + } // FIXME: Do we need to handle scalar-to-vector here? setOperationAction(ISD::MUL, MVT::v4i32, Legal); @@ -1474,7 +1327,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i64, Custom); setOperationAction(ISD::CONCAT_VECTORS, MVT::v16f32, Custom); setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i32, Custom); - setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i1, Custom); setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i1, Legal); setOperationAction(ISD::SETCC, MVT::v16i1, Custom); @@ -1576,6 +1428,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::SUB, MVT::v32i16, Legal); setOperationAction(ISD::SUB, MVT::v64i8, Legal); setOperationAction(ISD::MUL, MVT::v32i16, Legal); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i1, Custom); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i1, Custom); + setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i1, Custom); + setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i1, Custom); for (int i = MVT::v32i8; i != MVT::v8i64; ++i) { const MVT VT = (MVT::SimpleValueType)i; @@ -1599,7 +1455,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::SETCC, MVT::v4i1, Custom); setOperationAction(ISD::SETCC, MVT::v2i1, Custom); - setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8i1, Legal); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i1, Custom); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i1, Custom); + setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8i1, Custom); + setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v4i1, Custom); setOperationAction(ISD::AND, MVT::v8i32, Legal); setOperationAction(ISD::OR, MVT::v8i32, Legal); @@ -3189,7 +3048,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // Add a register mask operand representing the call-preserved registers. const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo(); - const uint32_t *Mask = TRI->getCallPreservedMask(CallConv); + const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv); assert(Mask && "Missing call preserved mask for calling convention"); Ops.push_back(DAG.getRegisterMask(Mask)); @@ -3906,21 +3765,6 @@ static bool isSequentialOrUndefInRange(ArrayRef<int> Mask, return true; } -/// CommuteVectorShuffleMask - Change values in a shuffle permute mask assuming -/// the two vector operands have swapped position. 
-static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask, - unsigned NumElems) { - for (unsigned i = 0; i != NumElems; ++i) { - int idx = Mask[i]; - if (idx < 0) - continue; - else if (idx < (int)NumElems) - Mask[i] = idx + NumElems; - else - Mask[i] = idx - NumElems; - } -} - /// isVEXTRACTIndex - Return true if the specified /// EXTRACT_SUBVECTOR operand specifies a vector extract that is /// suitable for instruction that extract 128 or 256 bit vectors @@ -4083,9 +3927,13 @@ static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget, Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i32, Ops); } else if (VT.getScalarType() == MVT::i1) { - assert(VT.getVectorNumElements() <= 16 && "Unexpected vector type"); + + assert((Subtarget->hasBWI() || VT.getVectorNumElements() <= 16) + && "Unexpected vector type"); + assert((Subtarget->hasVLX() || VT.getVectorNumElements() >= 8) + && "Unexpected vector type"); SDValue Cst = DAG.getConstant(0, MVT::i1); - SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Cst); + SmallVector<SDValue, 64> Ops(VT.getVectorNumElements(), Cst); return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops); } else llvm_unreachable("Unexpected vector type"); @@ -4093,6 +3941,162 @@ static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget, return DAG.getNode(ISD::BITCAST, dl, VT, Vec); } +static SDValue ExtractSubVector(SDValue Vec, unsigned IdxVal, + SelectionDAG &DAG, SDLoc dl, + unsigned vectorWidth) { + assert((vectorWidth == 128 || vectorWidth == 256) && + "Unsupported vector width"); + EVT VT = Vec.getValueType(); + EVT ElVT = VT.getVectorElementType(); + unsigned Factor = VT.getSizeInBits()/vectorWidth; + EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT, + VT.getVectorNumElements()/Factor); + + // Extract from UNDEF is UNDEF. + if (Vec.getOpcode() == ISD::UNDEF) + return DAG.getUNDEF(ResultVT); + + // Extract the relevant vectorWidth bits. Generate an EXTRACT_SUBVECTOR + unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits(); + + // This is the index of the first element of the vectorWidth-bit chunk + // we want. + unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits()) / vectorWidth) + * ElemsPerChunk); + + // If the input is a buildvector just emit a smaller one. + if (Vec.getOpcode() == ISD::BUILD_VECTOR) + return DAG.getNode(ISD::BUILD_VECTOR, dl, ResultVT, + makeArrayRef(Vec->op_begin() + NormalizedIdxVal, + ElemsPerChunk)); + + SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal); + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx); +} + +/// Generate a DAG to grab 128-bits from a vector > 128 bits. This +/// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128 +/// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4 +/// instructions or a simple subregister reference. Idx is an index in the +/// 128 bits we want. It need not be aligned to a 128-bit boundary. That makes +/// lowering EXTRACT_VECTOR_ELT operations easier. +static SDValue Extract128BitVector(SDValue Vec, unsigned IdxVal, + SelectionDAG &DAG, SDLoc dl) { + assert((Vec.getValueType().is256BitVector() || + Vec.getValueType().is512BitVector()) && "Unexpected vector size!"); + return ExtractSubVector(Vec, IdxVal, DAG, dl, 128); +} + +/// Generate a DAG to grab 256-bits from a 512-bit vector. 
+static SDValue Extract256BitVector(SDValue Vec, unsigned IdxVal, + SelectionDAG &DAG, SDLoc dl) { + assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!"); + return ExtractSubVector(Vec, IdxVal, DAG, dl, 256); +} + +static SDValue InsertSubVector(SDValue Result, SDValue Vec, + unsigned IdxVal, SelectionDAG &DAG, + SDLoc dl, unsigned vectorWidth) { + assert((vectorWidth == 128 || vectorWidth == 256) && + "Unsupported vector width"); + // Inserting UNDEF is Result + if (Vec.getOpcode() == ISD::UNDEF) + return Result; + EVT VT = Vec.getValueType(); + EVT ElVT = VT.getVectorElementType(); + EVT ResultVT = Result.getValueType(); + + // Insert the relevant vectorWidth bits. + unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits(); + + // This is the index of the first element of the vectorWidth-bit chunk + // we want. + unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits())/vectorWidth) + * ElemsPerChunk); + + SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal); + return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx); +} + +/// Generate a DAG to put 128-bits into a vector > 128 bits. This +/// sets things up to match to an AVX VINSERTF128/VINSERTI128 or +/// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a +/// simple superregister reference. Idx is an index in the 128 bits +/// we want. It need not be aligned to a 128-bit boundary. That makes +/// lowering INSERT_VECTOR_ELT operations easier. +static SDValue Insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal, + SelectionDAG &DAG, SDLoc dl) { + assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!"); + + // For insertion into the zero index (low half) of a 256-bit vector, it is + // more efficient to generate a blend with immediate instead of an insert*128. + // We are still creating an INSERT_SUBVECTOR below with an undef node to + // extend the subvector to the size of the result vector. Make sure that + // we are not recursing on that node by checking for undef here. + if (IdxVal == 0 && Result.getValueType().is256BitVector() && + Result.getOpcode() != ISD::UNDEF) { + EVT ResultVT = Result.getValueType(); + SDValue ZeroIndex = DAG.getIntPtrConstant(0); + SDValue Undef = DAG.getUNDEF(ResultVT); + SDValue Vec256 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Undef, + Vec, ZeroIndex); + + // The blend instruction, and therefore its mask, depend on the data type. + MVT ScalarType = ResultVT.getScalarType().getSimpleVT(); + if (ScalarType.isFloatingPoint()) { + // Choose either vblendps (float) or vblendpd (double). + unsigned ScalarSize = ScalarType.getSizeInBits(); + assert((ScalarSize == 64 || ScalarSize == 32) && "Unknown float type"); + unsigned MaskVal = (ScalarSize == 64) ? 0x03 : 0x0f; + SDValue Mask = DAG.getConstant(MaskVal, MVT::i8); + return DAG.getNode(X86ISD::BLENDI, dl, ResultVT, Result, Vec256, Mask); + } + + const X86Subtarget &Subtarget = + static_cast<const X86Subtarget &>(DAG.getSubtarget()); + + // AVX2 is needed for 256-bit integer blend support. + // Integers must be cast to 32-bit because there is only vpblendd; + // vpblendw can't be used for this because it has a handicapped mask. + + // If we don't have AVX2, then cast to float. Using a wrong domain blend + // is still more efficient than using the wrong domain vinsertf128 that + // will be created by InsertSubVector(). + MVT CastVT = Subtarget.hasAVX2() ? 
MVT::v8i32 : MVT::v8f32; + + SDValue Mask = DAG.getConstant(0x0f, MVT::i8); + Vec256 = DAG.getNode(ISD::BITCAST, dl, CastVT, Vec256); + Vec256 = DAG.getNode(X86ISD::BLENDI, dl, CastVT, Result, Vec256, Mask); + return DAG.getNode(ISD::BITCAST, dl, ResultVT, Vec256); + } + + return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 128); +} + +static SDValue Insert256BitVector(SDValue Result, SDValue Vec, unsigned IdxVal, + SelectionDAG &DAG, SDLoc dl) { + assert(Vec.getValueType().is256BitVector() && "Unexpected vector size!"); + return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 256); +} + +/// Concat two 128-bit vectors into a 256 bit vector using VINSERTF128 +/// instructions. This is used because creating CONCAT_VECTOR nodes of +/// BUILD_VECTORS returns a larger BUILD_VECTOR while we're trying to lower +/// large BUILD_VECTORS. +static SDValue Concat128BitVectors(SDValue V1, SDValue V2, EVT VT, + unsigned NumElems, SelectionDAG &DAG, + SDLoc dl) { + SDValue V = Insert128BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl); + return Insert128BitVector(V, V2, NumElems/2, DAG, dl); +} + +static SDValue Concat256BitVectors(SDValue V1, SDValue V2, EVT VT, + unsigned NumElems, SelectionDAG &DAG, + SDLoc dl) { + SDValue V = Insert256BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl); + return Insert256BitVector(V, V2, NumElems/2, DAG, dl); +} + /// getOnesVector - Returns a vector of specified type with all bits set. /// Always build ones vectors as <4 x i32> or <8 x i32>. For 256-bit types with /// no AVX2 support, use two <4 x i32> inserted in a <8 x i32> appropriately. @@ -5567,8 +5571,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { return getOnesVector(VT, Subtarget->hasInt256(), DAG, dl); } - SDValue Broadcast = LowerVectorBroadcast(Op, Subtarget, DAG); - if (Broadcast.getNode()) + if (SDValue Broadcast = LowerVectorBroadcast(Op, Subtarget, DAG)) return Broadcast; unsigned EVTBits = ExtVT.getSizeInBits(); @@ -5635,12 +5638,13 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 || (ExtVT == MVT::i64 && Subtarget->is64Bit())) { - if (VT.is256BitVector() || VT.is512BitVector()) { + if (VT.is512BitVector()) { SDValue ZeroVec = getZeroVector(VT, Subtarget, DAG, dl); return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, ZeroVec, Item, DAG.getIntPtrConstant(0)); } - assert(VT.is128BitVector() && "Expected an SSE value type!"); + assert((VT.is128BitVector() || VT.is256BitVector()) && + "Expected an SSE value type!"); Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector. return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG); @@ -5742,24 +5746,20 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { } // If element VT is < 32 bits, convert it to inserts into a zero vector.
- if (EVTBits == 8 && NumElems == 16) { - SDValue V = LowerBuildVectorv16i8(Op, NonZeros,NumNonZero,NumZero, DAG, - Subtarget, *this); - if (V.getNode()) return V; - } + if (EVTBits == 8 && NumElems == 16) + if (SDValue V = LowerBuildVectorv16i8(Op, NonZeros,NumNonZero,NumZero, DAG, + Subtarget, *this)) + return V; - if (EVTBits == 16 && NumElems == 8) { - SDValue V = LowerBuildVectorv8i16(Op, NonZeros,NumNonZero,NumZero, DAG, - Subtarget, *this); - if (V.getNode()) return V; - } + if (EVTBits == 16 && NumElems == 8) + if (SDValue V = LowerBuildVectorv8i16(Op, NonZeros,NumNonZero,NumZero, DAG, + Subtarget, *this)) + return V; // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS - if (EVTBits == 32 && NumElems == 4) { - SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget, *this); - if (V.getNode()) + if (EVTBits == 32 && NumElems == 4) + if (SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget, *this)) return V; - } // If element VT is == 32 bits, turn it into a number of shuffles. SmallVector<SDValue, 8> V(NumElems); @@ -5807,13 +5807,11 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { V[i] = Op.getOperand(i); // Check for elements which are consecutive loads. - SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG, false); - if (LD.getNode()) + if (SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG, false)) return LD; // Check for a build vector from mostly shuffle plus few inserting. - SDValue Sh = buildFromShuffleMostly(Op, DAG); - if (Sh.getNode()) + if (SDValue Sh = buildFromShuffleMostly(Op, DAG)) return Sh; // For SSE 4.1, use insertps to put the high elements into the low element. @@ -5893,8 +5891,64 @@ static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) { return Concat256BitVectors(V1, V2, ResVT, NumElems, DAG, dl); } -static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) { - MVT LLVM_ATTRIBUTE_UNUSED VT = Op.getSimpleValueType(); +static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op, + const X86Subtarget *Subtarget, + SelectionDAG & DAG) { + SDLoc dl(Op); + MVT ResVT = Op.getSimpleValueType(); + unsigned NumOfOperands = Op.getNumOperands(); + + assert(isPowerOf2_32(NumOfOperands) && + "Unexpected number of operands in CONCAT_VECTORS"); + + if (NumOfOperands > 2) { + MVT HalfVT = MVT::getVectorVT(ResVT.getScalarType(), + ResVT.getVectorNumElements()/2); + SmallVector<SDValue, 2> Ops; + for (unsigned i = 0; i < NumOfOperands/2; i++) + Ops.push_back(Op.getOperand(i)); + SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, Ops); + Ops.clear(); + for (unsigned i = NumOfOperands/2; i < NumOfOperands; i++) + Ops.push_back(Op.getOperand(i)); + SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, Ops); + return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi); + } + + SDValue V1 = Op.getOperand(0); + SDValue V2 = Op.getOperand(1); + bool IsZeroV1 = ISD::isBuildVectorAllZeros(V1.getNode()); + bool IsZeroV2 = ISD::isBuildVectorAllZeros(V2.getNode()); + + if (IsZeroV1 && IsZeroV2) + return getZeroVector(ResVT, Subtarget, DAG, dl); + + SDValue ZeroIdx = DAG.getIntPtrConstant(0); + SDValue Undef = DAG.getUNDEF(ResVT); + unsigned NumElems = ResVT.getVectorNumElements(); + SDValue ShiftBits = DAG.getConstant(NumElems/2, MVT::i8); + + V2 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V2, ZeroIdx); + V2 = DAG.getNode(X86ISD::VSHLI, dl, ResVT, V2, ShiftBits); + if (IsZeroV1) + return V2; + + V1 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V1, ZeroIdx); + // Zero the upper bits of V1 + V1 
= DAG.getNode(X86ISD::VSHLI, dl, ResVT, V1, ShiftBits); + V1 = DAG.getNode(X86ISD::VSRLI, dl, ResVT, V1, ShiftBits); + if (IsZeroV2) + return V1; + return DAG.getNode(ISD::OR, dl, ResVT, V1, V2); +} + +static SDValue LowerCONCAT_VECTORS(SDValue Op, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + MVT VT = Op.getSimpleValueType(); + if (VT.getVectorElementType() == MVT::i1) + return LowerCONCAT_VECTORSvXi1(Op, Subtarget, DAG); + assert((VT.is256BitVector() && Op.getNumOperands() == 2) || (VT.is512BitVector() && (Op.getNumOperands() == 2 || Op.getNumOperands() == 4))); @@ -6935,8 +6989,8 @@ static SDValue lowerVectorShuffleAsBroadcast(SDLoc DL, MVT VT, SDValue V, "a sorted mask where the broadcast " "comes from V1."); - // Go up the chain of (vector) values to try and find a scalar load that - // we can combine with the broadcast. + // Go up the chain of (vector) values to find a scalar load that we can + // combine with the broadcast. for (;;) { switch (V.getOpcode()) { case ISD::CONCAT_VECTORS: { @@ -6973,12 +7027,12 @@ static SDValue lowerVectorShuffleAsBroadcast(SDLoc DL, MVT VT, SDValue V, (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0)) { V = V.getOperand(BroadcastIdx); - // If the scalar isn't a load we can't broadcast from it in AVX1, only with - // AVX2. + // If the scalar isn't a load, we can't broadcast from it in AVX1. + // Only AVX2 has register broadcasts. if (!Subtarget->hasAVX2() && !isShuffleFoldableLoad(V)) return SDValue(); } else if (BroadcastIdx != 0 || !Subtarget->hasAVX2()) { - // We can't broadcast from a vector register w/o AVX2, and we can only + // We can't broadcast from a vector register without AVX2, and we can only // broadcast from the zero-element of a vector register. return SDValue(); } @@ -7689,10 +7743,18 @@ static SDValue lowerV4I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, /// The exact breakdown of how to form these dword pairs and align them on the /// correct sides is really tricky. See the comments within the function for /// more of the details. +/// +/// This code also handles repeated 128-bit lanes of v8i16 shuffles, but each +/// lane must shuffle the *exact* same way. In fact, you must pass a v8 Mask to +/// this routine for it to work correctly. To shuffle a 256-bit or 512-bit i16 +/// vector, form the analogous 128-bit 8-element Mask. static SDValue lowerV8I16GeneralSingleInputVectorShuffle( - SDLoc DL, SDValue V, MutableArrayRef<int> Mask, + SDLoc DL, MVT VT, SDValue V, MutableArrayRef<int> Mask, const X86Subtarget *Subtarget, SelectionDAG &DAG) { - assert(V.getSimpleValueType() == MVT::v8i16 && "Bad input type!"); + assert(VT.getScalarType() == MVT::i16 && "Bad input type!"); + MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2); + + assert(Mask.size() == 8 && "Shuffle mask length doesn't match!"); MutableArrayRef<int> LoMask = Mask.slice(0, 4); MutableArrayRef<int> HiMask = Mask.slice(4, 4); @@ -7845,9 +7907,9 @@ static SDValue lowerV8I16GeneralSingleInputVectorShuffle( int PSHUFDMask[] = {0, 1, 2, 3}; PSHUFDMask[ADWord] = BDWord; PSHUFDMask[BDWord] = ADWord; - V = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, - DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, - DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V), + V = DAG.getNode(ISD::BITCAST, DL, VT, + DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, + DAG.getNode(ISD::BITCAST, DL, PSHUFDVT, V), getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG))); // Adjust the mask to match the new locations of A and B.
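Every getV4X86ShuffleImm8ForMask call above boils a four-element mask down to the single imm8 that PSHUFD/PSHUFLW/PSHUFHW consume: result element i takes source element (imm >> 2*i) & 3. A minimal standalone sketch of that encoding, with undef lanes arbitrarily mapped to 0 (the in-tree helper may choose differently):

#include <cassert>
#include <cstdio>

// Standalone model of the PSHUFD-style imm8 encoding: two bits per result
// element, with element i's selector stored at bits [2*i+1 : 2*i].
static unsigned shuffleImm8(const int Mask[4]) {
  unsigned Imm = 0;
  for (int i = 0; i != 4; ++i) {
    int M = Mask[i] < 0 ? 0 : Mask[i]; // map undef (-1) to 0, an assumption
    assert(M < 4 && "PSHUFD-style masks only address four source elements");
    Imm |= unsigned(M) << (2 * i);
  }
  return Imm;
}

int main() {
  // The dword swap above with ADWord = 1, BDWord = 2 turns the identity mask
  // {0, 1, 2, 3} into {0, 2, 1, 3}, which encodes as imm8 = 0xD8.
  const int Swap[4] = {0, 2, 1, 3};
  printf("imm8 = 0x%02X\n", shuffleImm8(Swap));
  return 0;
}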
@@ -7859,8 +7921,8 @@ static SDValue lowerV8I16GeneralSingleInputVectorShuffle( // Recurse back into this routine to re-compute state now that this isn't // a 3 and 1 problem. - return DAG.getVectorShuffle(MVT::v8i16, DL, V, DAG.getUNDEF(MVT::v8i16), - Mask); + return lowerV8I16GeneralSingleInputVectorShuffle(DL, VT, V, Mask, Subtarget, + DAG); }; if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3)) return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4); @@ -8083,15 +8145,15 @@ static SDValue lowerV8I16GeneralSingleInputVectorShuffle( // Now enact all the shuffles we've computed to move the inputs into their // target half. if (!isNoopShuffleMask(PSHUFLMask)) - V = DAG.getNode(X86ISD::PSHUFLW, DL, MVT::v8i16, V, + V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V, getV4X86ShuffleImm8ForMask(PSHUFLMask, DAG)); if (!isNoopShuffleMask(PSHUFHMask)) - V = DAG.getNode(X86ISD::PSHUFHW, DL, MVT::v8i16, V, + V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V, getV4X86ShuffleImm8ForMask(PSHUFHMask, DAG)); if (!isNoopShuffleMask(PSHUFDMask)) - V = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, - DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, - DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V), + V = DAG.getNode(ISD::BITCAST, DL, VT, + DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, + DAG.getNode(ISD::BITCAST, DL, PSHUFDVT, V), getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG))); // At this point, each half should contain all its inputs, and we can then @@ -8105,7 +8167,7 @@ static SDValue lowerV8I16GeneralSingleInputVectorShuffle( // Do a half shuffle for the low mask. if (!isNoopShuffleMask(LoMask)) - V = DAG.getNode(X86ISD::PSHUFLW, DL, MVT::v8i16, V, + V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V, getV4X86ShuffleImm8ForMask(LoMask, DAG)); // Do a half shuffle with the high mask after shifting its values down. @@ -8113,7 +8175,7 @@ static SDValue lowerV8I16GeneralSingleInputVectorShuffle( if (M >= 0) M -= 4; if (!isNoopShuffleMask(HiMask)) - V = DAG.getNode(X86ISD::PSHUFHW, DL, MVT::v8i16, V, + V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V, getV4X86ShuffleImm8ForMask(HiMask, DAG)); return V; @@ -8232,8 +8294,8 @@ static SDValue lowerV8I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2, Mask, Subtarget, DAG)) return Rotate; - return lowerV8I16GeneralSingleInputVectorShuffle(DL, V1, Mask, Subtarget, - DAG); + return lowerV8I16GeneralSingleInputVectorShuffle(DL, MVT::v8i16, V1, Mask, + Subtarget, DAG); } assert(std::any_of(Mask.begin(), Mask.end(), isV1) && @@ -8946,7 +9008,7 @@ static SDValue lowerVectorShuffleAsLanePermuteAndBlend(SDLoc DL, MVT VT, int LaneSize = Mask.size() / 2; // If there are only inputs from one 128-bit lane, splitting will in fact be - // less expensive. The flags track wether the given lane contains an element + // less expensive. The flags track whether the given lane contains an element // that crosses to another lane. bool LaneCrossing[2] = {false, false}; for (int i = 0, Size = Mask.size(); i < Size; ++i) @@ -8986,34 +9048,78 @@ static SDValue lowerV2X128VectorShuffle(SDLoc DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask, const X86Subtarget *Subtarget, SelectionDAG &DAG) { + // TODO: If minimizing size and one of the inputs is a zero vector and + // the zero vector has only one use, we could use a VPERM2X128 to save the + // instruction bytes needed to explicitly generate the zero vector. + // Blends are faster and handle all the non-lane-crossing cases.
if (SDValue Blend = lowerVectorShuffleAsBlend(DL, VT, V1, V2, Mask, Subtarget, DAG)) return Blend; - MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), - VT.getVectorNumElements() / 2); - // Check for patterns which can be matched with a single insert of a 128-bit - // subvector. - if (isShuffleEquivalent(V1, V2, Mask, {0, 1, 0, 1}) || - isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5})) { - SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1, - DAG.getIntPtrConstant(0)); - SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, - Mask[2] < 4 ? V1 : V2, DAG.getIntPtrConstant(0)); - return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV); - } - if (isShuffleEquivalent(V1, V2, Mask, {0, 1, 6, 7})) { - SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1, - DAG.getIntPtrConstant(0)); - SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2, - DAG.getIntPtrConstant(2)); - return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV); + bool IsV1Zero = ISD::isBuildVectorAllZeros(V1.getNode()); + bool IsV2Zero = ISD::isBuildVectorAllZeros(V2.getNode()); + + // If either input operand is a zero vector, use VPERM2X128 because its mask + // allows us to replace the zero input with an implicit zero. + if (!IsV1Zero && !IsV2Zero) { + // Check for patterns which can be matched with a single insert of a 128-bit + // subvector. + bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask, {0, 1, 0, 1}); + if (OnlyUsesV1 || isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5})) { + MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), + VT.getVectorNumElements() / 2); + SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1, + DAG.getIntPtrConstant(0)); + SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, + OnlyUsesV1 ? V1 : V2, DAG.getIntPtrConstant(0)); + return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV); + } + } + + // Otherwise form a 128-bit permutation. After accounting for undefs, + // convert the 64-bit shuffle mask selection values into 128-bit + // selection bits by dividing the indexes by 2 and shifting into positions + // defined by a vperm2*128 instruction's immediate control byte. + + // The immediate permute control byte looks like this: + // [1:0] - select 128 bits from sources for low half of destination + // [2] - ignore + // [3] - zero low half of destination + // [5:4] - select 128 bits from sources for high half of destination + // [6] - ignore + // [7] - zero high half of destination + + int MaskLO = Mask[0]; + if (MaskLO == SM_SentinelUndef) + MaskLO = Mask[1] == SM_SentinelUndef ? 0 : Mask[1]; + + int MaskHI = Mask[2]; + if (MaskHI == SM_SentinelUndef) + MaskHI = Mask[3] == SM_SentinelUndef ? 0 : Mask[3]; + + unsigned PermMask = MaskLO / 2 | (MaskHI / 2) << 4; + + // If either input is a zero vector, replace it with an undef input. + // Shuffle mask values < 4 are selecting elements of V1. + // Shuffle mask values >= 4 are selecting elements of V2. + // Adjust each half of the permute mask by clearing the half that was + // selecting the zero vector and setting the zero mask bit. + if (IsV1Zero) { + V1 = DAG.getUNDEF(VT); + if (MaskLO < 4) + PermMask = (PermMask & 0xf0) | 0x08; + if (MaskHI < 4) + PermMask = (PermMask & 0x0f) | 0x80; + } + if (IsV2Zero) { + V2 = DAG.getUNDEF(VT); + if (MaskLO >= 4) + PermMask = (PermMask & 0xf0) | 0x08; + if (MaskHI >= 4) + PermMask = (PermMask & 0x0f) | 0x80; } - // Otherwise form a 128-bit permutation. - // FIXME: Detect zero-vector inputs and use the VPERM2X128 to zero that half. 
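The control byte assembled above can be sanity-checked in isolation. The sketch below mirrors the same arithmetic: each 128-bit selector is divided by two and placed in bits [1:0] or [5:4], and bit 3 or bit 7 is set to fold an all-zeroes input into the immediate. It is a standalone model with -1 standing in for SM_SentinelUndef, not the DAG code itself:

#include <cstdio>

// Mask holds the four 64-bit-element selectors of a v2x128 shuffle:
// 0-3 pick from V1, 4-7 from V2, -1 is undef.
static unsigned vperm2x128Imm(const int Mask[4], bool IsV1Zero, bool IsV2Zero) {
  int MaskLO = Mask[0] >= 0 ? Mask[0] : (Mask[1] >= 0 ? Mask[1] : 0);
  int MaskHI = Mask[2] >= 0 ? Mask[2] : (Mask[3] >= 0 ? Mask[3] : 0);
  unsigned PermMask = MaskLO / 2 | (MaskHI / 2) << 4;
  if (IsV1Zero) { // V1 became undef; zero whichever half selected it
    if (MaskLO < 4) PermMask = (PermMask & 0xf0) | 0x08;
    if (MaskHI < 4) PermMask = (PermMask & 0x0f) | 0x80;
  }
  if (IsV2Zero) { // likewise for V2
    if (MaskLO >= 4) PermMask = (PermMask & 0xf0) | 0x08;
    if (MaskHI >= 4) PermMask = (PermMask & 0x0f) | 0x80;
  }
  return PermMask;
}

int main() {
  // <0,1,6,7>: low half of V1, high half of V2 -> imm = 0x30.
  const int M[4] = {0, 1, 6, 7};
  printf("imm = 0x%02X\n", vperm2x128Imm(M, false, false));
  // Same mask when V2 is all zeroes: the high half is zeroed -> imm = 0x80.
  printf("imm = 0x%02X\n", vperm2x128Imm(M, false, true));
  return 0;
}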
- unsigned PermMask = Mask[0] / 2 | (Mask[2] / 2) << 4; return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2, DAG.getConstant(PermMask, MVT::i8)); } @@ -9326,6 +9432,15 @@ static SDValue lowerV8F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, ArrayRef<int> Mask = SVOp->getMask(); assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!"); + // If we have a single input to the zero element, insert that into V1 if we + // can do so cheaply. + int NumV2Elements = + std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 8; }); + if (NumV2Elements == 1 && Mask[0] >= 8) + if (SDValue Insertion = lowerVectorShuffleAsElementInsertion( + DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG)) + return Insertion; + if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG)) return Blend; @@ -9557,6 +9672,15 @@ static SDValue lowerV16I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2, return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v16i16, V1, V2, Mask, DAG); + SmallVector<int, 8> RepeatedMask; + if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) { + // As this is a single-input shuffle, the repeated mask should be + // a strictly valid v8i16 mask that we can pass through to the v8i16 + // lowering to handle even the v16 case. + return lowerV8I16GeneralSingleInputVectorShuffle( + DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG); + } + SDValue PSHUFBMask[32]; for (int i = 0; i < 16; ++i) { if (Mask[i] == -1) { @@ -10118,8 +10242,7 @@ SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const { // Try to lower this to a blend-style vector shuffle. This can handle all // constant condition cases. - SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG); - if (BlendOp.getNode()) + if (SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG)) return BlendOp; // Variable blends are only legal from SSE4.1 onward. @@ -10421,17 +10544,31 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, // If the vector is wider than 128 bits, extract the 128-bit subvector, insert // into that, and then insert the subvector back into the result. if (VT.is256BitVector() || VT.is512BitVector()) { - // Get the desired 128-bit vector half. + // With a 256-bit vector, we can insert into the zero element efficiently + // using a blend if we have AVX or AVX2 and the right data type. + if (VT.is256BitVector() && IdxVal == 0) { + // TODO: It is worthwhile to cast integer to floating point and back + // and incur a domain crossing penalty if that's what we'll end up + // doing anyway after extracting to a 128-bit vector. + if ((Subtarget->hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) || + (Subtarget->hasAVX2() && EltVT == MVT::i32)) { + SDValue N1Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1); + N2 = DAG.getIntPtrConstant(1); + return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec, N2); + } + } + + // Get the desired 128-bit vector chunk. SDValue V = Extract128BitVector(N0, IdxVal, DAG, dl); - // Insert the element into the desired half. + // Insert the element into the desired chunk. 
unsigned NumEltsIn128 = 128 / EltVT.getSizeInBits(); unsigned IdxIn128 = IdxVal - (IdxVal / NumEltsIn128) * NumEltsIn128; V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1, DAG.getConstant(IdxIn128, MVT::i32)); - // Insert the changed part back to the 256-bit vector + // Insert the changed part back into the bigger vector return Insert128BitVector(N0, V, IdxVal, DAG, dl); } assert(VT.is128BitVector() && "Only 128-bit vector types should be left!"); @@ -10456,16 +10593,29 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, } if (EltVT == MVT::f32) { - // Bits [7:6] of the constant are the source select. This will always be - // zero here. The DAG Combiner may combine an extract_elt index into - // these - // bits. For example (insert (extract, 3), 2) could be matched by - // putting - // the '3' into bits [7:6] of X86ISD::INSERTPS. - // Bits [5:4] of the constant are the destination select. This is the - // value of the incoming immediate. - // Bits [3:0] of the constant are the zero mask. The DAG Combiner may + // Bits [7:6] of the constant are the source select. This will always be + // zero here. The DAG Combiner may combine an extract_elt index into + // these bits. For example (insert (extract, 3), 2) could be matched by + // putting the '3' into bits [7:6] of X86ISD::INSERTPS. + // Bits [5:4] of the constant are the destination select. This is the + // value of the incoming immediate. + // Bits [3:0] of the constant are the zero mask. The DAG Combiner may // combine either bitwise AND or insert of float 0.0 to set these bits. + + const Function *F = DAG.getMachineFunction().getFunction(); + bool MinSize = F->hasFnAttribute(Attribute::MinSize); + if (IdxVal == 0 && (!MinSize || !MayFoldLoad(N1))) { + // If this is an insertion of 32-bits into the low 32-bits of + // a vector, we prefer to generate a blend with immediate rather + // than an insertps. Blends are simpler operations in hardware and so + // will always have equal or better performance than insertps. + // But if optimizing for size and there's a load folding opportunity, + // generate insertps because blendps does not have a 32-bit memory + // operand form. + N2 = DAG.getIntPtrConstant(1); + N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1); + return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1, N2); + } N2 = DAG.getIntPtrConstant(IdxVal << 4); // Create this as a scalar to vector.. 
N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1); @@ -10593,6 +10743,37 @@ static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget, if (OpVT.is512BitVector() && SubVecVT.is256BitVector()) return Insert256BitVector(Vec, SubVec, IdxVal, DAG, dl); + if (OpVT.getVectorElementType() == MVT::i1) { + if (IdxVal == 0 && Vec.getOpcode() == ISD::UNDEF) // the operation is legal + return Op; + SDValue ZeroIdx = DAG.getIntPtrConstant(0); + SDValue Undef = DAG.getUNDEF(OpVT); + unsigned NumElems = OpVT.getVectorNumElements(); + SDValue ShiftBits = DAG.getConstant(NumElems/2, MVT::i8); + + if (IdxVal == OpVT.getVectorNumElements() / 2) { + // Zero upper bits of the Vec + Vec = DAG.getNode(X86ISD::VSHLI, dl, OpVT, Vec, ShiftBits); + Vec = DAG.getNode(X86ISD::VSRLI, dl, OpVT, Vec, ShiftBits); + + SDValue Vec2 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Undef, + SubVec, ZeroIdx); + Vec2 = DAG.getNode(X86ISD::VSHLI, dl, OpVT, Vec2, ShiftBits); + return DAG.getNode(ISD::OR, dl, OpVT, Vec, Vec2); + } + if (IdxVal == 0) { + SDValue Vec2 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Undef, + SubVec, ZeroIdx); + // Zero upper bits of the Vec2 + Vec2 = DAG.getNode(X86ISD::VSHLI, dl, OpVT, Vec2, ShiftBits); + Vec2 = DAG.getNode(X86ISD::VSRLI, dl, OpVT, Vec2, ShiftBits); + // Zero lower bits of the Vec + Vec = DAG.getNode(X86ISD::VSRLI, dl, OpVT, Vec, ShiftBits); + Vec = DAG.getNode(X86ISD::VSHLI, dl, OpVT, Vec, ShiftBits); + // Merge them together + return DAG.getNode(ISD::OR, dl, OpVT, Vec, Vec2); + } + } return SDValue(); } @@ -13149,9 +13330,9 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op1.getValueType(); SDValue CC; - // Lower fp selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops - // are available. Otherwise fp cmovs get lowered into a less efficient branch - // sequence later on. + // Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops + // are available or VBLENDV if AVX is available. + // Otherwise FP cmovs get lowered into a less efficient branch sequence later. if (Cond.getOpcode() == ISD::SETCC && ((Subtarget->hasSSE2() && (VT == MVT::f32 || VT == MVT::f64)) || (Subtarget->hasSSE1() && VT == MVT::f32)) && @@ -13166,8 +13347,42 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { DAG.getConstant(SSECC, MVT::i8)); return DAG.getNode(X86ISD::SELECT, DL, VT, Cmp, Op1, Op2); } + SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1, DAG.getConstant(SSECC, MVT::i8)); + + // If we have AVX, we can use a variable vector select (VBLENDV) instead + // of 3 logic instructions for size savings and potentially speed. + // Unfortunately, there is no scalar form of VBLENDV. + + // If either operand is a constant, don't try this. We can expect to + // optimize away at least one of the logic instructions later in that + // case, so that sequence would be faster than a variable blend. + + // BLENDV was introduced with SSE 4.1, but the 2 register form implicitly + // uses XMM0 as the selection register. That may need just as many + // instructions as the AND/ANDN/OR sequence due to register moves, so + // don't bother. + + if (Subtarget->hasAVX() && + !isa<ConstantFPSDNode>(Op1) && !isa<ConstantFPSDNode>(Op2)) { + + // Convert to vectors, do a VSELECT, and convert back to scalar. + // All of the conversions should be optimized away. + + EVT VecVT = VT == MVT::f32 ? 
MVT::v4f32 : MVT::v2f64; + SDValue VOp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op1); + SDValue VOp2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op2); + SDValue VCmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Cmp); + + EVT VCmpVT = VT == MVT::f32 ? MVT::v4i32 : MVT::v2i64; + VCmp = DAG.getNode(ISD::BITCAST, DL, VCmpVT, VCmp); + + SDValue VSel = DAG.getNode(ISD::VSELECT, DL, VecVT, VCmp, VOp1, VOp2); + + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, + VSel, DAG.getIntPtrConstant(0)); + } SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2); SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1); return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And); @@ -14595,6 +14810,13 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget switch (IntNo) { default: return SDValue(); // Don't custom lower most intrinsics. + case Intrinsic::x86_avx2_permd: + case Intrinsic::x86_avx2_permps: + // Operands intentionally swapped. Mask is last operand to intrinsic, + // but second operand for node/instruction. + return DAG.getNode(X86ISD::VPERMV, dl, Op.getValueType(), + Op.getOperand(2), Op.getOperand(1)); + case Intrinsic::x86_avx512_mask_valign_q_512: case Intrinsic::x86_avx512_mask_valign_d_512: // Vector source operands are swapped. @@ -16039,21 +16261,19 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget, SDLoc dl(Op); SDValue R = Op.getOperand(0); SDValue Amt = Op.getOperand(1); - SDValue V; assert(VT.isVector() && "Custom lowering only for vector shifts!"); assert(Subtarget->hasSSE2() && "Only custom lower when we have SSE2!"); - V = LowerScalarImmediateShift(Op, DAG, Subtarget); - if (V.getNode()) + if (SDValue V = LowerScalarImmediateShift(Op, DAG, Subtarget)) return V; - V = LowerScalarVariableShift(Op, DAG, Subtarget); - if (V.getNode()) + if (SDValue V = LowerScalarVariableShift(Op, DAG, Subtarget)) return V; if (Subtarget->hasAVX512() && (VT == MVT::v16i32 || VT == MVT::v8i64)) return Op; + // AVX2 has VPSLLV/VPSRAV/VPSRLV. if (Subtarget->hasInt256()) { if (Op.getOpcode() == ISD::SRL && @@ -16068,6 +16288,17 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget, return Op; } + // 2i64 vector logical shifts can efficiently avoid scalarization - do the + // shifts per-lane and then shuffle the partial results back together. + if (VT == MVT::v2i64 && Op.getOpcode() != ISD::SRA) { + // Splat the shift amounts so the scalar shifts above will catch it. + SDValue Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {0, 0}); + SDValue Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {1, 1}); + SDValue R0 = DAG.getNode(Op->getOpcode(), dl, VT, R, Amt0); + SDValue R1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Amt1); + return DAG.getVectorShuffle(VT, dl, R0, R1, {0, 3}); + } + // If possible, lower this packed shift into a vector multiply instead of // expanding it into a sequence of scalar shifts. // Do this only if the vector shift count is a constant build_vector. @@ -16238,7 +16469,7 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget, Amt = DAG.getNode(ISD::ANY_EXTEND, dl, NewVT, Amt); return DAG.getNode(ISD::TRUNCATE, dl, VT, DAG.getNode(Op.getOpcode(), dl, NewVT, R, Amt)); - } + } // Decompose 256-bit shifts into smaller 128-bit shifts. 
if (VT.is256BitVector()) { @@ -16254,12 +16485,9 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget, SDValue Amt1, Amt2; if (Amt.getOpcode() == ISD::BUILD_VECTOR) { // Constant shift amount - SmallVector<SDValue, 4> Amt1Csts; - SmallVector<SDValue, 4> Amt2Csts; - for (unsigned i = 0; i != NumElems/2; ++i) - Amt1Csts.push_back(Amt->getOperand(i)); - for (unsigned i = NumElems/2; i != NumElems; ++i) - Amt2Csts.push_back(Amt->getOperand(i)); + SmallVector<SDValue, 8> Ops(Amt->op_begin(), Amt->op_begin() + NumElems); + ArrayRef<SDValue> Amt1Csts = makeArrayRef(Ops).slice(0, NumElems / 2); + ArrayRef<SDValue> Amt2Csts = makeArrayRef(Ops).slice(NumElems / 2); Amt1 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, Amt1Csts); Amt2 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, Amt2Csts); @@ -16386,14 +16614,17 @@ bool X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const { return needsCmpXchgNb(PTy->getElementType()); } -bool X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { +TargetLoweringBase::AtomicRMWExpansionKind +X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { unsigned NativeWidth = Subtarget->is64Bit() ? 64 : 32; const Type *MemType = AI->getType(); // If the operand is too big, we must see if cmpxchg8/16b is available // and default to library calls otherwise. - if (MemType->getPrimitiveSizeInBits() > NativeWidth) - return needsCmpXchgNb(MemType); + if (MemType->getPrimitiveSizeInBits() > NativeWidth) { + return needsCmpXchgNb(MemType) ? AtomicRMWExpansionKind::CmpXChg + : AtomicRMWExpansionKind::None; + } AtomicRMWInst::BinOp Op = AI->getOperation(); switch (Op) { @@ -16403,13 +16634,14 @@ bool X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { case AtomicRMWInst::Add: case AtomicRMWInst::Sub: // It's better to use xadd, xsub or xchg for these in all cases. - return false; + return AtomicRMWExpansionKind::None; case AtomicRMWInst::Or: case AtomicRMWInst::And: case AtomicRMWInst::Xor: // If the atomicrmw's result isn't actually used, we can just add a "lock" // prefix to a normal instruction for these operations. - return !AI->use_empty(); + return !AI->use_empty() ? AtomicRMWExpansionKind::CmpXChg + : AtomicRMWExpansionKind::None; case AtomicRMWInst::Nand: case AtomicRMWInst::Max: case AtomicRMWInst::Min: @@ -16417,7 +16649,7 @@ bool X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { case AtomicRMWInst::UMin: // These always require a non-trivial set of data operations on x86. We must // use a cmpxchg loop. 
- return true; + return AtomicRMWExpansionKind::CmpXChg; } } @@ -16874,7 +17106,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::ATOMIC_LOAD_SUB: return LowerLOAD_SUB(Op,DAG); case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op,DAG); case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG); - case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG); + case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, Subtarget, DAG); case ISD::VECTOR_SHUFFLE: return lowerVectorShuffle(Op, Subtarget, DAG); case ISD::VSELECT: return LowerVSELECT(Op, DAG); case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG); @@ -17719,7 +17951,8 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr *MI, // 9 ) EFLAGS (implicit-def) assert(MI->getNumOperands() == 10 && "VAARG_64 should have 10 operands!"); - assert(X86::AddrNumOperands == 5 && "VAARG_64 assumes 5 address operands"); + static_assert(X86::AddrNumOperands == 5, + "VAARG_64 assumes 5 address operands"); unsigned DestReg = MI->getOperand(0).getReg(); MachineOperand &Base = MI->getOperand(1); @@ -18095,6 +18328,92 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI, // fallthrough --> copy0MBB MachineBasicBlock *thisMBB = BB; MachineFunction *F = BB->getParent(); + + // We also lower double CMOVs: + // (CMOV (CMOV F, T, cc1), T, cc2) + // to two successive branches. For that, we look for another CMOV as the + // following instruction. + // + // Without this, we would add a PHI between the two jumps, which ends up + // creating a few copies all around. For instance, for + // + // (sitofp (zext (fcmp une))) + // + // we would generate: + // + // ucomiss %xmm1, %xmm0 + // movss <1.0f>, %xmm0 + // movaps %xmm0, %xmm1 + // jne .LBB5_2 + // xorps %xmm1, %xmm1 + // .LBB5_2: + // jp .LBB5_4 + // movaps %xmm1, %xmm0 + // .LBB5_4: + // retq + // + // because this custom-inserter would have generated: + // + // A + // | \ + // | B + // | / + // C + // | \ + // | D + // | / + // E + // + // A: X = ...; Y = ... + // B: empty + // C: Z = PHI [X, A], [Y, B] + // D: empty + // E: PHI [X, C], [Z, D] + // + // If we lower both CMOVs in a single step, we can instead generate: + // + // A + // | \ + // | C + // | /| + // |/ | + // | | + // | D + // | / + // E + // + // A: X = ...; Y = ... + // D: empty + // E: PHI [X, A], [X, C], [Y, D] + // + // Which, in our sitofp/fcmp example, gives us something like: + // + // ucomiss %xmm1, %xmm0 + // movss <1.0f>, %xmm0 + // jne .LBB5_4 + // jp .LBB5_4 + // xorps %xmm0, %xmm0 + // .LBB5_4: + // retq + // + MachineInstr *NextCMOV = nullptr; + MachineBasicBlock::iterator NextMIIt = + std::next(MachineBasicBlock::iterator(MI)); + if (NextMIIt != BB->end() && NextMIIt->getOpcode() == MI->getOpcode() && + NextMIIt->getOperand(2).getReg() == MI->getOperand(2).getReg() && + NextMIIt->getOperand(1).getReg() == MI->getOperand(0).getReg()) + NextCMOV = &*NextMIIt; + + MachineBasicBlock *jcc1MBB = nullptr; + + // If we have a double CMOV, we lower it to two successive branches to + // the same block. EFLAGS is used by both, so mark it as live in the second.
+ if (NextCMOV) { + jcc1MBB = F->CreateMachineBasicBlock(LLVM_BB); + F->insert(It, jcc1MBB); + jcc1MBB->addLiveIn(X86::EFLAGS); + } + MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB); F->insert(It, copy0MBB); @@ -18103,8 +18422,10 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI, // If the EFLAGS register isn't dead in the terminator, then claim that it's // live into the sink and copy blocks. const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo(); - if (!MI->killsRegister(X86::EFLAGS) && - !checkAndUpdateEFLAGSKill(MI, BB, TRI)) { + + MachineInstr *LastEFLAGSUser = NextCMOV ? NextCMOV : MI; + if (!LastEFLAGSUser->killsRegister(X86::EFLAGS) && + !checkAndUpdateEFLAGSKill(LastEFLAGSUser, BB, TRI)) { copy0MBB->addLiveIn(X86::EFLAGS); sinkMBB->addLiveIn(X86::EFLAGS); } @@ -18115,7 +18436,19 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI, sinkMBB->transferSuccessorsAndUpdatePHIs(BB); // Add the true and fallthrough blocks as its successors. - BB->addSuccessor(copy0MBB); + if (NextCMOV) { + // The fallthrough block may be jcc1MBB, if we have a double CMOV. + BB->addSuccessor(jcc1MBB); + + // In that case, jcc1MBB will itself fall through to copy0MBB, and + // jump to the sinkMBB. + jcc1MBB->addSuccessor(copy0MBB); + jcc1MBB->addSuccessor(sinkMBB); + } else { + BB->addSuccessor(copy0MBB); + } + + // The true block target of the first (or only) branch is always sinkMBB. BB->addSuccessor(sinkMBB); // Create the conditional branch instruction. @@ -18123,6 +18456,12 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI, X86::GetCondBranchFromCond((X86::CondCode)MI->getOperand(3).getImm()); BuildMI(BB, DL, TII->get(Opc)).addMBB(sinkMBB); + if (NextCMOV) { + unsigned Opc2 = X86::GetCondBranchFromCond( + (X86::CondCode)NextCMOV->getOperand(3).getImm()); + BuildMI(jcc1MBB, DL, TII->get(Opc2)).addMBB(sinkMBB); + } + // copy0MBB: // %FalseValue = ... // # fallthrough to sinkMBB @@ -18131,10 +18470,22 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI, // sinkMBB: // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ] // ... - BuildMI(*sinkMBB, sinkMBB->begin(), DL, - TII->get(X86::PHI), MI->getOperand(0).getReg()) - .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB) - .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB); + MachineInstrBuilder MIB = + BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII->get(X86::PHI), + MI->getOperand(0).getReg()) + .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB) + .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB); + + // If we have a double CMOV, the second Jcc provides the same incoming + // value as the first Jcc (the True operand of the SELECT_CC/CMOV nodes). + if (NextCMOV) { + MIB.addReg(MI->getOperand(2).getReg()).addMBB(jcc1MBB); + // Copy the PHI result to the register defined by the second CMOV. + BuildMI(*sinkMBB, std::next(MachineBasicBlock::iterator(MIB.getInstr())), + DL, TII->get(TargetOpcode::COPY), NextCMOV->getOperand(0).getReg()) + .addReg(MI->getOperand(0).getReg()); + NextCMOV->eraseFromParent(); + } MI->eraseFromParent(); // The pseudo instruction is gone now. return sinkMBB; @@ -18218,7 +18569,7 @@ X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI, // Calls into a routine in libgcc to allocate more space from the heap.
const uint32_t *RegMask = - Subtarget->getRegisterInfo()->getCallPreservedMask(CallingConv::C); + Subtarget->getRegisterInfo()->getCallPreservedMask(*MF, CallingConv::C); if (IsLP64) { BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI) .addReg(sizeVReg); @@ -18303,7 +18654,7 @@ X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI, // FIXME: The 32-bit calls have non-standard calling conventions. Use a // proper register mask. const uint32_t *RegMask = - Subtarget->getRegisterInfo()->getCallPreservedMask(CallingConv::C); + Subtarget->getRegisterInfo()->getCallPreservedMask(*F, CallingConv::C); if (Subtarget->is64Bit()) { MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(X86::MOV64rm), X86::RDI) @@ -19132,9 +19483,11 @@ static bool combineX86ShuffleChain(SDValue Op, SDValue Root, ArrayRef<int> Mask, // Note that even with AVX we prefer the PSHUFD form of shuffle for integer // vectors because it can have a load folded into it that UNPCK cannot. This // doesn't preclude something switching to the shorter encoding post-RA. - if (FloatDomain) { - if (Mask.equals(0, 0) || Mask.equals(1, 1)) { - bool Lo = Mask.equals(0, 0); + // + // FIXME: Should teach these routines about AVX vector widths. + if (FloatDomain && VT.getSizeInBits() == 128) { + if (Mask.equals({0, 0}) || Mask.equals({1, 1})) { + bool Lo = Mask.equals({0, 0}); unsigned Shuffle; MVT ShuffleVT; // Check if we have SSE3 which will let us use MOVDDUP. That instruction @@ -19163,8 +19516,8 @@ static bool combineX86ShuffleChain(SDValue Op, SDValue Root, ArrayRef<int> Mask, return true; } if (Subtarget->hasSSE3() && - (Mask.equals(0, 0, 2, 2) || Mask.equals(1, 1, 3, 3))) { - bool Lo = Mask.equals(0, 0, 2, 2); + (Mask.equals({0, 0, 2, 2}) || Mask.equals({1, 1, 3, 3}))) { + bool Lo = Mask.equals({0, 0, 2, 2}); unsigned Shuffle = Lo ? X86ISD::MOVSLDUP : X86ISD::MOVSHDUP; MVT ShuffleVT = MVT::v4f32; if (Depth == 1 && Root->getOpcode() == Shuffle) @@ -19177,8 +19530,8 @@ static bool combineX86ShuffleChain(SDValue Op, SDValue Root, ArrayRef<int> Mask, /*AddTo*/ true); return true; } - if (Mask.equals(0, 0, 1, 1) || Mask.equals(2, 2, 3, 3)) { - bool Lo = Mask.equals(0, 0, 1, 1); + if (Mask.equals({0, 0, 1, 1}) || Mask.equals({2, 2, 3, 3})) { + bool Lo = Mask.equals({0, 0, 1, 1}); unsigned Shuffle = Lo ? X86ISD::UNPCKL : X86ISD::UNPCKH; MVT ShuffleVT = MVT::v4f32; if (Depth == 1 && Root->getOpcode() == Shuffle) @@ -19196,12 +19549,12 @@ static bool combineX86ShuffleChain(SDValue Op, SDValue Root, ArrayRef<int> Mask, // We always canonicalize the 8 x i16 and 16 x i8 shuffles into their UNPCK // variants as none of these have single-instruction variants that are // superior to the UNPCK formulation. - if (!FloatDomain && - (Mask.equals(0, 0, 1, 1, 2, 2, 3, 3) || - Mask.equals(4, 4, 5, 5, 6, 6, 7, 7) || - Mask.equals(0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7) || - Mask.equals(8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, - 15))) { + if (!FloatDomain && VT.getSizeInBits() == 128 && + (Mask.equals({0, 0, 1, 1, 2, 2, 3, 3}) || + Mask.equals({4, 4, 5, 5, 6, 6, 7, 7}) || + Mask.equals({0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7}) || + Mask.equals( + {8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15}))) { bool Lo = Mask[0] == 0; unsigned Shuffle = Lo ? X86ISD::UNPCKL : X86ISD::UNPCKH; if (Depth == 1 && Root->getOpcode() == Shuffle) @@ -19237,9 +19590,9 @@ static bool combineX86ShuffleChain(SDValue Op, SDValue Root, ArrayRef<int> Mask, // in practice PSHUFB tends to be *very* fast so we're more aggressive. 
if ((Depth >= 3 || HasPSHUFB) && Subtarget->hasSSSE3()) { SmallVector<SDValue, 16> PSHUFBMask; - assert(Mask.size() <= 16 && "Can't shuffle elements smaller than bytes!"); - int Ratio = 16 / Mask.size(); - for (unsigned i = 0; i < 16; ++i) { + int NumBytes = VT.getSizeInBits() / 8; + int Ratio = NumBytes / Mask.size(); + for (int i = 0; i < NumBytes; ++i) { if (Mask[i / Ratio] == SM_SentinelUndef) { PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8)); continue; } @@ -19249,12 +19602,13 @@ static bool combineX86ShuffleChain(SDValue Op, SDValue Root, ArrayRef<int> Mask, : 255; PSHUFBMask.push_back(DAG.getConstant(M, MVT::i8)); } - Op = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Input); + MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes); + Op = DAG.getNode(ISD::BITCAST, DL, ByteVT, Input); DCI.AddToWorklist(Op.getNode()); SDValue PSHUFBMaskOp = - DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, PSHUFBMask); + DAG.getNode(ISD::BUILD_VECTOR, DL, ByteVT, PSHUFBMask); DCI.AddToWorklist(PSHUFBMaskOp.getNode()); - Op = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, Op, PSHUFBMaskOp); + Op = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, Op, PSHUFBMaskOp); DCI.AddToWorklist(Op.getNode()); DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op), /*AddTo*/ true); @@ -19312,10 +19666,6 @@ static bool combineX86ShufflesRecursively(SDValue Op, SDValue Root, MVT VT = Op.getSimpleValueType(); if (!VT.isVector()) return false; // Bail if we hit a non-vector. - // FIXME: This routine should be taught about 256-bit shuffles, or a 256-bit - // version should be added. - if (VT.getSizeInBits() != 128) - return false; assert(Root.getSimpleValueType().isVector() && "Shuffles operate on vector types!"); @@ -19418,12 +19768,26 @@ static bool combineX86ShufflesRecursively(SDValue Op, SDValue Root, /// This is a very minor wrapper around getTargetShuffleMask to ease forming v4 /// PSHUF-style masks that can be reused with such instructions. static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) { + MVT VT = N.getSimpleValueType(); SmallVector<int, 4> Mask; bool IsUnary; - bool HaveMask = getTargetShuffleMask(N.getNode(), N.getSimpleValueType(), Mask, IsUnary); + bool HaveMask = getTargetShuffleMask(N.getNode(), VT, Mask, IsUnary); (void)HaveMask; assert(HaveMask); + // If we have more than 128-bits, only the low 128-bits of shuffle mask + // matter. Check that the upper masks are repeats and remove them. + if (VT.getSizeInBits() > 128) { + int LaneElts = 128 / VT.getScalarSizeInBits(); +#ifndef NDEBUG + for (int i = 1, NumLanes = VT.getSizeInBits() / 128; i < NumLanes; ++i) + for (int j = 0; j < LaneElts; ++j) + assert(Mask[j] == Mask[i * LaneElts + j] - LaneElts && + "Mask doesn't repeat in high 128-bit lanes!"); +#endif + Mask.resize(LaneElts); + } + switch (N.getOpcode()) { case X86ISD::PSHUFD: return Mask; @@ -19496,7 +19860,8 @@ combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask, case X86ISD::UNPCKH: // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword // shuffle into a preceding word shuffle. - if (V.getValueType() != MVT::v16i8 && V.getValueType() != MVT::v8i16) + if (V.getSimpleValueType().getScalarType() != MVT::i8 && + V.getSimpleValueType().getScalarType() != MVT::i16) return SDValue(); // Search for a half-shuffle which we can combine with.
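The PSHUFB mask construction at the top of this hunk now widens an element-level mask to any byte width through the Ratio scaling. A self-contained model of that widening, with -1 and -2 standing in for SM_SentinelUndef and SM_SentinelZero (255, having its high bit set, makes pshufb write a zero byte):

#include <cstdio>
#include <vector>

// Widen an element shuffle mask to a byte mask for a NumBytes-wide pshufb:
// each element index expands to Ratio consecutive byte indices.
static std::vector<int> widenToByteMask(const std::vector<int> &Mask,
                                        int NumBytes) {
  const int Undef = -1, Zero = -2; // stand-ins for the SM_ sentinels
  int Ratio = NumBytes / int(Mask.size());
  std::vector<int> ByteMask;
  for (int i = 0; i < NumBytes; ++i) {
    int M = Mask[i / Ratio];
    if (M == Undef) { ByteMask.push_back(Undef); continue; }
    ByteMask.push_back(M == Zero ? 255 : Ratio * M + i % Ratio);
  }
  return ByteMask;
}

int main() {
  // A v4i32-style mask {2, 0, -1, 3} over 16 bytes: each dword index becomes
  // its four constituent byte indices.
  for (int B : widenToByteMask({2, 0, -1, 3}, 16))
    printf("%d ", B); // 8 9 10 11 0 1 2 3 -1 -1 -1 -1 12 13 14 15
  printf("\n");
  return 0;
}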
@@ -19670,8 +20035,7 @@ static SDValue PerformTargetShuffleCombine(SDValue N, SelectionDAG &DAG, break; case X86ISD::PSHUFLW: case X86ISD::PSHUFHW: - assert(VT == MVT::v8i16); - (void)VT; + assert(VT.getScalarType() == MVT::i16 && "Bad word shuffle type!"); if (combineRedundantHalfShuffle(N, Mask, DAG, DCI)) return SDValue(); // We combined away this shuffle, so we're done. @@ -19679,17 +20043,18 @@ static SDValue PerformTargetShuffleCombine(SDValue N, SelectionDAG &DAG, // See if this reduces to a PSHUFD which is no more expensive and can // combine with more operations. Note that it has to at least flip the // dwords as otherwise it would have been removed as a no-op. - if (Mask[0] == 2 && Mask[1] == 3 && Mask[2] == 0 && Mask[3] == 1) { + if (makeArrayRef(Mask).equals({2, 3, 0, 1})) { int DMask[] = {0, 1, 2, 3}; int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2; DMask[DOffset + 0] = DOffset + 1; DMask[DOffset + 1] = DOffset + 0; - V = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V); + MVT DVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2); + V = DAG.getNode(ISD::BITCAST, DL, DVT, V); DCI.AddToWorklist(V.getNode()); - V = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V, + V = DAG.getNode(X86ISD::PSHUFD, DL, DVT, V, getV4X86ShuffleImm8ForMask(DMask, DAG)); DCI.AddToWorklist(V.getNode()); - return DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V); + return DAG.getNode(ISD::BITCAST, DL, VT, V); } // Look for shuffle patterns which can be implemented as a single unpack. @@ -19717,18 +20082,14 @@ static SDValue PerformTargetShuffleCombine(SDValue N, SelectionDAG &DAG, int MappedMask[8]; for (int i = 0; i < 8; ++i) MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2; - const int UnpackLoMask[] = {0, 0, 1, 1, 2, 2, 3, 3}; - const int UnpackHiMask[] = {4, 4, 5, 5, 6, 6, 7, 7}; - if (std::equal(std::begin(MappedMask), std::end(MappedMask), - std::begin(UnpackLoMask)) || - std::equal(std::begin(MappedMask), std::end(MappedMask), - std::begin(UnpackHiMask))) { + if (makeArrayRef(MappedMask).equals({0, 0, 1, 1, 2, 2, 3, 3}) || + makeArrayRef(MappedMask).equals({4, 4, 5, 5, 6, 6, 7, 7})) { // We can replace all three shuffles with an unpack. - V = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, D.getOperand(0)); + V = DAG.getNode(ISD::BITCAST, DL, VT, D.getOperand(0)); DCI.AddToWorklist(V.getNode()); return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL : X86ISD::UNPCKH, - DL, MVT::v8i16, V, V); + DL, VT, V, V); } } } @@ -19876,10 +20237,6 @@ static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG, } } - // Only handle 128 wide vector from here on. - if (!VT.is128BitVector()) - return SDValue(); - // Combine a vector_shuffle that is equal to build_vector load1, load2, load3, // load4, <0, 1, 2, 3> into a 128-bit load if the load addresses are // consecutive, non-overlapping, and in the right order. @@ -20987,6 +21344,49 @@ static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) { return SDValue(); } +/// Check whether Cond is an AND/OR of SETCCs off of the same EFLAGS. 
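+/// (An outer X86cmp of that and/or against zero, as in the second form below,
+/// is looked through first.) On success, CC0 and CC1 receive the two
+/// condition codes, Flags the shared EFLAGS operand, and isAnd records
+/// whether the SETCCs were combined with an AND rather than an OR.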
+/// Match: +/// (X86or (X86setcc) (X86setcc)) +/// (X86cmp (and (X86setcc) (X86setcc)), 0) +static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0, + X86::CondCode &CC1, SDValue &Flags, + bool &isAnd) { + if (Cond->getOpcode() == X86ISD::CMP) { + ConstantSDNode *CondOp1C = dyn_cast<ConstantSDNode>(Cond->getOperand(1)); + if (!CondOp1C || !CondOp1C->isNullValue()) + return false; + + Cond = Cond->getOperand(0); + } + + isAnd = false; + + SDValue SetCC0, SetCC1; + switch (Cond->getOpcode()) { + default: return false; + case ISD::AND: + case X86ISD::AND: + isAnd = true; + // fallthru + case ISD::OR: + case X86ISD::OR: + SetCC0 = Cond->getOperand(0); + SetCC1 = Cond->getOperand(1); + break; + } + + // Make sure we have SETCC nodes, using the same flags value. + if (SetCC0.getOpcode() != X86ISD::SETCC || + SetCC1.getOpcode() != X86ISD::SETCC || + SetCC0->getOperand(1) != SetCC1->getOperand(1)) + return false; + + CC0 = (X86::CondCode)SetCC0->getConstantOperandVal(0); + CC1 = (X86::CondCode)SetCC1->getConstantOperandVal(0); + Flags = SetCC0->getOperand(1); + return true; +} + /// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL] static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, @@ -21156,6 +21556,44 @@ static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG, } } + // Fold and/or of setcc's to double CMOV: + // (CMOV F, T, ((cc1 | cc2) != 0)) -> (CMOV (CMOV F, T, cc1), T, cc2) + // (CMOV F, T, ((cc1 & cc2) != 0)) -> (CMOV (CMOV T, F, !cc1), F, !cc2) + // + // This combine lets us generate: + // cmovcc1 (jcc1 if we don't have CMOV) + // cmovcc2 (same) + // instead of: + // setcc1 + // setcc2 + // and/or + // cmovne (jne if we don't have CMOV) + // When we can't use the CMOV instruction, it might increase branch + // mispredicts. + // When we can use CMOV, or when there is no mispredict, this improves + // throughput and reduces register pressure. + // + if (CC == X86::COND_NE) { + SDValue Flags; + X86::CondCode CC0, CC1; + bool isAndSetCC; + if (checkBoolTestAndOrSetCCCombine(Cond, CC0, CC1, Flags, isAndSetCC)) { + if (isAndSetCC) { + std::swap(FalseOp, TrueOp); + CC0 = X86::GetOppositeBranchCondition(CC0); + CC1 = X86::GetOppositeBranchCondition(CC1); + } + + SDValue LOps[] = {FalseOp, TrueOp, DAG.getConstant(CC0, MVT::i8), + Flags}; + SDValue LCMOV = DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), LOps); + SDValue Ops[] = {LCMOV, TrueOp, DAG.getConstant(CC1, MVT::i8), Flags}; + SDValue CMOV = DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops); + DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), SDValue(CMOV.getNode(), 1)); + return CMOV; + } + } + return SDValue(); } @@ -21166,24 +21604,16 @@ static SDValue PerformINTRINSIC_WO_CHAINCombine(SDNode *N, SelectionDAG &DAG, default: return SDValue(); // SSE/AVX/AVX2 blend intrinsics. case Intrinsic::x86_avx2_pblendvb: - case Intrinsic::x86_avx2_pblendw: - case Intrinsic::x86_avx2_pblendd_128: - case Intrinsic::x86_avx2_pblendd_256: // Don't try to simplify this intrinsic if we don't have AVX2. if (!Subtarget->hasAVX2()) return SDValue(); // FALL-THROUGH - case Intrinsic::x86_avx_blend_pd_256: - case Intrinsic::x86_avx_blend_ps_256: case Intrinsic::x86_avx_blendv_pd_256: case Intrinsic::x86_avx_blendv_ps_256: // Don't try to simplify this intrinsic if we don't have AVX. 
if (!Subtarget->hasAVX()) return SDValue(); // FALL-THROUGH - case Intrinsic::x86_sse41_pblendw: - case Intrinsic::x86_sse41_blendpd: - case Intrinsic::x86_sse41_blendps: case Intrinsic::x86_sse41_blendvps: case Intrinsic::x86_sse41_blendvpd: case Intrinsic::x86_sse41_pblendvb: { @@ -21640,7 +22070,7 @@ static SDValue VectorZextCombine(SDNode *N, SelectionDAG &DAG, // an and with a mask. // We'd like to try to combine that into a shuffle with zero // plus a bitcast, removing the and. - if (N0.getOpcode() != ISD::BITCAST || + if (N0.getOpcode() != ISD::BITCAST || N0.getOperand(0).getOpcode() != ISD::VECTOR_SHUFFLE) return SDValue(); @@ -21670,7 +22100,7 @@ static SDValue VectorZextCombine(SDNode *N, SelectionDAG &DAG, unsigned ResSize = N1.getValueType().getScalarSizeInBits(); // Make sure the splat matches the mask we expect - if (SplatBitSize > ResSize || + if (SplatBitSize > ResSize || (SplatValue + 1).exactLogBase2() != (int)SrcSize) return SDValue(); @@ -21724,12 +22154,10 @@ static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG, if (DCI.isBeforeLegalizeOps()) return SDValue(); - SDValue Zext = VectorZextCombine(N, DAG, DCI, Subtarget); - if (Zext.getNode()) + if (SDValue Zext = VectorZextCombine(N, DAG, DCI, Subtarget)) return Zext; - SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget); - if (R.getNode()) + if (SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget)) return R; EVT VT = N->getValueType(0); @@ -22521,7 +22949,7 @@ static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) { // If A and B occur in reverse order in RHS, then "swap" them (which means // rewriting the mask). if (A != C) - CommuteVectorShuffleMask(RMask, NumElts); + ShuffleVectorSDNode::commuteMask(RMask); // At this point LHS and RHS are equivalent to // LHS = VECTOR_SHUFFLE A, B, LMask @@ -22630,7 +23058,7 @@ static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG) { if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1))) if (C->getValueAPF().isPosZero()) return N->getOperand(1); - + return SDValue(); } @@ -22864,45 +23292,51 @@ static SDValue PerformISDSETCCCombine(SDNode *N, SelectionDAG &DAG, if ((CC == ISD::SETNE || CC == ISD::SETEQ) && LHS.getOpcode() == ISD::SUB) if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(LHS.getOperand(0))) if (C->getAPIntValue() == 0 && LHS.hasOneUse()) { - SDValue addV = DAG.getNode(ISD::ADD, SDLoc(N), - LHS.getValueType(), RHS, LHS.getOperand(1)); - return DAG.getSetCC(SDLoc(N), N->getValueType(0), - addV, DAG.getConstant(0, addV.getValueType()), CC); + SDValue addV = DAG.getNode(ISD::ADD, SDLoc(N), LHS.getValueType(), RHS, + LHS.getOperand(1)); + return DAG.getSetCC(SDLoc(N), N->getValueType(0), addV, + DAG.getConstant(0, addV.getValueType()), CC); } if ((CC == ISD::SETNE || CC == ISD::SETEQ) && RHS.getOpcode() == ISD::SUB) if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS.getOperand(0))) if (C->getAPIntValue() == 0 && RHS.hasOneUse()) { - SDValue addV = DAG.getNode(ISD::ADD, SDLoc(N), - RHS.getValueType(), LHS, RHS.getOperand(1)); - return DAG.getSetCC(SDLoc(N), N->getValueType(0), - addV, DAG.getConstant(0, addV.getValueType()), CC); + SDValue addV = DAG.getNode(ISD::ADD, SDLoc(N), RHS.getValueType(), LHS, + RHS.getOperand(1)); + return DAG.getSetCC(SDLoc(N), N->getValueType(0), addV, + DAG.getConstant(0, addV.getValueType()), CC); } - if (VT.getScalarType() == MVT::i1) { - bool IsSEXT0 = (LHS.getOpcode() == ISD::SIGN_EXTEND) && - (LHS.getOperand(0).getValueType().getScalarType() == MVT::i1); - bool IsVZero0 = 
ISD::isBuildVectorAllZeros(LHS.getNode()); - if (!IsSEXT0 && !IsVZero0) - return SDValue(); - bool IsSEXT1 = (RHS.getOpcode() == ISD::SIGN_EXTEND) && - (RHS.getOperand(0).getValueType().getScalarType() == MVT::i1); + if (VT.getScalarType() == MVT::i1 && + (CC == ISD::SETNE || CC == ISD::SETEQ || ISD::isSignedIntSetCC(CC))) { + bool IsSEXT0 = + (LHS.getOpcode() == ISD::SIGN_EXTEND) && + (LHS.getOperand(0).getValueType().getScalarType() == MVT::i1); bool IsVZero1 = ISD::isBuildVectorAllZeros(RHS.getNode()); - if (!IsSEXT1 && !IsVZero1) - return SDValue(); + if (!IsSEXT0 || !IsVZero1) { + // Swap the operands and update the condition code. + std::swap(LHS, RHS); + CC = ISD::getSetCCSwappedOperands(CC); + + IsSEXT0 = (LHS.getOpcode() == ISD::SIGN_EXTEND) && + (LHS.getOperand(0).getValueType().getScalarType() == MVT::i1); + IsVZero1 = ISD::isBuildVectorAllZeros(RHS.getNode()); + } if (IsSEXT0 && IsVZero1) { - assert(VT == LHS.getOperand(0).getValueType() && "Uexpected operand type"); - if (CC == ISD::SETEQ) + assert(VT == LHS.getOperand(0).getValueType() && + "Unexpected operand type"); + if (CC == ISD::SETGT) + return DAG.getConstant(0, VT); + if (CC == ISD::SETLE) + return DAG.getConstant(1, VT); + if (CC == ISD::SETEQ || CC == ISD::SETGE) return DAG.getNOT(DL, LHS.getOperand(0), VT); + + assert((CC == ISD::SETNE || CC == ISD::SETLT) && + "Unexpected condition code!"); return LHS.getOperand(0); } - if (IsSEXT1 && IsVZero0) { - assert(VT == RHS.getOperand(0).getValueType() && "Uexpected operand type"); - if (CC == ISD::SETEQ) - return DAG.getNOT(DL, RHS.getOperand(0), VT); - return RHS.getOperand(0); - } } return SDValue(); @@ -22940,7 +23374,7 @@ static SDValue PerformINSERTPSCombine(SDNode *N, SelectionDAG &DAG, // countS and just gets an f32 from that address. unsigned DestIndex = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue() >> 6; - + Ld = NarrowVectorLoadToElement(cast<LoadSDNode>(Ld), DestIndex, DAG); // Create this as a scalar to vector to match the instruction pattern. @@ -22964,7 +23398,7 @@ static SDValue PerformBLENDICombine(SDNode *N, SelectionDAG &DAG) { // pattern-matching possibilities related to scalar math ops in SSE/AVX. // x86InstrInfo knows how to commute this back after instruction selection // if it would help register allocation. - + // TODO: If optimizing for size or a processor that doesn't suffer from // partial register update stalls, this should be transformed into a MOVSD // instruction because a MOVSD is 1-2 bytes smaller than a BLENDPD. @@ -23503,27 +23937,23 @@ bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const { // X86 Inline Assembly Support //===----------------------------------------------------------------------===// -namespace { - // Helper to match a string separated by whitespace. - bool matchAsmImpl(StringRef s, ArrayRef<const StringRef *> args) { - s = s.substr(s.find_first_not_of(" \t")); // Skip leading whitespace. - - for (unsigned i = 0, e = args.size(); i != e; ++i) { - StringRef piece(*args[i]); - if (!s.startswith(piece)) // Check if the piece matches. - return false; +// Helper to match a string separated by whitespace. +static bool matchAsm(StringRef S, ArrayRef<const char *> Pieces) { + S = S.substr(S.find_first_not_of(" \t")); // Skip leading whitespace. - s = s.substr(piece.size()); - StringRef::size_type pos = s.find_first_not_of(" \t"); - if (pos == 0) // We matched a prefix. - return false; + for (StringRef Piece : Pieces) { + if (!S.startswith(Piece)) // Check if the piece matches. 
+ return false; - s = s.substr(pos); - } + S = S.substr(Piece.size()); + StringRef::size_type Pos = S.find_first_not_of(" \t"); + if (Pos == 0) // We matched a prefix. + return false; - return s.empty(); + S = S.substr(Pos); } - const VariadicFunction1<bool, StringRef, StringRef, matchAsmImpl> matchAsm={}; + + return S.empty(); } static bool clobbersFlagRegisters(const SmallVector<StringRef, 4> &AsmPieces) { @@ -23563,12 +23993,12 @@ bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const { // ops instead of emitting the bswap asm. For now, we don't support 486 or // lower so don't worry about this. // bswap $0 - if (matchAsm(AsmPieces[0], "bswap", "$0") || - matchAsm(AsmPieces[0], "bswapl", "$0") || - matchAsm(AsmPieces[0], "bswapq", "$0") || - matchAsm(AsmPieces[0], "bswap", "${0:q}") || - matchAsm(AsmPieces[0], "bswapl", "${0:q}") || - matchAsm(AsmPieces[0], "bswapq", "${0:q}")) { + if (matchAsm(AsmPieces[0], {"bswap", "$0"}) || + matchAsm(AsmPieces[0], {"bswapl", "$0"}) || + matchAsm(AsmPieces[0], {"bswapq", "$0"}) || + matchAsm(AsmPieces[0], {"bswap", "${0:q}"}) || + matchAsm(AsmPieces[0], {"bswapl", "${0:q}"}) || + matchAsm(AsmPieces[0], {"bswapq", "${0:q}"})) { // No need to check constraints, nothing other than the equivalent of // "=r,0" would be valid here. return IntrinsicLowering::LowerToByteSwap(CI); @@ -23577,8 +24007,8 @@ bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const { // rorw $$8, ${0:w} --> llvm.bswap.i16 if (CI->getType()->isIntegerTy(16) && IA->getConstraintString().compare(0, 5, "=r,0,") == 0 && - (matchAsm(AsmPieces[0], "rorw", "$$8,", "${0:w}") || - matchAsm(AsmPieces[0], "rolw", "$$8,", "${0:w}"))) { + (matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) || + matchAsm(AsmPieces[0], {"rolw", "$$8,", "${0:w}"}))) { AsmPieces.clear(); const std::string &ConstraintsStr = IA->getConstraintString(); SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ","); @@ -23590,9 +24020,9 @@ bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const { case 3: if (CI->getType()->isIntegerTy(32) && IA->getConstraintString().compare(0, 5, "=r,0,") == 0 && - matchAsm(AsmPieces[0], "rorw", "$$8,", "${0:w}") && - matchAsm(AsmPieces[1], "rorl", "$$16,", "$0") && - matchAsm(AsmPieces[2], "rorw", "$$8,", "${0:w}")) { + matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) && + matchAsm(AsmPieces[1], {"rorl", "$$16,", "$0"}) && + matchAsm(AsmPieces[2], {"rorw", "$$8,", "${0:w}"})) { AsmPieces.clear(); const std::string &ConstraintsStr = IA->getConstraintString(); SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ","); @@ -23607,9 +24037,9 @@ bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const { Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" && Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") { // bswap %eax / bswap %edx / xchgl %eax, %edx -> llvm.bswap.i64 - if (matchAsm(AsmPieces[0], "bswap", "%eax") && - matchAsm(AsmPieces[1], "bswap", "%edx") && - matchAsm(AsmPieces[2], "xchgl", "%eax,", "%edx")) + if (matchAsm(AsmPieces[0], {"bswap", "%eax"}) && + matchAsm(AsmPieces[1], {"bswap", "%edx"}) && + matchAsm(AsmPieces[2], {"xchgl", "%eax,", "%edx"})) return IntrinsicLowering::LowerToByteSwap(CI); } } diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h index 4423015..dd20ec2 100644 --- a/lib/Target/X86/X86ISelLowering.h +++ b/lib/Target/X86/X86ISelLowering.h @@ -30,37 +30,37 @@ namespace llvm { // Start the numbering where the builtin ops leave off. 
FIRST_NUMBER = ISD::BUILTIN_OP_END, - /// BSF - Bit scan forward. - /// BSR - Bit scan reverse. + /// Bit scan forward. BSF, + /// Bit scan reverse. BSR, - /// SHLD, SHRD - Double shift instructions. These correspond to + /// Double shift instructions. These correspond to /// X86::SHLDxx and X86::SHRDxx instructions. SHLD, SHRD, - /// FAND - Bitwise logical AND of floating point values. This corresponds + /// Bitwise logical AND of floating point values. This corresponds /// to X86::ANDPS or X86::ANDPD. FAND, - /// FOR - Bitwise logical OR of floating point values. This corresponds + /// Bitwise logical OR of floating point values. This corresponds /// to X86::ORPS or X86::ORPD. FOR, - /// FXOR - Bitwise logical XOR of floating point values. This corresponds + /// Bitwise logical XOR of floating point values. This corresponds /// to X86::XORPS or X86::XORPD. FXOR, - /// FANDN - Bitwise logical ANDNOT of floating point values. This + /// Bitwise logical ANDNOT of floating point values. This /// corresponds to X86::ANDNPS or X86::ANDNPD. FANDN, - /// FSRL - Bitwise logical right shift of floating point values. These + /// Bitwise logical right shift of floating point values. This /// corresponds to X86::PSRLDQ. FSRL, - /// CALL - These operations represent an abstract X86 call + /// These operations represent an abstract X86 call /// instruction, which includes a bunch of information. In particular the /// operands of these node are: /// @@ -79,8 +79,7 @@ namespace llvm { /// CALL, - /// RDTSC_DAG - This operation implements the lowering for - /// readcyclecounter + /// This operation implements the lowering for readcyclecounter RDTSC_DAG, /// X86 Read Time-Stamp Counter and Processor ID. @@ -131,187 +130,186 @@ namespace llvm { /// 1 is the number of bytes of stack to pop. RET_FLAG, - /// REP_STOS - Repeat fill, corresponds to X86::REP_STOSx. + /// Repeat fill, corresponds to X86::REP_STOSx. REP_STOS, - /// REP_MOVS - Repeat move, corresponds to X86::REP_MOVSx. + /// Repeat move, corresponds to X86::REP_MOVSx. REP_MOVS, - /// GlobalBaseReg - On Darwin, this node represents the result of the popl + /// On Darwin, this node represents the result of the popl /// at function entry, used for PIC code. GlobalBaseReg, - /// Wrapper - A wrapper node for TargetConstantPool, + /// A wrapper node for TargetConstantPool, /// TargetExternalSymbol, and TargetGlobalAddress. Wrapper, - /// WrapperRIP - Special wrapper used under X86-64 PIC mode for RIP + /// Special wrapper used under X86-64 PIC mode for RIP /// relative displacements. WrapperRIP, - /// MOVDQ2Q - Copies a 64-bit value from the low word of an XMM vector + /// Copies a 64-bit value from the low word of an XMM vector /// to an MMX vector. If you think this is too close to the previous /// mnemonic, so do I; blame Intel. MOVDQ2Q, - /// MMX_MOVD2W - Copies a 32-bit value from the low word of a MMX + /// Copies a 32-bit value from the low word of a MMX /// vector to a GPR. MMX_MOVD2W, - /// MMX_MOVW2D - Copies a GPR into the low 32-bit word of a MMX vector + /// Copies a GPR into the low 32-bit word of a MMX vector /// and zero out the high word. MMX_MOVW2D, - /// PEXTRB - Extract an 8-bit value from a vector and zero extend it to + /// Extract an 8-bit value from a vector and zero extend it to /// i32, corresponds to X86::PEXTRB. PEXTRB, - /// PEXTRW - Extract a 16-bit value from a vector and zero extend it to + /// Extract a 16-bit value from a vector and zero extend it to /// i32, corresponds to X86::PEXTRW. 
PEXTRW, - /// INSERTPS - Insert any element of a 4 x float vector into any element + /// Insert any element of a 4 x float vector into any element /// of a destination 4 x floatvector. INSERTPS, - /// PINSRB - Insert the lower 8-bits of a 32-bit value to a vector, + /// Insert the lower 8-bits of a 32-bit value to a vector, /// corresponds to X86::PINSRB. PINSRB, - /// PINSRW - Insert the lower 16-bits of a 32-bit value to a vector, + /// Insert the lower 16-bits of a 32-bit value to a vector, /// corresponds to X86::PINSRW. PINSRW, MMX_PINSRW, - /// PSHUFB - Shuffle 16 8-bit values within a vector. + /// Shuffle 16 8-bit values within a vector. PSHUFB, - /// ANDNP - Bitwise Logical AND NOT of Packed FP values. + /// Bitwise Logical AND NOT of Packed FP values. ANDNP, - /// PSIGN - Copy integer sign. + /// Copy integer sign. PSIGN, - /// BLENDI - Blend where the selector is an immediate. + /// Blend where the selector is an immediate. BLENDI, - /// SHRUNKBLEND - Blend where the condition has been shrunk. + /// Blend where the condition has been shrunk. /// This is used to emphasize that the condition mask is /// no more valid for generic VSELECT optimizations. SHRUNKBLEND, - /// ADDSUB - Combined add and sub on an FP vector. + /// Combined add and sub on an FP vector. ADDSUB, - // FADD, FSUB, FMUL, FDIV, FMIN, FMAX - FP vector ops with rounding mode. + // FP vector ops with rounding mode. FADD_RND, FSUB_RND, FMUL_RND, FDIV_RND, - // SUBUS - Integer sub with unsigned saturation. + // Integer sub with unsigned saturation. SUBUS, - /// HADD - Integer horizontal add. + /// Integer horizontal add. HADD, - /// HSUB - Integer horizontal sub. + /// Integer horizontal sub. HSUB, - /// FHADD - Floating point horizontal add. + /// Floating point horizontal add. FHADD, - /// FHSUB - Floating point horizontal sub. + /// Floating point horizontal sub. FHSUB, - /// UMAX, UMIN - Unsigned integer max and min. + /// Unsigned integer max and min. UMAX, UMIN, - /// SMAX, SMIN - Signed integer max and min. + /// Signed integer max and min. SMAX, SMIN, - /// FMAX, FMIN - Floating point max and min. - /// + /// Floating point max and min. FMAX, FMIN, - /// FMAXC, FMINC - Commutative FMIN and FMAX. + /// Commutative FMIN and FMAX. FMAXC, FMINC, - /// FRSQRT, FRCP - Floating point reciprocal-sqrt and reciprocal - /// approximation. Note that these typically require refinement + /// Floating point reciprocal-sqrt and reciprocal approximation. + /// Note that these typically require refinement /// in order to obtain suitable precision. FRSQRT, FRCP, - // TLSADDR - Thread Local Storage. + // Thread Local Storage. TLSADDR, - // TLSBASEADDR - Thread Local Storage. A call to get the start address + // Thread Local Storage. A call to get the start address // of the TLS block for the current module. TLSBASEADDR, - // TLSCALL - Thread Local Storage. When calling to an OS provided + // Thread Local Storage. When calling to an OS provided // thunk at the address from an earlier relocation. TLSCALL, - // EH_RETURN - Exception Handling helpers. + // Exception Handling helpers. EH_RETURN, - // EH_SJLJ_SETJMP - SjLj exception handling setjmp. + // SjLj exception handling setjmp. EH_SJLJ_SETJMP, - // EH_SJLJ_LONGJMP - SjLj exception handling longjmp. + // SjLj exception handling longjmp. EH_SJLJ_LONGJMP, - /// TC_RETURN - Tail call return. See X86TargetLowering::LowerCall for + /// Tail call return. See X86TargetLowering::LowerCall for /// the list of operands. 
TC_RETURN, - // VZEXT_MOVL - Vector move to low scalar and zero higher vector elements. + // Vector move to low scalar and zero higher vector elements. VZEXT_MOVL, - // VZEXT - Vector integer zero-extend. + // Vector integer zero-extend. VZEXT, - // VSEXT - Vector integer signed-extend. + // Vector integer signed-extend. VSEXT, - // VTRUNC - Vector integer truncate. + // Vector integer truncate. VTRUNC, - // VTRUNC - Vector integer truncate with mask. + // Vector integer truncate with mask. VTRUNCM, - // VFPEXT - Vector FP extend. + // Vector FP extend. VFPEXT, - // VFPROUND - Vector FP round. + // Vector FP round. VFPROUND, - // VSHL, VSRL - 128-bit vector logical left / right shift + // 128-bit vector logical left / right shift VSHLDQ, VSRLDQ, - // VSHL, VSRL, VSRA - Vector shift elements + // Vector shift elements VSHL, VSRL, VSRA, - // VSHLI, VSRLI, VSRAI - Vector shift elements by immediate + // Vector shift elements by immediate VSHLI, VSRLI, VSRAI, - // CMPP - Vector packed double/float comparison. + // Vector packed double/float comparison. CMPP, - // PCMP* - Vector integer comparisons. + // Vector integer comparisons. PCMPEQ, PCMPGT, - // PCMP*M - Vector integer comparisons, the result is in a mask vector. + // Vector integer comparisons, the result is in a mask vector. PCMPEQM, PCMPGTM, - /// CMPM, CMPMU - Vector comparison generating mask bits for fp and + /// Vector comparison generating mask bits for fp and /// integer signed and unsigned data types. CMPM, CMPMU, - // ADD, SUB, SMUL, etc. - Arithmetic operations with FLAGS results. + // Arithmetic operations with FLAGS results. ADD, SUB, ADC, SBB, SMUL, INC, DEC, OR, XOR, AND, - BEXTR, // BEXTR - Bit field extract + BEXTR, // Bit field extract UMUL, // LOW, HI, FLAGS = umul LHS, RHS @@ -322,16 +320,16 @@ namespace llvm { UDIVREM8_ZEXT_HREG, SDIVREM8_SEXT_HREG, - // MUL_IMM - X86 specific multiply by immediate. + // X86-specific multiply by immediate. MUL_IMM, - // PTEST - Vector bitwise comparisons. + // Vector bitwise comparisons. PTEST, - // TESTP - Vector packed fp sign bitwise comparisons. + // Vector packed fp sign bitwise comparisons. TESTP, - // TESTM, TESTNM - Vector "test" in AVX-512, the result is in a mask vector. + // Vector "test" in AVX-512, the result is in a mask vector. TESTM, TESTNM, @@ -697,6 +695,12 @@ namespace llvm { std::vector<SDValue> &Ops, SelectionDAG &DAG) const override; + unsigned getInlineAsmMemConstraint( + const std::string &ConstraintCode) const override { + // FIXME: Map different constraints differently. + return InlineAsm::Constraint_m; + } + /// Given a physical register constraint /// (e.g. {edx}), return the register number and the register class for the /// register. This should only be used for C_Register constraints. 
On error, this returns a register number of 0. @@ -993,7 +997,8 @@ namespace llvm { bool shouldExpandAtomicLoadInIR(LoadInst *SI) const override; bool shouldExpandAtomicStoreInIR(StoreInst *SI) const override; - bool shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override; + TargetLoweringBase::AtomicRMWExpansionKind + shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override; LoadInst * lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override; diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td index 4923bc5..509602f 100644 --- a/lib/Target/X86/X86InstrAVX512.td +++ b/lib/Target/X86/X86InstrAVX512.td @@ -74,6 +74,15 @@ class X86VectorVTInfo<int numelts, ValueType eltvt, RegisterClass rc, !if (!eq (Size, 128), "v2i64", !if (!eq (Size, 256), "v4i64", VTName)), VTName)); + + PatFrag AlignedLdFrag = !cast<PatFrag>("alignedload" # + !if (!eq (TypeVariantName, "i"), + !if (!eq (Size, 128), "v2i64", + !if (!eq (Size, 256), "v4i64", + !if (!eq (Size, 512), + !if (!eq (EltSize, 64), "v8i64", "v16i32"), + VTName))), VTName)); + PatFrag ScalarLdFrag = !cast<PatFrag>("load" # EltVT); // The corresponding float type, e.g. v16f32 for v16i32 @@ -107,6 +116,9 @@ class X86VectorVTInfo<int numelts, ValueType eltvt, RegisterClass rc, // create the canonical constant zero node ImmAllZerosV. ValueType i32VT = !cast<ValueType>("v" # !srl(Size, 5) # "i32"); dag ImmAllZerosV = (VT (bitconvert (i32VT immAllZerosV))); + + string ZSuffix = !if (!eq (Size, 128), "Z128", + !if (!eq (Size, 256), "Z256", "Z")); } def v64i8_info : X86VectorVTInfo<64, i8, VR512, "b">; @@ -1559,6 +1571,11 @@ multiclass avx512_cmp_packed<RegisterClass KRC, RegisterClass RC, (outs KRC:$dst), (ins RC:$src1, RC:$src2, u8imm:$cc), !strconcat("vcmp", suffix, "\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}"), [], d>; + def rrib_alt: AVX512PIi8<0xC2, MRMSrcReg, + (outs KRC:$dst), (ins RC:$src1, RC:$src2, u8imm:$cc), + !strconcat("vcmp", suffix, + "\t{{sae}, $cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc, {sae}}"), + [], d>, EVEX_B; let mayLoad = 1 in def rmi_alt : AVX512PIi8<0xC2, MRMSrcMem, (outs KRC:$dst), (ins RC:$src1, x86memop:$src2, u8imm:$cc), @@ -2047,6 +2064,8 @@ let Predicates = [HasVLX] in { (v8i1 (COPY_TO_REGCLASS VK4:$src, VK8))>; def : Pat<(v8i1 (insert_subvector undef, (v2i1 VK2:$src), (iPTR 0))), (v8i1 (COPY_TO_REGCLASS VK2:$src, VK8))>; + def : Pat<(v4i1 (insert_subvector undef, (v2i1 VK2:$src), (iPTR 0))), + (v4i1 (COPY_TO_REGCLASS VK2:$src, VK4))>; def : Pat<(v4i1 (extract_subvector (v8i1 VK8:$src), (iPTR 0))), (v4i1 (COPY_TO_REGCLASS VK8:$src, VK4))>; def : Pat<(v2i1 (extract_subvector (v8i1 VK8:$src), (iPTR 0))), @@ -2062,177 +2081,193 @@ def : Pat<(v8i1 (X86vsrli VK8:$src, (i8 imm:$imm))), (v8i1 (COPY_TO_REGCLASS (KSHIFTRWri (COPY_TO_REGCLASS VK8:$src, VK16), (I8Imm $imm)), VK8))>, Requires<[HasAVX512, NoDQI]>; + +def : Pat<(v4i1 (X86vshli VK4:$src, (i8 imm:$imm))), + (v4i1 (COPY_TO_REGCLASS + (KSHIFTLWri (COPY_TO_REGCLASS VK4:$src, VK16), + (I8Imm $imm)), VK4))>, Requires<[HasAVX512]>; + +def : Pat<(v4i1 (X86vsrli VK4:$src, (i8 imm:$imm))), + (v4i1 (COPY_TO_REGCLASS + (KSHIFTRWri (COPY_TO_REGCLASS VK4:$src, VK16), + (I8Imm $imm)), VK4))>, Requires<[HasAVX512]>; + //===----------------------------------------------------------------------===// // AVX-512 - Aligned and unaligned load and store // -multiclass avx512_load<bits<8> opc, string OpcodeStr, PatFrag ld_frag, - RegisterClass KRC, RegisterClass RC, - ValueType vt, ValueType zvt, X86MemOperand memop, - Domain d, bit IsReMaterializable = 1> { -let hasSideEffects = 
0 in { - def rr : AVX512PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src), + +multiclass avx512_load<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, + PatFrag ld_frag, PatFrag mload, + bit IsReMaterializable = 1> { + let hasSideEffects = 0 in { + def rr : AVX512PI<opc, MRMSrcReg, (outs _.RC:$dst), (ins _.RC:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [], - d>, EVEX; - def rrkz : AVX512PI<opc, MRMSrcReg, (outs RC:$dst), (ins KRC:$mask, RC:$src), + _.ExeDomain>, EVEX; + def rrkz : AVX512PI<opc, MRMSrcReg, (outs _.RC:$dst), + (ins _.KRCWM:$mask, _.RC:$src), !strconcat(OpcodeStr, "\t{$src, ${dst} {${mask}} {z}|", - "${dst} {${mask}} {z}, $src}"), [], d>, EVEX, EVEX_KZ; - } + "${dst} {${mask}} {z}, $src}"), [], _.ExeDomain>, + EVEX, EVEX_KZ; + let canFoldAsLoad = 1, isReMaterializable = IsReMaterializable, SchedRW = [WriteLoad] in - def rm : AVX512PI<opc, MRMSrcMem, (outs RC:$dst), (ins memop:$src), + def rm : AVX512PI<opc, MRMSrcMem, (outs _.RC:$dst), (ins _.MemOp:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - [(set RC:$dst, (vt (bitconvert (ld_frag addr:$src))))], - d>, EVEX; - - let AddedComplexity = 20 in { - let Constraints = "$src0 = $dst", hasSideEffects = 0 in { - let hasSideEffects = 0 in - def rrk : AVX512PI<opc, MRMSrcReg, (outs RC:$dst), - (ins RC:$src0, KRC:$mask, RC:$src1), - !strconcat(OpcodeStr, "\t{$src1, ${dst} {${mask}}|", - "${dst} {${mask}}, $src1}"), - [(set RC:$dst, (vt (vselect KRC:$mask, - (vt RC:$src1), - (vt RC:$src0))))], - d>, EVEX, EVEX_K; + [(set _.RC:$dst, (_.VT (bitconvert (ld_frag addr:$src))))], + _.ExeDomain>, EVEX; + + let Constraints = "$src0 = $dst" in { + def rrk : AVX512PI<opc, MRMSrcReg, (outs _.RC:$dst), + (ins _.RC:$src0, _.KRCWM:$mask, _.RC:$src1), + !strconcat(OpcodeStr, "\t{$src1, ${dst} {${mask}}|", + "${dst} {${mask}}, $src1}"), + [(set _.RC:$dst, (_.VT (vselect _.KRCWM:$mask, + (_.VT _.RC:$src1), + (_.VT _.RC:$src0))))], _.ExeDomain>, + EVEX, EVEX_K; let mayLoad = 1, SchedRW = [WriteLoad] in - def rmk : AVX512PI<opc, MRMSrcMem, (outs RC:$dst), - (ins RC:$src0, KRC:$mask, memop:$src1), + def rmk : AVX512PI<opc, MRMSrcMem, (outs _.RC:$dst), + (ins _.RC:$src0, _.KRCWM:$mask, _.MemOp:$src1), !strconcat(OpcodeStr, "\t{$src1, ${dst} {${mask}}|", "${dst} {${mask}}, $src1}"), - [(set RC:$dst, (vt - (vselect KRC:$mask, - (vt (bitconvert (ld_frag addr:$src1))), - (vt RC:$src0))))], - d>, EVEX, EVEX_K; + [(set _.RC:$dst, (_.VT + (vselect _.KRCWM:$mask, + (_.VT (bitconvert (ld_frag addr:$src1))), + (_.VT _.RC:$src0))))], _.ExeDomain>, EVEX, EVEX_K; } let mayLoad = 1, SchedRW = [WriteLoad] in - def rmkz : AVX512PI<opc, MRMSrcMem, (outs RC:$dst), - (ins KRC:$mask, memop:$src), - !strconcat(OpcodeStr, "\t{$src, ${dst} {${mask}} {z}|", - "${dst} {${mask}} {z}, $src}"), - [(set RC:$dst, (vt - (vselect KRC:$mask, - (vt (bitconvert (ld_frag addr:$src))), - (vt (bitconvert (zvt immAllZerosV))))))], - d>, EVEX, EVEX_KZ; + def rmkz : AVX512PI<opc, MRMSrcMem, (outs _.RC:$dst), + (ins _.KRCWM:$mask, _.MemOp:$src), + OpcodeStr #"\t{$src, ${dst} {${mask}} {z}|"# + "${dst} {${mask}} {z}, $src}", + [(set _.RC:$dst, (_.VT (vselect _.KRCWM:$mask, + (_.VT (bitconvert (ld_frag addr:$src))), _.ImmAllZerosV)))], + _.ExeDomain>, EVEX, EVEX_KZ; } + def : Pat<(_.VT (mload addr:$ptr, _.KRCWM:$mask, undef)), + (!cast<Instruction>(NAME#_.ZSuffix##rmkz) _.KRCWM:$mask, addr:$ptr)>; + + def : Pat<(_.VT (mload addr:$ptr, _.KRCWM:$mask, _.ImmAllZerosV)), + (!cast<Instruction>(NAME#_.ZSuffix##rmkz) _.KRCWM:$mask, addr:$ptr)>; + + def : Pat<(_.VT (mload 
addr:$ptr, _.KRCWM:$mask, (_.VT _.RC:$src0))), + (!cast<Instruction>(NAME#_.ZSuffix##rmk) _.RC:$src0, + _.KRCWM:$mask, addr:$ptr)>; } -multiclass avx512_load_vl<bits<8> opc, string OpcodeStr, string ld_pat, - string elty, string elsz, string vsz512, - string vsz256, string vsz128, Domain d, - Predicate prd, bit IsReMaterializable = 1> { +multiclass avx512_alignedload_vl<bits<8> opc, string OpcodeStr, + AVX512VLVectorVTInfo _, + Predicate prd, + bit IsReMaterializable = 1> { let Predicates = [prd] in - defm Z : avx512_load<opc, OpcodeStr, - !cast<PatFrag>(ld_pat##"v"##vsz512##elty##elsz), - !cast<RegisterClass>("VK"##vsz512##"WM"), VR512, - !cast<ValueType>("v"##vsz512##elty##elsz), v16i32, - !cast<X86MemOperand>(elty##"512mem"), d, - IsReMaterializable>, EVEX_V512; + defm Z : avx512_load<opc, OpcodeStr, _.info512, _.info512.AlignedLdFrag, + masked_load_aligned512, IsReMaterializable>, EVEX_V512; let Predicates = [prd, HasVLX] in { - defm Z256 : avx512_load<opc, OpcodeStr, - !cast<PatFrag>(ld_pat##!if(!eq(elty,"f"), - "v"##vsz256##elty##elsz, "v4i64")), - !cast<RegisterClass>("VK"##vsz256##"WM"), VR256X, - !cast<ValueType>("v"##vsz256##elty##elsz), v8i32, - !cast<X86MemOperand>(elty##"256mem"), d, - IsReMaterializable>, EVEX_V256; - - defm Z128 : avx512_load<opc, OpcodeStr, - !cast<PatFrag>(ld_pat##!if(!eq(elty,"f"), - "v"##vsz128##elty##elsz, "v2i64")), - !cast<RegisterClass>("VK"##vsz128##"WM"), VR128X, - !cast<ValueType>("v"##vsz128##elty##elsz), v4i32, - !cast<X86MemOperand>(elty##"128mem"), d, - IsReMaterializable>, EVEX_V128; + defm Z256 : avx512_load<opc, OpcodeStr, _.info256, _.info256.AlignedLdFrag, + masked_load_aligned256, IsReMaterializable>, EVEX_V256; + defm Z128 : avx512_load<opc, OpcodeStr, _.info128, _.info128.AlignedLdFrag, + masked_load_aligned128, IsReMaterializable>, EVEX_V128; } } +multiclass avx512_load_vl<bits<8> opc, string OpcodeStr, + AVX512VLVectorVTInfo _, + Predicate prd, + bit IsReMaterializable = 1> { + let Predicates = [prd] in + defm Z : avx512_load<opc, OpcodeStr, _.info512, _.info512.LdFrag, + masked_load_unaligned, IsReMaterializable>, EVEX_V512; -multiclass avx512_store<bits<8> opc, string OpcodeStr, PatFrag st_frag, - ValueType OpVT, RegisterClass KRC, RegisterClass RC, - X86MemOperand memop, Domain d> { + let Predicates = [prd, HasVLX] in { + defm Z256 : avx512_load<opc, OpcodeStr, _.info256, _.info256.LdFrag, + masked_load_unaligned, IsReMaterializable>, EVEX_V256; + defm Z128 : avx512_load<opc, OpcodeStr, _.info128, _.info128.LdFrag, + masked_load_unaligned, IsReMaterializable>, EVEX_V128; + } +} + +multiclass avx512_store<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, + PatFrag st_frag, PatFrag mstore> { let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in { - def rr_alt : AVX512PI<opc, MRMDestReg, (outs RC:$dst), (ins RC:$src), - !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [], d>, - EVEX; + def rr_alt : AVX512PI<opc, MRMDestReg, (outs _.RC:$dst), (ins _.RC:$src), + OpcodeStr # "\t{$src, $dst|$dst, $src}", [], + _.ExeDomain>, EVEX; let Constraints = "$src1 = $dst" in - def rrk_alt : AVX512PI<opc, MRMDestReg, (outs RC:$dst), - (ins RC:$src1, KRC:$mask, RC:$src2), - !strconcat(OpcodeStr, - "\t{$src2, ${dst} {${mask}}|${dst} {${mask}}, $src2}"), [], d>, - EVEX, EVEX_K; - def rrkz_alt : AVX512PI<opc, MRMDestReg, (outs RC:$dst), - (ins KRC:$mask, RC:$src), - !strconcat(OpcodeStr, - "\t{$src, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src}"), - [], d>, EVEX, EVEX_KZ; + def rrk_alt : AVX512PI<opc, MRMDestReg, (outs _.RC:$dst), + 
(ins _.RC:$src1, _.KRCWM:$mask, _.RC:$src2), + OpcodeStr # + "\t{$src2, ${dst} {${mask}}|${dst} {${mask}}, $src2}", + [], _.ExeDomain>, EVEX, EVEX_K; + def rrkz_alt : AVX512PI<opc, MRMDestReg, (outs _.RC:$dst), + (ins _.KRCWM:$mask, _.RC:$src), + OpcodeStr # + "\t{$src, ${dst} {${mask}} {z}|" # + "${dst} {${mask}} {z}, $src}", + [], _.ExeDomain>, EVEX, EVEX_KZ; } let mayStore = 1 in { - def mr : AVX512PI<opc, MRMDestMem, (outs), (ins memop:$dst, RC:$src), + def mr : AVX512PI<opc, MRMDestMem, (outs), (ins _.MemOp:$dst, _.RC:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - [(st_frag (OpVT RC:$src), addr:$dst)], d>, EVEX; + [(st_frag (_.VT _.RC:$src), addr:$dst)], _.ExeDomain>, EVEX; def mrk : AVX512PI<opc, MRMDestMem, (outs), - (ins memop:$dst, KRC:$mask, RC:$src), - !strconcat(OpcodeStr, - "\t{$src, ${dst} {${mask}}|${dst} {${mask}}, $src}"), - [], d>, EVEX, EVEX_K; + (ins _.MemOp:$dst, _.KRCWM:$mask, _.RC:$src), + OpcodeStr # "\t{$src, ${dst} {${mask}}|${dst} {${mask}}, $src}", + [], _.ExeDomain>, EVEX, EVEX_K; } + + def: Pat<(mstore addr:$ptr, _.KRCWM:$mask, (_.VT _.RC:$src)), + (!cast<Instruction>(NAME#_.ZSuffix##mrk) addr:$ptr, + _.KRCWM:$mask, _.RC:$src)>; } -multiclass avx512_store_vl<bits<8> opc, string OpcodeStr, string st_pat, - string st_suff_512, string st_suff_256, - string st_suff_128, string elty, string elsz, - string vsz512, string vsz256, string vsz128, - Domain d, Predicate prd> { +multiclass avx512_store_vl< bits<8> opc, string OpcodeStr, + AVX512VLVectorVTInfo _, Predicate prd> { let Predicates = [prd] in - defm Z : avx512_store<opc, OpcodeStr, !cast<PatFrag>(st_pat##st_suff_512), - !cast<ValueType>("v"##vsz512##elty##elsz), - !cast<RegisterClass>("VK"##vsz512##"WM"), VR512, - !cast<X86MemOperand>(elty##"512mem"), d>, EVEX_V512; + defm Z : avx512_store<opc, OpcodeStr, _.info512, store, + masked_store_unaligned>, EVEX_V512; let Predicates = [prd, HasVLX] in { - defm Z256 : avx512_store<opc, OpcodeStr, !cast<PatFrag>(st_pat##st_suff_256), - !cast<ValueType>("v"##vsz256##elty##elsz), - !cast<RegisterClass>("VK"##vsz256##"WM"), VR256X, - !cast<X86MemOperand>(elty##"256mem"), d>, EVEX_V256; - - defm Z128 : avx512_store<opc, OpcodeStr, !cast<PatFrag>(st_pat##st_suff_128), - !cast<ValueType>("v"##vsz128##elty##elsz), - !cast<RegisterClass>("VK"##vsz128##"WM"), VR128X, - !cast<X86MemOperand>(elty##"128mem"), d>, EVEX_V128; + defm Z256 : avx512_store<opc, OpcodeStr, _.info256, store, + masked_store_unaligned>, EVEX_V256; + defm Z128 : avx512_store<opc, OpcodeStr, _.info128, store, + masked_store_unaligned>, EVEX_V128; } } -defm VMOVAPS : avx512_load_vl<0x28, "vmovaps", "alignedload", "f", "32", - "16", "8", "4", SSEPackedSingle, HasAVX512>, - avx512_store_vl<0x29, "vmovaps", "alignedstore", - "512", "256", "", "f", "32", "16", "8", "4", - SSEPackedSingle, HasAVX512>, - PS, EVEX_CD8<32, CD8VF>; +multiclass avx512_alignedstore_vl<bits<8> opc, string OpcodeStr, + AVX512VLVectorVTInfo _, Predicate prd> { + let Predicates = [prd] in + defm Z : avx512_store<opc, OpcodeStr, _.info512, alignedstore512, + masked_store_aligned512>, EVEX_V512; + + let Predicates = [prd, HasVLX] in { + defm Z256 : avx512_store<opc, OpcodeStr, _.info256, alignedstore256, + masked_store_aligned256>, EVEX_V256; + defm Z128 : avx512_store<opc, OpcodeStr, _.info128, alignedstore, + masked_store_aligned128>, EVEX_V128; + } +} -defm VMOVAPD : avx512_load_vl<0x28, "vmovapd", "alignedload", "f", "64", - "8", "4", "2", SSEPackedDouble, HasAVX512>, - avx512_store_vl<0x29, "vmovapd", "alignedstore", - "512", 
"256", "", "f", "64", "8", "4", "2", - SSEPackedDouble, HasAVX512>, - PD, VEX_W, EVEX_CD8<64, CD8VF>; - -defm VMOVUPS : avx512_load_vl<0x10, "vmovups", "load", "f", "32", - "16", "8", "4", SSEPackedSingle, HasAVX512>, - avx512_store_vl<0x11, "vmovups", "store", "", "", "", "f", "32", - "16", "8", "4", SSEPackedSingle, HasAVX512>, +defm VMOVAPS : avx512_alignedload_vl<0x28, "vmovaps", avx512vl_f32_info, + HasAVX512>, + avx512_alignedstore_vl<0x29, "vmovaps", avx512vl_f32_info, + HasAVX512>, PS, EVEX_CD8<32, CD8VF>; + +defm VMOVAPD : avx512_alignedload_vl<0x28, "vmovapd", avx512vl_f64_info, + HasAVX512>, + avx512_alignedstore_vl<0x29, "vmovapd", avx512vl_f64_info, + HasAVX512>, PD, VEX_W, EVEX_CD8<64, CD8VF>; + +defm VMOVUPS : avx512_load_vl<0x10, "vmovups", avx512vl_f32_info, HasAVX512>, + avx512_store_vl<0x11, "vmovups", avx512vl_f32_info, HasAVX512>, PS, EVEX_CD8<32, CD8VF>; -defm VMOVUPD : avx512_load_vl<0x10, "vmovupd", "load", "f", "64", - "8", "4", "2", SSEPackedDouble, HasAVX512, 0>, - avx512_store_vl<0x11, "vmovupd", "store", "", "", "", "f", "64", - "8", "4", "2", SSEPackedDouble, HasAVX512>, - PD, VEX_W, EVEX_CD8<64, CD8VF>; +defm VMOVUPD : avx512_load_vl<0x10, "vmovupd", avx512vl_f64_info, HasAVX512, 0>, + avx512_store_vl<0x11, "vmovupd", avx512vl_f64_info, HasAVX512>, + PD, VEX_W, EVEX_CD8<64, CD8VF>; def: Pat<(v8f64 (int_x86_avx512_mask_loadu_pd_512 addr:$ptr, (bc_v8f64 (v16i32 immAllZerosV)), GR8:$mask)), @@ -2276,6 +2311,7 @@ def: Pat<(int_x86_avx512_mask_store_pd_512 addr:$ptr, (v8f64 VR512:$src), (VMOVAPDZmrk addr:$ptr, (v8i1 (COPY_TO_REGCLASS GR8:$mask, VK8WM)), VR512:$src)>; +let Predicates = [HasAVX512, NoVLX] in { def: Pat<(masked_store addr:$ptr, VK8WM:$mask, (v8f32 VR256:$src)), (VMOVUPSZmrk addr:$ptr, (v16i1 (COPY_TO_REGCLASS VK8WM:$mask, VK16WM)), @@ -2285,73 +2321,36 @@ def: Pat<(v8f32 (masked_load addr:$ptr, VK8WM:$mask, undef)), (v8f32 (EXTRACT_SUBREG (v16f32 (VMOVUPSZrmkz (v16i1 (COPY_TO_REGCLASS VK8WM:$mask, VK16WM)), addr:$ptr)), sub_ymm))>; -def: Pat<(masked_store addr:$ptr, VK16WM:$mask, (v16f32 VR512:$src)), - (VMOVUPSZmrk addr:$ptr, VK16WM:$mask, VR512:$src)>; - -def: Pat<(masked_store addr:$ptr, VK8WM:$mask, (v8f64 VR512:$src)), - (VMOVUPDZmrk addr:$ptr, VK8WM:$mask, VR512:$src)>; - -def: Pat<(v16f32 (masked_load addr:$ptr, VK16WM:$mask, undef)), - (VMOVUPSZrmkz VK16WM:$mask, addr:$ptr)>; - -def: Pat<(v16f32 (masked_load addr:$ptr, VK16WM:$mask, - (bc_v16f32 (v16i32 immAllZerosV)))), - (VMOVUPSZrmkz VK16WM:$mask, addr:$ptr)>; - -def: Pat<(v16f32 (masked_load addr:$ptr, VK16WM:$mask, (v16f32 VR512:$src0))), - (VMOVUPSZrmk VR512:$src0, VK16WM:$mask, addr:$ptr)>; - -def: Pat<(v8f64 (masked_load addr:$ptr, VK8WM:$mask, undef)), - (VMOVUPDZrmkz VK8WM:$mask, addr:$ptr)>; - -def: Pat<(v8f64 (masked_load addr:$ptr, VK8WM:$mask, - (bc_v8f64 (v16i32 immAllZerosV)))), - (VMOVUPDZrmkz VK8WM:$mask, addr:$ptr)>; - -def: Pat<(v8f64 (masked_load addr:$ptr, VK8WM:$mask, (v8f64 VR512:$src0))), - (VMOVUPDZrmk VR512:$src0, VK8WM:$mask, addr:$ptr)>; - def: Pat<(v8f32 (masked_load addr:$ptr, VK8WM:$mask, (v8f32 VR256:$src0))), (v8f32 (EXTRACT_SUBREG (v16f32 (VMOVUPSZrmk (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)), VR256:$src0, sub_ymm), (v16i1 (COPY_TO_REGCLASS VK8WM:$mask, VK16WM)), addr:$ptr)), sub_ymm))>; +} -defm VMOVDQA32 : avx512_load_vl<0x6F, "vmovdqa32", "alignedload", "i", "32", - "16", "8", "4", SSEPackedInt, HasAVX512>, - avx512_store_vl<0x7F, "vmovdqa32", "alignedstore", - "512", "256", "", "i", "32", "16", "8", "4", - SSEPackedInt, HasAVX512>, - PD, EVEX_CD8<32, 
CD8VF>; - -defm VMOVDQA64 : avx512_load_vl<0x6F, "vmovdqa64", "alignedload", "i", "64", - "8", "4", "2", SSEPackedInt, HasAVX512>, - avx512_store_vl<0x7F, "vmovdqa64", "alignedstore", - "512", "256", "", "i", "64", "8", "4", "2", - SSEPackedInt, HasAVX512>, - PD, VEX_W, EVEX_CD8<64, CD8VF>; - -defm VMOVDQU8 : avx512_load_vl<0x6F, "vmovdqu8", "load", "i", "8", - "64", "32", "16", SSEPackedInt, HasBWI>, - avx512_store_vl<0x7F, "vmovdqu8", "store", "", "", "", - "i", "8", "64", "32", "16", SSEPackedInt, +defm VMOVDQA32 : avx512_alignedload_vl<0x6F, "vmovdqa32", avx512vl_i32_info, + HasAVX512>, + avx512_alignedstore_vl<0x7F, "vmovdqa32", avx512vl_i32_info, + HasAVX512>, PD, EVEX_CD8<32, CD8VF>; + +defm VMOVDQA64 : avx512_alignedload_vl<0x6F, "vmovdqa64", avx512vl_i64_info, + HasAVX512>, + avx512_alignedstore_vl<0x7F, "vmovdqa64", avx512vl_i64_info, + HasAVX512>, PD, VEX_W, EVEX_CD8<64, CD8VF>; + +defm VMOVDQU8 : avx512_load_vl<0x6F, "vmovdqu8", avx512vl_i8_info, HasBWI>, + avx512_store_vl<0x7F, "vmovdqu8", avx512vl_i8_info, HasBWI>, XD, EVEX_CD8<8, CD8VF>; -defm VMOVDQU16 : avx512_load_vl<0x6F, "vmovdqu16", "load", "i", "16", - "32", "16", "8", SSEPackedInt, HasBWI>, - avx512_store_vl<0x7F, "vmovdqu16", "store", "", "", "", - "i", "16", "32", "16", "8", SSEPackedInt, +defm VMOVDQU16 : avx512_load_vl<0x6F, "vmovdqu16", avx512vl_i16_info, HasBWI>, + avx512_store_vl<0x7F, "vmovdqu16", avx512vl_i16_info, HasBWI>, XD, VEX_W, EVEX_CD8<16, CD8VF>; -defm VMOVDQU32 : avx512_load_vl<0x6F, "vmovdqu32", "load", "i", "32", - "16", "8", "4", SSEPackedInt, HasAVX512>, - avx512_store_vl<0x7F, "vmovdqu32", "store", "", "", "", - "i", "32", "16", "8", "4", SSEPackedInt, +defm VMOVDQU32 : avx512_load_vl<0x6F, "vmovdqu32", avx512vl_i32_info, HasAVX512>, + avx512_store_vl<0x7F, "vmovdqu32", avx512vl_i32_info, HasAVX512>, XS, EVEX_CD8<32, CD8VF>; -defm VMOVDQU64 : avx512_load_vl<0x6F, "vmovdqu64", "load", "i", "64", - "8", "4", "2", SSEPackedInt, HasAVX512>, - avx512_store_vl<0x7F, "vmovdqu64", "store", "", "", "", - "i", "64", "8", "4", "2", SSEPackedInt, +defm VMOVDQU64 : avx512_load_vl<0x6F, "vmovdqu64", avx512vl_i64_info, HasAVX512>, + avx512_store_vl<0x7F, "vmovdqu64", avx512vl_i64_info, HasAVX512>, XS, VEX_W, EVEX_CD8<64, CD8VF>; def: Pat<(v16i32 (int_x86_avx512_mask_loadu_d_512 addr:$ptr, @@ -2389,37 +2388,8 @@ def : Pat<(v16i32 (vselect VK16WM:$mask, (v16i32 immAllZerosV), (v16i32 VR512:$src))), (VMOVDQU32Zrrkz (KNOTWrr VK16WM:$mask), VR512:$src)>; } - -def: Pat<(v16i32 (masked_load addr:$ptr, VK16WM:$mask, (v16i32 immAllZerosV))), - (VMOVDQU32Zrmkz VK16WM:$mask, addr:$ptr)>; - -def: Pat<(v16i32 (masked_load addr:$ptr, VK16WM:$mask, undef)), - (VMOVDQU32Zrmkz VK16WM:$mask, addr:$ptr)>; - -def: Pat<(v16i32 (masked_load addr:$ptr, VK16WM:$mask, (v16i32 VR512:$src0))), - (VMOVDQU32Zrmk VR512:$src0, VK16WM:$mask, addr:$ptr)>; - -def: Pat<(v8i64 (masked_load addr:$ptr, VK8WM:$mask, - (bc_v8i64 (v16i32 immAllZerosV)))), - (VMOVDQU64Zrmkz VK8WM:$mask, addr:$ptr)>; - -def: Pat<(v8i64 (masked_load addr:$ptr, VK8WM:$mask, undef)), - (VMOVDQU64Zrmkz VK8WM:$mask, addr:$ptr)>; - -def: Pat<(v8i64 (masked_load addr:$ptr, VK8WM:$mask, (v8i64 VR512:$src0))), - (VMOVDQU64Zrmk VR512:$src0, VK8WM:$mask, addr:$ptr)>; - -def: Pat<(masked_store addr:$ptr, VK16WM:$mask, (v16i32 VR512:$src)), - (VMOVDQU32Zmrk addr:$ptr, VK16WM:$mask, VR512:$src)>; - -def: Pat<(masked_store addr:$ptr, VK8WM:$mask, (v8i64 VR512:$src)), - (VMOVDQU64Zmrk addr:$ptr, VK8WM:$mask, VR512:$src)>; - -// SKX replacement -def: Pat<(masked_store addr:$ptr, 
VK8WM:$mask, (v8i32 VR256:$src)), - (VMOVDQU32Z256mrk addr:$ptr, VK8WM:$mask, VR256:$src)>; - -// KNL replacement +// NoVLX patterns +let Predicates = [HasAVX512, NoVLX] in { def: Pat<(masked_store addr:$ptr, VK8WM:$mask, (v8i32 VR256:$src)), (VMOVDQU32Zmrk addr:$ptr, (v16i1 (COPY_TO_REGCLASS VK8WM:$mask, VK16WM)), @@ -2428,7 +2398,7 @@ def: Pat<(masked_store addr:$ptr, VK8WM:$mask, (v8i32 VR256:$src)), def: Pat<(v8i32 (masked_load addr:$ptr, VK8WM:$mask, undef)), (v8i32 (EXTRACT_SUBREG (v16i32 (VMOVDQU32Zrmkz (v16i1 (COPY_TO_REGCLASS VK8WM:$mask, VK16WM)), addr:$ptr)), sub_ymm))>; - +} // Move Int Doubleword to Packed Double Int // @@ -3243,28 +3213,95 @@ defm VPANDN : avx512_binop_rm_vl_dq<0xDF, 0xDF, "vpandn", X86andnp, //===----------------------------------------------------------------------===// // AVX-512 FP arithmetic //===----------------------------------------------------------------------===// +multiclass avx512_fp_scalar<bits<8> opc, string OpcodeStr,X86VectorVTInfo _, + SDNode OpNode, SDNode VecNode, OpndItins itins, + bit IsCommutable> { -multiclass avx512_binop_s<bits<8> opc, string OpcodeStr, SDNode OpNode, - SizeItins itins> { - defm SSZ : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "ss"), OpNode, FR32X, - f32mem, itins.s, 0>, XS, EVEX_4V, VEX_LIG, - EVEX_CD8<32, CD8VT1>; - defm SDZ : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "sd"), OpNode, FR64X, - f64mem, itins.d, 0>, XD, VEX_W, EVEX_4V, VEX_LIG, - EVEX_CD8<64, CD8VT1>; + defm rr_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.RC:$src2), OpcodeStr, + "$src2, $src1", "$src1, $src2", + (VecNode (_.VT _.RC:$src1), (_.VT _.RC:$src2), + (i32 FROUND_CURRENT)), + "", itins.rr, IsCommutable>; + + defm rm_Int : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr, + "$src2, $src1", "$src1, $src2", + (VecNode (_.VT _.RC:$src1), + (_.VT (scalar_to_vector (_.ScalarLdFrag addr:$src2))), + (i32 FROUND_CURRENT)), + "", itins.rm, IsCommutable>; + let isCodeGenOnly = 1, isCommutable = IsCommutable, + Predicates = [HasAVX512] in { + def rr : I< opc, MRMSrcReg, (outs _.FRC:$dst), + (ins _.FRC:$src1, _.FRC:$src2), + OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set _.FRC:$dst, (OpNode _.FRC:$src1, _.FRC:$src2))], + itins.rr>; + def rm : I< opc, MRMSrcMem, (outs _.FRC:$dst), + (ins _.FRC:$src1, _.ScalarMemOp:$src2), + OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set _.FRC:$dst, (OpNode _.FRC:$src1, + (_.ScalarLdFrag addr:$src2)))], itins.rr>; + } } -let isCommutable = 1 in { -defm VADD : avx512_binop_s<0x58, "add", fadd, SSE_ALU_ITINS_S>; -defm VMUL : avx512_binop_s<0x59, "mul", fmul, SSE_ALU_ITINS_S>; -defm VMIN : avx512_binop_s<0x5D, "min", X86fmin, SSE_ALU_ITINS_S>; -defm VMAX : avx512_binop_s<0x5F, "max", X86fmax, SSE_ALU_ITINS_S>; +multiclass avx512_fp_scalar_round<bits<8> opc, string OpcodeStr,X86VectorVTInfo _, + SDNode VecNode, OpndItins itins, bit IsCommutable> { + + defm rrb : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.RC:$src2, AVX512RC:$rc), OpcodeStr, + "$rc, $src2, $src1", "$src1, $src2, $rc", + (VecNode (_.VT _.RC:$src1), (_.VT _.RC:$src2), + (i32 imm:$rc)), "", itins.rr, IsCommutable>, + EVEX_B, EVEX_RC; } -let isCommutable = 0 in { -defm VSUB : avx512_binop_s<0x5C, "sub", fsub, SSE_ALU_ITINS_S>; -defm VDIV : avx512_binop_s<0x5E, "div", fdiv, SSE_ALU_ITINS_S>; +multiclass avx512_fp_scalar_sae<bits<8> opc, string OpcodeStr,X86VectorVTInfo _, + SDNode VecNode, OpndItins 
itins, bit IsCommutable> { + + defm rrb : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.RC:$src2), OpcodeStr, + "$src2, $src1", "$src1, $src2", + (VecNode (_.VT _.RC:$src1), (_.VT _.RC:$src2), + (i32 FROUND_NO_EXC)), "{sae}">, EVEX_B; } +multiclass avx512_binop_s_round<bits<8> opc, string OpcodeStr, SDNode OpNode, + SDNode VecNode, + SizeItins itins, bit IsCommutable> { + defm SSZ : avx512_fp_scalar<opc, OpcodeStr#"ss", f32x_info, OpNode, VecNode, + itins.s, IsCommutable>, + avx512_fp_scalar_round<opc, OpcodeStr#"ss", f32x_info, VecNode, + itins.s, IsCommutable>, + XS, EVEX_4V, VEX_LIG, EVEX_CD8<32, CD8VT1>; + defm SDZ : avx512_fp_scalar<opc, OpcodeStr#"sd", f64x_info, OpNode, VecNode, + itins.d, IsCommutable>, + avx512_fp_scalar_round<opc, OpcodeStr#"sd", f64x_info, VecNode, + itins.d, IsCommutable>, + XD, VEX_W, EVEX_4V, VEX_LIG, EVEX_CD8<64, CD8VT1>; +} + +multiclass avx512_binop_s_sae<bits<8> opc, string OpcodeStr, SDNode OpNode, + SDNode VecNode, + SizeItins itins, bit IsCommutable> { + defm SSZ : avx512_fp_scalar<opc, OpcodeStr#"ss", f32x_info, OpNode, VecNode, + itins.s, IsCommutable>, + avx512_fp_scalar_sae<opc, OpcodeStr#"ss", f32x_info, VecNode, + itins.s, IsCommutable>, + XS, EVEX_4V, VEX_LIG, EVEX_CD8<32, CD8VT1>; + defm SDZ : avx512_fp_scalar<opc, OpcodeStr#"sd", f64x_info, OpNode, VecNode, + itins.d, IsCommutable>, + avx512_fp_scalar_sae<opc, OpcodeStr#"sd", f64x_info, VecNode, + itins.d, IsCommutable>, + XD, VEX_W, EVEX_4V, VEX_LIG, EVEX_CD8<64, CD8VT1>; +} +defm VADD : avx512_binop_s_round<0x58, "vadd", fadd, X86faddRnd, SSE_ALU_ITINS_S, 1>; +defm VMUL : avx512_binop_s_round<0x59, "vmul", fmul, X86fmulRnd, SSE_ALU_ITINS_S, 1>; +defm VSUB : avx512_binop_s_round<0x5C, "vsub", fsub, X86fsubRnd, SSE_ALU_ITINS_S, 0>; +defm VDIV : avx512_binop_s_round<0x5E, "vdiv", fdiv, X86fdivRnd, SSE_ALU_ITINS_S, 0>; +defm VMIN : avx512_binop_s_sae <0x5D, "vmin", X86fmin, X86fminRnd, SSE_ALU_ITINS_S, 1>; +defm VMAX : avx512_binop_s_sae <0x5F, "vmax", X86fmax, X86fmaxRnd, SSE_ALU_ITINS_S, 1>; + multiclass avx512_fp_packed<bits<8> opc, string OpcodeStr, SDNode OpNode, X86VectorVTInfo _, bit IsCommutable> { defm rr: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), @@ -3411,15 +3448,27 @@ multiclass avx512_shift_rmi<bits<8> opc, Format ImmFormR, Format ImmFormM, "$src2, $src1", "$src1, $src2", (_.VT (OpNode _.RC:$src1, (i8 imm:$src2))), " ", SSE_INTSHIFT_ITINS_P.rr>, AVX512BIi8Base, EVEX_4V; + let mayLoad = 1 in defm mi : AVX512_maskable<opc, ImmFormM, _, (outs _.RC:$dst), (ins _.MemOp:$src1, u8imm:$src2), OpcodeStr, "$src2, $src1", "$src1, $src2", - (_.VT (OpNode (_.LdFrag addr:$src1), (i8 imm:$src2))), + (_.VT (OpNode (_.VT (bitconvert (_.LdFrag addr:$src1))), + (i8 imm:$src2))), " ", SSE_INTSHIFT_ITINS_P.rm>, AVX512BIi8Base, EVEX_4V; } +multiclass avx512_shift_rmbi<bits<8> opc, Format ImmFormM, + string OpcodeStr, SDNode OpNode, X86VectorVTInfo _> { + let mayLoad = 1 in + defm mbi : AVX512_maskable<opc, ImmFormM, _, (outs _.RC:$dst), + (ins _.ScalarMemOp:$src1, u8imm:$src2), OpcodeStr, + "$src2, ${src1}"##_.BroadcastStr, "${src1}"##_.BroadcastStr##", $src2", + (_.VT (OpNode (X86VBroadcast (_.ScalarLdFrag addr:$src1)), (i8 imm:$src2))), + " ", SSE_INTSHIFT_ITINS_P.rm>, AVX512BIi8Base, EVEX_4V, EVEX_B; +} + multiclass avx512_shift_rrm<bits<8> opc, string OpcodeStr, SDNode OpNode, - ValueType SrcVT, PatFrag bc_frag, X86VectorVTInfo _> { + ValueType SrcVT, PatFrag bc_frag, X86VectorVTInfo _> { // src2 is always 128-bit defm rr : AVX512_maskable<opc, MRMSrcReg, 
_, (outs _.RC:$dst), (ins _.RC:$src1, VR128X:$src2), OpcodeStr, @@ -3430,46 +3479,95 @@ multiclass avx512_shift_rrm<bits<8> opc, string OpcodeStr, SDNode OpNode, (ins _.RC:$src1, i128mem:$src2), OpcodeStr, "$src2, $src1", "$src1, $src2", (_.VT (OpNode _.RC:$src1, (bc_frag (loadv2i64 addr:$src2)))), - " ", SSE_INTSHIFT_ITINS_P.rm>, AVX512BIBase, EVEX_4V; + " ", SSE_INTSHIFT_ITINS_P.rm>, AVX512BIBase, + EVEX_4V; } multiclass avx512_shift_sizes<bits<8> opc, string OpcodeStr, SDNode OpNode, - ValueType SrcVT, PatFrag bc_frag, X86VectorVTInfo _> { - defm Z : avx512_shift_rrm<opc, OpcodeStr, OpNode, SrcVT, bc_frag, _>, EVEX_V512; + ValueType SrcVT, PatFrag bc_frag, + AVX512VLVectorVTInfo VTInfo, Predicate prd> { + let Predicates = [prd] in + defm Z : avx512_shift_rrm<opc, OpcodeStr, OpNode, SrcVT, bc_frag, + VTInfo.info512>, EVEX_V512, + EVEX_CD8<VTInfo.info512.EltSize, CD8VQ> ; + let Predicates = [prd, HasVLX] in { + defm Z256 : avx512_shift_rrm<opc, OpcodeStr, OpNode, SrcVT, bc_frag, + VTInfo.info256>, EVEX_V256, + EVEX_CD8<VTInfo.info256.EltSize, CD8VH>; + defm Z128 : avx512_shift_rrm<opc, OpcodeStr, OpNode, SrcVT, bc_frag, + VTInfo.info128>, EVEX_V128, + EVEX_CD8<VTInfo.info128.EltSize, CD8VF>; + } } -multiclass avx512_shift_types<bits<8> opcd, bits<8> opcq, string OpcodeStr, - SDNode OpNode> { +multiclass avx512_shift_types<bits<8> opcd, bits<8> opcq, bits<8> opcw, + string OpcodeStr, SDNode OpNode> { defm D : avx512_shift_sizes<opcd, OpcodeStr#"d", OpNode, v4i32, bc_v4i32, - v16i32_info>, EVEX_CD8<32, CD8VQ>; + avx512vl_i32_info, HasAVX512>; defm Q : avx512_shift_sizes<opcq, OpcodeStr#"q", OpNode, v2i64, bc_v2i64, - v8i64_info>, EVEX_CD8<64, CD8VQ>, VEX_W; + avx512vl_i64_info, HasAVX512>, VEX_W; + defm W : avx512_shift_sizes<opcw, OpcodeStr#"w", OpNode, v8i16, bc_v8i16, + avx512vl_i16_info, HasBWI>; +} + +multiclass avx512_shift_rmi_sizes<bits<8> opc, Format ImmFormR, Format ImmFormM, + string OpcodeStr, SDNode OpNode, + AVX512VLVectorVTInfo VTInfo> { + let Predicates = [HasAVX512] in + defm Z: avx512_shift_rmi<opc, ImmFormR, ImmFormM, OpcodeStr, OpNode, + VTInfo.info512>, + avx512_shift_rmbi<opc, ImmFormM, OpcodeStr, OpNode, + VTInfo.info512>, EVEX_V512; + let Predicates = [HasAVX512, HasVLX] in { + defm Z256: avx512_shift_rmi<opc, ImmFormR, ImmFormM, OpcodeStr, OpNode, + VTInfo.info256>, + avx512_shift_rmbi<opc, ImmFormM, OpcodeStr, OpNode, + VTInfo.info256>, EVEX_V256; + defm Z128: avx512_shift_rmi<opc, ImmFormR, ImmFormM, OpcodeStr, OpNode, + VTInfo.info128>, + avx512_shift_rmbi<opc, ImmFormM, OpcodeStr, OpNode, + VTInfo.info128>, EVEX_V128; + } } -defm VPSRLDZ : avx512_shift_rmi<0x72, MRM2r, MRM2m, "vpsrld", X86vsrli, - v16i32_info>, - EVEX_V512, EVEX_CD8<32, CD8VF>; -defm VPSRLQZ : avx512_shift_rmi<0x73, MRM2r, MRM2m, "vpsrlq", X86vsrli, - v8i64_info>, EVEX_V512, - EVEX_CD8<64, CD8VF>, VEX_W; +multiclass avx512_shift_rmi_w<bits<8> opcw, + Format ImmFormR, Format ImmFormM, + string OpcodeStr, SDNode OpNode> { + let Predicates = [HasBWI] in + defm WZ: avx512_shift_rmi<opcw, ImmFormR, ImmFormM, OpcodeStr, OpNode, + v32i16_info>, EVEX_V512; + let Predicates = [HasVLX, HasBWI] in { + defm WZ256: avx512_shift_rmi<opcw, ImmFormR, ImmFormM, OpcodeStr, OpNode, + v16i16x_info>, EVEX_V256; + defm WZ128: avx512_shift_rmi<opcw, ImmFormR, ImmFormM, OpcodeStr, OpNode, + v8i16x_info>, EVEX_V128; + } +} -defm VPSLLDZ : avx512_shift_rmi<0x72, MRM6r, MRM6m, "vpslld", X86vshli, - v16i32_info>, EVEX_V512, - EVEX_CD8<32, CD8VF>; -defm VPSLLQZ : avx512_shift_rmi<0x73, MRM6r, MRM6m, "vpsllq", X86vshli, - 
-                                v8i64_info>, EVEX_V512,
-                                EVEX_CD8<64, CD8VF>, VEX_W;
+multiclass avx512_shift_rmi_dq<bits<8> opcd, bits<8> opcq,
+                               Format ImmFormR, Format ImmFormM,
+                               string OpcodeStr, SDNode OpNode> {
+  defm D: avx512_shift_rmi_sizes<opcd, ImmFormR, ImmFormM, OpcodeStr#"d", OpNode,
+                                 avx512vl_i32_info>, EVEX_CD8<32, CD8VF>;
+  defm Q: avx512_shift_rmi_sizes<opcq, ImmFormR, ImmFormM, OpcodeStr#"q", OpNode,
+                                 avx512vl_i64_info>, EVEX_CD8<64, CD8VF>, VEX_W;
+}

-defm VPSRADZ : avx512_shift_rmi<0x72, MRM4r, MRM4m, "vpsrad", X86vsrai,
-                                v16i32_info>,
-                                EVEX_V512, EVEX_CD8<32, CD8VF>;
-defm VPSRAQZ : avx512_shift_rmi<0x72, MRM4r, MRM4m, "vpsraq", X86vsrai,
-                                v8i64_info>, EVEX_V512,
-                                EVEX_CD8<64, CD8VF>, VEX_W;
+defm VPSRL : avx512_shift_rmi_dq<0x72, 0x73, MRM2r, MRM2m, "vpsrl", X86vsrli>,
+             avx512_shift_rmi_w<0x71, MRM2r, MRM2m, "vpsrlw", X86vsrli>;

-defm VPSLL : avx512_shift_types<0xF2, 0xF3, "vpsll", X86vshl>;
-defm VPSRA : avx512_shift_types<0xE2, 0xE2, "vpsra", X86vsra>;
-defm VPSRL : avx512_shift_types<0xD2, 0xD3, "vpsrl", X86vsrl>;
+defm VPSLL : avx512_shift_rmi_dq<0x72, 0x73, MRM6r, MRM6m, "vpsll", X86vshli>,
+             avx512_shift_rmi_w<0x71, MRM6r, MRM6m, "vpsllw", X86vshli>;
+
+defm VPSRA : avx512_shift_rmi_dq<0x72, 0x73, MRM4r, MRM4m, "vpsra", X86vsrai>,
+             avx512_shift_rmi_w<0x71, MRM4r, MRM4m, "vpsraw", X86vsrai>;
+
+defm VPROR : avx512_shift_rmi_dq<0x72, 0x72, MRM0r, MRM0m, "vpror", rotr>;
+defm VPROL : avx512_shift_rmi_dq<0x72, 0x72, MRM1r, MRM1m, "vprol", rotl>;
+
+defm VPSLL : avx512_shift_types<0xF2, 0xF3, 0xF1, "vpsll", X86vshl>;
+defm VPSRA : avx512_shift_types<0xE2, 0xE2, 0xE1, "vpsra", X86vsra>;
+defm VPSRL : avx512_shift_types<0xD2, 0xD3, 0xD1, "vpsrl", X86vsrl>;
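Two user-visible additions fall out of the defms above: rotate-by-immediate (VPROR/VPROL) and the EVEX.b broadcast-source form of the immediate shifts. A hedged sketch with the standard intrinsics (names assumed from the intrinsics guide, not from this patch):

    #include <immintrin.h>
    // Rotate-by-immediate, new with VPROR/VPROL (AVX512F).
    __m512i ror7(__m512i v) { return _mm512_ror_epi32(v, 7); }
    // A shift whose source is a broadcast load; a compiler may fold this to
    // the avx512_shift_rmbi form, e.g. vpsrld zmm0, dword ptr [p]{1to16}, 3.
    __m512i srl_bcast(const int *p) {
      return _mm512_srli_epi32(_mm512_set1_epi32(*p), 3);
    }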
 //===-------------------------------------------------------------------===//
 // Variable Bit Shifts
@@ -3481,29 +3579,71 @@ multiclass avx512_var_shift<bits<8> opc, string OpcodeStr, SDNode OpNode,
                    "$src2, $src1", "$src1, $src2",
                    (_.VT (OpNode _.RC:$src1, (_.VT _.RC:$src2))),
                    " ", SSE_INTSHIFT_ITINS_P.rr>, AVX5128IBase, EVEX_4V;
+  let mayLoad = 1 in
   defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
                    (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr,
                    "$src2, $src1", "$src1, $src2",
                    (_.VT (OpNode _.RC:$src1, (_.LdFrag addr:$src2))),
-                   " ", SSE_INTSHIFT_ITINS_P.rm>, AVX5128IBase, EVEX_4V;
+                   " ", SSE_INTSHIFT_ITINS_P.rm>, AVX5128IBase, EVEX_4V,
+                   EVEX_CD8<_.EltSize, CD8VF>;
 }

+multiclass avx512_var_shift_mb<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                               X86VectorVTInfo _> {
+  let mayLoad = 1 in
+  defm rmb : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
+                   (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr,
+                   "${src2}"##_.BroadcastStr##", $src1",
+                   "$src1, ${src2}"##_.BroadcastStr,
+                   (_.VT (OpNode _.RC:$src1, (_.VT (X86VBroadcast
+                                              (_.ScalarLdFrag addr:$src2))))),
+                   " ", SSE_INTSHIFT_ITINS_P.rm>, AVX5128IBase, EVEX_B,
+                   EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>;
+}
 multiclass avx512_var_shift_sizes<bits<8> opc, string OpcodeStr, SDNode OpNode,
                                   AVX512VLVectorVTInfo _> {
-  defm Z : avx512_var_shift<opc, OpcodeStr, OpNode, _.info512>, EVEX_V512;
+  let Predicates = [HasAVX512] in
+  defm Z : avx512_var_shift<opc, OpcodeStr, OpNode, _.info512>,
+           avx512_var_shift_mb<opc, OpcodeStr, OpNode, _.info512>, EVEX_V512;
+
+  let Predicates = [HasAVX512, HasVLX] in {
+    defm Z256 : avx512_var_shift<opc, OpcodeStr, OpNode, _.info256>,
+                avx512_var_shift_mb<opc, OpcodeStr, OpNode, _.info256>, EVEX_V256;
+    defm Z128 : avx512_var_shift<opc, OpcodeStr, OpNode, _.info128>,
+                avx512_var_shift_mb<opc, OpcodeStr, OpNode, _.info128>, EVEX_V128;
+  }
 }

 multiclass avx512_var_shift_types<bits<8> opc, string OpcodeStr, SDNode OpNode> {
   defm D : avx512_var_shift_sizes<opc, OpcodeStr#"d", OpNode,
-                                  avx512vl_i32_info>, EVEX_CD8<32, CD8VQ>;
+                                  avx512vl_i32_info>;
   defm Q : avx512_var_shift_sizes<opc, OpcodeStr#"q", OpNode,
-                                  avx512vl_i64_info>, EVEX_CD8<64, CD8VQ>, VEX_W;
+                                  avx512vl_i64_info>, VEX_W;
 }

-defm VPSLLV : avx512_var_shift_types<0x47, "vpsllv", shl>;
-defm VPSRAV : avx512_var_shift_types<0x46, "vpsrav", sra>;
-defm VPSRLV : avx512_var_shift_types<0x45, "vpsrlv", srl>;
+multiclass avx512_var_shift_w<bits<8> opc, string OpcodeStr,
+                              SDNode OpNode> {
+  let Predicates = [HasBWI] in
+  defm WZ: avx512_var_shift<opc, OpcodeStr, OpNode, v32i16_info>,
+           EVEX_V512, VEX_W;
+  let Predicates = [HasVLX, HasBWI] in {
+
+    defm WZ256: avx512_var_shift<opc, OpcodeStr, OpNode, v16i16x_info>,
+                EVEX_V256, VEX_W;
+    defm WZ128: avx512_var_shift<opc, OpcodeStr, OpNode, v8i16x_info>,
+                EVEX_V128, VEX_W;
+  }
+}
+
+defm VPSLLV : avx512_var_shift_types<0x47, "vpsllv", shl>,
+              avx512_var_shift_w<0x12, "vpsllvw", shl>;
+defm VPSRAV : avx512_var_shift_types<0x46, "vpsrav", sra>,
+              avx512_var_shift_w<0x11, "vpsravw", sra>;
+defm VPSRLV : avx512_var_shift_types<0x45, "vpsrlv", srl>,
+              avx512_var_shift_w<0x10, "vpsrlvw", srl>;
+defm VPRORV : avx512_var_shift_types<0x14, "vprorv", rotr>;
+defm VPROLV : avx512_var_shift_types<0x15, "vprolv", rotl>;
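For reference, the per-element word shifts and variable rotates wired up above surface roughly as follows; a sketch assuming the standard AVX-512 intrinsic names:

    #include <immintrin.h>
    // Per-element 16-bit shifts are new with AVX512BW (VPSLLVW/VPSRAVW/VPSRLVW).
    __m512i sllv16(__m512i v, __m512i amounts) { return _mm512_sllv_epi16(v, amounts); }
    // Variable rotates map to VPRORV/VPROLV (AVX512F).
    __m512i rorv32(__m512i v, __m512i amounts) { return _mm512_rorv_epi32(v, amounts); }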
 //===----------------------------------------------------------------------===//
 // AVX-512 - MOVDDUP
@@ -4919,81 +5059,74 @@ defm VPMOVSXDQZ: avx512_extend<0x25, "vpmovsxdq", VK8WM, VR512, VR256X, X86vsext
 //===----------------------------------------------------------------------===//
 // GATHER - SCATTER Operations

-multiclass avx512_gather<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
-                         X86MemOperand memop, PatFrag GatherNode> {
-let mayLoad = 1, hasTwoExplicitDefs = 1,
+multiclass avx512_gather<bits<8> opc, string OpcodeStr, RegisterClass KRC,
+                         RegisterClass RC, X86MemOperand memop> {
+let mayLoad = 1,
   Constraints = "@earlyclobber $dst, $src1 = $dst, $mask = $mask_wb" in
-  def rm : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst, _.KRCWM:$mask_wb),
-            (ins _.RC:$src1, _.KRCWM:$mask, memop:$src2),
+  def rm : AVX5128I<opc, MRMSrcMem, (outs RC:$dst, KRC:$mask_wb),
+            (ins RC:$src1, KRC:$mask, memop:$src2),
             !strconcat(OpcodeStr,
             "\t{$src2, ${dst} {${mask}}|${dst} {${mask}}, $src2}"),
-            [(set _.RC:$dst, _.KRCWM:$mask_wb,
-              (_.VT (GatherNode (_.VT _.RC:$src1), _.KRCWM:$mask,
-                     vectoraddr:$src2)))]>, EVEX, EVEX_K,
-            EVEX_CD8<_.EltSize, CD8VT1>;
+            []>, EVEX, EVEX_K;
 }

 let ExeDomain = SSEPackedDouble in {
-defm VGATHERDPDZ : avx512_gather<0x92, "vgatherdpd", v8f64_info, vy64xmem,
-                                 mgatherv8i32>, EVEX_V512, VEX_W;
-defm VGATHERQPDZ : avx512_gather<0x93, "vgatherqpd", v8f64_info, vz64mem,
-                                 mgatherv8i64>, EVEX_V512, VEX_W;
+defm VGATHERDPDZ : avx512_gather<0x92, "vgatherdpd", VK8WM, VR512, vy64xmem>,
+                                 EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>;
+defm VGATHERQPDZ : avx512_gather<0x93, "vgatherqpd", VK8WM, VR512, vz64mem>,
+                                 EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>;
 }

 let ExeDomain = SSEPackedSingle in {
-defm VGATHERDPSZ : avx512_gather<0x92, "vgatherdps", v16f32_info, vz32mem,
-                                 mgatherv16i32>, EVEX_V512;
-defm VGATHERQPSZ : avx512_gather<0x93, "vgatherqps", v8f32x_info, vz64mem,
-                                 mgatherv8i64>, EVEX_V512;
+defm VGATHERDPSZ : avx512_gather<0x92, "vgatherdps", VK16WM, VR512, vz32mem>,
+                                 EVEX_V512, EVEX_CD8<32, CD8VT1>;
+defm VGATHERQPSZ : avx512_gather<0x93, "vgatherqps", VK8WM, VR256X, vz64mem>,
+                                 EVEX_V512, EVEX_CD8<32, CD8VT1>;
 }

-defm VPGATHERDQZ : avx512_gather<0x90, "vpgatherdq", v8i64_info, vy64xmem,
-                                 mgatherv8i32>, EVEX_V512, VEX_W;
-defm VPGATHERDDZ : avx512_gather<0x90, "vpgatherdd", v16i32_info, vz32mem,
-                                 mgatherv16i32>, EVEX_V512;
-
-defm VPGATHERQQZ : avx512_gather<0x91, "vpgatherqq", v8i64_info, vz64mem,
-                                 mgatherv8i64>, EVEX_V512, VEX_W;
-defm VPGATHERQDZ : avx512_gather<0x91, "vpgatherqd", v8i32x_info, vz64mem,
-                                 mgatherv8i64>, EVEX_V512;
+defm VPGATHERDQZ : avx512_gather<0x90, "vpgatherdq", VK8WM, VR512, vy64xmem>,
+                                 EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>;
+defm VPGATHERDDZ : avx512_gather<0x90, "vpgatherdd", VK16WM, VR512, vz32mem>,
+                                 EVEX_V512, EVEX_CD8<32, CD8VT1>;
-multiclass avx512_scatter<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
-                          X86MemOperand memop, PatFrag ScatterNode> {
+defm VPGATHERQQZ : avx512_gather<0x91, "vpgatherqq", VK8WM, VR512, vz64mem>,
+                                 EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>;
+defm VPGATHERQDZ : avx512_gather<0x91, "vpgatherqd", VK8WM, VR256X, vz64mem>,
+                                 EVEX_V512, EVEX_CD8<32, CD8VT1>;
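Note the gather definitions now carry empty pattern lists ([]), which appears to back out DAG-level matching of masked_gather; the instructions remain reachable through the intrinsics. A hedged user-level sketch, assuming the standard intrinsic names:

    #include <immintrin.h>
    // Gather eight doubles through 32-bit indices (VGATHERDPD), scale 8:
    __m512d gather8(const double *base, __m256i idx) {
      return _mm512_i32gather_pd(idx, base, 8);
    }
    // The matching scatter (VSCATTERDPD), defined in the hunk that follows:
    void scatter8(double *base, __m256i idx, __m512d v) {
      _mm512_i32scatter_pd(base, idx, v, 8);
    }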
"vpscatterqd", VK8WM, VR256X, vz64mem>, + EVEX_V512, EVEX_CD8<32, CD8VT1>; // prefetch multiclass avx512_gather_scatter_prefetch<bits<8> opc, Format F, string OpcodeStr, diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td index bf515a8..0bdabdf 100644 --- a/lib/Target/X86/X86InstrFragmentsSIMD.td +++ b/lib/Target/X86/X86InstrFragmentsSIMD.td @@ -282,6 +282,8 @@ def X86faddRnd : SDNode<"X86ISD::FADD_RND", SDTFPBinOpRound>; def X86fsubRnd : SDNode<"X86ISD::FSUB_RND", SDTFPBinOpRound>; def X86fmulRnd : SDNode<"X86ISD::FMUL_RND", SDTFPBinOpRound>; def X86fdivRnd : SDNode<"X86ISD::FDIV_RND", SDTFPBinOpRound>; +def X86fmaxRnd : SDNode<"X86ISD::FMAX", SDTFPBinOpRound>; +def X86fminRnd : SDNode<"X86ISD::FMIN", SDTFPBinOpRound>; def X86Fmadd : SDNode<"X86ISD::FMADD", SDTFma>; def X86Fnmadd : SDNode<"X86ISD::FNMADD", SDTFma>; @@ -304,8 +306,6 @@ def X86exp2 : SDNode<"X86ISD::EXP2", STDFp1SrcRm>; def X86rsqrt28s : SDNode<"X86ISD::RSQRT28", STDFp2SrcRm>; def X86rcp28s : SDNode<"X86ISD::RCP28", STDFp2SrcRm>; def X86RndScale : SDNode<"X86ISD::RNDSCALE", STDFp3SrcRm>; -def X86mgather : SDNode<"X86ISD::GATHER", SDTypeProfile<1, 3, - [SDTCisVec<0>, SDTCisVec<1>, SDTCisVec<2>]>>; def SDT_PCMPISTRI : SDTypeProfile<2, 3, [SDTCisVT<0, i32>, SDTCisVT<1, i32>, SDTCisVT<2, v16i8>, SDTCisVT<3, v16i8>, @@ -526,58 +526,6 @@ def unalignednontemporalstore : PatFrag<(ops node:$val, node:$ptr), return false; }]>; -def mgatherv8i32 : PatFrag<(ops node:$src1, node:$src2, node:$src3), - (masked_gather node:$src1, node:$src2, node:$src3) , [{ - //if (MaskedGatherSDNode *Mgt = dyn_cast<MaskedGatherSDNode>(N)) - // return (Mgt->getIndex().getValueType() == MVT::v8i32 || - // Mgt->getBasePtr().getValueType() == MVT::v8i32); - //return false; - return N != 0; -}]>; - -def mgatherv8i64 : PatFrag<(ops node:$src1, node:$src2, node:$src3), - (masked_gather node:$src1, node:$src2, node:$src3) , [{ - //if (MaskedGatherSDNode *Mgt = dyn_cast<MaskedGatherSDNode>(N)) - // return (Mgt->getIndex().getValueType() == MVT::v8i64 || - // Mgt->getBasePtr().getValueType() == MVT::v8i64); - //return false; - return N != 0; -}]>; -def mgatherv16i32 : PatFrag<(ops node:$src1, node:$src2, node:$src3), - (masked_gather node:$src1, node:$src2, node:$src3) , [{ - //if (MaskedGatherSDNode *Mgt = dyn_cast<MaskedGatherSDNode>(N)) - // return (Mgt->getIndex().getValueType() == MVT::v16i32 || - // Mgt->getBasePtr().getValueType() == MVT::v16i32); - //return false; - return N != 0; -}]>; - -def mscatterv8i32 : PatFrag<(ops node:$src1, node:$src2, node:$src3), - (masked_scatter node:$src1, node:$src2, node:$src3) , [{ - //if (MaskedScatterSDNode *Sc = dyn_cast<MaskedScatterSDNode>(N)) - // return (Sc->getIndex().getValueType() == MVT::v8i32 || - // Sc->getBasePtr().getValueType() == MVT::v8i32); - //return false; - return N != 0; -}]>; - -def mscatterv8i64 : PatFrag<(ops node:$src1, node:$src2, node:$src3), - (masked_scatter node:$src1, node:$src2, node:$src3) , [{ - //if (MaskedScatterSDNode *Sc = dyn_cast<MaskedScatterSDNode>(N)) - // return (Sc->getIndex().getValueType() == MVT::v8i64 || - // Sc->getBasePtr().getValueType() == MVT::v8i64); - //return false; - return N != 0; -}]>; -def mscatterv16i32 : PatFrag<(ops node:$src1, node:$src2, node:$src3), - (masked_scatter node:$src1, node:$src2, node:$src3) , [{ - //if (MaskedScatterSDNode *Sc = dyn_cast<MaskedScatterSDNode>(N)) - // return (Sc->getIndex().getValueType() == MVT::v16i32 || - // Sc->getBasePtr().getValueType() == MVT::v16i32); - //return false; - 
-  return N != 0;
-}]>;
-
 // 128-bit bitconvert pattern fragments
 def bc_v4f32 : PatFrag<(ops node:$in), (v4f32 (bitconvert node:$in))>;
 def bc_v2f64 : PatFrag<(ops node:$in), (v2f64 (bitconvert node:$in))>;
@@ -681,3 +629,55 @@ def vinsert256_insert : PatFrag<(ops node:$bigvec, node:$smallvec,
                                    return X86::isVINSERT256Index(N);
                                  }], INSERT_get_vinsert256_imm>;

+def masked_load_aligned128 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+                         (masked_load node:$src1, node:$src2, node:$src3), [{
+  if (dyn_cast<MaskedLoadSDNode>(N))
+    return cast<MaskedLoadSDNode>(N)->getAlignment() >= 16;
+  return false;
+}]>;
+
+def masked_load_aligned256 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+                         (masked_load node:$src1, node:$src2, node:$src3), [{
+  if (dyn_cast<MaskedLoadSDNode>(N))
+    return cast<MaskedLoadSDNode>(N)->getAlignment() >= 32;
+  return false;
+}]>;
+
+def masked_load_aligned512 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+                         (masked_load node:$src1, node:$src2, node:$src3), [{
+  if (dyn_cast<MaskedLoadSDNode>(N))
+    return cast<MaskedLoadSDNode>(N)->getAlignment() >= 64;
+  return false;
+}]>;
+
+def masked_load_unaligned : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+                         (masked_load node:$src1, node:$src2, node:$src3), [{
+  return (dyn_cast<MaskedLoadSDNode>(N) != 0);
+}]>;
+
+def masked_store_aligned128 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+                         (masked_store node:$src1, node:$src2, node:$src3), [{
+  if (dyn_cast<MaskedStoreSDNode>(N))
+    return cast<MaskedStoreSDNode>(N)->getAlignment() >= 16;
+  return false;
+}]>;
+
+def masked_store_aligned256 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+                         (masked_store node:$src1, node:$src2, node:$src3), [{
+  if (dyn_cast<MaskedStoreSDNode>(N))
+    return cast<MaskedStoreSDNode>(N)->getAlignment() >= 32;
+  return false;
+}]>;
+
+def masked_store_aligned512 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+                         (masked_store node:$src1, node:$src2, node:$src3), [{
+  if (dyn_cast<MaskedStoreSDNode>(N))
+    return cast<MaskedStoreSDNode>(N)->getAlignment() >= 64;
+  return false;
+}]>;
+
+def masked_store_unaligned : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+                         (masked_store node:$src1, node:$src2, node:$src3), [{
+  return (dyn_cast<MaskedStoreSDNode>(N) != 0);
+}]>;
+
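The masked_load/masked_store fragments above key entirely on node alignment. A minimal C++ restatement of the predicate, assuming the SelectionDAG API of this era (sketch, not from the patch):

    #include "llvm/CodeGen/SelectionDAGNodes.h"
    // Mirrors masked_load_aligned128/256/512 with MinAlign = 16/32/64.
    static bool isAlignedMaskedLoad(const llvm::SDNode *N, unsigned MinAlign) {
      if (const auto *MLD = llvm::dyn_cast<llvm::MaskedLoadSDNode>(N))
        return MLD->getAlignment() >= MinAlign;
      return false;  // not a masked load at all
    }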
diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp
index f5b9680..538ec1c 100644
--- a/lib/Target/X86/X86InstrInfo.cpp
+++ b/lib/Target/X86/X86InstrInfo.cpp
@@ -104,7 +104,7 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     : X86GenInstrInfo(
           (STI.isTarget64BitLP64() ? X86::ADJCALLSTACKDOWN64 : X86::ADJCALLSTACKDOWN32),
           (STI.isTarget64BitLP64() ? X86::ADJCALLSTACKUP64 : X86::ADJCALLSTACKUP32)),
-      Subtarget(STI), RI(STI) {
+      Subtarget(STI), RI(STI.getTargetTriple()) {

   static const X86MemoryFoldTableEntry MemoryFoldTable2Addr[] = {
     { X86::ADC32ri,     X86::ADC32mi,    0 },
@@ -4573,9 +4573,7 @@ MachineInstr *X86InstrInfo::optimizeLoadInstr(MachineInstr *MI,
     return nullptr;

   // Check whether we can fold the def into SrcOperandId.
-  SmallVector<unsigned, 8> Ops;
-  Ops.push_back(SrcOperandId);
-  MachineInstr *FoldMI = foldMemoryOperand(MI, Ops, DefMI);
+  MachineInstr *FoldMI = foldMemoryOperand(MI, SrcOperandId, DefMI);
   if (FoldMI) {
     FoldAsLoadDefReg = 0;
     return FoldMI;
@@ -4670,7 +4668,7 @@ bool X86InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
 }

 static MachineInstr *FuseTwoAddrInst(MachineFunction &MF, unsigned Opcode,
-                                     const SmallVectorImpl<MachineOperand> &MOs,
+                                     ArrayRef<MachineOperand> MOs,
                                      MachineInstr *MI,
                                      const TargetInstrInfo &TII) {
   // Create the base instruction with the memory operand as the first part.
@@ -4697,9 +4695,8 @@ static MachineInstr *FuseTwoAddrInst(MachineFunction &MF, unsigned Opcode,
   return MIB;
 }

-static MachineInstr *FuseInst(MachineFunction &MF,
-                              unsigned Opcode, unsigned OpNo,
-                              const SmallVectorImpl<MachineOperand> &MOs,
+static MachineInstr *FuseInst(MachineFunction &MF, unsigned Opcode,
+                              unsigned OpNo, ArrayRef<MachineOperand> MOs,
                               MachineInstr *MI, const TargetInstrInfo &TII) {
   // Omit the implicit operands, something BuildMI can't do.
   MachineInstr *NewMI = MF.CreateMachineInstr(TII.get(Opcode),
@@ -4723,7 +4720,7 @@ static MachineInstr *FuseInst(MachineFunction &MF,
 }

 static MachineInstr *MakeM0Inst(const TargetInstrInfo &TII, unsigned Opcode,
-                                const SmallVectorImpl<MachineOperand> &MOs,
+                                ArrayRef<MachineOperand> MOs,
                                 MachineInstr *MI) {
   MachineFunction &MF = *MI->getParent()->getParent();
   MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(), TII.get(Opcode));
@@ -4736,12 +4733,12 @@ static MachineInstr *MakeM0Inst(const TargetInstrInfo &TII, unsigned Opcode,
   return MIB.addImm(0);
 }

-MachineInstr*
-X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
-                                    MachineInstr *MI, unsigned OpNum,
-                                    const SmallVectorImpl<MachineOperand> &MOs,
-                                    unsigned Size, unsigned Align,
-                                    bool AllowCommute) const {
+MachineInstr *X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
+                                                  MachineInstr *MI,
+                                                  unsigned OpNum,
+                                                  ArrayRef<MachineOperand> MOs,
+                                                  unsigned Size, unsigned Align,
+                                                  bool AllowCommute) const {
   const DenseMap<unsigned, std::pair<unsigned,unsigned> > *OpcodeTablePtr = nullptr;
   bool isCallRegIndirect = Subtarget.callRegIndirect();
@@ -5104,10 +5101,10 @@ breakPartialRegDependency(MachineBasicBlock::iterator MI, unsigned OpNum,
     MI->addRegisterKilled(Reg, TRI, true);
 }

-MachineInstr*
-X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI,
-                                    const SmallVectorImpl<unsigned> &Ops,
-                                    int FrameIndex) const {
+MachineInstr *X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
+                                                  MachineInstr *MI,
+                                                  ArrayRef<unsigned> Ops,
+                                                  int FrameIndex) const {
   // Check switch flag
   if (NoFusing) return nullptr;
@@ -5145,10 +5142,9 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI,
   } else if (Ops.size() != 1)
     return nullptr;

-  SmallVector<MachineOperand,4> MOs;
-  MOs.push_back(MachineOperand::CreateFI(FrameIndex));
-  return foldMemoryOperandImpl(MF, MI, Ops[0], MOs,
-                               Size, Alignment, /*AllowCommute=*/true);
+  return foldMemoryOperandImpl(MF, MI, Ops[0],
+                               MachineOperand::CreateFI(FrameIndex), Size,
+                               Alignment, /*AllowCommute=*/true);
 }
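The SmallVectorImpl-to-ArrayRef migration works because ArrayRef binds implicitly to a single element, a braced list, or an existing vector, which is what lets the call above pass MachineOperand::CreateFI(FrameIndex) directly. A self-contained sketch of that conversion behavior:

    #include "llvm/ADT/ArrayRef.h"
    static int sum(llvm::ArrayRef<int> Xs) {
      int S = 0;
      for (int X : Xs)  // ArrayRef iterates like any contiguous range
        S += X;
      return S;
    }
    // All three calls compile without building a SmallVector first:
    static int demo() { return sum(41) + sum({1, 2}) + sum(llvm::ArrayRef<int>()); }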
 static bool isPartialRegisterLoad(const MachineInstr &LoadMI,
@@ -5170,9 +5166,9 @@ static bool isPartialRegisterLoad(const MachineInstr &LoadMI,
   return false;
 }

-MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
+MachineInstr *X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
                                                   MachineInstr *MI,
-                                                  const SmallVectorImpl<unsigned> &Ops,
+                                                  ArrayRef<unsigned> Ops,
                                                   MachineInstr *LoadMI) const {
   // If loading from a FrameIndex, fold directly from the FrameIndex.
   unsigned NumOps = LoadMI->getDesc().getNumOperands();
@@ -5295,8 +5291,8 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
       return nullptr;

     // Folding a normal load. Just copy the load's address operands.
-    for (unsigned i = NumOps - X86::AddrNumOperands; i != NumOps; ++i)
-      MOs.push_back(LoadMI->getOperand(i));
+    MOs.append(LoadMI->operands_begin() + NumOps - X86::AddrNumOperands,
+               LoadMI->operands_begin() + NumOps);
     break;
   }
 }
@@ -5304,9 +5300,8 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
                                /*Size=*/0, Alignment, /*AllowCommute=*/true);
 }

-
 bool X86InstrInfo::canFoldMemoryOperand(const MachineInstr *MI,
-                                        const SmallVectorImpl<unsigned> &Ops) const {
+                                        ArrayRef<unsigned> Ops) const {
   // Check switch flag
   if (NoFusing) return 0;
@@ -5559,7 +5554,7 @@ X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N,
   }
   if (Load)
     BeforeOps.push_back(SDValue(Load, 0));
-  std::copy(AfterOps.begin(), AfterOps.end(), std::back_inserter(BeforeOps));
+  BeforeOps.insert(BeforeOps.end(), AfterOps.begin(), AfterOps.end());

   SDNode *NewNode= DAG.getMachineNode(Opc, dl, VTs, BeforeOps);
   NewNodes.push_back(NewNode);
diff --git a/lib/Target/X86/X86InstrInfo.h b/lib/Target/X86/X86InstrInfo.h
index 4d15467..0dd8101 100644
--- a/lib/Target/X86/X86InstrInfo.h
+++ b/lib/Target/X86/X86InstrInfo.h
@@ -305,23 +305,21 @@ public:
   /// folding and return true, otherwise it should return false. If it folds
   /// the instruction, it is likely that the MachineInstruction the iterator
   /// references has been changed.
-  MachineInstr* foldMemoryOperandImpl(MachineFunction &MF,
-                                      MachineInstr* MI,
-                                      const SmallVectorImpl<unsigned> &Ops,
+  MachineInstr *foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI,
+                                      ArrayRef<unsigned> Ops,
                                       int FrameIndex) const override;

   /// foldMemoryOperand - Same as the previous version except it allows folding
   /// of any load and store from / to any address, not just from a specific
   /// stack slot.
-  MachineInstr* foldMemoryOperandImpl(MachineFunction &MF,
-                                      MachineInstr* MI,
-                                      const SmallVectorImpl<unsigned> &Ops,
-                                      MachineInstr* LoadMI) const override;
+  MachineInstr *foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI,
+                                      ArrayRef<unsigned> Ops,
+                                      MachineInstr *LoadMI) const override;

   /// canFoldMemoryOperand - Returns true if the specified load / store is
   /// folding is possible.
-  bool canFoldMemoryOperand(const MachineInstr*,
-                            const SmallVectorImpl<unsigned> &) const override;
+  bool canFoldMemoryOperand(const MachineInstr *,
+                            ArrayRef<unsigned>) const override;

   /// unfoldMemoryOperand - Separate a single instruction which folded a load or
   /// a store or a load and a store into two or more instruction. If this is
@@ -406,10 +404,9 @@ public:
   void breakPartialRegDependency(MachineBasicBlock::iterator MI, unsigned OpNum,
                                  const TargetRegisterInfo *TRI) const override;

-  MachineInstr* foldMemoryOperandImpl(MachineFunction &MF,
-                                      MachineInstr* MI,
+  MachineInstr *foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI,
                                       unsigned OpNum,
-                                      const SmallVectorImpl<MachineOperand> &MOs,
+                                      ArrayRef<MachineOperand> MOs,
                                       unsigned Size, unsigned Alignment,
                                       bool AllowCommute) const;

diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td
index 9881caf..e9a0431 100644
--- a/lib/Target/X86/X86InstrInfo.td
+++ b/lib/Target/X86/X86InstrInfo.td
@@ -572,10 +572,13 @@ def X86GR32orGR64AsmOperand : AsmOperandClass {
 def GR32orGR64 : RegisterOperand<GR32> {
   let ParserMatchClass = X86GR32orGR64AsmOperand;
 }
-
+def AVX512RCOperand : AsmOperandClass {
+  let Name = "AVX512RC";
+}
 def AVX512RC : Operand<i32> {
   let PrintMethod = "printRoundingControl";
   let OperandType = "OPERAND_IMMEDIATE";
+  let ParserMatchClass = AVX512RCOperand;
 }

 // Sign-extended immediate classes. We don't need to define the full lattice
@@ -713,9 +716,6 @@ def tls64addr : ComplexPattern<i64, 5, "SelectTLSADDRAddr",
 def tls64baseaddr : ComplexPattern<i64, 5, "SelectTLSADDRAddr",
                                    [tglobaltlsaddr], []>;

-def vectoraddr : ComplexPattern<iPTR, 5, "SelectAddr", [],[SDNPWantParent]>;
-//def vectoraddr : ComplexPattern<iPTR, 5, "SelectVectorAddr", [],[SDNPWantParent]>;
-
 //===----------------------------------------------------------------------===//
 // X86 Instruction Predicate Definitions.
 def HasCMov      : Predicate<"Subtarget->hasCMov()">;
@@ -855,11 +855,11 @@ def X86_COND_E_OR_NE : ImmLeaf<i8, [{
   return (Imm == X86::COND_E) || (Imm == X86::COND_NE);
 }]>;

-let FastIselShouldIgnore = 1 in { // FastIsel should ignore all simm8 instrs.
-  def i16immSExt8  : ImmLeaf<i16, [{ return Imm == (int8_t)Imm; }]>;
-  def i32immSExt8  : ImmLeaf<i32, [{ return Imm == (int8_t)Imm; }]>;
-  def i64immSExt8  : ImmLeaf<i64, [{ return Imm == (int8_t)Imm; }]>;
-}
+
+def i16immSExt8  : ImmLeaf<i16, [{ return Imm == (int8_t)Imm; }]>;
+def i32immSExt8  : ImmLeaf<i32, [{ return Imm == (int8_t)Imm; }]>;
+def i64immSExt8  : ImmLeaf<i64, [{ return Imm == (int8_t)Imm; }]>;
+

 def i64immSExt32 : ImmLeaf<i64, [{ return Imm == (int32_t)Imm; }]>;
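The simm8 ImmLeaf predicate is a truncate-then-sign-extend round-trip test. A one-function C++ restatement (sketch; the constants below are just illustrative):

    #include <cstdint>
    // An immediate fits a sign-extended 8-bit field exactly when narrowing
    // to int8_t and widening back reproduces the value.
    static bool isSExt8(int32_t Imm) { return Imm == (int8_t)Imm; }
    // isSExt8(127) and isSExt8(-128) hold; isSExt8(128) does not.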
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index d2929d2..ccdbf0e 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -3567,7 +3567,7 @@ multiclass sse1_fp_unop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
   defm V#NAME#SS : avx_fp_unop_s<opc, "v"#OpcodeStr##ss, FR32, v4f32, f32,
                                  f32mem, ssmem, sse_load_f32,
                                  !cast<Intrinsic>("int_x86_sse_"##OpcodeStr##_ss), OpNode,
-                                 itins, HasAVX, "SS">, XS, VEX_4V, VEX_LIG;
+                                 itins, UseAVX, "SS">, XS, VEX_4V, VEX_LIG;
 }

 multiclass sse2_fp_unop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
@@ -3579,7 +3579,7 @@ multiclass sse2_fp_unop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
   defm V#NAME#SD : avx_fp_unop_s<opc, "v"#OpcodeStr##sd, FR64, v2f64, f64,
                                  f64mem, sdmem, sse_load_f64,
                                  !cast<Intrinsic>("int_x86_sse2_"##OpcodeStr##_sd),
-                                 OpNode, itins, HasAVX, "SD">, XD, VEX_4V, VEX_LIG;
+                                 OpNode, itins, UseAVX, "SD">, XD, VEX_4V, VEX_LIG;
 }

 // Square root.
@@ -4077,7 +4077,7 @@ defm PMULUDQ : PDI_binop_rm2<0xF4, "pmuludq", X86pmuludq, v2i64, v4i32, VR128,
 // SSE2 - Packed Integer Logical Instructions
 //===---------------------------------------------------------------------===//

-let Predicates = [HasAVX] in {
+let Predicates = [HasAVX, NoVLX] in {
 defm VPSLLW : PDI_binop_rmi<0xF1, 0x71, MRM6r, "vpsllw", X86vshl, X86vshli,
                             VR128, v8i16, v8i16, bc_v8i16, loadv2i64,
                             SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
@@ -4123,7 +4123,7 @@ let ExeDomain = SSEPackedInt, SchedRW = [WriteVecShift] in {
   }
 } // Predicates = [HasAVX]

-let Predicates = [HasAVX2] in {
+let Predicates = [HasAVX2, NoVLX] in {
 defm VPSLLWY : PDI_binop_rmi<0xF1, 0x71, MRM6r, "vpsllw", X86vshl, X86vshli,
                              VR256, v16i16, v8i16, bc_v8i16, loadv2i64,
                              SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;
@@ -5902,7 +5902,6 @@ multiclass SS41I_pmovx_avx2_patterns<string OpcPrefix, string ExtTy, SDNode ExtO
             (!cast<I>(OpcPrefix#DQYrr) VR128:$src)>;

   // On AVX2, we also support 256bit inputs.
-  // FIXME: remove these patterns when the old shuffle lowering goes away.
   def : Pat<(v16i16 (ExtOp (v32i8 VR256:$src))),
             (!cast<I>(OpcPrefix#BWYrr) (EXTRACT_SUBREG VR256:$src, sub_xmm))>;
   def : Pat<(v8i32 (ExtOp (v32i8 VR256:$src))),
@@ -6955,6 +6954,34 @@ multiclass SS41I_binop_rmi_int<bits<8> opc, string OpcodeStr,
        Sched<[itins.Sched.Folded, ReadAfterLd]>;
 }

+/// SS41I_binop_rmi - SSE 4.1 binary operator with 8-bit immediate
+multiclass SS41I_binop_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                           ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
+                           X86MemOperand x86memop, bit Is2Addr = 1,
+                           OpndItins itins = DEFAULT_ITINS> {
+  let isCommutable = 1 in
+  def rri : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst),
+        (ins RC:$src1, RC:$src2, u8imm:$src3),
+        !if(Is2Addr,
+            !strconcat(OpcodeStr,
+                       "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+            !strconcat(OpcodeStr,
+                       "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
+        [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, imm:$src3)))],
+        itins.rr>, Sched<[itins.Sched]>;
+  def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst),
+        (ins RC:$src1, x86memop:$src2, u8imm:$src3),
+        !if(Is2Addr,
+            !strconcat(OpcodeStr,
+                       "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+            !strconcat(OpcodeStr,
+                       "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
+        [(set RC:$dst,
+              (OpVT (OpNode RC:$src1,
+                     (bitconvert (memop_frag addr:$src2)), imm:$src3)))], itins.rm>,
+        Sched<[itins.Sched.Folded, ReadAfterLd]>;
+}
+
 let Predicates = [HasAVX] in {
   let isCommutable = 0 in {
     defm VMPSADBW : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_sse41_mpsadbw,
@@ -6963,26 +6990,24 @@ let Predicates = [HasAVX] in {
   }

   let ExeDomain = SSEPackedSingle in {
-  defm VBLENDPS : SS41I_binop_rmi_int<0x0C, "vblendps", int_x86_sse41_blendps,
-                                      VR128, loadv4f32, f128mem, 0,
-                                      DEFAULT_ITINS_FBLENDSCHED>, VEX_4V;
-  defm VBLENDPSY : SS41I_binop_rmi_int<0x0C, "vblendps",
-                                       int_x86_avx_blend_ps_256, VR256, loadv8f32,
-                                       f256mem, 0, DEFAULT_ITINS_FBLENDSCHED>,
-                                       VEX_4V, VEX_L;
+  defm VBLENDPS : SS41I_binop_rmi<0x0C, "vblendps", X86Blendi, v4f32,
+                                  VR128, loadv4f32, f128mem, 0,
+                                  DEFAULT_ITINS_FBLENDSCHED>, VEX_4V;
+  defm VBLENDPSY : SS41I_binop_rmi<0x0C, "vblendps", X86Blendi, v8f32,
+                                   VR256, loadv8f32, f256mem, 0,
+                                   DEFAULT_ITINS_FBLENDSCHED>, VEX_4V, VEX_L;
   }
   let ExeDomain = SSEPackedDouble in {
-  defm VBLENDPD : SS41I_binop_rmi_int<0x0D, "vblendpd", int_x86_sse41_blendpd,
-                                      VR128, loadv2f64, f128mem, 0,
-                                      DEFAULT_ITINS_FBLENDSCHED>, VEX_4V;
-  defm VBLENDPDY : SS41I_binop_rmi_int<0x0D, "vblendpd",
-                                       int_x86_avx_blend_pd_256,VR256, loadv4f64,
-                                       f256mem, 0, DEFAULT_ITINS_FBLENDSCHED>,
-                                       VEX_4V, VEX_L;
+  defm VBLENDPD : SS41I_binop_rmi<0x0D, "vblendpd", X86Blendi, v2f64,
+                                  VR128, loadv2f64, f128mem, 0,
+                                  DEFAULT_ITINS_FBLENDSCHED>, VEX_4V;
+  defm VBLENDPDY : SS41I_binop_rmi<0x0D, "vblendpd", X86Blendi, v4f64,
+                                   VR256, loadv4f64, f256mem, 0,
+                                   DEFAULT_ITINS_FBLENDSCHED>, VEX_4V, VEX_L;
   }
-  defm VPBLENDW : SS41I_binop_rmi_int<0x0E, "vpblendw", int_x86_sse41_pblendw,
-                                      VR128, loadv2i64, i128mem, 0,
-                                      DEFAULT_ITINS_BLENDSCHED>, VEX_4V;
+  defm VPBLENDW : SS41I_binop_rmi<0x0E, "vpblendw", X86Blendi, v8i16,
+                                  VR128, loadv2i64, i128mem, 0,
+                                  DEFAULT_ITINS_BLENDSCHED>, VEX_4V;

   let ExeDomain = SSEPackedSingle in
   defm VDPPS : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_sse41_dpps,
@@ -7004,9 +7029,9 @@ let Predicates = [HasAVX2] in {
                                       VR256, loadv4i64, i256mem, 0,
                                       DEFAULT_ITINS_MPSADSCHED>, VEX_4V, VEX_L;
   }
-  defm VPBLENDWY : SS41I_binop_rmi_int<0x0E, "vpblendw", int_x86_avx2_pblendw,
-                                       VR256, loadv4i64, i256mem, 0,
-                                       DEFAULT_ITINS_BLENDSCHED>, VEX_4V, VEX_L;
+  defm VPBLENDWY : SS41I_binop_rmi<0x0E, "vpblendw", X86Blendi, v16i16,
+                                   VR256, loadv4i64, i256mem, 0,
+                                   DEFAULT_ITINS_BLENDSCHED>, VEX_4V, VEX_L;
 }

 let Constraints = "$src1 = $dst" in {
@@ -7016,16 +7041,16 @@ let Constraints = "$src1 = $dst" in {
                                      1, SSE_MPSADBW_ITINS>;
   }
   let ExeDomain = SSEPackedSingle in
-  defm BLENDPS : SS41I_binop_rmi_int<0x0C, "blendps", int_x86_sse41_blendps,
-                                     VR128, memopv4f32, f128mem,
-                                     1, SSE_INTALU_ITINS_FBLEND_P>;
+  defm BLENDPS : SS41I_binop_rmi<0x0C, "blendps", X86Blendi, v4f32,
+                                 VR128, memopv4f32, f128mem,
+                                 1, SSE_INTALU_ITINS_FBLEND_P>;
   let ExeDomain = SSEPackedDouble in
-  defm BLENDPD : SS41I_binop_rmi_int<0x0D, "blendpd", int_x86_sse41_blendpd,
-                                     VR128, memopv2f64, f128mem,
-                                     1, SSE_INTALU_ITINS_FBLEND_P>;
-  defm PBLENDW : SS41I_binop_rmi_int<0x0E, "pblendw", int_x86_sse41_pblendw,
-                                     VR128, memopv2i64, i128mem,
-                                     1, SSE_INTALU_ITINS_BLEND_P>;
+  defm BLENDPD : SS41I_binop_rmi<0x0D, "blendpd", X86Blendi, v2f64,
+                                 VR128, memopv2f64, f128mem,
+                                 1, SSE_INTALU_ITINS_FBLEND_P>;
+  defm PBLENDW : SS41I_binop_rmi<0x0E, "pblendw", X86Blendi, v8i16,
+                                 VR128, memopv2i64, i128mem,
+                                 1, SSE_INTALU_ITINS_BLEND_P>;

   let ExeDomain = SSEPackedSingle in
   defm DPPS : SS41I_binop_rmi_int<0x40, "dpps", int_x86_sse41_dpps,
                                   VR128, memopv4f32, f128mem, 1,
@@ -7116,32 +7141,12 @@ let Predicates = [HasAVX] in {
   def : Pat<(v4f64 (vselect (v4i64 VR256:$mask), (v4f64 VR256:$src1),
                             (v4f64 VR256:$src2))),
             (VBLENDVPDYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
-
-  def : Pat<(v8f32 (X86Blendi (v8f32 VR256:$src1), (v8f32 VR256:$src2),
-                              (imm:$mask))),
-            (VBLENDPSYrri VR256:$src1, VR256:$src2, imm:$mask)>;
-  def : Pat<(v4f64 (X86Blendi (v4f64 VR256:$src1), (v4f64 VR256:$src2),
-                              (imm:$mask))),
-            (VBLENDPDYrri VR256:$src1, VR256:$src2, imm:$mask)>;
-
-  def : Pat<(v8i16 (X86Blendi (v8i16 VR128:$src1), (v8i16 VR128:$src2),
-                              (imm:$mask))),
-            (VPBLENDWrri VR128:$src1, VR128:$src2, imm:$mask)>;
-  def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$src1), (v4f32 VR128:$src2),
-                              (imm:$mask))),
-            (VBLENDPSrri VR128:$src1, VR128:$src2, imm:$mask)>;
-  def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$src1), (v2f64 VR128:$src2),
-                              (imm:$mask))),
-            (VBLENDPDrri VR128:$src1, VR128:$src2, imm:$mask)>;
 }

 let Predicates = [HasAVX2] in {
   def : Pat<(v32i8 (vselect (v32i8 VR256:$mask), (v32i8 VR256:$src1),
                             (v32i8 VR256:$src2))),
             (VPBLENDVBYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
-  def : Pat<(v16i16 (X86Blendi (v16i16 VR256:$src1), (v16i16 VR256:$src2),
-                               (imm:$mask))),
-            (VPBLENDWYrri VR256:$src1, VR256:$src2, imm:$mask)>;
 }

 // Patterns
@@ -7260,17 +7265,6 @@ let Predicates = [UseSSE41] in {
   def : Pat<(v2f64 (vselect (v2i64 XMM0), (v2f64 VR128:$src1),
                             (v2f64 VR128:$src2))),
             (BLENDVPDrr0 VR128:$src2, VR128:$src1)>;
-
-  def : Pat<(v8i16 (X86Blendi (v8i16 VR128:$src1), (v8i16 VR128:$src2),
-                              (imm:$mask))),
-            (PBLENDWrri VR128:$src1, VR128:$src2, imm:$mask)>;
-  def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$src1), (v4f32 VR128:$src2),
-                              (imm:$mask))),
-            (BLENDPSrri VR128:$src1, VR128:$src2, imm:$mask)>;
-  def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$src1), (v2f64 VR128:$src2),
-                              (imm:$mask))),
-            (BLENDPDrri VR128:$src1, VR128:$src2, imm:$mask)>;
-
 }
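The standalone X86Blendi patterns above could be deleted because the SS41I_binop_rmi instruction definitions now carry the pattern themselves. For reference, a scalar model of the immediate-blend semantics being matched (bit i of the immediate selects src2's lane, otherwise src1's):

    // Models v4f32 BLENDPS; the same rule generalizes per element type.
    static void blendps(float d[4], const float a[4], const float b[4],
                        unsigned imm) {
      for (int i = 0; i < 4; ++i)
        d[i] = ((imm >> i) & 1) ? b[i] : a[i];
    }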
(ins VR256:$src1, u8imm:$src2), - "vextracti128\t{$src2, $src1, $dst|$dst, $src1, $src2}", - [(set VR128:$dst, - (int_x86_avx2_vextracti128 VR256:$src1, imm:$src2))]>, + "vextracti128\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, Sched<[WriteShuffle256]>, VEX, VEX_L; let hasSideEffects = 0, mayStore = 1 in def VEXTRACTI128mr : AVX2AIi8<0x39, MRMDestMem, (outs), diff --git a/lib/Target/X86/X86IntrinsicsInfo.h b/lib/Target/X86/X86IntrinsicsInfo.h index e436811..42256b2 100644 --- a/lib/Target/X86/X86IntrinsicsInfo.h +++ b/lib/Target/X86/X86IntrinsicsInfo.h @@ -175,8 +175,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx2_packsswb, INTR_TYPE_2OP, X86ISD::PACKSS, 0), X86_INTRINSIC_DATA(avx2_packusdw, INTR_TYPE_2OP, X86ISD::PACKUS, 0), X86_INTRINSIC_DATA(avx2_packuswb, INTR_TYPE_2OP, X86ISD::PACKUS, 0), - X86_INTRINSIC_DATA(avx2_permd, INTR_TYPE_2OP, X86ISD::VPERMV, 0), - X86_INTRINSIC_DATA(avx2_permps, INTR_TYPE_2OP, X86ISD::VPERMV, 0), X86_INTRINSIC_DATA(avx2_phadd_d, INTR_TYPE_2OP, X86ISD::HADD, 0), X86_INTRINSIC_DATA(avx2_phadd_w, INTR_TYPE_2OP, X86ISD::HADD, 0), X86_INTRINSIC_DATA(avx2_phsub_d, INTR_TYPE_2OP, X86ISD::HSUB, 0), diff --git a/lib/Target/X86/X86MCInstLower.cpp b/lib/Target/X86/X86MCInstLower.cpp index 6af59d4..cd3076d 100644 --- a/lib/Target/X86/X86MCInstLower.cpp +++ b/lib/Target/X86/X86MCInstLower.cpp @@ -77,8 +77,8 @@ namespace llvm { X86AsmPrinter::StackMapShadowTracker::startFunction(MachineFunction &F) { MF = &F; CodeEmitter.reset(TM.getTarget().createMCCodeEmitter( - *MF->getSubtarget().getInstrInfo(), *MF->getSubtarget().getRegisterInfo(), - MF->getSubtarget(), MF->getContext())); + *MF->getSubtarget().getInstrInfo(), + *MF->getSubtarget().getRegisterInfo(), MF->getContext())); } void X86AsmPrinter::StackMapShadowTracker::count(MCInst &Inst, diff --git a/lib/Target/X86/X86RegisterInfo.cpp b/lib/Target/X86/X86RegisterInfo.cpp index cab7ce8..06545bc 100644 --- a/lib/Target/X86/X86RegisterInfo.cpp +++ b/lib/Target/X86/X86RegisterInfo.cpp @@ -14,6 +14,7 @@ //===----------------------------------------------------------------------===// #include "X86RegisterInfo.h" +#include "X86FrameLowering.h" #include "X86InstrBuilder.h" #include "X86MachineFunctionInfo.h" #include "X86Subtarget.h" @@ -53,26 +54,26 @@ static cl::opt<bool> EnableBasePointer("x86-use-base-pointer", cl::Hidden, cl::init(true), cl::desc("Enable use of a base pointer for complex stack frames")); -X86RegisterInfo::X86RegisterInfo(const X86Subtarget &STI) - : X86GenRegisterInfo( - (STI.is64Bit() ? X86::RIP : X86::EIP), - X86_MC::getDwarfRegFlavour(STI.getTargetTriple(), false), - X86_MC::getDwarfRegFlavour(STI.getTargetTriple(), true), - (STI.is64Bit() ? X86::RIP : X86::EIP)), - Subtarget(STI) { +X86RegisterInfo::X86RegisterInfo(const Triple &TT) + : X86GenRegisterInfo((TT.isArch64Bit() ? X86::RIP : X86::EIP), + X86_MC::getDwarfRegFlavour(TT, false), + X86_MC::getDwarfRegFlavour(TT, true), + (TT.isArch64Bit() ? X86::RIP : X86::EIP)) { X86_MC::InitLLVM2SEHRegisterMapping(this); // Cache some information. - Is64Bit = Subtarget.is64Bit(); - IsWin64 = Subtarget.isTargetWin64(); + Is64Bit = TT.isArch64Bit(); + IsWin64 = Is64Bit && TT.isOSWindows(); // Use a callee-saved register as the base pointer. These registers must // not conflict with any ABI requirements. For example, in 32-bit mode PIC // requires GOT in the EBX register before function calls via PLT GOT pointer. 
if (Is64Bit) { SlotSize = 8; - bool Use64BitReg = - Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64(); + // This matches the simplified 32-bit pointer code in the data layout + // computation. + // FIXME: Should use the data layout? + bool Use64BitReg = TT.getEnvironment() != Triple::GNUX32; StackPtr = Use64BitReg ? X86::RSP : X86::ESP; FramePtr = Use64BitReg ? X86::RBP : X86::EBP; BasePtr = Use64BitReg ? X86::RBX : X86::EBX; @@ -120,8 +121,9 @@ X86RegisterInfo::getMatchingSuperRegClass(const TargetRegisterClass *A, return X86GenRegisterInfo::getMatchingSuperRegClass(A, B, SubIdx); } -const TargetRegisterClass* -X86RegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC) const{ +const TargetRegisterClass * +X86RegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC, + const MachineFunction &MF) const { // Don't allow super-classes of GR8_NOREX. This class is only used after // extracting sub_8bit_hi sub-registers. The H sub-registers cannot be copied // to the full GR8 register class in 64-bit mode, so we cannot allow the @@ -161,6 +163,7 @@ X86RegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC) const{ const TargetRegisterClass * X86RegisterInfo::getPointerRegClass(const MachineFunction &MF, unsigned Kind) const { + const X86Subtarget &Subtarget = MF.getSubtarget<X86Subtarget>(); switch (Kind) { default: llvm_unreachable("Unexpected Kind in getPointerRegClass!"); case 0: // Normal GPRs. @@ -172,9 +175,9 @@ X86RegisterInfo::getPointerRegClass(const MachineFunction &MF, return &X86::GR64_NOSPRegClass; return &X86::GR32_NOSPRegClass; case 2: // Available for tailcall (not callee-saved GPRs). - if (Subtarget.isTargetWin64()) + if (IsWin64) return &X86::GR64_TCW64RegClass; - else if (Subtarget.is64Bit()) + else if (Is64Bit) return &X86::GR64_TCRegClass; const Function *F = MF.getFunction(); @@ -210,7 +213,7 @@ X86RegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, case X86::GR64RegClassID: return 12 - FPDiff; case X86::VR128RegClassID: - return Subtarget.is64Bit() ? 10 : 4; + return Is64Bit ? 
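The register-info constructor now keys purely off the triple rather than the subtarget. A small sketch of the queries involved (the triple string is just an example):

    #include "llvm/ADT/Triple.h"
    // e.g. Triple TT("x86_64-unknown-linux-gnux32")
    static bool isX32(const llvm::Triple &TT) {
      return TT.isArch64Bit() &&
             TT.getEnvironment() == llvm::Triple::GNUX32;  // -> ESP/EBP/EBX
    }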
@@ -120,8 +121,9 @@ X86RegisterInfo::getMatchingSuperRegClass(const TargetRegisterClass *A,
   return X86GenRegisterInfo::getMatchingSuperRegClass(A, B, SubIdx);
 }

-const TargetRegisterClass*
-X86RegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC) const{
+const TargetRegisterClass *
+X86RegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC,
+                                           const MachineFunction &MF) const {
   // Don't allow super-classes of GR8_NOREX.  This class is only used after
   // extracting sub_8bit_hi sub-registers.  The H sub-registers cannot be copied
   // to the full GR8 register class in 64-bit mode, so we cannot allow the
@@ -161,6 +163,7 @@ X86RegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC) const{
 const TargetRegisterClass *
 X86RegisterInfo::getPointerRegClass(const MachineFunction &MF,
                                     unsigned Kind) const {
+  const X86Subtarget &Subtarget = MF.getSubtarget<X86Subtarget>();
   switch (Kind) {
   default: llvm_unreachable("Unexpected Kind in getPointerRegClass!");
   case 0: // Normal GPRs.
@@ -172,9 +175,9 @@ X86RegisterInfo::getPointerRegClass(const MachineFunction &MF,
       return &X86::GR64_NOSPRegClass;
     return &X86::GR32_NOSPRegClass;
   case 2: // Available for tailcall (not callee-saved GPRs).
-    if (Subtarget.isTargetWin64())
+    if (IsWin64)
       return &X86::GR64_TCW64RegClass;
-    else if (Subtarget.is64Bit())
+    else if (Is64Bit)
       return &X86::GR64_TCRegClass;

     const Function *F = MF.getFunction();
@@ -210,7 +213,7 @@ X86RegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
   case X86::GR64RegClassID:
     return 12 - FPDiff;
   case X86::VR128RegClassID:
-    return Subtarget.is64Bit() ? 10 : 4;
+    return Is64Bit ? 10 : 4;
   case X86::VR64RegClassID:
     return 4;
   }
@@ -218,8 +221,10 @@
 const MCPhysReg *
 X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
+  const X86Subtarget &Subtarget = MF->getSubtarget<X86Subtarget>();
   bool HasAVX = Subtarget.hasAVX();
   bool HasAVX512 = Subtarget.hasAVX512();
+  bool CallsEHReturn = MF->getMMI().callsEHReturn();

   assert(MF && "MachineFunction required");
   switch (MF->getFunction()->getCallingConv()) {
@@ -253,11 +258,16 @@ X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
     if (Is64Bit)
       return CSR_64_MostRegs_SaveList;
     break;
+  case CallingConv::X86_64_Win64:
+    return CSR_Win64_SaveList;
+  case CallingConv::X86_64_SysV:
+    if (CallsEHReturn)
+      return CSR_64EHRet_SaveList;
+    return CSR_64_SaveList;
   default:
     break;
   }

-  bool CallsEHReturn = MF->getMMI().callsEHReturn();
   if (Is64Bit) {
     if (IsWin64)
       return CSR_Win64_SaveList;
@@ -270,8 +280,10 @@ X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
   return CSR_32_SaveList;
 }

-const uint32_t*
-X86RegisterInfo::getCallPreservedMask(CallingConv::ID CC) const {
+const uint32_t *
+X86RegisterInfo::getCallPreservedMask(const MachineFunction &MF,
+                                      CallingConv::ID CC) const {
+  const X86Subtarget &Subtarget = MF.getSubtarget<X86Subtarget>();
   bool HasAVX = Subtarget.hasAVX();
   bool HasAVX512 = Subtarget.hasAVX512();

@@ -308,6 +320,10 @@ X86RegisterInfo::getCallPreservedMask(CallingConv::ID CC) const {
     break;
   default:
     break;
+  case CallingConv::X86_64_Win64:
+    return CSR_Win64_RegMask;
+  case CallingConv::X86_64_SysV:
+    return CSR_64_RegMask;
   }

   // Unlike getCalleeSavedRegs(), we don't have MMI so we can't check
@@ -349,7 +365,7 @@ BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
   // Set the base-pointer register and its aliases as reserved if needed.
   if (hasBasePointer(MF)) {
     CallingConv::ID CC = MF.getFunction()->getCallingConv();
-    const uint32_t* RegMask = getCallPreservedMask(CC);
+    const uint32_t *RegMask = getCallPreservedMask(MF, CC);
     if (MachineOperand::clobbersPhysReg(RegMask, getBaseRegister()))
       report_fatal_error(
         "Stack realignment in presence of dynamic allocas is not supported with"
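For context on the regmask plumbing above: a call-preserved mask is one bit per physical register, with a set bit meaning "preserved". MachineOperand::clobbersPhysReg, used in the base-pointer check, is essentially this (sketch of the existing LLVM helper, not new code in the patch):

    #include <cstdint>
    static bool clobbersPhysReg(const uint32_t *Mask, unsigned PhysReg) {
      // Bit clear => the call does not preserve the register => clobbered.
      return !(Mask[PhysReg / 32] & (1u << (PhysReg % 32)));
    }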
@@ -393,7 +409,7 @@ BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
         Reserved.set(*AI);
     }
   }
-  if (!Is64Bit || !Subtarget.hasAVX512()) {
+  if (!Is64Bit || !MF.getSubtarget<X86Subtarget>().hasAVX512()) {
     for (unsigned n = 16; n != 32; ++n) {
       for (MCRegAliasIterator AI(X86::XMM0 + n, this, true); AI.isValid(); ++AI)
         Reserved.set(*AI);
@@ -486,6 +502,24 @@ X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
   else
     BasePtr = (TFI->hasFP(MF) ? FramePtr : StackPtr);

+  // FRAME_ALLOC uses a single offset, with no register. It only works in the
+  // simple FP case, and doesn't work with stack realignment. On 32-bit, the
+  // offset is from the traditional base pointer location. On 64-bit, the
+  // offset is from the SP at the end of the prologue, not the FP location. This
+  // matches the behavior of llvm.frameaddress.
+  if (Opc == TargetOpcode::FRAME_ALLOC) {
+    MachineOperand &FI = MI.getOperand(FIOperandNum);
+    bool IsWinEH = MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
+    int Offset;
+    if (IsWinEH)
+      Offset = static_cast<const X86FrameLowering *>(TFI)
+                   ->getFrameIndexOffsetFromSP(MF, FrameIndex);
+    else
+      Offset = TFI->getFrameIndexOffset(MF, FrameIndex);
+    FI.ChangeToImmediate(Offset);
+    return;
+  }
+
   // For LEA64_32r when BasePtr is 32-bits (X32) we can use full-size 64-bit
   // register as source operand, semantic is the same and destination is
   // 32-bits. It saves one byte per lea in code since 0x67 prefix is avoided.
@@ -537,8 +571,9 @@ unsigned X86RegisterInfo::getFrameRegister(const MachineFunction &MF) const {
   return TFI->hasFP(MF) ? FramePtr : StackPtr;
 }

-unsigned X86RegisterInfo::getPtrSizedFrameRegister(
-    const MachineFunction &MF) const {
+unsigned
+X86RegisterInfo::getPtrSizedFrameRegister(const MachineFunction &MF) const {
+  const X86Subtarget &Subtarget = MF.getSubtarget<X86Subtarget>();
   unsigned FrameReg = getFrameRegister(MF);
   if (Subtarget.isTarget64BitILP32())
     FrameReg = getX86SubSuperRegister(FrameReg, MVT::i32, false);
diff --git a/lib/Target/X86/X86RegisterInfo.h b/lib/Target/X86/X86RegisterInfo.h
index 406b1fc..74edab9 100644
--- a/lib/Target/X86/X86RegisterInfo.h
+++ b/lib/Target/X86/X86RegisterInfo.h
@@ -20,14 +20,7 @@
 #include "X86GenRegisterInfo.inc"

 namespace llvm {
-  class Type;
-  class TargetInstrInfo;
-  class X86Subtarget;
-
 class X86RegisterInfo final : public X86GenRegisterInfo {
-public:
-  const X86Subtarget &Subtarget;
-
 private:
   /// Is64Bit - Is the target 64-bits.
   ///
@@ -55,7 +48,7 @@ private:
   unsigned BasePtr;

 public:
-  X86RegisterInfo(const X86Subtarget &STI);
+  X86RegisterInfo(const Triple &TT);

   // FIXME: This should be tablegen'd like getDwarfRegNum is
   int getSEHRegNum(unsigned i) const;
@@ -76,8 +69,9 @@ public:
   getSubClassWithSubReg(const TargetRegisterClass *RC,
                         unsigned Idx) const override;

-  const TargetRegisterClass*
-  getLargestLegalSuperClass(const TargetRegisterClass *RC) const override;
+  const TargetRegisterClass *
+  getLargestLegalSuperClass(const TargetRegisterClass *RC,
+                            const MachineFunction &MF) const override;

   /// getPointerRegClass - Returns a TargetRegisterClass used for pointer
   /// values.
@@ -98,7 +92,8 @@ public:
   /// callee-save registers on this target.
   const MCPhysReg *
   getCalleeSavedRegs(const MachineFunction* MF) const override;
-  const uint32_t *getCallPreservedMask(CallingConv::ID) const override;
+  const uint32_t *getCallPreservedMask(const MachineFunction &MF,
+                                       CallingConv::ID) const override;
   const uint32_t *getNoPreservedMask() const;

   /// getReservedRegs - Returns a bitset indexed by physical register number
diff --git a/lib/Target/X86/X86SchedHaswell.td b/lib/Target/X86/X86SchedHaswell.td
index 61c0600..677e824 100644
--- a/lib/Target/X86/X86SchedHaswell.td
+++ b/lib/Target/X86/X86SchedHaswell.td
@@ -2014,7 +2014,7 @@ def : InstRW<[WriteFMADDr],
               // 3p forms.
               "VF(N?)M(ADD|SUB|ADDSUB|SUBADD)P(S|D)(r213|r132|r231)r(Y)?",
               // 3s forms.
-              "VF(N?)M(ADD|SUB)S(S|D)(r132|231|213)r",
+              "VF(N?)M(ADD|SUB)S(S|D)(r132|r231|r213)r",
               // 4s/4s_int forms.
               "VF(N?)M(ADD|SUB)S(S|D)4rr(_REV|_Int)?",
               // 4p forms.
@@ -2031,7 +2031,7 @@ def : InstRW<[WriteFMADDm],
              // 3p forms.
              "VF(N?)M(ADD|SUB|ADDSUB|SUBADD)P(S|D)(r213|r132|r231)m(Y)?",
              // 3s forms.
-             "VF(N?)M(ADD|SUB)S(S|D)(r132|231|213)m",
+             "VF(N?)M(ADD|SUB)S(S|D)(r132|r231|r213)m",
              // 4s/4s_int forms.
              "VF(N?)M(ADD|SUB)S(S|D)4(rm|mr)(_Int)?",
              // 4p forms.
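The scheduler fix above matters because the old alternation "(r132|231|213)" only ever matched the r132 form; "231" and "213" could not match where an 'r' precedes the digits. A quick check using std::regex as a stand-in for llvm::Regex (illustrative only):

    #include <regex>
    static bool matchesFMA3s(const char *Name) {
      static const std::regex RE("VF(N?)M(ADD|SUB)S(S|D)(r132|r231|r213)r");
      return std::regex_search(Name, RE);  // now true for e.g. "VFMADDSSr213r"
    }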
diff --git a/lib/Target/X86/X86SelectionDAGInfo.cpp b/lib/Target/X86/X86SelectionDAGInfo.cpp
index 7feabf6..ca8fc9c 100644
--- a/lib/Target/X86/X86SelectionDAGInfo.cpp
+++ b/lib/Target/X86/X86SelectionDAGInfo.cpp
@@ -62,8 +62,8 @@ X86SelectionDAGInfo::EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl,

 #ifndef NDEBUG
   // If the base register might conflict with our physical registers, bail out.
-  unsigned ClobberSet[] = {X86::RCX, X86::RAX, X86::RDI,
-                           X86::ECX, X86::EAX, X86::EDI};
+  const unsigned ClobberSet[] = {X86::RCX, X86::RAX, X86::RDI,
+                                 X86::ECX, X86::EAX, X86::EDI};
   assert(!isBaseRegConflictPossible(DAG, ClobberSet));
 #endif

@@ -228,8 +228,8 @@ SDValue X86SelectionDAGInfo::EmitTargetCodeForMemcpy(
     return SDValue();

   // If the base register might conflict with our physical registers, bail out.
-  unsigned ClobberSet[] = {X86::RCX, X86::RSI, X86::RDI,
-                           X86::ECX, X86::ESI, X86::EDI};
+  const unsigned ClobberSet[] = {X86::RCX, X86::RSI, X86::RDI,
+                                 X86::ECX, X86::ESI, X86::EDI};
   if (isBaseRegConflictPossible(DAG, ClobberSet))
     return SDValue();

diff --git a/lib/Target/X86/X86TargetMachine.cpp b/lib/Target/X86/X86TargetMachine.cpp
index 4bde053..43d3895 100644
--- a/lib/Target/X86/X86TargetMachine.cpp
+++ b/lib/Target/X86/X86TargetMachine.cpp
@@ -37,10 +37,10 @@ static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
     return make_unique<TargetLoweringObjectFileMachO>();
   }

-  if (TT.isOSLinux())
-    return make_unique<X86LinuxTargetObjectFile>();
+  if (TT.isOSLinux() || TT.isOSNaCl())
+    return make_unique<X86LinuxNaClTargetObjectFile>();
   if (TT.isOSBinFormatELF())
-    return make_unique<TargetLoweringObjectFileELF>();
+    return make_unique<X86ELFTargetObjectFile>();
   if (TT.isKnownWindowsMSVCEnvironment())
     return make_unique<X86WindowsTargetObjectFile>();
   if (TT.isOSBinFormatCOFF())
@@ -94,9 +94,9 @@ X86TargetMachine::X86TargetMachine(const Target &T, StringRef TT, StringRef CPU,
                                    StringRef FS, const TargetOptions &Options,
                                    Reloc::Model RM, CodeModel::Model CM,
                                    CodeGenOpt::Level OL)
-    : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL),
+    : LLVMTargetMachine(T, computeDataLayout(Triple(TT)), TT, CPU, FS, Options,
+                        RM, CM, OL),
       TLOF(createTLOF(Triple(getTargetTriple()))),
-      DL(computeDataLayout(Triple(TT))),
       Subtarget(TT, CPU, FS, *this, Options.StackAlignmentOverride) {
   // default to hard float ABI
   if (Options.FloatABIType == FloatABI::Default)
diff --git a/lib/Target/X86/X86TargetMachine.h b/lib/Target/X86/X86TargetMachine.h
index 283858d..c9833ed 100644
--- a/lib/Target/X86/X86TargetMachine.h
+++ b/lib/Target/X86/X86TargetMachine.h
@@ -24,8 +24,6 @@ class StringRef;

 class X86TargetMachine final : public LLVMTargetMachine {
   std::unique_ptr<TargetLoweringObjectFile> TLOF;
-  // Calculates type size & alignment
-  const DataLayout DL;
   X86Subtarget Subtarget;

   mutable StringMap<std::unique_ptr<X86Subtarget>> SubtargetMap;
@@ -35,8 +33,6 @@ public:
                    const TargetOptions &Options, Reloc::Model RM,
                    CodeModel::Model CM, CodeGenOpt::Level OL);
   ~X86TargetMachine() override;
-  const DataLayout *getDataLayout() const override { return &DL; }
-
   const X86Subtarget *getSubtargetImpl() const override { return &Subtarget; }
   const X86Subtarget *getSubtargetImpl(const Function &F) const override;

   TargetIRAnalysis getTargetIRAnalysis() override;
diff --git a/lib/Target/X86/X86TargetObjectFile.cpp b/lib/Target/X86/X86TargetObjectFile.cpp
index 1d1c32e..d65d3b0 100644
--- a/lib/Target/X86/X86TargetObjectFile.cpp
+++ b/lib/Target/X86/X86TargetObjectFile.cpp
@@ -15,17 +15,13 @@
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCSectionCOFF.h"
 #include "llvm/MC/MCSectionELF.h"
+#include "llvm/MC/MCValue.h"
 #include "llvm/Support/Dwarf.h"
 #include "llvm/Target/TargetLowering.h"

 using namespace llvm;
 using namespace dwarf;

-X86_64MachoTargetObjectFile::X86_64MachoTargetObjectFile()
-  : TargetLoweringObjectFileMachO() {
-  SupportIndirectSymViaGOTPCRel = true;
-}
-
 const MCExpr *X86_64MachoTargetObjectFile::getTTypeGlobalReference(
     const GlobalValue *GV, unsigned Encoding, Mangler &Mang,
     const TargetMachine &TM, MachineModuleInfo *MMI,
@@ -52,28 +48,30 @@ MCSymbol *X86_64MachoTargetObjectFile::getCFIPersonalitySymbol(
 }

 const MCExpr *X86_64MachoTargetObjectFile::getIndirectSymViaGOTPCRel(
-    const MCSymbol *Sym, int64_t Offset) const {
+    const MCSymbol *Sym, const MCValue &MV, int64_t Offset,
+    MachineModuleInfo *MMI, MCStreamer &Streamer) const {
   // On Darwin/X86-64, we need to use foo@GOTPCREL+4 to access the got entry
   // from a data section. In case there's an additional offset, then use
   // foo@GOTPCREL+4+<offset>.
+  unsigned FinalOff = Offset+MV.getConstant()+4;
   const MCExpr *Res =
     MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_GOTPCREL, getContext());
-  const MCExpr *Off = MCConstantExpr::Create(Offset+4, getContext());
+  const MCExpr *Off = MCConstantExpr::Create(FinalOff, getContext());
   return MCBinaryExpr::CreateAdd(Res, Off, getContext());
 }

+const MCExpr *X86ELFTargetObjectFile::getDebugThreadLocalSymbol(
+    const MCSymbol *Sym) const {
+  return MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_DTPOFF, getContext());
+}
+
 void
-X86LinuxTargetObjectFile::Initialize(MCContext &Ctx, const TargetMachine &TM) {
+X86LinuxNaClTargetObjectFile::Initialize(MCContext &Ctx,
+                                         const TargetMachine &TM) {
   TargetLoweringObjectFileELF::Initialize(Ctx, TM);
   InitializeELF(TM.Options.UseInitArray);
 }

-const MCExpr *
-X86LinuxTargetObjectFile::getDebugThreadLocalSymbol(
-    const MCSymbol *Sym) const {
-  return MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_DTPOFF, getContext());
-}
-
 const MCExpr *X86WindowsTargetObjectFile::getExecutableRelativeSymbol(
     const ConstantExpr *CE, Mangler &Mang, const TargetMachine &TM) const {
   // We are looking for the difference of two symbols, need a subtraction
@@ -97,14 +95,12 @@ const MCExpr *X86WindowsTargetObjectFile::getExecutableRelativeSymbol(
       SubRHS->getPointerAddressSpace() != 0)
     return nullptr;

-  // Both ptrtoint instructions must wrap global variables:
+  // Both ptrtoint instructions must wrap global objects:
   // - Only global variables are eligible for image relative relocations.
-  // - The subtrahend refers to the special symbol __ImageBase, a global.
-  const GlobalVariable *GVLHS =
-      dyn_cast<GlobalVariable>(SubLHS->getPointerOperand());
-  const GlobalVariable *GVRHS =
-      dyn_cast<GlobalVariable>(SubRHS->getPointerOperand());
-  if (!GVLHS || !GVRHS)
+  // - The subtrahend refers to the special symbol __ImageBase, a GlobalVariable.
+  const auto *GOLHS = dyn_cast<GlobalObject>(SubLHS->getPointerOperand());
+  const auto *GVRHS = dyn_cast<GlobalVariable>(SubRHS->getPointerOperand());
+  if (!GOLHS || !GVRHS)
     return nullptr;

   // We expect __ImageBase to be a global variable without a section, externally
@@ -117,10 +113,10 @@ const MCExpr *X86WindowsTargetObjectFile::getExecutableRelativeSymbol(
     return nullptr;

   // An image-relative, thread-local, symbol makes no sense.
-  if (GVLHS->isThreadLocal())
+  if (GOLHS->isThreadLocal())
     return nullptr;

-  return MCSymbolRefExpr::Create(TM.getSymbol(GVLHS, Mang),
+  return MCSymbolRefExpr::Create(TM.getSymbol(GOLHS, Mang),
                                  MCSymbolRefExpr::VK_COFF_IMGREL32,
                                  getContext());
 }
diff --git a/lib/Target/X86/X86TargetObjectFile.h b/lib/Target/X86/X86TargetObjectFile.h
index f745538..2e25fb2 100644
--- a/lib/Target/X86/X86TargetObjectFile.h
+++ b/lib/Target/X86/X86TargetObjectFile.h
@@ -19,8 +19,6 @@ namespace llvm {
   /// x86-64.
   class X86_64MachoTargetObjectFile : public TargetLoweringObjectFileMachO {
   public:
-    X86_64MachoTargetObjectFile();
-
     const MCExpr *
     getTTypeGlobalReference(const GlobalValue *GV, unsigned Encoding,
                             Mangler &Mang, const TargetMachine &TM,
@@ -33,20 +31,25 @@ namespace llvm {
                               const TargetMachine &TM,
                               MachineModuleInfo *MMI) const override;

-    const MCExpr *
-    getIndirectSymViaGOTPCRel(const MCSymbol *Sym,
-                              int64_t Offset) const override;
+    const MCExpr *getIndirectSymViaGOTPCRel(const MCSymbol *Sym,
+                                            const MCValue &MV, int64_t Offset,
+                                            MachineModuleInfo *MMI,
+                                            MCStreamer &Streamer) const override;
   };

-  /// X86LinuxTargetObjectFile - This implementation is used for linux x86
-  /// and x86-64.
-  class X86LinuxTargetObjectFile : public TargetLoweringObjectFileELF {
-    void Initialize(MCContext &Ctx, const TargetMachine &TM) override;
-
+  /// \brief This implementation is used for X86 ELF targets that don't
+  /// have a further specialization.
+  class X86ELFTargetObjectFile : public TargetLoweringObjectFileELF {
     /// \brief Describe a TLS variable address within debug info.
     const MCExpr *getDebugThreadLocalSymbol(const MCSymbol *Sym) const override;
   };

+  /// X86LinuxNaClTargetObjectFile - This implementation is used for linux and
+  /// Native Client on x86 and x86-64.
+  class X86LinuxNaClTargetObjectFile : public X86ELFTargetObjectFile {
+    void Initialize(MCContext &Ctx, const TargetMachine &TM) override;
+  };
+
   /// \brief This implementation is used for Windows targets on x86 and x86-64.
   class X86WindowsTargetObjectFile : public TargetLoweringObjectFileCOFF {
     const MCExpr *
diff --git a/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.cpp b/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.cpp
index 4073549..d0a09b2 100644
--- a/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.cpp
+++ b/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.cpp
@@ -126,15 +126,11 @@ void XCoreTargetAsmStreamer::emitCCBottomFunction(StringRef Name) {
   }
 }

-static MCStreamer *
-createXCoreMCAsmStreamer(MCContext &Ctx, formatted_raw_ostream &OS,
-                         bool isVerboseAsm, bool useDwarfDirectory,
-                         MCInstPrinter *InstPrint, MCCodeEmitter *CE,
-                         MCAsmBackend *TAB, bool ShowInst) {
-  MCStreamer *S = llvm::createAsmStreamer(
-      Ctx, OS, isVerboseAsm, useDwarfDirectory, InstPrint, CE, TAB, ShowInst);
-  new XCoreTargetAsmStreamer(*S, OS);
-  return S;
+static MCTargetStreamer *createTargetAsmStreamer(MCStreamer &S,
+                                                 formatted_raw_ostream &OS,
+                                                 MCInstPrinter *InstPrint,
+                                                 bool isVerboseAsm) {
+  return new XCoreTargetAsmStreamer(S, OS);
 }

 // Force static initialization.
diff --git a/lib/Target/X86/X86TargetObjectFile.h b/lib/Target/X86/X86TargetObjectFile.h
index f745538..2e25fb2 100644
--- a/lib/Target/X86/X86TargetObjectFile.h
+++ b/lib/Target/X86/X86TargetObjectFile.h
@@ -19,8 +19,6 @@ namespace llvm {
   /// x86-64.
   class X86_64MachoTargetObjectFile : public TargetLoweringObjectFileMachO {
   public:
-    X86_64MachoTargetObjectFile();
-
     const MCExpr *
     getTTypeGlobalReference(const GlobalValue *GV, unsigned Encoding,
                             Mangler &Mang, const TargetMachine &TM,
@@ -33,20 +31,25 @@ namespace llvm {
                               const TargetMachine &TM,
                               MachineModuleInfo *MMI) const override;
 
-    const MCExpr *
-    getIndirectSymViaGOTPCRel(const MCSymbol *Sym,
-                              int64_t Offset) const override;
+    const MCExpr *getIndirectSymViaGOTPCRel(const MCSymbol *Sym,
+                                            const MCValue &MV, int64_t Offset,
+                                            MachineModuleInfo *MMI,
+                                            MCStreamer &Streamer) const override;
   };
 
-  /// X86LinuxTargetObjectFile - This implementation is used for linux x86
-  /// and x86-64.
-  class X86LinuxTargetObjectFile : public TargetLoweringObjectFileELF {
-    void Initialize(MCContext &Ctx, const TargetMachine &TM) override;
-
+  /// \brief This implementation is used for X86 ELF targets that don't
+  /// have a further specialization.
+  class X86ELFTargetObjectFile : public TargetLoweringObjectFileELF {
     /// \brief Describe a TLS variable address within debug info.
     const MCExpr *getDebugThreadLocalSymbol(const MCSymbol *Sym) const override;
   };
 
+  /// X86LinuxNaClTargetObjectFile - This implementation is used for linux and
+  /// Native Client on x86 and x86-64.
+  class X86LinuxNaClTargetObjectFile : public X86ELFTargetObjectFile {
+    void Initialize(MCContext &Ctx, const TargetMachine &TM) override;
+  };
+
   /// \brief This implementation is used for Windows targets on x86 and x86-64.
   class X86WindowsTargetObjectFile : public TargetLoweringObjectFileCOFF {
     const MCExpr *
diff --git a/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.cpp b/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.cpp
index 4073549..d0a09b2 100644
--- a/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.cpp
+++ b/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.cpp
@@ -126,15 +126,11 @@ void XCoreTargetAsmStreamer::emitCCBottomFunction(StringRef Name) {
   }
 }
 
-static MCStreamer *
-createXCoreMCAsmStreamer(MCContext &Ctx, formatted_raw_ostream &OS,
-                         bool isVerboseAsm, bool useDwarfDirectory,
-                         MCInstPrinter *InstPrint, MCCodeEmitter *CE,
-                         MCAsmBackend *TAB, bool ShowInst) {
-  MCStreamer *S = llvm::createAsmStreamer(
-      Ctx, OS, isVerboseAsm, useDwarfDirectory, InstPrint, CE, TAB, ShowInst);
-  new XCoreTargetAsmStreamer(*S, OS);
-  return S;
+static MCTargetStreamer *createTargetAsmStreamer(MCStreamer &S,
+                                                 formatted_raw_ostream &OS,
+                                                 MCInstPrinter *InstPrint,
+                                                 bool isVerboseAsm) {
+  return new XCoreTargetAsmStreamer(S, OS);
 }
 
 // Force static initialization.
@@ -160,5 +156,6 @@ extern "C" void LLVMInitializeXCoreTargetMC() {
   TargetRegistry::RegisterMCInstPrinter(TheXCoreTarget,
                                         createXCoreMCInstPrinter);
 
-  TargetRegistry::RegisterAsmStreamer(TheXCoreTarget, createXCoreMCAsmStreamer);
+  TargetRegistry::RegisterAsmTargetStreamer(TheXCoreTarget,
+                                            createTargetAsmStreamer);
 }
diff --git a/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.h b/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.h
index 0ff5961..28e0275 100644
--- a/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.h
+++ b/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.h
@@ -14,6 +14,8 @@
 #ifndef LLVM_LIB_TARGET_XCORE_MCTARGETDESC_XCOREMCTARGETDESC_H
 #define LLVM_LIB_TARGET_XCORE_MCTARGETDESC_XCOREMCTARGETDESC_H
 
+#include "llvm/Support/DataTypes.h"
+
 namespace llvm {
 class Target;
diff --git a/lib/Target/XCore/XCoreISelDAGToDAG.cpp b/lib/Target/XCore/XCoreISelDAGToDAG.cpp
index f79b78b..5c7ea5e 100644
--- a/lib/Target/XCore/XCoreISelDAGToDAG.cpp
+++ b/lib/Target/XCore/XCoreISelDAGToDAG.cpp
@@ -65,7 +65,7 @@ namespace {
     // Complex Pattern Selectors.
     bool SelectADDRspii(SDValue Addr, SDValue &Base, SDValue &Offset);
 
-    bool SelectInlineAsmMemoryOperand(const SDValue &Op, char ConstraintCode,
+    bool SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintID,
                                       std::vector<SDValue> &OutOps) override;
 
     const char *getPassName() const override {
@@ -108,12 +108,12 @@ bool XCoreDAGToDAGISel::SelectADDRspii(SDValue Addr, SDValue &Base,
 }
 
 bool XCoreDAGToDAGISel::
-SelectInlineAsmMemoryOperand(const SDValue &Op, char ConstraintCode,
+SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintID,
                              std::vector<SDValue> &OutOps) {
   SDValue Reg;
-  switch (ConstraintCode) {
+  switch (ConstraintID) {
   default: return true;
-  case 'm': // Memory.
+  case InlineAsm::Constraint_m: // Memory.
     switch (Op.getOpcode()) {
     default: return true;
     case XCoreISD::CPRelativeWrapper:
diff --git a/lib/Target/XCore/XCoreISelLowering.h b/lib/Target/XCore/XCoreISelLowering.h
index 213ae4a..b20fc01 100644
--- a/lib/Target/XCore/XCoreISelLowering.h
+++ b/lib/Target/XCore/XCoreISelLowering.h
@@ -177,6 +177,12 @@ namespace llvm {
                                  const std::string &Constraint,
                                  MVT VT) const override;
 
+    unsigned getInlineAsmMemConstraint(
+        const std::string &ConstraintCode) const override {
+      // FIXME: Map different constraints differently.
+      return InlineAsm::Constraint_m;
+    }
+
     // Expand specifics
     SDValue TryExpandADDWithMul(SDNode *Op, SelectionDAG &DAG) const;
     SDValue ExpandADDSUB(SDNode *Op, SelectionDAG &DAG) const;
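The XCore hook above maps every memory constraint string to InlineAsm::Constraint_m, as its FIXME notes. A hedged sketch of the hook's contract for a hypothetical target that distinguishes several memory constraints follows; the "Q" case is illustrative and not something XCore defines.

#include "llvm/IR/InlineAsm.h"
#include <string>
using namespace llvm;

static unsigned mapMemConstraint(const std::string &ConstraintCode) {
  // Return one of the InlineAsm::Constraint_* codes; the instruction
  // selector's SelectInlineAsmMemoryOperand then switches over this value
  // instead of a raw constraint letter, as in the XCoreISelDAGToDAG change.
  if (ConstraintCode == "m")
    return InlineAsm::Constraint_m;
  if (ConstraintCode == "Q") // hypothetical target-specific memory kind
    return InlineAsm::Constraint_Q;
  return InlineAsm::Constraint_Unknown;
}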
diff --git a/lib/Target/XCore/XCoreRegisterInfo.cpp b/lib/Target/XCore/XCoreRegisterInfo.cpp
index 5c666ae..1d569e8 100644
--- a/lib/Target/XCore/XCoreRegisterInfo.cpp
+++ b/lib/Target/XCore/XCoreRegisterInfo.cpp
@@ -208,8 +208,8 @@ bool XCoreRegisterInfo::needsFrameMoves(const MachineFunction &MF) {
          MF.getFunction()->needsUnwindTableEntry();
 }
 
-const MCPhysReg* XCoreRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF)
-  const {
+const MCPhysReg *
+XCoreRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
   // The callee saved registers LR & FP are explicitly handled during
   // emitPrologue & emitEpilogue and related functions.
   static const MCPhysReg CalleeSavedRegs[] = {
diff --git a/lib/Target/XCore/XCoreRegisterInfo.h b/lib/Target/XCore/XCoreRegisterInfo.h
index 5d7721c..010fccd 100644
--- a/lib/Target/XCore/XCoreRegisterInfo.h
+++ b/lib/Target/XCore/XCoreRegisterInfo.h
@@ -29,8 +29,7 @@ public:
 
   /// Code Generation virtual methods...
-  const MCPhysReg *
-  getCalleeSavedRegs(const MachineFunction *MF =nullptr) const override;
+  const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF) const override;
 
   BitVector getReservedRegs(const MachineFunction &MF) const override;
diff --git a/lib/Target/XCore/XCoreTargetMachine.cpp b/lib/Target/XCore/XCoreTargetMachine.cpp
index 7998fc1..228dc1c 100644
--- a/lib/Target/XCore/XCoreTargetMachine.cpp
+++ b/lib/Target/XCore/XCoreTargetMachine.cpp
@@ -27,9 +27,10 @@ XCoreTargetMachine::XCoreTargetMachine(const Target &T, StringRef TT,
                                        const TargetOptions &Options,
                                        Reloc::Model RM, CodeModel::Model CM,
                                        CodeGenOpt::Level OL)
-    : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL),
+    : LLVMTargetMachine(
+          T, "e-m:e-p:32:32-i1:8:32-i8:8:32-i16:16:32-i64:32-f64:32-a:0:32-n32",
+          TT, CPU, FS, Options, RM, CM, OL),
       TLOF(make_unique<XCoreTargetObjectFile>()),
-      DL("e-m:e-p:32:32-i1:8:32-i8:8:32-i16:16:32-i64:32-f64:32-a:0:32-n32"),
       Subtarget(TT, CPU, FS, *this) {
   initAsmInfo();
 }
diff --git a/lib/Target/XCore/XCoreTargetMachine.h b/lib/Target/XCore/XCoreTargetMachine.h
index c5df07c..0d324ab 100644
--- a/lib/Target/XCore/XCoreTargetMachine.h
+++ b/lib/Target/XCore/XCoreTargetMachine.h
@@ -21,7 +21,6 @@ namespace llvm {
 
 class XCoreTargetMachine : public LLVMTargetMachine {
   std::unique_ptr<TargetLoweringObjectFile> TLOF;
-  const DataLayout DL; // Calculates type size & alignment
   XCoreSubtarget Subtarget;
 public:
   XCoreTargetMachine(const Target &T, StringRef TT,
@@ -30,8 +29,10 @@ public:
                      CodeGenOpt::Level OL);
   ~XCoreTargetMachine() override;
 
-  const DataLayout *getDataLayout() const override { return &DL; }
-  const XCoreSubtarget *getSubtargetImpl() const override { return &Subtarget; }
+  const XCoreSubtarget *getSubtargetImpl() const { return &Subtarget; }
+  const XCoreSubtarget *getSubtargetImpl(const Function &) const override {
+    return &Subtarget;
+  }
 
   // Pass Pipeline Configuration
   TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
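With the data layout string handed to the LLVMTargetMachine constructor, the per-target getDataLayout() override disappears and clients query the machine directly; subtarget lookup likewise becomes per-function. A small sketch against the 3.7-era API, with an illustrative helper name:

#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Function.h"
#include "llvm/Target/TargetMachine.h"
using namespace llvm;

static unsigned pointerSizeInBytes(const TargetMachine &TM, const Function &F) {
  // One DataLayout per target machine, now owned by the base class.
  const DataLayout *DL = TM.getDataLayout();
  // Subtargets are resolved per function, enabling per-function codegen
  // attributes; XCore simply returns its single subtarget for any function.
  (void)TM.getSubtargetImpl(F);
  return DL->getPointerSize(); // 4 for XCore's p:32:32 layout
}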