field | value | date
---|---|---
author | Stephen Hines <srhines@google.com> | 2014-12-01 14:51:49 -0800
committer | Stephen Hines <srhines@google.com> | 2014-12-02 16:08:10 -0800
commit | 37ed9c199ca639565f6ce88105f9e39e898d82d0 (patch) |
tree | 8fb36d3910e3ee4c4e1b7422f4f017108efc52f5 /lib/Target/X86 |
parent | d2327b22152ced7bc46dc629fc908959e8a52d03 (diff) |
Update aosp/master LLVM for rebase to r222494.
Change-Id: Ic787f5e0124df789bd26f3f24680f45e678eef2d
Diffstat (limited to 'lib/Target/X86')
93 files changed, 14513 insertions, 7873 deletions
diff --git a/lib/Target/X86/Android.mk b/lib/Target/X86/Android.mk
index e2c4be7..861a41d 100644
--- a/lib/Target/X86/Android.mk
+++ b/lib/Target/X86/Android.mk
@@ -12,8 +12,6 @@ x86_codegen_TBLGEN_TABLES := \
 x86_codegen_SRC_FILES := \
   X86AsmPrinter.cpp \
-  X86AtomicExpandPass.cpp \
-  X86CodeEmitter.cpp \
   X86FastISel.cpp \
   X86FixupLEAs.cpp \
   X86FloatingPoint.cpp \
@@ -21,7 +19,6 @@ x86_codegen_SRC_FILES := \
   X86ISelDAGToDAG.cpp \
   X86ISelLowering.cpp \
   X86InstrInfo.cpp \
-  X86JITInfo.cpp \
   X86MachineFunctionInfo.cpp \
   X86MCInstLower.cpp \
   X86PadShortFunction.cpp \
diff --git a/lib/Target/X86/AsmParser/CMakeLists.txt b/lib/Target/X86/AsmParser/CMakeLists.txt
index b022a41..2c1926e 100644
--- a/lib/Target/X86/AsmParser/CMakeLists.txt
+++ b/lib/Target/X86/AsmParser/CMakeLists.txt
@@ -1,4 +1,7 @@
 add_llvm_library(LLVMX86AsmParser
   X86AsmInstrumentation.cpp
   X86AsmParser.cpp
+
+  LINK_LIBS
+  LLVMX86CodeGen
   )
diff --git a/lib/Target/X86/AsmParser/LLVMBuild.txt b/lib/Target/X86/AsmParser/LLVMBuild.txt
index 9f94d5d..284bfd0 100644
--- a/lib/Target/X86/AsmParser/LLVMBuild.txt
+++ b/lib/Target/X86/AsmParser/LLVMBuild.txt
@@ -19,5 +19,5 @@ type = Library
 name = X86AsmParser
 parent = X86
-required_libraries = MC MCParser Support X86Desc X86Info
+required_libraries = MC MCParser Support X86CodeGen X86Desc X86Info
 add_to_library_groups = X86
diff --git a/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp b/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp
index a365f62..9c49a11 100644
--- a/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp
+++ b/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp
@@ -10,9 +10,12 @@
 #include "MCTargetDesc/X86BaseInfo.h"
 #include "X86AsmInstrumentation.h"
 #include "X86Operand.h"
+#include "X86RegisterInfo.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/Triple.h"
+#include "llvm/CodeGen/MachineValueType.h"
 #include "llvm/IR/Function.h"
+#include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCInst.h"
 #include "llvm/MC/MCInstBuilder.h"
@@ -23,6 +26,73 @@
 #include "llvm/MC/MCTargetAsmParser.h"
 #include "llvm/MC/MCTargetOptions.h"
 #include "llvm/Support/CommandLine.h"
+#include <algorithm>
+#include <cassert>
+#include <vector>
+
+// The following comment describes how assembly instrumentation works.
+// Currently we have only AddressSanitizer instrumentation, but we're
+// planning to implement MemorySanitizer for inline assembly too. If
+// you're not familiar with the AddressSanitizer algorithm, please read
+// https://code.google.com/p/address-sanitizer/wiki/AddressSanitizerAlgorithm.
+//
+// When inline assembly is parsed by an instance of X86AsmParser, all
+// instructions are emitted via the EmitInstruction method. That's the
+// place where X86AsmInstrumentation analyzes an instruction and
+// decides whether the instruction should be emitted as is or whether
+// instrumentation is required. The latter case happens when an
+// instruction reads from or writes to memory. Currently the instruction
+// opcode is checked explicitly, and if an instruction has a memory
+// operand (for instance, movq (%rsi, %rcx, 8), %rax) it should be
+// instrumented. There also exist instructions that modify memory but
+// don't have an explicit memory operand, for instance, movs.
+//
+// Let's first consider 8-byte memory accesses where an instruction has
+// an explicit memory operand. In this case we need two registers:
+// AddressReg to compute the address of the accessed memory cells, and
+// ShadowReg to compute the corresponding shadow address. So we need to
+// spill both registers before the instrumentation code and restore them
+// after it. Thus, in general, the instrumentation code will look like
+// this:
+//   PUSHF                   # Store flags, otherwise they will be overwritten
+//   PUSH AddressReg         # spill AddressReg
+//   PUSH ShadowReg          # spill ShadowReg
+//   LEA MemOp, AddressReg   # compute address of the memory operand
+//   MOV AddressReg, ShadowReg
+//   SHR ShadowReg, 3
+//   # ShadowOffset(AddressReg >> 3) contains the address of the shadow
+//   # corresponding to MemOp.
+//   CMP ShadowOffset(ShadowReg), 0  # test shadow value
+//   JZ .Done                # when the shadow equals zero, everything is fine
+//   MOV AddressReg, RDI
+//   # Call the __asan_report function with AddressReg as an argument
+//   CALL __asan_report
+// .Done:
+//   POP ShadowReg           # Restore ShadowReg
+//   POP AddressReg          # Restore AddressReg
+//   POPF                    # Restore flags
+//
+// Memory accesses of other sizes (1, 2, 4 and 16 bytes) are handled in
+// a similar manner, but small memory accesses (less than 8 bytes)
+// require an additional ScratchReg, which is used for the shadow value.
+//
+// If, say, we're instrumenting an instruction like movs, only the
+// contents of RDI, RDI + AccessSize * RCX, RSI and RSI + AccessSize *
+// RCX are checked. In this case there is no need to spill and restore
+// AddressReg, ShadowReg or the flags four times; they're saved on the
+// stack just once, before the instrumentation of these four addresses,
+// and restored at the end of it.
+//
+// A few things complicate this simple algorithm:
+// * The instrumented memory operand can have RSP as a base or an index
+//   register, so we need to add a constant offset before the address
+//   computation, since flags, AddressReg, ShadowReg, etc. were already
+//   stored on the stack and RSP was modified.
+// * Debug info (usually DWARF) should be adjusted, because sometimes
+//   RSP is used as a frame register, so we need to select some register
+//   as a frame register and temporarily override the current CFA
+//   register.

 namespace llvm {
 namespace {

@@ -32,10 +102,23 @@ static cl::opt<bool> ClAsanInstrumentAssembly(
     cl::desc("instrument assembly with AddressSanitizer checks"), cl::Hidden,
     cl::init(false));

-bool IsStackReg(unsigned Reg) {
-  return Reg == X86::RSP || Reg == X86::ESP || Reg == X86::SP;
+const int64_t MinAllowedDisplacement = std::numeric_limits<int32_t>::min();
+const int64_t MaxAllowedDisplacement = std::numeric_limits<int32_t>::max();
+
+int64_t ApplyDisplacementBounds(int64_t Displacement) {
+  return std::max(std::min(MaxAllowedDisplacement, Displacement),
+                  MinAllowedDisplacement);
+}
+
+void CheckDisplacementBounds(int64_t Displacement) {
+  assert(Displacement >= MinAllowedDisplacement &&
+         Displacement <= MaxAllowedDisplacement);
 }

+bool IsStackReg(unsigned Reg) { return Reg == X86::RSP || Reg == X86::ESP; }
+
+bool IsSmallMemAccess(unsigned AccessSize) { return AccessSize < 8; }
+
 std::string FuncName(unsigned AccessSize, bool IsWrite) {
   return std::string("__asan_report_") + (IsWrite ?
"store" : "load") + utostr(AccessSize); @@ -43,60 +126,245 @@ std::string FuncName(unsigned AccessSize, bool IsWrite) { class X86AddressSanitizer : public X86AsmInstrumentation { public: - X86AddressSanitizer(const MCSubtargetInfo &STI) : STI(STI) {} + struct RegisterContext { + private: + enum RegOffset { + REG_OFFSET_ADDRESS = 0, + REG_OFFSET_SHADOW, + REG_OFFSET_SCRATCH + }; + + public: + RegisterContext(unsigned AddressReg, unsigned ShadowReg, + unsigned ScratchReg) { + BusyRegs.push_back(convReg(AddressReg, MVT::i64)); + BusyRegs.push_back(convReg(ShadowReg, MVT::i64)); + BusyRegs.push_back(convReg(ScratchReg, MVT::i64)); + } + + unsigned AddressReg(MVT::SimpleValueType VT) const { + return convReg(BusyRegs[REG_OFFSET_ADDRESS], VT); + } + + unsigned ShadowReg(MVT::SimpleValueType VT) const { + return convReg(BusyRegs[REG_OFFSET_SHADOW], VT); + } + + unsigned ScratchReg(MVT::SimpleValueType VT) const { + return convReg(BusyRegs[REG_OFFSET_SCRATCH], VT); + } + + void AddBusyReg(unsigned Reg) { + if (Reg != X86::NoRegister) + BusyRegs.push_back(convReg(Reg, MVT::i64)); + } + + void AddBusyRegs(const X86Operand &Op) { + AddBusyReg(Op.getMemBaseReg()); + AddBusyReg(Op.getMemIndexReg()); + } + + unsigned ChooseFrameReg(MVT::SimpleValueType VT) const { + static const unsigned Candidates[] = { X86::RBP, X86::RAX, X86::RBX, + X86::RCX, X86::RDX, X86::RDI, + X86::RSI }; + for (unsigned Reg : Candidates) { + if (!std::count(BusyRegs.begin(), BusyRegs.end(), Reg)) + return convReg(Reg, VT); + } + return X86::NoRegister; + } + + private: + unsigned convReg(unsigned Reg, MVT::SimpleValueType VT) const { + return Reg == X86::NoRegister ? Reg : getX86SubSuperRegister(Reg, VT); + } + + std::vector<unsigned> BusyRegs; + }; + + X86AddressSanitizer(const MCSubtargetInfo &STI) + : X86AsmInstrumentation(STI), RepPrefix(false), OrigSPOffset(0) {} + virtual ~X86AddressSanitizer() {} // X86AsmInstrumentation implementation: - virtual void InstrumentInstruction( - const MCInst &Inst, OperandVector &Operands, MCContext &Ctx, - const MCInstrInfo &MII, MCStreamer &Out) override { + virtual void InstrumentAndEmitInstruction(const MCInst &Inst, + OperandVector &Operands, + MCContext &Ctx, + const MCInstrInfo &MII, + MCStreamer &Out) override { + InstrumentMOVS(Inst, Operands, Ctx, MII, Out); + if (RepPrefix) + EmitInstruction(Out, MCInstBuilder(X86::REP_PREFIX)); + InstrumentMOV(Inst, Operands, Ctx, MII, Out); - } - // Should be implemented differently in x86_32 and x86_64 subclasses. - virtual void InstrumentMemOperandSmallImpl( - X86Operand &Op, unsigned AccessSize, bool IsWrite, MCContext &Ctx, - MCStreamer &Out) = 0; - virtual void InstrumentMemOperandLargeImpl( - X86Operand &Op, unsigned AccessSize, bool IsWrite, MCContext &Ctx, - MCStreamer &Out) = 0; + RepPrefix = (Inst.getOpcode() == X86::REP_PREFIX); + if (!RepPrefix) + EmitInstruction(Out, Inst); + } - void InstrumentMemOperand(MCParsedAsmOperand &Op, unsigned AccessSize, - bool IsWrite, MCContext &Ctx, MCStreamer &Out); + // Adjusts up stack and saves all registers used in instrumentation. + virtual void InstrumentMemOperandPrologue(const RegisterContext &RegCtx, + MCContext &Ctx, + MCStreamer &Out) = 0; + + // Restores all registers used in instrumentation and adjusts stack. 
+ virtual void InstrumentMemOperandEpilogue(const RegisterContext &RegCtx, + MCContext &Ctx, + MCStreamer &Out) = 0; + + virtual void InstrumentMemOperandSmall(X86Operand &Op, unsigned AccessSize, + bool IsWrite, + const RegisterContext &RegCtx, + MCContext &Ctx, MCStreamer &Out) = 0; + virtual void InstrumentMemOperandLarge(X86Operand &Op, unsigned AccessSize, + bool IsWrite, + const RegisterContext &RegCtx, + MCContext &Ctx, MCStreamer &Out) = 0; + + virtual void InstrumentMOVSImpl(unsigned AccessSize, MCContext &Ctx, + MCStreamer &Out) = 0; + + void InstrumentMemOperand(X86Operand &Op, unsigned AccessSize, bool IsWrite, + const RegisterContext &RegCtx, MCContext &Ctx, + MCStreamer &Out); + void InstrumentMOVSBase(unsigned DstReg, unsigned SrcReg, unsigned CntReg, + unsigned AccessSize, MCContext &Ctx, MCStreamer &Out); + + void InstrumentMOVS(const MCInst &Inst, OperandVector &Operands, + MCContext &Ctx, const MCInstrInfo &MII, MCStreamer &Out); void InstrumentMOV(const MCInst &Inst, OperandVector &Operands, MCContext &Ctx, const MCInstrInfo &MII, MCStreamer &Out); - void EmitInstruction(MCStreamer &Out, const MCInst &Inst) { - Out.EmitInstruction(Inst, STI); - } +protected: void EmitLabel(MCStreamer &Out, MCSymbol *Label) { Out.EmitLabel(Label); } -protected: - const MCSubtargetInfo &STI; + void EmitLEA(X86Operand &Op, MVT::SimpleValueType VT, unsigned Reg, + MCStreamer &Out) { + assert(VT == MVT::i32 || VT == MVT::i64); + MCInst Inst; + Inst.setOpcode(VT == MVT::i32 ? X86::LEA32r : X86::LEA64r); + Inst.addOperand(MCOperand::CreateReg(getX86SubSuperRegister(Reg, VT))); + Op.addMemOperands(Inst, 5); + EmitInstruction(Out, Inst); + } + + void ComputeMemOperandAddress(X86Operand &Op, MVT::SimpleValueType VT, + unsigned Reg, MCContext &Ctx, MCStreamer &Out); + + // Creates new memory operand with Displacement added to an original + // displacement. Residue will contain a residue which could happen when the + // total displacement exceeds 32-bit limitation. + std::unique_ptr<X86Operand> AddDisplacement(X86Operand &Op, + int64_t Displacement, + MCContext &Ctx, int64_t *Residue); + + // True when previous instruction was actually REP prefix. + bool RepPrefix; + + // Offset from the original SP register. + int64_t OrigSPOffset; }; void X86AddressSanitizer::InstrumentMemOperand( - MCParsedAsmOperand &Op, unsigned AccessSize, bool IsWrite, MCContext &Ctx, - MCStreamer &Out) { + X86Operand &Op, unsigned AccessSize, bool IsWrite, + const RegisterContext &RegCtx, MCContext &Ctx, MCStreamer &Out) { assert(Op.isMem() && "Op should be a memory operand."); assert((AccessSize & (AccessSize - 1)) == 0 && AccessSize <= 16 && "AccessSize should be a power of two, less or equal than 16."); + // FIXME: take into account load/store alignment. + if (IsSmallMemAccess(AccessSize)) + InstrumentMemOperandSmall(Op, AccessSize, IsWrite, RegCtx, Ctx, Out); + else + InstrumentMemOperandLarge(Op, AccessSize, IsWrite, RegCtx, Ctx, Out); +} + +void X86AddressSanitizer::InstrumentMOVSBase(unsigned DstReg, unsigned SrcReg, + unsigned CntReg, + unsigned AccessSize, + MCContext &Ctx, MCStreamer &Out) { + // FIXME: check whole ranges [DstReg .. DstReg + AccessSize * (CntReg - 1)] + // and [SrcReg .. SrcReg + AccessSize * (CntReg - 1)]. + RegisterContext RegCtx(X86::RDX /* AddressReg */, X86::RAX /* ShadowReg */, + IsSmallMemAccess(AccessSize) + ? 
X86::RBX + : X86::NoRegister /* ScratchReg */); + RegCtx.AddBusyReg(DstReg); + RegCtx.AddBusyReg(SrcReg); + RegCtx.AddBusyReg(CntReg); + + InstrumentMemOperandPrologue(RegCtx, Ctx, Out); + + // Test (%SrcReg) + { + const MCExpr *Disp = MCConstantExpr::Create(0, Ctx); + std::unique_ptr<X86Operand> Op(X86Operand::CreateMem( + 0, Disp, SrcReg, 0, AccessSize, SMLoc(), SMLoc())); + InstrumentMemOperand(*Op, AccessSize, false /* IsWrite */, RegCtx, Ctx, + Out); + } + + // Test -1(%SrcReg, %CntReg, AccessSize) + { + const MCExpr *Disp = MCConstantExpr::Create(-1, Ctx); + std::unique_ptr<X86Operand> Op(X86Operand::CreateMem( + 0, Disp, SrcReg, CntReg, AccessSize, SMLoc(), SMLoc())); + InstrumentMemOperand(*Op, AccessSize, false /* IsWrite */, RegCtx, Ctx, + Out); + } + + // Test (%DstReg) + { + const MCExpr *Disp = MCConstantExpr::Create(0, Ctx); + std::unique_ptr<X86Operand> Op(X86Operand::CreateMem( + 0, Disp, DstReg, 0, AccessSize, SMLoc(), SMLoc())); + InstrumentMemOperand(*Op, AccessSize, true /* IsWrite */, RegCtx, Ctx, Out); + } + + // Test -1(%DstReg, %CntReg, AccessSize) + { + const MCExpr *Disp = MCConstantExpr::Create(-1, Ctx); + std::unique_ptr<X86Operand> Op(X86Operand::CreateMem( + 0, Disp, DstReg, CntReg, AccessSize, SMLoc(), SMLoc())); + InstrumentMemOperand(*Op, AccessSize, true /* IsWrite */, RegCtx, Ctx, Out); + } + + InstrumentMemOperandEpilogue(RegCtx, Ctx, Out); +} + +void X86AddressSanitizer::InstrumentMOVS(const MCInst &Inst, + OperandVector &Operands, + MCContext &Ctx, const MCInstrInfo &MII, + MCStreamer &Out) { + // Access size in bytes. + unsigned AccessSize = 0; - X86Operand &MemOp = static_cast<X86Operand &>(Op); - // FIXME: get rid of this limitation. - if (IsStackReg(MemOp.getMemBaseReg()) || IsStackReg(MemOp.getMemIndexReg())) + switch (Inst.getOpcode()) { + case X86::MOVSB: + AccessSize = 1; + break; + case X86::MOVSW: + AccessSize = 2; + break; + case X86::MOVSL: + AccessSize = 4; + break; + case X86::MOVSQ: + AccessSize = 8; + break; + default: return; + } - // FIXME: take into account load/store alignment. - if (AccessSize < 8) - InstrumentMemOperandSmallImpl(MemOp, AccessSize, IsWrite, Ctx, Out); - else - InstrumentMemOperandLargeImpl(MemOp, AccessSize, IsWrite, Ctx, Out); + InstrumentMOVSImpl(AccessSize, Ctx, Out); } -void X86AddressSanitizer::InstrumentMOV( - const MCInst &Inst, OperandVector &Operands, MCContext &Ctx, - const MCInstrInfo &MII, MCStreamer &Out) { +void X86AddressSanitizer::InstrumentMOV(const MCInst &Inst, + OperandVector &Operands, MCContext &Ctx, + const MCInstrInfo &MII, + MCStreamer &Out) { // Access size in bytes. unsigned AccessSize = 0; @@ -132,41 +400,199 @@ void X86AddressSanitizer::InstrumentMOV( } const bool IsWrite = MII.get(Inst.getOpcode()).mayStore(); + for (unsigned Ix = 0; Ix < Operands.size(); ++Ix) { assert(Operands[Ix]); MCParsedAsmOperand &Op = *Operands[Ix]; - if (Op.isMem()) - InstrumentMemOperand(Op, AccessSize, IsWrite, Ctx, Out); + if (Op.isMem()) { + X86Operand &MemOp = static_cast<X86Operand &>(Op); + RegisterContext RegCtx( + X86::RDI /* AddressReg */, X86::RAX /* ShadowReg */, + IsSmallMemAccess(AccessSize) ? 
X86::RCX + : X86::NoRegister /* ScratchReg */); + RegCtx.AddBusyRegs(MemOp); + InstrumentMemOperandPrologue(RegCtx, Ctx, Out); + InstrumentMemOperand(MemOp, AccessSize, IsWrite, RegCtx, Ctx, Out); + InstrumentMemOperandEpilogue(RegCtx, Ctx, Out); + } } } +void X86AddressSanitizer::ComputeMemOperandAddress(X86Operand &Op, + MVT::SimpleValueType VT, + unsigned Reg, MCContext &Ctx, + MCStreamer &Out) { + int64_t Displacement = 0; + if (IsStackReg(Op.getMemBaseReg())) + Displacement -= OrigSPOffset; + if (IsStackReg(Op.getMemIndexReg())) + Displacement -= OrigSPOffset * Op.getMemScale(); + + assert(Displacement >= 0); + + // Emit Op as is. + if (Displacement == 0) { + EmitLEA(Op, VT, Reg, Out); + return; + } + + int64_t Residue; + std::unique_ptr<X86Operand> NewOp = + AddDisplacement(Op, Displacement, Ctx, &Residue); + EmitLEA(*NewOp, VT, Reg, Out); + + while (Residue != 0) { + const MCConstantExpr *Disp = + MCConstantExpr::Create(ApplyDisplacementBounds(Residue), Ctx); + std::unique_ptr<X86Operand> DispOp = + X86Operand::CreateMem(0, Disp, Reg, 0, 1, SMLoc(), SMLoc()); + EmitLEA(*DispOp, VT, Reg, Out); + Residue -= Disp->getValue(); + } +} + +std::unique_ptr<X86Operand> +X86AddressSanitizer::AddDisplacement(X86Operand &Op, int64_t Displacement, + MCContext &Ctx, int64_t *Residue) { + assert(Displacement >= 0); + + if (Displacement == 0 || + (Op.getMemDisp() && Op.getMemDisp()->getKind() != MCExpr::Constant)) { + *Residue = Displacement; + return X86Operand::CreateMem(Op.getMemSegReg(), Op.getMemDisp(), + Op.getMemBaseReg(), Op.getMemIndexReg(), + Op.getMemScale(), SMLoc(), SMLoc()); + } + + int64_t OrigDisplacement = + static_cast<const MCConstantExpr *>(Op.getMemDisp())->getValue(); + CheckDisplacementBounds(OrigDisplacement); + Displacement += OrigDisplacement; + + int64_t NewDisplacement = ApplyDisplacementBounds(Displacement); + CheckDisplacementBounds(NewDisplacement); + + *Residue = Displacement - NewDisplacement; + const MCExpr *Disp = MCConstantExpr::Create(NewDisplacement, Ctx); + return X86Operand::CreateMem(Op.getMemSegReg(), Disp, Op.getMemBaseReg(), + Op.getMemIndexReg(), Op.getMemScale(), SMLoc(), + SMLoc()); +} + class X86AddressSanitizer32 : public X86AddressSanitizer { public: static const long kShadowOffset = 0x20000000; X86AddressSanitizer32(const MCSubtargetInfo &STI) : X86AddressSanitizer(STI) {} + virtual ~X86AddressSanitizer32() {} - virtual void InstrumentMemOperandSmallImpl( - X86Operand &Op, unsigned AccessSize, bool IsWrite, MCContext &Ctx, - MCStreamer &Out) override; - virtual void InstrumentMemOperandLargeImpl( - X86Operand &Op, unsigned AccessSize, bool IsWrite, MCContext &Ctx, - MCStreamer &Out) override; + unsigned GetFrameReg(const MCContext &Ctx, MCStreamer &Out) { + unsigned FrameReg = GetFrameRegGeneric(Ctx, Out); + if (FrameReg == X86::NoRegister) + return FrameReg; + return getX86SubSuperRegister(FrameReg, MVT::i32); + } + + void SpillReg(MCStreamer &Out, unsigned Reg) { + EmitInstruction(Out, MCInstBuilder(X86::PUSH32r).addReg(Reg)); + OrigSPOffset -= 4; + } + + void RestoreReg(MCStreamer &Out, unsigned Reg) { + EmitInstruction(Out, MCInstBuilder(X86::POP32r).addReg(Reg)); + OrigSPOffset += 4; + } + + void StoreFlags(MCStreamer &Out) { + EmitInstruction(Out, MCInstBuilder(X86::PUSHF32)); + OrigSPOffset -= 4; + } + + void RestoreFlags(MCStreamer &Out) { + EmitInstruction(Out, MCInstBuilder(X86::POPF32)); + OrigSPOffset += 4; + } + + virtual void InstrumentMemOperandPrologue(const RegisterContext &RegCtx, + MCContext &Ctx, + MCStreamer &Out) override { + 
unsigned LocalFrameReg = RegCtx.ChooseFrameReg(MVT::i32); + assert(LocalFrameReg != X86::NoRegister); + + const MCRegisterInfo *MRI = Ctx.getRegisterInfo(); + unsigned FrameReg = GetFrameReg(Ctx, Out); + if (MRI && FrameReg != X86::NoRegister) { + SpillReg(Out, LocalFrameReg); + if (FrameReg == X86::ESP) { + Out.EmitCFIAdjustCfaOffset(4 /* byte size of the LocalFrameReg */); + Out.EmitCFIRelOffset( + MRI->getDwarfRegNum(LocalFrameReg, true /* IsEH */), 0); + } + EmitInstruction( + Out, + MCInstBuilder(X86::MOV32rr).addReg(LocalFrameReg).addReg(FrameReg)); + Out.EmitCFIRememberState(); + Out.EmitCFIDefCfaRegister( + MRI->getDwarfRegNum(LocalFrameReg, true /* IsEH */)); + } + + SpillReg(Out, RegCtx.AddressReg(MVT::i32)); + SpillReg(Out, RegCtx.ShadowReg(MVT::i32)); + if (RegCtx.ScratchReg(MVT::i32) != X86::NoRegister) + SpillReg(Out, RegCtx.ScratchReg(MVT::i32)); + StoreFlags(Out); + } + + virtual void InstrumentMemOperandEpilogue(const RegisterContext &RegCtx, + MCContext &Ctx, + MCStreamer &Out) override { + unsigned LocalFrameReg = RegCtx.ChooseFrameReg(MVT::i32); + assert(LocalFrameReg != X86::NoRegister); + + RestoreFlags(Out); + if (RegCtx.ScratchReg(MVT::i32) != X86::NoRegister) + RestoreReg(Out, RegCtx.ScratchReg(MVT::i32)); + RestoreReg(Out, RegCtx.ShadowReg(MVT::i32)); + RestoreReg(Out, RegCtx.AddressReg(MVT::i32)); + + unsigned FrameReg = GetFrameReg(Ctx, Out); + if (Ctx.getRegisterInfo() && FrameReg != X86::NoRegister) { + RestoreReg(Out, LocalFrameReg); + Out.EmitCFIRestoreState(); + if (FrameReg == X86::ESP) + Out.EmitCFIAdjustCfaOffset(-4 /* byte size of the LocalFrameReg */); + } + } - private: - void EmitCallAsanReport(MCContext &Ctx, MCStreamer &Out, unsigned AccessSize, - bool IsWrite, unsigned AddressReg) { + virtual void InstrumentMemOperandSmall(X86Operand &Op, unsigned AccessSize, + bool IsWrite, + const RegisterContext &RegCtx, + MCContext &Ctx, + MCStreamer &Out) override; + virtual void InstrumentMemOperandLarge(X86Operand &Op, unsigned AccessSize, + bool IsWrite, + const RegisterContext &RegCtx, + MCContext &Ctx, + MCStreamer &Out) override; + virtual void InstrumentMOVSImpl(unsigned AccessSize, MCContext &Ctx, + MCStreamer &Out) override; + +private: + void EmitCallAsanReport(unsigned AccessSize, bool IsWrite, MCContext &Ctx, + MCStreamer &Out, const RegisterContext &RegCtx) { EmitInstruction(Out, MCInstBuilder(X86::CLD)); EmitInstruction(Out, MCInstBuilder(X86::MMX_EMMS)); - EmitInstruction(Out, MCInstBuilder(X86::AND64ri8).addReg(X86::ESP) - .addReg(X86::ESP).addImm(-16)); - EmitInstruction(Out, MCInstBuilder(X86::PUSH32r).addReg(AddressReg)); + EmitInstruction(Out, MCInstBuilder(X86::AND64ri8) + .addReg(X86::ESP) + .addReg(X86::ESP) + .addImm(-16)); + EmitInstruction( + Out, MCInstBuilder(X86::PUSH32r).addReg(RegCtx.AddressReg(MVT::i32))); - - const std::string& Fn = FuncName(AccessSize, IsWrite); + const std::string &Fn = FuncName(AccessSize, IsWrite); MCSymbol *FnSym = Ctx.GetOrCreateSymbol(StringRef(Fn)); const MCSymbolRefExpr *FnExpr = MCSymbolRefExpr::Create(FnSym, MCSymbolRefExpr::VK_PLT, Ctx); @@ -174,67 +600,64 @@ public: } }; -void X86AddressSanitizer32::InstrumentMemOperandSmallImpl( - X86Operand &Op, unsigned AccessSize, bool IsWrite, MCContext &Ctx, - MCStreamer &Out) { - EmitInstruction(Out, MCInstBuilder(X86::PUSH32r).addReg(X86::EAX)); - EmitInstruction(Out, MCInstBuilder(X86::PUSH32r).addReg(X86::ECX)); - EmitInstruction(Out, MCInstBuilder(X86::PUSH32r).addReg(X86::EDX)); - EmitInstruction(Out, MCInstBuilder(X86::PUSHF32)); +void 
X86AddressSanitizer32::InstrumentMemOperandSmall( + X86Operand &Op, unsigned AccessSize, bool IsWrite, + const RegisterContext &RegCtx, MCContext &Ctx, MCStreamer &Out) { + unsigned AddressRegI32 = RegCtx.AddressReg(MVT::i32); + unsigned ShadowRegI32 = RegCtx.ShadowReg(MVT::i32); + unsigned ShadowRegI8 = RegCtx.ShadowReg(MVT::i8); - { - MCInst Inst; - Inst.setOpcode(X86::LEA32r); - Inst.addOperand(MCOperand::CreateReg(X86::EAX)); - Op.addMemOperands(Inst, 5); - EmitInstruction(Out, Inst); - } + assert(RegCtx.ScratchReg(MVT::i32) != X86::NoRegister); + unsigned ScratchRegI32 = RegCtx.ScratchReg(MVT::i32); - EmitInstruction( - Out, MCInstBuilder(X86::MOV32rr).addReg(X86::ECX).addReg(X86::EAX)); - EmitInstruction(Out, MCInstBuilder(X86::SHR32ri).addReg(X86::ECX) - .addReg(X86::ECX).addImm(3)); + ComputeMemOperandAddress(Op, MVT::i32, AddressRegI32, Ctx, Out); + + EmitInstruction(Out, MCInstBuilder(X86::MOV32rr).addReg(ShadowRegI32).addReg( + AddressRegI32)); + EmitInstruction(Out, MCInstBuilder(X86::SHR32ri) + .addReg(ShadowRegI32) + .addReg(ShadowRegI32) + .addImm(3)); { MCInst Inst; Inst.setOpcode(X86::MOV8rm); - Inst.addOperand(MCOperand::CreateReg(X86::CL)); + Inst.addOperand(MCOperand::CreateReg(ShadowRegI8)); const MCExpr *Disp = MCConstantExpr::Create(kShadowOffset, Ctx); std::unique_ptr<X86Operand> Op( - X86Operand::CreateMem(0, Disp, X86::ECX, 0, 1, SMLoc(), SMLoc())); + X86Operand::CreateMem(0, Disp, ShadowRegI32, 0, 1, SMLoc(), SMLoc())); Op->addMemOperands(Inst, 5); EmitInstruction(Out, Inst); } - EmitInstruction(Out, - MCInstBuilder(X86::TEST8rr).addReg(X86::CL).addReg(X86::CL)); + EmitInstruction( + Out, MCInstBuilder(X86::TEST8rr).addReg(ShadowRegI8).addReg(ShadowRegI8)); MCSymbol *DoneSym = Ctx.CreateTempSymbol(); const MCExpr *DoneExpr = MCSymbolRefExpr::Create(DoneSym, Ctx); EmitInstruction(Out, MCInstBuilder(X86::JE_4).addExpr(DoneExpr)); - EmitInstruction( - Out, MCInstBuilder(X86::MOV32rr).addReg(X86::EDX).addReg(X86::EAX)); - EmitInstruction(Out, MCInstBuilder(X86::AND32ri).addReg(X86::EDX) - .addReg(X86::EDX).addImm(7)); + EmitInstruction(Out, MCInstBuilder(X86::MOV32rr).addReg(ScratchRegI32).addReg( + AddressRegI32)); + EmitInstruction(Out, MCInstBuilder(X86::AND32ri) + .addReg(ScratchRegI32) + .addReg(ScratchRegI32) + .addImm(7)); switch (AccessSize) { case 1: break; case 2: { - MCInst Inst; - Inst.setOpcode(X86::LEA32r); - Inst.addOperand(MCOperand::CreateReg(X86::EDX)); - const MCExpr *Disp = MCConstantExpr::Create(1, Ctx); std::unique_ptr<X86Operand> Op( - X86Operand::CreateMem(0, Disp, X86::EDX, 0, 1, SMLoc(), SMLoc())); - Op->addMemOperands(Inst, 5); - EmitInstruction(Out, Inst); + X86Operand::CreateMem(0, Disp, ScratchRegI32, 0, 1, SMLoc(), SMLoc())); + EmitLEA(*Op, MVT::i32, ScratchRegI32, Out); break; } case 4: - EmitInstruction(Out, MCInstBuilder(X86::ADD32ri8).addReg(X86::EDX) - .addReg(X86::EDX).addImm(3)); + EmitInstruction(Out, MCInstBuilder(X86::ADD32ri8) + .addReg(ScratchRegI32) + .addReg(ScratchRegI32) + .addImm(3)); break; default: assert(false && "Incorrect access size"); @@ -242,54 +665,46 @@ void X86AddressSanitizer32::InstrumentMemOperandSmallImpl( } EmitInstruction( - Out, MCInstBuilder(X86::MOVSX32rr8).addReg(X86::ECX).addReg(X86::CL)); - EmitInstruction( - Out, MCInstBuilder(X86::CMP32rr).addReg(X86::EDX).addReg(X86::ECX)); + Out, + MCInstBuilder(X86::MOVSX32rr8).addReg(ShadowRegI32).addReg(ShadowRegI8)); + EmitInstruction(Out, MCInstBuilder(X86::CMP32rr).addReg(ScratchRegI32).addReg( + ShadowRegI32)); EmitInstruction(Out, 
MCInstBuilder(X86::JL_4).addExpr(DoneExpr)); - EmitCallAsanReport(Ctx, Out, AccessSize, IsWrite, X86::EAX); + EmitCallAsanReport(AccessSize, IsWrite, Ctx, Out, RegCtx); EmitLabel(Out, DoneSym); - - EmitInstruction(Out, MCInstBuilder(X86::POPF32)); - EmitInstruction(Out, MCInstBuilder(X86::POP32r).addReg(X86::EDX)); - EmitInstruction(Out, MCInstBuilder(X86::POP32r).addReg(X86::ECX)); - EmitInstruction(Out, MCInstBuilder(X86::POP32r).addReg(X86::EAX)); } -void X86AddressSanitizer32::InstrumentMemOperandLargeImpl( - X86Operand &Op, unsigned AccessSize, bool IsWrite, MCContext &Ctx, - MCStreamer &Out) { - EmitInstruction(Out, MCInstBuilder(X86::PUSH32r).addReg(X86::EAX)); - EmitInstruction(Out, MCInstBuilder(X86::PUSH32r).addReg(X86::ECX)); - EmitInstruction(Out, MCInstBuilder(X86::PUSHF32)); +void X86AddressSanitizer32::InstrumentMemOperandLarge( + X86Operand &Op, unsigned AccessSize, bool IsWrite, + const RegisterContext &RegCtx, MCContext &Ctx, MCStreamer &Out) { + unsigned AddressRegI32 = RegCtx.AddressReg(MVT::i32); + unsigned ShadowRegI32 = RegCtx.ShadowReg(MVT::i32); - { - MCInst Inst; - Inst.setOpcode(X86::LEA32r); - Inst.addOperand(MCOperand::CreateReg(X86::EAX)); - Op.addMemOperands(Inst, 5); - EmitInstruction(Out, Inst); - } - EmitInstruction( - Out, MCInstBuilder(X86::MOV32rr).addReg(X86::ECX).addReg(X86::EAX)); - EmitInstruction(Out, MCInstBuilder(X86::SHR32ri).addReg(X86::ECX) - .addReg(X86::ECX).addImm(3)); + ComputeMemOperandAddress(Op, MVT::i32, AddressRegI32, Ctx, Out); + + EmitInstruction(Out, MCInstBuilder(X86::MOV32rr).addReg(ShadowRegI32).addReg( + AddressRegI32)); + EmitInstruction(Out, MCInstBuilder(X86::SHR32ri) + .addReg(ShadowRegI32) + .addReg(ShadowRegI32) + .addImm(3)); { MCInst Inst; switch (AccessSize) { - case 8: - Inst.setOpcode(X86::CMP8mi); - break; - case 16: - Inst.setOpcode(X86::CMP16mi); - break; - default: - assert(false && "Incorrect access size"); - break; + case 8: + Inst.setOpcode(X86::CMP8mi); + break; + case 16: + Inst.setOpcode(X86::CMP16mi); + break; + default: + assert(false && "Incorrect access size"); + break; } const MCExpr *Disp = MCConstantExpr::Create(kShadowOffset, Ctx); std::unique_ptr<X86Operand> Op( - X86Operand::CreateMem(0, Disp, X86::ECX, 0, 1, SMLoc(), SMLoc())); + X86Operand::CreateMem(0, Disp, ShadowRegI32, 0, 1, SMLoc(), SMLoc())); Op->addMemOperands(Inst, 5); Inst.addOperand(MCOperand::CreateImm(0)); EmitInstruction(Out, Inst); @@ -298,12 +713,28 @@ void X86AddressSanitizer32::InstrumentMemOperandLargeImpl( const MCExpr *DoneExpr = MCSymbolRefExpr::Create(DoneSym, Ctx); EmitInstruction(Out, MCInstBuilder(X86::JE_4).addExpr(DoneExpr)); - EmitCallAsanReport(Ctx, Out, AccessSize, IsWrite, X86::EAX); + EmitCallAsanReport(AccessSize, IsWrite, Ctx, Out, RegCtx); EmitLabel(Out, DoneSym); +} + +void X86AddressSanitizer32::InstrumentMOVSImpl(unsigned AccessSize, + MCContext &Ctx, + MCStreamer &Out) { + StoreFlags(Out); + + // No need to test when ECX is equals to zero. + MCSymbol *DoneSym = Ctx.CreateTempSymbol(); + const MCExpr *DoneExpr = MCSymbolRefExpr::Create(DoneSym, Ctx); + EmitInstruction( + Out, MCInstBuilder(X86::TEST32rr).addReg(X86::ECX).addReg(X86::ECX)); + EmitInstruction(Out, MCInstBuilder(X86::JE_4).addExpr(DoneExpr)); + + // Instrument first and last elements in src and dst range. 
+ InstrumentMOVSBase(X86::EDI /* DstReg */, X86::ESI /* SrcReg */, + X86::ECX /* CntReg */, AccessSize, Ctx, Out); - EmitInstruction(Out, MCInstBuilder(X86::POPF32)); - EmitInstruction(Out, MCInstBuilder(X86::POP32r).addReg(X86::ECX)); - EmitInstruction(Out, MCInstBuilder(X86::POP32r).addReg(X86::EAX)); + EmitLabel(Out, DoneSym); + RestoreFlags(Out); } class X86AddressSanitizer64 : public X86AddressSanitizer { @@ -312,37 +743,126 @@ public: X86AddressSanitizer64(const MCSubtargetInfo &STI) : X86AddressSanitizer(STI) {} + virtual ~X86AddressSanitizer64() {} - virtual void InstrumentMemOperandSmallImpl( - X86Operand &Op, unsigned AccessSize, bool IsWrite, MCContext &Ctx, - MCStreamer &Out) override; - virtual void InstrumentMemOperandLargeImpl( - X86Operand &Op, unsigned AccessSize, bool IsWrite, MCContext &Ctx, - MCStreamer &Out) override; + unsigned GetFrameReg(const MCContext &Ctx, MCStreamer &Out) { + unsigned FrameReg = GetFrameRegGeneric(Ctx, Out); + if (FrameReg == X86::NoRegister) + return FrameReg; + return getX86SubSuperRegister(FrameReg, MVT::i64); + } + + void SpillReg(MCStreamer &Out, unsigned Reg) { + EmitInstruction(Out, MCInstBuilder(X86::PUSH64r).addReg(Reg)); + OrigSPOffset -= 8; + } + + void RestoreReg(MCStreamer &Out, unsigned Reg) { + EmitInstruction(Out, MCInstBuilder(X86::POP64r).addReg(Reg)); + OrigSPOffset += 8; + } + + void StoreFlags(MCStreamer &Out) { + EmitInstruction(Out, MCInstBuilder(X86::PUSHF64)); + OrigSPOffset -= 8; + } + + void RestoreFlags(MCStreamer &Out) { + EmitInstruction(Out, MCInstBuilder(X86::POPF64)); + OrigSPOffset += 8; + } + + virtual void InstrumentMemOperandPrologue(const RegisterContext &RegCtx, + MCContext &Ctx, + MCStreamer &Out) override { + unsigned LocalFrameReg = RegCtx.ChooseFrameReg(MVT::i64); + assert(LocalFrameReg != X86::NoRegister); + + const MCRegisterInfo *MRI = Ctx.getRegisterInfo(); + unsigned FrameReg = GetFrameReg(Ctx, Out); + if (MRI && FrameReg != X86::NoRegister) { + SpillReg(Out, X86::RBP); + if (FrameReg == X86::RSP) { + Out.EmitCFIAdjustCfaOffset(8 /* byte size of the LocalFrameReg */); + Out.EmitCFIRelOffset( + MRI->getDwarfRegNum(LocalFrameReg, true /* IsEH */), 0); + } + EmitInstruction( + Out, + MCInstBuilder(X86::MOV64rr).addReg(LocalFrameReg).addReg(FrameReg)); + Out.EmitCFIRememberState(); + Out.EmitCFIDefCfaRegister( + MRI->getDwarfRegNum(LocalFrameReg, true /* IsEH */)); + } + + EmitAdjustRSP(Ctx, Out, -128); + SpillReg(Out, RegCtx.ShadowReg(MVT::i64)); + SpillReg(Out, RegCtx.AddressReg(MVT::i64)); + if (RegCtx.ScratchReg(MVT::i64) != X86::NoRegister) + SpillReg(Out, RegCtx.ScratchReg(MVT::i64)); + StoreFlags(Out); + } + + virtual void InstrumentMemOperandEpilogue(const RegisterContext &RegCtx, + MCContext &Ctx, + MCStreamer &Out) override { + unsigned LocalFrameReg = RegCtx.ChooseFrameReg(MVT::i64); + assert(LocalFrameReg != X86::NoRegister); + + RestoreFlags(Out); + if (RegCtx.ScratchReg(MVT::i64) != X86::NoRegister) + RestoreReg(Out, RegCtx.ScratchReg(MVT::i64)); + RestoreReg(Out, RegCtx.AddressReg(MVT::i64)); + RestoreReg(Out, RegCtx.ShadowReg(MVT::i64)); + EmitAdjustRSP(Ctx, Out, 128); + + unsigned FrameReg = GetFrameReg(Ctx, Out); + if (Ctx.getRegisterInfo() && FrameReg != X86::NoRegister) { + RestoreReg(Out, LocalFrameReg); + Out.EmitCFIRestoreState(); + if (FrameReg == X86::RSP) + Out.EmitCFIAdjustCfaOffset(-8 /* byte size of the LocalFrameReg */); + } + } + + virtual void InstrumentMemOperandSmall(X86Operand &Op, unsigned AccessSize, + bool IsWrite, + const RegisterContext &RegCtx, + MCContext &Ctx, 
+ MCStreamer &Out) override; + virtual void InstrumentMemOperandLarge(X86Operand &Op, unsigned AccessSize, + bool IsWrite, + const RegisterContext &RegCtx, + MCContext &Ctx, + MCStreamer &Out) override; + virtual void InstrumentMOVSImpl(unsigned AccessSize, MCContext &Ctx, + MCStreamer &Out) override; private: void EmitAdjustRSP(MCContext &Ctx, MCStreamer &Out, long Offset) { - MCInst Inst; - Inst.setOpcode(X86::LEA64r); - Inst.addOperand(MCOperand::CreateReg(X86::RSP)); - const MCExpr *Disp = MCConstantExpr::Create(Offset, Ctx); std::unique_ptr<X86Operand> Op( X86Operand::CreateMem(0, Disp, X86::RSP, 0, 1, SMLoc(), SMLoc())); - Op->addMemOperands(Inst, 5); - EmitInstruction(Out, Inst); + EmitLEA(*Op, MVT::i64, X86::RSP, Out); + OrigSPOffset += Offset; } - void EmitCallAsanReport(MCContext &Ctx, MCStreamer &Out, unsigned AccessSize, - bool IsWrite) { + void EmitCallAsanReport(unsigned AccessSize, bool IsWrite, MCContext &Ctx, + MCStreamer &Out, const RegisterContext &RegCtx) { EmitInstruction(Out, MCInstBuilder(X86::CLD)); EmitInstruction(Out, MCInstBuilder(X86::MMX_EMMS)); - EmitInstruction(Out, MCInstBuilder(X86::AND64ri8).addReg(X86::RSP) - .addReg(X86::RSP).addImm(-16)); + EmitInstruction(Out, MCInstBuilder(X86::AND64ri8) + .addReg(X86::RSP) + .addReg(X86::RSP) + .addImm(-16)); - const std::string& Fn = FuncName(AccessSize, IsWrite); + if (RegCtx.AddressReg(MVT::i64) != X86::RDI) { + EmitInstruction(Out, MCInstBuilder(X86::MOV64rr).addReg(X86::RDI).addReg( + RegCtx.AddressReg(MVT::i64))); + } + const std::string &Fn = FuncName(AccessSize, IsWrite); MCSymbol *FnSym = Ctx.GetOrCreateSymbol(StringRef(Fn)); const MCSymbolRefExpr *FnExpr = MCSymbolRefExpr::Create(FnSym, MCSymbolRefExpr::VK_PLT, Ctx); @@ -350,65 +870,65 @@ private: } }; -void X86AddressSanitizer64::InstrumentMemOperandSmallImpl( - X86Operand &Op, unsigned AccessSize, bool IsWrite, MCContext &Ctx, - MCStreamer &Out) { - EmitAdjustRSP(Ctx, Out, -128); - EmitInstruction(Out, MCInstBuilder(X86::PUSH64r).addReg(X86::RAX)); - EmitInstruction(Out, MCInstBuilder(X86::PUSH64r).addReg(X86::RCX)); - EmitInstruction(Out, MCInstBuilder(X86::PUSH64r).addReg(X86::RDI)); - EmitInstruction(Out, MCInstBuilder(X86::PUSHF64)); - { - MCInst Inst; - Inst.setOpcode(X86::LEA64r); - Inst.addOperand(MCOperand::CreateReg(X86::RDI)); - Op.addMemOperands(Inst, 5); - EmitInstruction(Out, Inst); - } - EmitInstruction( - Out, MCInstBuilder(X86::MOV64rr).addReg(X86::RAX).addReg(X86::RDI)); - EmitInstruction(Out, MCInstBuilder(X86::SHR64ri).addReg(X86::RAX) - .addReg(X86::RAX).addImm(3)); +void X86AddressSanitizer64::InstrumentMemOperandSmall( + X86Operand &Op, unsigned AccessSize, bool IsWrite, + const RegisterContext &RegCtx, MCContext &Ctx, MCStreamer &Out) { + unsigned AddressRegI64 = RegCtx.AddressReg(MVT::i64); + unsigned AddressRegI32 = RegCtx.AddressReg(MVT::i32); + unsigned ShadowRegI64 = RegCtx.ShadowReg(MVT::i64); + unsigned ShadowRegI32 = RegCtx.ShadowReg(MVT::i32); + unsigned ShadowRegI8 = RegCtx.ShadowReg(MVT::i8); + + assert(RegCtx.ScratchReg(MVT::i32) != X86::NoRegister); + unsigned ScratchRegI32 = RegCtx.ScratchReg(MVT::i32); + + ComputeMemOperandAddress(Op, MVT::i64, AddressRegI64, Ctx, Out); + + EmitInstruction(Out, MCInstBuilder(X86::MOV64rr).addReg(ShadowRegI64).addReg( + AddressRegI64)); + EmitInstruction(Out, MCInstBuilder(X86::SHR64ri) + .addReg(ShadowRegI64) + .addReg(ShadowRegI64) + .addImm(3)); { MCInst Inst; Inst.setOpcode(X86::MOV8rm); - Inst.addOperand(MCOperand::CreateReg(X86::AL)); + 
Inst.addOperand(MCOperand::CreateReg(ShadowRegI8)); const MCExpr *Disp = MCConstantExpr::Create(kShadowOffset, Ctx); std::unique_ptr<X86Operand> Op( - X86Operand::CreateMem(0, Disp, X86::RAX, 0, 1, SMLoc(), SMLoc())); + X86Operand::CreateMem(0, Disp, ShadowRegI64, 0, 1, SMLoc(), SMLoc())); Op->addMemOperands(Inst, 5); EmitInstruction(Out, Inst); } - EmitInstruction(Out, - MCInstBuilder(X86::TEST8rr).addReg(X86::AL).addReg(X86::AL)); + EmitInstruction( + Out, MCInstBuilder(X86::TEST8rr).addReg(ShadowRegI8).addReg(ShadowRegI8)); MCSymbol *DoneSym = Ctx.CreateTempSymbol(); const MCExpr *DoneExpr = MCSymbolRefExpr::Create(DoneSym, Ctx); EmitInstruction(Out, MCInstBuilder(X86::JE_4).addExpr(DoneExpr)); - EmitInstruction( - Out, MCInstBuilder(X86::MOV32rr).addReg(X86::ECX).addReg(X86::EDI)); - EmitInstruction(Out, MCInstBuilder(X86::AND32ri).addReg(X86::ECX) - .addReg(X86::ECX).addImm(7)); + EmitInstruction(Out, MCInstBuilder(X86::MOV32rr).addReg(ScratchRegI32).addReg( + AddressRegI32)); + EmitInstruction(Out, MCInstBuilder(X86::AND32ri) + .addReg(ScratchRegI32) + .addReg(ScratchRegI32) + .addImm(7)); switch (AccessSize) { case 1: break; case 2: { - MCInst Inst; - Inst.setOpcode(X86::LEA32r); - Inst.addOperand(MCOperand::CreateReg(X86::ECX)); - const MCExpr *Disp = MCConstantExpr::Create(1, Ctx); std::unique_ptr<X86Operand> Op( - X86Operand::CreateMem(0, Disp, X86::ECX, 0, 1, SMLoc(), SMLoc())); - Op->addMemOperands(Inst, 5); - EmitInstruction(Out, Inst); + X86Operand::CreateMem(0, Disp, ScratchRegI32, 0, 1, SMLoc(), SMLoc())); + EmitLEA(*Op, MVT::i32, ScratchRegI32, Out); break; } case 4: - EmitInstruction(Out, MCInstBuilder(X86::ADD32ri8).addReg(X86::ECX) - .addReg(X86::ECX).addImm(3)); + EmitInstruction(Out, MCInstBuilder(X86::ADD32ri8) + .addReg(ScratchRegI32) + .addReg(ScratchRegI32) + .addImm(3)); break; default: assert(false && "Incorrect access size"); @@ -416,37 +936,30 @@ void X86AddressSanitizer64::InstrumentMemOperandSmallImpl( } EmitInstruction( - Out, MCInstBuilder(X86::MOVSX32rr8).addReg(X86::EAX).addReg(X86::AL)); - EmitInstruction( - Out, MCInstBuilder(X86::CMP32rr).addReg(X86::ECX).addReg(X86::EAX)); + Out, + MCInstBuilder(X86::MOVSX32rr8).addReg(ShadowRegI32).addReg(ShadowRegI8)); + EmitInstruction(Out, MCInstBuilder(X86::CMP32rr).addReg(ScratchRegI32).addReg( + ShadowRegI32)); EmitInstruction(Out, MCInstBuilder(X86::JL_4).addExpr(DoneExpr)); - EmitCallAsanReport(Ctx, Out, AccessSize, IsWrite); + EmitCallAsanReport(AccessSize, IsWrite, Ctx, Out, RegCtx); EmitLabel(Out, DoneSym); - - EmitInstruction(Out, MCInstBuilder(X86::POPF64)); - EmitInstruction(Out, MCInstBuilder(X86::POP64r).addReg(X86::RDI)); - EmitInstruction(Out, MCInstBuilder(X86::POP64r).addReg(X86::RCX)); - EmitInstruction(Out, MCInstBuilder(X86::POP64r).addReg(X86::RAX)); - EmitAdjustRSP(Ctx, Out, 128); } -void X86AddressSanitizer64::InstrumentMemOperandLargeImpl( - X86Operand &Op, unsigned AccessSize, bool IsWrite, MCContext &Ctx, - MCStreamer &Out) { - EmitAdjustRSP(Ctx, Out, -128); - EmitInstruction(Out, MCInstBuilder(X86::PUSH64r).addReg(X86::RAX)); - EmitInstruction(Out, MCInstBuilder(X86::PUSHF64)); +void X86AddressSanitizer64::InstrumentMemOperandLarge( + X86Operand &Op, unsigned AccessSize, bool IsWrite, + const RegisterContext &RegCtx, MCContext &Ctx, MCStreamer &Out) { + unsigned AddressRegI64 = RegCtx.AddressReg(MVT::i64); + unsigned ShadowRegI64 = RegCtx.ShadowReg(MVT::i64); - { - MCInst Inst; - Inst.setOpcode(X86::LEA64r); - Inst.addOperand(MCOperand::CreateReg(X86::RAX)); - Op.addMemOperands(Inst, 
5); - EmitInstruction(Out, Inst); - } - EmitInstruction(Out, MCInstBuilder(X86::SHR64ri).addReg(X86::RAX) - .addReg(X86::RAX).addImm(3)); + ComputeMemOperandAddress(Op, MVT::i64, AddressRegI64, Ctx, Out); + + EmitInstruction(Out, MCInstBuilder(X86::MOV64rr).addReg(ShadowRegI64).addReg( + AddressRegI64)); + EmitInstruction(Out, MCInstBuilder(X86::SHR64ri) + .addReg(ShadowRegI64) + .addReg(ShadowRegI64) + .addImm(3)); { MCInst Inst; switch (AccessSize) { @@ -462,7 +975,7 @@ void X86AddressSanitizer64::InstrumentMemOperandLargeImpl( } const MCExpr *Disp = MCConstantExpr::Create(kShadowOffset, Ctx); std::unique_ptr<X86Operand> Op( - X86Operand::CreateMem(0, Disp, X86::RAX, 0, 1, SMLoc(), SMLoc())); + X86Operand::CreateMem(0, Disp, ShadowRegI64, 0, 1, SMLoc(), SMLoc())); Op->addMemOperands(Inst, 5); Inst.addOperand(MCOperand::CreateImm(0)); EmitInstruction(Out, Inst); @@ -472,22 +985,66 @@ void X86AddressSanitizer64::InstrumentMemOperandLargeImpl( const MCExpr *DoneExpr = MCSymbolRefExpr::Create(DoneSym, Ctx); EmitInstruction(Out, MCInstBuilder(X86::JE_4).addExpr(DoneExpr)); - EmitCallAsanReport(Ctx, Out, AccessSize, IsWrite); + EmitCallAsanReport(AccessSize, IsWrite, Ctx, Out, RegCtx); EmitLabel(Out, DoneSym); +} + +void X86AddressSanitizer64::InstrumentMOVSImpl(unsigned AccessSize, + MCContext &Ctx, + MCStreamer &Out) { + StoreFlags(Out); + + // No need to test when RCX is equals to zero. + MCSymbol *DoneSym = Ctx.CreateTempSymbol(); + const MCExpr *DoneExpr = MCSymbolRefExpr::Create(DoneSym, Ctx); + EmitInstruction( + Out, MCInstBuilder(X86::TEST64rr).addReg(X86::RCX).addReg(X86::RCX)); + EmitInstruction(Out, MCInstBuilder(X86::JE_4).addExpr(DoneExpr)); - EmitInstruction(Out, MCInstBuilder(X86::POPF64)); - EmitInstruction(Out, MCInstBuilder(X86::POP64r).addReg(X86::RAX)); - EmitAdjustRSP(Ctx, Out, 128); + // Instrument first and last elements in src and dst range. + InstrumentMOVSBase(X86::RDI /* DstReg */, X86::RSI /* SrcReg */, + X86::RCX /* CntReg */, AccessSize, Ctx, Out); + + EmitLabel(Out, DoneSym); + RestoreFlags(Out); } } // End anonymous namespace -X86AsmInstrumentation::X86AsmInstrumentation() {} +X86AsmInstrumentation::X86AsmInstrumentation(const MCSubtargetInfo &STI) + : STI(STI), InitialFrameReg(0) {} + X86AsmInstrumentation::~X86AsmInstrumentation() {} -void X86AsmInstrumentation::InstrumentInstruction( +void X86AsmInstrumentation::InstrumentAndEmitInstruction( const MCInst &Inst, OperandVector &Operands, MCContext &Ctx, - const MCInstrInfo &MII, MCStreamer &Out) {} + const MCInstrInfo &MII, MCStreamer &Out) { + EmitInstruction(Out, Inst); +} + +void X86AsmInstrumentation::EmitInstruction(MCStreamer &Out, + const MCInst &Inst) { + Out.EmitInstruction(Inst, STI); +} + +unsigned X86AsmInstrumentation::GetFrameRegGeneric(const MCContext &Ctx, + MCStreamer &Out) { + if (!Out.getNumFrameInfos()) // No active dwarf frame + return X86::NoRegister; + const MCDwarfFrameInfo &Frame = Out.getDwarfFrameInfos().back(); + if (Frame.End) // Active dwarf frame is closed + return X86::NoRegister; + const MCRegisterInfo *MRI = Ctx.getRegisterInfo(); + if (!MRI) // No register info + return X86::NoRegister; + + if (InitialFrameReg) { + // FrameReg is set explicitly, we're instrumenting a MachineFunction. 
+ return InitialFrameReg; + } + + return MRI->getLLVMRegNum(Frame.CurrentCfaRegister, true /* IsEH */); +} X86AsmInstrumentation * CreateX86AsmInstrumentation(const MCTargetOptions &MCOptions, @@ -501,7 +1058,7 @@ CreateX86AsmInstrumentation(const MCTargetOptions &MCOptions, if ((STI.getFeatureBits() & X86::Mode64Bit) != 0) return new X86AddressSanitizer64(STI); } - return new X86AsmInstrumentation(); + return new X86AsmInstrumentation(STI); } } // End llvm namespace diff --git a/lib/Target/X86/AsmParser/X86AsmInstrumentation.h b/lib/Target/X86/AsmParser/X86AsmInstrumentation.h index 1bc3c09..19ebcc4 100644 --- a/lib/Target/X86/AsmParser/X86AsmInstrumentation.h +++ b/lib/Target/X86/AsmParser/X86AsmInstrumentation.h @@ -7,8 +7,8 @@ // //===----------------------------------------------------------------------===// -#ifndef X86_ASM_INSTRUMENTATION_H -#define X86_ASM_INSTRUMENTATION_H +#ifndef LLVM_LIB_TARGET_X86_ASMPARSER_X86ASMINSTRUMENTATION_H +#define LLVM_LIB_TARGET_X86_ASMPARSER_X86ASMINSTRUMENTATION_H #include "llvm/ADT/SmallVector.h" @@ -34,11 +34,15 @@ class X86AsmInstrumentation { public: virtual ~X86AsmInstrumentation(); - // Instruments Inst. Should be called just before the original - // instruction is sent to Out. - virtual void InstrumentInstruction( + // Sets frame register corresponding to a current frame. + void SetInitialFrameRegister(unsigned RegNo) { + InitialFrameReg = RegNo; + } + + // Tries to instrument and emit instruction. + virtual void InstrumentAndEmitInstruction( const MCInst &Inst, - SmallVectorImpl<std::unique_ptr<MCParsedAsmOperand>> &Operands, + SmallVectorImpl<std::unique_ptr<MCParsedAsmOperand> > &Operands, MCContext &Ctx, const MCInstrInfo &MII, MCStreamer &Out); protected: @@ -46,9 +50,17 @@ protected: CreateX86AsmInstrumentation(const MCTargetOptions &MCOptions, const MCContext &Ctx, const MCSubtargetInfo &STI); - X86AsmInstrumentation(); + X86AsmInstrumentation(const MCSubtargetInfo &STI); + + unsigned GetFrameRegGeneric(const MCContext &Ctx, MCStreamer &Out); + + void EmitInstruction(MCStreamer &Out, const MCInst &Inst); + + const MCSubtargetInfo &STI; + + unsigned InitialFrameReg; }; } // End llvm namespace -#endif // X86_ASM_INSTRUMENTATION_H +#endif diff --git a/lib/Target/X86/AsmParser/X86AsmParser.cpp b/lib/Target/X86/AsmParser/X86AsmParser.cpp index f0765ed..8ef2a55 100644 --- a/lib/Target/X86/AsmParser/X86AsmParser.cpp +++ b/lib/Target/X86/AsmParser/X86AsmParser.cpp @@ -32,6 +32,7 @@ #include "llvm/Support/SourceMgr.h" #include "llvm/Support/TargetRegistry.h" #include "llvm/Support/raw_ostream.h" +#include <algorithm> #include <memory> using namespace llvm; @@ -55,12 +56,12 @@ static const char OpPrecedence[] = { class X86AsmParser : public MCTargetAsmParser { MCSubtargetInfo &STI; - MCAsmParser &Parser; const MCInstrInfo &MII; ParseInstructionInfo *InstInfo; std::unique_ptr<X86AsmInstrumentation> Instrumentation; private: SMLoc consumeToken() { + MCAsmParser &Parser = getParser(); SMLoc Result = Parser.getTok().getLoc(); Parser.Lex(); return Result; @@ -630,13 +631,10 @@ private: } }; - MCAsmParser &getParser() const { return Parser; } - - MCAsmLexer &getLexer() const { return Parser.getLexer(); } - bool Error(SMLoc L, const Twine &Msg, ArrayRef<SMRange> Ranges = None, bool MatchingInlineAsm = false) { + MCAsmParser &Parser = getParser(); if (MatchingInlineAsm) return true; return Parser.Error(L, Msg, Ranges); } @@ -644,8 +642,9 @@ private: bool ErrorAndEatStatement(SMLoc L, const Twine &Msg, ArrayRef<SMRange> Ranges = None, bool 
MatchingInlineAsm = false) { - Parser.eatToEndOfStatement(); - return Error(L, Msg, Ranges, MatchingInlineAsm); + MCAsmParser &Parser = getParser(); + Parser.eatToEndOfStatement(); + return Error(L, Msg, Ranges, MatchingInlineAsm); } std::nullptr_t ErrorOperand(SMLoc Loc, StringRef Msg) { @@ -693,9 +692,34 @@ private: bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, OperandVector &Operands, MCStreamer &Out, - unsigned &ErrorInfo, + uint64_t &ErrorInfo, bool MatchingInlineAsm) override; + void MatchFPUWaitAlias(SMLoc IDLoc, X86Operand &Op, OperandVector &Operands, + MCStreamer &Out, bool MatchingInlineAsm); + + bool ErrorMissingFeature(SMLoc IDLoc, uint64_t ErrorInfo, + bool MatchingInlineAsm); + + bool MatchAndEmitATTInstruction(SMLoc IDLoc, unsigned &Opcode, + OperandVector &Operands, MCStreamer &Out, + uint64_t &ErrorInfo, + bool MatchingInlineAsm); + + bool MatchAndEmitIntelInstruction(SMLoc IDLoc, unsigned &Opcode, + OperandVector &Operands, MCStreamer &Out, + uint64_t &ErrorInfo, + bool MatchingInlineAsm); + + unsigned getPointerSize() { + if (is16BitMode()) return 16; + if (is32BitMode()) return 32; + if (is64BitMode()) return 64; + llvm_unreachable("invalid mode"); + } + + bool OmitRegisterFromClobberLists(unsigned RegNo) override; + /// doSrcDstMatch - Returns true if operands are matching in their /// word size (%si and %di, %esi and %edi, etc.). Order depends on /// the parsing mode (Intel vs. AT&T). @@ -728,6 +752,13 @@ private: (X86::Mode64Bit | X86::Mode32Bit | X86::Mode16Bit))); } + unsigned getPointerWidth() { + if (is16BitMode()) return 16; + if (is32BitMode()) return 32; + if (is64BitMode()) return 64; + llvm_unreachable("invalid mode"); + } + bool isParsingIntelSyntax() { return getParser().getAssemblerDialect(); } @@ -741,11 +772,9 @@ private: /// } public: - X86AsmParser(MCSubtargetInfo &sti, MCAsmParser &parser, - const MCInstrInfo &mii, - const MCTargetOptions &Options) - : MCTargetAsmParser(), STI(sti), Parser(parser), MII(mii), - InstInfo(nullptr) { + X86AsmParser(MCSubtargetInfo &sti, MCAsmParser &Parser, + const MCInstrInfo &mii, const MCTargetOptions &Options) + : MCTargetAsmParser(), STI(sti), MII(mii), InstInfo(nullptr) { // Initialize the set of available features. setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits())); @@ -755,6 +784,8 @@ public: bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) override; + void SetFrameRegister(unsigned RegNo) override; + bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc, OperandVector &Operands) override; @@ -830,6 +861,7 @@ bool X86AsmParser::doSrcDstMatch(X86Operand &Op1, X86Operand &Op2) bool X86AsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) { + MCAsmParser &Parser = getParser(); RegNo = 0; const AsmToken &PercentTok = Parser.getTok(); StartLoc = PercentTok.getLoc(); @@ -937,6 +969,10 @@ bool X86AsmParser::ParseRegister(unsigned &RegNo, return false; } +void X86AsmParser::SetFrameRegister(unsigned RegNo) { + Instrumentation->SetInitialFrameRegister(RegNo); +} + std::unique_ptr<X86Operand> X86AsmParser::DefaultMemSIOperand(SMLoc Loc) { unsigned basereg = is64BitMode() ? X86::RSI : (is32BitMode() ? 
X86::ESI : X86::SI); @@ -979,15 +1015,20 @@ std::unique_ptr<X86Operand> X86AsmParser::CreateMemForInlineAsm( unsigned SegReg, const MCExpr *Disp, unsigned BaseReg, unsigned IndexReg, unsigned Scale, SMLoc Start, SMLoc End, unsigned Size, StringRef Identifier, InlineAsmIdentifierInfo &Info) { - // If this is not a VarDecl then assume it is a FuncDecl or some other label - // reference. We need an 'r' constraint here, so we need to create register - // operand to ensure proper matching. Just pick a GPR based on the size of - // a pointer. - if (isa<MCSymbolRefExpr>(Disp) && !Info.IsVarDecl) { - unsigned RegNo = - is64BitMode() ? X86::RBX : (is32BitMode() ? X86::EBX : X86::BX); - return X86Operand::CreateReg(RegNo, Start, End, /*AddressOf=*/true, - SMLoc(), Identifier, Info.OpDecl); + // If we found a decl other than a VarDecl, then assume it is a FuncDecl or + // some other label reference. + if (isa<MCSymbolRefExpr>(Disp) && Info.OpDecl && !Info.IsVarDecl) { + // Insert an explicit size if the user didn't have one. + if (!Size) { + Size = getPointerWidth(); + InstInfo->AsmRewrites->push_back(AsmRewrite(AOK_SizeDirective, Start, + /*Len=*/0, Size)); + } + + // Create an absolute memory reference in order to match against + // instructions taking a PC relative operand. + return X86Operand::CreateMem(Disp, Start, End, Size, Identifier, + Info.OpDecl); } // We either have a direct symbol reference, or an offset from a symbol. The @@ -1076,6 +1117,7 @@ RewriteIntelBracExpression(SmallVectorImpl<AsmRewrite> *AsmRewrites, } bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) { + MCAsmParser &Parser = getParser(); const AsmToken &Tok = Parser.getTok(); bool Done = false; @@ -1197,6 +1239,7 @@ bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) { std::unique_ptr<X86Operand> X86AsmParser::ParseIntelBracExpression(unsigned SegReg, SMLoc Start, int64_t ImmDisp, unsigned Size) { + MCAsmParser &Parser = getParser(); const AsmToken &Tok = Parser.getTok(); SMLoc BracLoc = Tok.getLoc(), End = Tok.getEndLoc(); if (getLexer().isNot(AsmToken::LBrac)) @@ -1272,13 +1315,16 @@ bool X86AsmParser::ParseIntelIdentifier(const MCExpr *&Val, StringRef &Identifier, InlineAsmIdentifierInfo &Info, bool IsUnevaluatedOperand, SMLoc &End) { + MCAsmParser &Parser = getParser(); assert (isParsingInlineAsm() && "Expected to be parsing inline assembly."); Val = nullptr; StringRef LineBuf(Identifier.data()); - SemaCallback->LookupInlineAsmIdentifier(LineBuf, Info, IsUnevaluatedOperand); + void *Result = + SemaCallback->LookupInlineAsmIdentifier(LineBuf, Info, IsUnevaluatedOperand); const AsmToken &Tok = Parser.getTok(); + SMLoc Loc = Tok.getLoc(); // Advance the token stream until the end of the current token is // after the end of what the frontend claimed. @@ -1290,9 +1336,22 @@ bool X86AsmParser::ParseIntelIdentifier(const MCExpr *&Val, assert(End.getPointer() <= EndPtr && "frontend claimed part of a token?"); if (End.getPointer() == EndPtr) break; } + Identifier = LineBuf; + + // If the identifier lookup was unsuccessful, assume that we are dealing with + // a label. + if (!Result) { + StringRef InternalName = + SemaCallback->LookupInlineAsmLabel(Identifier, getSourceManager(), + Loc, false); + assert(InternalName.size() && "We should have an internal name here."); + // Push a rewrite for replacing the identifier name with the internal name. 
+ InstInfo->AsmRewrites->push_back(AsmRewrite(AOK_Label, Loc, + Identifier.size(), + InternalName)); + } // Create the symbol reference. - Identifier = LineBuf; MCSymbol *Sym = getContext().GetOrCreateSymbol(Identifier); MCSymbolRefExpr::VariantKind Variant = MCSymbolRefExpr::VK_None; Val = MCSymbolRefExpr::Create(Sym, Variant, getParser().getContext()); @@ -1303,6 +1362,7 @@ bool X86AsmParser::ParseIntelIdentifier(const MCExpr *&Val, std::unique_ptr<X86Operand> X86AsmParser::ParseIntelSegmentOverride(unsigned SegReg, SMLoc Start, unsigned Size) { + MCAsmParser &Parser = getParser(); assert(SegReg != 0 && "Tried to parse a segment override without a segment!"); const AsmToken &Tok = Parser.getTok(); // Eat colon. if (Tok.isNot(AsmToken::Colon)) @@ -1354,6 +1414,7 @@ X86AsmParser::ParseIntelSegmentOverride(unsigned SegReg, SMLoc Start, std::unique_ptr<X86Operand> X86AsmParser::ParseIntelMemOperand(int64_t ImmDisp, SMLoc Start, unsigned Size) { + MCAsmParser &Parser = getParser(); const AsmToken &Tok = Parser.getTok(); SMLoc End; @@ -1413,6 +1474,7 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseIntelMemOperand(int64_t ImmDisp, /// Parse the '.' operator. bool X86AsmParser::ParseIntelDotOperator(const MCExpr *Disp, const MCExpr *&NewDisp) { + MCAsmParser &Parser = getParser(); const AsmToken &Tok = Parser.getTok(); int64_t OrigDispVal, DotDispVal; @@ -1457,6 +1519,7 @@ bool X86AsmParser::ParseIntelDotOperator(const MCExpr *Disp, /// Parse the 'offset' operator. This operator is used to specify the /// location rather then the content of a variable. std::unique_ptr<X86Operand> X86AsmParser::ParseIntelOffsetOfOperator() { + MCAsmParser &Parser = getParser(); const AsmToken &Tok = Parser.getTok(); SMLoc OffsetOfLoc = Tok.getLoc(); Parser.Lex(); // Eat offset. @@ -1494,6 +1557,7 @@ enum IntelOperatorKind { /// TYPE operator returns the size of a C or C++ type or variable. If the /// variable is an array, TYPE returns the size of a single element. std::unique_ptr<X86Operand> X86AsmParser::ParseIntelOperator(unsigned OpKind) { + MCAsmParser &Parser = getParser(); const AsmToken &Tok = Parser.getTok(); SMLoc TypeLoc = Tok.getLoc(); Parser.Lex(); // Eat operator. @@ -1527,6 +1591,7 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseIntelOperator(unsigned OpKind) { } std::unique_ptr<X86Operand> X86AsmParser::ParseIntelOperand() { + MCAsmParser &Parser = getParser(); const AsmToken &Tok = Parser.getTok(); SMLoc Start, End; @@ -1547,7 +1612,7 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseIntelOperand() { if (Size) { Parser.Lex(); // Eat operand size (e.g., byte, word). if (Tok.getString() != "PTR" && Tok.getString() != "ptr") - return ErrorOperand(Start, "Expected 'PTR' or 'ptr' token!"); + return ErrorOperand(Tok.getLoc(), "Expected 'PTR' or 'ptr' token!"); Parser.Lex(); // Eat ptr. } Start = Tok.getLoc(); @@ -1609,6 +1674,7 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseIntelOperand() { } std::unique_ptr<X86Operand> X86AsmParser::ParseATTOperand() { + MCAsmParser &Parser = getParser(); switch (getLexer().getKind()) { default: // Parse a memory operand with no segment register. @@ -1629,6 +1695,9 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseATTOperand() { if (getLexer().isNot(AsmToken::Colon)) return X86Operand::CreateReg(RegNo, Start, End); + if (!X86MCRegisterClasses[X86::SEGMENT_REGRegClassID].contains(RegNo)) + return ErrorOperand(Start, "invalid segment register"); + getParser().Lex(); // Eat the colon. 
return ParseMemOperand(RegNo, Start); } @@ -1646,6 +1715,7 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseATTOperand() { bool X86AsmParser::HandleAVX512Operand(OperandVector &Operands, const MCParsedAsmOperand &Op) { + MCAsmParser &Parser = getParser(); if(STI.getFeatureBits() & X86::FeatureAVX512) { if (getLexer().is(AsmToken::LCurly)) { // Eat "{" and mark the current place. @@ -1664,6 +1734,8 @@ bool X86AsmParser::HandleAVX512Operand(OperandVector &Operands, // Recognize only reasonable suffixes. const char *BroadcastPrimitive = StringSwitch<const char*>(getLexer().getTok().getIdentifier()) + .Case("to2", "{1to2}") + .Case("to4", "{1to4}") .Case("to8", "{1to8}") .Case("to16", "{1to16}") .Default(nullptr); @@ -1715,6 +1787,7 @@ bool X86AsmParser::HandleAVX512Operand(OperandVector &Operands, std::unique_ptr<X86Operand> X86AsmParser::ParseMemOperand(unsigned SegReg, SMLoc MemStart) { + MCAsmParser &Parser = getParser(); // We have to disambiguate a parenthesized expression "(4+5)" from the start // of a memory operand with a missing displacement "(%ebx)" or "(,%eax)". The // only way to do this without lookahead is to eat the '(' and see what is @@ -1872,12 +1945,15 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseMemOperand(unsigned SegReg, return nullptr; } - return X86Operand::CreateMem(SegReg, Disp, BaseReg, IndexReg, Scale, - MemStart, MemEnd); + if (SegReg || BaseReg || IndexReg) + return X86Operand::CreateMem(SegReg, Disp, BaseReg, IndexReg, Scale, + MemStart, MemEnd); + return X86Operand::CreateMem(Disp, MemStart, MemEnd); } bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc, OperandVector &Operands) { + MCAsmParser &Parser = getParser(); InstInfo = &Info; StringRef PatchedName = Name; @@ -2275,51 +2351,79 @@ bool X86AsmParser::processInstruction(MCInst &Inst, const OperandVector &Ops) { } } -static const char *getSubtargetFeatureName(unsigned Val); +static const char *getSubtargetFeatureName(uint64_t Val); void X86AsmParser::EmitInstruction(MCInst &Inst, OperandVector &Operands, MCStreamer &Out) { - Instrumentation->InstrumentInstruction(Inst, Operands, getContext(), MII, - Out); - Out.EmitInstruction(Inst, STI); + Instrumentation->InstrumentAndEmitInstruction(Inst, Operands, getContext(), + MII, Out); } bool X86AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, OperandVector &Operands, - MCStreamer &Out, unsigned &ErrorInfo, + MCStreamer &Out, uint64_t &ErrorInfo, bool MatchingInlineAsm) { - assert(!Operands.empty() && "Unexpect empty operand list!"); - X86Operand &Op = static_cast<X86Operand &>(*Operands[0]); - assert(Op.isToken() && "Leading operand should always be a mnemonic!"); - ArrayRef<SMRange> EmptyRanges = None; + if (isParsingIntelSyntax()) + return MatchAndEmitIntelInstruction(IDLoc, Opcode, Operands, Out, ErrorInfo, + MatchingInlineAsm); + return MatchAndEmitATTInstruction(IDLoc, Opcode, Operands, Out, ErrorInfo, + MatchingInlineAsm); +} - // First, handle aliases that expand to multiple instructions. +void X86AsmParser::MatchFPUWaitAlias(SMLoc IDLoc, X86Operand &Op, + OperandVector &Operands, MCStreamer &Out, + bool MatchingInlineAsm) { // FIXME: This should be replaced with a real .td file alias mechanism. // Also, MatchInstructionImpl should actually *do* the EmitInstruction // call. 
- if (Op.getToken() == "fstsw" || Op.getToken() == "fstcw" || - Op.getToken() == "fstsww" || Op.getToken() == "fstcww" || - Op.getToken() == "finit" || Op.getToken() == "fsave" || - Op.getToken() == "fstenv" || Op.getToken() == "fclex") { + const char *Repl = StringSwitch<const char *>(Op.getToken()) + .Case("finit", "fninit") + .Case("fsave", "fnsave") + .Case("fstcw", "fnstcw") + .Case("fstcww", "fnstcw") + .Case("fstenv", "fnstenv") + .Case("fstsw", "fnstsw") + .Case("fstsww", "fnstsw") + .Case("fclex", "fnclex") + .Default(nullptr); + if (Repl) { MCInst Inst; Inst.setOpcode(X86::WAIT); Inst.setLoc(IDLoc); if (!MatchingInlineAsm) EmitInstruction(Inst, Operands, Out); - - const char *Repl = StringSwitch<const char *>(Op.getToken()) - .Case("finit", "fninit") - .Case("fsave", "fnsave") - .Case("fstcw", "fnstcw") - .Case("fstcww", "fnstcw") - .Case("fstenv", "fnstenv") - .Case("fstsw", "fnstsw") - .Case("fstsww", "fnstsw") - .Case("fclex", "fnclex") - .Default(nullptr); - assert(Repl && "Unknown wait-prefixed instruction"); Operands[0] = X86Operand::CreateToken(Repl, IDLoc); } +} + +bool X86AsmParser::ErrorMissingFeature(SMLoc IDLoc, uint64_t ErrorInfo, + bool MatchingInlineAsm) { + assert(ErrorInfo && "Unknown missing feature!"); + ArrayRef<SMRange> EmptyRanges = None; + SmallString<126> Msg; + raw_svector_ostream OS(Msg); + OS << "instruction requires:"; + uint64_t Mask = 1; + for (unsigned i = 0; i < (sizeof(ErrorInfo)*8-1); ++i) { + if (ErrorInfo & Mask) + OS << ' ' << getSubtargetFeatureName(ErrorInfo & Mask); + Mask <<= 1; + } + return Error(IDLoc, OS.str(), EmptyRanges, MatchingInlineAsm); +} + +bool X86AsmParser::MatchAndEmitATTInstruction(SMLoc IDLoc, unsigned &Opcode, + OperandVector &Operands, + MCStreamer &Out, + uint64_t &ErrorInfo, + bool MatchingInlineAsm) { + assert(!Operands.empty() && "Unexpect empty operand list!"); + X86Operand &Op = static_cast<X86Operand &>(*Operands[0]); + assert(Op.isToken() && "Leading operand should always be a mnemonic!"); + ArrayRef<SMRange> EmptyRanges = None; + + // First, handle aliases that expand to multiple instructions. + MatchFPUWaitAlias(IDLoc, Op, Operands, Out, MatchingInlineAsm); bool WasOriginallyInvalidOperand = false; MCInst Inst; @@ -2342,21 +2446,8 @@ bool X86AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, EmitInstruction(Inst, Operands, Out); Opcode = Inst.getOpcode(); return false; - case Match_MissingFeature: { - assert(ErrorInfo && "Unknown missing feature!"); - // Special case the error message for the very common case where only - // a single subtarget feature is missing. - std::string Msg = "instruction requires:"; - unsigned Mask = 1; - for (unsigned i = 0; i < (sizeof(ErrorInfo)*8-1); ++i) { - if (ErrorInfo & Mask) { - Msg += " "; - Msg += getSubtargetFeatureName(ErrorInfo & Mask); - } - Mask <<= 1; - } - return Error(IDLoc, Msg, EmptyRanges, MatchingInlineAsm); - } + case Match_MissingFeature: + return ErrorMissingFeature(IDLoc, ErrorInfo, MatchingInlineAsm); case Match_InvalidOperand: WasOriginallyInvalidOperand = true; break; @@ -2385,34 +2476,18 @@ bool X86AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, const char *Suffixes = Base[0] != 'f' ? "bwlq" : "slt\0"; // Check for the various suffix matches. - Tmp[Base.size()] = Suffixes[0]; - unsigned ErrorInfoIgnore; - unsigned ErrorInfoMissingFeature = 0; // Init suppresses compiler warnings. 
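The new ErrorMissingFeature helper above walks the 64-bit subtarget-feature mask one bit at a time and appends the name of each missing feature to the diagnostic. A minimal standalone sketch of that bit walk follows; the name table is a placeholder for the TableGen-generated getSubtargetFeatureName(), and the feature names are illustrative only.

```cpp
// Minimal sketch of walking a 64-bit feature mask bit by bit and naming each
// set bit. featureName() stands in for the generated getSubtargetFeatureName().
#include <cstdint>
#include <iostream>
#include <string>

static const char *featureName(uint64_t SingleBit) {
  switch (SingleBit) {
  case 1ULL << 0: return "sse2";
  case 1ULL << 1: return "avx";
  case 1ULL << 2: return "avx512f";
  default:        return "unknown-feature";
  }
}

static std::string missingFeatureMessage(uint64_t ErrorInfo) {
  std::string Msg = "instruction requires:";
  uint64_t Mask = 1;
  for (unsigned i = 0; i < 64; ++i) {
    if (ErrorInfo & Mask) {
      Msg += ' ';
      Msg += featureName(ErrorInfo & Mask);
    }
    Mask <<= 1;
  }
  return Msg;
}

int main() {
  // Two features missing: bits 1 and 2 of the mask.
  std::cout << missingFeatureMessage((1ULL << 1) | (1ULL << 2)) << "\n";
  // Prints: instruction requires: avx avx512f
}
```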
- unsigned Match1, Match2, Match3, Match4; - - Match1 = MatchInstructionImpl(Operands, Inst, ErrorInfoIgnore, - MatchingInlineAsm, isParsingIntelSyntax()); - // If this returned as a missing feature failure, remember that. - if (Match1 == Match_MissingFeature) - ErrorInfoMissingFeature = ErrorInfoIgnore; - Tmp[Base.size()] = Suffixes[1]; - Match2 = MatchInstructionImpl(Operands, Inst, ErrorInfoIgnore, - MatchingInlineAsm, isParsingIntelSyntax()); - // If this returned as a missing feature failure, remember that. - if (Match2 == Match_MissingFeature) - ErrorInfoMissingFeature = ErrorInfoIgnore; - Tmp[Base.size()] = Suffixes[2]; - Match3 = MatchInstructionImpl(Operands, Inst, ErrorInfoIgnore, - MatchingInlineAsm, isParsingIntelSyntax()); - // If this returned as a missing feature failure, remember that. - if (Match3 == Match_MissingFeature) - ErrorInfoMissingFeature = ErrorInfoIgnore; - Tmp[Base.size()] = Suffixes[3]; - Match4 = MatchInstructionImpl(Operands, Inst, ErrorInfoIgnore, - MatchingInlineAsm, isParsingIntelSyntax()); - // If this returned as a missing feature failure, remember that. - if (Match4 == Match_MissingFeature) - ErrorInfoMissingFeature = ErrorInfoIgnore; + uint64_t ErrorInfoIgnore; + uint64_t ErrorInfoMissingFeature = 0; // Init suppresses compiler warnings. + unsigned Match[4]; + + for (unsigned I = 0, E = array_lengthof(Match); I != E; ++I) { + Tmp.back() = Suffixes[I]; + Match[I] = MatchInstructionImpl(Operands, Inst, ErrorInfoIgnore, + MatchingInlineAsm, isParsingIntelSyntax()); + // If this returned as a missing feature failure, remember that. + if (Match[I] == Match_MissingFeature) + ErrorInfoMissingFeature = ErrorInfoIgnore; + } // Restore the old token. Op.setTokenValue(Base); @@ -2421,8 +2496,7 @@ bool X86AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, // instruction will already have been filled in correctly, since the failing // matches won't have modified it). unsigned NumSuccessfulMatches = - (Match1 == Match_Success) + (Match2 == Match_Success) + - (Match3 == Match_Success) + (Match4 == Match_Success); + std::count(std::begin(Match), std::end(Match), Match_Success); if (NumSuccessfulMatches == 1) { Inst.setLoc(IDLoc); if (!MatchingInlineAsm) @@ -2438,10 +2512,9 @@ bool X86AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, if (NumSuccessfulMatches > 1) { char MatchChars[4]; unsigned NumMatches = 0; - if (Match1 == Match_Success) MatchChars[NumMatches++] = Suffixes[0]; - if (Match2 == Match_Success) MatchChars[NumMatches++] = Suffixes[1]; - if (Match3 == Match_Success) MatchChars[NumMatches++] = Suffixes[2]; - if (Match4 == Match_Success) MatchChars[NumMatches++] = Suffixes[3]; + for (unsigned I = 0, E = array_lengthof(Match); I != E; ++I) + if (Match[I] == Match_Success) + MatchChars[NumMatches++] = Suffixes[I]; SmallString<126> Msg; raw_svector_ostream OS(Msg); @@ -2462,8 +2535,7 @@ bool X86AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, // If all of the instructions reported an invalid mnemonic, then the original // mnemonic was invalid. - if ((Match1 == Match_MnemonicFail) && (Match2 == Match_MnemonicFail) && - (Match3 == Match_MnemonicFail) && (Match4 == Match_MnemonicFail)) { + if (std::count(std::begin(Match), std::end(Match), Match_MnemonicFail) == 4) { if (!WasOriginallyInvalidOperand) { ArrayRef<SMRange> Ranges = MatchingInlineAsm ? 
EmptyRanges : Op.getLocRange(); @@ -2472,7 +2544,7 @@ bool X86AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, } // Recover location info for the operand if we know which was the problem. - if (ErrorInfo != ~0U) { + if (ErrorInfo != ~0ULL) { if (ErrorInfo >= Operands.size()) return Error(IDLoc, "too few operands for instruction", EmptyRanges, MatchingInlineAsm); @@ -2491,27 +2563,19 @@ bool X86AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, // If one instruction matched with a missing feature, report this as a // missing feature. - if ((Match1 == Match_MissingFeature) + (Match2 == Match_MissingFeature) + - (Match3 == Match_MissingFeature) + (Match4 == Match_MissingFeature) == 1){ - std::string Msg = "instruction requires:"; - unsigned Mask = 1; - for (unsigned i = 0; i < (sizeof(ErrorInfoMissingFeature)*8-1); ++i) { - if (ErrorInfoMissingFeature & Mask) { - Msg += " "; - Msg += getSubtargetFeatureName(ErrorInfoMissingFeature & Mask); - } - Mask <<= 1; - } - return Error(IDLoc, Msg, EmptyRanges, MatchingInlineAsm); + if (std::count(std::begin(Match), std::end(Match), + Match_MissingFeature) == 1) { + ErrorInfo = ErrorInfoMissingFeature; + return ErrorMissingFeature(IDLoc, ErrorInfoMissingFeature, + MatchingInlineAsm); } // If one instruction matched with an invalid operand, report this as an // operand failure. - if ((Match1 == Match_InvalidOperand) + (Match2 == Match_InvalidOperand) + - (Match3 == Match_InvalidOperand) + (Match4 == Match_InvalidOperand) == 1){ - Error(IDLoc, "invalid operand for instruction", EmptyRanges, - MatchingInlineAsm); - return true; + if (std::count(std::begin(Match), std::end(Match), + Match_InvalidOperand) == 1) { + return Error(IDLoc, "invalid operand for instruction", EmptyRanges, + MatchingInlineAsm); } // If all of these were an outright failure, report it in a useless way. @@ -2520,22 +2584,173 @@ bool X86AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, return true; } +bool X86AsmParser::MatchAndEmitIntelInstruction(SMLoc IDLoc, unsigned &Opcode, + OperandVector &Operands, + MCStreamer &Out, + uint64_t &ErrorInfo, + bool MatchingInlineAsm) { + assert(!Operands.empty() && "Unexpect empty operand list!"); + X86Operand &Op = static_cast<X86Operand &>(*Operands[0]); + assert(Op.isToken() && "Leading operand should always be a mnemonic!"); + StringRef Mnemonic = Op.getToken(); + ArrayRef<SMRange> EmptyRanges = None; + + // First, handle aliases that expand to multiple instructions. + MatchFPUWaitAlias(IDLoc, Op, Operands, Out, MatchingInlineAsm); + + MCInst Inst; + + // Find one unsized memory operand, if present. + X86Operand *UnsizedMemOp = nullptr; + for (const auto &Op : Operands) { + X86Operand *X86Op = static_cast<X86Operand *>(Op.get()); + if (X86Op->isMemUnsized()) + UnsizedMemOp = X86Op; + } + + // Allow some instructions to have implicitly pointer-sized operands. This is + // compatible with gas. + if (UnsizedMemOp) { + static const char *const PtrSizedInstrs[] = {"call", "jmp", "push"}; + for (const char *Instr : PtrSizedInstrs) { + if (Mnemonic == Instr) { + UnsizedMemOp->Mem.Size = getPointerSize(); + break; + } + } + } + + // If an unsized memory operand is present, try to match with each memory + // operand size. In Intel assembly, the size is not part of the instruction + // mnemonic. 
+ SmallVector<unsigned, 8> Match; + uint64_t ErrorInfoMissingFeature = 0; + if (UnsizedMemOp && UnsizedMemOp->isMemUnsized()) { + static const unsigned MopSizes[] = {8, 16, 32, 64, 80}; + for (unsigned Size : MopSizes) { + UnsizedMemOp->Mem.Size = Size; + uint64_t ErrorInfoIgnore; + unsigned LastOpcode = Inst.getOpcode(); + unsigned M = + MatchInstructionImpl(Operands, Inst, ErrorInfoIgnore, + MatchingInlineAsm, isParsingIntelSyntax()); + if (Match.empty() || LastOpcode != Inst.getOpcode()) + Match.push_back(M); + + // If this returned as a missing feature failure, remember that. + if (Match.back() == Match_MissingFeature) + ErrorInfoMissingFeature = ErrorInfoIgnore; + } + + // Restore the size of the unsized memory operand if we modified it. + if (UnsizedMemOp) + UnsizedMemOp->Mem.Size = 0; + } + + // If we haven't matched anything yet, this is not a basic integer or FPU + // operation. There shouldn't be any ambiguity in our mneumonic table, so try + // matching with the unsized operand. + if (Match.empty()) { + Match.push_back(MatchInstructionImpl(Operands, Inst, ErrorInfo, + MatchingInlineAsm, + isParsingIntelSyntax())); + // If this returned as a missing feature failure, remember that. + if (Match.back() == Match_MissingFeature) + ErrorInfoMissingFeature = ErrorInfo; + } + + // Restore the size of the unsized memory operand if we modified it. + if (UnsizedMemOp) + UnsizedMemOp->Mem.Size = 0; + + // If it's a bad mnemonic, all results will be the same. + if (Match.back() == Match_MnemonicFail) { + ArrayRef<SMRange> Ranges = + MatchingInlineAsm ? EmptyRanges : Op.getLocRange(); + return Error(IDLoc, "invalid instruction mnemonic '" + Mnemonic + "'", + Ranges, MatchingInlineAsm); + } + + // If exactly one matched, then we treat that as a successful match (and the + // instruction will already have been filled in correctly, since the failing + // matches won't have modified it). + unsigned NumSuccessfulMatches = + std::count(std::begin(Match), std::end(Match), Match_Success); + if (NumSuccessfulMatches == 1) { + // Some instructions need post-processing to, for example, tweak which + // encoding is selected. Loop on it while changes happen so the individual + // transformations can chain off each other. + if (!MatchingInlineAsm) + while (processInstruction(Inst, Operands)) + ; + Inst.setLoc(IDLoc); + if (!MatchingInlineAsm) + EmitInstruction(Inst, Operands, Out); + Opcode = Inst.getOpcode(); + return false; + } else if (NumSuccessfulMatches > 1) { + assert(UnsizedMemOp && + "multiple matches only possible with unsized memory operands"); + ArrayRef<SMRange> Ranges = + MatchingInlineAsm ? EmptyRanges : UnsizedMemOp->getLocRange(); + return Error(UnsizedMemOp->getStartLoc(), + "ambiguous operand size for instruction '" + Mnemonic + "\'", + Ranges, MatchingInlineAsm); + } + + // If one instruction matched with a missing feature, report this as a + // missing feature. + if (std::count(std::begin(Match), std::end(Match), + Match_MissingFeature) == 1) { + ErrorInfo = ErrorInfoMissingFeature; + return ErrorMissingFeature(IDLoc, ErrorInfoMissingFeature, + MatchingInlineAsm); + } + + // If one instruction matched with an invalid operand, report this as an + // operand failure. + if (std::count(std::begin(Match), std::end(Match), + Match_InvalidOperand) == 1) { + return Error(IDLoc, "invalid operand for instruction", EmptyRanges, + MatchingInlineAsm); + } + + // If all of these were an outright failure, report it in a useless way. 
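The Intel-syntax matcher above retries an unsized memory operand with each legal width (8, 16, 32, 64, 80 bits) because Intel assembly carries the size in the operand rather than the mnemonic, then accepts a unique success and reports more than one as an ambiguous operand size. The following standalone sketch shows that retry-and-count shape with a toy matcher standing in for MatchInstructionImpl; the mnemonics and accepted sizes are only illustrative.

```cpp
// Toy version of the "try every operand size" loop: the fake matcher accepts
// `inc` with 8/16/32/64-bit memory operands, so an unsized `inc [x]` is
// ambiguous, while `fnstsw [x]` has only a 16-bit form and matches uniquely.
#include <algorithm>
#include <array>
#include <iostream>
#include <string>
#include <vector>

enum MatchResult { Match_Success, Match_InvalidOperand };

static MatchResult toyMatch(const std::string &Mnemonic, unsigned MemSize) {
  if (Mnemonic == "inc")
    return (MemSize == 8 || MemSize == 16 || MemSize == 32 || MemSize == 64)
               ? Match_Success : Match_InvalidOperand;
  if (Mnemonic == "fnstsw")
    return MemSize == 16 ? Match_Success : Match_InvalidOperand;
  return Match_InvalidOperand;
}

int main() {
  const std::array<unsigned, 5> MopSizes = {8, 16, 32, 64, 80};
  for (const std::string Mnemonic : {"inc", "fnstsw"}) {
    std::vector<MatchResult> Match;
    for (unsigned Size : MopSizes)
      Match.push_back(toyMatch(Mnemonic, Size));
    auto NumSuccessful = std::count(Match.begin(), Match.end(), Match_Success);
    if (NumSuccessful == 1)
      std::cout << Mnemonic << ": unique size, accept the match\n";
    else if (NumSuccessful > 1)
      std::cout << Mnemonic << ": ambiguous operand size for instruction\n";
    else
      std::cout << Mnemonic << ": no match\n";
  }
}
```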
+ return Error(IDLoc, "unknown instruction mnemonic", EmptyRanges, + MatchingInlineAsm); +} + +bool X86AsmParser::OmitRegisterFromClobberLists(unsigned RegNo) { + return X86MCRegisterClasses[X86::SEGMENT_REGRegClassID].contains(RegNo); +} bool X86AsmParser::ParseDirective(AsmToken DirectiveID) { + MCAsmParser &Parser = getParser(); StringRef IDVal = DirectiveID.getIdentifier(); if (IDVal == ".word") return ParseDirectiveWord(2, DirectiveID.getLoc()); else if (IDVal.startswith(".code")) return ParseDirectiveCode(IDVal, DirectiveID.getLoc()); else if (IDVal.startswith(".att_syntax")) { + if (getLexer().isNot(AsmToken::EndOfStatement)) { + if (Parser.getTok().getString() == "prefix") + Parser.Lex(); + else if (Parser.getTok().getString() == "noprefix") + return Error(DirectiveID.getLoc(), "'.att_syntax noprefix' is not " + "supported: registers must have a " + "'%' prefix in .att_syntax"); + } getParser().setAssemblerDialect(0); return false; } else if (IDVal.startswith(".intel_syntax")) { getParser().setAssemblerDialect(1); if (getLexer().isNot(AsmToken::EndOfStatement)) { - // FIXME: Handle noprefix if (Parser.getTok().getString() == "noprefix") Parser.Lex(); + else if (Parser.getTok().getString() == "prefix") + return Error(DirectiveID.getLoc(), "'.intel_syntax prefix' is not " + "supported: registers must not have " + "a '%' prefix in .intel_syntax"); } return false; } @@ -2545,6 +2760,7 @@ bool X86AsmParser::ParseDirective(AsmToken DirectiveID) { /// ParseDirectiveWord /// ::= .word [ expression (, expression)* ] bool X86AsmParser::ParseDirectiveWord(unsigned Size, SMLoc L) { + MCAsmParser &Parser = getParser(); if (getLexer().isNot(AsmToken::EndOfStatement)) { for (;;) { const MCExpr *Value; @@ -2572,6 +2788,7 @@ bool X86AsmParser::ParseDirectiveWord(unsigned Size, SMLoc L) { /// ParseDirectiveCode /// ::= .code16 | .code32 | .code64 bool X86AsmParser::ParseDirectiveCode(StringRef IDVal, SMLoc L) { + MCAsmParser &Parser = getParser(); if (IDVal == ".code16") { Parser.Lex(); if (!is16BitMode()) { diff --git a/lib/Target/X86/AsmParser/X86AsmParserCommon.h b/lib/Target/X86/AsmParser/X86AsmParserCommon.h index ef1565f..72aeeaa 100644 --- a/lib/Target/X86/AsmParser/X86AsmParserCommon.h +++ b/lib/Target/X86/AsmParser/X86AsmParserCommon.h @@ -7,8 +7,8 @@ // //===----------------------------------------------------------------------===// -#ifndef X86_ASM_PARSER_COMMON_H -#define X86_ASM_PARSER_COMMON_H +#ifndef LLVM_LIB_TARGET_X86_ASMPARSER_X86ASMPARSERCOMMON_H +#define LLVM_LIB_TARGET_X86_ASMPARSER_X86ASMPARSERCOMMON_H namespace llvm { @@ -24,10 +24,6 @@ inline bool isImmSExti32i8Value(uint64_t Value) { (0xFFFFFFFFFFFFFF80ULL <= Value && Value <= 0xFFFFFFFFFFFFFFFFULL)); } -inline bool isImmZExtu32u8Value(uint64_t Value) { - return (Value <= 0x00000000000000FFULL); -} - inline bool isImmSExti64i8Value(uint64_t Value) { return (( Value <= 0x000000000000007FULL)|| (0xFFFFFFFFFFFFFF80ULL <= Value && Value <= 0xFFFFFFFFFFFFFFFFULL)); @@ -40,4 +36,4 @@ inline bool isImmSExti64i32Value(uint64_t Value) { } // End of namespace llvm -#endif // X86_ASM_PARSER_COMMON_H +#endif diff --git a/lib/Target/X86/AsmParser/X86Operand.h b/lib/Target/X86/AsmParser/X86Operand.h index 1bbfc11..e0fab8d 100644 --- a/lib/Target/X86/AsmParser/X86Operand.h +++ b/lib/Target/X86/AsmParser/X86Operand.h @@ -7,8 +7,8 @@ // //===----------------------------------------------------------------------===// -#ifndef X86_OPERAND_H -#define X86_OPERAND_H +#ifndef LLVM_LIB_TARGET_X86_ASMPARSER_X86OPERAND_H +#define 
LLVM_LIB_TARGET_X86_ASMPARSER_X86OPERAND_H #include "X86AsmParserCommon.h" #include "llvm/MC/MCExpr.h" @@ -153,20 +153,6 @@ struct X86Operand : public MCParsedAsmOperand { // extension. return isImmSExti32i8Value(CE->getValue()); } - bool isImmZExtu32u8() const { - if (!isImm()) - return false; - - // If this isn't a constant expr, just assume it fits and let relaxation - // handle it. - const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); - if (!CE) - return true; - - // Otherwise, check the value is in a range that makes sense for this - // extension. - return isImmZExtu32u8Value(CE->getValue()); - } bool isImmSExti64i8() const { if (!isImm()) return false; @@ -205,6 +191,9 @@ struct X86Operand : public MCParsedAsmOperand { } bool isMem() const override { return Kind == Memory; } + bool isMemUnsized() const { + return Kind == Memory && Mem.Size == 0; + } bool isMem8() const { return Kind == Memory && (!Mem.Size || Mem.Size == 8); } @@ -485,4 +474,4 @@ struct X86Operand : public MCParsedAsmOperand { } // End of namespace llvm -#endif // X86_OPERAND +#endif diff --git a/lib/Target/X86/CMakeLists.txt b/lib/Target/X86/CMakeLists.txt index a09767e..1083fad 100644 --- a/lib/Target/X86/CMakeLists.txt +++ b/lib/Target/X86/CMakeLists.txt @@ -14,15 +14,12 @@ add_public_tablegen_target(X86CommonTableGen) set(sources X86AsmPrinter.cpp - X86AtomicExpandPass.cpp - X86CodeEmitter.cpp X86FastISel.cpp X86FloatingPoint.cpp X86FrameLowering.cpp X86ISelDAGToDAG.cpp X86ISelLowering.cpp X86InstrInfo.cpp - X86JITInfo.cpp X86MCInstLower.cpp X86MachineFunctionInfo.cpp X86PadShortFunction.cpp diff --git a/lib/Target/X86/Disassembler/LLVMBuild.txt b/lib/Target/X86/Disassembler/LLVMBuild.txt index cac7adf..e003fc9 100644 --- a/lib/Target/X86/Disassembler/LLVMBuild.txt +++ b/lib/Target/X86/Disassembler/LLVMBuild.txt @@ -19,5 +19,5 @@ type = Library name = X86Disassembler parent = X86 -required_libraries = MC Support X86Info +required_libraries = MCDisassembler Support X86Info add_to_library_groups = X86 diff --git a/lib/Target/X86/Disassembler/X86Disassembler.cpp b/lib/Target/X86/Disassembler/X86Disassembler.cpp index c366725..5e8c2d6 100644 --- a/lib/Target/X86/Disassembler/X86Disassembler.cpp +++ b/lib/Target/X86/Disassembler/X86Disassembler.cpp @@ -23,7 +23,6 @@ #include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/Support/Debug.h" -#include "llvm/Support/MemoryObject.h" #include "llvm/Support/TargetRegistry.h" #include "llvm/Support/raw_ostream.h" @@ -97,16 +96,26 @@ X86GenericDisassembler::X86GenericDisassembler( } } -/// regionReader - a callback function that wraps the readByte method from -/// MemoryObject. +struct Region { + ArrayRef<uint8_t> Bytes; + uint64_t Base; + Region(ArrayRef<uint8_t> Bytes, uint64_t Base) : Bytes(Bytes), Base(Base) {} +}; + +/// A callback function that wraps the readByte method from Region. /// -/// @param arg - The generic callback parameter. In this case, this should -/// be a pointer to a MemoryObject. -/// @param byte - A pointer to the byte to be read. -/// @param address - The address to be read. -static int regionReader(const void* arg, uint8_t* byte, uint64_t address) { - const MemoryObject* region = static_cast<const MemoryObject*>(arg); - return region->readByte(address, byte); +/// @param Arg - The generic callback parameter. In this case, this should +/// be a pointer to a Region. +/// @param Byte - A pointer to the byte to be read. +/// @param Address - The address to be read. 
+static int regionReader(const void *Arg, uint8_t *Byte, uint64_t Address) { + auto *R = static_cast<const Region *>(Arg); + ArrayRef<uint8_t> Bytes = R->Bytes; + unsigned Index = Address - R->Base; + if (Bytes.size() <= Index) + return -1; + *Byte = Bytes[Index]; + return 0; } /// logger - a callback function that wraps the operator<< method from @@ -127,38 +136,29 @@ static void logger(void* arg, const char* log) { // Public interface for the disassembler // -MCDisassembler::DecodeStatus -X86GenericDisassembler::getInstruction(MCInst &instr, - uint64_t &size, - const MemoryObject ®ion, - uint64_t address, - raw_ostream &vStream, - raw_ostream &cStream) const { - CommentStream = &cStream; +MCDisassembler::DecodeStatus X86GenericDisassembler::getInstruction( + MCInst &Instr, uint64_t &Size, ArrayRef<uint8_t> Bytes, uint64_t Address, + raw_ostream &VStream, raw_ostream &CStream) const { + CommentStream = &CStream; - InternalInstruction internalInstr; + InternalInstruction InternalInstr; - dlog_t loggerFn = logger; - if (&vStream == &nulls()) - loggerFn = nullptr; // Disable logging completely if it's going to nulls(). - - int ret = decodeInstruction(&internalInstr, - regionReader, - (const void*)®ion, - loggerFn, - (void*)&vStream, - (const void*)MII.get(), - address, - fMode); - - if (ret) { - size = internalInstr.readerCursor - address; + dlog_t LoggerFn = logger; + if (&VStream == &nulls()) + LoggerFn = nullptr; // Disable logging completely if it's going to nulls(). + + Region R(Bytes, Address); + + int Ret = decodeInstruction(&InternalInstr, regionReader, (const void *)&R, + LoggerFn, (void *)&VStream, + (const void *)MII.get(), Address, fMode); + + if (Ret) { + Size = InternalInstr.readerCursor - Address; return Fail; - } - else { - size = internalInstr.length; - return (!translateInstruction(instr, internalInstr, this)) ? - Success : Fail; + } else { + Size = InternalInstr.length; + return (!translateInstruction(Instr, InternalInstr, this)) ? Success : Fail; } } @@ -717,7 +717,7 @@ static bool translateOperand(MCInst &mcInst, const OperandSpecifier &operand, return false; case ENCODING_WRITEMASK: return translateMaskRegister(mcInst, insn.writemask); - case ENCODING_RM: + CASE_ENCODING_RM: return translateRM(mcInst, operand, insn, Dis); case ENCODING_CB: case ENCODING_CW: diff --git a/lib/Target/X86/Disassembler/X86Disassembler.h b/lib/Target/X86/Disassembler/X86Disassembler.h index 4dc7c29..d7f426b 100644 --- a/lib/Target/X86/Disassembler/X86Disassembler.h +++ b/lib/Target/X86/Disassembler/X86Disassembler.h @@ -71,8 +71,8 @@ // //===----------------------------------------------------------------------===// -#ifndef X86DISASSEMBLER_H -#define X86DISASSEMBLER_H +#ifndef LLVM_LIB_TARGET_X86_DISASSEMBLER_X86DISASSEMBLER_H +#define LLVM_LIB_TARGET_X86_DISASSEMBLER_X86DISASSEMBLER_H #include "X86DisassemblerDecoderCommon.h" #include "llvm/MC/MCDisassembler.h" @@ -87,21 +87,17 @@ class raw_ostream; namespace X86Disassembler { -/// X86GenericDisassembler - Generic disassembler for all X86 platforms. -/// All each platform class should have to do is subclass the constructor, and -/// provide a different disassemblerMode value. +/// Generic disassembler for all X86 platforms. All each platform class should +/// have to do is subclass the constructor, and provide a different +/// disassemblerMode value. class X86GenericDisassembler : public MCDisassembler { std::unique_ptr<const MCInstrInfo> MII; public: - /// Constructor - Initializes the disassembler. 
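The disassembler hunk above drops the MemoryObject byte source in favor of a small Region struct over an ArrayRef plus a C-style reader callback passed through an opaque pointer. A standalone sketch of that pattern, with std::vector standing in for llvm::ArrayRef:

```cpp
// Sketch of the callback-over-a-byte-region pattern: the decoder only sees a
// function pointer and an opaque cookie; the cookie carries the bytes and the
// base address, and the reader bounds-checks every requested address.
#include <cstdint>
#include <cstdio>
#include <vector>

struct Region {
  std::vector<uint8_t> Bytes;
  uint64_t Base;
};

// Same shape as the decoder's byte-reader callback: 0 on success, -1 on an
// out-of-range read.
static int regionReader(const void *Arg, uint8_t *Byte, uint64_t Address) {
  const auto *R = static_cast<const Region *>(Arg);
  uint64_t Index = Address - R->Base;
  if (Index >= R->Bytes.size())
    return -1;
  *Byte = R->Bytes[Index];
  return 0;
}

int main() {
  Region R{{0x55, 0x48, 0x89, 0xe5}, 0x400000}; // push rbp; mov rbp, rsp
  uint8_t Byte = 0;
  for (uint64_t Addr = 0x400000; regionReader(&R, &Byte, Addr) == 0; ++Addr)
    std::printf("%#llx: 0x%02x\n", (unsigned long long)Addr, Byte);
  // Reading past the end fails instead of walking off the buffer.
  std::printf("past-end read -> %d\n", regionReader(&R, &Byte, 0x400004));
}
```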
- /// X86GenericDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx, std::unique_ptr<const MCInstrInfo> MII); public: - - /// getInstruction - See MCDisassembler. DecodeStatus getInstruction(MCInst &instr, uint64_t &size, - const MemoryObject ®ion, uint64_t address, + ArrayRef<uint8_t> Bytes, uint64_t Address, raw_ostream &vStream, raw_ostream &cStream) const override; diff --git a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp index 55587d4..98b3440 100644 --- a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp +++ b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp @@ -1,4 +1,4 @@ -//===-- X86DisassemblerDecoder.c - Disassembler decoder -------------------===// +//===-- X86DisassemblerDecoder.cpp - Disassembler decoder -----------------===// // // The LLVM Compiler Infrastructure // @@ -13,10 +13,10 @@ // //===----------------------------------------------------------------------===// -#include <stdarg.h> /* for va_*() */ -#include <stdio.h> /* for vsnprintf() */ -#include <stdlib.h> /* for exit() */ -#include <string.h> /* for memset() */ +#include <cstdarg> /* for va_*() */ +#include <cstdio> /* for vsnprintf() */ +#include <cstdlib> /* for exit() */ +#include <cstring> /* for memset() */ #include "X86DisassemblerDecoder.h" @@ -472,8 +472,7 @@ static int readPrefixes(struct InternalInstruction* insn) { if ((insn->mode == MODE_64BIT || (byte1 & 0xc0) == 0xc0) && ((~byte1 & 0xc) == 0xc) && ((byte2 & 0x4) == 0x4)) { insn->vectorExtensionType = TYPE_EVEX; - } - else { + } else { unconsumeByte(insn); /* unconsume byte1 */ unconsumeByte(insn); /* unconsume byte */ insn->necessaryPrefixLocation = insn->readerCursor - 2; @@ -504,8 +503,7 @@ static int readPrefixes(struct InternalInstruction* insn) { insn->vectorExtensionPrefix[0], insn->vectorExtensionPrefix[1], insn->vectorExtensionPrefix[2], insn->vectorExtensionPrefix[3]); } - } - else if (byte == 0xc4) { + } else if (byte == 0xc4) { uint8_t byte1; if (lookAtByte(insn, &byte1)) { @@ -516,8 +514,7 @@ static int readPrefixes(struct InternalInstruction* insn) { if (insn->mode == MODE_64BIT || (byte1 & 0xc0) == 0xc0) { insn->vectorExtensionType = TYPE_VEX_3B; insn->necessaryPrefixLocation = insn->readerCursor - 1; - } - else { + } else { unconsumeByte(insn); insn->necessaryPrefixLocation = insn->readerCursor - 1; } @@ -541,8 +538,7 @@ static int readPrefixes(struct InternalInstruction* insn) { insn->vectorExtensionPrefix[0], insn->vectorExtensionPrefix[1], insn->vectorExtensionPrefix[2]); } - } - else if (byte == 0xc5) { + } else if (byte == 0xc5) { uint8_t byte1; if (lookAtByte(insn, &byte1)) { @@ -552,8 +548,7 @@ static int readPrefixes(struct InternalInstruction* insn) { if (insn->mode == MODE_64BIT || (byte1 & 0xc0) == 0xc0) { insn->vectorExtensionType = TYPE_VEX_2B; - } - else { + } else { unconsumeByte(insn); } @@ -566,8 +561,7 @@ static int readPrefixes(struct InternalInstruction* insn) { | (rFromVEX2of2(insn->vectorExtensionPrefix[1]) << 2); } - switch (ppFromVEX2of2(insn->vectorExtensionPrefix[1])) - { + switch (ppFromVEX2of2(insn->vectorExtensionPrefix[1])) { default: break; case VEX_PREFIX_66: @@ -579,8 +573,7 @@ static int readPrefixes(struct InternalInstruction* insn) { insn->vectorExtensionPrefix[0], insn->vectorExtensionPrefix[1]); } - } - else if (byte == 0x8f) { + } else if (byte == 0x8f) { uint8_t byte1; if (lookAtByte(insn, &byte1)) { @@ -591,8 +584,7 @@ static int readPrefixes(struct InternalInstruction* insn) { if ((byte1 & 0x38) != 0x0) { /* 0 
in these 3 bits is a POP instruction. */ insn->vectorExtensionType = TYPE_XOP; insn->necessaryPrefixLocation = insn->readerCursor - 1; - } - else { + } else { unconsumeByte(insn); insn->necessaryPrefixLocation = insn->readerCursor - 1; } @@ -612,8 +604,7 @@ static int readPrefixes(struct InternalInstruction* insn) { | (bFromXOP2of3(insn->vectorExtensionPrefix[1]) << 0); } - switch (ppFromXOP3of3(insn->vectorExtensionPrefix[2])) - { + switch (ppFromXOP3of3(insn->vectorExtensionPrefix[2])) { default: break; case VEX_PREFIX_66: @@ -625,8 +616,7 @@ static int readPrefixes(struct InternalInstruction* insn) { insn->vectorExtensionPrefix[0], insn->vectorExtensionPrefix[1], insn->vectorExtensionPrefix[2]); } - } - else { + } else { if (insn->mode == MODE_64BIT) { if ((byte & 0xf0) == 0x40) { uint8_t opcodeByte; @@ -698,8 +688,7 @@ static int readOpcode(struct InternalInstruction* insn) { insn->opcodeType = ONEBYTE; - if (insn->vectorExtensionType == TYPE_EVEX) - { + if (insn->vectorExtensionType == TYPE_EVEX) { switch (mmFromEVEX2of4(insn->vectorExtensionPrefix[1])) { default: dbgprintf(insn, "Unhandled mm field for instruction (0x%hhx)", @@ -715,8 +704,7 @@ static int readOpcode(struct InternalInstruction* insn) { insn->opcodeType = THREEBYTE_3A; return consumeByte(insn, &insn->opcode); } - } - else if (insn->vectorExtensionType == TYPE_VEX_3B) { + } else if (insn->vectorExtensionType == TYPE_VEX_3B) { switch (mmmmmFromVEX2of3(insn->vectorExtensionPrefix[1])) { default: dbgprintf(insn, "Unhandled m-mmmm field for instruction (0x%hhx)", @@ -732,12 +720,10 @@ static int readOpcode(struct InternalInstruction* insn) { insn->opcodeType = THREEBYTE_3A; return consumeByte(insn, &insn->opcode); } - } - else if (insn->vectorExtensionType == TYPE_VEX_2B) { + } else if (insn->vectorExtensionType == TYPE_VEX_2B) { insn->opcodeType = TWOBYTE; return consumeByte(insn, &insn->opcode); - } - else if (insn->vectorExtensionType == TYPE_XOP) { + } else if (insn->vectorExtensionType == TYPE_XOP) { switch (mmmmmFromXOP2of3(insn->vectorExtensionPrefix[1])) { default: dbgprintf(insn, "Unhandled m-mmmm field for instruction (0x%hhx)", @@ -866,6 +852,22 @@ static bool is16BitEquivalent(const char* orig, const char* equiv) { } /* + * is64Bit - Determines whether this instruction is a 64-bit instruction. + * + * @param name - The instruction that is not 16-bit + */ +static bool is64Bit(const char* name) { + off_t i; + + for (i = 0;; ++i) { + if (name[i] == '\0') + return false; + if (name[i] == '6' && name[i+1] == '4') + return true; + } +} + +/* * getID - Determines the ID of an instruction, consuming the ModR/M byte as * appropriate for extended and escape opcodes. Determines the attributes and * context for the instruction before doing so. 
@@ -911,8 +913,7 @@ static int getID(struct InternalInstruction* insn, const void *miiArg) { attrMask |= ATTR_EVEXL; if (l2FromEVEX4of4(insn->vectorExtensionPrefix[3])) attrMask |= ATTR_EVEXL2; - } - else if (insn->vectorExtensionType == TYPE_VEX_3B) { + } else if (insn->vectorExtensionType == TYPE_VEX_3B) { switch (ppFromVEX3of3(insn->vectorExtensionPrefix[2])) { case VEX_PREFIX_66: attrMask |= ATTR_OPSIZE; @@ -927,8 +928,7 @@ static int getID(struct InternalInstruction* insn, const void *miiArg) { if (lFromVEX3of3(insn->vectorExtensionPrefix[2])) attrMask |= ATTR_VEXL; - } - else if (insn->vectorExtensionType == TYPE_VEX_2B) { + } else if (insn->vectorExtensionType == TYPE_VEX_2B) { switch (ppFromVEX2of2(insn->vectorExtensionPrefix[1])) { case VEX_PREFIX_66: attrMask |= ATTR_OPSIZE; @@ -943,8 +943,7 @@ static int getID(struct InternalInstruction* insn, const void *miiArg) { if (lFromVEX2of2(insn->vectorExtensionPrefix[1])) attrMask |= ATTR_VEXL; - } - else if (insn->vectorExtensionType == TYPE_XOP) { + } else if (insn->vectorExtensionType == TYPE_XOP) { switch (ppFromXOP3of3(insn->vectorExtensionPrefix[2])) { case VEX_PREFIX_66: attrMask |= ATTR_OPSIZE; @@ -959,12 +958,10 @@ static int getID(struct InternalInstruction* insn, const void *miiArg) { if (lFromXOP3of3(insn->vectorExtensionPrefix[2])) attrMask |= ATTR_VEXL; - } - else { + } else { return -1; } - } - else { + } else { if (insn->mode != MODE_16BIT && isPrefixAtLocation(insn, 0x66, insn->necessaryPrefixLocation)) attrMask |= ATTR_OPSIZE; else if (isPrefixAtLocation(insn, 0x67, insn->necessaryPrefixLocation)) @@ -1002,6 +999,37 @@ static int getID(struct InternalInstruction* insn, const void *miiArg) { /* The following clauses compensate for limitations of the tables. */ + if (insn->mode != MODE_64BIT && + insn->vectorExtensionType != TYPE_NO_VEX_XOP) { + /* + * The tables can't distinquish between cases where the W-bit is used to + * select register size and cases where its a required part of the opcode. + */ + if ((insn->vectorExtensionType == TYPE_EVEX && + wFromEVEX3of4(insn->vectorExtensionPrefix[2])) || + (insn->vectorExtensionType == TYPE_VEX_3B && + wFromVEX3of3(insn->vectorExtensionPrefix[2])) || + (insn->vectorExtensionType == TYPE_XOP && + wFromXOP3of3(insn->vectorExtensionPrefix[2]))) { + + uint16_t instructionIDWithREXW; + if (getIDWithAttrMask(&instructionIDWithREXW, + insn, attrMask | ATTR_REXW)) { + insn->instructionID = instructionID; + insn->spec = specifierForUID(instructionID); + return 0; + } + + const char *SpecName = GetInstrName(instructionIDWithREXW, miiArg); + // If not a 64-bit instruction. Switch the opcode. + if (!is64Bit(SpecName)) { + insn->instructionID = instructionIDWithREXW; + insn->spec = specifierForUID(instructionIDWithREXW); + return 0; + } + } + } + if ((insn->mode == MODE_16BIT || insn->prefixPresent[0x66]) && !(attrMask & ATTR_OPSIZE)) { /* @@ -1488,7 +1516,7 @@ static int fixupReg(struct InternalInstruction *insn, if (!valid) return -1; break; - case ENCODING_RM: + CASE_ENCODING_RM: if (insn->eaBase >= insn->eaRegBase) { insn->eaBase = (EABase)fixupRMValue(insn, (OperandType)op->type, @@ -1681,11 +1709,14 @@ static int readOperands(struct InternalInstruction* insn) { case ENCODING_DI: break; case ENCODING_REG: - case ENCODING_RM: + CASE_ENCODING_RM: if (readModRM(insn)) return -1; if (fixupReg(insn, &Op)) return -1; + // Apply the AVX512 compressed displacement scaling factor. 
+ if (Op.encoding != ENCODING_REG && insn->eaDisplacement == EA_DISP_8) + insn->displacement *= 1 << (Op.encoding - ENCODING_RM); break; case ENCODING_CB: case ENCODING_CW: diff --git a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h index 8c45402..457b382 100644 --- a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h +++ b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h @@ -13,8 +13,8 @@ // //===----------------------------------------------------------------------===// -#ifndef X86DISASSEMBLERDECODER_H -#define X86DISASSEMBLERDECODER_H +#ifndef LLVM_LIB_TARGET_X86_DISASSEMBLER_X86DISASSEMBLERDECODER_H +#define LLVM_LIB_TARGET_X86_DISASSEMBLER_X86DISASSEMBLERDECODER_H #include "X86DisassemblerDecoderCommon.h" #include "llvm/ADT/ArrayRef.h" diff --git a/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h b/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h index f59e0b6..bec4f0e 100644 --- a/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h +++ b/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h @@ -14,8 +14,8 @@ // //===----------------------------------------------------------------------===// -#ifndef X86DISASSEMBLERDECODERCOMMON_H -#define X86DISASSEMBLERDECODERCOMMON_H +#ifndef LLVM_LIB_TARGET_X86_DISASSEMBLER_X86DISASSEMBLERDECODERCOMMON_H +#define LLVM_LIB_TARGET_X86_DISASSEMBLER_X86DISASSEMBLERDECODERCOMMON_H #include "llvm/Support/DataTypes.h" @@ -265,7 +265,7 @@ enum attributeBits { ENUM_ENTRY(IC_EVEX_L2_W_KZ, 3, "requires EVEX_KZ, L2 and W") \ ENUM_ENTRY(IC_EVEX_L2_W_XS_KZ, 4, "requires EVEX_KZ, L2, W and XS prefix") \ ENUM_ENTRY(IC_EVEX_L2_W_XD_KZ, 4, "requires EVEX_KZ, L2, W and XD prefix") \ - ENUM_ENTRY(IC_EVEX_L2_W_OPSIZE_KZ, 4, "requires EVEX_KZ, L2, W and OpSize") + ENUM_ENTRY(IC_EVEX_L2_W_OPSIZE_KZ, 4, "requires EVEX_KZ, L2, W and OpSize") #define ENUM_ENTRY(n, r, d) n, enum InstructionContext { @@ -325,11 +325,26 @@ enum ModRMDecisionType { }; #undef ENUM_ENTRY +#define CASE_ENCODING_RM \ + case ENCODING_RM: \ + case ENCODING_RM_CD2: \ + case ENCODING_RM_CD4: \ + case ENCODING_RM_CD8: \ + case ENCODING_RM_CD16: \ + case ENCODING_RM_CD32: \ + case ENCODING_RM_CD64 + // Physical encodings of instruction operands. 
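The decoder hunks above add the ENCODING_RM_CD2 through ENCODING_RM_CD64 encodings and rescale an 8-bit displacement by 1 << (encoding - ENCODING_RM). That is AVX-512's compressed disp8*N addressing: the stored byte counts N-byte tuples rather than bytes. A short worked example of the arithmetic; the local enum only mirrors the shape of the table above and is not the generated decoder table.

```cpp
// Worked example of AVX-512 compressed disp8*N: the encoded 8-bit
// displacement is a count of N-byte chunks, so the decoder rescales it by
// N = 1 << (encoding - ENCODING_RM).
#include <cstdint>
#include <cstdio>

enum OperandEncoding {
  ENCODING_RM = 0,   // plain ModR/M operand, no scaling
  ENCODING_RM_CD2,   // disp8 counts 2-byte units
  ENCODING_RM_CD4,
  ENCODING_RM_CD8,
  ENCODING_RM_CD16,
  ENCODING_RM_CD32,
  ENCODING_RM_CD64,  // disp8 counts 64-byte units (full ZMM vectors)
};

static int64_t scaleDisp8(int8_t EncodedDisp, OperandEncoding Enc) {
  return int64_t(EncodedDisp) * (1 << (Enc - ENCODING_RM));
}

int main() {
  // vmovaps zmm0, [rax + 0x40]: with a 64-byte tuple the assembler can store
  // disp8 = 1 instead of a full 4-byte displacement of 0x40.
  std::printf("disp8=1,  CD64 -> byte offset %lld\n",
              (long long)scaleDisp8(1, ENCODING_RM_CD64));   // 64
  std::printf("disp8=-2, CD16 -> byte offset %lld\n",
              (long long)scaleDisp8(-2, ENCODING_RM_CD16));  // -32
}
```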
#define ENCODINGS \ ENUM_ENTRY(ENCODING_NONE, "") \ ENUM_ENTRY(ENCODING_REG, "Register operand in ModR/M byte.") \ ENUM_ENTRY(ENCODING_RM, "R/M operand in ModR/M byte.") \ + ENUM_ENTRY(ENCODING_RM_CD2, "R/M operand with CDisp scaling of 2") \ + ENUM_ENTRY(ENCODING_RM_CD4, "R/M operand with CDisp scaling of 4") \ + ENUM_ENTRY(ENCODING_RM_CD8, "R/M operand with CDisp scaling of 8") \ + ENUM_ENTRY(ENCODING_RM_CD16,"R/M operand with CDisp scaling of 16") \ + ENUM_ENTRY(ENCODING_RM_CD32,"R/M operand with CDisp scaling of 32") \ + ENUM_ENTRY(ENCODING_RM_CD64,"R/M operand with CDisp scaling of 64") \ ENUM_ENTRY(ENCODING_VVVV, "Register operand in VEX.vvvv byte.") \ ENUM_ENTRY(ENCODING_WRITEMASK, "Register operand in EVEX.aaa byte.") \ ENUM_ENTRY(ENCODING_CB, "1-byte code offset (possible new CS value)") \ @@ -438,8 +453,12 @@ enum OperandEncoding { ENUM_ENTRY(TYPE_XMM256, "32-byte") \ ENUM_ENTRY(TYPE_XMM512, "64-byte") \ ENUM_ENTRY(TYPE_VK1, "1-bit") \ + ENUM_ENTRY(TYPE_VK2, "2-bit") \ + ENUM_ENTRY(TYPE_VK4, "4-bit") \ ENUM_ENTRY(TYPE_VK8, "8-bit") \ ENUM_ENTRY(TYPE_VK16, "16-bit") \ + ENUM_ENTRY(TYPE_VK32, "32-bit") \ + ENUM_ENTRY(TYPE_VK64, "64-bit") \ ENUM_ENTRY(TYPE_XMM0, "Implicit use of XMM0") \ ENUM_ENTRY(TYPE_SEGMENTREG, "Segment register operand") \ ENUM_ENTRY(TYPE_DEBUGREG, "Debug register operand") \ @@ -481,7 +500,7 @@ enum ModifierType { }; #undef ENUM_ENTRY -static const unsigned X86_MAX_OPERANDS = 5; +static const unsigned X86_MAX_OPERANDS = 6; /// Decoding mode for the Intel disassembler. 16-bit, 32-bit, and 64-bit mode /// are supported, and represent real mode, IA-32e, and IA-32e in 64-bit mode, diff --git a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp index b45b118..b72730c 100644 --- a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp +++ b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp @@ -45,19 +45,31 @@ void X86ATTInstPrinter::printInst(const MCInst *MI, raw_ostream &OS, const MCInstrDesc &Desc = MII.get(MI->getOpcode()); uint64_t TSFlags = Desc.TSFlags; + // If verbose assembly is enabled, we can print some informative comments. + if (CommentStream) + HasCustomInstComment = + EmitAnyX86InstComments(MI, *CommentStream, getRegisterName); + if (TSFlags & X86II::LOCK) OS << "\tlock\n"; + // Output CALLpcrel32 as "callq" in 64-bit mode. + // In Intel annotation it's always emitted as "call". + // + // TODO: Probably this hack should be redesigned via InstAlias in + // InstrInfo.td as soon as Requires clause is supported properly + // for InstAlias. + if (MI->getOpcode() == X86::CALLpcrel32 && + (getAvailableFeatures() & X86::Mode64Bit) != 0) { + OS << "\tcallq\t"; + printPCRelImm(MI, 0, OS); + } // Try to print any aliases first. - if (!printAliasInstr(MI, OS)) + else if (!printAliasInstr(MI, OS)) printInstruction(MI, OS); // Next always print the annotation. printAnnotation(OS, Annot); - - // If verbose assembly is enabled, we can print some informative comments. - if (CommentStream) - EmitAnyX86InstComments(MI, *CommentStream, getRegisterName); } void X86ATTInstPrinter::printSSECC(const MCInst *MI, unsigned Op, @@ -170,7 +182,11 @@ void X86ATTInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, << '$' << formatImm((int64_t)Op.getImm()) << markup(">"); - if (CommentStream && (Op.getImm() > 255 || Op.getImm() < -256)) + // If there are no instruction-specific comments, add a comment clarifying + // the hex value of the immediate operand when it isn't in the range + // [-256,255]. 
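The AT&T printer hunk above only appends the "imm = 0x..." comment when no instruction-specific comment (such as a decoded shuffle mask) was already emitted and the immediate lies outside [-256, 255]. A tiny standalone sketch of that gating; the function name is illustrative and not part of the MCInstPrinter API.

```cpp
// Sketch of the verbose-assembly comment gating: print the hex form of an
// immediate only when it is outside [-256, 255] and no instruction-specific
// comment was already produced for this instruction.
#include <cinttypes>
#include <cstdint>
#include <cstdio>

static void maybePrintImmComment(int64_t Imm, bool HasCustomInstComment) {
  if (!HasCustomInstComment && (Imm > 255 || Imm < -256))
    std::printf("# imm = 0x%" PRIX64 "\n", (uint64_t)Imm);
}

int main() {
  maybePrintImmComment(42, false);       // small immediate: no comment
  maybePrintImmComment(0x12345, false);  // prints "# imm = 0x12345"
  maybePrintImmComment(0x12345, true);   // suppressed by a custom comment
}
```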
+ if (CommentStream && !HasCustomInstComment && + (Op.getImm() > 255 || Op.getImm() < -256)) *CommentStream << format("imm = 0x%" PRIX64 "\n", (uint64_t)Op.getImm()); } else { diff --git a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h index 531183b..41be14b 100644 --- a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h +++ b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h @@ -11,10 +11,11 @@ // //===----------------------------------------------------------------------===// -#ifndef X86_ATT_INST_PRINTER_H -#define X86_ATT_INST_PRINTER_H +#ifndef LLVM_LIB_TARGET_X86_INSTPRINTER_X86ATTINSTPRINTER_H +#define LLVM_LIB_TARGET_X86_INSTPRINTER_X86ATTINSTPRINTER_H #include "llvm/MC/MCInstPrinter.h" +#include "llvm/MC/MCSubtargetInfo.h" namespace llvm { @@ -23,8 +24,11 @@ class MCOperand; class X86ATTInstPrinter final : public MCInstPrinter { public: X86ATTInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII, - const MCRegisterInfo &MRI) - : MCInstPrinter(MAI, MII, MRI) {} + const MCRegisterInfo &MRI, const MCSubtargetInfo &STI) + : MCInstPrinter(MAI, MII, MRI) { + // Initialize the set of available features. + setAvailableFeatures(STI.getFeatureBits()); + } void printRegName(raw_ostream &OS, unsigned RegNo) const override; void printInst(const MCInst *MI, raw_ostream &OS, StringRef Annot) override; @@ -129,6 +133,9 @@ public: void printMemOffs64(const MCInst *MI, unsigned OpNo, raw_ostream &O) { printMemOffset(MI, OpNo, O); } + +private: + bool HasCustomInstComment; }; } diff --git a/lib/Target/X86/InstPrinter/X86InstComments.cpp b/lib/Target/X86/InstPrinter/X86InstComments.cpp index baf6507..a8f15e6 100644 --- a/lib/Target/X86/InstPrinter/X86InstComments.cpp +++ b/lib/Target/X86/InstPrinter/X86InstComments.cpp @@ -28,13 +28,116 @@ using namespace llvm; /// EmitAnyX86InstComments - This function decodes x86 instructions and prints /// newline terminated strings to the specified string if desired. This /// information is shown in disassembly dumps when verbose assembly is enabled. -void llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, +bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, const char *(*getRegName)(unsigned)) { // If this is a shuffle operation, the switch should fill in this state. SmallVector<int, 8> ShuffleMask; const char *DestName = nullptr, *Src1Name = nullptr, *Src2Name = nullptr; switch (MI->getOpcode()) { + default: + // Not an instruction for which we can decode comments. + return false; + + case X86::BLENDPDrri: + case X86::VBLENDPDrri: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::BLENDPDrmi: + case X86::VBLENDPDrmi: + if(MI->getOperand(MI->getNumOperands()-1).isImm()) + DecodeBLENDMask(MVT::v2f64, + MI->getOperand(MI->getNumOperands()-1).getImm(), + ShuffleMask); + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + break; + case X86::VBLENDPDYrri: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::VBLENDPDYrmi: + if(MI->getOperand(MI->getNumOperands()-1).isImm()) + DecodeBLENDMask(MVT::v4f64, + MI->getOperand(MI->getNumOperands()-1).getImm(), + ShuffleMask); + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + break; + + case X86::BLENDPSrri: + case X86::VBLENDPSrri: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. 
+ case X86::BLENDPSrmi: + case X86::VBLENDPSrmi: + if(MI->getOperand(MI->getNumOperands()-1).isImm()) + DecodeBLENDMask(MVT::v4f32, + MI->getOperand(MI->getNumOperands()-1).getImm(), + ShuffleMask); + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + break; + case X86::VBLENDPSYrri: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::VBLENDPSYrmi: + if(MI->getOperand(MI->getNumOperands()-1).isImm()) + DecodeBLENDMask(MVT::v8f32, + MI->getOperand(MI->getNumOperands()-1).getImm(), + ShuffleMask); + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + break; + + case X86::PBLENDWrri: + case X86::VPBLENDWrri: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::PBLENDWrmi: + case X86::VPBLENDWrmi: + if(MI->getOperand(MI->getNumOperands()-1).isImm()) + DecodeBLENDMask(MVT::v8i16, + MI->getOperand(MI->getNumOperands()-1).getImm(), + ShuffleMask); + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + break; + case X86::VPBLENDWYrri: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::VPBLENDWYrmi: + if(MI->getOperand(MI->getNumOperands()-1).isImm()) + DecodeBLENDMask(MVT::v16i16, + MI->getOperand(MI->getNumOperands()-1).getImm(), + ShuffleMask); + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + break; + + case X86::VPBLENDDrri: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::VPBLENDDrmi: + if(MI->getOperand(MI->getNumOperands()-1).isImm()) + DecodeBLENDMask(MVT::v4i32, + MI->getOperand(MI->getNumOperands()-1).getImm(), + ShuffleMask); + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + break; + + case X86::VPBLENDDYrri: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::VPBLENDDYrmi: + if(MI->getOperand(MI->getNumOperands()-1).isImm()) + DecodeBLENDMask(MVT::v8i32, + MI->getOperand(MI->getNumOperands()-1).getImm(), + ShuffleMask); + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + break; + case X86::INSERTPSrr: case X86::VINSERTPSrr: DestName = getRegName(MI->getOperand(0).getReg()); @@ -60,6 +163,80 @@ void llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, DecodeMOVHLPSMask(2, ShuffleMask); break; + case X86::MOVSLDUPrr: + case X86::VMOVSLDUPrr: + Src1Name = getRegName(MI->getOperand(1).getReg()); + // FALL THROUGH. + case X86::MOVSLDUPrm: + case X86::VMOVSLDUPrm: + DestName = getRegName(MI->getOperand(0).getReg()); + DecodeMOVSLDUPMask(MVT::v4f32, ShuffleMask); + break; + + case X86::VMOVSHDUPYrr: + Src1Name = getRegName(MI->getOperand(1).getReg()); + // FALL THROUGH. + case X86::VMOVSHDUPYrm: + DestName = getRegName(MI->getOperand(0).getReg()); + DecodeMOVSHDUPMask(MVT::v8f32, ShuffleMask); + break; + + case X86::VMOVSLDUPYrr: + Src1Name = getRegName(MI->getOperand(1).getReg()); + // FALL THROUGH. + case X86::VMOVSLDUPYrm: + DestName = getRegName(MI->getOperand(0).getReg()); + DecodeMOVSLDUPMask(MVT::v8f32, ShuffleMask); + break; + + case X86::MOVSHDUPrr: + case X86::VMOVSHDUPrr: + Src1Name = getRegName(MI->getOperand(1).getReg()); + // FALL THROUGH. 
+ case X86::MOVSHDUPrm: + case X86::VMOVSHDUPrm: + DestName = getRegName(MI->getOperand(0).getReg()); + DecodeMOVSHDUPMask(MVT::v4f32, ShuffleMask); + break; + + case X86::PSLLDQri: + case X86::VPSLLDQri: + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + if(MI->getOperand(MI->getNumOperands()-1).isImm()) + DecodePSLLDQMask(MVT::v16i8, + MI->getOperand(MI->getNumOperands()-1).getImm(), + ShuffleMask); + break; + + case X86::VPSLLDQYri: + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + if(MI->getOperand(MI->getNumOperands()-1).isImm()) + DecodePSLLDQMask(MVT::v32i8, + MI->getOperand(MI->getNumOperands()-1).getImm(), + ShuffleMask); + break; + + case X86::PSRLDQri: + case X86::VPSRLDQri: + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + if(MI->getOperand(MI->getNumOperands()-1).isImm()) + DecodePSRLDQMask(MVT::v16i8, + MI->getOperand(MI->getNumOperands()-1).getImm(), + ShuffleMask); + break; + + case X86::VPSRLDQYri: + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + if(MI->getOperand(MI->getNumOperands()-1).isImm()) + DecodePSRLDQMask(MVT::v32i8, + MI->getOperand(MI->getNumOperands()-1).getImm(), + ShuffleMask); + break; + case X86::PALIGNR128rr: case X86::VPALIGNR128rr: Src1Name = getRegName(MI->getOperand(2).getReg()); @@ -489,54 +666,59 @@ void llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, break; } + // The only comments we decode are shuffles, so give up if we were unable to + // decode a shuffle mask. + if (ShuffleMask.empty()) + return false; - // If this was a shuffle operation, print the shuffle mask. - if (!ShuffleMask.empty()) { - if (!DestName) DestName = Src1Name; - OS << (DestName ? DestName : "mem") << " = "; + if (!DestName) DestName = Src1Name; + OS << (DestName ? DestName : "mem") << " = "; - // If the two sources are the same, canonicalize the input elements to be - // from the first src so that we get larger element spans. - if (Src1Name == Src2Name) { - for (unsigned i = 0, e = ShuffleMask.size(); i != e; ++i) { - if ((int)ShuffleMask[i] >= 0 && // Not sentinel. - ShuffleMask[i] >= (int)e) // From second mask. - ShuffleMask[i] -= e; - } + // If the two sources are the same, canonicalize the input elements to be + // from the first src so that we get larger element spans. + if (Src1Name == Src2Name) { + for (unsigned i = 0, e = ShuffleMask.size(); i != e; ++i) { + if ((int)ShuffleMask[i] >= 0 && // Not sentinel. + ShuffleMask[i] >= (int)e) // From second mask. + ShuffleMask[i] -= e; } + } - // The shuffle mask specifies which elements of the src1/src2 fill in the - // destination, with a few sentinel values. Loop through and print them - // out. - for (unsigned i = 0, e = ShuffleMask.size(); i != e; ++i) { - if (i != 0) + // The shuffle mask specifies which elements of the src1/src2 fill in the + // destination, with a few sentinel values. Loop through and print them + // out. + for (unsigned i = 0, e = ShuffleMask.size(); i != e; ++i) { + if (i != 0) + OS << ','; + if (ShuffleMask[i] == SM_SentinelZero) { + OS << "zero"; + continue; + } + + // Otherwise, it must come from src1 or src2. Print the span of elements + // that comes from this src. + bool isSrc1 = ShuffleMask[i] < (int)ShuffleMask.size(); + const char *SrcName = isSrc1 ? Src1Name : Src2Name; + OS << (SrcName ? 
SrcName : "mem") << '['; + bool IsFirst = true; + while (i != e && (int)ShuffleMask[i] != SM_SentinelZero && + (ShuffleMask[i] < (int)ShuffleMask.size()) == isSrc1) { + if (!IsFirst) OS << ','; - if (ShuffleMask[i] == SM_SentinelZero) { - OS << "zero"; - continue; - } - - // Otherwise, it must come from src1 or src2. Print the span of elements - // that comes from this src. - bool isSrc1 = ShuffleMask[i] < (int)ShuffleMask.size(); - const char *SrcName = isSrc1 ? Src1Name : Src2Name; - OS << (SrcName ? SrcName : "mem") << '['; - bool IsFirst = true; - while (i != e && - (int)ShuffleMask[i] >= 0 && - (ShuffleMask[i] < (int)ShuffleMask.size()) == isSrc1) { - if (!IsFirst) - OS << ','; - else - IsFirst = false; + else + IsFirst = false; + if (ShuffleMask[i] == SM_SentinelUndef) + OS << "u"; + else OS << ShuffleMask[i] % ShuffleMask.size(); - ++i; - } - OS << ']'; - --i; // For loop increments element #. + ++i; } - //MI->print(OS, 0); - OS << "\n"; + OS << ']'; + --i; // For loop increments element #. } + //MI->print(OS, 0); + OS << "\n"; + // We successfully added a comment to this instruction. + return true; } diff --git a/lib/Target/X86/InstPrinter/X86InstComments.h b/lib/Target/X86/InstPrinter/X86InstComments.h index 13fdf9a..687581b 100644 --- a/lib/Target/X86/InstPrinter/X86InstComments.h +++ b/lib/Target/X86/InstPrinter/X86InstComments.h @@ -12,13 +12,13 @@ // //===----------------------------------------------------------------------===// -#ifndef X86_INST_COMMENTS_H -#define X86_INST_COMMENTS_H +#ifndef LLVM_LIB_TARGET_X86_INSTPRINTER_X86INSTCOMMENTS_H +#define LLVM_LIB_TARGET_X86_INSTPRINTER_X86INSTCOMMENTS_H namespace llvm { class MCInst; class raw_ostream; - void EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, + bool EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, const char *(*getRegName)(unsigned)); } diff --git a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h b/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h index 4d9b481..d082f0b 100644 --- a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h +++ b/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h @@ -11,8 +11,8 @@ // //===----------------------------------------------------------------------===// -#ifndef X86_INTEL_INST_PRINTER_H -#define X86_INTEL_INST_PRINTER_H +#ifndef LLVM_LIB_TARGET_X86_INSTPRINTER_X86INTELINSTPRINTER_H +#define LLVM_LIB_TARGET_X86_INSTPRINTER_X86INTELINSTPRINTER_H #include "llvm/MC/MCInstPrinter.h" #include "llvm/Support/raw_ostream.h" diff --git a/lib/Target/X86/MCTargetDesc/LLVMBuild.txt b/lib/Target/X86/MCTargetDesc/LLVMBuild.txt index 146d111..b9fdc9c 100644 --- a/lib/Target/X86/MCTargetDesc/LLVMBuild.txt +++ b/lib/Target/X86/MCTargetDesc/LLVMBuild.txt @@ -19,5 +19,5 @@ type = Library name = X86Desc parent = X86 -required_libraries = MC Object Support X86AsmPrinter X86Info +required_libraries = MC MCDisassembler Object Support X86AsmPrinter X86Info add_to_library_groups = X86 diff --git a/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp index 23bca0d..befa6c2 100644 --- a/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp +++ b/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp @@ -11,7 +11,6 @@ #include "MCTargetDesc/X86FixupKinds.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/MC/MCAsmBackend.h" -#include "llvm/MC/MCAssembler.h" #include "llvm/MC/MCELFObjectWriter.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCFixupKindInfo.h" @@ -437,10 +436,30 @@ class DarwinX86AsmBackend : public X86AsmBackend { bool Is64Bit; unsigned OffsetSize; 
///< Offset of a "push" instruction. - unsigned PushInstrSize; ///< Size of a "push" instruction. unsigned MoveInstrSize; ///< Size of a "move" instruction. - unsigned StackDivide; ///< Amount to adjust stack stize by. + unsigned StackDivide; ///< Amount to adjust stack size by. protected: + /// \brief Size of a "push" instruction for the given register. + unsigned PushInstrSize(unsigned Reg) const { + switch (Reg) { + case X86::EBX: + case X86::ECX: + case X86::EDX: + case X86::EDI: + case X86::ESI: + case X86::EBP: + case X86::RBX: + case X86::RBP: + return 1; + case X86::R12: + case X86::R13: + case X86::R14: + case X86::R15: + return 2; + } + return 1; + } + /// \brief Implementation of algorithm to generate the compact unwind encoding /// for the CFI instructions. uint32_t @@ -530,7 +549,7 @@ protected: unsigned Reg = MRI.getLLVMRegNum(Inst.getRegister(), true); SavedRegs[SavedRegIdx++] = Reg; StackAdjust += OffsetSize; - InstrOffset += PushInstrSize; + InstrOffset += PushInstrSize(Reg); break; } } @@ -724,7 +743,6 @@ public: OffsetSize = Is64Bit ? 8 : 4; MoveInstrSize = Is64Bit ? 3 : 2; StackDivide = Is64Bit ? 8 : 4; - PushInstrSize = 1; } }; diff --git a/lib/Target/X86/MCTargetDesc/X86BaseInfo.h b/lib/Target/X86/MCTargetDesc/X86BaseInfo.h index 6aeb1f2..365cf0c 100644 --- a/lib/Target/X86/MCTargetDesc/X86BaseInfo.h +++ b/lib/Target/X86/MCTargetDesc/X86BaseInfo.h @@ -14,8 +14,8 @@ // //===----------------------------------------------------------------------===// -#ifndef X86BASEINFO_H -#define X86BASEINFO_H +#ifndef LLVM_LIB_TARGET_X86_MCTARGETDESC_X86BASEINFO_H +#define LLVM_LIB_TARGET_X86_MCTARGETDESC_X86BASEINFO_H #include "X86MCTargetDesc.h" #include "llvm/MC/MCInstrDesc.h" @@ -216,7 +216,7 @@ namespace X86II { MO_SECREL }; - enum { + enum : uint64_t { //===------------------------------------------------------------------===// // Instruction encodings. These are the standard/most common forms for X86 // instructions. @@ -303,17 +303,18 @@ namespace X86II { //// MRM_XX - A mod/rm byte of exactly 0xXX. 
MRM_C0 = 32, MRM_C1 = 33, MRM_C2 = 34, MRM_C3 = 35, MRM_C4 = 36, MRM_C8 = 37, MRM_C9 = 38, MRM_CA = 39, - MRM_CB = 40, MRM_D0 = 41, MRM_D1 = 42, MRM_D4 = 43, - MRM_D5 = 44, MRM_D6 = 45, MRM_D8 = 46, MRM_D9 = 47, - MRM_DA = 48, MRM_DB = 49, MRM_DC = 50, MRM_DD = 51, - MRM_DE = 52, MRM_DF = 53, MRM_E0 = 54, MRM_E1 = 55, - MRM_E2 = 56, MRM_E3 = 57, MRM_E4 = 58, MRM_E5 = 59, - MRM_E8 = 60, MRM_E9 = 61, MRM_EA = 62, MRM_EB = 63, - MRM_EC = 64, MRM_ED = 65, MRM_EE = 66, MRM_F0 = 67, - MRM_F1 = 68, MRM_F2 = 69, MRM_F3 = 70, MRM_F4 = 71, - MRM_F5 = 72, MRM_F6 = 73, MRM_F7 = 74, MRM_F8 = 75, - MRM_F9 = 76, MRM_FA = 77, MRM_FB = 78, MRM_FC = 79, - MRM_FD = 80, MRM_FE = 81, MRM_FF = 82, + MRM_CB = 40, MRM_CF = 41, MRM_D0 = 42, MRM_D1 = 43, + MRM_D4 = 44, MRM_D5 = 45, MRM_D6 = 46, MRM_D7 = 47, + MRM_D8 = 48, MRM_D9 = 49, MRM_DA = 50, MRM_DB = 51, + MRM_DC = 52, MRM_DD = 53, MRM_DE = 54, MRM_DF = 55, + MRM_E0 = 56, MRM_E1 = 57, MRM_E2 = 58, MRM_E3 = 59, + MRM_E4 = 60, MRM_E5 = 61, MRM_E8 = 62, MRM_E9 = 63, + MRM_EA = 64, MRM_EB = 65, MRM_EC = 66, MRM_ED = 67, + MRM_EE = 68, MRM_F0 = 69, MRM_F1 = 70, MRM_F2 = 71, + MRM_F3 = 72, MRM_F4 = 73, MRM_F5 = 74, MRM_F6 = 75, + MRM_F7 = 76, MRM_F8 = 77, MRM_F9 = 78, MRM_FA = 79, + MRM_FB = 80, MRM_FC = 81, MRM_FD = 82, MRM_FE = 83, + MRM_FF = 84, FormMask = 127, @@ -327,8 +328,8 @@ namespace X86II { OpSizeShift = 7, OpSizeMask = 0x3 << OpSizeShift, - OpSize16 = 1, - OpSize32 = 2, + OpSize16 = 1 << OpSizeShift, + OpSize32 = 2 << OpSizeShift, // AsSize - Set if this instruction requires an operand size prefix (0x67), // which most often indicates that the instruction address 16 bit address @@ -454,51 +455,53 @@ namespace X86II { EncodingMask = 0x3 << EncodingShift, // VEX - encoding using 0xC4/0xC5 - VEX = 1, + VEX = 1 << EncodingShift, /// XOP - Opcode prefix used by XOP instructions. - XOP = 2, + XOP = 2 << EncodingShift, // VEX_EVEX - Specifies that this instruction use EVEX form which provides // syntax support up to 32 512-bit register operands and up to 7 16-bit // mask operands as well as source operand data swizzling/memory operand // conversion, eviction hint, and rounding mode. - EVEX = 3, + EVEX = 3 << EncodingShift, // Opcode OpcodeShift = EncodingShift + 2, - //===------------------------------------------------------------------===// - /// VEX - The opcode prefix used by AVX instructions - VEXShift = OpcodeShift + 8, - /// VEX_W - Has a opcode specific functionality, but is used in the same /// way as REX_W is for regular SSE instructions. - VEX_W = 1U << 0, + VEX_WShift = OpcodeShift + 8, + VEX_W = 1ULL << VEX_WShift, /// VEX_4V - Used to specify an additional AVX/SSE register. Several 2 /// address instructions in SSE are represented as 3 address ones in AVX /// and the additional register is encoded in VEX_VVVV prefix. - VEX_4V = 1U << 1, + VEX_4VShift = VEX_WShift + 1, + VEX_4V = 1ULL << VEX_4VShift, /// VEX_4VOp3 - Similar to VEX_4V, but used on instructions that encode /// operand 3 with VEX.vvvv. - VEX_4VOp3 = 1U << 2, + VEX_4VOp3Shift = VEX_4VShift + 1, + VEX_4VOp3 = 1ULL << VEX_4VOp3Shift, /// VEX_I8IMM - Specifies that the last register used in a AVX instruction, /// must be encoded in the i8 immediate field. This usually happens in /// instructions with 4 operands. - VEX_I8IMM = 1U << 3, + VEX_I8IMMShift = VEX_4VOp3Shift + 1, + VEX_I8IMM = 1ULL << VEX_I8IMMShift, /// VEX_L - Stands for a bit in the VEX opcode prefix meaning the current /// instruction uses 256-bit wide registers. 
This is usually auto detected /// if a VR256 register is used, but some AVX instructions also have this /// field marked when using a f256 memory references. - VEX_L = 1U << 4, + VEX_LShift = VEX_I8IMMShift + 1, + VEX_L = 1ULL << VEX_LShift, // VEX_LIG - Specifies that this instruction ignores the L-bit in the VEX // prefix. Usually used for scalar instructions. Needed by disassembler. - VEX_LIG = 1U << 5, + VEX_LIGShift = VEX_LShift + 1, + VEX_LIG = 1ULL << VEX_LIGShift, // TODO: we should combine VEX_L and VEX_LIG together to form a 2-bit field // with following encoding: @@ -509,24 +512,24 @@ namespace X86II { // this will save 1 tsflag bit // EVEX_K - Set if this instruction requires masking - EVEX_K = 1U << 6, + EVEX_KShift = VEX_LIGShift + 1, + EVEX_K = 1ULL << EVEX_KShift, // EVEX_Z - Set if this instruction has EVEX.Z field set. - EVEX_Z = 1U << 7, + EVEX_ZShift = EVEX_KShift + 1, + EVEX_Z = 1ULL << EVEX_ZShift, // EVEX_L2 - Set if this instruction has EVEX.L' field set. - EVEX_L2 = 1U << 8, + EVEX_L2Shift = EVEX_ZShift + 1, + EVEX_L2 = 1ULL << EVEX_L2Shift, // EVEX_B - Set if this instruction has EVEX.B field set. - EVEX_B = 1U << 9, + EVEX_BShift = EVEX_L2Shift + 1, + EVEX_B = 1ULL << EVEX_BShift, - // EVEX_CD8E - compressed disp8 form, element-size - EVEX_CD8EShift = VEXShift + 10, - EVEX_CD8EMask = 3, - - // EVEX_CD8V - compressed disp8 form, vector-width - EVEX_CD8VShift = EVEX_CD8EShift + 2, - EVEX_CD8VMask = 7, + // The scaling factor for the AVX512's 8-bit compressed displacement. + CD8_Scale_Shift = EVEX_BShift + 1, + CD8_Scale_Mask = 127ULL << CD8_Scale_Shift, /// Has3DNow0F0FOpcode - This flag indicates that the instruction uses the /// wacky 0x0F 0x0F prefix for 3DNow! instructions. The manual documents @@ -534,14 +537,17 @@ namespace X86II { /// storing a classifier in the imm8 field. To simplify our implementation, /// we handle this by storeing the classifier in the opcode field and using /// this flag to indicate that the encoder should do the wacky 3DNow! thing. - Has3DNow0F0FOpcode = 1U << 15, + Has3DNow0F0FOpcodeShift = CD8_Scale_Shift + 7, + Has3DNow0F0FOpcode = 1ULL << Has3DNow0F0FOpcodeShift, /// MemOp4 - Used to indicate swapping of operand 3 and 4 to be encoded in /// ModRM or I8IMM. This is used for FMA4 and XOP instructions. - MemOp4 = 1U << 16, + MemOp4Shift = Has3DNow0F0FOpcodeShift + 1, + MemOp4 = 1ULL << MemOp4Shift, /// Explicitly specified rounding control - EVEX_RC = 1U << 17 + EVEX_RCShift = MemOp4Shift + 1, + EVEX_RC = 1ULL << EVEX_RCShift }; // getBaseOpcodeFor - This function returns the "base" X86 opcode for the @@ -643,10 +649,10 @@ namespace X86II { /// counted as one operand. 
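The flag tests rewritten throughout the rest of this patch (TSFlags & X86II::VEX_4V instead of (TSFlags >> X86II::VEXShift) & X86II::VEX_4V) only work because the enum above is now 64 bits wide and stores every attribute pre-shifted into its final bit position. A minimal standalone sketch of that pattern, with made-up shift values rather than the real X86II layout:

#include <cstdint>
#include <cstdio>

namespace demo {
// Each flag carries its own shift; the enumerator is the already-shifted mask.
enum : uint64_t {
  OpcodeShift = 8,
  VEX_WShift  = OpcodeShift + 8,
  VEX_W       = 1ULL << VEX_WShift,
  VEX_4VShift = VEX_WShift + 1,
  VEX_4V      = 1ULL << VEX_4VShift,
  EVEX_KShift = VEX_4VShift + 1,
  EVEX_K      = 1ULL << EVEX_KShift,
};
} // namespace demo

int main() {
  uint64_t TSFlags = demo::VEX_W | demo::EVEX_K;
  // No extra shift needed at the use site; the mask already lines up.
  bool HasVEX_W  = TSFlags & demo::VEX_W;
  bool HasVEX_4V = TSFlags & demo::VEX_4V;
  bool HasEVEX_K = TSFlags & demo::EVEX_K;
  std::printf("W=%d 4V=%d K=%d\n", HasVEX_W, HasVEX_4V, HasEVEX_K);
  return 0;
}

The benefit is entirely at the use sites: a single AND against a named mask, with no shared VEXShift constant that every consumer has to keep in sync.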
/// inline int getMemoryOperandNo(uint64_t TSFlags, unsigned Opcode) { - bool HasVEX_4V = (TSFlags >> X86II::VEXShift) & X86II::VEX_4V; - bool HasMemOp4 = (TSFlags >> X86II::VEXShift) & X86II::MemOp4; - bool HasEVEX_K = ((TSFlags >> X86II::VEXShift) & X86II::EVEX_K); - + bool HasVEX_4V = TSFlags & X86II::VEX_4V; + bool HasMemOp4 = TSFlags & X86II::MemOp4; + bool HasEVEX_K = TSFlags & X86II::EVEX_K; + switch (TSFlags & X86II::FormMask) { default: llvm_unreachable("Unknown FormMask value in getMemoryOperandNo!"); case X86II::Pseudo: @@ -687,7 +693,7 @@ namespace X86II { case X86II::MRM2m: case X86II::MRM3m: case X86II::MRM4m: case X86II::MRM5m: case X86II::MRM6m: case X86II::MRM7m: { - bool HasVEX_4V = (TSFlags >> X86II::VEXShift) & X86II::VEX_4V; + bool HasVEX_4V = TSFlags & X86II::VEX_4V; unsigned FirstMemOp = 0; if (HasVEX_4V) ++FirstMemOp;// Skip the register dest (which is encoded in VEX_VVVV). @@ -698,20 +704,21 @@ namespace X86II { case X86II::MRM_C0: case X86II::MRM_C1: case X86II::MRM_C2: case X86II::MRM_C3: case X86II::MRM_C4: case X86II::MRM_C8: case X86II::MRM_C9: case X86II::MRM_CA: case X86II::MRM_CB: - case X86II::MRM_D0: case X86II::MRM_D1: case X86II::MRM_D4: - case X86II::MRM_D5: case X86II::MRM_D6: case X86II::MRM_D8: - case X86II::MRM_D9: case X86II::MRM_DA: case X86II::MRM_DB: - case X86II::MRM_DC: case X86II::MRM_DD: case X86II::MRM_DE: - case X86II::MRM_DF: case X86II::MRM_E0: case X86II::MRM_E1: - case X86II::MRM_E2: case X86II::MRM_E3: case X86II::MRM_E4: - case X86II::MRM_E5: case X86II::MRM_E8: case X86II::MRM_E9: - case X86II::MRM_EA: case X86II::MRM_EB: case X86II::MRM_EC: - case X86II::MRM_ED: case X86II::MRM_EE: case X86II::MRM_F0: - case X86II::MRM_F1: case X86II::MRM_F2: case X86II::MRM_F3: - case X86II::MRM_F4: case X86II::MRM_F5: case X86II::MRM_F6: - case X86II::MRM_F7: case X86II::MRM_F8: case X86II::MRM_F9: - case X86II::MRM_FA: case X86II::MRM_FB: case X86II::MRM_FC: - case X86II::MRM_FD: case X86II::MRM_FE: case X86II::MRM_FF: + case X86II::MRM_CF: case X86II::MRM_D0: case X86II::MRM_D1: + case X86II::MRM_D4: case X86II::MRM_D5: case X86II::MRM_D6: + case X86II::MRM_D7: case X86II::MRM_D8: case X86II::MRM_D9: + case X86II::MRM_DA: case X86II::MRM_DB: case X86II::MRM_DC: + case X86II::MRM_DD: case X86II::MRM_DE: case X86II::MRM_DF: + case X86II::MRM_E0: case X86II::MRM_E1: case X86II::MRM_E2: + case X86II::MRM_E3: case X86II::MRM_E4: case X86II::MRM_E5: + case X86II::MRM_E8: case X86II::MRM_E9: case X86II::MRM_EA: + case X86II::MRM_EB: case X86II::MRM_EC: case X86II::MRM_ED: + case X86II::MRM_EE: case X86II::MRM_F0: case X86II::MRM_F1: + case X86II::MRM_F2: case X86II::MRM_F3: case X86II::MRM_F4: + case X86II::MRM_F5: case X86II::MRM_F6: case X86II::MRM_F7: + case X86II::MRM_F8: case X86II::MRM_F9: case X86II::MRM_FA: + case X86II::MRM_FB: case X86II::MRM_FC: case X86II::MRM_FD: + case X86II::MRM_FE: case X86II::MRM_FF: return -1; } } diff --git a/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp b/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp index 3fdec87..be6a8e4 100644 --- a/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp +++ b/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp @@ -77,7 +77,7 @@ unsigned X86ELFObjectWriter::GetRelocType(const MCValue &Target, break; case MCSymbolRefExpr::VK_GOTTPOFF: Type = ELF::R_X86_64_GOTTPOFF; - break; + break; case MCSymbolRefExpr::VK_TLSGD: Type = ELF::R_X86_64_TLSGD; break; diff --git a/lib/Target/X86/MCTargetDesc/X86FixupKinds.h b/lib/Target/X86/MCTargetDesc/X86FixupKinds.h index 09396b7..4899900 
100644 --- a/lib/Target/X86/MCTargetDesc/X86FixupKinds.h +++ b/lib/Target/X86/MCTargetDesc/X86FixupKinds.h @@ -7,8 +7,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_X86_X86FIXUPKINDS_H -#define LLVM_X86_X86FIXUPKINDS_H +#ifndef LLVM_LIB_TARGET_X86_MCTARGETDESC_X86FIXUPKINDS_H +#define LLVM_LIB_TARGET_X86_MCTARGETDESC_X86FIXUPKINDS_H #include "llvm/MC/MCFixup.h" diff --git a/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp b/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp index 83b2777..5679d63 100644 --- a/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp +++ b/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp @@ -72,11 +72,10 @@ X86MCAsmInfoDarwin::X86MCAsmInfoDarwin(const Triple &T) { if (T.isMacOSX() && T.isMacOSXVersionLT(10, 6)) HasWeakDefCanBeHiddenDirective = false; - // FIXME: this should not depend on the target OS version, but on the ld64 - // version in use. From at least >= ld64-97.17 (Xcode 3.2.6) the abs-ified - // FDE relocs may be used. We also use them for the ios simulator. - DwarfFDESymbolsUseAbsDiff = (T.isMacOSX() && !T.isMacOSXVersionLT(10, 6)) - || T.isiOS(); + // Assume ld64 is new enough that the abs-ified FDE relocs may be used + // (actually, must, since otherwise the non-extern relocations we produce + // overwhelm ld64's tiny little mind and it fails). + DwarfFDESymbolsUseAbsDiff = true; UseIntegratedAssembler = true; } @@ -103,9 +102,6 @@ X86ELFMCAsmInfo::X86ELFMCAsmInfo(const Triple &T) { TextAlignFillValue = 0x90; - // Set up DWARF directives - HasLEB128 = true; // Target asm supports leb128 directives (little-endian) - // Debug Information SupportsDebugInformation = true; @@ -134,19 +130,14 @@ X86_64MCAsmInfoDarwin::getExprForPersonalitySymbol(const MCSymbol *Sym, return MCBinaryExpr::CreateAdd(Res, Four, Context); } -const MCSection *X86ELFMCAsmInfo:: -getNonexecutableStackSection(MCContext &Ctx) const { - return Ctx.getELFSection(".note.GNU-stack", ELF::SHT_PROGBITS, - 0, SectionKind::getMetadata()); -} - void X86MCAsmInfoMicrosoft::anchor() { } X86MCAsmInfoMicrosoft::X86MCAsmInfoMicrosoft(const Triple &Triple) { if (Triple.getArch() == Triple::x86_64) { PrivateGlobalPrefix = ".L"; PointerSize = 8; - ExceptionsType = ExceptionHandling::WinEH; + WinEHEncodingType = WinEH::EncodingType::Itanium; + ExceptionsType = ExceptionHandling::ItaniumWinEH; } AssemblerDialect = AsmWriterFlavor; @@ -165,7 +156,8 @@ X86MCAsmInfoGNUCOFF::X86MCAsmInfoGNUCOFF(const Triple &Triple) { if (Triple.getArch() == Triple::x86_64) { PrivateGlobalPrefix = ".L"; PointerSize = 8; - ExceptionsType = ExceptionHandling::WinEH; + WinEHEncodingType = WinEH::EncodingType::Itanium; + ExceptionsType = ExceptionHandling::ItaniumWinEH; } else { ExceptionsType = ExceptionHandling::DwarfCFI; } diff --git a/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h b/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h index a7509b0..f2f06c3 100644 --- a/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h +++ b/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h @@ -11,8 +11,8 @@ // //===----------------------------------------------------------------------===// -#ifndef X86TARGETASMINFO_H -#define X86TARGETASMINFO_H +#ifndef LLVM_LIB_TARGET_X86_MCTARGETDESC_X86MCASMINFO_H +#define LLVM_LIB_TARGET_X86_MCTARGETDESC_X86MCASMINFO_H #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCAsmInfoCOFF.h" @@ -39,8 +39,6 @@ namespace llvm { void anchor() override; public: explicit X86ELFMCAsmInfo(const Triple &Triple); - const MCSection * - getNonexecutableStackSection(MCContext &Ctx) const override; }; class 
X86MCAsmInfoMicrosoft : public MCAsmInfoMicrosoft { diff --git a/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp b/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp index 2152b21..31b8e2d 100644 --- a/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp +++ b/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp @@ -185,42 +185,21 @@ static bool isDisp8(int Value) { /// isCDisp8 - Return true if this signed displacement fits in a 8-bit /// compressed dispacement field. static bool isCDisp8(uint64_t TSFlags, int Value, int& CValue) { - assert((TSFlags & X86II::EncodingMask) >> X86II::EncodingShift == X86II::EVEX && + assert(((TSFlags & X86II::EncodingMask) == X86II::EVEX) && "Compressed 8-bit displacement is only valid for EVEX inst."); - unsigned CD8E = (TSFlags >> X86II::EVEX_CD8EShift) & X86II::EVEX_CD8EMask; - unsigned CD8V = (TSFlags >> X86II::EVEX_CD8VShift) & X86II::EVEX_CD8VMask; - - if (CD8V == 0 && CD8E == 0) { + unsigned CD8_Scale = + (TSFlags & X86II::CD8_Scale_Mask) >> X86II::CD8_Scale_Shift; + if (CD8_Scale == 0) { CValue = Value; return isDisp8(Value); } - - unsigned MemObjSize = 1U << CD8E; - if (CD8V & 4) { - // Fixed vector length - MemObjSize *= 1U << (CD8V & 0x3); - } else { - // Modified vector length - bool EVEX_b = (TSFlags >> X86II::VEXShift) & X86II::EVEX_B; - if (!EVEX_b) { - unsigned EVEX_LL = ((TSFlags >> X86II::VEXShift) & X86II::VEX_L) ? 1 : 0; - EVEX_LL += ((TSFlags >> X86II::VEXShift) & X86II::EVEX_L2) ? 2 : 0; - assert(EVEX_LL < 3 && ""); - - unsigned NumElems = (1U << (EVEX_LL + 4)) / MemObjSize; - NumElems /= 1U << (CD8V & 0x3); - - MemObjSize *= NumElems; - } - } - unsigned MemObjMask = MemObjSize - 1; - assert((MemObjSize & MemObjMask) == 0 && "Invalid memory object size."); - - if (Value & MemObjMask) // Unaligned offset + unsigned Mask = CD8_Scale - 1; + assert((CD8_Scale & Mask) == 0 && "Invalid memory object size."); + if (Value & Mask) // Unaligned offset return false; - Value /= (int)MemObjSize; + Value /= (int)CD8_Scale; bool Ret = (Value == (signed char)Value); if (Ret) @@ -393,9 +372,7 @@ void X86MCCodeEmitter::EmitMemModRMByte(const MCInst &MI, unsigned Op, const MCOperand &Scale = MI.getOperand(Op+X86::AddrScaleAmt); const MCOperand &IndexReg = MI.getOperand(Op+X86::AddrIndexReg); unsigned BaseReg = Base.getReg(); - unsigned char Encoding = (TSFlags & X86II::EncodingMask) >> - X86II::EncodingShift; - bool HasEVEX = (Encoding == X86II::EVEX); + bool HasEVEX = (TSFlags & X86II::EncodingMask) == X86II::EVEX; // Handle %rip relative addressing. 
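As a reference for the isCDisp8 rewrite above, the new rule reduces to: with a non-zero CD8 scale, the displacement must be a multiple of the scale and the scaled value must still fit in a signed byte. A standalone restatement of that check, assuming the scale is a power of two as the assert in the real code requires (the function and variable names here are illustrative, not LLVM APIs):

#include <cassert>
#include <cstdio>

// Returns true if Value can be encoded as an AVX-512 compressed disp8 with
// the given power-of-two scale; on success CValue is the byte to emit.
static bool isCompressedDisp8(int Value, unsigned Scale, int &CValue) {
  if (Scale == 0) {                 // No compression for this instruction.
    CValue = Value;
    return Value >= -128 && Value <= 127;
  }
  assert((Scale & (Scale - 1)) == 0 && "scale must be a power of two");
  if (Value & int(Scale - 1))       // Unaligned offset: fall back to disp32.
    return false;
  int Scaled = Value / int(Scale);
  CValue = Scaled;
  return Scaled == (signed char)Scaled;   // Scaled value must fit in a byte.
}

int main() {
  int C;
  // 64-byte scale (e.g. a full ZMM memory operand): disp 0x140 -> disp8 of 5.
  std::printf("%d %d\n", isCompressedDisp8(0x140, 64, C), C);
  // An unaligned displacement cannot be compressed.
  std::printf("%d\n", isCompressedDisp8(0x141, 64, C));
  return 0;
}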
if (BaseReg == X86::RIP) { // [disp32+RIP] in X86-64 mode @@ -613,13 +590,12 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, int MemOperand, const MCInst &MI, const MCInstrDesc &Desc, raw_ostream &OS) const { - unsigned char Encoding = (TSFlags & X86II::EncodingMask) >> - X86II::EncodingShift; - bool HasEVEX_K = ((TSFlags >> X86II::VEXShift) & X86II::EVEX_K); - bool HasVEX_4V = (TSFlags >> X86II::VEXShift) & X86II::VEX_4V; - bool HasVEX_4VOp3 = (TSFlags >> X86II::VEXShift) & X86II::VEX_4VOp3; - bool HasMemOp4 = (TSFlags >> X86II::VEXShift) & X86II::MemOp4; - bool HasEVEX_RC = (TSFlags >> X86II::VEXShift) & X86II::EVEX_RC; + uint64_t Encoding = TSFlags & X86II::EncodingMask; + bool HasEVEX_K = TSFlags & X86II::EVEX_K; + bool HasVEX_4V = TSFlags & X86II::VEX_4V; + bool HasVEX_4VOp3 = TSFlags & X86II::VEX_4VOp3; + bool HasMemOp4 = TSFlags & X86II::MemOp4; + bool HasEVEX_RC = TSFlags & X86II::EVEX_RC; // VEX_R: opcode externsion equivalent to REX.R in // 1's complement (inverted) form @@ -700,18 +676,18 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, bool EncodeRC = false; - if ((TSFlags >> X86II::VEXShift) & X86II::VEX_W) + if (TSFlags & X86II::VEX_W) VEX_W = 1; - if ((TSFlags >> X86II::VEXShift) & X86II::VEX_L) + if (TSFlags & X86II::VEX_L) VEX_L = 1; - if (((TSFlags >> X86II::VEXShift) & X86II::EVEX_L2)) + if (TSFlags & X86II::EVEX_L2) EVEX_L2 = 1; - if (HasEVEX_K && ((TSFlags >> X86II::VEXShift) & X86II::EVEX_Z)) + if (HasEVEX_K && (TSFlags & X86II::EVEX_Z)) EVEX_z = 1; - if (((TSFlags >> X86II::VEXShift) & X86II::EVEX_B)) + if ((TSFlags & X86II::EVEX_B)) EVEX_b = 1; switch (TSFlags & X86II::OpPrefixMask) { @@ -1129,8 +1105,8 @@ void X86MCCodeEmitter::EmitOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, raw_ostream &OS) const { // Emit the operand size opcode prefix as needed. - unsigned char OpSize = (TSFlags & X86II::OpSizeMask) >> X86II::OpSizeShift; - if (OpSize == (is16BitMode(STI) ? X86II::OpSize32 : X86II::OpSize16)) + if ((TSFlags & X86II::OpSizeMask) == (is16BitMode(STI) ? X86II::OpSize32 + : X86II::OpSize16)) EmitByte(0x66, CurByte, OS); switch (TSFlags & X86II::OpPrefixMask) { @@ -1190,19 +1166,18 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS, unsigned CurByte = 0; // Encoding type for this instruction. - unsigned char Encoding = (TSFlags & X86II::EncodingMask) >> - X86II::EncodingShift; + uint64_t Encoding = TSFlags & X86II::EncodingMask; // It uses the VEX.VVVV field? - bool HasVEX_4V = (TSFlags >> X86II::VEXShift) & X86II::VEX_4V; - bool HasVEX_4VOp3 = (TSFlags >> X86II::VEXShift) & X86II::VEX_4VOp3; - bool HasMemOp4 = (TSFlags >> X86II::VEXShift) & X86II::MemOp4; + bool HasVEX_4V = TSFlags & X86II::VEX_4V; + bool HasVEX_4VOp3 = TSFlags & X86II::VEX_4VOp3; + bool HasMemOp4 = TSFlags & X86II::MemOp4; const unsigned MemOp4_I8IMMOperand = 2; // It uses the EVEX.aaa field? - bool HasEVEX_K = ((TSFlags >> X86II::VEXShift) & X86II::EVEX_K); - bool HasEVEX_RC = ((TSFlags >> X86II::VEXShift) & X86II::EVEX_RC); - + bool HasEVEX_K = TSFlags & X86II::EVEX_K; + bool HasEVEX_RC = TSFlags & X86II::EVEX_RC; + // Determine where the memory operand starts, if present. 
int MemoryOperand = X86II::getMemoryOperandNo(TSFlags, Opcode); if (MemoryOperand != -1) MemoryOperand += CurOp; @@ -1257,7 +1232,7 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS, unsigned char BaseOpcode = X86II::getBaseOpcodeFor(TSFlags); - if ((TSFlags >> X86II::VEXShift) & X86II::Has3DNow0F0FOpcode) + if (TSFlags & X86II::Has3DNow0F0FOpcode) BaseOpcode = 0x0F; // Weird 3DNow! encoding. unsigned SrcRegNum = 0; @@ -1457,20 +1432,21 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS, case X86II::MRM_C0: case X86II::MRM_C1: case X86II::MRM_C2: case X86II::MRM_C3: case X86II::MRM_C4: case X86II::MRM_C8: case X86II::MRM_C9: case X86II::MRM_CA: case X86II::MRM_CB: - case X86II::MRM_D0: case X86II::MRM_D1: case X86II::MRM_D4: - case X86II::MRM_D5: case X86II::MRM_D6: case X86II::MRM_D8: - case X86II::MRM_D9: case X86II::MRM_DA: case X86II::MRM_DB: - case X86II::MRM_DC: case X86II::MRM_DD: case X86II::MRM_DE: - case X86II::MRM_DF: case X86II::MRM_E0: case X86II::MRM_E1: - case X86II::MRM_E2: case X86II::MRM_E3: case X86II::MRM_E4: - case X86II::MRM_E5: case X86II::MRM_E8: case X86II::MRM_E9: - case X86II::MRM_EA: case X86II::MRM_EB: case X86II::MRM_EC: - case X86II::MRM_ED: case X86II::MRM_EE: case X86II::MRM_F0: - case X86II::MRM_F1: case X86II::MRM_F2: case X86II::MRM_F3: - case X86II::MRM_F4: case X86II::MRM_F5: case X86II::MRM_F6: - case X86II::MRM_F7: case X86II::MRM_F8: case X86II::MRM_F9: - case X86II::MRM_FA: case X86II::MRM_FB: case X86II::MRM_FC: - case X86II::MRM_FD: case X86II::MRM_FE: case X86II::MRM_FF: + case X86II::MRM_CF: case X86II::MRM_D0: case X86II::MRM_D1: + case X86II::MRM_D4: case X86II::MRM_D5: case X86II::MRM_D6: + case X86II::MRM_D7: case X86II::MRM_D8: case X86II::MRM_D9: + case X86II::MRM_DA: case X86II::MRM_DB: case X86II::MRM_DC: + case X86II::MRM_DD: case X86II::MRM_DE: case X86II::MRM_DF: + case X86II::MRM_E0: case X86II::MRM_E1: case X86II::MRM_E2: + case X86II::MRM_E3: case X86II::MRM_E4: case X86II::MRM_E5: + case X86II::MRM_E8: case X86II::MRM_E9: case X86II::MRM_EA: + case X86II::MRM_EB: case X86II::MRM_EC: case X86II::MRM_ED: + case X86II::MRM_EE: case X86II::MRM_F0: case X86II::MRM_F1: + case X86II::MRM_F2: case X86II::MRM_F3: case X86II::MRM_F4: + case X86II::MRM_F5: case X86II::MRM_F6: case X86II::MRM_F7: + case X86II::MRM_F8: case X86II::MRM_F9: case X86II::MRM_FA: + case X86II::MRM_FB: case X86II::MRM_FC: case X86II::MRM_FD: + case X86II::MRM_FE: case X86II::MRM_FF: EmitByte(BaseOpcode, CurByte, OS); unsigned char MRM; @@ -1485,11 +1461,13 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS, case X86II::MRM_C9: MRM = 0xC9; break; case X86II::MRM_CA: MRM = 0xCA; break; case X86II::MRM_CB: MRM = 0xCB; break; + case X86II::MRM_CF: MRM = 0xCF; break; case X86II::MRM_D0: MRM = 0xD0; break; case X86II::MRM_D1: MRM = 0xD1; break; case X86II::MRM_D4: MRM = 0xD4; break; case X86II::MRM_D5: MRM = 0xD5; break; case X86II::MRM_D6: MRM = 0xD6; break; + case X86II::MRM_D7: MRM = 0xD7; break; case X86II::MRM_D8: MRM = 0xD8; break; case X86II::MRM_D9: MRM = 0xD9; break; case X86II::MRM_DA: MRM = 0xDA; break; @@ -1538,7 +1516,7 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS, while (CurOp != NumOps && NumOps - CurOp <= 2) { // The last source register of a 4 operand instruction in AVX is encoded // in bits[7:4] of a immediate byte. - if ((TSFlags >> X86II::VEXShift) & X86II::VEX_I8IMM) { + if (TSFlags & X86II::VEX_I8IMM) { const MCOperand &MO = MI.getOperand(HasMemOp4 ? 
MemOp4_I8IMMOperand : CurOp); ++CurOp; @@ -1564,7 +1542,7 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS, } } - if ((TSFlags >> X86II::VEXShift) & X86II::Has3DNow0F0FOpcode) + if (TSFlags & X86II::Has3DNow0F0FOpcode) EmitByte(X86II::getBaseOpcodeFor(TSFlags), CurByte, OS); #ifndef NDEBUG diff --git a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp index 5e29e5c..5a9181d 100644 --- a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp +++ b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp @@ -272,7 +272,8 @@ static MCAsmInfo *createX86MCAsmInfo(const MCRegisterInfo &MRI, StringRef TT) { MAI = new X86ELFMCAsmInfo(TheTriple); } else if (TheTriple.isWindowsMSVCEnvironment()) { MAI = new X86MCAsmInfoMicrosoft(TheTriple); - } else if (TheTriple.isOSCygMing()) { + } else if (TheTriple.isOSCygMing() || + TheTriple.isWindowsItaniumEnvironment()) { MAI = new X86MCAsmInfoGNUCOFF(TheTriple); } else { // The default is ELF. @@ -350,11 +351,8 @@ static MCCodeGenInfo *createX86MCCodeGenInfo(StringRef TT, Reloc::Model RM, static MCStreamer *createMCStreamer(const Target &T, StringRef TT, MCContext &Ctx, MCAsmBackend &MAB, - raw_ostream &_OS, - MCCodeEmitter *_Emitter, - const MCSubtargetInfo &STI, - bool RelaxAll, - bool NoExecStack) { + raw_ostream &_OS, MCCodeEmitter *_Emitter, + const MCSubtargetInfo &STI, bool RelaxAll) { Triple TheTriple(TT); switch (TheTriple.getObjectFormat()) { @@ -365,7 +363,7 @@ static MCStreamer *createMCStreamer(const Target &T, StringRef TT, assert(TheTriple.isOSWindows() && "only Windows COFF is supported"); return createX86WinCOFFStreamer(Ctx, MAB, _Emitter, _OS, RelaxAll); case Triple::ELF: - return createELFStreamer(Ctx, MAB, _OS, _Emitter, RelaxAll, NoExecStack); + return createELFStreamer(Ctx, MAB, _OS, _Emitter, RelaxAll); } } @@ -376,7 +374,7 @@ static MCInstPrinter *createX86MCInstPrinter(const Target &T, const MCRegisterInfo &MRI, const MCSubtargetInfo &STI) { if (SyntaxVariant == 0) - return new X86ATTInstPrinter(MAI, MII, MRI); + return new X86ATTInstPrinter(MAI, MII, MRI, STI); if (SyntaxVariant == 1) return new X86IntelInstPrinter(MAI, MII, MRI); return nullptr; diff --git a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h index ebe74cf..aef9571 100644 --- a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h +++ b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h @@ -11,8 +11,8 @@ // //===----------------------------------------------------------------------===// -#ifndef X86MCTARGETDESC_H -#define X86MCTARGETDESC_H +#ifndef LLVM_LIB_TARGET_X86_MCTARGETDESC_X86MCTARGETDESC_H +#define LLVM_LIB_TARGET_X86_MCTARGETDESC_X86MCTARGETDESC_H #include "llvm/Support/DataTypes.h" #include <string> diff --git a/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp b/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp index ead3338..5685a7f 100644 --- a/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp +++ b/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp @@ -179,11 +179,14 @@ void X86MachObjectWriter::RecordX86_64Relocation(MachObjectWriter *Writer, if (A_Base == B_Base && A_Base) report_fatal_error("unsupported relocation with identical base", false); - // A subtraction expression where both symbols are undefined is a + // A subtraction expression where either symbol is undefined is a // non-relocatable expression. 
- if (A->isUndefined() && B->isUndefined()) - report_fatal_error("unsupported relocation with subtraction expression", - false); + if (A->isUndefined() || B->isUndefined()) { + StringRef Name = A->isUndefined() ? A->getName() : B->getName(); + Asm.getContext().FatalError(Fixup.getLoc(), + "unsupported relocation with subtraction expression, symbol '" + + Name + "' can not be undefined in a subtraction expression"); + } Value += Writer->getSymbolAddress(&A_SD, Layout) - (!A_Base ? 0 : Writer->getSymbolAddress(A_Base, Layout)); @@ -572,7 +575,7 @@ void X86MachObjectWriter::RecordX86Relocation(MachObjectWriter *Writer, // For external relocations, make sure to offset the fixup value to // compensate for the addend of the symbol address, if it was // undefined. This occurs with weak definitions, for example. - if (!SD->Symbol->isUndefined()) + if (!SD->getSymbol().isUndefined()) FixedValue -= Layout.getSymbolOffset(SD); } else { // The index is the section ordinal (1-based). diff --git a/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp b/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp index 7fa4180..5f1596c 100644 --- a/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp +++ b/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp @@ -8,18 +8,21 @@ //===----------------------------------------------------------------------===// #include "X86MCTargetDesc.h" +#include "llvm/MC/MCWin64EH.h" #include "llvm/MC/MCWinCOFFStreamer.h" using namespace llvm; namespace { class X86WinCOFFStreamer : public MCWinCOFFStreamer { + Win64EH::UnwindEmitter EHStreamer; public: X86WinCOFFStreamer(MCContext &C, MCAsmBackend &AB, MCCodeEmitter *CE, raw_ostream &OS) : MCWinCOFFStreamer(C, AB, *CE, OS) { } void EmitWinEHHandlerData() override; + void EmitWindowsUnwindTables() override; void FinishImpl() override; }; @@ -28,12 +31,18 @@ void X86WinCOFFStreamer::EmitWinEHHandlerData() { // We have to emit the unwind info now, because this directive // actually switches to the .xdata section! - MCWin64EHUnwindEmitter::EmitUnwindInfo(*this, getCurrentW64UnwindInfo()); + EHStreamer.EmitUnwindInfo(*this, getCurrentWinFrameInfo()); +} + +void X86WinCOFFStreamer::EmitWindowsUnwindTables() { + if (!getNumWinFrameInfos()) + return; + EHStreamer.Emit(*this); } void X86WinCOFFStreamer::FinishImpl() { EmitFrames(nullptr); - EmitW64Tables(); + EmitWindowsUnwindTables(); MCWinCOFFStreamer::FinishImpl(); } diff --git a/lib/Target/X86/README.txt b/lib/Target/X86/README.txt index 52d3c01..19a1832 100644 --- a/lib/Target/X86/README.txt +++ b/lib/Target/X86/README.txt @@ -2,17 +2,6 @@ // Random ideas for the X86 backend. //===---------------------------------------------------------------------===// -This should be one DIV/IDIV instruction, not a libcall: - -unsigned test(unsigned long long X, unsigned Y) { - return X/Y; -} - -This can be done trivially with a custom legalizer. What about overflow -though? http://gcc.gnu.org/bugzilla/show_bug.cgi?id=14224 - -//===---------------------------------------------------------------------===// - Improvements to the multiply -> shift/add algorithm: http://gcc.gnu.org/ml/gcc-patches/2004-08/msg01590.html @@ -83,43 +72,6 @@ It appears icc use push for parameter passing. Need to investigate. 
//===---------------------------------------------------------------------===// -This: - -void foo(void); -void bar(int x, int *P) { - x >>= 2; - if (x) - foo(); - *P = x; -} - -compiles into: - - movq %rsi, %rbx - movl %edi, %r14d - sarl $2, %r14d - testl %r14d, %r14d - je LBB0_2 - -Instead of doing an explicit test, we can use the flags off the sar. This -occurs in a bigger testcase like this, which is pretty common: - -#include <vector> -int test1(std::vector<int> &X) { - int Sum = 0; - for (long i = 0, e = X.size(); i != e; ++i) - X[i] = 0; - return Sum; -} - -//===---------------------------------------------------------------------===// - -Only use inc/neg/not instructions on processors where they are faster than -add/sub/xor. They are slower on the P4 due to only updating some processor -flags. - -//===---------------------------------------------------------------------===// - The instruction selector sometimes misses folding a load into a compare. The pattern is written as (cmp reg, (load p)). Because the compare isn't commutative, it is not matched with the load on both sides. The dag combiner @@ -303,42 +255,6 @@ opposed to two cycles for the movl+lea variant. //===---------------------------------------------------------------------===// -__builtin_ffs codegen is messy. - -int ffs_(unsigned X) { return __builtin_ffs(X); } - -llvm produces: -ffs_: - movl 4(%esp), %ecx - bsfl %ecx, %eax - movl $32, %edx - cmove %edx, %eax - incl %eax - xorl %edx, %edx - testl %ecx, %ecx - cmove %edx, %eax - ret - -vs gcc: - -_ffs_: - movl $-1, %edx - bsfl 4(%esp), %eax - cmove %edx, %eax - addl $1, %eax - ret - -Another example of __builtin_ffs (use predsimplify to eliminate a select): - -int foo (unsigned long j) { - if (j) - return __builtin_ffs (j) - 1; - else - return 0; -} - -//===---------------------------------------------------------------------===// - It appears gcc place string data with linkonce linkage in .section __TEXT,__const_coal,coalesced instead of .section __DATA,__const_coal,coalesced. @@ -466,85 +382,6 @@ We should inline lrintf and probably other libc functions. //===---------------------------------------------------------------------===// -Use the FLAGS values from arithmetic instructions more. For example, compile: - -int add_zf(int *x, int y, int a, int b) { - if ((*x += y) == 0) - return a; - else - return b; -} - -to: - addl %esi, (%rdi) - movl %edx, %eax - cmovne %ecx, %eax - ret -instead of: - -_add_zf: - addl (%rdi), %esi - movl %esi, (%rdi) - testl %esi, %esi - cmove %edx, %ecx - movl %ecx, %eax - ret - -As another example, compile function f2 in test/CodeGen/X86/cmp-test.ll -without a test instruction. 
- -//===---------------------------------------------------------------------===// - -These two functions have identical effects: - -unsigned int f(unsigned int i, unsigned int n) {++i; if (i == n) ++i; return i;} -unsigned int f2(unsigned int i, unsigned int n) {++i; i += i == n; return i;} - -We currently compile them to: - -_f: - movl 4(%esp), %eax - movl %eax, %ecx - incl %ecx - movl 8(%esp), %edx - cmpl %edx, %ecx - jne LBB1_2 #UnifiedReturnBlock -LBB1_1: #cond_true - addl $2, %eax - ret -LBB1_2: #UnifiedReturnBlock - movl %ecx, %eax - ret -_f2: - movl 4(%esp), %eax - movl %eax, %ecx - incl %ecx - cmpl 8(%esp), %ecx - sete %cl - movzbl %cl, %ecx - leal 1(%ecx,%eax), %eax - ret - -both of which are inferior to GCC's: - -_f: - movl 4(%esp), %edx - leal 1(%edx), %eax - addl $2, %edx - cmpl 8(%esp), %eax - cmove %edx, %eax - ret -_f2: - movl 4(%esp), %eax - addl $1, %eax - xorl %edx, %edx - cmpl 8(%esp), %eax - sete %dl - addl %edx, %eax - ret - -//===---------------------------------------------------------------------===// - This code: void test(int X) { @@ -1398,20 +1235,6 @@ A similar code sequence works for division. //===---------------------------------------------------------------------===// -These should compile to the same code, but the later codegen's to useless -instructions on X86. This may be a trivial dag combine (GCC PR7061): - -struct s1 { unsigned char a, b; }; -unsigned long f1(struct s1 x) { - return x.a + x.b; -} -struct s2 { unsigned a: 8, b: 8; }; -unsigned long f2(struct s2 x) { - return x.a + x.b; -} - -//===---------------------------------------------------------------------===// - We currently compile this: define i32 @func1(i32 %v1, i32 %v2) nounwind { diff --git a/lib/Target/X86/Utils/LLVMBuild.txt b/lib/Target/X86/Utils/LLVMBuild.txt index fdb886f..de0a30f 100644 --- a/lib/Target/X86/Utils/LLVMBuild.txt +++ b/lib/Target/X86/Utils/LLVMBuild.txt @@ -19,5 +19,5 @@ type = Library name = X86Utils parent = X86 -required_libraries = Support +required_libraries = Core Support add_to_library_groups = X86 diff --git a/lib/Target/X86/Utils/X86ShuffleDecode.cpp b/lib/Target/X86/Utils/X86ShuffleDecode.cpp index 5f2441c..ba6cbc8 100644 --- a/lib/Target/X86/Utils/X86ShuffleDecode.cpp +++ b/lib/Target/X86/Utils/X86ShuffleDecode.cpp @@ -13,6 +13,7 @@ //===----------------------------------------------------------------------===// #include "X86ShuffleDecode.h" +#include "llvm/IR/Constants.h" #include "llvm/CodeGen/MachineValueType.h" //===----------------------------------------------------------------------===// @@ -62,6 +63,51 @@ void DecodeMOVLHPSMask(unsigned NElts, SmallVectorImpl<int> &ShuffleMask) { ShuffleMask.push_back(NElts+i); } +void DecodeMOVSLDUPMask(MVT VT, SmallVectorImpl<int> &ShuffleMask) { + unsigned NumElts = VT.getVectorNumElements(); + for (int i = 0, e = NumElts / 2; i < e; ++i) { + ShuffleMask.push_back(2 * i); + ShuffleMask.push_back(2 * i); + } +} + +void DecodeMOVSHDUPMask(MVT VT, SmallVectorImpl<int> &ShuffleMask) { + unsigned NumElts = VT.getVectorNumElements(); + for (int i = 0, e = NumElts / 2; i < e; ++i) { + ShuffleMask.push_back(2 * i + 1); + ShuffleMask.push_back(2 * i + 1); + } +} + +void DecodePSLLDQMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask) { + unsigned VectorSizeInBits = VT.getSizeInBits(); + unsigned NumElts = VectorSizeInBits / 8; + unsigned NumLanes = VectorSizeInBits / 128; + unsigned NumLaneElts = NumElts / NumLanes; + + for (unsigned l = 0; l < NumElts; l += NumLaneElts) + for (unsigned i = 0; i < NumLaneElts; 
++i) { + int M = SM_SentinelZero; + if (i >= Imm) M = i - Imm + l; + ShuffleMask.push_back(M); + } +} + +void DecodePSRLDQMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask) { + unsigned VectorSizeInBits = VT.getSizeInBits(); + unsigned NumElts = VectorSizeInBits / 8; + unsigned NumLanes = VectorSizeInBits / 128; + unsigned NumLaneElts = NumElts / NumLanes; + + for (unsigned l = 0; l < NumElts; l += NumLaneElts) + for (unsigned i = 0; i < NumLaneElts; ++i) { + unsigned Base = i + Imm; + int M = Base + l; + if (Base >= NumLaneElts) M = SM_SentinelZero; + ShuffleMask.push_back(M); + } +} + void DecodePALIGNRMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask) { unsigned NumElts = VT.getVectorNumElements(); @@ -207,6 +253,97 @@ void DecodeVPERM2X128Mask(MVT VT, unsigned Imm, } } +void DecodePSHUFBMask(const Constant *C, SmallVectorImpl<int> &ShuffleMask) { + Type *MaskTy = C->getType(); + assert(MaskTy->isVectorTy() && "Expected a vector constant mask!"); + assert(MaskTy->getVectorElementType()->isIntegerTy(8) && + "Expected i8 constant mask elements!"); + int NumElements = MaskTy->getVectorNumElements(); + // FIXME: Add support for AVX-512. + assert((NumElements == 16 || NumElements == 32) && + "Only 128-bit and 256-bit vectors supported!"); + ShuffleMask.reserve(NumElements); + + if (auto *CDS = dyn_cast<ConstantDataSequential>(C)) { + assert((unsigned)NumElements == CDS->getNumElements() && + "Constant mask has a different number of elements!"); + + for (int i = 0; i < NumElements; ++i) { + // For AVX vectors with 32 bytes the base of the shuffle is the 16-byte + // lane of the vector we're inside. + int Base = i < 16 ? 0 : 16; + uint64_t Element = CDS->getElementAsInteger(i); + // If the high bit (7) of the byte is set, the element is zeroed. + if (Element & (1 << 7)) + ShuffleMask.push_back(SM_SentinelZero); + else { + // Only the least significant 4 bits of the byte are used. + int Index = Base + (Element & 0xf); + ShuffleMask.push_back(Index); + } + } + } else if (auto *CV = dyn_cast<ConstantVector>(C)) { + assert((unsigned)NumElements == CV->getNumOperands() && + "Constant mask has a different number of elements!"); + + for (int i = 0; i < NumElements; ++i) { + // For AVX vectors with 32 bytes the base of the shuffle is the 16-byte + // lane of the vector we're inside. + int Base = i < 16 ? 0 : 16; + Constant *COp = CV->getOperand(i); + if (isa<UndefValue>(COp)) { + ShuffleMask.push_back(SM_SentinelUndef); + continue; + } + uint64_t Element = cast<ConstantInt>(COp)->getZExtValue(); + // If the high bit (7) of the byte is set, the element is zeroed. + if (Element & (1 << 7)) + ShuffleMask.push_back(SM_SentinelZero); + else { + // Only the least significant 4 bits of the byte are used. + int Index = Base + (Element & 0xf); + ShuffleMask.push_back(Index); + } + } + } +} + +void DecodePSHUFBMask(ArrayRef<uint64_t> RawMask, + SmallVectorImpl<int> &ShuffleMask) { + for (int i = 0, e = RawMask.size(); i < e; ++i) { + uint64_t M = RawMask[i]; + if (M == (uint64_t)SM_SentinelUndef) { + ShuffleMask.push_back(M); + continue; + } + // For AVX vectors with 32 bytes the base of the shuffle is the half of + // the vector we're inside. + int Base = i < 16 ? 0 : 16; + // If the high bit (7) of the byte is set, the element is zeroed. + if (M & (1 << 7)) + ShuffleMask.push_back(SM_SentinelZero); + else { + // Only the least significant 4 bits of the byte are used. 
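// Illustrative worked example, not part of the patch: with the rule above, a
// raw PSHUFB mask byte of 0x83 has bit 7 set and decodes to SM_SentinelZero,
// while 0x05 decodes to source element Base + 5. For a 256-bit PSHUFB the
// same byte at position 20 would use Base = 16, because the instruction never
// pulls data across its 128-bit lane boundary.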
+ int Index = Base + (M & 0xf); + ShuffleMask.push_back(Index); + } + } +} + +void DecodeBLENDMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask) { + int ElementBits = VT.getScalarSizeInBits(); + int NumElements = VT.getVectorNumElements(); + for (int i = 0; i < NumElements; ++i) { + // If there are more than 8 elements in the vector, then any immediate blend + // mask applies to each 128-bit lane. There can never be more than + // 8 elements in a 128-bit lane with an immediate blend. + int Bit = NumElements > 8 ? i % (128 / ElementBits) : i; + assert(Bit < 8 && + "Immediate blends only operate over 8 elements at a time!"); + ShuffleMask.push_back(((Imm >> Bit) & 1) ? NumElements + i : i); + } +} + /// DecodeVPERMMask - this decodes the shuffle masks for VPERMQ/VPERMPD. /// No VT provided since it only works on 256-bit, 4 element vectors. void DecodeVPERMMask(unsigned Imm, SmallVectorImpl<int> &ShuffleMask) { @@ -215,4 +352,44 @@ void DecodeVPERMMask(unsigned Imm, SmallVectorImpl<int> &ShuffleMask) { } } +void DecodeVPERMILPMask(const Constant *C, SmallVectorImpl<int> &ShuffleMask) { + Type *MaskTy = C->getType(); + assert(MaskTy->isVectorTy() && "Expected a vector constant mask!"); + assert(MaskTy->getVectorElementType()->isIntegerTy() && + "Expected integer constant mask elements!"); + int ElementBits = MaskTy->getScalarSizeInBits(); + int NumElements = MaskTy->getVectorNumElements(); + assert((NumElements == 2 || NumElements == 4 || NumElements == 8) && + "Unexpected number of vector elements."); + ShuffleMask.reserve(NumElements); + if (auto *CDS = dyn_cast<ConstantDataSequential>(C)) { + assert((unsigned)NumElements == CDS->getNumElements() && + "Constant mask has a different number of elements!"); + + for (int i = 0; i < NumElements; ++i) { + int Base = (i * ElementBits / 128) * (128 / ElementBits); + uint64_t Element = CDS->getElementAsInteger(i); + // Only the least significant 2 bits of the integer are used. + int Index = Base + (Element & 0x3); + ShuffleMask.push_back(Index); + } + } else if (auto *CV = dyn_cast<ConstantVector>(C)) { + assert((unsigned)NumElements == C->getNumOperands() && + "Constant mask has a different number of elements!"); + + for (int i = 0; i < NumElements; ++i) { + int Base = (i * ElementBits / 128) * (128 / ElementBits); + Constant *COp = CV->getOperand(i); + if (isa<UndefValue>(COp)) { + ShuffleMask.push_back(SM_SentinelUndef); + continue; + } + uint64_t Element = cast<ConstantInt>(COp)->getZExtValue(); + // Only the least significant 2 bits of the integer are used. 
+ int Index = Base + (Element & 0x3); + ShuffleMask.push_back(Index); + } + } +} + } // llvm namespace diff --git a/lib/Target/X86/Utils/X86ShuffleDecode.h b/lib/Target/X86/Utils/X86ShuffleDecode.h index 9e75b6b..6ba3c64 100644 --- a/lib/Target/X86/Utils/X86ShuffleDecode.h +++ b/lib/Target/X86/Utils/X86ShuffleDecode.h @@ -12,21 +12,21 @@ // //===----------------------------------------------------------------------===// -#ifndef X86_SHUFFLE_DECODE_H -#define X86_SHUFFLE_DECODE_H +#ifndef LLVM_LIB_TARGET_X86_UTILS_X86SHUFFLEDECODE_H +#define LLVM_LIB_TARGET_X86_UTILS_X86SHUFFLEDECODE_H #include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/ArrayRef.h" //===----------------------------------------------------------------------===// // Vector Mask Decoding //===----------------------------------------------------------------------===// namespace llvm { +class Constant; class MVT; -enum { - SM_SentinelZero = -1 -}; +enum { SM_SentinelUndef = -1, SM_SentinelZero = -2 }; void DecodeINSERTPSMask(unsigned Imm, SmallVectorImpl<int> &ShuffleMask); @@ -36,6 +36,14 @@ void DecodeMOVHLPSMask(unsigned NElts, SmallVectorImpl<int> &ShuffleMask); // <0,2> or <0,1,4,5> void DecodeMOVLHPSMask(unsigned NElts, SmallVectorImpl<int> &ShuffleMask); +void DecodeMOVSLDUPMask(MVT VT, SmallVectorImpl<int> &ShuffleMask); + +void DecodeMOVSHDUPMask(MVT VT, SmallVectorImpl<int> &ShuffleMask); + +void DecodePSLLDQMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask); + +void DecodePSRLDQMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask); + void DecodePALIGNRMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask); void DecodePSHUFMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask); @@ -59,6 +67,16 @@ void DecodeUNPCKHMask(MVT VT, SmallVectorImpl<int> &ShuffleMask); /// different datatypes and vector widths. void DecodeUNPCKLMask(MVT VT, SmallVectorImpl<int> &ShuffleMask); +/// \brief Decode a PSHUFB mask from an IR-level vector constant. +void DecodePSHUFBMask(const Constant *C, SmallVectorImpl<int> &ShuffleMask); + +/// \brief Decode a PSHUFB mask from a raw array of constants such as from +/// BUILD_VECTOR. +void DecodePSHUFBMask(ArrayRef<uint64_t> RawMask, + SmallVectorImpl<int> &ShuffleMask); + +/// \brief Decode a BLEND immediate mask into a shuffle mask. +void DecodeBLENDMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask); void DecodeVPERM2X128Mask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask); @@ -67,6 +85,9 @@ void DecodeVPERM2X128Mask(MVT VT, unsigned Imm, /// No VT provided since it only works on 256-bit, 4 element vectors. void DecodeVPERMMask(unsigned Imm, SmallVectorImpl<int> &ShuffleMask); +/// \brief Decode a VPERMILP variable mask from an IR-level vector constant. +void DecodeVPERMILPMask(const Constant *C, SmallVectorImpl<int> &ShuffleMask); + } // llvm namespace #endif diff --git a/lib/Target/X86/X86.h b/lib/Target/X86/X86.h index d5522ed..8bd5817 100644 --- a/lib/Target/X86/X86.h +++ b/lib/Target/X86/X86.h @@ -12,8 +12,8 @@ // //===----------------------------------------------------------------------===// -#ifndef TARGET_X86_H -#define TARGET_X86_H +#ifndef LLVM_LIB_TARGET_X86_X86_H +#define LLVM_LIB_TARGET_X86_X86_H #include "llvm/Support/CodeGen.h" @@ -21,13 +21,8 @@ namespace llvm { class FunctionPass; class ImmutablePass; -class JITCodeEmitter; class X86TargetMachine; -/// createX86AtomicExpandPass - This pass expands atomic operations that cannot -/// be handled natively in terms of a loop using cmpxchg. 
-FunctionPass *createX86AtomicExpandPass(const X86TargetMachine *TM); - /// createX86ISelDag - This pass converts a legalized DAG into a /// X86-specific DAG, ready for instruction scheduling. /// @@ -54,11 +49,6 @@ FunctionPass *createX86FloatingPointStackifierPass(); /// AVX and SSE. FunctionPass *createX86IssueVZeroUpperPass(); -/// createX86CodeEmitterPass - Return a pass that emits the collected X86 code -/// to the specified MCE object. -FunctionPass *createX86JITCodeEmitterPass(X86TargetMachine &TM, - JITCodeEmitter &JCE); - /// createX86EmitCodeToMemory - Returns a pass that converts a register /// allocated function into raw machine code in a dynamically /// allocated chunk of memory. diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td index 93f516a..83f55d3 100644 --- a/lib/Target/X86/X86.td +++ b/lib/Target/X86/X86.td @@ -104,7 +104,15 @@ def FeatureCDI : SubtargetFeature<"avx512cd", "HasCDI", "true", def FeaturePFI : SubtargetFeature<"avx512pf", "HasPFI", "true", "Enable AVX-512 PreFetch Instructions", [FeatureAVX512]>; - +def FeatureDQI : SubtargetFeature<"avx512dq", "HasDQI", "true", + "Enable AVX-512 Doubleword and Quadword Instructions", + [FeatureAVX512]>; +def FeatureBWI : SubtargetFeature<"avx512bw", "HasBWI", "true", + "Enable AVX-512 Byte and Word Instructions", + [FeatureAVX512]>; +def FeatureVLX : SubtargetFeature<"avx512vl", "HasVLX", "true", + "Enable AVX-512 Vector Length eXtensions", + [FeatureAVX512]>; def FeaturePCLMUL : SubtargetFeature<"pclmul", "HasPCLMUL", "true", "Enable packed carry-less multiplication instructions", [FeatureSSE2]>; @@ -149,10 +157,14 @@ def FeatureADX : SubtargetFeature<"adx", "HasADX", "true", def FeatureSHA : SubtargetFeature<"sha", "HasSHA", "true", "Enable SHA instructions", [FeatureSSE2]>; +def FeatureSGX : SubtargetFeature<"sgx", "HasSGX", "true", + "Support SGX instructions">; def FeaturePRFCHW : SubtargetFeature<"prfchw", "HasPRFCHW", "true", "Support PRFCHW instructions">; def FeatureRDSEED : SubtargetFeature<"rdseed", "HasRDSEED", "true", "Support RDSEED instruction">; +def FeatureSMAP : SubtargetFeature<"smap", "HasSMAP", "true", + "Support SMAP instructions">; def FeatureLeaForSP : SubtargetFeature<"lea-sp", "UseLeaForSP", "true", "Use LEA for adjusting the stack pointer">; def FeatureSlowDivide : SubtargetFeature<"idiv-to-divb", @@ -170,6 +182,10 @@ def FeatureSlowLEA : SubtargetFeature<"slow-lea", "SlowLEA", "true", "LEA instruction with certain arguments is slow">; def FeatureSlowIncDec : SubtargetFeature<"slow-incdec", "SlowIncDec", "true", "INC and DEC instructions are slower than ADD and SUB">; +def FeatureUseSqrtEst : SubtargetFeature<"use-sqrt-est", "UseSqrtEst", "true", + "Use RSQRT* to optimize square root calculations">; +def FeatureUseRecipEst : SubtargetFeature<"use-recip-est", "UseReciprocalEst", + "true", "Use RCP* to optimize division calculations">; //===----------------------------------------------------------------------===// // X86 processors supported. 
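FeatureUseSqrtEst and FeatureUseRecipEst above only say "prefer the estimate instructions"; the point of RSQRT*/RCP* is that their roughly 12-bit result gets refined with a Newton-Raphson step, which is cheaper than a full-precision SQRT or DIV on cores like Jaguar. A hedged scalar sketch of that refinement in plain C++ (roughRsqrt/roughRcp merely stand in for the hardware estimate; they are not LLVM or intrinsic APIs):

#include <cmath>
#include <cstdio>

// Stand-ins for the ~12-bit hardware estimates; the deliberate 0.1% error
// makes the effect of the refinement step visible.
static float roughRsqrt(float A) { return 1.0f / std::sqrt(A) * 1.001f; }
static float roughRcp(float A)   { return 1.0f / A * 1.001f; }

// One Newton-Raphson iteration for 1/sqrt(A): x1 = x0 * (1.5 - 0.5*A*x0*x0).
static float refineRsqrt(float A, float X0) {
  return X0 * (1.5f - 0.5f * A * X0 * X0);
}

// One Newton-Raphson iteration for 1/A: x1 = x0 * (2 - A*x0).
static float refineRcp(float A, float X0) {
  return X0 * (2.0f - A * X0);
}

int main() {
  float A = 3.0f;
  float R0 = roughRsqrt(A), R1 = refineRsqrt(A, R0);
  float C0 = roughRcp(A),   C1 = refineRcp(A, C0);
  std::printf("rsqrt: estimate %.7f refined %.7f exact %.7f\n",
              R0, R1, 1.0f / std::sqrt(A));
  std::printf("rcp:   estimate %.7f refined %.7f exact %.7f\n",
              C0, C1, 1.0f / A);
  return 0;
}

One iteration roughly doubles the number of correct bits, which is why a single step is usually enough to approach full single-precision accuracy.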
@@ -264,8 +280,16 @@ def : ProcessorModel<"core-avx2", HaswellModel, FeaturePOPCNT, FeatureAES, FeaturePCLMUL, FeatureRDRAND, FeatureF16C, FeatureFSGSBase, FeatureMOVBE, FeatureLZCNT, FeatureBMI, FeatureBMI2, FeatureFMA, FeatureRTM, - FeatureHLE]>; + FeatureHLE, FeatureSlowIncDec]>; +// Broadwell +def : ProcessorModel<"broadwell", HaswellModel, + [FeatureAVX2, FeatureCMPXCHG16B, FeatureFastUAMem, + FeaturePOPCNT, FeatureAES, FeaturePCLMUL, FeatureRDRAND, + FeatureF16C, FeatureFSGSBase, FeatureMOVBE, FeatureLZCNT, + FeatureBMI, FeatureBMI2, FeatureFMA, FeatureRTM, + FeatureHLE, FeatureADX, FeatureRDSEED, FeatureSMAP, + FeatureSlowIncDec]>; // KNL // FIXME: define KNL model def : ProcessorModel<"knl", HaswellModel, @@ -276,6 +300,17 @@ def : ProcessorModel<"knl", HaswellModel, FeatureBMI2, FeatureFMA, FeatureRTM, FeatureHLE, FeatureSlowIncDec]>; +// SKX +// FIXME: define SKX model +def : ProcessorModel<"skx", HaswellModel, + [FeatureAVX512, FeatureCDI, + FeatureDQI, FeatureBWI, FeatureVLX, + FeatureCMPXCHG16B, FeatureFastUAMem, FeaturePOPCNT, + FeatureAES, FeaturePCLMUL, FeatureRDRAND, FeatureF16C, + FeatureFSGSBase, FeatureMOVBE, FeatureLZCNT, FeatureBMI, + FeatureBMI2, FeatureFMA, FeatureRTM, FeatureHLE, + FeatureSlowIncDec, FeatureSGX]>; + def : Proc<"k6", [FeatureMMX]>; def : Proc<"k6-2", [Feature3DNow]>; def : Proc<"k6-3", [Feature3DNow]>; @@ -311,35 +346,42 @@ def : Proc<"amdfam10", [FeatureSSE4A, def : Proc<"btver1", [FeatureSSSE3, FeatureSSE4A, FeatureCMPXCHG16B, FeaturePRFCHW, FeatureLZCNT, FeaturePOPCNT, FeatureSlowSHLD]>; + // Jaguar -def : Proc<"btver2", [FeatureAVX, FeatureSSE4A, FeatureCMPXCHG16B, - FeaturePRFCHW, FeatureAES, FeaturePCLMUL, - FeatureBMI, FeatureF16C, FeatureMOVBE, - FeatureLZCNT, FeaturePOPCNT, FeatureSlowSHLD]>; +def : ProcessorModel<"btver2", BtVer2Model, + [FeatureAVX, FeatureSSE4A, FeatureCMPXCHG16B, + FeaturePRFCHW, FeatureAES, FeaturePCLMUL, + FeatureBMI, FeatureF16C, FeatureMOVBE, + FeatureLZCNT, FeaturePOPCNT, FeatureSlowSHLD, + FeatureUseSqrtEst, FeatureUseRecipEst]>; + // Bulldozer def : Proc<"bdver1", [FeatureXOP, FeatureFMA4, FeatureCMPXCHG16B, FeatureAES, FeaturePRFCHW, FeaturePCLMUL, - FeatureLZCNT, FeaturePOPCNT, FeatureSlowSHLD]>; + FeatureAVX, FeatureSSE4A, FeatureLZCNT, + FeaturePOPCNT, FeatureSlowSHLD]>; // Piledriver def : Proc<"bdver2", [FeatureXOP, FeatureFMA4, FeatureCMPXCHG16B, FeatureAES, FeaturePRFCHW, FeaturePCLMUL, - FeatureF16C, FeatureLZCNT, - FeaturePOPCNT, FeatureBMI, FeatureTBM, - FeatureFMA, FeatureSlowSHLD]>; + FeatureAVX, FeatureSSE4A, FeatureF16C, + FeatureLZCNT, FeaturePOPCNT, FeatureBMI, + FeatureTBM, FeatureFMA, FeatureSlowSHLD]>; // Steamroller def : Proc<"bdver3", [FeatureXOP, FeatureFMA4, FeatureCMPXCHG16B, FeatureAES, FeaturePRFCHW, FeaturePCLMUL, - FeatureF16C, FeatureLZCNT, - FeaturePOPCNT, FeatureBMI, FeatureTBM, - FeatureFMA, FeatureFSGSBase]>; + FeatureAVX, FeatureSSE4A, FeatureF16C, + FeatureLZCNT, FeaturePOPCNT, FeatureBMI, + FeatureTBM, FeatureFMA, FeatureSlowSHLD, + FeatureFSGSBase]>; // Excavator def : Proc<"bdver4", [FeatureAVX2, FeatureXOP, FeatureFMA4, FeatureCMPXCHG16B, FeatureAES, FeaturePRFCHW, FeaturePCLMUL, FeatureF16C, FeatureLZCNT, FeaturePOPCNT, FeatureBMI, FeatureBMI2, - FeatureTBM, FeatureFMA, FeatureFSGSBase]>; + FeatureTBM, FeatureFMA, FeatureSSE4A, + FeatureFSGSBase]>; def : Proc<"geode", [Feature3DNowA]>; diff --git a/lib/Target/X86/X86AsmPrinter.cpp b/lib/Target/X86/X86AsmPrinter.cpp index 1dca568..4e5b7b8 100644 --- a/lib/Target/X86/X86AsmPrinter.cpp +++ b/lib/Target/X86/X86AsmPrinter.cpp 
@@ -18,6 +18,7 @@ #include "X86InstrInfo.h" #include "X86MachineFunctionInfo.h" #include "llvm/ADT/SmallString.h" +#include "llvm/CodeGen/MachineConstantPool.h" #include "llvm/CodeGen/MachineModuleInfoImpls.h" #include "llvm/CodeGen/MachineValueType.h" #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" @@ -29,6 +30,7 @@ #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCSectionCOFF.h" #include "llvm/MC/MCSectionMachO.h" #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSymbol.h" @@ -45,6 +47,8 @@ using namespace llvm; /// runOnMachineFunction - Emit the function body. /// bool X86AsmPrinter::runOnMachineFunction(MachineFunction &MF) { + SMShadowTracker.startFunction(MF); + SetupMachineFunction(MF); if (Subtarget->isTargetCOFF()) { @@ -549,6 +553,28 @@ emitNonLazySymbolPointer(MCStreamer &OutStreamer, MCSymbol *StubLabel, 4 /*size*/); } +MCSymbol *X86AsmPrinter::GetCPISymbol(unsigned CPID) const { + if (Subtarget->isTargetKnownWindowsMSVC()) { + const MachineConstantPoolEntry &CPE = + MF->getConstantPool()->getConstants()[CPID]; + if (!CPE.isMachineConstantPoolEntry()) { + SectionKind Kind = + CPE.getSectionKind(TM.getSubtargetImpl()->getDataLayout()); + const Constant *C = CPE.Val.ConstVal; + if (const MCSectionCOFF *S = dyn_cast<MCSectionCOFF>( + getObjFileLowering().getSectionForConstant(Kind, C))) { + if (MCSymbol *Sym = S->getCOMDATSymbol()) { + if (Sym->isUndefined()) + OutStreamer.EmitSymbolAttribute(Sym, MCSA_Global); + return Sym; + } + } + } + } + + return AsmPrinter::GetCPISymbol(CPID); +} + void X86AsmPrinter::GenerateExportDirective(const MCSymbol *Sym, bool IsData) { SmallString<128> Directive; raw_svector_ostream OS(Directive); @@ -703,7 +729,7 @@ void X86AsmPrinter::EmitEndOfAsmFile(Module &M) { MachineModuleInfoELF::SymbolListTy Stubs = MMIELF.GetGVStubList(); if (!Stubs.empty()) { OutStreamer.SwitchSection(TLOFELF.getDataRelSection()); - const DataLayout *TD = TM.getDataLayout(); + const DataLayout *TD = TM.getSubtargetImpl()->getDataLayout(); for (const auto &Stub : Stubs) { OutStreamer.EmitLabel(Stub.first); @@ -712,6 +738,8 @@ void X86AsmPrinter::EmitEndOfAsmFile(Module &M) { } Stubs.clear(); } + + SM.serializeToStackMapSection(); } } diff --git a/lib/Target/X86/X86AsmPrinter.h b/lib/Target/X86/X86AsmPrinter.h index e4eef5d..748b948 100644 --- a/lib/Target/X86/X86AsmPrinter.h +++ b/lib/Target/X86/X86AsmPrinter.h @@ -7,14 +7,19 @@ // //===----------------------------------------------------------------------===// -#ifndef X86ASMPRINTER_H -#define X86ASMPRINTER_H +#ifndef LLVM_LIB_TARGET_X86_X86ASMPRINTER_H +#define LLVM_LIB_TARGET_X86_X86ASMPRINTER_H #include "X86Subtarget.h" #include "llvm/CodeGen/AsmPrinter.h" #include "llvm/CodeGen/StackMaps.h" #include "llvm/Target/TargetMachine.h" +// Implemented in X86MCInstLower.cpp +namespace { + class X86MCInstLower; +} + namespace llvm { class MCStreamer; class MCSymbol; @@ -25,9 +30,63 @@ class LLVM_LIBRARY_VISIBILITY X86AsmPrinter : public AsmPrinter { void GenerateExportDirective(const MCSymbol *Sym, bool IsData); + // This utility class tracks the length of a stackmap instruction's 'shadow'. + // It is used by the X86AsmPrinter to ensure that the stackmap shadow + // invariants (i.e. no other stackmaps, patchpoints, or control flow within + // the shadow) are met, while outputting a minimal number of NOPs for padding. + // + // To minimise the number of NOPs used, the shadow tracker counts the number + // of instruction bytes output since the last stackmap. 
Only if there are too + // few instruction bytes to cover the shadow are NOPs used for padding. + class StackMapShadowTracker { + public: + StackMapShadowTracker(TargetMachine &TM); + ~StackMapShadowTracker(); + void startFunction(MachineFunction &MF); + void count(MCInst &Inst, const MCSubtargetInfo &STI); + + // Called to signal the start of a shadow of RequiredSize bytes. + void reset(unsigned RequiredSize) { + RequiredShadowSize = RequiredSize; + CurrentShadowSize = 0; + InShadow = true; + } + + // Called before every stackmap/patchpoint, and at the end of basic blocks, + // to emit any necessary padding-NOPs. + void emitShadowPadding(MCStreamer &OutStreamer, const MCSubtargetInfo &STI); + private: + TargetMachine &TM; + std::unique_ptr<MCCodeEmitter> CodeEmitter; + bool InShadow; + + // RequiredShadowSize holds the length of the shadow specified in the most + // recently encountered STACKMAP instruction. + // CurrentShadowSize counts the number of bytes encoded since the most + // recently encountered STACKMAP, stopping when that number is greater than + // or equal to RequiredShadowSize. + unsigned RequiredShadowSize, CurrentShadowSize; + }; + + StackMapShadowTracker SMShadowTracker; + + // All instructions emitted by the X86AsmPrinter should use this helper + // method. + // + // This helper function invokes the SMShadowTracker on each instruction before + // outputting it to the OutStream. This allows the shadow tracker to minimise + // the number of NOPs used for stackmap padding. + void EmitAndCountInstruction(MCInst &Inst); + + void InsertStackMapShadows(MachineFunction &MF); + void LowerSTACKMAP(const MachineInstr &MI); + void LowerPATCHPOINT(const MachineInstr &MI); + + void LowerTlsAddr(X86MCInstLower &MCInstLowering, const MachineInstr &MI); + public: explicit X86AsmPrinter(TargetMachine &TM, MCStreamer &Streamer) - : AsmPrinter(TM, Streamer), SM(*this) { + : AsmPrinter(TM, Streamer), SM(*this), SMShadowTracker(TM) { Subtarget = &TM.getSubtarget<X86Subtarget>(); } @@ -43,6 +102,10 @@ class LLVM_LIBRARY_VISIBILITY X86AsmPrinter : public AsmPrinter { void EmitInstruction(const MachineInstr *MI) override; + void EmitBasicBlockEnd(const MachineBasicBlock &MBB) override { + SMShadowTracker.emitShadowPadding(OutStreamer, getSubtargetInfo()); + } + bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, unsigned AsmVariant, const char *ExtraCode, raw_ostream &OS) override; @@ -50,6 +113,15 @@ class LLVM_LIBRARY_VISIBILITY X86AsmPrinter : public AsmPrinter { unsigned AsmVariant, const char *ExtraCode, raw_ostream &OS) override; + /// \brief Return the symbol for the specified constant pool entry. + MCSymbol *GetCPISymbol(unsigned CPID) const override; + + bool doInitialization(Module &M) override { + SMShadowTracker.reset(0); + SM.reset(); + return AsmPrinter::doInitialization(M); + } + bool runOnMachineFunction(MachineFunction &F) override; }; diff --git a/lib/Target/X86/X86AtomicExpandPass.cpp b/lib/Target/X86/X86AtomicExpandPass.cpp deleted file mode 100644 index 61eefbb..0000000 --- a/lib/Target/X86/X86AtomicExpandPass.cpp +++ /dev/null @@ -1,287 +0,0 @@ -//===-- X86AtomicExpandPass.cpp - Expand illegal atomic instructions --0---===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
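Illustrative sketch, not part of this diff: the StackMapShadowTracker comment in the X86AsmPrinter.h hunk above describes counting the instruction bytes emitted since the last STACKMAP and padding with NOPs only when too few bytes have covered the required shadow. A minimal stand-alone C++ model of that bookkeeping, using hypothetical names (ShadowTracker, paddingNeeded), could look like this:

#include <cstdio>

struct ShadowTracker {
  unsigned Required = 0;  // shadow size requested by the most recent STACKMAP
  unsigned Current  = 0;  // instruction bytes encoded since that STACKMAP
  bool InShadow     = false;

  void reset(unsigned RequiredSize) {          // a STACKMAP opened a new shadow
    Required = RequiredSize; Current = 0; InShadow = true;
  }
  void count(unsigned InstBytes) {             // called for every instruction emitted
    if (InShadow && (Current += InstBytes) >= Required)
      InShadow = false;                        // shadow already covered, no NOPs owed
  }
  unsigned paddingNeeded() const {             // NOP bytes still owed at a flush point
    return InShadow ? Required - Current : 0;
  }
};

int main() {
  ShadowTracker T;
  T.reset(8);   // STACKMAP asked for an 8-byte shadow
  T.count(5);   // a single 5-byte instruction followed it
  std::printf("pad with %u NOP bytes\n", T.paddingNeeded()); // prints 3
}
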
-// -//===----------------------------------------------------------------------===// -// -// This file contains a pass (at IR level) to replace atomic instructions which -// cannot be implemented as a single instruction with cmpxchg-based loops. -// -//===----------------------------------------------------------------------===// - -#include "X86.h" -#include "X86TargetMachine.h" -#include "llvm/CodeGen/Passes.h" -#include "llvm/IR/Function.h" -#include "llvm/IR/IRBuilder.h" -#include "llvm/IR/Instructions.h" -#include "llvm/IR/Intrinsics.h" -#include "llvm/IR/Module.h" -#include "llvm/Support/Debug.h" -#include "llvm/Target/TargetLowering.h" -#include "llvm/Target/TargetMachine.h" -using namespace llvm; - -#define DEBUG_TYPE "x86-atomic-expand" - -namespace { - class X86AtomicExpandPass : public FunctionPass { - const X86TargetMachine *TM; - public: - static char ID; // Pass identification, replacement for typeid - explicit X86AtomicExpandPass(const X86TargetMachine *TM) - : FunctionPass(ID), TM(TM) {} - - bool runOnFunction(Function &F) override; - bool expandAtomicInsts(Function &F); - - bool needsCmpXchgNb(Type *MemType); - - /// There are four kinds of atomic operations. Two never need expanding: - /// cmpxchg is what we expand the others *to*, and loads are easily handled - /// by ISelLowering. Atomicrmw and store can need expanding in some - /// circumstances. - bool shouldExpand(Instruction *Inst); - - /// 128-bit atomic stores (64-bit on i686) need to be implemented in terms - /// of trivial cmpxchg16b loops. A simple store isn't necessarily atomic. - bool shouldExpandStore(StoreInst *SI); - - /// Only some atomicrmw instructions need expanding -- some operations - /// (e.g. max) have absolutely no architectural support; some (e.g. or) have - /// limited support but can't return the previous value; some (e.g. add) - /// have complete support in the instruction set. - /// - /// Also, naturally, 128-bit operations always need to be expanded. - bool shouldExpandAtomicRMW(AtomicRMWInst *AI); - - bool expandAtomicRMW(AtomicRMWInst *AI); - bool expandAtomicStore(StoreInst *SI); - }; -} - -char X86AtomicExpandPass::ID = 0; - -FunctionPass *llvm::createX86AtomicExpandPass(const X86TargetMachine *TM) { - return new X86AtomicExpandPass(TM); -} - -bool X86AtomicExpandPass::runOnFunction(Function &F) { - SmallVector<Instruction *, 1> AtomicInsts; - - // Changing control-flow while iterating through it is a bad idea, so gather a - // list of all atomic instructions before we start. - for (BasicBlock &BB : F) - for (Instruction &Inst : BB) { - if (isa<AtomicRMWInst>(&Inst) || - (isa<StoreInst>(&Inst) && cast<StoreInst>(&Inst)->isAtomic())) - AtomicInsts.push_back(&Inst); - } - - bool MadeChange = false; - for (Instruction *Inst : AtomicInsts) { - if (!shouldExpand(Inst)) - continue; - - if (AtomicRMWInst *AI = dyn_cast<AtomicRMWInst>(Inst)) - MadeChange |= expandAtomicRMW(AI); - if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) - MadeChange |= expandAtomicStore(SI); - - assert(MadeChange && "Atomic inst not expanded when it should be?"); - Inst->eraseFromParent(); - } - - return MadeChange; -} - -/// Returns true if operations on the given type will need to use either -/// cmpxchg8b or cmpxchg16b. This occurs if the type is 1 step up from the -/// native width, and the instructions are available (otherwise we leave them -/// alone to become __sync_fetch_and_... calls). 
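Illustrative sketch, not part of this diff: the pass being deleted here expands atomicrmw operations that have no single x86 instruction (nand, min/max, and any access wider than the native width) into a compare-and-exchange loop. The same loop shape, written against std::atomic rather than LLVM IR and with a hypothetical helper name atomic_nand, is:

#include <atomic>
#include <cstdint>
#include <cstdio>

// Returns the previous value, matching atomicrmw semantics.
static uint32_t atomic_nand(std::atomic<uint32_t> &A, uint32_t Inc) {
  uint32_t Loaded = A.load();                       // %init_loaded
  uint32_t New;
  do {
    New = ~(Loaded & Inc);                          // the "some_op" step
  } while (!A.compare_exchange_weak(Loaded, New));  // cmpxchg; on failure, Loaded is refreshed
  return Loaded;                                    // value observed by the successful cmpxchg
}

int main() {
  std::atomic<uint32_t> A{0xF0F0F0F0u};
  uint32_t Old = atomic_nand(A, 0xFFFF0000u);
  std::printf("old=%#x new=%#x\n", Old, A.load());
}
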
-bool X86AtomicExpandPass::needsCmpXchgNb(llvm::Type *MemType) { - const X86Subtarget &Subtarget = TM->getSubtarget<X86Subtarget>(); - if (!Subtarget.hasCmpxchg16b()) - return false; - - unsigned CmpXchgNbWidth = Subtarget.is64Bit() ? 128 : 64; - - unsigned OpWidth = MemType->getPrimitiveSizeInBits(); - if (OpWidth == CmpXchgNbWidth) - return true; - - return false; -} - - -bool X86AtomicExpandPass::shouldExpandAtomicRMW(AtomicRMWInst *AI) { - const X86Subtarget &Subtarget = TM->getSubtarget<X86Subtarget>(); - unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32; - - if (needsCmpXchgNb(AI->getType())) - return true; - - if (AI->getType()->getPrimitiveSizeInBits() > NativeWidth) - return false; - - AtomicRMWInst::BinOp Op = AI->getOperation(); - switch (Op) { - default: - llvm_unreachable("Unknown atomic operation"); - case AtomicRMWInst::Xchg: - case AtomicRMWInst::Add: - case AtomicRMWInst::Sub: - // It's better to use xadd, xsub or xchg for these in all cases. - return false; - case AtomicRMWInst::Or: - case AtomicRMWInst::And: - case AtomicRMWInst::Xor: - // If the atomicrmw's result isn't actually used, we can just add a "lock" - // prefix to a normal instruction for these operations. - return !AI->use_empty(); - case AtomicRMWInst::Nand: - case AtomicRMWInst::Max: - case AtomicRMWInst::Min: - case AtomicRMWInst::UMax: - case AtomicRMWInst::UMin: - // These always require a non-trivial set of data operations on x86. We must - // use a cmpxchg loop. - return true; - } -} - -bool X86AtomicExpandPass::shouldExpandStore(StoreInst *SI) { - if (needsCmpXchgNb(SI->getValueOperand()->getType())) - return true; - - return false; -} - -bool X86AtomicExpandPass::shouldExpand(Instruction *Inst) { - if (AtomicRMWInst *AI = dyn_cast<AtomicRMWInst>(Inst)) - return shouldExpandAtomicRMW(AI); - if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) - return shouldExpandStore(SI); - return false; -} - -/// Emit IR to implement the given atomicrmw operation on values in registers, -/// returning the new value. -static Value *performAtomicOp(AtomicRMWInst::BinOp Op, IRBuilder<> &Builder, - Value *Loaded, Value *Inc) { - Value *NewVal; - switch (Op) { - case AtomicRMWInst::Xchg: - return Inc; - case AtomicRMWInst::Add: - return Builder.CreateAdd(Loaded, Inc, "new"); - case AtomicRMWInst::Sub: - return Builder.CreateSub(Loaded, Inc, "new"); - case AtomicRMWInst::And: - return Builder.CreateAnd(Loaded, Inc, "new"); - case AtomicRMWInst::Nand: - return Builder.CreateNot(Builder.CreateAnd(Loaded, Inc), "new"); - case AtomicRMWInst::Or: - return Builder.CreateOr(Loaded, Inc, "new"); - case AtomicRMWInst::Xor: - return Builder.CreateXor(Loaded, Inc, "new"); - case AtomicRMWInst::Max: - NewVal = Builder.CreateICmpSGT(Loaded, Inc); - return Builder.CreateSelect(NewVal, Loaded, Inc, "new"); - case AtomicRMWInst::Min: - NewVal = Builder.CreateICmpSLE(Loaded, Inc); - return Builder.CreateSelect(NewVal, Loaded, Inc, "new"); - case AtomicRMWInst::UMax: - NewVal = Builder.CreateICmpUGT(Loaded, Inc); - return Builder.CreateSelect(NewVal, Loaded, Inc, "new"); - case AtomicRMWInst::UMin: - NewVal = Builder.CreateICmpULE(Loaded, Inc); - return Builder.CreateSelect(NewVal, Loaded, Inc, "new"); - default: - break; - } - llvm_unreachable("Unknown atomic op"); -} - -bool X86AtomicExpandPass::expandAtomicRMW(AtomicRMWInst *AI) { - AtomicOrdering Order = - AI->getOrdering() == Unordered ? 
Monotonic : AI->getOrdering(); - Value *Addr = AI->getPointerOperand(); - BasicBlock *BB = AI->getParent(); - Function *F = BB->getParent(); - LLVMContext &Ctx = F->getContext(); - - // Given: atomicrmw some_op iN* %addr, iN %incr ordering - // - // The standard expansion we produce is: - // [...] - // %init_loaded = load atomic iN* %addr - // br label %loop - // loop: - // %loaded = phi iN [ %init_loaded, %entry ], [ %new_loaded, %loop ] - // %new = some_op iN %loaded, %incr - // %pair = cmpxchg iN* %addr, iN %loaded, iN %new - // %new_loaded = extractvalue { iN, i1 } %pair, 0 - // %success = extractvalue { iN, i1 } %pair, 1 - // br i1 %success, label %atomicrmw.end, label %loop - // atomicrmw.end: - // [...] - BasicBlock *ExitBB = BB->splitBasicBlock(AI, "atomicrmw.end"); - BasicBlock *LoopBB = BasicBlock::Create(Ctx, "atomicrmw.start", F, ExitBB); - - // This grabs the DebugLoc from AI. - IRBuilder<> Builder(AI); - - // The split call above "helpfully" added a branch at the end of BB (to the - // wrong place), but we want a load. It's easiest to just remove - // the branch entirely. - std::prev(BB->end())->eraseFromParent(); - Builder.SetInsertPoint(BB); - LoadInst *InitLoaded = Builder.CreateLoad(Addr); - InitLoaded->setAlignment(AI->getType()->getPrimitiveSizeInBits()); - Builder.CreateBr(LoopBB); - - // Start the main loop block now that we've taken care of the preliminaries. - Builder.SetInsertPoint(LoopBB); - PHINode *Loaded = Builder.CreatePHI(AI->getType(), 2, "loaded"); - Loaded->addIncoming(InitLoaded, BB); - - Value *NewVal = - performAtomicOp(AI->getOperation(), Builder, Loaded, AI->getValOperand()); - - Value *Pair = Builder.CreateAtomicCmpXchg( - Addr, Loaded, NewVal, Order, - AtomicCmpXchgInst::getStrongestFailureOrdering(Order)); - Value *NewLoaded = Builder.CreateExtractValue(Pair, 0, "newloaded"); - Loaded->addIncoming(NewLoaded, LoopBB); - - Value *Success = Builder.CreateExtractValue(Pair, 1, "success"); - Builder.CreateCondBr(Success, ExitBB, LoopBB); - - AI->replaceAllUsesWith(NewLoaded); - - return true; -} - -bool X86AtomicExpandPass::expandAtomicStore(StoreInst *SI) { - // An atomic store might need cmpxchg16b (or 8b on x86) to execute. Express - // this in terms of the usual expansion to "atomicrmw xchg". - IRBuilder<> Builder(SI); - AtomicOrdering Order = - SI->getOrdering() == Unordered ? Monotonic : SI->getOrdering(); - AtomicRMWInst *AI = - Builder.CreateAtomicRMW(AtomicRMWInst::Xchg, SI->getPointerOperand(), - SI->getValueOperand(), Order); - - // Now we have an appropriate swap instruction, lower it as usual. - if (shouldExpandAtomicRMW(AI)) { - expandAtomicRMW(AI); - AI->eraseFromParent(); - return true; - } - - return AI; -} diff --git a/lib/Target/X86/X86CallingConv.h b/lib/Target/X86/X86CallingConv.h index e76f9fd..0eb2494 100644 --- a/lib/Target/X86/X86CallingConv.h +++ b/lib/Target/X86/X86CallingConv.h @@ -12,14 +12,27 @@ // //===----------------------------------------------------------------------===// -#ifndef X86CALLINGCONV_H -#define X86CALLINGCONV_H +#ifndef LLVM_LIB_TARGET_X86_X86CALLINGCONV_H +#define LLVM_LIB_TARGET_X86_X86CALLINGCONV_H #include "llvm/CodeGen/CallingConvLower.h" #include "llvm/IR/CallingConv.h" namespace llvm { +inline bool CC_X86_32_VectorCallIndirect(unsigned &ValNo, MVT &ValVT, + MVT &LocVT, + CCValAssign::LocInfo &LocInfo, + ISD::ArgFlagsTy &ArgFlags, + CCState &State) { + // Similar to CCPassIndirect, with the addition of inreg. 
+ LocVT = MVT::i32; + LocInfo = CCValAssign::Indirect; + ArgFlags.setInReg(); + return false; // Continue the search, but now for i32. +} + + inline bool CC_X86_AnyReg_Error(unsigned &, MVT &, MVT &, CCValAssign::LocInfo &, ISD::ArgFlagsTy &, CCState &) { diff --git a/lib/Target/X86/X86CallingConv.td b/lib/Target/X86/X86CallingConv.td index 0824d4e..75a2ec0 100644 --- a/lib/Target/X86/X86CallingConv.td +++ b/lib/Target/X86/X86CallingConv.td @@ -14,7 +14,9 @@ /// CCIfSubtarget - Match if the current subtarget has a feature F. class CCIfSubtarget<string F, CCAction A> - : CCIf<!strconcat("State.getTarget().getSubtarget<X86Subtarget>().", F), A>; + : CCIf<!strconcat("static_cast<const X86Subtarget&>" + "(State.getMachineFunction().getSubtarget()).", F), + A>; //===----------------------------------------------------------------------===// // Return Value Calling Conventions @@ -52,27 +54,27 @@ def RetCC_X86Common : CallingConv<[ // 512-bit vectors are returned in ZMM0 and ZMM1, when they fit. ZMM2 and ZMM3 // can only be used by ABI non-compliant code. This vector type is only // supported while using the AVX-512 target feature. - CCIfType<[v16i32, v8i64, v16f32, v8f64], + CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64], CCAssignToReg<[ZMM0,ZMM1,ZMM2,ZMM3]>>, // MMX vector types are always returned in MM0. If the target doesn't have // MM0, it doesn't support these vector types. CCIfType<[x86mmx], CCAssignToReg<[MM0]>>, - // Long double types are always returned in ST0 (even with SSE). - CCIfType<[f80], CCAssignToReg<[ST0, ST1]>> + // Long double types are always returned in FP0 (even with SSE). + CCIfType<[f80], CCAssignToReg<[FP0, FP1]>> ]>; // X86-32 C return-value convention. def RetCC_X86_32_C : CallingConv<[ - // The X86-32 calling convention returns FP values in ST0, unless marked + // The X86-32 calling convention returns FP values in FP0, unless marked // with "inreg" (used here to distinguish one kind of reg from another, // weirdly; this is really the sse-regparm calling convention) in which // case they use XMM0, otherwise it is the same as the common X86 calling // conv. CCIfInReg<CCIfSubtarget<"hasSSE2()", CCIfType<[f32, f64], CCAssignToReg<[XMM0,XMM1,XMM2]>>>>, - CCIfType<[f32,f64], CCAssignToReg<[ST0, ST1]>>, + CCIfType<[f32,f64], CCAssignToReg<[FP0, FP1]>>, CCDelegateTo<RetCC_X86Common> ]>; @@ -122,6 +124,24 @@ def RetCC_X86_32_HiPE : CallingConv<[ CCIfType<[i32], CCAssignToReg<[ESI, EBP, EAX, EDX]>> ]>; +// X86-32 HiPE return-value convention. +def RetCC_X86_32_VectorCall : CallingConv<[ + // Vector types are returned in XMM0,XMM1,XMMM2 and XMM3. + CCIfType<[f32, f64, v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], + CCAssignToReg<[XMM0,XMM1,XMM2,XMM3]>>, + + // 256-bit FP vectors + CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], + CCAssignToReg<[YMM0,YMM1,YMM2,YMM3]>>, + + // 512-bit FP vectors + CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64], + CCAssignToReg<[ZMM0,ZMM1,ZMM2,ZMM3]>>, + + // Return integers in the standard way. + CCDelegateTo<RetCC_X86Common> +]>; + // X86-64 C return-value convention. def RetCC_X86_64_C : CallingConv<[ // The X86-64 calling convention always returns FP values in XMM0. @@ -177,6 +197,7 @@ def RetCC_X86_32 : CallingConv<[ CCIfCC<"CallingConv::Fast", CCDelegateTo<RetCC_X86_32_Fast>>, // If HiPE, use RetCC_X86_32_HiPE. CCIfCC<"CallingConv::HiPE", CCDelegateTo<RetCC_X86_32_HiPE>>, + CCIfCC<"CallingConv::X86_VectorCall", CCDelegateTo<RetCC_X86_32_VectorCall>>, // Otherwise, use RetCC_X86_32_C. 
CCDelegateTo<RetCC_X86_32_C> @@ -224,6 +245,7 @@ def CC_X86_64_C : CallingConv<[ CCIfType<[i8, i16], CCPromoteToType<i32>>, // The 'nest' parameter, if any, is passed in R10. + CCIfNest<CCIfSubtarget<"isTarget64BitILP32()", CCAssignToReg<[R10D]>>>, CCIfNest<CCAssignToReg<[R10]>>, // The first 6 integer arguments are passed in integer registers. @@ -252,7 +274,7 @@ def CC_X86_64_C : CallingConv<[ YMM4, YMM5, YMM6, YMM7]>>>>, // The first 8 512-bit vector arguments are passed in ZMM registers. - CCIfNotVarArg<CCIfType<[v16i32, v8i64, v16f32, v8f64], + CCIfNotVarArg<CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64], CCIfSubtarget<"hasAVX512()", CCAssignToReg<[ZMM0, ZMM1, ZMM2, ZMM3, ZMM4, ZMM5, ZMM6, ZMM7]>>>>, @@ -327,6 +349,25 @@ def CC_X86_Win64_C : CallingConv<[ CCIfType<[f80], CCAssignToStack<0, 0>> ]>; +def CC_X86_Win64_VectorCall : CallingConv<[ + // The first 6 floating point and vector types of 128 bits or less use + // XMM0-XMM5. + CCIfType<[f32, f64, v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], + CCAssignToReg<[XMM0, XMM1, XMM2, XMM3, XMM4, XMM5]>>, + + // 256-bit vectors use YMM registers. + CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], + CCAssignToReg<[YMM0, YMM1, YMM2, YMM3, YMM4, YMM5]>>, + + // 512-bit vectors use ZMM registers. + CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64], + CCAssignToReg<[ZMM0, ZMM1, ZMM2, ZMM3, ZMM4, ZMM5]>>, + + // Delegate to fastcall to handle integer types. + CCDelegateTo<CC_X86_Win64_C> +]>; + + def CC_X86_64_GHC : CallingConv<[ // Promote i8/i16/i32 arguments to i64. CCIfType<[i8, i16, i32], CCPromoteToType<i64>>, @@ -460,6 +501,30 @@ def CC_X86_32_FastCall : CallingConv<[ CCDelegateTo<CC_X86_32_Common> ]>; +def CC_X86_32_VectorCall : CallingConv<[ + // The first 6 floating point and vector types of 128 bits or less use + // XMM0-XMM5. + CCIfType<[f32, f64, v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], + CCAssignToReg<[XMM0, XMM1, XMM2, XMM3, XMM4, XMM5]>>, + + // 256-bit vectors use YMM registers. + CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], + CCAssignToReg<[YMM0, YMM1, YMM2, YMM3, YMM4, YMM5]>>, + + // 512-bit vectors use ZMM registers. + CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64], + CCAssignToReg<[ZMM0, ZMM1, ZMM2, ZMM3, ZMM4, ZMM5]>>, + + // Otherwise, pass it indirectly. + CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64, + v32i8, v16i16, v8i32, v4i64, v8f32, v4f64, + v64i8, v32i16, v16i32, v8i64, v16f32, v8f64], + CCCustom<"CC_X86_32_VectorCallIndirect">>, + + // Delegate to fastcall to handle integer types. + CCDelegateTo<CC_X86_32_FastCall> +]>; + def CC_X86_32_ThisCall_Common : CallingConv<[ // The first integer argument is passed in ECX CCIfType<[i32], CCAssignToReg<[ECX]>>, @@ -573,6 +638,7 @@ def CC_Intel_OCL_BI : CallingConv<[ // This is the root argument convention for the X86-32 backend. 
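Illustrative example, not part of this diff: the new RetCC_X86_32_VectorCall, CC_X86_32_VectorCall and CC_X86_Win64_VectorCall tables above wire up the MSVC __vectorcall convention, in which the first vector or floating-point arguments travel in XMM0-XMM5 (YMM/ZMM for wider types) and further vectors are passed indirectly. A minimal piece of source that selects this convention when compiled for a Windows x86 target with clang or MSVC:

#include <immintrin.h>

// a, b and c arrive in XMM0, XMM1 and XMM2 under the tables added above.
__m128 __vectorcall madd4(__m128 a, __m128 b, __m128 c) {
  return _mm_add_ps(_mm_mul_ps(a, b), c);
}
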
def CC_X86_32 : CallingConv<[ CCIfCC<"CallingConv::X86_FastCall", CCDelegateTo<CC_X86_32_FastCall>>, + CCIfCC<"CallingConv::X86_VectorCall", CCDelegateTo<CC_X86_32_VectorCall>>, CCIfCC<"CallingConv::X86_ThisCall", CCDelegateTo<CC_X86_32_ThisCall>>, CCIfCC<"CallingConv::Fast", CCDelegateTo<CC_X86_32_FastCC>>, CCIfCC<"CallingConv::GHC", CCDelegateTo<CC_X86_32_GHC>>, @@ -590,6 +656,7 @@ def CC_X86_64 : CallingConv<[ CCIfCC<"CallingConv::AnyReg", CCDelegateTo<CC_X86_64_AnyReg>>, CCIfCC<"CallingConv::X86_64_Win64", CCDelegateTo<CC_X86_Win64_C>>, CCIfCC<"CallingConv::X86_64_SysV", CCDelegateTo<CC_X86_64_C>>, + CCIfCC<"CallingConv::X86_VectorCall", CCDelegateTo<CC_X86_Win64_VectorCall>>, // Mingw64 and native Win64 use Win64 CC CCIfSubtarget<"isTargetWin64()", CCDelegateTo<CC_X86_Win64_C>>, diff --git a/lib/Target/X86/X86CodeEmitter.cpp b/lib/Target/X86/X86CodeEmitter.cpp deleted file mode 100644 index a3ae7ee..0000000 --- a/lib/Target/X86/X86CodeEmitter.cpp +++ /dev/null @@ -1,1498 +0,0 @@ -//===-- X86CodeEmitter.cpp - Convert X86 code to machine code -------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file contains the pass that transforms the X86 machine instructions into -// relocatable machine code. -// -//===----------------------------------------------------------------------===// - -#include "X86.h" -#include "X86InstrInfo.h" -#include "X86JITInfo.h" -#include "X86Relocations.h" -#include "X86Subtarget.h" -#include "X86TargetMachine.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/CodeGen/JITCodeEmitter.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineInstr.h" -#include "llvm/CodeGen/MachineModuleInfo.h" -#include "llvm/CodeGen/Passes.h" -#include "llvm/IR/LLVMContext.h" -#include "llvm/MC/MCCodeEmitter.h" -#include "llvm/MC/MCExpr.h" -#include "llvm/MC/MCInst.h" -#include "llvm/PassManager.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetOptions.h" -using namespace llvm; - -#define DEBUG_TYPE "x86-emitter" - -STATISTIC(NumEmitted, "Number of machine instructions emitted"); - -namespace { - template<class CodeEmitter> - class Emitter : public MachineFunctionPass { - const X86InstrInfo *II; - const DataLayout *TD; - X86TargetMachine &TM; - CodeEmitter &MCE; - MachineModuleInfo *MMI; - intptr_t PICBaseOffset; - bool Is64BitMode; - bool IsPIC; - public: - static char ID; - explicit Emitter(X86TargetMachine &tm, CodeEmitter &mce) - : MachineFunctionPass(ID), II(nullptr), TD(nullptr), TM(tm), - MCE(mce), PICBaseOffset(0), Is64BitMode(false), - IsPIC(TM.getRelocationModel() == Reloc::PIC_) {} - - bool runOnMachineFunction(MachineFunction &MF) override; - - const char *getPassName() const override { - return "X86 Machine Code Emitter"; - } - - void emitOpcodePrefix(uint64_t TSFlags, int MemOperand, - const MachineInstr &MI, - const MCInstrDesc *Desc) const; - - void emitVEXOpcodePrefix(uint64_t TSFlags, int MemOperand, - const MachineInstr &MI, - const MCInstrDesc *Desc) const; - - void emitSegmentOverridePrefix(uint64_t TSFlags, - int MemOperand, - const MachineInstr &MI) const; - - void emitInstruction(MachineInstr &MI, const MCInstrDesc *Desc); - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.setPreservesAll(); - 
AU.addRequired<MachineModuleInfo>(); - MachineFunctionPass::getAnalysisUsage(AU); - } - - private: - void emitPCRelativeBlockAddress(MachineBasicBlock *MBB); - void emitGlobalAddress(const GlobalValue *GV, unsigned Reloc, - intptr_t Disp = 0, intptr_t PCAdj = 0, - bool Indirect = false); - void emitExternalSymbolAddress(const char *ES, unsigned Reloc); - void emitConstPoolAddress(unsigned CPI, unsigned Reloc, intptr_t Disp = 0, - intptr_t PCAdj = 0); - void emitJumpTableAddress(unsigned JTI, unsigned Reloc, - intptr_t PCAdj = 0); - - void emitDisplacementField(const MachineOperand *RelocOp, int DispVal, - intptr_t Adj = 0, bool IsPCRel = true); - - void emitRegModRMByte(unsigned ModRMReg, unsigned RegOpcodeField); - void emitRegModRMByte(unsigned RegOpcodeField); - void emitSIBByte(unsigned SS, unsigned Index, unsigned Base); - void emitConstant(uint64_t Val, unsigned Size); - - void emitMemModRMByte(const MachineInstr &MI, - unsigned Op, unsigned RegOpcodeField, - intptr_t PCAdj = 0); - - unsigned getX86RegNum(unsigned RegNo) const { - const TargetRegisterInfo *TRI = TM.getRegisterInfo(); - return TRI->getEncodingValue(RegNo) & 0x7; - } - - unsigned char getVEXRegisterEncoding(const MachineInstr &MI, - unsigned OpNum) const; - }; - -template<class CodeEmitter> - char Emitter<CodeEmitter>::ID = 0; -} // end anonymous namespace. - -/// createX86CodeEmitterPass - Return a pass that emits the collected X86 code -/// to the specified JITCodeEmitter object. -FunctionPass *llvm::createX86JITCodeEmitterPass(X86TargetMachine &TM, - JITCodeEmitter &JCE) { - return new Emitter<JITCodeEmitter>(TM, JCE); -} - -template<class CodeEmitter> -bool Emitter<CodeEmitter>::runOnMachineFunction(MachineFunction &MF) { - MMI = &getAnalysis<MachineModuleInfo>(); - MCE.setModuleInfo(MMI); - - II = TM.getInstrInfo(); - TD = TM.getDataLayout(); - Is64BitMode = TM.getSubtarget<X86Subtarget>().is64Bit(); - IsPIC = TM.getRelocationModel() == Reloc::PIC_; - - do { - DEBUG(dbgs() << "JITTing function '" << MF.getName() << "'\n"); - MCE.startFunction(MF); - for (MachineFunction::iterator MBB = MF.begin(), E = MF.end(); - MBB != E; ++MBB) { - MCE.StartMachineBasicBlock(MBB); - for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); - I != E; ++I) { - const MCInstrDesc &Desc = I->getDesc(); - emitInstruction(*I, &Desc); - // MOVPC32r is basically a call plus a pop instruction. - if (Desc.getOpcode() == X86::MOVPC32r) - emitInstruction(*I, &II->get(X86::POP32r)); - ++NumEmitted; // Keep track of the # of mi's emitted - } - } - } while (MCE.finishFunction(MF)); - - return false; -} - -/// determineREX - Determine if the MachineInstr has to be encoded with a X86-64 -/// REX prefix which specifies 1) 64-bit instructions, 2) non-default operand -/// size, and 3) use of X86-64 extended registers. -static unsigned determineREX(const MachineInstr &MI) { - unsigned REX = 0; - const MCInstrDesc &Desc = MI.getDesc(); - - // Pseudo instructions do not need REX prefix byte. - if ((Desc.TSFlags & X86II::FormMask) == X86II::Pseudo) - return 0; - if (Desc.TSFlags & X86II::REX_W) - REX |= 1 << 3; - - unsigned NumOps = Desc.getNumOperands(); - if (NumOps) { - bool isTwoAddr = NumOps > 1 && - Desc.getOperandConstraint(1, MCOI::TIED_TO) != -1; - - // If it accesses SPL, BPL, SIL, or DIL, then it requires a 0x40 REX prefix. - unsigned i = isTwoAddr ? 
1 : 0; - for (unsigned e = NumOps; i != e; ++i) { - const MachineOperand& MO = MI.getOperand(i); - if (MO.isReg()) { - unsigned Reg = MO.getReg(); - if (X86II::isX86_64NonExtLowByteReg(Reg)) - REX |= 0x40; - } - } - - switch (Desc.TSFlags & X86II::FormMask) { - case X86II::MRMSrcReg: { - if (X86InstrInfo::isX86_64ExtendedReg(MI.getOperand(0))) - REX |= 1 << 2; - i = isTwoAddr ? 2 : 1; - for (unsigned e = NumOps; i != e; ++i) { - const MachineOperand& MO = MI.getOperand(i); - if (X86InstrInfo::isX86_64ExtendedReg(MO)) - REX |= 1 << 0; - } - break; - } - case X86II::MRMSrcMem: { - if (X86InstrInfo::isX86_64ExtendedReg(MI.getOperand(0))) - REX |= 1 << 2; - unsigned Bit = 0; - i = isTwoAddr ? 2 : 1; - for (; i != NumOps; ++i) { - const MachineOperand& MO = MI.getOperand(i); - if (MO.isReg()) { - if (X86InstrInfo::isX86_64ExtendedReg(MO)) - REX |= 1 << Bit; - Bit++; - } - } - break; - } - case X86II::MRMXm: - case X86II::MRM0m: case X86II::MRM1m: - case X86II::MRM2m: case X86II::MRM3m: - case X86II::MRM4m: case X86II::MRM5m: - case X86II::MRM6m: case X86II::MRM7m: - case X86II::MRMDestMem: { - unsigned e = (isTwoAddr ? X86::AddrNumOperands+1 : X86::AddrNumOperands); - i = isTwoAddr ? 1 : 0; - if (NumOps > e && X86InstrInfo::isX86_64ExtendedReg(MI.getOperand(e))) - REX |= 1 << 2; - unsigned Bit = 0; - for (; i != e; ++i) { - const MachineOperand& MO = MI.getOperand(i); - if (MO.isReg()) { - if (X86InstrInfo::isX86_64ExtendedReg(MO)) - REX |= 1 << Bit; - Bit++; - } - } - break; - } - default: { - if (X86InstrInfo::isX86_64ExtendedReg(MI.getOperand(0))) - REX |= 1 << 0; - i = isTwoAddr ? 2 : 1; - for (unsigned e = NumOps; i != e; ++i) { - const MachineOperand& MO = MI.getOperand(i); - if (X86InstrInfo::isX86_64ExtendedReg(MO)) - REX |= 1 << 2; - } - break; - } - } - } - return REX; -} - - -/// emitPCRelativeBlockAddress - This method keeps track of the information -/// necessary to resolve the address of this block later and emits a dummy -/// value. -/// -template<class CodeEmitter> -void Emitter<CodeEmitter>::emitPCRelativeBlockAddress(MachineBasicBlock *MBB) { - // Remember where this reference was and where it is to so we can - // deal with it later. - MCE.addRelocation(MachineRelocation::getBB(MCE.getCurrentPCOffset(), - X86::reloc_pcrel_word, MBB)); - MCE.emitWordLE(0); -} - -/// emitGlobalAddress - Emit the specified address to the code stream assuming -/// this is part of a "take the address of a global" instruction. -/// -template<class CodeEmitter> -void Emitter<CodeEmitter>::emitGlobalAddress(const GlobalValue *GV, - unsigned Reloc, - intptr_t Disp /* = 0 */, - intptr_t PCAdj /* = 0 */, - bool Indirect /* = false */) { - intptr_t RelocCST = Disp; - if (Reloc == X86::reloc_picrel_word) - RelocCST = PICBaseOffset; - else if (Reloc == X86::reloc_pcrel_word) - RelocCST = PCAdj; - MachineRelocation MR = Indirect - ? MachineRelocation::getIndirectSymbol(MCE.getCurrentPCOffset(), Reloc, - const_cast<GlobalValue *>(GV), - RelocCST, false) - : MachineRelocation::getGV(MCE.getCurrentPCOffset(), Reloc, - const_cast<GlobalValue *>(GV), RelocCST, false); - MCE.addRelocation(MR); - // The relocated value will be added to the displacement - if (Reloc == X86::reloc_absolute_dword) - MCE.emitDWordLE(Disp); - else - MCE.emitWordLE((int32_t)Disp); -} - -/// emitExternalSymbolAddress - Arrange for the address of an external symbol to -/// be emitted to the current location in the function, and allow it to be PC -/// relative. 
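Illustrative sketch, not part of this diff: the deleted determineREX() above accumulates the REX.W/R/X/B bits for an instruction, and the prefix-emission code further down forms the final byte as 0x40 ORed with those bits. A self-contained restatement of that packing, with the hypothetical name rexPrefix:

#include <cstdint>
#include <cstdio>

// REX prefix layout: 0100 WRXB.
static uint8_t rexPrefix(bool W, bool R, bool X, bool B) {
  return uint8_t(0x40 | (W << 3) | (R << 2) | (X << 1) | unsigned(B));
}

int main() {
  // 64-bit operand size with an extended register in ModRM.reg -> 0x4c.
  std::printf("%#x\n", rexPrefix(true, true, false, false));
}
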
-template<class CodeEmitter> -void Emitter<CodeEmitter>::emitExternalSymbolAddress(const char *ES, - unsigned Reloc) { - intptr_t RelocCST = (Reloc == X86::reloc_picrel_word) ? PICBaseOffset : 0; - - // X86 never needs stubs because instruction selection will always pick - // an instruction sequence that is large enough to hold any address - // to a symbol. - // (see X86ISelLowering.cpp, near 2039: X86TargetLowering::LowerCall) - bool NeedStub = false; - MCE.addRelocation(MachineRelocation::getExtSym(MCE.getCurrentPCOffset(), - Reloc, ES, RelocCST, - 0, NeedStub)); - if (Reloc == X86::reloc_absolute_dword) - MCE.emitDWordLE(0); - else - MCE.emitWordLE(0); -} - -/// emitConstPoolAddress - Arrange for the address of an constant pool -/// to be emitted to the current location in the function, and allow it to be PC -/// relative. -template<class CodeEmitter> -void Emitter<CodeEmitter>::emitConstPoolAddress(unsigned CPI, unsigned Reloc, - intptr_t Disp /* = 0 */, - intptr_t PCAdj /* = 0 */) { - intptr_t RelocCST = 0; - if (Reloc == X86::reloc_picrel_word) - RelocCST = PICBaseOffset; - else if (Reloc == X86::reloc_pcrel_word) - RelocCST = PCAdj; - MCE.addRelocation(MachineRelocation::getConstPool(MCE.getCurrentPCOffset(), - Reloc, CPI, RelocCST)); - // The relocated value will be added to the displacement - if (Reloc == X86::reloc_absolute_dword) - MCE.emitDWordLE(Disp); - else - MCE.emitWordLE((int32_t)Disp); -} - -/// emitJumpTableAddress - Arrange for the address of a jump table to -/// be emitted to the current location in the function, and allow it to be PC -/// relative. -template<class CodeEmitter> -void Emitter<CodeEmitter>::emitJumpTableAddress(unsigned JTI, unsigned Reloc, - intptr_t PCAdj /* = 0 */) { - intptr_t RelocCST = 0; - if (Reloc == X86::reloc_picrel_word) - RelocCST = PICBaseOffset; - else if (Reloc == X86::reloc_pcrel_word) - RelocCST = PCAdj; - MCE.addRelocation(MachineRelocation::getJumpTable(MCE.getCurrentPCOffset(), - Reloc, JTI, RelocCST)); - // The relocated value will be added to the displacement - if (Reloc == X86::reloc_absolute_dword) - MCE.emitDWordLE(0); - else - MCE.emitWordLE(0); -} - -inline static unsigned char ModRMByte(unsigned Mod, unsigned RegOpcode, - unsigned RM) { - assert(Mod < 4 && RegOpcode < 8 && RM < 8 && "ModRM Fields out of range!"); - return RM | (RegOpcode << 3) | (Mod << 6); -} - -template<class CodeEmitter> -void Emitter<CodeEmitter>::emitRegModRMByte(unsigned ModRMReg, - unsigned RegOpcodeFld){ - MCE.emitByte(ModRMByte(3, RegOpcodeFld, getX86RegNum(ModRMReg))); -} - -template<class CodeEmitter> -void Emitter<CodeEmitter>::emitRegModRMByte(unsigned RegOpcodeFld) { - MCE.emitByte(ModRMByte(3, RegOpcodeFld, 0)); -} - -template<class CodeEmitter> -void Emitter<CodeEmitter>::emitSIBByte(unsigned SS, - unsigned Index, - unsigned Base) { - // SIB byte is in the same format as the ModRMByte... - MCE.emitByte(ModRMByte(SS, Index, Base)); -} - -template<class CodeEmitter> -void Emitter<CodeEmitter>::emitConstant(uint64_t Val, unsigned Size) { - // Output the constant in little endian byte order... - for (unsigned i = 0; i != Size; ++i) { - MCE.emitByte(Val & 255); - Val >>= 8; - } -} - -/// isDisp8 - Return true if this signed displacement fits in a 8-bit -/// sign-extended field. 
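Illustrative sketch, not part of this diff: the ModRM/SIB byte-packing used throughout the deleted emitter, together with the isDisp8() test that chooses between the disp8 and disp32 encodings, condenses to the following stand-alone helpers (modRM and fitsInDisp8 are hypothetical names):

#include <cassert>
#include <cstdint>
#include <cstdio>

static uint8_t modRM(unsigned Mod, unsigned RegOpcode, unsigned RM) {
  assert(Mod < 4 && RegOpcode < 8 && RM < 8 && "ModRM fields out of range");
  return uint8_t(RM | (RegOpcode << 3) | (Mod << 6));  // the SIB byte shares this layout
}

static bool fitsInDisp8(int Value) {
  return Value == (signed char)Value;  // survives an 8-bit sign-extension round trip
}

int main() {
  // [EBX + 16] with reg/opcode field 2: Mod=01 (a disp8 follows), r/m=3 (EBX).
  std::printf("modrm=%#x use_disp8=%d\n", modRM(1, 2, 3), fitsInDisp8(16));
}
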
-static bool isDisp8(int Value) { - return Value == (signed char)Value; -} - -static bool gvNeedsNonLazyPtr(const MachineOperand &GVOp, - const TargetMachine &TM) { - // For Darwin-64, simulate the linktime GOT by using the same non-lazy-pointer - // mechanism as 32-bit mode. - if (TM.getSubtarget<X86Subtarget>().is64Bit() && - !TM.getSubtarget<X86Subtarget>().isTargetDarwin()) - return false; - - // Return true if this is a reference to a stub containing the address of the - // global, not the global itself. - return isGlobalStubReference(GVOp.getTargetFlags()); -} - -template<class CodeEmitter> -void Emitter<CodeEmitter>::emitDisplacementField(const MachineOperand *RelocOp, - int DispVal, - intptr_t Adj /* = 0 */, - bool IsPCRel /* = true */) { - // If this is a simple integer displacement that doesn't require a relocation, - // emit it now. - if (!RelocOp) { - emitConstant(DispVal, 4); - return; - } - - // Otherwise, this is something that requires a relocation. Emit it as such - // now. - unsigned RelocType = Is64BitMode ? - (IsPCRel ? X86::reloc_pcrel_word : X86::reloc_absolute_word_sext) - : (IsPIC ? X86::reloc_picrel_word : X86::reloc_absolute_word); - if (RelocOp->isGlobal()) { - // In 64-bit static small code model, we could potentially emit absolute. - // But it's probably not beneficial. If the MCE supports using RIP directly - // do it, otherwise fallback to absolute (this is determined by IsPCRel). - // 89 05 00 00 00 00 mov %eax,0(%rip) # PC-relative - // 89 04 25 00 00 00 00 mov %eax,0x0 # Absolute - bool Indirect = gvNeedsNonLazyPtr(*RelocOp, TM); - emitGlobalAddress(RelocOp->getGlobal(), RelocType, RelocOp->getOffset(), - Adj, Indirect); - } else if (RelocOp->isSymbol()) { - emitExternalSymbolAddress(RelocOp->getSymbolName(), RelocType); - } else if (RelocOp->isCPI()) { - emitConstPoolAddress(RelocOp->getIndex(), RelocType, - RelocOp->getOffset(), Adj); - } else { - assert(RelocOp->isJTI() && "Unexpected machine operand!"); - emitJumpTableAddress(RelocOp->getIndex(), RelocType, Adj); - } -} - -template<class CodeEmitter> -void Emitter<CodeEmitter>::emitMemModRMByte(const MachineInstr &MI, - unsigned Op,unsigned RegOpcodeField, - intptr_t PCAdj) { - const MachineOperand &Op3 = MI.getOperand(Op+3); - int DispVal = 0; - const MachineOperand *DispForReloc = nullptr; - - // Figure out what sort of displacement we have to handle here. - if (Op3.isGlobal()) { - DispForReloc = &Op3; - } else if (Op3.isSymbol()) { - DispForReloc = &Op3; - } else if (Op3.isCPI()) { - if (!MCE.earlyResolveAddresses() || Is64BitMode || IsPIC) { - DispForReloc = &Op3; - } else { - DispVal += MCE.getConstantPoolEntryAddress(Op3.getIndex()); - DispVal += Op3.getOffset(); - } - } else if (Op3.isJTI()) { - if (!MCE.earlyResolveAddresses() || Is64BitMode || IsPIC) { - DispForReloc = &Op3; - } else { - DispVal += MCE.getJumpTableEntryAddress(Op3.getIndex()); - } - } else { - DispVal = Op3.getImm(); - } - - const MachineOperand &Base = MI.getOperand(Op); - const MachineOperand &Scale = MI.getOperand(Op+1); - const MachineOperand &IndexReg = MI.getOperand(Op+2); - - unsigned BaseReg = Base.getReg(); - - // Handle %rip relative addressing. 
- if (BaseReg == X86::RIP || - (Is64BitMode && DispForReloc)) { // [disp32+RIP] in X86-64 mode - assert(IndexReg.getReg() == 0 && Is64BitMode && - "Invalid rip-relative address"); - MCE.emitByte(ModRMByte(0, RegOpcodeField, 5)); - emitDisplacementField(DispForReloc, DispVal, PCAdj, true); - return; - } - - // Indicate that the displacement will use an pcrel or absolute reference - // by default. MCEs able to resolve addresses on-the-fly use pcrel by default - // while others, unless explicit asked to use RIP, use absolute references. - bool IsPCRel = MCE.earlyResolveAddresses() ? true : false; - - // Is a SIB byte needed? - // If no BaseReg, issue a RIP relative instruction only if the MCE can - // resolve addresses on-the-fly, otherwise use SIB (Intel Manual 2A, table - // 2-7) and absolute references. - unsigned BaseRegNo = -1U; - if (BaseReg != 0 && BaseReg != X86::RIP) - BaseRegNo = getX86RegNum(BaseReg); - - if (// The SIB byte must be used if there is an index register. - IndexReg.getReg() == 0 && - // The SIB byte must be used if the base is ESP/RSP/R12, all of which - // encode to an R/M value of 4, which indicates that a SIB byte is - // present. - BaseRegNo != N86::ESP && - // If there is no base register and we're in 64-bit mode, we need a SIB - // byte to emit an addr that is just 'disp32' (the non-RIP relative form). - (!Is64BitMode || BaseReg != 0)) { - if (BaseReg == 0 || // [disp32] in X86-32 mode - BaseReg == X86::RIP) { // [disp32+RIP] in X86-64 mode - MCE.emitByte(ModRMByte(0, RegOpcodeField, 5)); - emitDisplacementField(DispForReloc, DispVal, PCAdj, true); - return; - } - - // If the base is not EBP/ESP and there is no displacement, use simple - // indirect register encoding, this handles addresses like [EAX]. The - // encoding for [EBP] with no displacement means [disp32] so we handle it - // by emitting a displacement of 0 below. - if (!DispForReloc && DispVal == 0 && BaseRegNo != N86::EBP) { - MCE.emitByte(ModRMByte(0, RegOpcodeField, BaseRegNo)); - return; - } - - // Otherwise, if the displacement fits in a byte, encode as [REG+disp8]. - if (!DispForReloc && isDisp8(DispVal)) { - MCE.emitByte(ModRMByte(1, RegOpcodeField, BaseRegNo)); - emitConstant(DispVal, 1); - return; - } - - // Otherwise, emit the most general non-SIB encoding: [REG+disp32] - MCE.emitByte(ModRMByte(2, RegOpcodeField, BaseRegNo)); - emitDisplacementField(DispForReloc, DispVal, PCAdj, IsPCRel); - return; - } - - // Otherwise we need a SIB byte, so start by outputting the ModR/M byte first. - assert(IndexReg.getReg() != X86::ESP && - IndexReg.getReg() != X86::RSP && "Cannot use ESP as index reg!"); - - bool ForceDisp32 = false; - bool ForceDisp8 = false; - if (BaseReg == 0) { - // If there is no base register, we emit the special case SIB byte with - // MOD=0, BASE=4, to JUST get the index, scale, and displacement. - MCE.emitByte(ModRMByte(0, RegOpcodeField, 4)); - ForceDisp32 = true; - } else if (DispForReloc) { - // Emit the normal disp32 encoding. - MCE.emitByte(ModRMByte(2, RegOpcodeField, 4)); - ForceDisp32 = true; - } else if (DispVal == 0 && BaseRegNo != N86::EBP) { - // Emit no displacement ModR/M byte - MCE.emitByte(ModRMByte(0, RegOpcodeField, 4)); - } else if (isDisp8(DispVal)) { - // Emit the disp8 encoding... - MCE.emitByte(ModRMByte(1, RegOpcodeField, 4)); - ForceDisp8 = true; // Make sure to force 8 bit disp if Base=EBP - } else { - // Emit the normal disp32 encoding... - MCE.emitByte(ModRMByte(2, RegOpcodeField, 4)); - } - - // Calculate what the SS field value should be... 
- static const unsigned SSTable[] = { ~0U, 0, 1, ~0U, 2, ~0U, ~0U, ~0U, 3 }; - unsigned SS = SSTable[Scale.getImm()]; - - if (BaseReg == 0) { - // Handle the SIB byte for the case where there is no base, see Intel - // Manual 2A, table 2-7. The displacement has already been output. - unsigned IndexRegNo; - if (IndexReg.getReg()) - IndexRegNo = getX86RegNum(IndexReg.getReg()); - else // Examples: [ESP+1*<noreg>+4] or [scaled idx]+disp32 (MOD=0,BASE=5) - IndexRegNo = 4; - emitSIBByte(SS, IndexRegNo, 5); - } else { - unsigned BaseRegNo = getX86RegNum(BaseReg); - unsigned IndexRegNo; - if (IndexReg.getReg()) - IndexRegNo = getX86RegNum(IndexReg.getReg()); - else - IndexRegNo = 4; // For example [ESP+1*<noreg>+4] - emitSIBByte(SS, IndexRegNo, BaseRegNo); - } - - // Do we need to output a displacement? - if (ForceDisp8) { - emitConstant(DispVal, 1); - } else if (DispVal != 0 || ForceDisp32) { - emitDisplacementField(DispForReloc, DispVal, PCAdj, IsPCRel); - } -} - -static const MCInstrDesc *UpdateOp(MachineInstr &MI, const X86InstrInfo *II, - unsigned Opcode) { - const MCInstrDesc *Desc = &II->get(Opcode); - MI.setDesc(*Desc); - return Desc; -} - -/// Is16BitMemOperand - Return true if the specified instruction has -/// a 16-bit memory operand. Op specifies the operand # of the memoperand. -static bool Is16BitMemOperand(const MachineInstr &MI, unsigned Op) { - const MachineOperand &BaseReg = MI.getOperand(Op+X86::AddrBaseReg); - const MachineOperand &IndexReg = MI.getOperand(Op+X86::AddrIndexReg); - - if ((BaseReg.getReg() != 0 && - X86MCRegisterClasses[X86::GR16RegClassID].contains(BaseReg.getReg())) || - (IndexReg.getReg() != 0 && - X86MCRegisterClasses[X86::GR16RegClassID].contains(IndexReg.getReg()))) - return true; - return false; -} - -/// Is32BitMemOperand - Return true if the specified instruction has -/// a 32-bit memory operand. Op specifies the operand # of the memoperand. -static bool Is32BitMemOperand(const MachineInstr &MI, unsigned Op) { - const MachineOperand &BaseReg = MI.getOperand(Op+X86::AddrBaseReg); - const MachineOperand &IndexReg = MI.getOperand(Op+X86::AddrIndexReg); - - if ((BaseReg.getReg() != 0 && - X86MCRegisterClasses[X86::GR32RegClassID].contains(BaseReg.getReg())) || - (IndexReg.getReg() != 0 && - X86MCRegisterClasses[X86::GR32RegClassID].contains(IndexReg.getReg()))) - return true; - return false; -} - -/// Is64BitMemOperand - Return true if the specified instruction has -/// a 64-bit memory operand. Op specifies the operand # of the memoperand. -#ifndef NDEBUG -static bool Is64BitMemOperand(const MachineInstr &MI, unsigned Op) { - const MachineOperand &BaseReg = MI.getOperand(Op+X86::AddrBaseReg); - const MachineOperand &IndexReg = MI.getOperand(Op+X86::AddrIndexReg); - - if ((BaseReg.getReg() != 0 && - X86MCRegisterClasses[X86::GR64RegClassID].contains(BaseReg.getReg())) || - (IndexReg.getReg() != 0 && - X86MCRegisterClasses[X86::GR64RegClassID].contains(IndexReg.getReg()))) - return true; - return false; -} -#endif - -template<class CodeEmitter> -void Emitter<CodeEmitter>::emitOpcodePrefix(uint64_t TSFlags, - int MemOperand, - const MachineInstr &MI, - const MCInstrDesc *Desc) const { - // Emit the operand size opcode prefix as needed. 
- if (((TSFlags & X86II::OpSizeMask) >> X86II::OpSizeShift) == X86II::OpSize16) - MCE.emitByte(0x66); - - switch (Desc->TSFlags & X86II::OpPrefixMask) { - case X86II::PD: // 66 - MCE.emitByte(0x66); - break; - case X86II::XS: // F3 - MCE.emitByte(0xF3); - break; - case X86II::XD: // F2 - MCE.emitByte(0xF2); - break; - } - - // Handle REX prefix. - if (Is64BitMode) { - if (unsigned REX = determineREX(MI)) - MCE.emitByte(0x40 | REX); - } - - // 0x0F escape code must be emitted just before the opcode. - switch (Desc->TSFlags & X86II::OpMapMask) { - case X86II::TB: // Two-byte opcode map - case X86II::T8: // 0F 38 - case X86II::TA: // 0F 3A - MCE.emitByte(0x0F); - break; - } - - switch (Desc->TSFlags & X86II::OpMapMask) { - case X86II::T8: // 0F 38 - MCE.emitByte(0x38); - break; - case X86II::TA: // 0F 3A - MCE.emitByte(0x3A); - break; - } -} - -// On regular x86, both XMM0-XMM7 and XMM8-XMM15 are encoded in the range -// 0-7 and the difference between the 2 groups is given by the REX prefix. -// In the VEX prefix, registers are seen sequencially from 0-15 and encoded -// in 1's complement form, example: -// -// ModRM field => XMM9 => 1 -// VEX.VVVV => XMM9 => ~9 -// -// See table 4-35 of Intel AVX Programming Reference for details. -template<class CodeEmitter> -unsigned char -Emitter<CodeEmitter>::getVEXRegisterEncoding(const MachineInstr &MI, - unsigned OpNum) const { - unsigned SrcReg = MI.getOperand(OpNum).getReg(); - unsigned SrcRegNum = getX86RegNum(MI.getOperand(OpNum).getReg()); - if (X86II::isX86_64ExtendedReg(SrcReg)) - SrcRegNum |= 8; - - // The registers represented through VEX_VVVV should - // be encoded in 1's complement form. - return (~SrcRegNum) & 0xf; -} - -/// EmitSegmentOverridePrefix - Emit segment override opcode prefix as needed -template<class CodeEmitter> -void Emitter<CodeEmitter>::emitSegmentOverridePrefix(uint64_t TSFlags, - int MemOperand, - const MachineInstr &MI) const { - if (MemOperand < 0) - return; // No memory operand - - // Check for explicit segment override on memory operand. - switch (MI.getOperand(MemOperand+X86::AddrSegmentReg).getReg()) { - default: llvm_unreachable("Unknown segment register!"); - case 0: break; - case X86::CS: MCE.emitByte(0x2E); break; - case X86::SS: MCE.emitByte(0x36); break; - case X86::DS: MCE.emitByte(0x3E); break; - case X86::ES: MCE.emitByte(0x26); break; - case X86::FS: MCE.emitByte(0x64); break; - case X86::GS: MCE.emitByte(0x65); break; - } -} - -template<class CodeEmitter> -void Emitter<CodeEmitter>::emitVEXOpcodePrefix(uint64_t TSFlags, - int MemOperand, - const MachineInstr &MI, - const MCInstrDesc *Desc) const { - unsigned char Encoding = (TSFlags & X86II::EncodingMask) >> - X86II::EncodingShift; - bool HasVEX_4V = (TSFlags >> X86II::VEXShift) & X86II::VEX_4V; - bool HasVEX_4VOp3 = (TSFlags >> X86II::VEXShift) & X86II::VEX_4VOp3; - bool HasMemOp4 = (TSFlags >> X86II::VEXShift) & X86II::MemOp4; - - // VEX_R: opcode externsion equivalent to REX.R in - // 1's complement (inverted) form - // - // 1: Same as REX_R=0 (must be 1 in 32-bit mode) - // 0: Same as REX_R=1 (64 bit mode only) - // - unsigned char VEX_R = 0x1; - - // VEX_X: equivalent to REX.X, only used when a - // register is used for index in SIB Byte. 
- // - // 1: Same as REX.X=0 (must be 1 in 32-bit mode) - // 0: Same as REX.X=1 (64-bit mode only) - unsigned char VEX_X = 0x1; - - // VEX_B: - // - // 1: Same as REX_B=0 (ignored in 32-bit mode) - // 0: Same as REX_B=1 (64 bit mode only) - // - unsigned char VEX_B = 0x1; - - // VEX_W: opcode specific (use like REX.W, or used for - // opcode extension, or ignored, depending on the opcode byte) - unsigned char VEX_W = 0; - - // VEX_5M (VEX m-mmmmm field): - // - // 0b00000: Reserved for future use - // 0b00001: implied 0F leading opcode - // 0b00010: implied 0F 38 leading opcode bytes - // 0b00011: implied 0F 3A leading opcode bytes - // 0b00100-0b11111: Reserved for future use - // 0b01000: XOP map select - 08h instructions with imm byte - // 0b01001: XOP map select - 09h instructions with no imm byte - // 0b01010: XOP map select - 0Ah instructions with imm dword - unsigned char VEX_5M = 0; - - // VEX_4V (VEX vvvv field): a register specifier - // (in 1's complement form) or 1111 if unused. - unsigned char VEX_4V = 0xf; - - // VEX_L (Vector Length): - // - // 0: scalar or 128-bit vector - // 1: 256-bit vector - // - unsigned char VEX_L = 0; - - // VEX_PP: opcode extension providing equivalent - // functionality of a SIMD prefix - // - // 0b00: None - // 0b01: 66 - // 0b10: F3 - // 0b11: F2 - // - unsigned char VEX_PP = 0; - - if ((TSFlags >> X86II::VEXShift) & X86II::VEX_W) - VEX_W = 1; - - if ((TSFlags >> X86II::VEXShift) & X86II::VEX_L) - VEX_L = 1; - - switch (TSFlags & X86II::OpPrefixMask) { - default: break; // VEX_PP already correct - case X86II::PD: VEX_PP = 0x1; break; // 66 - case X86II::XS: VEX_PP = 0x2; break; // F3 - case X86II::XD: VEX_PP = 0x3; break; // F2 - } - - switch (TSFlags & X86II::OpMapMask) { - default: llvm_unreachable("Invalid prefix!"); - case X86II::TB: VEX_5M = 0x1; break; // 0F - case X86II::T8: VEX_5M = 0x2; break; // 0F 38 - case X86II::TA: VEX_5M = 0x3; break; // 0F 3A - case X86II::XOP8: VEX_5M = 0x8; break; - case X86II::XOP9: VEX_5M = 0x9; break; - case X86II::XOPA: VEX_5M = 0xA; break; - } - - // Classify VEX_B, VEX_4V, VEX_R, VEX_X - unsigned NumOps = Desc->getNumOperands(); - unsigned CurOp = 0; - if (NumOps > 1 && Desc->getOperandConstraint(1, MCOI::TIED_TO) == 0) - ++CurOp; - else if (NumOps > 3 && Desc->getOperandConstraint(2, MCOI::TIED_TO) == 0) { - assert(Desc->getOperandConstraint(NumOps - 1, MCOI::TIED_TO) == 1); - // Special case for GATHER with 2 TIED_TO operands - // Skip the first 2 operands: dst, mask_wb - CurOp += 2; - } - - switch (TSFlags & X86II::FormMask) { - default: llvm_unreachable("Unexpected form in emitVEXOpcodePrefix!"); - case X86II::RawFrm: - break; - case X86II::MRMDestMem: { - // MRMDestMem instructions forms: - // MemAddr, src1(ModR/M) - // MemAddr, src1(VEX_4V), src2(ModR/M) - // MemAddr, src1(ModR/M), imm8 - // - if (X86II::isX86_64ExtendedReg(MI.getOperand(X86::AddrBaseReg).getReg())) - VEX_B = 0x0; - if (X86II::isX86_64ExtendedReg(MI.getOperand(X86::AddrIndexReg).getReg())) - VEX_X = 0x0; - - CurOp = X86::AddrNumOperands; - if (HasVEX_4V) - VEX_4V = getVEXRegisterEncoding(MI, CurOp++); - - const MachineOperand &MO = MI.getOperand(CurOp); - if (MO.isReg() && X86II::isX86_64ExtendedReg(MO.getReg())) - VEX_R = 0x0; - break; - } - case X86II::MRMSrcMem: - // MRMSrcMem instructions forms: - // src1(ModR/M), MemAddr - // src1(ModR/M), src2(VEX_4V), MemAddr - // src1(ModR/M), MemAddr, imm8 - // src1(ModR/M), MemAddr, src2(VEX_I8IMM) - // - // FMA4: - // dst(ModR/M.reg), src1(VEX_4V), src2(ModR/M), src3(VEX_I8IMM) - // 
dst(ModR/M.reg), src1(VEX_4V), src2(VEX_I8IMM), src3(ModR/M), - if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg())) - VEX_R = 0x0; - CurOp++; - - if (HasVEX_4V) { - VEX_4V = getVEXRegisterEncoding(MI, CurOp); - CurOp++; - } - - if (X86II::isX86_64ExtendedReg( - MI.getOperand(MemOperand+X86::AddrBaseReg).getReg())) - VEX_B = 0x0; - if (X86II::isX86_64ExtendedReg( - MI.getOperand(MemOperand+X86::AddrIndexReg).getReg())) - VEX_X = 0x0; - - if (HasVEX_4VOp3) - VEX_4V = getVEXRegisterEncoding(MI, CurOp+X86::AddrNumOperands); - break; - case X86II::MRM0m: case X86II::MRM1m: - case X86II::MRM2m: case X86II::MRM3m: - case X86II::MRM4m: case X86II::MRM5m: - case X86II::MRM6m: case X86II::MRM7m: { - // MRM[0-9]m instructions forms: - // MemAddr - // src1(VEX_4V), MemAddr - if (HasVEX_4V) - VEX_4V = getVEXRegisterEncoding(MI, CurOp++); - - if (X86II::isX86_64ExtendedReg( - MI.getOperand(MemOperand+X86::AddrBaseReg).getReg())) - VEX_B = 0x0; - if (X86II::isX86_64ExtendedReg( - MI.getOperand(MemOperand+X86::AddrIndexReg).getReg())) - VEX_X = 0x0; - break; - } - case X86II::MRMSrcReg: - // MRMSrcReg instructions forms: - // dst(ModR/M), src1(VEX_4V), src2(ModR/M), src3(VEX_I8IMM) - // dst(ModR/M), src1(ModR/M) - // dst(ModR/M), src1(ModR/M), imm8 - // - if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg())) - VEX_R = 0x0; - CurOp++; - - if (HasVEX_4V) - VEX_4V = getVEXRegisterEncoding(MI, CurOp++); - - if (HasMemOp4) // Skip second register source (encoded in I8IMM) - CurOp++; - - if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg())) - VEX_B = 0x0; - CurOp++; - if (HasVEX_4VOp3) - VEX_4V = getVEXRegisterEncoding(MI, CurOp); - break; - case X86II::MRMDestReg: - // MRMDestReg instructions forms: - // dst(ModR/M), src(ModR/M) - // dst(ModR/M), src(ModR/M), imm8 - // dst(ModR/M), src1(VEX_4V), src2(ModR/M) - if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg())) - VEX_B = 0x0; - CurOp++; - - if (HasVEX_4V) - VEX_4V = getVEXRegisterEncoding(MI, CurOp++); - - if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg())) - VEX_R = 0x0; - break; - case X86II::MRM0r: case X86II::MRM1r: - case X86II::MRM2r: case X86II::MRM3r: - case X86II::MRM4r: case X86II::MRM5r: - case X86II::MRM6r: case X86II::MRM7r: - // MRM0r-MRM7r instructions forms: - // dst(VEX_4V), src(ModR/M), imm8 - VEX_4V = getVEXRegisterEncoding(MI, CurOp); - CurOp++; - - if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg())) - VEX_B = 0x0; - break; - } - - // Emit segment override opcode prefix as needed. - emitSegmentOverridePrefix(TSFlags, MemOperand, MI); - - // VEX opcode prefix can have 2 or 3 bytes - // - // 3 bytes: - // +-----+ +--------------+ +-------------------+ - // | C4h | | RXB | m-mmmm | | W | vvvv | L | pp | - // +-----+ +--------------+ +-------------------+ - // 2 bytes: - // +-----+ +-------------------+ - // | C5h | | R | vvvv | L | pp | - // +-----+ +-------------------+ - // - // XOP uses a similar prefix: - // +-----+ +--------------+ +-------------------+ - // | 8Fh | | RXB | m-mmmm | | W | vvvv | L | pp | - // +-----+ +--------------+ +-------------------+ - unsigned char LastByte = VEX_PP | (VEX_L << 2) | (VEX_4V << 3); - - // Can this use the 2 byte VEX prefix? - if (Encoding == X86II::VEX && VEX_B && VEX_X && !VEX_W && (VEX_5M == 1)) { - MCE.emitByte(0xC5); - MCE.emitByte(LastByte | (VEX_R << 7)); - return; - } - - // 3 byte VEX prefix - MCE.emitByte(Encoding == X86II::XOP ? 
0x8F : 0xC4); - MCE.emitByte(VEX_R << 7 | VEX_X << 6 | VEX_B << 5 | VEX_5M); - MCE.emitByte(LastByte | (VEX_W << 7)); -} - -template<class CodeEmitter> -void Emitter<CodeEmitter>::emitInstruction(MachineInstr &MI, - const MCInstrDesc *Desc) { - DEBUG(dbgs() << MI); - - // If this is a pseudo instruction, lower it. - switch (Desc->getOpcode()) { - case X86::ADD16rr_DB: Desc = UpdateOp(MI, II, X86::OR16rr); break; - case X86::ADD32rr_DB: Desc = UpdateOp(MI, II, X86::OR32rr); break; - case X86::ADD64rr_DB: Desc = UpdateOp(MI, II, X86::OR64rr); break; - case X86::ADD16ri_DB: Desc = UpdateOp(MI, II, X86::OR16ri); break; - case X86::ADD32ri_DB: Desc = UpdateOp(MI, II, X86::OR32ri); break; - case X86::ADD64ri32_DB: Desc = UpdateOp(MI, II, X86::OR64ri32); break; - case X86::ADD16ri8_DB: Desc = UpdateOp(MI, II, X86::OR16ri8); break; - case X86::ADD32ri8_DB: Desc = UpdateOp(MI, II, X86::OR32ri8); break; - case X86::ADD64ri8_DB: Desc = UpdateOp(MI, II, X86::OR64ri8); break; - case X86::ACQUIRE_MOV8rm: Desc = UpdateOp(MI, II, X86::MOV8rm); break; - case X86::ACQUIRE_MOV16rm: Desc = UpdateOp(MI, II, X86::MOV16rm); break; - case X86::ACQUIRE_MOV32rm: Desc = UpdateOp(MI, II, X86::MOV32rm); break; - case X86::ACQUIRE_MOV64rm: Desc = UpdateOp(MI, II, X86::MOV64rm); break; - case X86::RELEASE_MOV8mr: Desc = UpdateOp(MI, II, X86::MOV8mr); break; - case X86::RELEASE_MOV16mr: Desc = UpdateOp(MI, II, X86::MOV16mr); break; - case X86::RELEASE_MOV32mr: Desc = UpdateOp(MI, II, X86::MOV32mr); break; - case X86::RELEASE_MOV64mr: Desc = UpdateOp(MI, II, X86::MOV64mr); break; - } - - - MCE.processDebugLoc(MI.getDebugLoc(), true); - - unsigned Opcode = Desc->Opcode; - - // If this is a two-address instruction, skip one of the register operands. - unsigned NumOps = Desc->getNumOperands(); - unsigned CurOp = 0; - if (NumOps > 1 && Desc->getOperandConstraint(1, MCOI::TIED_TO) == 0) - ++CurOp; - else if (NumOps > 3 && Desc->getOperandConstraint(2, MCOI::TIED_TO) == 0) { - assert(Desc->getOperandConstraint(NumOps - 1, MCOI::TIED_TO) == 1); - // Special case for GATHER with 2 TIED_TO operands - // Skip the first 2 operands: dst, mask_wb - CurOp += 2; - } - - uint64_t TSFlags = Desc->TSFlags; - - // Encoding type for this instruction. - unsigned char Encoding = (TSFlags & X86II::EncodingMask) >> - X86II::EncodingShift; - - // It uses the VEX.VVVV field? - bool HasVEX_4V = (TSFlags >> X86II::VEXShift) & X86II::VEX_4V; - bool HasVEX_4VOp3 = (TSFlags >> X86II::VEXShift) & X86II::VEX_4VOp3; - bool HasMemOp4 = (TSFlags >> X86II::VEXShift) & X86II::MemOp4; - const unsigned MemOp4_I8IMMOperand = 2; - - // Determine where the memory operand starts, if present. - int MemoryOperand = X86II::getMemoryOperandNo(TSFlags, Opcode); - if (MemoryOperand != -1) MemoryOperand += CurOp; - - // Emit the lock opcode prefix as needed. - if (Desc->TSFlags & X86II::LOCK) - MCE.emitByte(0xF0); - - // Emit segment override opcode prefix as needed. - emitSegmentOverridePrefix(TSFlags, MemoryOperand, MI); - - // Emit the repeat opcode prefix as needed. - if (Desc->TSFlags & X86II::REP) - MCE.emitByte(0xF3); - - // Emit the address size opcode prefix as needed. 
- bool need_address_override; - if (TSFlags & X86II::AdSize) { - need_address_override = true; - } else if (MemoryOperand < 0) { - need_address_override = false; - } else if (Is64BitMode) { - assert(!Is16BitMemOperand(MI, MemoryOperand)); - need_address_override = Is32BitMemOperand(MI, MemoryOperand); - } else { - assert(!Is64BitMemOperand(MI, MemoryOperand)); - need_address_override = Is16BitMemOperand(MI, MemoryOperand); - } - - if (need_address_override) - MCE.emitByte(0x67); - - if (Encoding == 0) - emitOpcodePrefix(TSFlags, MemoryOperand, MI, Desc); - else - emitVEXOpcodePrefix(TSFlags, MemoryOperand, MI, Desc); - - unsigned char BaseOpcode = X86II::getBaseOpcodeFor(Desc->TSFlags); - switch (TSFlags & X86II::FormMask) { - default: - llvm_unreachable("Unknown FormMask value in X86 MachineCodeEmitter!"); - case X86II::Pseudo: - // Remember the current PC offset, this is the PIC relocation - // base address. - switch (Opcode) { - default: - llvm_unreachable("pseudo instructions should be removed before code" - " emission"); - // Do nothing for Int_MemBarrier - it's just a comment. Add a debug - // to make it slightly easier to see. - case X86::Int_MemBarrier: - DEBUG(dbgs() << "#MEMBARRIER\n"); - break; - - case TargetOpcode::INLINEASM: - // We allow inline assembler nodes with empty bodies - they can - // implicitly define registers, which is ok for JIT. - if (MI.getOperand(0).getSymbolName()[0]) { - DebugLoc DL = MI.getDebugLoc(); - DL.print(MI.getParent()->getParent()->getFunction()->getContext(), - llvm::errs()); - report_fatal_error("JIT does not support inline asm!"); - } - break; - case TargetOpcode::DBG_VALUE: - case TargetOpcode::CFI_INSTRUCTION: - break; - case TargetOpcode::GC_LABEL: - case TargetOpcode::EH_LABEL: - MCE.emitLabel(MI.getOperand(0).getMCSymbol()); - break; - - case TargetOpcode::IMPLICIT_DEF: - case TargetOpcode::KILL: - break; - - case X86::SEH_PushReg: - case X86::SEH_SaveReg: - case X86::SEH_SaveXMM: - case X86::SEH_StackAlloc: - case X86::SEH_SetFrame: - case X86::SEH_PushFrame: - case X86::SEH_EndPrologue: - break; - - case X86::MOVPC32r: { - // This emits the "call" portion of this pseudo instruction. - MCE.emitByte(BaseOpcode); - emitConstant(0, X86II::getSizeOfImm(Desc->TSFlags)); - // Remember PIC base. - PICBaseOffset = (intptr_t) MCE.getCurrentPCOffset(); - X86JITInfo *JTI = TM.getJITInfo(); - JTI->setPICBase(MCE.getCurrentPCValue()); - break; - } - } - CurOp = NumOps; - break; - case X86II::RawFrm: { - MCE.emitByte(BaseOpcode); - - if (CurOp == NumOps) - break; - - const MachineOperand &MO = MI.getOperand(CurOp++); - - DEBUG(dbgs() << "RawFrm CurOp " << CurOp << "\n"); - DEBUG(dbgs() << "isMBB " << MO.isMBB() << "\n"); - DEBUG(dbgs() << "isGlobal " << MO.isGlobal() << "\n"); - DEBUG(dbgs() << "isSymbol " << MO.isSymbol() << "\n"); - DEBUG(dbgs() << "isImm " << MO.isImm() << "\n"); - - if (MO.isMBB()) { - emitPCRelativeBlockAddress(MO.getMBB()); - break; - } - - if (MO.isGlobal()) { - emitGlobalAddress(MO.getGlobal(), X86::reloc_pcrel_word, - MO.getOffset(), 0); - break; - } - - if (MO.isSymbol()) { - emitExternalSymbolAddress(MO.getSymbolName(), X86::reloc_pcrel_word); - break; - } - - // FIXME: Only used by hackish MCCodeEmitter, remove when dead. - if (MO.isJTI()) { - emitJumpTableAddress(MO.getIndex(), X86::reloc_pcrel_word); - break; - } - - assert(MO.isImm() && "Unknown RawFrm operand!"); - if (Opcode == X86::CALLpcrel32 || Opcode == X86::CALL64pcrel32) { - // Fix up immediate operand for pc relative calls. 
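// A small sketch of the pc-relative fixup performed below for CALLpcrel32 and
// CALL64pcrel32: the emitted rel32 field is measured from the end of the
// 4-byte immediate, so the stored value is the target minus the current PC
// minus 4.
static int callRel32(long long Target, long long CurrentPC) {
  return (int)(Target - CurrentPC - 4);
}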
- intptr_t Imm = (intptr_t)MO.getImm(); - Imm = Imm - MCE.getCurrentPCValue() - 4; - emitConstant(Imm, X86II::getSizeOfImm(Desc->TSFlags)); - } else - emitConstant(MO.getImm(), X86II::getSizeOfImm(Desc->TSFlags)); - break; - } - - case X86II::AddRegFrm: { - MCE.emitByte(BaseOpcode + - getX86RegNum(MI.getOperand(CurOp++).getReg())); - - if (CurOp == NumOps) - break; - - const MachineOperand &MO1 = MI.getOperand(CurOp++); - unsigned Size = X86II::getSizeOfImm(Desc->TSFlags); - if (MO1.isImm()) { - emitConstant(MO1.getImm(), Size); - break; - } - - unsigned rt = Is64BitMode ? X86::reloc_pcrel_word - : (IsPIC ? X86::reloc_picrel_word : X86::reloc_absolute_word); - if (Opcode == X86::MOV32ri64) - rt = X86::reloc_absolute_word; // FIXME: add X86II flag? - // This should not occur on Darwin for relocatable objects. - if (Opcode == X86::MOV64ri) - rt = X86::reloc_absolute_dword; // FIXME: add X86II flag? - if (MO1.isGlobal()) { - bool Indirect = gvNeedsNonLazyPtr(MO1, TM); - emitGlobalAddress(MO1.getGlobal(), rt, MO1.getOffset(), 0, - Indirect); - } else if (MO1.isSymbol()) - emitExternalSymbolAddress(MO1.getSymbolName(), rt); - else if (MO1.isCPI()) - emitConstPoolAddress(MO1.getIndex(), rt); - else if (MO1.isJTI()) - emitJumpTableAddress(MO1.getIndex(), rt); - break; - } - - case X86II::MRMDestReg: { - MCE.emitByte(BaseOpcode); - - unsigned SrcRegNum = CurOp+1; - if (HasVEX_4V) // Skip 1st src (which is encoded in VEX_VVVV) - SrcRegNum++; - - emitRegModRMByte(MI.getOperand(CurOp).getReg(), - getX86RegNum(MI.getOperand(SrcRegNum).getReg())); - CurOp = SrcRegNum + 1; - break; - } - case X86II::MRMDestMem: { - MCE.emitByte(BaseOpcode); - - unsigned SrcRegNum = CurOp + X86::AddrNumOperands; - if (HasVEX_4V) // Skip 1st src (which is encoded in VEX_VVVV) - SrcRegNum++; - emitMemModRMByte(MI, CurOp, - getX86RegNum(MI.getOperand(SrcRegNum).getReg())); - CurOp = SrcRegNum + 1; - break; - } - - case X86II::MRMSrcReg: { - MCE.emitByte(BaseOpcode); - - unsigned SrcRegNum = CurOp+1; - if (HasVEX_4V) // Skip 1st src (which is encoded in VEX_VVVV) - ++SrcRegNum; - - if (HasMemOp4) // Skip 2nd src (which is encoded in I8IMM) - ++SrcRegNum; - - emitRegModRMByte(MI.getOperand(SrcRegNum).getReg(), - getX86RegNum(MI.getOperand(CurOp).getReg())); - // 2 operands skipped with HasMemOp4, compensate accordingly - CurOp = HasMemOp4 ? SrcRegNum : SrcRegNum + 1; - if (HasVEX_4VOp3) - ++CurOp; - break; - } - case X86II::MRMSrcMem: { - int AddrOperands = X86::AddrNumOperands; - unsigned FirstMemOp = CurOp+1; - if (HasVEX_4V) { - ++AddrOperands; - ++FirstMemOp; // Skip the register source (which is encoded in VEX_VVVV). - } - if (HasMemOp4) // Skip second register source (encoded in I8IMM) - ++FirstMemOp; - - MCE.emitByte(BaseOpcode); - - intptr_t PCAdj = (CurOp + AddrOperands + 1 != NumOps) ? - X86II::getSizeOfImm(Desc->TSFlags) : 0; - emitMemModRMByte(MI, FirstMemOp, - getX86RegNum(MI.getOperand(CurOp).getReg()),PCAdj); - CurOp += AddrOperands + 1; - if (HasVEX_4VOp3) - ++CurOp; - break; - } - - case X86II::MRMXr: - case X86II::MRM0r: case X86II::MRM1r: - case X86II::MRM2r: case X86II::MRM3r: - case X86II::MRM4r: case X86II::MRM5r: - case X86II::MRM6r: case X86II::MRM7r: { - if (HasVEX_4V) // Skip the register dst (which is encoded in VEX_VVVV). - ++CurOp; - MCE.emitByte(BaseOpcode); - uint64_t Form = (Desc->TSFlags & X86II::FormMask); - emitRegModRMByte(MI.getOperand(CurOp++).getReg(), - (Form == X86II::MRMXr) ? 
0 : Form-X86II::MRM0r); - - if (CurOp == NumOps) - break; - - const MachineOperand &MO1 = MI.getOperand(CurOp++); - unsigned Size = X86II::getSizeOfImm(Desc->TSFlags); - if (MO1.isImm()) { - emitConstant(MO1.getImm(), Size); - break; - } - - unsigned rt = Is64BitMode ? X86::reloc_pcrel_word - : (IsPIC ? X86::reloc_picrel_word : X86::reloc_absolute_word); - if (Opcode == X86::MOV64ri32) - rt = X86::reloc_absolute_word_sext; // FIXME: add X86II flag? - if (MO1.isGlobal()) { - bool Indirect = gvNeedsNonLazyPtr(MO1, TM); - emitGlobalAddress(MO1.getGlobal(), rt, MO1.getOffset(), 0, - Indirect); - } else if (MO1.isSymbol()) - emitExternalSymbolAddress(MO1.getSymbolName(), rt); - else if (MO1.isCPI()) - emitConstPoolAddress(MO1.getIndex(), rt); - else if (MO1.isJTI()) - emitJumpTableAddress(MO1.getIndex(), rt); - break; - } - - case X86II::MRMXm: - case X86II::MRM0m: case X86II::MRM1m: - case X86II::MRM2m: case X86II::MRM3m: - case X86II::MRM4m: case X86II::MRM5m: - case X86II::MRM6m: case X86II::MRM7m: { - if (HasVEX_4V) // Skip the register dst (which is encoded in VEX_VVVV). - ++CurOp; - intptr_t PCAdj = (CurOp + X86::AddrNumOperands != NumOps) ? - (MI.getOperand(CurOp+X86::AddrNumOperands).isImm() ? - X86II::getSizeOfImm(Desc->TSFlags) : 4) : 0; - - MCE.emitByte(BaseOpcode); - uint64_t Form = (Desc->TSFlags & X86II::FormMask); - emitMemModRMByte(MI, CurOp, (Form==X86II::MRMXm) ? 0 : Form - X86II::MRM0m, - PCAdj); - CurOp += X86::AddrNumOperands; - - if (CurOp == NumOps) - break; - - const MachineOperand &MO = MI.getOperand(CurOp++); - unsigned Size = X86II::getSizeOfImm(Desc->TSFlags); - if (MO.isImm()) { - emitConstant(MO.getImm(), Size); - break; - } - - unsigned rt = Is64BitMode ? X86::reloc_pcrel_word - : (IsPIC ? X86::reloc_picrel_word : X86::reloc_absolute_word); - if (Opcode == X86::MOV64mi32) - rt = X86::reloc_absolute_word_sext; // FIXME: add X86II flag? 
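// A hypothetical distillation of the relocation choice repeated in these
// immediate-emitting cases: 64-bit code prefers pc-relative word relocations,
// PIC 32-bit code uses pic-base-relative words, and everything else falls
// back to absolute words, with the sign-extended and double-word variants
// special-cased for the 64-bit MOV immediates as shown above.
static const char *chooseRelocKind(bool Is64BitMode, bool IsPIC) {
  if (Is64BitMode)
    return "reloc_pcrel_word";
  return IsPIC ? "reloc_picrel_word" : "reloc_absolute_word";
}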
- if (MO.isGlobal()) { - bool Indirect = gvNeedsNonLazyPtr(MO, TM); - emitGlobalAddress(MO.getGlobal(), rt, MO.getOffset(), 0, - Indirect); - } else if (MO.isSymbol()) - emitExternalSymbolAddress(MO.getSymbolName(), rt); - else if (MO.isCPI()) - emitConstPoolAddress(MO.getIndex(), rt); - else if (MO.isJTI()) - emitJumpTableAddress(MO.getIndex(), rt); - break; - } - - case X86II::MRM_C0: case X86II::MRM_C1: case X86II::MRM_C2: - case X86II::MRM_C3: case X86II::MRM_C4: case X86II::MRM_C8: - case X86II::MRM_C9: case X86II::MRM_CA: case X86II::MRM_CB: - case X86II::MRM_D0: case X86II::MRM_D1: case X86II::MRM_D4: - case X86II::MRM_D5: case X86II::MRM_D6: case X86II::MRM_D8: - case X86II::MRM_D9: case X86II::MRM_DA: case X86II::MRM_DB: - case X86II::MRM_DC: case X86II::MRM_DD: case X86II::MRM_DE: - case X86II::MRM_DF: case X86II::MRM_E0: case X86II::MRM_E1: - case X86II::MRM_E2: case X86II::MRM_E3: case X86II::MRM_E4: - case X86II::MRM_E5: case X86II::MRM_E8: case X86II::MRM_E9: - case X86II::MRM_EA: case X86II::MRM_EB: case X86II::MRM_EC: - case X86II::MRM_ED: case X86II::MRM_EE: case X86II::MRM_F0: - case X86II::MRM_F1: case X86II::MRM_F2: case X86II::MRM_F3: - case X86II::MRM_F4: case X86II::MRM_F5: case X86II::MRM_F6: - case X86II::MRM_F7: case X86II::MRM_F8: case X86II::MRM_F9: - case X86II::MRM_FA: case X86II::MRM_FB: case X86II::MRM_FC: - case X86II::MRM_FD: case X86II::MRM_FE: case X86II::MRM_FF: - MCE.emitByte(BaseOpcode); - - unsigned char MRM; - switch (TSFlags & X86II::FormMask) { - default: llvm_unreachable("Invalid Form"); - case X86II::MRM_C0: MRM = 0xC0; break; - case X86II::MRM_C1: MRM = 0xC1; break; - case X86II::MRM_C2: MRM = 0xC2; break; - case X86II::MRM_C3: MRM = 0xC3; break; - case X86II::MRM_C4: MRM = 0xC4; break; - case X86II::MRM_C8: MRM = 0xC8; break; - case X86II::MRM_C9: MRM = 0xC9; break; - case X86II::MRM_CA: MRM = 0xCA; break; - case X86II::MRM_CB: MRM = 0xCB; break; - case X86II::MRM_D0: MRM = 0xD0; break; - case X86II::MRM_D1: MRM = 0xD1; break; - case X86II::MRM_D4: MRM = 0xD4; break; - case X86II::MRM_D5: MRM = 0xD5; break; - case X86II::MRM_D6: MRM = 0xD6; break; - case X86II::MRM_D8: MRM = 0xD8; break; - case X86II::MRM_D9: MRM = 0xD9; break; - case X86II::MRM_DA: MRM = 0xDA; break; - case X86II::MRM_DB: MRM = 0xDB; break; - case X86II::MRM_DC: MRM = 0xDC; break; - case X86II::MRM_DD: MRM = 0xDD; break; - case X86II::MRM_DE: MRM = 0xDE; break; - case X86II::MRM_DF: MRM = 0xDF; break; - case X86II::MRM_E0: MRM = 0xE0; break; - case X86II::MRM_E1: MRM = 0xE1; break; - case X86II::MRM_E2: MRM = 0xE2; break; - case X86II::MRM_E3: MRM = 0xE3; break; - case X86II::MRM_E4: MRM = 0xE4; break; - case X86II::MRM_E5: MRM = 0xE5; break; - case X86II::MRM_E8: MRM = 0xE8; break; - case X86II::MRM_E9: MRM = 0xE9; break; - case X86II::MRM_EA: MRM = 0xEA; break; - case X86II::MRM_EB: MRM = 0xEB; break; - case X86II::MRM_EC: MRM = 0xEC; break; - case X86II::MRM_ED: MRM = 0xED; break; - case X86II::MRM_EE: MRM = 0xEE; break; - case X86II::MRM_F0: MRM = 0xF0; break; - case X86II::MRM_F1: MRM = 0xF1; break; - case X86II::MRM_F2: MRM = 0xF2; break; - case X86II::MRM_F3: MRM = 0xF3; break; - case X86II::MRM_F4: MRM = 0xF4; break; - case X86II::MRM_F5: MRM = 0xF5; break; - case X86II::MRM_F6: MRM = 0xF6; break; - case X86II::MRM_F7: MRM = 0xF7; break; - case X86II::MRM_F8: MRM = 0xF8; break; - case X86II::MRM_F9: MRM = 0xF9; break; - case X86II::MRM_FA: MRM = 0xFA; break; - case X86II::MRM_FB: MRM = 0xFB; break; - case X86II::MRM_FC: MRM = 0xFC; break; - case X86II::MRM_FD: MRM = 
0xFD; break; - case X86II::MRM_FE: MRM = 0xFE; break; - case X86II::MRM_FF: MRM = 0xFF; break; - } - MCE.emitByte(MRM); - break; - } - - while (CurOp != NumOps && NumOps - CurOp <= 2) { - // The last source register of a 4 operand instruction in AVX is encoded - // in bits[7:4] of a immediate byte. - if ((TSFlags >> X86II::VEXShift) & X86II::VEX_I8IMM) { - const MachineOperand &MO = MI.getOperand(HasMemOp4 ? MemOp4_I8IMMOperand - : CurOp); - ++CurOp; - unsigned RegNum = getX86RegNum(MO.getReg()) << 4; - if (X86II::isX86_64ExtendedReg(MO.getReg())) - RegNum |= 1 << 7; - // If there is an additional 5th operand it must be an immediate, which - // is encoded in bits[3:0] - if (CurOp != NumOps) { - const MachineOperand &MIMM = MI.getOperand(CurOp++); - if (MIMM.isImm()) { - unsigned Val = MIMM.getImm(); - assert(Val < 16 && "Immediate operand value out of range"); - RegNum |= Val; - } - } - emitConstant(RegNum, 1); - } else { - emitConstant(MI.getOperand(CurOp++).getImm(), - X86II::getSizeOfImm(Desc->TSFlags)); - } - } - - if (!MI.isVariadic() && CurOp != NumOps) { -#ifndef NDEBUG - dbgs() << "Cannot encode all operands of: " << MI << "\n"; -#endif - llvm_unreachable(nullptr); - } - - MCE.processDebugLoc(MI.getDebugLoc(), false); -} diff --git a/lib/Target/X86/X86FastISel.cpp b/lib/Target/X86/X86FastISel.cpp index ce554ba..95cb718 100644 --- a/lib/Target/X86/X86FastISel.cpp +++ b/lib/Target/X86/X86FastISel.cpp @@ -64,7 +64,7 @@ public: X86ScalarSSEf32 = Subtarget->hasSSE1(); } - bool TargetSelectInstruction(const Instruction *I) override; + bool fastSelectInstruction(const Instruction *I) override; /// \brief The specified machine instr operand is a vreg, and that /// vreg is being provided by the specified load instruction. If possible, @@ -73,7 +73,9 @@ public: bool tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo, const LoadInst *LI) override; - bool FastLowerArguments() override; + bool fastLowerArguments() override; + bool fastLowerCall(CallLoweringInfo &CLI) override; + bool fastLowerIntrinsicCall(const IntrinsicInst *II) override; #include "X86GenFastISel.inc" @@ -124,13 +126,8 @@ private: bool X86SelectFPExt(const Instruction *I); bool X86SelectFPTrunc(const Instruction *I); - bool X86VisitIntrinsicCall(const IntrinsicInst &I); - bool X86SelectCall(const Instruction *I); - - bool DoSelectCall(const Instruction *I, const char *MemIntName); - const X86InstrInfo *getInstrInfo() const { - return getTargetMachine()->getInstrInfo(); + return getTargetMachine()->getSubtargetImpl()->getInstrInfo(); } const X86TargetMachine *getTargetMachine() const { return static_cast<const X86TargetMachine *>(&TM); @@ -138,11 +135,14 @@ private: bool handleConstantAddresses(const Value *V, X86AddressMode &AM); - unsigned TargetMaterializeConstant(const Constant *C) override; + unsigned X86MaterializeInt(const ConstantInt *CI, MVT VT); + unsigned X86MaterializeFP(const ConstantFP *CFP, MVT VT); + unsigned X86MaterializeGV(const GlobalValue *GV,MVT VT); + unsigned fastMaterializeConstant(const Constant *C) override; - unsigned TargetMaterializeAlloca(const AllocaInst *C) override; + unsigned fastMaterializeAlloca(const AllocaInst *C) override; - unsigned TargetMaterializeFloatZero(const ConstantFP *CF) override; + unsigned fastMaterializeFloatZero(const ConstantFP *CF) override; /// isScalarFPTypeInSSEReg - Return true if the specified scalar FP type is /// computed in an SSE register, not on the X87 floating point stack. @@ -164,46 +164,6 @@ private: } // end anonymous namespace. 
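// A minimal, hypothetical sketch of the VEX_I8IMM packing used by the deleted
// emitter above: a 4-operand AVX instruction encodes its last register source
// in the top nibble of the trailing immediate byte (bit 7 marking an extended
// x86-64 register) and an optional small immediate in the low nibble.
static unsigned char packVexI8Imm(unsigned LowRegBits, bool IsExtendedReg,
                                  unsigned Imm4) {
  unsigned char Byte = (unsigned char)((LowRegBits & 0x7) << 4);
  if (IsExtendedReg)
    Byte |= 1 << 7;
  return Byte | (Imm4 & 0xF);
}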
-static CmpInst::Predicate optimizeCmpPredicate(const CmpInst *CI) { - // If both operands are the same, then try to optimize or fold the cmp. - CmpInst::Predicate Predicate = CI->getPredicate(); - if (CI->getOperand(0) != CI->getOperand(1)) - return Predicate; - - switch (Predicate) { - default: llvm_unreachable("Invalid predicate!"); - case CmpInst::FCMP_FALSE: Predicate = CmpInst::FCMP_FALSE; break; - case CmpInst::FCMP_OEQ: Predicate = CmpInst::FCMP_ORD; break; - case CmpInst::FCMP_OGT: Predicate = CmpInst::FCMP_FALSE; break; - case CmpInst::FCMP_OGE: Predicate = CmpInst::FCMP_ORD; break; - case CmpInst::FCMP_OLT: Predicate = CmpInst::FCMP_FALSE; break; - case CmpInst::FCMP_OLE: Predicate = CmpInst::FCMP_ORD; break; - case CmpInst::FCMP_ONE: Predicate = CmpInst::FCMP_FALSE; break; - case CmpInst::FCMP_ORD: Predicate = CmpInst::FCMP_ORD; break; - case CmpInst::FCMP_UNO: Predicate = CmpInst::FCMP_UNO; break; - case CmpInst::FCMP_UEQ: Predicate = CmpInst::FCMP_TRUE; break; - case CmpInst::FCMP_UGT: Predicate = CmpInst::FCMP_UNO; break; - case CmpInst::FCMP_UGE: Predicate = CmpInst::FCMP_TRUE; break; - case CmpInst::FCMP_ULT: Predicate = CmpInst::FCMP_UNO; break; - case CmpInst::FCMP_ULE: Predicate = CmpInst::FCMP_TRUE; break; - case CmpInst::FCMP_UNE: Predicate = CmpInst::FCMP_UNO; break; - case CmpInst::FCMP_TRUE: Predicate = CmpInst::FCMP_TRUE; break; - - case CmpInst::ICMP_EQ: Predicate = CmpInst::FCMP_TRUE; break; - case CmpInst::ICMP_NE: Predicate = CmpInst::FCMP_FALSE; break; - case CmpInst::ICMP_UGT: Predicate = CmpInst::FCMP_FALSE; break; - case CmpInst::ICMP_UGE: Predicate = CmpInst::FCMP_TRUE; break; - case CmpInst::ICMP_ULT: Predicate = CmpInst::FCMP_FALSE; break; - case CmpInst::ICMP_ULE: Predicate = CmpInst::FCMP_TRUE; break; - case CmpInst::ICMP_SGT: Predicate = CmpInst::FCMP_FALSE; break; - case CmpInst::ICMP_SGE: Predicate = CmpInst::FCMP_TRUE; break; - case CmpInst::ICMP_SLT: Predicate = CmpInst::FCMP_FALSE; break; - case CmpInst::ICMP_SLE: Predicate = CmpInst::FCMP_TRUE; break; - } - - return Predicate; -} - static std::pair<X86::CondCode, bool> getX86ConditionCode(CmpInst::Predicate Predicate) { X86::CondCode CC = X86::COND_INVALID; @@ -532,7 +492,7 @@ bool X86FastISel::X86FastEmitStore(EVT VT, const Value *Val, bool X86FastISel::X86FastEmitExtend(ISD::NodeType Opc, EVT DstVT, unsigned Src, EVT SrcVT, unsigned &ResultReg) { - unsigned RR = FastEmit_r(SrcVT.getSimpleVT(), DstVT.getSimpleVT(), Opc, + unsigned RR = fastEmit_r(SrcVT.getSimpleVT(), DstVT.getSimpleVT(), Opc, Src, /*TODO: Kill=*/false); if (RR == 0) return false; @@ -996,8 +956,7 @@ bool X86FastISel::X86SelectRet(const Instruction *I) { // Analyze operands of the call, assigning locations to each operand. SmallVector<CCValAssign, 16> ValLocs; - CCState CCInfo(CC, F.isVarArg(), *FuncInfo.MF, TM, ValLocs, - I->getContext()); + CCState CCInfo(CC, F.isVarArg(), *FuncInfo.MF, ValLocs, I->getContext()); CCInfo.AnalyzeReturn(Outs, RetCC_X86); const Value *RV = Ret->getOperand(0); @@ -1020,7 +979,7 @@ bool X86FastISel::X86SelectRet(const Instruction *I) { // The calling-convention tables for x87 returns don't tell // the whole story. 
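// A worked example of the predicate folding in optimizeCmpPredicate, removed
// above: when both compare operands are the same value X, "X == X" can only
// fail if X is NaN, so FCMP_OEQ collapses to an ordered check on (X, X),
// while the integer predicates collapse to constant true or false.
static bool oeqOnIdenticalOperands(double X) {
  return X == X; // false only for NaN, i.e. exactly FCMP_ORD on (X, X)
}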
- if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1) + if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) return false; unsigned SrcReg = Reg + VA.getValNo(); @@ -1039,12 +998,12 @@ bool X86FastISel::X86SelectRet(const Instruction *I) { if (SrcVT == MVT::i1) { if (Outs[0].Flags.isSExt()) return false; - SrcReg = FastEmitZExtFromI1(MVT::i8, SrcReg, /*TODO: Kill=*/false); + SrcReg = fastEmitZExtFromI1(MVT::i8, SrcReg, /*TODO: Kill=*/false); SrcVT = MVT::i8; } unsigned Op = Outs[0].Flags.isZExt() ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND; - SrcReg = FastEmit_r(SrcVT.getSimpleVT(), DstVT.getSimpleVT(), Op, + SrcReg = fastEmit_r(SrcVT.getSimpleVT(), DstVT.getSimpleVT(), Op, SrcReg, /*TODO: Kill=*/false); } @@ -1107,7 +1066,7 @@ bool X86FastISel::X86SelectLoad(const Instruction *I) { if (!X86FastEmitLoad(VT, AM, createMachineMemOperandFor(LI), ResultReg)) return false; - UpdateValueMap(I, ResultReg); + updateValueMap(I, ResultReg); return true; } @@ -1197,7 +1156,7 @@ bool X86FastISel::X86SelectCmp(const Instruction *I) { ResultReg = createResultReg(&X86::GR32RegClass); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOV32r0), ResultReg); - ResultReg = FastEmitInst_extractsubreg(MVT::i8, ResultReg, /*Kill=*/true, + ResultReg = fastEmitInst_extractsubreg(MVT::i8, ResultReg, /*Kill=*/true, X86::sub_8bit); if (!ResultReg) return false; @@ -1212,7 +1171,7 @@ bool X86FastISel::X86SelectCmp(const Instruction *I) { } if (ResultReg) { - UpdateValueMap(I, ResultReg); + updateValueMap(I, ResultReg); return true; } @@ -1253,7 +1212,7 @@ bool X86FastISel::X86SelectCmp(const Instruction *I) { FlagReg2); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SETFOpc[2]), ResultReg).addReg(FlagReg1).addReg(FlagReg2); - UpdateValueMap(I, ResultReg); + updateValueMap(I, ResultReg); return true; } @@ -1271,7 +1230,7 @@ bool X86FastISel::X86SelectCmp(const Instruction *I) { return false; BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg); - UpdateValueMap(I, ResultReg); + updateValueMap(I, ResultReg); return true; } @@ -1288,7 +1247,7 @@ bool X86FastISel::X86SelectZExt(const Instruction *I) { MVT SrcVT = TLI.getSimpleValueType(I->getOperand(0)->getType()); if (SrcVT.SimpleTy == MVT::i1) { // Set the high bits to zero. 
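// A tiny sketch of the "set the high bits to zero" step below: the i1 lives
// in the low bit of an 8-bit register, so zero-extending it amounts to
// masking with 1 (essentially what fastEmitZExtFromI1 boils down to) before
// any wider zero-extension.
static unsigned char zeroExtendFromI1(unsigned char Reg) {
  return Reg & 1;
}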
- ResultReg = FastEmitZExtFromI1(MVT::i8, ResultReg, /*TODO: Kill=*/false); + ResultReg = fastEmitZExtFromI1(MVT::i8, ResultReg, /*TODO: Kill=*/false); SrcVT = MVT::i8; if (ResultReg == 0) @@ -1315,13 +1274,13 @@ bool X86FastISel::X86SelectZExt(const Instruction *I) { ResultReg) .addImm(0).addReg(Result32).addImm(X86::sub_32bit); } else if (DstVT != MVT::i8) { - ResultReg = FastEmit_r(MVT::i8, DstVT.getSimpleVT(), ISD::ZERO_EXTEND, + ResultReg = fastEmit_r(MVT::i8, DstVT.getSimpleVT(), ISD::ZERO_EXTEND, ResultReg, /*Kill=*/true); if (ResultReg == 0) return false; } - UpdateValueMap(I, ResultReg); + updateValueMap(I, ResultReg); return true; } @@ -1345,8 +1304,8 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) { CmpInst::Predicate Predicate = optimizeCmpPredicate(CI); switch (Predicate) { default: break; - case CmpInst::FCMP_FALSE: FastEmitBranch(FalseMBB, DbgLoc); return true; - case CmpInst::FCMP_TRUE: FastEmitBranch(TrueMBB, DbgLoc); return true; + case CmpInst::FCMP_FALSE: fastEmitBranch(FalseMBB, DbgLoc); return true; + case CmpInst::FCMP_TRUE: fastEmitBranch(TrueMBB, DbgLoc); return true; } const Value *CmpLHS = CI->getOperand(0); @@ -1416,7 +1375,7 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) { // Emits an unconditional branch to the FalseBB, obtains the branch // weight, and adds it to the successor list. - FastEmitBranch(FalseMBB, DbgLoc); + fastEmitBranch(FalseMBB, DbgLoc); return true; } @@ -1448,7 +1407,7 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(JmpOpc)) .addMBB(TrueMBB); - FastEmitBranch(FalseMBB, DbgLoc); + fastEmitBranch(FalseMBB, DbgLoc); uint32_t BranchWeight = 0; if (FuncInfo.BPI) BranchWeight = FuncInfo.BPI->getEdgeWeight(BI->getParent(), @@ -1468,7 +1427,7 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(BranchOpc)) .addMBB(TrueMBB); - FastEmitBranch(FalseMBB, DbgLoc); + fastEmitBranch(FalseMBB, DbgLoc); uint32_t BranchWeight = 0; if (FuncInfo.BPI) BranchWeight = FuncInfo.BPI->getEdgeWeight(BI->getParent(), @@ -1487,7 +1446,7 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) { .addReg(OpReg).addImm(1); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::JNE_4)) .addMBB(TrueMBB); - FastEmitBranch(FalseMBB, DbgLoc); + fastEmitBranch(FalseMBB, DbgLoc); uint32_t BranchWeight = 0; if (FuncInfo.BPI) BranchWeight = FuncInfo.BPI->getEdgeWeight(BI->getParent(), @@ -1561,7 +1520,7 @@ bool X86FastISel::X86SelectShift(const Instruction *I) { unsigned ResultReg = createResultReg(RC); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(OpReg), ResultReg) .addReg(Op0Reg); - UpdateValueMap(I, ResultReg); + updateValueMap(I, ResultReg); return true; } @@ -1715,7 +1674,7 @@ bool X86FastISel::X86SelectDivRem(const Instruction *I) { ResultSuperReg).addReg(SourceSuperReg).addImm(8); // Now reference the 8-bit subreg of the result. - ResultReg = FastEmitInst_extractsubreg(MVT::i8, ResultSuperReg, + ResultReg = fastEmitInst_extractsubreg(MVT::i8, ResultSuperReg, /*Kill=*/true, X86::sub_8bit); } // Copy the result out of the physreg if we haven't already. 
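// A sketch of the i8 remainder handling just above: byte division leaves the
// remainder in AH, and AH cannot be encoded in an instruction that needs a
// REX prefix, so the code copies AX into a fresh 16-bit register, shifts it
// right by 8, and then takes the low 8-bit subregister instead of reading AH
// directly.
static unsigned char remainderFromAX(unsigned AX) {
  return (unsigned char)(AX >> 8); // the AH half, i.e. the remainder
}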
@@ -1724,7 +1683,7 @@ bool X86FastISel::X86SelectDivRem(const Instruction *I) { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Copy), ResultReg) .addReg(OpEntry.DivRemResultReg); } - UpdateValueMap(I, ResultReg); + updateValueMap(I, ResultReg); return true; } @@ -1840,9 +1799,9 @@ bool X86FastISel::X86FastEmitCMoveSelect(MVT RetVT, const Instruction *I) { return false; unsigned Opc = X86::getCMovFromCond(CC, RC->getSize()); - unsigned ResultReg = FastEmitInst_rr(Opc, RC, RHSReg, RHSIsKill, + unsigned ResultReg = fastEmitInst_rr(Opc, RC, RHSReg, RHSIsKill, LHSReg, LHSIsKill); - UpdateValueMap(I, ResultReg); + updateValueMap(I, ResultReg); return true; } @@ -1920,15 +1879,15 @@ bool X86FastISel::X86FastEmitSSESelect(MVT RetVT, const Instruction *I) { return false; const TargetRegisterClass *RC = TLI.getRegClassFor(RetVT); - unsigned CmpReg = FastEmitInst_rri(Opc[0], RC, CmpLHSReg, CmpLHSIsKill, + unsigned CmpReg = fastEmitInst_rri(Opc[0], RC, CmpLHSReg, CmpLHSIsKill, CmpRHSReg, CmpRHSIsKill, CC); - unsigned AndReg = FastEmitInst_rr(Opc[1], RC, CmpReg, /*IsKill=*/false, + unsigned AndReg = fastEmitInst_rr(Opc[1], RC, CmpReg, /*IsKill=*/false, LHSReg, LHSIsKill); - unsigned AndNReg = FastEmitInst_rr(Opc[2], RC, CmpReg, /*IsKill=*/true, + unsigned AndNReg = fastEmitInst_rr(Opc[2], RC, CmpReg, /*IsKill=*/true, RHSReg, RHSIsKill); - unsigned ResultReg = FastEmitInst_rr(Opc[3], RC, AndNReg, /*IsKill=*/true, + unsigned ResultReg = fastEmitInst_rr(Opc[3], RC, AndNReg, /*IsKill=*/true, AndReg, /*IsKill=*/true); - UpdateValueMap(I, ResultReg); + updateValueMap(I, ResultReg); return true; } @@ -1991,8 +1950,8 @@ bool X86FastISel::X86FastEmitPseudoSelect(MVT RetVT, const Instruction *I) { const TargetRegisterClass *RC = TLI.getRegClassFor(RetVT); unsigned ResultReg = - FastEmitInst_rri(Opc, RC, RHSReg, RHSIsKill, LHSReg, LHSIsKill, CC); - UpdateValueMap(I, ResultReg); + fastEmitInst_rri(Opc, RC, RHSReg, RHSIsKill, LHSReg, LHSIsKill, CC); + updateValueMap(I, ResultReg); return true; } @@ -2021,7 +1980,7 @@ bool X86FastISel::X86SelectSelect(const Instruction *I) { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::COPY), ResultReg) .addReg(OpReg, getKillRegState(OpIsKill)); - UpdateValueMap(I, ResultReg); + updateValueMap(I, ResultReg); return true; } } @@ -2054,7 +2013,7 @@ bool X86FastISel::X86SelectFPExt(const Instruction *I) { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::CVTSS2SDrr), ResultReg) .addReg(OpReg); - UpdateValueMap(I, ResultReg); + updateValueMap(I, ResultReg); return true; } } @@ -2073,7 +2032,7 @@ bool X86FastISel::X86SelectFPTrunc(const Instruction *I) { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::CVTSD2SSrr), ResultReg) .addReg(OpReg); - UpdateValueMap(I, ResultReg); + updateValueMap(I, ResultReg); return true; } } @@ -2099,7 +2058,7 @@ bool X86FastISel::X86SelectTrunc(const Instruction *I) { if (SrcVT == MVT::i8) { // Truncate from i8 to i1; no code needed. - UpdateValueMap(I, InputReg); + updateValueMap(I, InputReg); return true; } @@ -2116,13 +2075,13 @@ bool X86FastISel::X86SelectTrunc(const Instruction *I) { } // Issue an extract_subreg. 
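// A sketch of what the extract_subreg below means for a truncate to i8: the
// value already sits in a wider general-purpose register, so the truncation
// is just a re-labelling of its low byte as the sub_8bit subregister; no
// extra instruction is needed for the bits themselves.
static unsigned char truncateToI8(unsigned Reg) {
  return (unsigned char)(Reg & 0xFF);
}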
- unsigned ResultReg = FastEmitInst_extractsubreg(MVT::i8, + unsigned ResultReg = fastEmitInst_extractsubreg(MVT::i8, InputReg, /*Kill=*/true, X86::sub_8bit); if (!ResultReg) return false; - UpdateValueMap(I, ResultReg); + updateValueMap(I, ResultReg); return true; } @@ -2166,24 +2125,12 @@ bool X86FastISel::TryEmitSmallMemcpy(X86AddressMode DestAM, return true; } -static bool isCommutativeIntrinsic(IntrinsicInst const &I) { - switch (I.getIntrinsicID()) { - case Intrinsic::sadd_with_overflow: - case Intrinsic::uadd_with_overflow: - case Intrinsic::smul_with_overflow: - case Intrinsic::umul_with_overflow: - return true; - default: - return false; - } -} - -bool X86FastISel::X86VisitIntrinsicCall(const IntrinsicInst &I) { +bool X86FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) { // FIXME: Handle more intrinsics. - switch (I.getIntrinsicID()) { + switch (II->getIntrinsicID()) { default: return false; case Intrinsic::frameaddress: { - Type *RetTy = I.getCalledFunction()->getReturnType(); + Type *RetTy = II->getCalledFunction()->getReturnType(); MVT VT; if (!isTypeLegal(RetTy, VT)) @@ -2203,8 +2150,8 @@ bool X86FastISel::X86VisitIntrinsicCall(const IntrinsicInst &I) { MachineFrameInfo *MFI = FuncInfo.MF->getFrameInfo(); MFI->setFrameAddressIsTaken(true); - const X86RegisterInfo *RegInfo = - static_cast<const X86RegisterInfo*>(TM.getRegisterInfo()); + const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>( + TM.getSubtargetImpl()->getRegisterInfo()); unsigned FrameReg = RegInfo->getFrameRegister(*(FuncInfo.MF)); assert(((FrameReg == X86::RBP && VT == MVT::i64) || (FrameReg == X86::EBP && VT == MVT::i32)) && @@ -2223,7 +2170,7 @@ bool X86FastISel::X86VisitIntrinsicCall(const IntrinsicInst &I) { // movq (%rax), %rax // ... unsigned DestReg; - unsigned Depth = cast<ConstantInt>(I.getOperand(0))->getZExtValue(); + unsigned Depth = cast<ConstantInt>(II->getOperand(0))->getZExtValue(); while (Depth--) { DestReg = createResultReg(RC); addDirectMem(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, @@ -2231,23 +2178,23 @@ bool X86FastISel::X86VisitIntrinsicCall(const IntrinsicInst &I) { SrcReg = DestReg; } - UpdateValueMap(&I, SrcReg); + updateValueMap(II, SrcReg); return true; } case Intrinsic::memcpy: { - const MemCpyInst &MCI = cast<MemCpyInst>(I); + const MemCpyInst *MCI = cast<MemCpyInst>(II); // Don't handle volatile or variable length memcpys. - if (MCI.isVolatile()) + if (MCI->isVolatile()) return false; - if (isa<ConstantInt>(MCI.getLength())) { + if (isa<ConstantInt>(MCI->getLength())) { // Small memcpy's are common enough that we want to do them // without a call if possible. - uint64_t Len = cast<ConstantInt>(MCI.getLength())->getZExtValue(); + uint64_t Len = cast<ConstantInt>(MCI->getLength())->getZExtValue(); if (IsMemcpySmall(Len)) { X86AddressMode DestAM, SrcAM; - if (!X86SelectAddress(MCI.getRawDest(), DestAM) || - !X86SelectAddress(MCI.getRawSource(), SrcAM)) + if (!X86SelectAddress(MCI->getRawDest(), DestAM) || + !X86SelectAddress(MCI->getRawSource(), SrcAM)) return false; TryEmitSmallMemcpy(DestAM, SrcAM, Len); return true; @@ -2255,35 +2202,35 @@ bool X86FastISel::X86VisitIntrinsicCall(const IntrinsicInst &I) { } unsigned SizeWidth = Subtarget->is64Bit() ? 
64 : 32; - if (!MCI.getLength()->getType()->isIntegerTy(SizeWidth)) + if (!MCI->getLength()->getType()->isIntegerTy(SizeWidth)) return false; - if (MCI.getSourceAddressSpace() > 255 || MCI.getDestAddressSpace() > 255) + if (MCI->getSourceAddressSpace() > 255 || MCI->getDestAddressSpace() > 255) return false; - return DoSelectCall(&I, "memcpy"); + return lowerCallTo(II, "memcpy", II->getNumArgOperands() - 2); } case Intrinsic::memset: { - const MemSetInst &MSI = cast<MemSetInst>(I); + const MemSetInst *MSI = cast<MemSetInst>(II); - if (MSI.isVolatile()) + if (MSI->isVolatile()) return false; unsigned SizeWidth = Subtarget->is64Bit() ? 64 : 32; - if (!MSI.getLength()->getType()->isIntegerTy(SizeWidth)) + if (!MSI->getLength()->getType()->isIntegerTy(SizeWidth)) return false; - if (MSI.getDestAddressSpace() > 255) + if (MSI->getDestAddressSpace() > 255) return false; - return DoSelectCall(&I, "memset"); + return lowerCallTo(II, "memset", II->getNumArgOperands() - 2); } case Intrinsic::stackprotector: { // Emit code to store the stack guard onto the stack. EVT PtrTy = TLI.getPointerTy(); - const Value *Op1 = I.getArgOperand(0); // The guard's value. - const AllocaInst *Slot = cast<AllocaInst>(I.getArgOperand(1)); + const Value *Op1 = II->getArgOperand(0); // The guard's value. + const AllocaInst *Slot = cast<AllocaInst>(II->getArgOperand(1)); MFI.setStackProtectorIndex(FuncInfo.StaticAllocaMap[Slot]); @@ -2294,7 +2241,7 @@ bool X86FastISel::X86VisitIntrinsicCall(const IntrinsicInst &I) { return true; } case Intrinsic::dbg_declare: { - const DbgDeclareInst *DI = cast<DbgDeclareInst>(&I); + const DbgDeclareInst *DI = cast<DbgDeclareInst>(II); X86AddressMode AM; assert(DI->getAddress() && "Null address should be checked earlier!"); if (!X86SelectAddress(DI->getAddress(), AM)) @@ -2302,8 +2249,10 @@ bool X86FastISel::X86VisitIntrinsicCall(const IntrinsicInst &I) { const MCInstrDesc &II = TII.get(TargetOpcode::DBG_VALUE); // FIXME may need to add RegState::Debug to any registers produced, // although ESP/EBP should be the only ones at the moment. - addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II), AM). - addImm(0).addMetadata(DI->getVariable()); + addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II), AM) + .addImm(0) + .addMetadata(DI->getVariable()) + .addMetadata(DI->getExpression()); return true; } case Intrinsic::trap: { @@ -2314,13 +2263,13 @@ bool X86FastISel::X86VisitIntrinsicCall(const IntrinsicInst &I) { if (!Subtarget->hasSSE1()) return false; - Type *RetTy = I.getCalledFunction()->getReturnType(); + Type *RetTy = II->getCalledFunction()->getReturnType(); MVT VT; if (!isTypeLegal(RetTy, VT)) return false; - // Unfortunately we can't use FastEmit_r, because the AVX version of FSQRT + // Unfortunately we can't use fastEmit_r, because the AVX version of FSQRT // is not generated by FastISel yet. // FIXME: Update this code once tablegen can handle it. 
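// A hypothetical helper mirroring the table lookup that follows: the first
// index of SqrtOpc picks the element type (f32 in row 0, f64 in row 1) and
// the second picks the SSE or AVX form of the scalar square-root opcode.
static unsigned pickSqrtOpcode(const unsigned Table[2][2], bool IsF64,
                               bool HasAVX) {
  return Table[IsF64][HasAVX];
}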
static const unsigned SqrtOpc[2][2] = { @@ -2336,7 +2285,7 @@ bool X86FastISel::X86VisitIntrinsicCall(const IntrinsicInst &I) { case MVT::f64: Opc = SqrtOpc[1][HasAVX]; RC = &X86::FR64RegClass; break; } - const Value *SrcVal = I.getArgOperand(0); + const Value *SrcVal = II->getArgOperand(0); unsigned SrcReg = getRegForValue(SrcVal); if (SrcReg == 0) @@ -2359,7 +2308,7 @@ bool X86FastISel::X86VisitIntrinsicCall(const IntrinsicInst &I) { MIB.addReg(SrcReg); - UpdateValueMap(&I, ResultReg); + updateValueMap(II, ResultReg); return true; } case Intrinsic::sadd_with_overflow: @@ -2370,7 +2319,7 @@ bool X86FastISel::X86VisitIntrinsicCall(const IntrinsicInst &I) { case Intrinsic::umul_with_overflow: { // This implements the basic lowering of the xalu with overflow intrinsics // into add/sub/mul followed by either seto or setb. - const Function *Callee = I.getCalledFunction(); + const Function *Callee = II->getCalledFunction(); auto *Ty = cast<StructType>(Callee->getReturnType()); Type *RetTy = Ty->getTypeAtIndex(0U); Type *CondTy = Ty->getTypeAtIndex(1); @@ -2382,23 +2331,31 @@ bool X86FastISel::X86VisitIntrinsicCall(const IntrinsicInst &I) { if (VT < MVT::i8 || VT > MVT::i64) return false; - const Value *LHS = I.getArgOperand(0); - const Value *RHS = I.getArgOperand(1); + const Value *LHS = II->getArgOperand(0); + const Value *RHS = II->getArgOperand(1); // Canonicalize immediate to the RHS. if (isa<ConstantInt>(LHS) && !isa<ConstantInt>(RHS) && - isCommutativeIntrinsic(I)) + isCommutativeIntrinsic(II)) std::swap(LHS, RHS); + bool UseIncDec = false; + if (isa<ConstantInt>(RHS) && cast<ConstantInt>(RHS)->isOne()) + UseIncDec = true; + unsigned BaseOpc, CondOpc; - switch (I.getIntrinsicID()) { + switch (II->getIntrinsicID()) { default: llvm_unreachable("Unexpected intrinsic!"); case Intrinsic::sadd_with_overflow: - BaseOpc = ISD::ADD; CondOpc = X86::SETOr; break; + BaseOpc = UseIncDec ? unsigned(X86ISD::INC) : unsigned(ISD::ADD); + CondOpc = X86::SETOr; + break; case Intrinsic::uadd_with_overflow: BaseOpc = ISD::ADD; CondOpc = X86::SETBr; break; case Intrinsic::ssub_with_overflow: - BaseOpc = ISD::SUB; CondOpc = X86::SETOr; break; + BaseOpc = UseIncDec ? unsigned(X86ISD::DEC) : unsigned(ISD::SUB); + CondOpc = X86::SETOr; + break; case Intrinsic::usub_with_overflow: BaseOpc = ISD::SUB; CondOpc = X86::SETBr; break; case Intrinsic::smul_with_overflow: @@ -2414,9 +2371,24 @@ bool X86FastISel::X86VisitIntrinsicCall(const IntrinsicInst &I) { unsigned ResultReg = 0; // Check if we have an immediate version. 
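// A small model of the overflow lowering described above: the arithmetic
// result comes from a plain ADD/SUB/MUL (or INC/DEC when one operand is the
// constant 1), and the overflow bit is then read back from EFLAGS with SETO
// for the signed intrinsics or SETB for the unsigned ones. For unsigned add,
// SETB reports exactly the carry-out:
static bool uaddWithOverflow(unsigned LHS, unsigned RHS, unsigned &Sum) {
  Sum = LHS + RHS;  // wraps modulo 2^N, like the ADD instruction
  return Sum < LHS; // carry out, i.e. what SETB observes after the ADD
}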
- if (auto const *C = dyn_cast<ConstantInt>(RHS)) { - ResultReg = FastEmit_ri(VT, VT, BaseOpc, LHSReg, LHSIsKill, - C->getZExtValue()); + if (const auto *CI = dyn_cast<ConstantInt>(RHS)) { + static const unsigned Opc[2][2][4] = { + { { X86::INC8r, X86::INC16r, X86::INC32r, X86::INC64r }, + { X86::DEC8r, X86::DEC16r, X86::DEC32r, X86::DEC64r } }, + { { X86::INC8r, X86::INC64_16r, X86::INC64_32r, X86::INC64r }, + { X86::DEC8r, X86::DEC64_16r, X86::DEC64_32r, X86::DEC64r } } + }; + + if (BaseOpc == X86ISD::INC || BaseOpc == X86ISD::DEC) { + ResultReg = createResultReg(TLI.getRegClassFor(VT)); + bool Is64Bit = Subtarget->is64Bit(); + bool IsDec = BaseOpc == X86ISD::DEC; + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(Opc[Is64Bit][IsDec][VT.SimpleTy-MVT::i8]), ResultReg) + .addReg(LHSReg, getKillRegState(LHSIsKill)); + } else + ResultReg = fastEmit_ri(VT, VT, BaseOpc, LHSReg, LHSIsKill, + CI->getZExtValue()); } unsigned RHSReg; @@ -2426,7 +2398,7 @@ bool X86FastISel::X86VisitIntrinsicCall(const IntrinsicInst &I) { if (RHSReg == 0) return false; RHSIsKill = hasTrivialKill(RHS); - ResultReg = FastEmit_rr(VT, VT, BaseOpc, LHSReg, LHSIsKill, RHSReg, + ResultReg = fastEmit_rr(VT, VT, BaseOpc, LHSReg, LHSIsKill, RHSReg, RHSIsKill); } @@ -2441,7 +2413,7 @@ bool X86FastISel::X86VisitIntrinsicCall(const IntrinsicInst &I) { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::COPY), Reg[VT.SimpleTy-MVT::i8]) .addReg(LHSReg, getKillRegState(LHSIsKill)); - ResultReg = FastEmitInst_r(MULOpc[VT.SimpleTy-MVT::i8], + ResultReg = fastEmitInst_r(MULOpc[VT.SimpleTy-MVT::i8], TLI.getRegClassFor(VT), RHSReg, RHSIsKill); } else if (BaseOpc == X86ISD::SMUL && !ResultReg) { static const unsigned MULOpc[] = @@ -2452,10 +2424,10 @@ bool X86FastISel::X86VisitIntrinsicCall(const IntrinsicInst &I) { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::COPY), X86::AL) .addReg(LHSReg, getKillRegState(LHSIsKill)); - ResultReg = FastEmitInst_r(MULOpc[0], TLI.getRegClassFor(VT), RHSReg, + ResultReg = fastEmitInst_r(MULOpc[0], TLI.getRegClassFor(VT), RHSReg, RHSIsKill); } else - ResultReg = FastEmitInst_rr(MULOpc[VT.SimpleTy-MVT::i8], + ResultReg = fastEmitInst_rr(MULOpc[VT.SimpleTy-MVT::i8], TLI.getRegClassFor(VT), LHSReg, LHSIsKill, RHSReg, RHSIsKill); } @@ -2468,7 +2440,7 @@ bool X86FastISel::X86VisitIntrinsicCall(const IntrinsicInst &I) { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CondOpc), ResultReg2); - UpdateValueMap(&I, ResultReg, 2); + updateValueMap(II, ResultReg, 2); return true; } case Intrinsic::x86_sse_cvttss2si: @@ -2476,7 +2448,7 @@ bool X86FastISel::X86VisitIntrinsicCall(const IntrinsicInst &I) { case Intrinsic::x86_sse2_cvttsd2si: case Intrinsic::x86_sse2_cvttsd2si64: { bool IsInputDouble; - switch (I.getIntrinsicID()) { + switch (II->getIntrinsicID()) { default: llvm_unreachable("Unexpected intrinsic."); case Intrinsic::x86_sse_cvttss2si: case Intrinsic::x86_sse_cvttss2si64: @@ -2492,7 +2464,7 @@ bool X86FastISel::X86VisitIntrinsicCall(const IntrinsicInst &I) { break; } - Type *RetTy = I.getCalledFunction()->getReturnType(); + Type *RetTy = II->getCalledFunction()->getReturnType(); MVT VT; if (!isTypeLegal(RetTy, VT)) return false; @@ -2512,7 +2484,7 @@ bool X86FastISel::X86VisitIntrinsicCall(const IntrinsicInst &I) { } // Check if we can fold insertelement instructions into the convert. 
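// A scalar model of what the cvttss2si/cvttsd2si intrinsics handled here
// compute (ignoring the out-of-range cases): a float-to-integer conversion
// that truncates toward zero and only looks at lane 0 of the vector operand,
// which is why the code below looks through insertelement chains for the
// scalar that ends up in lane 0.
static int cvttssToInt(float X) {
  return (int)X; // C++ float-to-int conversion also truncates toward zero
}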
- const Value *Op = I.getArgOperand(0); + const Value *Op = II->getArgOperand(0); while (auto *IE = dyn_cast<InsertElementInst>(Op)) { const Value *Index = IE->getOperand(2); if (!isa<ConstantInt>(Index)) @@ -2534,13 +2506,13 @@ bool X86FastISel::X86VisitIntrinsicCall(const IntrinsicInst &I) { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg) .addReg(Reg); - UpdateValueMap(&I, ResultReg); + updateValueMap(II, ResultReg); return true; } } } -bool X86FastISel::FastLowerArguments() { +bool X86FastISel::fastLowerArguments() { if (!FuncInfo.CanLowerReturn) return false; @@ -2630,58 +2602,57 @@ bool X86FastISel::FastLowerArguments() { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::COPY), ResultReg) .addReg(DstReg, getKillRegState(true)); - UpdateValueMap(&Arg, ResultReg); + updateValueMap(&Arg, ResultReg); } return true; } -bool X86FastISel::X86SelectCall(const Instruction *I) { - const CallInst *CI = cast<CallInst>(I); - const Value *Callee = CI->getCalledValue(); - - // Can't handle inline asm yet. - if (isa<InlineAsm>(Callee)) - return false; - - // Handle intrinsic calls. - if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI)) - return X86VisitIntrinsicCall(*II); - - // Allow SelectionDAG isel to handle tail calls. - if (cast<CallInst>(I)->isTailCall()) - return false; - - return DoSelectCall(I, nullptr); -} - -static unsigned computeBytesPoppedByCallee(const X86Subtarget &Subtarget, - const ImmutableCallSite &CS) { - if (Subtarget.is64Bit()) +static unsigned computeBytesPoppedByCallee(const X86Subtarget *Subtarget, + CallingConv::ID CC, + ImmutableCallSite *CS) { + if (Subtarget->is64Bit()) return 0; - if (Subtarget.getTargetTriple().isOSMSVCRT()) + if (Subtarget->getTargetTriple().isOSMSVCRT()) return 0; - CallingConv::ID CC = CS.getCallingConv(); - if (CC == CallingConv::Fast || CC == CallingConv::GHC) + if (CC == CallingConv::Fast || CC == CallingConv::GHC || + CC == CallingConv::HiPE) return 0; - if (!CS.paramHasAttr(1, Attribute::StructRet)) + if (CS && !CS->paramHasAttr(1, Attribute::StructRet)) return 0; - if (CS.paramHasAttr(1, Attribute::InReg)) + if (CS && CS->paramHasAttr(1, Attribute::InReg)) return 0; return 4; } -// Select either a call, or an llvm.memcpy/memmove/memset intrinsic -bool X86FastISel::DoSelectCall(const Instruction *I, const char *MemIntName) { - const CallInst *CI = cast<CallInst>(I); - const Value *Callee = CI->getCalledValue(); - - // Handle only C and fastcc calling conventions for now. - ImmutableCallSite CS(CI); - CallingConv::ID CC = CS.getCallingConv(); - bool isWin64 = Subtarget->isCallingConvWin64(CC); - if (CC != CallingConv::C && CC != CallingConv::Fast && - CC != CallingConv::X86_FastCall && CC != CallingConv::X86_64_Win64 && - CC != CallingConv::X86_64_SysV) +bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) { + auto &OutVals = CLI.OutVals; + auto &OutFlags = CLI.OutFlags; + auto &OutRegs = CLI.OutRegs; + auto &Ins = CLI.Ins; + auto &InRegs = CLI.InRegs; + CallingConv::ID CC = CLI.CallConv; + bool &IsTailCall = CLI.IsTailCall; + bool IsVarArg = CLI.IsVarArg; + const Value *Callee = CLI.Callee; + const char *SymName = CLI.SymName; + + bool Is64Bit = Subtarget->is64Bit(); + bool IsWin64 = Subtarget->isCallingConvWin64(CC); + + // Handle only C, fastcc, and webkit_js calling conventions for now. 
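// A simplified, hypothetical version of the rule in computeBytesPoppedByCallee
// above, assuming the 64-bit, MSVCRT and fastcc/GHC/HiPE early-outs have
// already been taken: only a 32-bit callee with an sret pointer passed on the
// stack (not inreg) pops anything, and what it pops is that 4-byte pointer.
static unsigned bytesPoppedByCallee(bool HasStackSRet) {
  return HasStackSRet ? 4 : 0;
}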
+ switch (CC) { + default: return false; + case CallingConv::C: + case CallingConv::Fast: + case CallingConv::WebKit_JS: + case CallingConv::X86_FastCall: + case CallingConv::X86_64_Win64: + case CallingConv::X86_64_SysV: + break; + } + + // Allow SelectionDAG isel to handle tail calls. + if (IsTailCall) return false; // fastcc with -tailcallopt is intended to provide a guaranteed @@ -2689,150 +2660,77 @@ bool X86FastISel::DoSelectCall(const Instruction *I, const char *MemIntName) { if (CC == CallingConv::Fast && TM.Options.GuaranteedTailCallOpt) return false; - PointerType *PT = cast<PointerType>(CS.getCalledValue()->getType()); - FunctionType *FTy = cast<FunctionType>(PT->getElementType()); - bool isVarArg = FTy->isVarArg(); - // Don't know how to handle Win64 varargs yet. Nothing special needed for - // x86-32. Special handling for x86-64 is implemented. - if (isVarArg && isWin64) + // x86-32. Special handling for x86-64 is implemented. + if (IsVarArg && IsWin64) return false; // Don't know about inalloca yet. - if (CS.hasInAllocaArgument()) + if (CLI.CS && CLI.CS->hasInAllocaArgument()) return false; // Fast-isel doesn't know about callee-pop yet. - if (X86::isCalleePop(CC, Subtarget->is64Bit(), isVarArg, + if (X86::isCalleePop(CC, Subtarget->is64Bit(), IsVarArg, TM.Options.GuaranteedTailCallOpt)) return false; - // Check whether the function can return without sret-demotion. - SmallVector<ISD::OutputArg, 4> Outs; - GetReturnInfo(I->getType(), CS.getAttributes(), Outs, TLI); - bool CanLowerReturn = TLI.CanLowerReturn(CS.getCallingConv(), - *FuncInfo.MF, FTy->isVarArg(), - Outs, FTy->getContext()); - if (!CanLowerReturn) - return false; - - // Materialize callee address in a register. FIXME: GV address can be - // handled with a CALLpcrel32 instead. - X86AddressMode CalleeAM; - if (!X86SelectCallAddress(Callee, CalleeAM)) - return false; - unsigned CalleeOp = 0; - const GlobalValue *GV = nullptr; - if (CalleeAM.GV != nullptr) { - GV = CalleeAM.GV; - } else if (CalleeAM.Base.Reg != 0) { - CalleeOp = CalleeAM.Base.Reg; - } else - return false; - - // Deal with call operands first. - SmallVector<const Value *, 8> ArgVals; - SmallVector<unsigned, 8> Args; - SmallVector<MVT, 8> ArgVTs; - SmallVector<ISD::ArgFlagsTy, 8> ArgFlags; - unsigned arg_size = CS.arg_size(); - Args.reserve(arg_size); - ArgVals.reserve(arg_size); - ArgVTs.reserve(arg_size); - ArgFlags.reserve(arg_size); - for (ImmutableCallSite::arg_iterator i = CS.arg_begin(), e = CS.arg_end(); - i != e; ++i) { - // If we're lowering a mem intrinsic instead of a regular call, skip the - // last two arguments, which should not passed to the underlying functions. 
- if (MemIntName && e-i <= 2) - break; - Value *ArgVal = *i; - ISD::ArgFlagsTy Flags; - unsigned AttrInd = i - CS.arg_begin() + 1; - if (CS.paramHasAttr(AttrInd, Attribute::SExt)) - Flags.setSExt(); - if (CS.paramHasAttr(AttrInd, Attribute::ZExt)) - Flags.setZExt(); - - if (CS.paramHasAttr(AttrInd, Attribute::ByVal)) { - PointerType *Ty = cast<PointerType>(ArgVal->getType()); - Type *ElementTy = Ty->getElementType(); - unsigned FrameSize = DL.getTypeAllocSize(ElementTy); - unsigned FrameAlign = CS.getParamAlignment(AttrInd); - if (!FrameAlign) - FrameAlign = TLI.getByValTypeAlignment(ElementTy); - Flags.setByVal(); - Flags.setByValSize(FrameSize); - Flags.setByValAlign(FrameAlign); - if (!IsMemcpySmall(FrameSize)) - return false; - } - - if (CS.paramHasAttr(AttrInd, Attribute::InReg)) - Flags.setInReg(); - if (CS.paramHasAttr(AttrInd, Attribute::Nest)) - Flags.setNest(); - - // If this is an i1/i8/i16 argument, promote to i32 to avoid an extra - // instruction. This is safe because it is common to all fastisel supported - // calling conventions on x86. - if (ConstantInt *CI = dyn_cast<ConstantInt>(ArgVal)) { - if (CI->getBitWidth() == 1 || CI->getBitWidth() == 8 || - CI->getBitWidth() == 16) { + // If this is a constant i1/i8/i16 argument, promote to i32 to avoid an extra + // instruction. This is safe because it is common to all FastISel supported + // calling conventions on x86. + for (int i = 0, e = OutVals.size(); i != e; ++i) { + Value *&Val = OutVals[i]; + ISD::ArgFlagsTy Flags = OutFlags[i]; + if (auto *CI = dyn_cast<ConstantInt>(Val)) { + if (CI->getBitWidth() < 32) { if (Flags.isSExt()) - ArgVal = ConstantExpr::getSExt(CI,Type::getInt32Ty(CI->getContext())); + Val = ConstantExpr::getSExt(CI, Type::getInt32Ty(CI->getContext())); else - ArgVal = ConstantExpr::getZExt(CI,Type::getInt32Ty(CI->getContext())); + Val = ConstantExpr::getZExt(CI, Type::getInt32Ty(CI->getContext())); } } - unsigned ArgReg; - // Passing bools around ends up doing a trunc to i1 and passing it. // Codegen this as an argument + "and 1". - if (ArgVal->getType()->isIntegerTy(1) && isa<TruncInst>(ArgVal) && - cast<TruncInst>(ArgVal)->getParent() == I->getParent() && - ArgVal->hasOneUse()) { - ArgVal = cast<TruncInst>(ArgVal)->getOperand(0); - ArgReg = getRegForValue(ArgVal); - if (ArgReg == 0) return false; - - MVT ArgVT; - if (!isTypeLegal(ArgVal->getType(), ArgVT)) return false; - - ArgReg = FastEmit_ri(ArgVT, ArgVT, ISD::AND, ArgReg, - ArgVal->hasOneUse(), 1); - } else { - ArgReg = getRegForValue(ArgVal); - } + if (auto *TI = dyn_cast<TruncInst>(Val)) { + if (TI->getType()->isIntegerTy(1) && CLI.CS && + (TI->getParent() == CLI.CS->getInstruction()->getParent()) && + TI->hasOneUse()) { + Val = cast<TruncInst>(Val)->getOperand(0); + unsigned ResultReg = getRegForValue(Val); + + if (!ResultReg) + return false; - if (ArgReg == 0) return false; + MVT ArgVT; + if (!isTypeLegal(Val->getType(), ArgVT)) + return false; - Type *ArgTy = ArgVal->getType(); - MVT ArgVT; - if (!isTypeLegal(ArgTy, ArgVT)) - return false; - if (ArgVT == MVT::x86mmx) - return false; - unsigned OriginalAlignment = DL.getABITypeAlignment(ArgTy); - Flags.setOrigAlign(OriginalAlignment); + ResultReg = + fastEmit_ri(ArgVT, ArgVT, ISD::AND, ResultReg, Val->hasOneUse(), 1); - Args.push_back(ArgReg); - ArgVals.push_back(ArgVal); - ArgVTs.push_back(ArgVT); - ArgFlags.push_back(Flags); + if (!ResultReg) + return false; + updateValueMap(Val, ResultReg); + } + } } // Analyze operands of the call, assigning locations to each operand. 
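// A sketch of the bool-argument codegen described above: when the i1 comes
// from a trunc in the same block, the original wider register is passed and
// masked with 1, so the callee only ever sees the low bit.
static unsigned char passBoolFromTrunc(unsigned WideValue) {
  return (unsigned char)(WideValue & 1); // the "argument + and 1" pattern
}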
SmallVector<CCValAssign, 16> ArgLocs; - CCState CCInfo(CC, isVarArg, *FuncInfo.MF, TM, ArgLocs, - I->getParent()->getContext()); + CCState CCInfo(CC, IsVarArg, *FuncInfo.MF, ArgLocs, CLI.RetTy->getContext()); // Allocate shadow area for Win64 - if (isWin64) + if (IsWin64) CCInfo.AllocateStack(32, 8); - CCInfo.AnalyzeCallOperands(ArgVTs, ArgFlags, CC_X86); + SmallVector<MVT, 16> OutVTs; + for (auto *Val : OutVals) { + MVT VT; + if (!isTypeLegal(Val->getType(), VT)) + return false; + OutVTs.push_back(VT); + } + CCInfo.AnalyzeCallOperands(OutVTs, OutFlags, CC_X86); // Get a count of how many bytes are to be pushed on the stack. unsigned NumBytes = CCInfo.getNextStackOffset(); @@ -2842,13 +2740,20 @@ bool X86FastISel::DoSelectCall(const Instruction *I, const char *MemIntName) { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AdjStackDown)) .addImm(NumBytes); - // Process argument: walk the register/memloc assignments, inserting - // copies / loads. - SmallVector<unsigned, 4> RegArgs; + // Walk the register/memloc assignments, inserting copies/loads. + const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>( + TM.getSubtargetImpl()->getRegisterInfo()); for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { - CCValAssign &VA = ArgLocs[i]; - unsigned Arg = Args[VA.getValNo()]; - EVT ArgVT = ArgVTs[VA.getValNo()]; + CCValAssign const &VA = ArgLocs[i]; + const Value *ArgVal = OutVals[VA.getValNo()]; + MVT ArgVT = OutVTs[VA.getValNo()]; + + if (ArgVT == MVT::x86mmx) + return false; + + unsigned ArgReg = getRegForValue(ArgVal); + if (!ArgReg) + return false; // Promote the value if needed. switch (VA.getLocInfo()) { @@ -2856,8 +2761,8 @@ bool X86FastISel::DoSelectCall(const Instruction *I, const char *MemIntName) { case CCValAssign::SExt: { assert(VA.getLocVT().isInteger() && !VA.getLocVT().isVector() && "Unexpected extend"); - bool Emitted = X86FastEmitExtend(ISD::SIGN_EXTEND, VA.getLocVT(), - Arg, ArgVT, Arg); + bool Emitted = X86FastEmitExtend(ISD::SIGN_EXTEND, VA.getLocVT(), ArgReg, + ArgVT, ArgReg); assert(Emitted && "Failed to emit a sext!"); (void)Emitted; ArgVT = VA.getLocVT(); break; @@ -2865,8 +2770,8 @@ bool X86FastISel::DoSelectCall(const Instruction *I, const char *MemIntName) { case CCValAssign::ZExt: { assert(VA.getLocVT().isInteger() && !VA.getLocVT().isVector() && "Unexpected extend"); - bool Emitted = X86FastEmitExtend(ISD::ZERO_EXTEND, VA.getLocVT(), - Arg, ArgVT, Arg); + bool Emitted = X86FastEmitExtend(ISD::ZERO_EXTEND, VA.getLocVT(), ArgReg, + ArgVT, ArgReg); assert(Emitted && "Failed to emit a zext!"); (void)Emitted; ArgVT = VA.getLocVT(); break; @@ -2874,66 +2779,75 @@ bool X86FastISel::DoSelectCall(const Instruction *I, const char *MemIntName) { case CCValAssign::AExt: { assert(VA.getLocVT().isInteger() && !VA.getLocVT().isVector() && "Unexpected extend"); - bool Emitted = X86FastEmitExtend(ISD::ANY_EXTEND, VA.getLocVT(), - Arg, ArgVT, Arg); + bool Emitted = X86FastEmitExtend(ISD::ANY_EXTEND, VA.getLocVT(), ArgReg, + ArgVT, ArgReg); if (!Emitted) - Emitted = X86FastEmitExtend(ISD::ZERO_EXTEND, VA.getLocVT(), - Arg, ArgVT, Arg); + Emitted = X86FastEmitExtend(ISD::ZERO_EXTEND, VA.getLocVT(), ArgReg, + ArgVT, ArgReg); if (!Emitted) - Emitted = X86FastEmitExtend(ISD::SIGN_EXTEND, VA.getLocVT(), - Arg, ArgVT, Arg); + Emitted = X86FastEmitExtend(ISD::SIGN_EXTEND, VA.getLocVT(), ArgReg, + ArgVT, ArgReg); assert(Emitted && "Failed to emit a aext!"); (void)Emitted; ArgVT = VA.getLocVT(); break; } case CCValAssign::BCvt: { - unsigned BC = 
FastEmit_r(ArgVT.getSimpleVT(), VA.getLocVT(), - ISD::BITCAST, Arg, /*TODO: Kill=*/false); - assert(BC != 0 && "Failed to emit a bitcast!"); - Arg = BC; + ArgReg = fastEmit_r(ArgVT, VA.getLocVT(), ISD::BITCAST, ArgReg, + /*TODO: Kill=*/false); + assert(ArgReg && "Failed to emit a bitcast!"); ArgVT = VA.getLocVT(); break; } - case CCValAssign::VExt: + case CCValAssign::VExt: // VExt has not been implemented, so this should be impossible to reach // for now. However, fallback to Selection DAG isel once implemented. return false; + case CCValAssign::AExtUpper: + case CCValAssign::SExtUpper: + case CCValAssign::ZExtUpper: + case CCValAssign::FPExt: + llvm_unreachable("Unexpected loc info!"); case CCValAssign::Indirect: // FIXME: Indirect doesn't need extending, but fast-isel doesn't fully // support this. return false; - case CCValAssign::FPExt: - llvm_unreachable("Unexpected loc info!"); } if (VA.isRegLoc()) { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, - TII.get(TargetOpcode::COPY), VA.getLocReg()).addReg(Arg); - RegArgs.push_back(VA.getLocReg()); + TII.get(TargetOpcode::COPY), VA.getLocReg()).addReg(ArgReg); + OutRegs.push_back(VA.getLocReg()); } else { + assert(VA.isMemLoc()); + + // Don't emit stores for undef values. + if (isa<UndefValue>(ArgVal)) + continue; + unsigned LocMemOffset = VA.getLocMemOffset(); X86AddressMode AM; - const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo*>( - getTargetMachine()->getRegisterInfo()); AM.Base.Reg = RegInfo->getStackRegister(); AM.Disp = LocMemOffset; - const Value *ArgVal = ArgVals[VA.getValNo()]; - ISD::ArgFlagsTy Flags = ArgFlags[VA.getValNo()]; - + ISD::ArgFlagsTy Flags = OutFlags[VA.getValNo()]; + unsigned Alignment = DL.getABITypeAlignment(ArgVal->getType()); + MachineMemOperand *MMO = FuncInfo.MF->getMachineMemOperand( + MachinePointerInfo::getStack(LocMemOffset), MachineMemOperand::MOStore, + ArgVT.getStoreSize(), Alignment); if (Flags.isByVal()) { X86AddressMode SrcAM; - SrcAM.Base.Reg = Arg; - bool Res = TryEmitSmallMemcpy(AM, SrcAM, Flags.getByValSize()); - assert(Res && "memcpy length already checked!"); (void)Res; + SrcAM.Base.Reg = ArgReg; + if (!TryEmitSmallMemcpy(AM, SrcAM, Flags.getByValSize())) + return false; } else if (isa<ConstantInt>(ArgVal) || isa<ConstantPointerNull>(ArgVal)) { // If this is a really simple value, emit this with the Value* version // of X86FastEmitStore. If it isn't simple, we don't want to do this, // as it can cause us to reevaluate the argument. - if (!X86FastEmitStore(ArgVT, ArgVal, AM)) + if (!X86FastEmitStore(ArgVT, ArgVal, AM, MMO)) return false; } else { - if (!X86FastEmitStore(ArgVT, Arg, /*ValIsKill=*/false, AM)) + bool ValIsKill = hasTrivialKill(ArgVal); + if (!X86FastEmitStore(ArgVT, ArgReg, ValIsKill, AM, MMO)) return false; } } @@ -2947,37 +2861,53 @@ bool X86FastISel::DoSelectCall(const Instruction *I, const char *MemIntName) { TII.get(TargetOpcode::COPY), X86::EBX).addReg(Base); } - if (Subtarget->is64Bit() && isVarArg && !isWin64) { + if (Is64Bit && IsVarArg && !IsWin64) { + // From AMD64 ABI document: + // For calls that may call functions that use varargs or stdargs + // (prototype-less calls or calls to functions containing ellipsis (...) in + // the declaration) %al is used as hidden argument to specify the number + // of SSE registers used. The contents of %al do not need to match exactly + // the number of registers, but must be an ubound on the number of SSE + // registers used and is in the range 0 - 8 inclusive. + // Count the number of XMM registers allocated. 
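// A tiny restatement of the AMD64 vararg rule quoted above: AL must carry an
// upper bound (0-8) on the number of XMM registers used for the call, and the
// code below simply uses the count of XMM argument registers the calling
// convention has already handed out.
static unsigned char alValueForVarargCall(unsigned NumXMMRegsUsed) {
  return (unsigned char)(NumXMMRegsUsed <= 8 ? NumXMMRegsUsed : 8);
}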
static const MCPhysReg XMMArgRegs[] = { X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3, X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7 }; unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8); + assert((Subtarget->hasSSE1() || !NumXMMRegs) + && "SSE registers cannot be used when SSE is disabled"); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOV8ri), X86::AL).addImm(NumXMMRegs); } + // Materialize callee address in a register. FIXME: GV address can be + // handled with a CALLpcrel32 instead. + X86AddressMode CalleeAM; + if (!X86SelectCallAddress(Callee, CalleeAM)) + return false; + + unsigned CalleeOp = 0; + const GlobalValue *GV = nullptr; + if (CalleeAM.GV != nullptr) { + GV = CalleeAM.GV; + } else if (CalleeAM.Base.Reg != 0) { + CalleeOp = CalleeAM.Base.Reg; + } else + return false; + // Issue the call. MachineInstrBuilder MIB; if (CalleeOp) { // Register-indirect call. - unsigned CallOpc; - if (Subtarget->is64Bit()) - CallOpc = X86::CALL64r; - else - CallOpc = X86::CALL32r; + unsigned CallOpc = Is64Bit ? X86::CALL64r : X86::CALL32r; MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CallOpc)) .addReg(CalleeOp); - } else { // Direct call. assert(GV && "Not a direct call"); - unsigned CallOpc; - if (Subtarget->is64Bit()) - CallOpc = X86::CALL64pcrel32; - else - CallOpc = X86::CALLpcrel32; + unsigned CallOpc = Is64Bit ? X86::CALL64pcrel32 : X86::CALLpcrel32; // See if we need any target-specific flags on the GV operand. unsigned char OpFlags = 0; @@ -3000,92 +2930,72 @@ bool X86FastISel::DoSelectCall(const Instruction *I, const char *MemIntName) { OpFlags = X86II::MO_DARWIN_STUB; } - MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CallOpc)); - if (MemIntName) - MIB.addExternalSymbol(MemIntName, OpFlags); + if (SymName) + MIB.addExternalSymbol(SymName, OpFlags); else MIB.addGlobalAddress(GV, 0, OpFlags); } - // Add a register mask with the call-preserved registers. + // Add a register mask operand representing the call-preserved registers. // Proper defs for return values will be added by setPhysRegsDeadExcept(). - MIB.addRegMask(TRI.getCallPreservedMask(CS.getCallingConv())); + MIB.addRegMask(TRI.getCallPreservedMask(CC)); // Add an implicit use GOT pointer in EBX. if (Subtarget->isPICStyleGOT()) MIB.addReg(X86::EBX, RegState::Implicit); - if (Subtarget->is64Bit() && isVarArg && !isWin64) + if (Is64Bit && IsVarArg && !IsWin64) MIB.addReg(X86::AL, RegState::Implicit); // Add implicit physical register uses to the call. - for (unsigned i = 0, e = RegArgs.size(); i != e; ++i) - MIB.addReg(RegArgs[i], RegState::Implicit); + for (auto Reg : OutRegs) + MIB.addReg(Reg, RegState::Implicit); // Issue CALLSEQ_END + unsigned NumBytesForCalleeToPop = + computeBytesPoppedByCallee(Subtarget, CC, CLI.CS); unsigned AdjStackUp = TII.getCallFrameDestroyOpcode(); - const unsigned NumBytesCallee = computeBytesPoppedByCallee(*Subtarget, CS); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AdjStackUp)) - .addImm(NumBytes).addImm(NumBytesCallee); - - // Build info for return calling conv lowering code. - // FIXME: This is practically a copy-paste from TargetLowering::LowerCallTo. 
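// A distilled, hypothetical version of the call-opcode choice made above:
// register-indirect calls use CALL64r/CALL32r on the materialized callee
// register, while direct calls to a known GlobalValue or external symbol use
// the pc-relative CALL64pcrel32/CALLpcrel32 encodings plus any
// target-specific operand flags such as the Darwin stub flag shown above.
static const char *pickCallOpcode(bool Is64Bit, bool IsIndirect) {
  if (IsIndirect)
    return Is64Bit ? "CALL64r" : "CALL32r";
  return Is64Bit ? "CALL64pcrel32" : "CALLpcrel32";
}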
- SmallVector<ISD::InputArg, 32> Ins; - SmallVector<EVT, 4> RetTys; - ComputeValueVTs(TLI, I->getType(), RetTys); - for (unsigned i = 0, e = RetTys.size(); i != e; ++i) { - EVT VT = RetTys[i]; - MVT RegisterVT = TLI.getRegisterType(I->getParent()->getContext(), VT); - unsigned NumRegs = TLI.getNumRegisters(I->getParent()->getContext(), VT); - for (unsigned j = 0; j != NumRegs; ++j) { - ISD::InputArg MyFlags; - MyFlags.VT = RegisterVT; - MyFlags.Used = !CS.getInstruction()->use_empty(); - if (CS.paramHasAttr(0, Attribute::SExt)) - MyFlags.Flags.setSExt(); - if (CS.paramHasAttr(0, Attribute::ZExt)) - MyFlags.Flags.setZExt(); - if (CS.paramHasAttr(0, Attribute::InReg)) - MyFlags.Flags.setInReg(); - Ins.push_back(MyFlags); - } - } + .addImm(NumBytes).addImm(NumBytesForCalleeToPop); // Now handle call return values. - SmallVector<unsigned, 4> UsedRegs; SmallVector<CCValAssign, 16> RVLocs; - CCState CCRetInfo(CC, false, *FuncInfo.MF, TM, RVLocs, - I->getParent()->getContext()); - unsigned ResultReg = FuncInfo.CreateRegs(I->getType()); + CCState CCRetInfo(CC, IsVarArg, *FuncInfo.MF, RVLocs, + CLI.RetTy->getContext()); CCRetInfo.AnalyzeCallResult(Ins, RetCC_X86); + + // Copy all of the result registers out of their specified physreg. + unsigned ResultReg = FuncInfo.CreateRegs(CLI.RetTy); for (unsigned i = 0; i != RVLocs.size(); ++i) { - EVT CopyVT = RVLocs[i].getValVT(); + CCValAssign &VA = RVLocs[i]; + EVT CopyVT = VA.getValVT(); unsigned CopyReg = ResultReg + i; - // If this is a call to a function that returns an fp value on the x87 fp - // stack, but where we prefer to use the value in xmm registers, copy it - // out as F80 and use a truncate to move it from fp stack reg to xmm reg. - if ((RVLocs[i].getLocReg() == X86::ST0 || - RVLocs[i].getLocReg() == X86::ST1)) { - if (isScalarFPTypeInSSEReg(RVLocs[i].getValVT())) { - CopyVT = MVT::f80; - CopyReg = createResultReg(&X86::RFP80RegClass); - } - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, - TII.get(X86::FpPOP_RETVAL), CopyReg); - } else { - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, - TII.get(TargetOpcode::COPY), - CopyReg).addReg(RVLocs[i].getLocReg()); - UsedRegs.push_back(RVLocs[i].getLocReg()); + // If this is x86-64, and we disabled SSE, we can't return FP values + if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) && + ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasSSE1())) { + report_fatal_error("SSE register return with SSE disabled"); } - if (CopyVT != RVLocs[i].getValVT()) { - // Round the F80 the right size, which also moves to the appropriate xmm - // register. This is accomplished by storing the F80 value in memory and - // then loading it back. Ewww... - EVT ResVT = RVLocs[i].getValVT(); + // If we prefer to use the value in xmm registers, copy it out as f80 and + // use a truncate to move it from fp stack reg to xmm reg. + if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) && + isScalarFPTypeInSSEReg(VA.getValVT())) { + CopyVT = MVT::f80; + CopyReg = createResultReg(&X86::RFP80RegClass); + } + + // Copy out the result. + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::COPY), CopyReg).addReg(VA.getLocReg()); + InRegs.push_back(VA.getLocReg()); + + // Round the f80 to the right size, which also moves it to the appropriate + // xmm register. This is accomplished by storing the f80 value in memory + // and then loading it back. + if (CopyVT != VA.getValVT()) { + EVT ResVT = VA.getValVT(); unsigned Opc = ResVT == MVT::f32 ? 
X86::ST_Fp80m32 : X86::ST_Fp80m64; unsigned MemSize = ResVT.getSizeInBits()/8; int FI = MFI.CreateStackObject(MemSize, MemSize, false); @@ -3098,18 +3008,15 @@ bool X86FastISel::DoSelectCall(const Instruction *I, const char *MemIntName) { } } - if (RVLocs.size()) - UpdateValueMap(I, ResultReg, RVLocs.size()); - - // Set all unused physreg defs as dead. - static_cast<MachineInstr *>(MIB)->setPhysRegsDeadExcept(UsedRegs, TRI); + CLI.ResultReg = ResultReg; + CLI.NumResultRegs = RVLocs.size(); + CLI.Call = MIB; return true; } - bool -X86FastISel::TargetSelectInstruction(const Instruction *I) { +X86FastISel::fastSelectInstruction(const Instruction *I) { switch (I->getOpcode()) { default: break; case Instruction::Load: @@ -3125,8 +3032,6 @@ X86FastISel::TargetSelectInstruction(const Instruction *I) { return X86SelectZExt(I); case Instruction::Br: return X86SelectBranch(I); - case Instruction::Call: - return X86SelectCall(I); case Instruction::LShr: case Instruction::AShr: case Instruction::Shl: @@ -3154,7 +3059,7 @@ X86FastISel::TargetSelectInstruction(const Instruction *I) { return X86SelectTrunc(I); unsigned Reg = getRegForValue(I->getOperand(0)); if (Reg == 0) return false; - UpdateValueMap(I, Reg); + updateValueMap(I, Reg); return true; } } @@ -3162,13 +3067,69 @@ X86FastISel::TargetSelectInstruction(const Instruction *I) { return false; } -unsigned X86FastISel::TargetMaterializeConstant(const Constant *C) { - MVT VT; - if (!isTypeLegal(C->getType(), VT)) +unsigned X86FastISel::X86MaterializeInt(const ConstantInt *CI, MVT VT) { + if (VT > MVT::i64) return 0; + uint64_t Imm = CI->getZExtValue(); + if (Imm == 0) { + unsigned SrcReg = fastEmitInst_(X86::MOV32r0, &X86::GR32RegClass); + switch (VT.SimpleTy) { + default: llvm_unreachable("Unexpected value type"); + case MVT::i1: + case MVT::i8: + return fastEmitInst_extractsubreg(MVT::i8, SrcReg, /*Kill=*/true, + X86::sub_8bit); + case MVT::i16: + return fastEmitInst_extractsubreg(MVT::i16, SrcReg, /*Kill=*/true, + X86::sub_16bit); + case MVT::i32: + return SrcReg; + case MVT::i64: { + unsigned ResultReg = createResultReg(&X86::GR64RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::SUBREG_TO_REG), ResultReg) + .addImm(0).addReg(SrcReg).addImm(X86::sub_32bit); + return ResultReg; + } + } + } + + unsigned Opc = 0; + switch (VT.SimpleTy) { + default: llvm_unreachable("Unexpected value type"); + case MVT::i1: VT = MVT::i8; // fall-through + case MVT::i8: Opc = X86::MOV8ri; break; + case MVT::i16: Opc = X86::MOV16ri; break; + case MVT::i32: Opc = X86::MOV32ri; break; + case MVT::i64: { + if (isUInt<32>(Imm)) + Opc = X86::MOV32ri; + else if (isInt<32>(Imm)) + Opc = X86::MOV64ri32; + else + Opc = X86::MOV64ri; + break; + } + } + if (VT == MVT::i64 && Opc == X86::MOV32ri) { + unsigned SrcReg = fastEmitInst_i(Opc, &X86::GR32RegClass, Imm); + unsigned ResultReg = createResultReg(&X86::GR64RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::SUBREG_TO_REG), ResultReg) + .addImm(0).addReg(SrcReg).addImm(X86::sub_32bit); + return ResultReg; + } + return fastEmitInst_i(Opc, TLI.getRegClassFor(VT), Imm); +} + +unsigned X86FastISel::X86MaterializeFP(const ConstantFP *CFP, MVT VT) { + if (CFP->isNullValue()) + return fastMaterializeFloatZero(CFP); + // Can't handle alternate code models yet. 
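X86MaterializeInt above picks the cheapest mov for an integer constant: zero is materialized with MOV32r0 (an xor) and widened with SUBREG_TO_REG where needed, and 64-bit immediates use the shortest encoding that still produces the right value. A self-contained sketch of that decision (illustrative strings, not LLVM opcode enums):

#include <cstdint>
#include <limits>
#include <string>

std::string movOpcodeForImm64(uint64_t Imm) {
  if (Imm == 0)
    return "MOV32r0";                 // xor-style zero, widened via SUBREG_TO_REG
  if (Imm <= std::numeric_limits<uint32_t>::max())
    return "MOV32ri";                 // a 32-bit mov implicitly zero-extends
  if (static_cast<int64_t>(Imm) >= std::numeric_limits<int32_t>::min() &&
      static_cast<int64_t>(Imm) <= std::numeric_limits<int32_t>::max())
    return "MOV64ri32";               // sign-extended 32-bit immediate
  return "MOV64ri";                   // full 64-bit immediate (movabs)
}

These are the same isUInt<32>/isInt<32> tests used in the hunk; the MOV32ri case is then wrapped in SUBREG_TO_REG to produce the 64-bit result.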
- if (TM.getCodeModel() != CodeModel::Small) + CodeModel::Model CM = TM.getCodeModel(); + if (CM != CodeModel::Small && CM != CodeModel::Large) return 0; // Get opcode and regclass of the output for the given load instruction. @@ -3176,23 +3137,6 @@ unsigned X86FastISel::TargetMaterializeConstant(const Constant *C) { const TargetRegisterClass *RC = nullptr; switch (VT.SimpleTy) { default: return 0; - case MVT::i8: - Opc = X86::MOV8rm; - RC = &X86::GR8RegClass; - break; - case MVT::i16: - Opc = X86::MOV16rm; - RC = &X86::GR16RegClass; - break; - case MVT::i32: - Opc = X86::MOV32rm; - RC = &X86::GR32RegClass; - break; - case MVT::i64: - // Must be in x86-64 mode. - Opc = X86::MOV64rm; - RC = &X86::GR64RegClass; - break; case MVT::f32: if (X86ScalarSSEf32) { Opc = Subtarget->hasAVX() ? X86::VMOVSSrm : X86::MOVSSrm; @@ -3216,39 +3160,11 @@ unsigned X86FastISel::TargetMaterializeConstant(const Constant *C) { return 0; } - // Materialize addresses with LEA/MOV instructions. - if (isa<GlobalValue>(C)) { - X86AddressMode AM; - if (X86SelectAddress(C, AM)) { - // If the expression is just a basereg, then we're done, otherwise we need - // to emit an LEA. - if (AM.BaseType == X86AddressMode::RegBase && - AM.IndexReg == 0 && AM.Disp == 0 && AM.GV == nullptr) - return AM.Base.Reg; - - unsigned ResultReg = createResultReg(RC); - if (TM.getRelocationModel() == Reloc::Static && - TLI.getPointerTy() == MVT::i64) { - // The displacement code be more than 32 bits away so we need to use - // an instruction with a 64 bit immediate - Opc = X86::MOV64ri; - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, - TII.get(Opc), ResultReg).addGlobalAddress(cast<GlobalValue>(C)); - } else { - Opc = TLI.getPointerTy() == MVT::i32 ? X86::LEA32r : X86::LEA64r; - addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, - TII.get(Opc), ResultReg), AM); - } - return ResultReg; - } - return 0; - } - // MachineConstantPool wants an explicit alignment. - unsigned Align = DL.getPrefTypeAlignment(C->getType()); + unsigned Align = DL.getPrefTypeAlignment(CFP->getType()); if (Align == 0) { - // Alignment of vector types. FIXME! - Align = DL.getTypeAllocSize(C->getType()); + // Alignment of vector types. FIXME! + Align = DL.getTypeAllocSize(CFP->getType()); } // x86-32 PIC requires a PIC base register for constant pools. @@ -3266,23 +3182,88 @@ unsigned X86FastISel::TargetMaterializeConstant(const Constant *C) { } // Create the load from the constant pool. 
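For floating-point constants the opcode and register class of the constant-pool load depend on the available instruction sets. A tiny sketch of the f32 case visible in this hunk (the x87 fallback opcode is an assumption based on the surrounding context, not shown in the diff):

#include <string>

std::string f32ConstantLoadOpcode(bool HasSSE1, bool HasAVX) {
  if (HasSSE1)
    return HasAVX ? "VMOVSSrm" : "MOVSSrm";  // load straight into an XMM register
  return "LD_Fp32m";                          // assumed x87 fallback (RFP32 class)
}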
- unsigned MCPOffset = MCP.getConstantPoolIndex(C, Align); + unsigned CPI = MCP.getConstantPoolIndex(CFP, Align); unsigned ResultReg = createResultReg(RC); + + if (CM == CodeModel::Large) { + unsigned AddrReg = createResultReg(&X86::GR64RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOV64ri), + AddrReg) + .addConstantPoolIndex(CPI, 0, OpFlag); + MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(Opc), ResultReg); + addDirectMem(MIB, AddrReg); + MachineMemOperand *MMO = FuncInfo.MF->getMachineMemOperand( + MachinePointerInfo::getConstantPool(), MachineMemOperand::MOLoad, + TM.getSubtargetImpl()->getDataLayout()->getPointerSize(), Align); + MIB->addMemOperand(*FuncInfo.MF, MMO); + return ResultReg; + } + addConstantPoolReference(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg), - MCPOffset, PICBase, OpFlag); - + CPI, PICBase, OpFlag); return ResultReg; } -unsigned X86FastISel::TargetMaterializeAlloca(const AllocaInst *C) { +unsigned X86FastISel::X86MaterializeGV(const GlobalValue *GV, MVT VT) { + // Can't handle alternate code models yet. + if (TM.getCodeModel() != CodeModel::Small) + return 0; + + // Materialize addresses with LEA/MOV instructions. + X86AddressMode AM; + if (X86SelectAddress(GV, AM)) { + // If the expression is just a basereg, then we're done, otherwise we need + // to emit an LEA. + if (AM.BaseType == X86AddressMode::RegBase && + AM.IndexReg == 0 && AM.Disp == 0 && AM.GV == nullptr) + return AM.Base.Reg; + + unsigned ResultReg = createResultReg(TLI.getRegClassFor(VT)); + if (TM.getRelocationModel() == Reloc::Static && + TLI.getPointerTy() == MVT::i64) { + // The displacement code could be more than 32 bits away so we need to use + // an instruction with a 64 bit immediate + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOV64ri), + ResultReg) + .addGlobalAddress(GV); + } else { + unsigned Opc = TLI.getPointerTy() == MVT::i32 ? X86::LEA32r : X86::LEA64r; + addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(Opc), ResultReg), AM); + } + return ResultReg; + } + return 0; +} + +unsigned X86FastISel::fastMaterializeConstant(const Constant *C) { + EVT CEVT = TLI.getValueType(C->getType(), true); + + // Only handle simple types. + if (!CEVT.isSimple()) + return 0; + MVT VT = CEVT.getSimpleVT(); + + if (const auto *CI = dyn_cast<ConstantInt>(C)) + return X86MaterializeInt(CI, VT); + else if (const ConstantFP *CFP = dyn_cast<ConstantFP>(C)) + return X86MaterializeFP(CFP, VT); + else if (const GlobalValue *GV = dyn_cast<GlobalValue>(C)) + return X86MaterializeGV(GV, VT); + + return 0; +} + +unsigned X86FastISel::fastMaterializeAlloca(const AllocaInst *C) { // Fail on dynamic allocas. At this point, getRegForValue has already // checked its CSE maps, so if we're here trying to handle a dynamic // alloca, we're not going to succeed. X86SelectAddress has a // check for dynamic allocas, because it's called directly from - // various places, but TargetMaterializeAlloca also needs a check + // various places, but targetMaterializeAlloca also needs a check // in order to avoid recursion between getRegForValue, - // X86SelectAddrss, and TargetMaterializeAlloca. + // X86SelectAddrss, and targetMaterializeAlloca. 
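X86MaterializeGV above only runs under the small code model and tries the cheapest form first: if address folding produced nothing but a base register, that register already holds the address; otherwise it emits an LEA, or a MOV64ri when the static relocation model with 64-bit pointers may need a displacement wider than 32 bits. A sketch of that branch structure with a hypothetical AddrMode struct (not LLVM's X86AddressMode):

#include <string>

struct AddrMode {
  unsigned BaseReg = 0;
  unsigned IndexReg = 0;
  int Disp = 0;
  const void *GV = nullptr;   // stands in for a GlobalValue reference
};

std::string materializeAddress(const AddrMode &AM, bool PtrIs64, bool StaticReloc) {
  if (AM.IndexReg == 0 && AM.Disp == 0 && AM.GV == nullptr)
    return "reuse AM.BaseReg";          // the address is already in a register
  if (StaticReloc && PtrIs64)
    return "MOV64ri";                   // displacement may not fit in 32 bits
  return PtrIs64 ? "LEA64r" : "LEA32r"; // compute the address with an LEA
}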
if (!FuncInfo.StaticAllocaMap.count(C)) return 0; assert(C->isStaticAlloca() && "dynamic alloca in the static alloca map?"); @@ -3290,7 +3271,7 @@ unsigned X86FastISel::TargetMaterializeAlloca(const AllocaInst *C) { X86AddressMode AM; if (!X86SelectAddress(C, AM)) return 0; - unsigned Opc = Subtarget->is64Bit() ? X86::LEA64r : X86::LEA32r; + unsigned Opc = TLI.getPointerTy() == MVT::i32 ? X86::LEA32r : X86::LEA64r; const TargetRegisterClass* RC = TLI.getRegClassFor(TLI.getPointerTy()); unsigned ResultReg = createResultReg(RC); addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, @@ -3298,7 +3279,7 @@ unsigned X86FastISel::TargetMaterializeAlloca(const AllocaInst *C) { return ResultReg; } -unsigned X86FastISel::TargetMaterializeFloatZero(const ConstantFP *CF) { +unsigned X86FastISel::fastMaterializeFloatZero(const ConstantFP *CF) { MVT VT; if (!isTypeLegal(CF->getType(), VT)) return 0; @@ -3356,7 +3337,8 @@ bool X86FastISel::tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo, AM.getFullAddress(AddrOps); MachineInstr *Result = - XII.foldMemoryOperandImpl(*FuncInfo.MF, MI, OpNo, AddrOps, Size, Alignment); + XII.foldMemoryOperandImpl(*FuncInfo.MF, MI, OpNo, AddrOps, + Size, Alignment, /*AllowCommute=*/true); if (!Result) return false; diff --git a/lib/Target/X86/X86FixupLEAs.cpp b/lib/Target/X86/X86FixupLEAs.cpp index 4be766a..02736ac 100644 --- a/lib/Target/X86/X86FixupLEAs.cpp +++ b/lib/Target/X86/X86FixupLEAs.cpp @@ -7,9 +7,8 @@ // //===----------------------------------------------------------------------===// // -// This file defines the pass which will find instructions which -// can be re-written as LEA instructions in order to reduce pipeline -// delays for some models of the Intel Atom family. +// This file defines the pass that finds instructions that can be +// re-written as LEA instructions in order to reduce pipeline delays. // //===----------------------------------------------------------------------===// @@ -40,7 +39,7 @@ class FixupLEAPass : public MachineFunctionPass { /// where appropriate. bool processBasicBlock(MachineFunction &MF, MachineFunction::iterator MFI); - const char *getPassName() const override { return "X86 Atom LEA Fixup"; } + const char *getPassName() const override { return "X86 LEA Fixup"; } /// \brief Given a machine register, look for the instruction /// which writes it in the current basic block. If found, @@ -156,7 +155,8 @@ bool FixupLEAPass::runOnMachineFunction(MachineFunction &Func) { if (!ST.LEAusesAG() && !ST.slowLEA()) return false; - TII = static_cast<const X86InstrInfo *>(TM->getInstrInfo()); + TII = + static_cast<const X86InstrInfo *>(TM->getSubtargetImpl()->getInstrInfo()); DEBUG(dbgs() << "Start X86FixupLEAs\n";); // Process all basic blocks. 
@@ -218,7 +218,8 @@ FixupLEAPass::searchBackwards(MachineOperand &p, MachineBasicBlock::iterator &I, if (usesRegister(p, CurInst) == RU_Write) { return CurInst; } - InstrDistance += TII->getInstrLatency(TM->getInstrItineraryData(), CurInst); + InstrDistance += TII->getInstrLatency( + TM->getSubtargetImpl()->getInstrItineraryData(), CurInst); Found = getPreviousInstr(CurInst, MFI); } return nullptr; diff --git a/lib/Target/X86/X86FloatingPoint.cpp b/lib/Target/X86/X86FloatingPoint.cpp index c8a3ab3..6189109 100644 --- a/lib/Target/X86/X86FloatingPoint.cpp +++ b/lib/Target/X86/X86FloatingPoint.cpp @@ -28,12 +28,14 @@ #include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/CodeGen/EdgeBundles.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/LivePhysRegs.h" #include "llvm/CodeGen/Passes.h" #include "llvm/IR/InlineAsm.h" #include "llvm/Support/Debug.h" @@ -41,7 +43,9 @@ #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetSubtargetInfo.h" #include <algorithm> +#include <bitset> using namespace llvm; #define DEBUG_TYPE "x86-codegen" @@ -50,6 +54,8 @@ STATISTIC(NumFXCH, "Number of fxch instructions inserted"); STATISTIC(NumFP , "Number of floating point instructions"); namespace { + const unsigned ScratchFPReg = 7; + struct FPS : public MachineFunctionPass { static char ID; FPS() : MachineFunctionPass(ID) { @@ -137,7 +143,7 @@ namespace { unsigned StackTop; // The current top of the FP stack. enum { - NumFPRegs = 16 // Including scratch pseudo-registers. + NumFPRegs = 8 // Including scratch pseudo-registers. }; // For each live FP<n> register, point to its Stack[] entry. @@ -146,27 +152,6 @@ namespace { // register allocator thinks. unsigned RegMap[NumFPRegs]; - // Pending fixed registers - Inline assembly needs FP registers to appear - // in fixed stack slot positions. This is handled by copying FP registers - // to ST registers before the instruction, and copying back after the - // instruction. - // - // This is modeled with pending ST registers. NumPendingSTs is the number - // of ST registers (ST0-STn) we are tracking. PendingST[n] points to an FP - // register that holds the ST value. The ST registers are not moved into - // place until immediately before the instruction that needs them. - // - // It can happen that we need an ST register to be live when no FP register - // holds the value: - // - // %ST0 = COPY %FP4<kill> - // - // When that happens, we allocate a scratch FP register to hold the ST - // value. That means every register in PendingST must be live. - - unsigned NumPendingSTs; - unsigned char PendingST[8]; - // Set up our stack model to match the incoming registers to MBB. void setupBlockStack(); @@ -180,9 +165,6 @@ namespace { dbgs() << " FP" << Stack[i]; assert(RegMap[Stack[i]] == i && "Stack[] doesn't match RegMap[]!"); } - for (unsigned i = 0; i != NumPendingSTs; ++i) - dbgs() << ", ST" << i << " in FP" << unsigned(PendingST[i]); - dbgs() << "\n"; } #endif @@ -199,19 +181,6 @@ namespace { return Slot < StackTop && Stack[Slot] == RegNo; } - /// getScratchReg - Return an FP register that is not currently in use. 
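The searchBackwards hunk at the top of the X86FixupLEAs diff walks up the basic block from an address-forming instruction, accumulating per-instruction latencies, until it finds the instruction that writes the register of interest; the accumulated distance (bounded by a threshold in the real pass) decides whether rewriting to an LEA is worthwhile. A toy, self-contained version of that scan (not the LLVM code):

#include <vector>

struct ToyMI { unsigned DefReg; unsigned Latency; };

// Scan upward from position Pos looking for the writer of Reg, summing the
// latencies of the instructions in between.
const ToyMI *searchBackwards(const std::vector<ToyMI> &BB, unsigned Pos,
                             unsigned Reg, unsigned &InstrDistance) {
  InstrDistance = 0;
  for (unsigned I = Pos; I-- > 0;) {
    if (BB[I].DefReg == Reg)
      return &BB[I];             // found the defining instruction
    InstrDistance += BB[I].Latency;
  }
  return nullptr;                // no writer in this block
}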
- unsigned getScratchReg() const { - for (int i = NumFPRegs - 1; i >= 8; --i) - if (!isLive(i)) - return i; - llvm_unreachable("Ran out of scratch FP registers"); - } - - /// isScratchReg - Returns trus if RegNo is a scratch FP register. - static bool isScratchReg(unsigned RegNo) { - return RegNo > 8 && RegNo < NumFPRegs; - } - /// getStackEntry - Return the X86::FP<n> register in register ST(i). unsigned getStackEntry(unsigned STi) const { if (STi >= StackTop) @@ -263,21 +232,6 @@ namespace { BuildMI(*MBB, I, dl, TII->get(X86::LD_Frr)).addReg(STReg); } - /// duplicatePendingSTBeforeKill - The instruction at I is about to kill - /// RegNo. If any PendingST registers still need the RegNo value, duplicate - /// them to new scratch registers. - void duplicatePendingSTBeforeKill(unsigned RegNo, MachineInstr *I) { - for (unsigned i = 0; i != NumPendingSTs; ++i) { - if (PendingST[i] != RegNo) - continue; - unsigned SR = getScratchReg(); - DEBUG(dbgs() << "Duplicating pending ST" << i - << " in FP" << RegNo << " to FP" << SR << '\n'); - duplicateToTop(RegNo, SR, I); - PendingST[i] = SR; - } - } - /// popStackAfter - Pop the current value off of the top of the FP stack /// after the specified instruction. void popStackAfter(MachineBasicBlock::iterator &I); @@ -304,6 +258,7 @@ namespace { bool processBasicBlock(MachineFunction &MF, MachineBasicBlock &MBB); + void handleCall(MachineBasicBlock::iterator &I); void handleZeroArgFP(MachineBasicBlock::iterator &I); void handleOneArgFP(MachineBasicBlock::iterator &I); void handleOneArgFPRW(MachineBasicBlock::iterator &I); @@ -320,6 +275,8 @@ namespace { return X86::RFP80RegClass.contains(DstReg) || X86::RFP80RegClass.contains(SrcReg); } + + void setKillFlags(MachineBasicBlock &MBB) const; }; char FPS::ID = 0; } @@ -354,7 +311,7 @@ bool FPS::runOnMachineFunction(MachineFunction &MF) { if (!FPIsUsed) return false; Bundles = &getAnalysis<EdgeBundles>(); - TII = MF.getTarget().getInstrInfo(); + TII = MF.getSubtarget().getInstrInfo(); // Prepare cross-MBB liveness. bundleCFG(MF); @@ -367,15 +324,13 @@ bool FPS::runOnMachineFunction(MachineFunction &MF) { MachineBasicBlock *Entry = MF.begin(); bool Changed = false; - for (df_ext_iterator<MachineBasicBlock*, SmallPtrSet<MachineBasicBlock*, 8> > - I = df_ext_begin(Entry, Processed), E = df_ext_end(Entry, Processed); - I != E; ++I) - Changed |= processBasicBlock(MF, **I); + for (MachineBasicBlock *BB : depth_first_ext(Entry, Processed)) + Changed |= processBasicBlock(MF, *BB); // Process any unreachable blocks in arbitrary order now. if (MF.size() != Processed.size()) for (MachineFunction::iterator BB = MF.begin(), E = MF.end(); BB != E; ++BB) - if (Processed.insert(BB)) + if (Processed.insert(BB).second) Changed |= processBasicBlock(MF, *BB); LiveBundles.clear(); @@ -409,8 +364,8 @@ void FPS::bundleCFG(MachineFunction &MF) { bool FPS::processBasicBlock(MachineFunction &MF, MachineBasicBlock &BB) { bool Changed = false; MBB = &BB; - NumPendingSTs = 0; + setKillFlags(BB); setupBlockStack(); for (MachineBasicBlock::iterator I = BB.begin(); I != BB.end(); ++I) { @@ -428,6 +383,9 @@ bool FPS::processBasicBlock(MachineFunction &MF, MachineBasicBlock &BB) { X86::RFP80RegClass.contains(MI->getOperand(0).getReg())) FPInstClass = X86II::SpecialFP; + if (MI->isCall()) + FPInstClass = X86II::SpecialFP; + if (FPInstClass == X86II::NotFP) continue; // Efficiently ignore non-fp insts! @@ -462,7 +420,9 @@ bool FPS::processBasicBlock(MachineFunction &MF, MachineBasicBlock &BB) { // after definition. If so, pop them. 
for (unsigned i = 0, e = DeadRegs.size(); i != e; ++i) { unsigned Reg = DeadRegs[i]; - if (Reg >= X86::FP0 && Reg <= X86::FP6) { + // Check if Reg is live on the stack. An inline-asm register operand that + // is in the clobber list and marked dead might not be live on the stack. + if (Reg >= X86::FP0 && Reg <= X86::FP6 && isLive(Reg-X86::FP0)) { DEBUG(dbgs() << "Register FP#" << Reg-X86::FP0 << " is dead!\n"); freeStackSlotAfter(I, Reg-X86::FP0); } @@ -874,7 +834,9 @@ FPS::freeStackSlotBefore(MachineBasicBlock::iterator I, unsigned FPRegNo) { RegMap[TopReg] = OldSlot; RegMap[FPRegNo] = ~0; Stack[--StackTop] = ~0; - return BuildMI(*MBB, I, DebugLoc(), TII->get(X86::ST_FPrr)).addReg(STReg); + return BuildMI(*MBB, I, DebugLoc(), TII->get(X86::ST_FPrr)) + .addReg(STReg) + .getInstr(); } /// adjustLiveRegs - Kill and revive registers such that exactly the FP @@ -966,6 +928,31 @@ void FPS::shuffleStackTop(const unsigned char *FixStack, // Instruction transformation implementation //===----------------------------------------------------------------------===// +void FPS::handleCall(MachineBasicBlock::iterator &I) { + unsigned STReturns = 0; + + for (const auto &MO : I->operands()) { + if (!MO.isReg()) + continue; + + unsigned R = MO.getReg() - X86::FP0; + + if (R < 8) { + assert(MO.isDef() && MO.isImplicit()); + STReturns |= 1 << R; + } + } + + unsigned N = CountTrailingOnes_32(STReturns); + + // FP registers used for function return must be consecutive starting at + // FP0. + assert(STReturns == 0 || (isMask_32(STReturns) && N <= 2)); + + for (unsigned I = 0; I < N; ++I) + pushReg(N - I - 1); +} + /// handleZeroArgFP - ST(0) = fld0 ST(0) = flds <mem> /// void FPS::handleZeroArgFP(MachineBasicBlock::iterator &I) { @@ -992,9 +979,6 @@ void FPS::handleOneArgFP(MachineBasicBlock::iterator &I) { unsigned Reg = getFPReg(MI->getOperand(NumOps-1)); bool KillsSrc = MI->killsRegister(X86::FP0+Reg); - if (KillsSrc) - duplicatePendingSTBeforeKill(Reg, I); - // FISTP64m is strange because there isn't a non-popping versions. // If we have one _and_ we don't want to pop the operand, duplicate the value // on the stack instead of moving it. This ensure that popping the value is @@ -1015,7 +999,7 @@ void FPS::handleOneArgFP(MachineBasicBlock::iterator &I) { MI->getOpcode() == X86::ISTT_Fp32m80 || MI->getOpcode() == X86::ISTT_Fp64m80 || MI->getOpcode() == X86::ST_FpP80m)) { - duplicateToTop(Reg, getScratchReg(), I); + duplicateToTop(Reg, ScratchFPReg, I); } else { moveToTop(Reg, I); // Move to the top of the stack... } @@ -1058,7 +1042,6 @@ void FPS::handleOneArgFPRW(MachineBasicBlock::iterator &I) { bool KillsSrc = MI->killsRegister(X86::FP0+Reg); if (KillsSrc) { - duplicatePendingSTBeforeKill(Reg, I); // If this is the last use of the source register, just make sure it's on // the top of the stack. moveToTop(Reg, I); @@ -1314,71 +1297,22 @@ void FPS::handleCondMovFP(MachineBasicBlock::iterator &I) { /// floating point instructions. This is primarily intended for use by pseudo /// instructions. /// -void FPS::handleSpecialFP(MachineBasicBlock::iterator &I) { - MachineInstr *MI = I; +void FPS::handleSpecialFP(MachineBasicBlock::iterator &Inst) { + MachineInstr *MI = Inst; + + if (MI->isCall()) { + handleCall(Inst); + return; + } + switch (MI->getOpcode()) { default: llvm_unreachable("Unknown SpecialFP instruction!"); case TargetOpcode::COPY: { // We handle three kinds of copies: FP <- FP, FP <- ST, and ST <- FP. 
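Most of the FP-stackifier changes above and below manipulate the pass's model of the x87 register stack: Stack[] maps stack slots to virtual FP<n> registers, RegMap[] is the inverse, and StackTop is the current depth. A minimal standalone model of the operations these hunks rely on (a sketch built from the invariants shown in the diff, not the pass's own code):

#include <array>
#include <cassert>

struct FPStackModel {
  static constexpr unsigned NumFPRegs = 8;
  std::array<unsigned, NumFPRegs> Stack{};   // slot -> FP<n> register
  std::array<unsigned, NumFPRegs> RegMap{};  // FP<n> register -> slot
  unsigned StackTop = 0;                     // current stack depth

  // Live means the register's recorded slot is below StackTop and still
  // holds it -- the same check as isLive() in the diff.
  bool isLive(unsigned FPReg) const {
    unsigned Slot = RegMap[FPReg];
    return Slot < StackTop && Stack[Slot] == FPReg;
  }
  // A newly defined value appears on top of the stack.
  void pushReg(unsigned FPReg) {
    assert(StackTop < NumFPRegs && "x87 stack overflow");
    Stack[StackTop] = FPReg;
    RegMap[FPReg] = StackTop++;
  }
  // ST(i) counts down from the top of the stack.
  unsigned getSTReg(unsigned FPReg) const {
    return StackTop - 1 - RegMap[FPReg];
  }
};

handleCall, added above, is just repeated pushReg: after checking that the call's implicit FP defs form a contiguous mask starting at FP0, it pushes them in reverse order so that FP0, the first return value, ends up on top of the stack in ST(0).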
const MachineOperand &MO1 = MI->getOperand(1); const MachineOperand &MO0 = MI->getOperand(0); - unsigned DstST = MO0.getReg() - X86::ST0; - unsigned SrcST = MO1.getReg() - X86::ST0; bool KillsSrc = MI->killsRegister(MO1.getReg()); - // ST = COPY FP. Set up a pending ST register. - if (DstST < 8) { - unsigned SrcFP = getFPReg(MO1); - assert(isLive(SrcFP) && "Cannot copy dead register"); - assert(!MO0.isDead() && "Cannot copy to dead ST register"); - - // Unallocated STs are marked as the nonexistent FP255. - while (NumPendingSTs <= DstST) - PendingST[NumPendingSTs++] = NumFPRegs; - - // STi could still be live from a previous inline asm. - if (isScratchReg(PendingST[DstST])) { - DEBUG(dbgs() << "Clobbering old ST in FP" << unsigned(PendingST[DstST]) - << '\n'); - freeStackSlotBefore(MI, PendingST[DstST]); - } - - // When the source is killed, allocate a scratch FP register. - if (KillsSrc) { - duplicatePendingSTBeforeKill(SrcFP, I); - unsigned Slot = getSlot(SrcFP); - unsigned SR = getScratchReg(); - PendingST[DstST] = SR; - Stack[Slot] = SR; - RegMap[SR] = Slot; - } else - PendingST[DstST] = SrcFP; - break; - } - - // FP = COPY ST. Extract fixed stack value. - // Any instruction defining ST registers must have assigned them to a - // scratch register. - if (SrcST < 8) { - unsigned DstFP = getFPReg(MO0); - assert(!isLive(DstFP) && "Cannot copy ST to live FP register"); - assert(NumPendingSTs > SrcST && "Cannot copy from dead ST register"); - unsigned SrcFP = PendingST[SrcST]; - assert(isScratchReg(SrcFP) && "Expected ST in a scratch register"); - assert(isLive(SrcFP) && "Scratch holding ST is dead"); - - // DstFP steals the stack slot from SrcFP. - unsigned Slot = getSlot(SrcFP); - Stack[Slot] = DstFP; - RegMap[DstFP] = Slot; - - // Always treat the ST as killed. - PendingST[SrcST] = NumFPRegs; - while (NumPendingSTs && PendingST[NumPendingSTs - 1] == NumFPRegs) - --NumPendingSTs; - break; - } - // FP <- FP copy. unsigned DstFP = getFPReg(MO0); unsigned SrcFP = getFPReg(MO1); @@ -1392,7 +1326,7 @@ void FPS::handleSpecialFP(MachineBasicBlock::iterator &I) { } else { // For COPY we just duplicate the specified value to a new stack slot. // This could be made better, but would require substantial changes. - duplicateToTop(SrcFP, DstFP, I); + duplicateToTop(SrcFP, DstFP, Inst); } break; } @@ -1401,41 +1335,11 @@ void FPS::handleSpecialFP(MachineBasicBlock::iterator &I) { // All FP registers must be explicitly defined, so load a 0 instead. unsigned Reg = MI->getOperand(0).getReg() - X86::FP0; DEBUG(dbgs() << "Emitting LD_F0 for implicit FP" << Reg << '\n'); - BuildMI(*MBB, I, MI->getDebugLoc(), TII->get(X86::LD_F0)); + BuildMI(*MBB, Inst, MI->getDebugLoc(), TII->get(X86::LD_F0)); pushReg(Reg); break; } - case X86::FpPOP_RETVAL: { - // The FpPOP_RETVAL instruction is used after calls that return a value on - // the floating point stack. We cannot model this with ST defs since CALL - // instructions have fixed clobber lists. This instruction is interpreted - // to mean that there is one more live register on the stack than we - // thought. - // - // This means that StackTop does not match the hardware stack between a - // call and the FpPOP_RETVAL instructions. We do tolerate FP instructions - // between CALL and FpPOP_RETVAL as long as they don't overflow the - // hardware stack. - unsigned DstFP = getFPReg(MI->getOperand(0)); - - // Move existing stack elements up to reflect reality. 
- assert(StackTop < 8 && "Stack overflowed before FpPOP_RETVAL"); - if (StackTop) { - std::copy_backward(Stack, Stack + StackTop, Stack + StackTop + 1); - for (unsigned i = 0; i != NumFPRegs; ++i) - ++RegMap[i]; - } - ++StackTop; - - // DstFP is the new bottom of the stack. - Stack[0] = DstFP; - RegMap[DstFP] = 0; - - // DstFP will be killed by processBasicBlock if this was a dead def. - break; - } - case TargetOpcode::INLINEASM: { // The inline asm MachineInstr currently only *uses* FP registers for the // 'f' constraint. These should be turned into the current ST(x) register @@ -1472,19 +1376,30 @@ void FPS::handleSpecialFP(MachineBasicBlock::iterator &I) { // only tell clobbers from defs by looking at the asm descriptor. unsigned STUses = 0, STDefs = 0, STClobbers = 0, STDeadDefs = 0; unsigned NumOps = 0; + SmallSet<unsigned, 1> FRegIdx; + unsigned RCID; + for (unsigned i = InlineAsm::MIOp_FirstOperand, e = MI->getNumOperands(); i != e && MI->getOperand(i).isImm(); i += 1 + NumOps) { unsigned Flags = MI->getOperand(i).getImm(); + NumOps = InlineAsm::getNumOperandRegisters(Flags); if (NumOps != 1) continue; const MachineOperand &MO = MI->getOperand(i + 1); if (!MO.isReg()) continue; - unsigned STReg = MO.getReg() - X86::ST0; + unsigned STReg = MO.getReg() - X86::FP0; if (STReg >= 8) continue; + // If the flag has a register class constraint, this must be an operand + // with constraint "f". Record its index and continue. + if (InlineAsm::hasRegClassConstraint(Flags, RCID)) { + FRegIdx.insert(i + 1); + continue; + } + switch (InlineAsm::getKind(Flags)) { case InlineAsm::Kind_RegUse: STUses |= (1u << STReg); @@ -1527,71 +1442,42 @@ void FPS::handleSpecialFP(MachineBasicBlock::iterator &I) { DEBUG(dbgs() << "Asm uses " << NumSTUses << " fixed regs, pops " << NumSTPopped << ", and defines " << NumSTDefs << " regs.\n"); - // Scan the instruction for FP uses corresponding to "f" constraints. - // Collect FP registers to kill afer the instruction. - // Always kill all the scratch regs. +#ifndef NDEBUG + // If any input operand uses constraint "f", all output register + // constraints must be early-clobber defs. + for (unsigned I = 0, E = MI->getNumOperands(); I < E; ++I) + if (FRegIdx.count(I)) { + assert((1 << getFPReg(MI->getOperand(I)) & STDefs) == 0 && + "Operands with constraint \"f\" cannot overlap with defs"); + } +#endif + + // Collect all FP registers (register operands with constraints "t", "u", + // and "f") to kill afer the instruction. unsigned FPKills = ((1u << NumFPRegs) - 1) & ~0xff; - unsigned FPUsed = 0; for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { MachineOperand &Op = MI->getOperand(i); if (!Op.isReg() || Op.getReg() < X86::FP0 || Op.getReg() > X86::FP6) continue; - if (!Op.isUse()) - MI->emitError("illegal \"f\" output constraint"); unsigned FPReg = getFPReg(Op); - FPUsed |= 1U << FPReg; // If we kill this operand, make sure to pop it from the stack after the // asm. We just remember it for now, and pop them all off at the end in // a batch. - if (Op.isKill()) + if (Op.isUse() && Op.isKill()) FPKills |= 1U << FPReg; } - // The popped inputs will be killed by the instruction, so duplicate them - // if the FP register needs to be live after the instruction, or if it is - // used in the instruction itself. We effectively treat the popped inputs - // as early clobbers. 
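Like handleCall, the inline-asm handling reduces fixed ST-register uses, defs and clobbers to bitmasks and then only needs their sizes, relying on the operands being consecutive from ST(0). The helpers used for that are CountTrailingOnes_32 and isMask_32 from LLVM's MathExtras; self-contained equivalents:

#include <cstdint>

// Number of consecutive set bits starting at bit 0, e.g. 0b0011 -> 2
// (ST0 and ST1 are involved).
unsigned countTrailingOnes(uint32_t Mask) {
  unsigned N = 0;
  for (; Mask & 1u; Mask >>= 1)
    ++N;
  return N;
}

// True when the value is a non-empty run of ones starting at bit 0, i.e.
// the fixed ST registers really are consecutive from ST(0).
bool isMask(uint32_t Value) {
  return Value != 0 && ((Value + 1) & Value) == 0;
}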
- for (unsigned i = 0; i < NumSTPopped; ++i) { - if ((FPKills & ~FPUsed) & (1u << PendingST[i])) - continue; - unsigned SR = getScratchReg(); - duplicateToTop(PendingST[i], SR, I); - DEBUG(dbgs() << "Duplicating ST" << i << " in FP" - << unsigned(PendingST[i]) << " to avoid clobbering it.\n"); - PendingST[i] = SR; - } - - // Make sure we have a unique live register for every fixed use. Some of - // them could be undef uses, and we need to emit LD_F0 instructions. - for (unsigned i = 0; i < NumSTUses; ++i) { - if (i < NumPendingSTs && PendingST[i] < NumFPRegs) { - // Check for shared assignments. - for (unsigned j = 0; j < i; ++j) { - if (PendingST[j] != PendingST[i]) - continue; - // STi and STj are inn the same register, create a copy. - unsigned SR = getScratchReg(); - duplicateToTop(PendingST[i], SR, I); - DEBUG(dbgs() << "Duplicating ST" << i << " in FP" - << unsigned(PendingST[i]) - << " to avoid collision with ST" << j << '\n'); - PendingST[i] = SR; - } - continue; - } - unsigned SR = getScratchReg(); - DEBUG(dbgs() << "Emitting LD_F0 for ST" << i << " in FP" << SR << '\n'); - BuildMI(*MBB, I, MI->getDebugLoc(), TII->get(X86::LD_F0)); - pushReg(SR); - PendingST[i] = SR; - if (NumPendingSTs == i) - ++NumPendingSTs; - } - assert(NumPendingSTs >= NumSTUses && "Fixed registers should be assigned"); + // Do not include registers that are implicitly popped by defs/clobbers. + FPKills &= ~(STDefs | STClobbers); // Now we can rearrange the live registers to match what was requested. - shuffleStackTop(PendingST, NumPendingSTs, I); + unsigned char STUsesArray[8]; + + for (unsigned I = 0; I < NumSTUses; ++I) + STUsesArray[I] = I; + + shuffleStackTop(STUsesArray, NumSTUses, Inst); DEBUG({dbgs() << "Before asm: "; dumpStack();}); // With the stack layout fixed, rewrite the FP registers. @@ -1599,36 +1485,22 @@ void FPS::handleSpecialFP(MachineBasicBlock::iterator &I) { MachineOperand &Op = MI->getOperand(i); if (!Op.isReg() || Op.getReg() < X86::FP0 || Op.getReg() > X86::FP6) continue; + unsigned FPReg = getFPReg(Op); - Op.setReg(getSTReg(FPReg)); + + if (FRegIdx.count(i)) + // Operand with constraint "f". + Op.setReg(getSTReg(FPReg)); + else + // Operand with a single register class constraint ("t" or "u"). + Op.setReg(X86::ST0 + FPReg); } // Simulate the inline asm popping its inputs and pushing its outputs. StackTop -= NumSTPopped; - // Hold the fixed output registers in scratch FP registers. They will be - // transferred to real FP registers by copies. - NumPendingSTs = 0; - for (unsigned i = 0; i < NumSTDefs; ++i) { - unsigned SR = getScratchReg(); - pushReg(SR); - FPKills &= ~(1u << SR); - } for (unsigned i = 0; i < NumSTDefs; ++i) - PendingST[NumPendingSTs++] = getStackEntry(i); - DEBUG({dbgs() << "After asm: "; dumpStack();}); - - // If any of the ST defs were dead, pop them immediately. Our caller only - // handles dead FP defs. - MachineBasicBlock::iterator InsertPt = MI; - for (unsigned i = 0; STDefs & (1u << i); ++i) { - if (!(STDeadDefs & (1u << i))) - continue; - freeStackSlotAfter(InsertPt, PendingST[i]); - PendingST[i] = NumFPRegs; - } - while (NumPendingSTs && PendingST[NumPendingSTs - 1] == NumFPRegs) - --NumPendingSTs; + pushReg(NumSTDefs - i - 1); // If this asm kills any FP registers (is the last use of them) we must // explicitly emit pop instructions for them. 
Do this now after the asm has @@ -1640,9 +1512,10 @@ void FPS::handleSpecialFP(MachineBasicBlock::iterator &I) { while (FPKills) { unsigned FPReg = countTrailingZeros(FPKills); if (isLive(FPReg)) - freeStackSlotAfter(InsertPt, FPReg); + freeStackSlotAfter(Inst, FPReg); FPKills &= ~(1U << FPReg); } + // Don't delete the inline asm! return; } @@ -1655,12 +1528,12 @@ void FPS::handleSpecialFP(MachineBasicBlock::iterator &I) { Op.getReg() >= X86::FP0 && Op.getReg() <= X86::FP6); unsigned FPReg = getFPReg(Op); if (Op.isKill()) - moveToTop(FPReg, I); + moveToTop(FPReg, Inst); else - duplicateToTop(FPReg, FPReg, I); + duplicateToTop(FPReg, FPReg, Inst); // Emit the call. This will pop the operand. - BuildMI(*MBB, I, MI->getDebugLoc(), TII->get(X86::CALLpcrel32)) + BuildMI(*MBB, Inst, MI->getDebugLoc(), TII->get(X86::CALLpcrel32)) .addExternalSymbol("_ftol2") .addReg(X86::ST0, RegState::ImplicitKill) .addReg(X86::ECX, RegState::ImplicitDefine) @@ -1738,7 +1611,7 @@ void FPS::handleSpecialFP(MachineBasicBlock::iterator &I) { // Duplicate the TOS so that we return it twice. Just pick some other FPx // register to hold it. - unsigned NewReg = getScratchReg(); + unsigned NewReg = ScratchFPReg; duplicateToTop(FirstFPRegOp, NewReg, MI); FirstFPRegOp = NewReg; } @@ -1761,13 +1634,54 @@ void FPS::handleSpecialFP(MachineBasicBlock::iterator &I) { return; } - I = MBB->erase(I); // Remove the pseudo instruction + Inst = MBB->erase(Inst); // Remove the pseudo instruction // We want to leave I pointing to the previous instruction, but what if we // just erased the first instruction? - if (I == MBB->begin()) { + if (Inst == MBB->begin()) { DEBUG(dbgs() << "Inserting dummy KILL\n"); - I = BuildMI(*MBB, I, DebugLoc(), TII->get(TargetOpcode::KILL)); + Inst = BuildMI(*MBB, Inst, DebugLoc(), TII->get(TargetOpcode::KILL)); } else - --I; + --Inst; +} + +void FPS::setKillFlags(MachineBasicBlock &MBB) const { + const TargetRegisterInfo *TRI = + MBB.getParent()->getSubtarget().getRegisterInfo(); + LivePhysRegs LPR(TRI); + + LPR.addLiveOuts(&MBB); + + for (MachineBasicBlock::reverse_iterator I = MBB.rbegin(), E = MBB.rend(); + I != E; ++I) { + if (I->isDebugValue()) + continue; + + std::bitset<8> Defs; + SmallVector<MachineOperand *, 2> Uses; + MachineInstr &MI = *I; + + for (auto &MO : I->operands()) { + if (!MO.isReg()) + continue; + + unsigned Reg = MO.getReg() - X86::FP0; + + if (Reg >= 8) + continue; + + if (MO.isDef()) { + Defs.set(Reg); + if (!LPR.contains(MO.getReg())) + MO.setIsDead(); + } else + Uses.push_back(&MO); + } + + for (auto *MO : Uses) + if (Defs.test(getFPReg(*MO)) || !LPR.contains(MO->getReg())) + MO->setIsKill(); + + LPR.stepBackward(MI); + } } diff --git a/lib/Target/X86/X86FrameLowering.cpp b/lib/Target/X86/X86FrameLowering.cpp index 8c029a8..b9920b1 100644 --- a/lib/Target/X86/X86FrameLowering.cpp +++ b/lib/Target/X86/X86FrameLowering.cpp @@ -30,6 +30,7 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Target/TargetOptions.h" #include "llvm/Support/Debug.h" +#include <cstdlib> using namespace llvm; @@ -46,14 +47,15 @@ bool X86FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const { bool X86FrameLowering::hasFP(const MachineFunction &MF) const { const MachineFrameInfo *MFI = MF.getFrameInfo(); const MachineModuleInfo &MMI = MF.getMMI(); - const TargetRegisterInfo *RegInfo = MF.getTarget().getRegisterInfo(); + const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo(); return (MF.getTarget().Options.DisableFramePointerElim(MF) || RegInfo->needsStackRealignment(MF) 
|| MFI->hasVarSizedObjects() || MFI->isFrameAddressTaken() || MFI->hasInlineAsmWithSPAdjust() || MF.getInfo<X86MachineFunctionInfo>()->getForceFramePointer() || - MMI.callsUnwindInit() || MMI.callsEHReturn()); + MMI.callsUnwindInit() || MMI.callsEHReturn() || + MFI->hasStackMap() || MFI->hasPatchPoint()); } static unsigned getSUBriOpcode(unsigned IsLP64, int64_t Imm) { @@ -80,6 +82,17 @@ static unsigned getADDriOpcode(unsigned IsLP64, int64_t Imm) { } } +static unsigned getANDriOpcode(bool IsLP64, int64_t Imm) { + if (IsLP64) { + if (isInt<8>(Imm)) + return X86::AND64ri8; + return X86::AND64ri32; + } + if (isInt<8>(Imm)) + return X86::AND32ri8; + return X86::AND32ri; +} + static unsigned getLEArOpcode(unsigned IsLP64) { return IsLP64 ? X86::LEA64r : X86::LEA32r; } @@ -148,32 +161,32 @@ static unsigned findDeadCallerSavedReg(MachineBasicBlock &MBB, static void emitSPUpdate(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI, unsigned StackPtr, int64_t NumBytes, - bool Is64Bit, bool IsLP64, bool UseLEA, + bool Is64BitTarget, bool Is64BitStackPtr, bool UseLEA, const TargetInstrInfo &TII, const TargetRegisterInfo &TRI) { bool isSub = NumBytes < 0; uint64_t Offset = isSub ? -NumBytes : NumBytes; unsigned Opc; if (UseLEA) - Opc = getLEArOpcode(IsLP64); + Opc = getLEArOpcode(Is64BitStackPtr); else Opc = isSub - ? getSUBriOpcode(IsLP64, Offset) - : getADDriOpcode(IsLP64, Offset); + ? getSUBriOpcode(Is64BitStackPtr, Offset) + : getADDriOpcode(Is64BitStackPtr, Offset); uint64_t Chunk = (1LL << 31) - 1; DebugLoc DL = MBB.findDebugLoc(MBBI); while (Offset) { uint64_t ThisVal = (Offset > Chunk) ? Chunk : Offset; - if (ThisVal == (Is64Bit ? 8 : 4)) { + if (ThisVal == (Is64BitTarget ? 8 : 4)) { // Use push / pop instead. unsigned Reg = isSub - ? (unsigned)(Is64Bit ? X86::RAX : X86::EAX) - : findDeadCallerSavedReg(MBB, MBBI, TRI, Is64Bit); + ? (unsigned)(Is64BitTarget ? X86::RAX : X86::EAX) + : findDeadCallerSavedReg(MBB, MBBI, TRI, Is64BitTarget); if (Reg) { Opc = isSub - ? (Is64Bit ? X86::PUSH64r : X86::PUSH32r) - : (Is64Bit ? X86::POP64r : X86::POP32r); + ? (Is64BitTarget ? X86::PUSH64r : X86::PUSH32r) + : (Is64BitTarget ? X86::POP64r : X86::POP32r); MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(Opc)) .addReg(Reg, getDefRegState(!isSub) | getUndefRegState(isSub)); if (isSub) @@ -314,7 +327,7 @@ X86FrameLowering::emitCalleeSavedFrameMoves(MachineBasicBlock &MBB, MachineFrameInfo *MFI = MF.getFrameInfo(); MachineModuleInfo &MMI = MF.getMMI(); const MCRegisterInfo *MRI = MMI.getContext().getRegisterInfo(); - const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo(); + const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); // Add callee saved registers to move list. const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo(); @@ -352,6 +365,23 @@ static bool usesTheStack(const MachineFunction &MF) { return false; } +void X86FrameLowering::getStackProbeFunction(const X86Subtarget &STI, + unsigned &CallOp, + const char *&Symbol) { + CallOp = STI.is64Bit() ? X86::W64ALLOCA : X86::CALLpcrel32; + + if (STI.is64Bit()) { + if (STI.isTargetCygMing()) { + Symbol = "___chkstk_ms"; + } else { + Symbol = "__chkstk"; + } + } else if (STI.isTargetCygMing()) + Symbol = "_alloca"; + else + Symbol = "_chkstk"; +} + /// emitPrologue - Push callee-saved registers onto the stack, which /// automatically adjust the stack pointer. Adjust the stack pointer to allocate /// space for local variables. 
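getStackProbeFunction, factored out above, names the runtime routine that touches a large stack allocation one 4 KiB page at a time so the OS guard pages are committed in order; the prologue only calls it when the frame is at least one page (the PageSize constant added below), and otherwise adjusts the stack pointer directly through emitSPUpdate. A sketch mirroring the branch structure of the hunk (illustrative strings only):

#include <string>
#include <utility>

// Returns (call opcode name, probe symbol). W64ALLOCA is the Win64 flavor
// of the call pseudo-instruction used for the probe.
std::pair<std::string, std::string> stackProbe(bool Is64Bit, bool IsCygMing) {
  std::string CallOp = Is64Bit ? "W64ALLOCA" : "CALLpcrel32";
  if (Is64Bit)
    return {CallOp, IsCygMing ? "___chkstk_ms" : "__chkstk"};
  return {CallOp, IsCygMing ? "_alloca" : "_chkstk"};
}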
Also emit labels used by the exception handler to @@ -440,8 +470,8 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const { MachineFrameInfo *MFI = MF.getFrameInfo(); const Function *Fn = MF.getFunction(); const X86RegisterInfo *RegInfo = - static_cast<const X86RegisterInfo *>(MF.getTarget().getRegisterInfo()); - const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo(); + static_cast<const X86RegisterInfo *>(MF.getSubtarget().getRegisterInfo()); + const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); MachineModuleInfo &MMI = MF.getMMI(); X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); uint64_t MaxAlign = MFI->getMaxAlignment(); // Desired stack alignment. @@ -449,11 +479,12 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const { bool HasFP = hasFP(MF); const X86Subtarget &STI = MF.getTarget().getSubtarget<X86Subtarget>(); bool Is64Bit = STI.is64Bit(); - bool IsLP64 = STI.isTarget64BitLP64(); + // standard x86_64 and NaCl use 64-bit frame/stack pointers, x32 - 32-bit. + const bool Uses64BitFramePtr = STI.isTarget64BitLP64() || STI.isTargetNaCl64(); bool IsWin64 = STI.isTargetWin64(); - bool IsWinEH = - MF.getTarget().getMCAsmInfo()->getExceptionHandlingType() == - ExceptionHandling::WinEH; // Not necessarily synonymous with IsWin64. + // Not necessarily synonymous with IsWin64. + bool IsWinEH = MF.getTarget().getMCAsmInfo()->getExceptionHandlingType() == + ExceptionHandling::ItaniumWinEH; bool NeedsWinEH = IsWinEH && Fn->needsUnwindTableEntry(); bool NeedsDwarfCFI = !IsWinEH && (MMI.hasDebugInfo() || Fn->needsUnwindTableEntry()); @@ -461,6 +492,8 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const { unsigned StackAlign = getStackAlignment(); unsigned SlotSize = RegInfo->getSlotSize(); unsigned FramePtr = RegInfo->getFrameRegister(MF); + const unsigned MachineFramePtr = STI.isTarget64BitILP32() ? + getX86SubSuperRegister(FramePtr, MVT::i64, false) : FramePtr; unsigned StackPtr = RegInfo->getStackRegister(); unsigned BasePtr = RegInfo->getBaseRegister(); DebugLoc DL; @@ -482,6 +515,8 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const { X86FI->setCalleeSavedFrameSize( X86FI->getCalleeSavedFrameSize() - TailCallReturnAddrDelta); + bool UseStackProbe = (STI.isOSWindows() && !STI.isTargetMacho()); + // If this is x86-64 and the Red Zone is not disabled, if we are a leaf // function, and use up to 128 bytes of stack space, don't have a frame // pointer, calls, or dynamic alloca then we do not need to adjust the @@ -507,7 +542,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const { if (TailCallReturnAddrDelta < 0) { MachineInstr *MI = BuildMI(MBB, MBBI, DL, - TII.get(getSUBriOpcode(IsLP64, -TailCallReturnAddrDelta)), + TII.get(getSUBriOpcode(Uses64BitFramePtr, -TailCallReturnAddrDelta)), StackPtr) .addReg(StackPtr) .addImm(-TailCallReturnAddrDelta) @@ -551,7 +586,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const { // Save EBP/RBP into the appropriate stack slot. BuildMI(MBB, MBBI, DL, TII.get(Is64Bit ? X86::PUSH64r : X86::PUSH32r)) - .addReg(FramePtr, RegState::Kill) + .addReg(MachineFramePtr, RegState::Kill) .setMIFlag(MachineInstr::FrameSetup); if (NeedsDwarfCFI) { @@ -564,7 +599,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const { .addCFIIndex(CFIIndex); // Change the rule for the FramePtr to be an "offset" rule. 
- unsigned DwarfFramePtr = RegInfo->getDwarfRegNum(FramePtr, true); + unsigned DwarfFramePtr = RegInfo->getDwarfRegNum(MachineFramePtr, true); CFIIndex = MMI.addFrameInst( MCCFIInstruction::createOffset(nullptr, DwarfFramePtr, 2 * stackGrowth)); @@ -580,14 +615,14 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const { // Update EBP with the new base value. BuildMI(MBB, MBBI, DL, - TII.get(Is64Bit ? X86::MOV64rr : X86::MOV32rr), FramePtr) + TII.get(Uses64BitFramePtr ? X86::MOV64rr : X86::MOV32rr), FramePtr) .addReg(StackPtr) .setMIFlag(MachineInstr::FrameSetup); if (NeedsDwarfCFI) { // Mark effective beginning of when frame pointer becomes valid. // Define the current CFA to use the EBP/RBP register. - unsigned DwarfFramePtr = RegInfo->getDwarfRegNum(FramePtr, true); + unsigned DwarfFramePtr = RegInfo->getDwarfRegNum(MachineFramePtr, true); unsigned CFIIndex = MMI.addFrameInst( MCCFIInstruction::createDefCfaRegister(nullptr, DwarfFramePtr)); BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION)) @@ -596,7 +631,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const { // Mark the FramePtr as live-in in every block. for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I) - I->addLiveIn(FramePtr); + I->addLiveIn(MachineFramePtr); } else { NumBytes = StackSize - X86FI->getCalleeSavedFrameSize(); } @@ -633,11 +668,12 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const { // able to calculate their offsets from the frame pointer). if (RegInfo->needsStackRealignment(MF)) { assert(HasFP && "There should be a frame pointer if stack is realigned."); + uint64_t Val = -MaxAlign; MachineInstr *MI = BuildMI(MBB, MBBI, DL, - TII.get(Is64Bit ? X86::AND64ri32 : X86::AND32ri), StackPtr) + TII.get(getANDriOpcode(Uses64BitFramePtr, Val)), StackPtr) .addReg(StackPtr) - .addImm(-MaxAlign) + .addImm(Val) .setMIFlag(MachineInstr::FrameSetup); // The EFLAGS implicit def is dead. @@ -655,6 +691,8 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const { // Adjust stack pointer: ESP -= numbytes. + static const size_t PageSize = 4096; + // Windows and cygwin/mingw require a prologue helper routine when allocating // more than 4K bytes on the stack. Windows uses __chkstk and cygwin/mingw // uses __alloca. __alloca and the 32-bit version of __chkstk will probe the @@ -663,19 +701,11 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const { // responsible for adjusting the stack pointer. Touching the stack at 4K // increments is necessary to ensure that the guard pages used by the OS // virtual memory manager are allocated in correct sequence. - if (NumBytes >= 4096 && STI.isOSWindows() && !STI.isTargetMacho()) { + if (NumBytes >= PageSize && UseStackProbe) { const char *StackProbeSymbol; + unsigned CallOp; - if (Is64Bit) { - if (STI.isTargetCygMing()) { - StackProbeSymbol = "___chkstk_ms"; - } else { - StackProbeSymbol = "__chkstk"; - } - } else if (STI.isTargetCygMing()) - StackProbeSymbol = "_alloca"; - else - StackProbeSymbol = "_chkstk"; + getStackProbeFunction(STI, CallOp, StackProbeSymbol); // Check whether EAX is livein for this function. bool isEAXAlive = isEAXLiveIn(MF); @@ -706,7 +736,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const { } BuildMI(MBB, MBBI, DL, - TII.get(Is64Bit ? 
X86::W64ALLOCA : X86::CALLpcrel32)) + TII.get(CallOp)) .addExternalSymbol(StackProbeSymbol) .addReg(StackPtr, RegState::Define | RegState::Implicit) .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit) @@ -722,15 +752,15 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const { .setMIFlag(MachineInstr::FrameSetup); } if (isEAXAlive) { - // Restore EAX - MachineInstr *MI = addRegOffset(BuildMI(MF, DL, TII.get(X86::MOV32rm), - X86::EAX), - StackPtr, false, NumBytes - 4); - MI->setFlag(MachineInstr::FrameSetup); - MBB.insert(MBBI, MI); + // Restore EAX + MachineInstr *MI = addRegOffset(BuildMI(MF, DL, TII.get(X86::MOV32rm), + X86::EAX), + StackPtr, false, NumBytes - 4); + MI->setFlag(MachineInstr::FrameSetup); + MBB.insert(MBBI, MI); } } else if (NumBytes) { - emitSPUpdate(MBB, MBBI, StackPtr, -(int64_t)NumBytes, Is64Bit, IsLP64, + emitSPUpdate(MBB, MBBI, StackPtr, -(int64_t)NumBytes, Is64Bit, Uses64BitFramePtr, UseLEA, TII, *RegInfo); } @@ -746,7 +776,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const { // will restore SP to (BP - SEHFrameOffset) for (const CalleeSavedInfo &Info : MFI->getCalleeSavedInfo()) { int offset = MFI->getObjectOffset(Info.getFrameIdx()); - SEHFrameOffset = std::max(SEHFrameOffset, abs(offset)); + SEHFrameOffset = std::max(SEHFrameOffset, std::abs(offset)); } SEHFrameOffset += SEHFrameOffset % 16; // ensure alignmant @@ -804,7 +834,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const { // to reference locals. if (RegInfo->hasBasePointer(MF)) { // Update the base pointer with the current stack pointer. - unsigned Opc = Is64Bit ? X86::MOV64rr : X86::MOV32rr; + unsigned Opc = Uses64BitFramePtr ? X86::MOV64rr : X86::MOV32rr; BuildMI(MBB, MBBI, DL, TII.get(Opc), BasePtr) .addReg(StackPtr) .setMIFlag(MachineInstr::FrameSetup); @@ -834,21 +864,29 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF, const MachineFrameInfo *MFI = MF.getFrameInfo(); X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); const X86RegisterInfo *RegInfo = - static_cast<const X86RegisterInfo *>(MF.getTarget().getRegisterInfo()); - const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo(); + static_cast<const X86RegisterInfo *>(MF.getSubtarget().getRegisterInfo()); + const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr(); assert(MBBI != MBB.end() && "Returning block has no instructions"); unsigned RetOpcode = MBBI->getOpcode(); DebugLoc DL = MBBI->getDebugLoc(); const X86Subtarget &STI = MF.getTarget().getSubtarget<X86Subtarget>(); bool Is64Bit = STI.is64Bit(); - bool IsLP64 = STI.isTarget64BitLP64(); + // standard x86_64 and NaCl use 64-bit frame/stack pointers, x32 - 32-bit. + const bool Uses64BitFramePtr = STI.isTarget64BitLP64() || STI.isTargetNaCl64(); + const bool Is64BitILP32 = STI.isTarget64BitILP32(); bool UseLEA = STI.useLeaForSP(); unsigned StackAlign = getStackAlignment(); unsigned SlotSize = RegInfo->getSlotSize(); unsigned FramePtr = RegInfo->getFrameRegister(MF); + unsigned MachineFramePtr = Is64BitILP32 ? 
+ getX86SubSuperRegister(FramePtr, MVT::i64, false) : FramePtr; unsigned StackPtr = RegInfo->getStackRegister(); + bool IsWinEH = MF.getTarget().getMCAsmInfo()->getExceptionHandlingType() == + ExceptionHandling::ItaniumWinEH; + bool NeedsWinEH = IsWinEH && MF.getFunction()->needsUnwindTableEntry(); + switch (RetOpcode) { default: llvm_unreachable("Can only insert epilog into returning blocks"); @@ -898,7 +936,7 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF, // Pop EBP. BuildMI(MBB, MBBI, DL, - TII.get(Is64Bit ? X86::POP64r : X86::POP32r), FramePtr); + TII.get(Is64Bit ? X86::POP64r : X86::POP32r), MachineFramePtr); } else { NumBytes = StackSize - CSSize; } @@ -930,27 +968,39 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF, if (RegInfo->needsStackRealignment(MF)) MBBI = FirstCSPop; if (CSSize != 0) { - unsigned Opc = getLEArOpcode(IsLP64); + unsigned Opc = getLEArOpcode(Uses64BitFramePtr); addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr), FramePtr, false, -CSSize); + --MBBI; } else { - unsigned Opc = (Is64Bit ? X86::MOV64rr : X86::MOV32rr); + unsigned Opc = (Uses64BitFramePtr ? X86::MOV64rr : X86::MOV32rr); BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr) .addReg(FramePtr); + --MBBI; } } else if (NumBytes) { // Adjust stack pointer back: ESP += numbytes. - emitSPUpdate(MBB, MBBI, StackPtr, NumBytes, Is64Bit, IsLP64, UseLEA, + emitSPUpdate(MBB, MBBI, StackPtr, NumBytes, Is64Bit, Uses64BitFramePtr, UseLEA, TII, *RegInfo); + --MBBI; } + // Windows unwinder will not invoke function's exception handler if IP is + // either in prologue or in epilogue. This behavior causes a problem when a + // call immediately precedes an epilogue, because the return address points + // into the epilogue. To cope with that, we insert an epilogue marker here, + // then replace it with a 'nop' if it ends up immediately after a CALL in the + // final emitted code. + if (NeedsWinEH) + BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_Epilogue)); + // We're returning from function via eh_return. if (RetOpcode == X86::EH_RETURN || RetOpcode == X86::EH_RETURN64) { MBBI = MBB.getLastNonDebugInstr(); MachineOperand &DestAddr = MBBI->getOperand(0); assert(DestAddr.isReg() && "Offset should be in register!"); BuildMI(MBB, MBBI, DL, - TII.get(Is64Bit ? X86::MOV64rr : X86::MOV32rr), + TII.get(Uses64BitFramePtr ? X86::MOV64rr : X86::MOV32rr), StackPtr).addReg(DestAddr.getReg()); } else if (RetOpcode == X86::TCRETURNri || RetOpcode == X86::TCRETURNdi || RetOpcode == X86::TCRETURNmi || @@ -976,7 +1026,7 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF, if (Offset) { // Check for possible merge with preceding ADD instruction. Offset += mergeSPUpdates(MBB, MBBI, StackPtr, true); - emitSPUpdate(MBB, MBBI, StackPtr, Offset, Is64Bit, IsLP64, + emitSPUpdate(MBB, MBBI, StackPtr, Offset, Is64Bit, Uses64BitFramePtr, UseLEA, TII, *RegInfo); } @@ -1021,7 +1071,7 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF, // Check for possible merge with preceding ADD instruction. 
delta += mergeSPUpdates(MBB, MBBI, StackPtr, true); - emitSPUpdate(MBB, MBBI, StackPtr, delta, Is64Bit, IsLP64, UseLEA, TII, + emitSPUpdate(MBB, MBBI, StackPtr, delta, Is64Bit, Uses64BitFramePtr, UseLEA, TII, *RegInfo); } } @@ -1029,7 +1079,7 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF, int X86FrameLowering::getFrameIndexOffset(const MachineFunction &MF, int FI) const { const X86RegisterInfo *RegInfo = - static_cast<const X86RegisterInfo*>(MF.getTarget().getRegisterInfo()); + static_cast<const X86RegisterInfo *>(MF.getSubtarget().getRegisterInfo()); const MachineFrameInfo *MFI = MF.getFrameInfo(); int Offset = MFI->getObjectOffset(FI) - getOffsetOfLocalArea(); uint64_t StackSize = MFI->getStackSize(); @@ -1072,7 +1122,7 @@ int X86FrameLowering::getFrameIndexOffset(const MachineFunction &MF, int X86FrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI, unsigned &FrameReg) const { const X86RegisterInfo *RegInfo = - static_cast<const X86RegisterInfo*>(MF.getTarget().getRegisterInfo()); + static_cast<const X86RegisterInfo *>(MF.getSubtarget().getRegisterInfo()); // We can't calculate offset from frame pointer if the stack is realigned, // so enforce usage of stack/base pointer. The base pointer is used when we // have dynamic allocas in addition to dynamic realignment. @@ -1090,7 +1140,7 @@ bool X86FrameLowering::assignCalleeSavedSpillSlots( std::vector<CalleeSavedInfo> &CSI) const { MachineFrameInfo *MFI = MF.getFrameInfo(); const X86RegisterInfo *RegInfo = - static_cast<const X86RegisterInfo *>(MF.getTarget().getRegisterInfo()); + static_cast<const X86RegisterInfo *>(MF.getSubtarget().getRegisterInfo()); unsigned SlotSize = RegInfo->getSlotSize(); X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); @@ -1107,7 +1157,7 @@ bool X86FrameLowering::assignCalleeSavedSpillSlots( // about avoiding it later. unsigned FPReg = RegInfo->getFrameRegister(MF); for (unsigned i = 0; i < CSI.size(); ++i) { - if (CSI[i].getReg() == FPReg) { + if (TRI->regsOverlap(CSI[i].getReg(),FPReg)) { CSI.erase(CSI.begin() + i); break; } @@ -1138,7 +1188,7 @@ bool X86FrameLowering::assignCalleeSavedSpillSlots( const TargetRegisterClass *RC = RegInfo->getMinimalPhysRegClass(Reg); // ensure alignment - SpillSlotOffset -= abs(SpillSlotOffset) % RC->getAlignment(); + SpillSlotOffset -= std::abs(SpillSlotOffset) % RC->getAlignment(); // spill into slot SpillSlotOffset -= RC->getSize(); int SlotIndex = @@ -1157,7 +1207,7 @@ bool X86FrameLowering::spillCalleeSavedRegisters( DebugLoc DL = MBB.findDebugLoc(MI); MachineFunction &MF = *MBB.getParent(); - const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo(); + const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); const X86Subtarget &STI = MF.getTarget().getSubtarget<X86Subtarget>(); // Push GPRs. It increases frame size. @@ -1205,7 +1255,7 @@ bool X86FrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB, DebugLoc DL = MBB.findDebugLoc(MI); MachineFunction &MF = *MBB.getParent(); - const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo(); + const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); const X86Subtarget &STI = MF.getTarget().getSubtarget<X86Subtarget>(); // Reload XMMs from stack frame. 
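assignCalleeSavedSpillSlots above lays out XMM spill slots by walking the offset downward: each slot is first aligned to the register class's alignment (the std::abs fix in this hunk) and then advanced by the class's size. A one-function sketch of that step:

#include <cstdlib>

// Offsets grow downward (more negative), so align first, then reserve.
int allocateSpillSlot(int SpillSlotOffset, int Size, int Alignment) {
  SpillSlotOffset -= std::abs(SpillSlotOffset) % Alignment;  // ensure alignment
  SpillSlotOffset -= Size;                                   // reserve the slot
  return SpillSlotOffset;
}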
@@ -1237,7 +1287,7 @@ X86FrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF, RegScavenger *RS) const { MachineFrameInfo *MFI = MF.getFrameInfo(); const X86RegisterInfo *RegInfo = - static_cast<const X86RegisterInfo *>(MF.getTarget().getRegisterInfo()); + static_cast<const X86RegisterInfo *>(MF.getSubtarget().getRegisterInfo()); unsigned SlotSize = RegInfo->getSlotSize(); X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); @@ -1278,7 +1328,7 @@ HasNestArgument(const MachineFunction *MF) { /// and the properties of the function either one or two registers will be /// needed. Set primary to true for the first register, false for the second. static unsigned -GetScratchRegister(bool Is64Bit, const MachineFunction &MF, bool Primary) { +GetScratchRegister(bool Is64Bit, bool IsLP64, const MachineFunction &MF, bool Primary) { CallingConv::ID CallingConvention = MF.getFunction()->getCallingConv(); // Erlang stuff. @@ -1289,8 +1339,12 @@ GetScratchRegister(bool Is64Bit, const MachineFunction &MF, bool Primary) { return Primary ? X86::EBX : X86::EDI; } - if (Is64Bit) - return Primary ? X86::R11 : X86::R12; + if (Is64Bit) { + if (IsLP64) + return Primary ? X86::R11 : X86::R12; + else + return Primary ? X86::R11D : X86::R12D; + } bool IsNested = HasNestArgument(&MF); @@ -1314,14 +1368,15 @@ void X86FrameLowering::adjustForSegmentedStacks(MachineFunction &MF) const { MachineBasicBlock &prologueMBB = MF.front(); MachineFrameInfo *MFI = MF.getFrameInfo(); - const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo(); + const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); uint64_t StackSize; const X86Subtarget &STI = MF.getTarget().getSubtarget<X86Subtarget>(); bool Is64Bit = STI.is64Bit(); + const bool IsLP64 = STI.isTarget64BitLP64(); unsigned TlsReg, TlsOffset; DebugLoc DL; - unsigned ScratchReg = GetScratchRegister(Is64Bit, MF, true); + unsigned ScratchReg = GetScratchRegister(Is64Bit, IsLP64, MF, true); assert(!MF.getRegInfo().isLiveIn(ScratchReg) && "Scratch register is live-in"); @@ -1359,7 +1414,7 @@ X86FrameLowering::adjustForSegmentedStacks(MachineFunction &MF) const { } if (IsNested) - allocMBB->addLiveIn(X86::R10); + allocMBB->addLiveIn(IsLP64 ? X86::R10 : X86::R10D); MF.push_front(allocMBB); MF.push_front(checkMBB); @@ -1372,7 +1427,7 @@ X86FrameLowering::adjustForSegmentedStacks(MachineFunction &MF) const { if (Is64Bit) { if (STI.isTargetLinux()) { TlsReg = X86::FS; - TlsOffset = 0x70; + TlsOffset = IsLP64 ? 0x70 : 0x40; } else if (STI.isTargetDarwin()) { TlsReg = X86::GS; TlsOffset = 0x60 + 90*8; // See pthread_machdep.h. Steal TLS slot 90. @@ -1387,12 +1442,12 @@ X86FrameLowering::adjustForSegmentedStacks(MachineFunction &MF) const { } if (CompareStackPointer) - ScratchReg = X86::RSP; + ScratchReg = IsLP64 ? X86::RSP : X86::ESP; else - BuildMI(checkMBB, DL, TII.get(X86::LEA64r), ScratchReg).addReg(X86::RSP) + BuildMI(checkMBB, DL, TII.get(IsLP64 ? X86::LEA64r : X86::LEA64_32r), ScratchReg).addReg(X86::RSP) .addImm(1).addReg(0).addImm(-StackSize).addReg(0); - BuildMI(checkMBB, DL, TII.get(X86::CMP64rm)).addReg(ScratchReg) + BuildMI(checkMBB, DL, TII.get(IsLP64 ? X86::CMP64rm : X86::CMP32rm)).addReg(ScratchReg) .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg); } else { if (STI.isTargetLinux()) { @@ -1426,11 +1481,11 @@ X86FrameLowering::adjustForSegmentedStacks(MachineFunction &MF) const { bool SaveScratch2; if (CompareStackPointer) { // The primary scratch register is available for holding the TLS offset. 
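GetScratchRegister now also takes IsLP64 so that x32 gets the 32-bit register views, and the Linux %fs-relative stack-limit slot moves from 0x70 to 0x40 for ILP32. A standalone sketch of just those two decisions from the hunk above; the 32-bit-target branch of the real function is not modeled:

#include <cstdio>

// Mirrors only the Is64Bit branch of GetScratchRegister.
static const char *scratchReg64(bool IsLP64, bool Primary) {
  if (IsLP64)
    return Primary ? "R11" : "R12";
  return Primary ? "R11D" : "R12D";
}

// %fs-relative offset used by the segmented-stack limit check on 64-bit Linux.
static unsigned linuxStackLimitTlsOffset(bool IsLP64) {
  return IsLP64 ? 0x70 : 0x40;
}

int main() {
  std::printf("LP64: %s, fs:0x%x\n", scratchReg64(true, true),
              linuxStackLimitTlsOffset(true));
  std::printf("x32 : %s, fs:0x%x\n", scratchReg64(false, true),
              linuxStackLimitTlsOffset(false));
}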
- ScratchReg2 = GetScratchRegister(Is64Bit, MF, true); + ScratchReg2 = GetScratchRegister(Is64Bit, IsLP64, MF, true); SaveScratch2 = false; } else { // Need to use a second register to hold the TLS offset - ScratchReg2 = GetScratchRegister(Is64Bit, MF, false); + ScratchReg2 = GetScratchRegister(Is64Bit, IsLP64, MF, false); // Unfortunately, with fastcc the second scratch register may hold an // argument. @@ -1468,15 +1523,21 @@ X86FrameLowering::adjustForSegmentedStacks(MachineFunction &MF) const { // Functions with nested arguments use R10, so it needs to be saved across // the call to _morestack + const unsigned RegAX = IsLP64 ? X86::RAX : X86::EAX; + const unsigned Reg10 = IsLP64 ? X86::R10 : X86::R10D; + const unsigned Reg11 = IsLP64 ? X86::R11 : X86::R11D; + const unsigned MOVrr = IsLP64 ? X86::MOV64rr : X86::MOV32rr; + const unsigned MOVri = IsLP64 ? X86::MOV64ri : X86::MOV32ri; + if (IsNested) - BuildMI(allocMBB, DL, TII.get(X86::MOV64rr), X86::RAX).addReg(X86::R10); + BuildMI(allocMBB, DL, TII.get(MOVrr), RegAX).addReg(Reg10); - BuildMI(allocMBB, DL, TII.get(X86::MOV64ri), X86::R10) + BuildMI(allocMBB, DL, TII.get(MOVri), Reg10) .addImm(StackSize); - BuildMI(allocMBB, DL, TII.get(X86::MOV64ri), X86::R11) + BuildMI(allocMBB, DL, TII.get(MOVri), Reg11) .addImm(X86FI->getArgumentStackSize()); - MF.getRegInfo().setPhysRegUsed(X86::R10); - MF.getRegInfo().setPhysRegUsed(X86::R11); + MF.getRegInfo().setPhysRegUsed(Reg10); + MF.getRegInfo().setPhysRegUsed(Reg11); } else { BuildMI(allocMBB, DL, TII.get(X86::PUSHi32)) .addImm(X86FI->getArgumentStackSize()); @@ -1523,13 +1584,14 @@ X86FrameLowering::adjustForSegmentedStacks(MachineFunction &MF) const { /// temp0 = sp - MaxStack /// if( temp0 < SP_LIMIT(P) ) goto IncStack else goto OldStart void X86FrameLowering::adjustForHiPEPrologue(MachineFunction &MF) const { - const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo(); + const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); MachineFrameInfo *MFI = MF.getFrameInfo(); const unsigned SlotSize = - static_cast<const X86RegisterInfo *>(MF.getTarget().getRegisterInfo()) + static_cast<const X86RegisterInfo *>(MF.getSubtarget().getRegisterInfo()) ->getSlotSize(); const X86Subtarget &STI = MF.getTarget().getSubtarget<X86Subtarget>(); const bool Is64Bit = STI.is64Bit(); + const bool IsLP64 = STI.isTarget64BitLP64(); DebugLoc DL; // HiPE-specific values const unsigned HipeLeafWords = 24; @@ -1623,7 +1685,7 @@ void X86FrameLowering::adjustForHiPEPrologue(MachineFunction &MF) const { SPLimitOffset = 0x4c; } - ScratchReg = GetScratchRegister(Is64Bit, MF, true); + ScratchReg = GetScratchRegister(Is64Bit, IsLP64, MF, true); assert(!MF.getRegInfo().isLiveIn(ScratchReg) && "HiPE prologue scratch register is live-in"); @@ -1657,9 +1719,9 @@ void X86FrameLowering::adjustForHiPEPrologue(MachineFunction &MF) const { void X86FrameLowering:: eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const { - const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo(); - const X86RegisterInfo &RegInfo = - *static_cast<const X86RegisterInfo *>(MF.getTarget().getRegisterInfo()); + const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); + const X86RegisterInfo &RegInfo = *static_cast<const X86RegisterInfo *>( + MF.getSubtarget().getRegisterInfo()); unsigned StackPtr = RegInfo.getStackRegister(); bool reseveCallFrame = hasReservedCallFrame(MF); int Opcode = I->getOpcode(); @@ -1682,8 +1744,10 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, 
MachineBasicBlock &MBB, // We need to keep the stack aligned properly. To do this, we round the // amount of space needed for the outgoing arguments up to the next // alignment boundary. - unsigned StackAlign = - MF.getTarget().getFrameLowering()->getStackAlignment(); + unsigned StackAlign = MF.getTarget() + .getSubtargetImpl() + ->getFrameLowering() + ->getStackAlignment(); Amount = (Amount + StackAlign - 1) / StackAlign * StackAlign; MachineInstr *New = nullptr; diff --git a/lib/Target/X86/X86FrameLowering.h b/lib/Target/X86/X86FrameLowering.h index 5ad3d4d..7740c3a 100644 --- a/lib/Target/X86/X86FrameLowering.h +++ b/lib/Target/X86/X86FrameLowering.h @@ -11,8 +11,8 @@ // //===----------------------------------------------------------------------===// -#ifndef X86_FRAMELOWERING_H -#define X86_FRAMELOWERING_H +#ifndef LLVM_LIB_TARGET_X86_X86FRAMELOWERING_H +#define LLVM_LIB_TARGET_X86_X86FRAMELOWERING_H #include "llvm/Target/TargetFrameLowering.h" @@ -20,12 +20,17 @@ namespace llvm { class MCSymbol; class X86TargetMachine; +class X86Subtarget; class X86FrameLowering : public TargetFrameLowering { public: explicit X86FrameLowering(StackDirection D, unsigned StackAl, int LAO) : TargetFrameLowering(StackGrowsDown, StackAl, LAO) {} + static void getStackProbeFunction(const X86Subtarget &STI, + unsigned &CallOp, + const char *&Symbol); + void emitCalleeSavedFrameMoves(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, DebugLoc DL) const; diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp index ba2f5f6..3ef7b2c 100644 --- a/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -24,6 +24,7 @@ #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/SelectionDAGISel.h" +#include "llvm/IR/Function.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/Type.h" @@ -33,6 +34,7 @@ #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetOptions.h" +#include <stdint.h> using namespace llvm; #define DEBUG_TYPE "x86-isel" @@ -192,7 +194,6 @@ namespace { private: SDNode *Select(SDNode *N) override; SDNode *SelectGather(SDNode *N, unsigned Opc); - SDNode *SelectAtomic64(SDNode *Node, unsigned Opc); SDNode *SelectAtomicLoadArith(SDNode *Node, MVT NVT); bool FoldOffsetIntoAddress(uint64_t Offset, X86ISelAddressMode &AM); @@ -237,10 +238,10 @@ namespace { inline void getAddressOperands(X86ISelAddressMode &AM, SDValue &Base, SDValue &Scale, SDValue &Index, SDValue &Disp, SDValue &Segment) { - Base = (AM.BaseType == X86ISelAddressMode::FrameIndexBase) ? - CurDAG->getTargetFrameIndex(AM.Base_FrameIndex, - getTargetLowering()->getPointerTy()) : - AM.Base_Reg; + Base = (AM.BaseType == X86ISelAddressMode::FrameIndexBase) + ? CurDAG->getTargetFrameIndex(AM.Base_FrameIndex, + TLI->getPointerTy()) + : AM.Base_Reg; Scale = getI8Imm(AM.Scale); Index = AM.IndexReg; // These are 32-bit even in 64-bit mode since RIP relative offset @@ -297,7 +298,14 @@ namespace { /// getInstrInfo - Return a reference to the TargetInstrInfo, casted /// to the target-specific type. const X86InstrInfo *getInstrInfo() const { - return getTargetMachine().getInstrInfo(); + return getTargetMachine().getSubtargetImpl()->getInstrInfo(); + } + + /// \brief Address-mode matching performs shift-of-and to and-of-shift + /// reassociation in order to expose more scaled addressing + /// opportunities. 
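The stack-alignment rounding used above when eliminating call-frame pseudos is a plain round-up to the next multiple of the stack alignment; a quick standalone check of that expression:

#include <cassert>

static unsigned alignAmount(unsigned Amount, unsigned StackAlign) {
  return (Amount + StackAlign - 1) / StackAlign * StackAlign;
}

int main() {
  assert(alignAmount(0, 16) == 0);
  assert(alignAmount(1, 16) == 16);
  assert(alignAmount(20, 16) == 32);  // outgoing-arg area grows to the next 16-byte boundary
  assert(alignAmount(32, 16) == 32);  // already aligned amounts are unchanged
}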
+ bool ComplexPatternFuncMutatesDAG() const override { + return true; } }; } @@ -510,7 +518,7 @@ void X86DAGToDAGISel::PreprocessISelDAG() { // If the source and destination are SSE registers, then this is a legal // conversion that should not be lowered. const X86TargetLowering *X86Lowering = - static_cast<const X86TargetLowering *>(getTargetLowering()); + static_cast<const X86TargetLowering *>(TLI); bool SrcIsSSE = X86Lowering->isScalarFPTypeInSSEReg(SrcVT); bool DstIsSSE = X86Lowering->isScalarFPTypeInSSEReg(DstVT); if (SrcIsSSE && DstIsSSE) @@ -544,7 +552,7 @@ void X86DAGToDAGISel::PreprocessISelDAG() { false, false, 0); SDValue Result = CurDAG->getExtLoad(ISD::EXTLOAD, dl, DstVT, Store, MemTmp, MachinePointerInfo(), - MemVT, false, false, 0); + MemVT, false, false, false, 0); // We're about to replace all uses of the FP_ROUND/FP_EXTEND with the // extload we created. This will cause general havok on the dag because @@ -565,7 +573,7 @@ void X86DAGToDAGISel::PreprocessISelDAG() { /// the main function. void X86DAGToDAGISel::EmitSpecialCodeForMain(MachineBasicBlock *BB, MachineFrameInfo *MFI) { - const TargetInstrInfo *TII = TM.getInstrInfo(); + const TargetInstrInfo *TII = TM.getSubtargetImpl()->getInstrInfo(); if (Subtarget->isTargetCygMing()) { unsigned CallOp = Subtarget->is64Bit() ? X86::CALL64pcrel32 : X86::CALLpcrel32; @@ -775,9 +783,10 @@ static void InsertDAGNode(SelectionDAG &DAG, SDValue Pos, SDValue N) { } } -// Transform "(X >> (8-C1)) & C2" to "(X >> 8) & 0xff)" if safe. This -// allows us to convert the shift and and into an h-register extract and -// a scaled index. Returns false if the simplification is performed. +// Transform "(X >> (8-C1)) & (0xff << C1)" to "((X >> 8) & 0xff) << C1" if +// safe. This allows us to convert the shift and and into an h-register +// extract and a scaled index. Returns false if the simplification is +// performed. static bool FoldMaskAndShiftToExtract(SelectionDAG &DAG, SDValue N, uint64_t Mask, SDValue Shift, SDValue X, @@ -1429,7 +1438,7 @@ bool X86DAGToDAGISel::SelectLEA64_32Addr(SDValue N, SDValue &Base, RegisterSDNode *RN = dyn_cast<RegisterSDNode>(Base); if (RN && RN->getReg() == 0) Base = CurDAG->getRegister(0, MVT::i64); - else if (Base.getValueType() == MVT::i32 && !dyn_cast<FrameIndexSDNode>(N)) { + else if (Base.getValueType() == MVT::i32 && !dyn_cast<FrameIndexSDNode>(Base)) { // Base could already be %rip, particularly in the x32 ABI. 
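One of the hunks above corrects the FoldMaskAndShiftToExtract comment to the identity actually being exploited. A quick standalone check that the two forms agree for every in-range shift amount, which is what lets the shift-and-mask pair become an h-register extract plus a scaled index:

#include <cassert>
#include <cstdint>

int main() {
  uint64_t X = 0x123456789abcdef0ULL;
  for (unsigned C1 = 0; C1 <= 8; ++C1) {
    uint64_t Lhs = (X >> (8 - C1)) & (0xffULL << C1);   // original shape
    uint64_t Rhs = ((X >> 8) & 0xffULL) << C1;          // extract-then-scale shape
    assert(Lhs == Rhs);
  }
}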
Base = SDValue(CurDAG->getMachineNode( TargetOpcode::SUBREG_TO_REG, DL, MVT::i64, @@ -1563,26 +1572,7 @@ bool X86DAGToDAGISel::TryFoldLoad(SDNode *P, SDValue N, /// SDNode *X86DAGToDAGISel::getGlobalBaseReg() { unsigned GlobalBaseReg = getInstrInfo()->getGlobalBaseReg(MF); - return CurDAG->getRegister(GlobalBaseReg, - getTargetLowering()->getPointerTy()).getNode(); -} - -SDNode *X86DAGToDAGISel::SelectAtomic64(SDNode *Node, unsigned Opc) { - SDValue Chain = Node->getOperand(0); - SDValue In1 = Node->getOperand(1); - SDValue In2L = Node->getOperand(2); - SDValue In2H = Node->getOperand(3); - - SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; - if (!SelectAddr(Node, In1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) - return nullptr; - MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); - MemOp[0] = cast<MemSDNode>(Node)->getMemOperand(); - const SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, In2L, In2H, Chain}; - SDNode *ResNode = CurDAG->getMachineNode(Opc, SDLoc(Node), - MVT::i32, MVT::i32, MVT::Other, Ops); - cast<MachineSDNode>(ResNode)->setMemRefs(MemOp, MemOp + 1); - return ResNode; + return CurDAG->getRegister(GlobalBaseReg, TLI->getPointerTy()).getNode(); } /// Atomic opcode table @@ -1716,16 +1706,23 @@ static const uint16_t AtomicOpcTbl[AtomicOpcEnd][AtomicSzEnd] = { static SDValue getAtomicLoadArithTargetConstant(SelectionDAG *CurDAG, SDLoc dl, enum AtomicOpc &Op, MVT NVT, - SDValue Val) { + SDValue Val, + const X86Subtarget *Subtarget) { if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Val)) { int64_t CNVal = CN->getSExtValue(); // Quit if not 32-bit imm. if ((int32_t)CNVal != CNVal) return Val; + // Quit if INT32_MIN: it would be negated as it is negative and overflow, + // producing an immediate that does not fit in the 32 bits available for + // an immediate operand to sub. However, it still fits in 32 bits for the + // add (since it is not negated) so we can return target-constant. + if (CNVal == INT32_MIN) + return CurDAG->getTargetConstant(CNVal, NVT); // For atomic-load-add, we could do some optimizations. if (Op == ADD) { // Translate to INC/DEC if ADD by 1 or -1. - if ((CNVal == 1) || (CNVal == -1)) { + if (((CNVal == 1) || (CNVal == -1)) && !Subtarget->slowIncDec()) { Op = (CNVal == 1) ? INC : DEC; // No more constant operand after being translated into INC/DEC. return SDValue(); @@ -1774,8 +1771,8 @@ SDNode *X86DAGToDAGISel::SelectAtomicLoadArith(SDNode *Node, MVT NVT) { SDValue Chain = Node->getOperand(0); SDValue Ptr = Node->getOperand(1); SDValue Val = Node->getOperand(2); - SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; - if (!SelectAddr(Node, Ptr, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) + SDValue Base, Scale, Index, Disp, Segment; + if (!SelectAddr(Node, Ptr, Base, Scale, Index, Disp, Segment)) return nullptr; // Which index into the table. 
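The new INT32_MIN early-out in getAtomicLoadArithTargetConstant above exists because negating the constant (to use the sub-style encoding for a negative addend) overflows the signed 32-bit immediate range, while the original value still fits. A standalone demonstration of that asymmetry:

#include <cstdint>
#include <cstdio>

static bool fitsInImm32(int64_t V) { return V == static_cast<int32_t>(V); }

int main() {
  int64_t CNVal = INT32_MIN;
  std::printf("add imm fits: %d\n", fitsInImm32(CNVal));   // 1: -2147483648 is a valid imm32
  std::printf("sub imm fits: %d\n", fitsInImm32(-CNVal));  // 0: +2147483648 is not
}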
@@ -1797,7 +1794,7 @@ SDNode *X86DAGToDAGISel::SelectAtomicLoadArith(SDNode *Node, MVT NVT) { break; } - Val = getAtomicLoadArithTargetConstant(CurDAG, dl, Op, NVT, Val); + Val = getAtomicLoadArithTargetConstant(CurDAG, dl, Op, NVT, Val, Subtarget); bool isUnOp = !Val.getNode(); bool isCN = Val.getNode() && (Val.getOpcode() == ISD::TargetConstant); @@ -1829,31 +1826,40 @@ SDNode *X86DAGToDAGISel::SelectAtomicLoadArith(SDNode *Node, MVT NVT) { Opc = AtomicOpcTbl[Op][I32]; break; case MVT::i64: - Opc = AtomicOpcTbl[Op][I64]; if (isCN) { if (immSext8(Val.getNode())) Opc = AtomicOpcTbl[Op][SextConstantI64]; else if (i64immSExt32(Val.getNode())) Opc = AtomicOpcTbl[Op][ConstantI64]; - } + else + llvm_unreachable("True 64 bits constant in SelectAtomicLoadArith"); + } else + Opc = AtomicOpcTbl[Op][I64]; break; } assert(Opc != 0 && "Invalid arith lock transform!"); + // Building the new node. SDValue Ret; - SDValue Undef = SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, - dl, NVT), 0); - MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); - MemOp[0] = cast<MemSDNode>(Node)->getMemOperand(); if (isUnOp) { - SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Chain }; + SDValue Ops[] = { Base, Scale, Index, Disp, Segment, Chain }; Ret = SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops), 0); } else { - SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Val, Chain }; + SDValue Ops[] = { Base, Scale, Index, Disp, Segment, Val, Chain }; Ret = SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops), 0); } + + // Copying the MachineMemOperand. + MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); + MemOp[0] = cast<MemSDNode>(Node)->getMemOperand(); cast<MachineSDNode>(Ret)->setMemRefs(MemOp, MemOp + 1); + + // We need to have two outputs as that is what the original instruction had. + // So we add a dummy, undefined output. This is safe as we checked first + // that no-one uses our output anyway. + SDValue Undef = SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, + dl, NVT), 0); SDValue RetVals[] = { Undef, Ret }; return CurDAG->getMergeValues(RetVals, dl).getNode(); } @@ -2125,6 +2131,16 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { case X86ISD::GlobalBaseReg: return getGlobalBaseReg(); + case X86ISD::SHRUNKBLEND: { + // SHRUNKBLEND selects like a regular VSELECT. + SDValue VSelect = CurDAG->getNode( + ISD::VSELECT, SDLoc(Node), Node->getValueType(0), Node->getOperand(0), + Node->getOperand(1), Node->getOperand(2)); + ReplaceUses(SDValue(Node, 0), VSelect); + SelectCode(VSelect.getNode()); + // We already called ReplaceUses. + return nullptr; + } case ISD::ATOMIC_LOAD_XOR: case ISD::ATOMIC_LOAD_AND: @@ -2212,6 +2228,25 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { return CurDAG->SelectNodeTo(Node, ShlOp, NVT, SDValue(New, 0), getI8Imm(ShlVal)); } + case X86ISD::UMUL8: + case X86ISD::SMUL8: { + SDValue N0 = Node->getOperand(0); + SDValue N1 = Node->getOperand(1); + + Opc = (Opcode == X86ISD::SMUL8 ? 
X86::IMUL8r : X86::MUL8r); + + SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::AL, + N0, SDValue()).getValue(1); + + SDVTList VTs = CurDAG->getVTList(NVT, MVT::i32); + SDValue Ops[] = {N1, InFlag}; + SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops); + + ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0)); + ReplaceUses(SDValue(Node, 1), SDValue(CNode, 1)); + return nullptr; + } + case X86ISD::UMUL: { SDValue N0 = Node->getOperand(0); SDValue N1 = Node->getOperand(1); @@ -2387,11 +2422,14 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { } case ISD::SDIVREM: - case ISD::UDIVREM: { + case ISD::UDIVREM: + case X86ISD::SDIVREM8_SEXT_HREG: + case X86ISD::UDIVREM8_ZEXT_HREG: { SDValue N0 = Node->getOperand(0); SDValue N1 = Node->getOperand(1); - bool isSigned = Opcode == ISD::SDIVREM; + bool isSigned = (Opcode == ISD::SDIVREM || + Opcode == X86ISD::SDIVREM8_SEXT_HREG); if (!isSigned) { switch (NVT.SimpleTy) { default: llvm_unreachable("Unsupported VT!"); @@ -2507,33 +2545,43 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Glue, N1, InFlag), 0); } - // Prevent use of AH in a REX instruction by referencing AX instead. - // Shift it down 8 bits. + // Prevent use of AH in a REX instruction by explicitly copying it to + // an ABCD_L register. // // The current assumption of the register allocator is that isel - // won't generate explicit references to the GPR8_NOREX registers. If + // won't generate explicit references to the GR8_ABCD_H registers. If // the allocator and/or the backend get enhanced to be more robust in // that regard, this can be, and should be, removed. - if (HiReg == X86::AH && Subtarget->is64Bit() && - !SDValue(Node, 1).use_empty()) { - SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, - X86::AX, MVT::i16, InFlag); - InFlag = Result.getValue(2); - - // If we also need AL (the quotient), get it by extracting a subreg from - // Result. The fast register allocator does not like multiple CopyFromReg - // nodes using aliasing registers. - if (!SDValue(Node, 0).use_empty()) - ReplaceUses(SDValue(Node, 0), - CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl, MVT::i8, Result)); - - // Shift AX right by 8 bits instead of using AH. - Result = SDValue(CurDAG->getMachineNode(X86::SHR16ri, dl, MVT::i16, - Result, - CurDAG->getTargetConstant(8, MVT::i8)), - 0); - ReplaceUses(SDValue(Node, 1), - CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl, MVT::i8, Result)); + if (HiReg == X86::AH && !SDValue(Node, 1).use_empty()) { + SDValue AHCopy = CurDAG->getRegister(X86::AH, MVT::i8); + unsigned AHExtOpcode = + isSigned ? X86::MOVSX32_NOREXrr8 : X86::MOVZX32_NOREXrr8; + + SDNode *RNode = CurDAG->getMachineNode(AHExtOpcode, dl, MVT::i32, + MVT::Glue, AHCopy, InFlag); + SDValue Result(RNode, 0); + InFlag = SDValue(RNode, 1); + + if (Opcode == X86ISD::UDIVREM8_ZEXT_HREG || + Opcode == X86ISD::SDIVREM8_SEXT_HREG) { + if (Node->getValueType(1) == MVT::i64) { + // It's not possible to directly movsx AH to a 64bit register, because + // the latter needs the REX prefix, but the former can't have it. 
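Both the new SMUL8/UMUL8 selection above and the 8-bit div/rem handling that follows rest on the same ISA fact: the one-operand 8-bit multiply and divide forms implicitly use AL and produce their wide result in AX, with the remainder of an 8-bit divide landing in AH. A standalone model of those register semantics (ordinary arithmetic, not MachineInstrs):

#include <cassert>
#include <cstdint>

// MUL r/m8 / IMUL r/m8: AX = AL * r/m8 (zero- or sign-extended product).
static uint16_t mul8(uint8_t AL, uint8_t Src, bool IsSigned) {
  if (IsSigned)
    return static_cast<uint16_t>(static_cast<int16_t>(static_cast<int8_t>(AL)) *
                                 static_cast<int8_t>(Src));
  return static_cast<uint16_t>(static_cast<uint16_t>(AL) * Src);
}

// DIV r/m8: AL = AX / r/m8, AH = AX % r/m8. The remainder therefore has to be
// read out of the high byte, which is why it is copied through a
// MOVZX/MOVSX of AH rather than a shift of AX in the hunk above.
static void div8(uint16_t AX, uint8_t Src, uint8_t &AL, uint8_t &AH) {
  AL = static_cast<uint8_t>(AX / Src);
  AH = static_cast<uint8_t>(AX % Src);
}

int main() {
  assert(mul8(0xFF, 0xFF, /*IsSigned=*/false) == 0xFE01); // 255 * 255
  assert(mul8(0xFF, 0xFF, /*IsSigned=*/true) == 0x0001);  // (-1) * (-1)
  uint8_t AL, AH;
  div8(1000, 7, AL, AH);
  assert(AL == 142 && AH == 6);                           // quotient in AL, remainder in AH
}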
+ assert(Opcode != X86ISD::SDIVREM8_SEXT_HREG && + "Unexpected i64 sext of h-register"); + Result = + SDValue(CurDAG->getMachineNode( + TargetOpcode::SUBREG_TO_REG, dl, MVT::i64, + CurDAG->getTargetConstant(0, MVT::i64), Result, + CurDAG->getTargetConstant(X86::sub_32bit, MVT::i32)), + 0); + } + } else { + Result = + CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl, MVT::i8, Result); + } + ReplaceUses(SDValue(Node, 1), Result); + DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG); dbgs() << '\n'); } // Copy the division (low) result, if it is needed. if (!SDValue(Node, 0).use_empty()) { @@ -2563,12 +2611,30 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { SDValue N0 = Node->getOperand(0); SDValue N1 = Node->getOperand(1); - // Look for (X86cmp (and $op, $imm), 0) and see if we can convert it to - // use a smaller encoding. if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse() && - HasNoSignedComparisonUses(Node)) - // Look past the truncate if CMP is the only use of it. + HasNoSignedComparisonUses(Node)) { + // Look for (X86cmp (truncate $op, i1), 0) and try to convert to a + // smaller encoding + if (Opcode == X86ISD::CMP && N0.getValueType() == MVT::i1 && + X86::isZeroNode(N1)) { + SDValue Reg = N0.getOperand(0); + SDValue Imm = CurDAG->getTargetConstant(1, MVT::i8); + + // Emit testb + if (Reg.getScalarValueSizeInBits() > 8) + Reg = CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl, MVT::i8, Reg); + // Emit a testb. + SDNode *NewNode = CurDAG->getMachineNode(X86::TEST8ri, dl, MVT::i32, + Reg, Imm); + ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 0)); + return nullptr; + } + N0 = N0.getOperand(0); + } + // Look for (X86cmp (and $op, $imm), 0) and see if we can convert it to + // use a smaller encoding. + // Look past the truncate if CMP is the only use of it. if ((N0.getNode()->getOpcode() == ISD::AND || (N0.getResNo() == 0 && N0.getNode()->getOpcode() == X86ISD::AND)) && N0.getNode()->hasOneUse() && diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 5ccff20..f05b6c6 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -19,6 +19,7 @@ #include "X86MachineFunctionInfo.h" #include "X86TargetMachine.h" #include "X86TargetObjectFile.h" +#include "llvm/ADT/SmallBitVector.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringExtras.h" @@ -49,6 +50,7 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" #include "llvm/Target/TargetOptions.h" +#include "X86IntrinsicsInfo.h" #include <bitset> #include <numeric> #include <cctype> @@ -65,10 +67,16 @@ static cl::opt<bool> ExperimentalVectorWideningLegalization( cl::Hidden); static cl::opt<bool> ExperimentalVectorShuffleLowering( - "x86-experimental-vector-shuffle-lowering", cl::init(false), + "x86-experimental-vector-shuffle-lowering", cl::init(true), cl::desc("Enable an experimental vector shuffle lowering code path."), cl::Hidden); +static cl::opt<int> ReciprocalEstimateRefinementSteps( + "x86-recip-refinement-steps", cl::init(1), + cl::desc("Specify the number of Newton-Raphson iterations applied to the " + "result of the hardware reciprocal estimate instruction."), + cl::NotHidden); + // Forward declarations. 
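The new -x86-recip-refinement-steps option above controls how many Newton-Raphson iterations are applied to the hardware reciprocal estimate; each iteration x <- x*(2 - a*x) roughly doubles the number of correct bits. A standalone sketch with a deliberately crude starting estimate standing in for the roughly 12-bit RCPPS result:

#include <cstdio>

static float refineRecip(float A, float Estimate, int Steps) {
  float X = Estimate;
  for (int I = 0; I < Steps; ++I)
    X = X * (2.0f - A * X);   // x_{n+1} = x_n * (2 - a * x_n)
  return X;
}

int main() {
  float A = 3.0f;
  float Crude = 0.3f;  // stand-in for the hardware estimate of 1/3
  std::printf("0 steps: %.9f\n", refineRecip(A, Crude, 0));
  std::printf("1 step : %.9f\n", refineRecip(A, Crude, 1));  // the option's default
  std::printf("2 steps: %.9f\n", refineRecip(A, Crude, 2));
}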
static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1, SDValue V2); @@ -191,28 +199,10 @@ static SDValue Concat256BitVectors(SDValue V1, SDValue V2, EVT VT, return Insert256BitVector(V, V2, NumElems/2, DAG, dl); } -static TargetLoweringObjectFile *createTLOF(const Triple &TT) { - if (TT.isOSBinFormatMachO()) { - if (TT.getArch() == Triple::x86_64) - return new X86_64MachoTargetObjectFile(); - return new TargetLoweringObjectFileMachO(); - } - - if (TT.isOSLinux()) - return new X86LinuxTargetObjectFile(); - if (TT.isOSBinFormatELF()) - return new TargetLoweringObjectFileELF(); - if (TT.isKnownWindowsMSVCEnvironment()) - return new X86WindowsTargetObjectFile(); - if (TT.isOSBinFormatCOFF()) - return new TargetLoweringObjectFileCOFF(); - llvm_unreachable("unknown subtarget type"); -} - // FIXME: This should stop caching the target machine as soon as // we can remove resetOperationActions et al. -X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) - : TargetLowering(TM, createTLOF(Triple(TM.getTargetTriple()))) { +X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM) + : TargetLowering(TM) { Subtarget = &TM.getSubtarget<X86Subtarget>(); X86ScalarSSEf64 = Subtarget->hasSSE2(); X86ScalarSSEf32 = Subtarget->hasSSE1(); @@ -255,7 +245,7 @@ void X86TargetLowering::resetOperationActions() { else setSchedulingPreference(Sched::RegPressure); const X86RegisterInfo *RegInfo = - static_cast<const X86RegisterInfo*>(TM.getRegisterInfo()); + TM.getSubtarget<X86Subtarget>().getRegisterInfo(); setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister()); // Bypass expensive divides on Atom when compiling with O2 @@ -316,6 +306,8 @@ void X86TargetLowering::resetOperationActions() { setTruncStoreAction(MVT::i32, MVT::i8 , Expand); setTruncStoreAction(MVT::i16, MVT::i8, Expand); + setTruncStoreAction(MVT::f64, MVT::f32, Expand); + // SETOEQ and SETUNE require checking two conditions. setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand); setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand); @@ -519,10 +511,21 @@ void X86TargetLowering::resetOperationActions() { // If we don't have F16C support, then lower half float conversions // into library calls. if (TM.Options.UseSoftFloat || !Subtarget->hasF16C()) { - setOperationAction(ISD::FP16_TO_FP32, MVT::f32, Expand); - setOperationAction(ISD::FP32_TO_FP16, MVT::i16, Expand); + setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand); + setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand); } + // There's never any support for operations beyond MVT::f32. + setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand); + setOperationAction(ISD::FP16_TO_FP, MVT::f80, Expand); + setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand); + setOperationAction(ISD::FP_TO_FP16, MVT::f80, Expand); + + setLoadExtAction(ISD::EXTLOAD, MVT::f16, Expand); + setTruncStoreAction(MVT::f32, MVT::f16, Expand); + setTruncStoreAction(MVT::f64, MVT::f16, Expand); + setTruncStoreAction(MVT::f80, MVT::f16, Expand); + if (Subtarget->hasPOPCNT()) { setOperationAction(ISD::CTPOP , MVT::i8 , Promote); } else { @@ -648,8 +651,7 @@ void X86TargetLowering::resetOperationActions() { setOperationAction(ISD::STACKSAVE, MVT::Other, Expand); setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand); - setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ? - MVT::i64 : MVT::i32, Custom); + setOperationAction(ISD::DYNAMIC_STACKALLOC, getPointerTy(), Custom); if (!TM.Options.UseSoftFloat && X86ScalarSSEf64) { // f32 and f64 use SSE. 
@@ -797,6 +799,8 @@ void X86TargetLowering::resetOperationActions() { setOperationAction(ISD::FLOG10, MVT::f80, Expand); setOperationAction(ISD::FEXP, MVT::f80, Expand); setOperationAction(ISD::FEXP2, MVT::f80, Expand); + setOperationAction(ISD::FMINNUM, MVT::f80, Expand); + setOperationAction(ISD::FMAXNUM, MVT::f80, Expand); // First set operation action for all vector types to either promote // (for widening) or expand (for scalarization). Then we will selectively @@ -878,7 +882,12 @@ void X86TargetLowering::resetOperationActions() { (MVT::SimpleValueType)InnerVT, Expand); setLoadExtAction(ISD::SEXTLOAD, VT, Expand); setLoadExtAction(ISD::ZEXTLOAD, VT, Expand); - setLoadExtAction(ISD::EXTLOAD, VT, Expand); + + // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like types, + // we have to deal with them whether we ask for Expansion or not. Setting + // Expand causes its own optimisation problems though, so leave them legal. + if (VT.getVectorElementType() == MVT::i1) + setLoadExtAction(ISD::EXTLOAD, VT, Expand); } // FIXME: In order to prevent SSE instructions being expanded to MMX ones @@ -935,12 +944,13 @@ void X86TargetLowering::resetOperationActions() { setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom); setOperationAction(ISD::SELECT, MVT::v4f32, Custom); + setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom); } if (!TM.Options.UseSoftFloat && Subtarget->hasSSE2()) { addRegisterClass(MVT::v2f64, &X86::VR128RegClass); - // FIXME: Unfortunately -soft-float and -no-implicit-float means XMM + // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM // registers cannot be used even for integer operations. addRegisterClass(MVT::v16i8, &X86::VR128RegClass); addRegisterClass(MVT::v8i16, &X86::VR128RegClass); @@ -995,6 +1005,20 @@ void X86TargetLowering::resetOperationActions() { setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); } + // We support custom legalizing of sext and anyext loads for specific + // memory vector types which we can load as a scalar (or sequence of + // scalars) and extend in-register to a legal 128-bit vector type. For sext + // loads these must work with a single scalar load. + setLoadExtAction(ISD::SEXTLOAD, MVT::v4i8, Custom); + setLoadExtAction(ISD::SEXTLOAD, MVT::v4i16, Custom); + setLoadExtAction(ISD::SEXTLOAD, MVT::v8i8, Custom); + setLoadExtAction(ISD::EXTLOAD, MVT::v2i8, Custom); + setLoadExtAction(ISD::EXTLOAD, MVT::v2i16, Custom); + setLoadExtAction(ISD::EXTLOAD, MVT::v2i32, Custom); + setLoadExtAction(ISD::EXTLOAD, MVT::v4i8, Custom); + setLoadExtAction(ISD::EXTLOAD, MVT::v4i16, Custom); + setLoadExtAction(ISD::EXTLOAD, MVT::v8i8, Custom); + setOperationAction(ISD::BUILD_VECTOR, MVT::v2f64, Custom); setOperationAction(ISD::BUILD_VECTOR, MVT::v2i64, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f64, Custom); @@ -1027,8 +1051,6 @@ void X86TargetLowering::resetOperationActions() { AddPromotedToType (ISD::SELECT, VT, MVT::v2i64); } - setTruncStoreAction(MVT::f64, MVT::f32, Expand); - // Custom lower v2i64 and v2f64 selects. setOperationAction(ISD::LOAD, MVT::v2f64, Legal); setOperationAction(ISD::LOAD, MVT::v2i64, Legal); @@ -1090,7 +1112,13 @@ void X86TargetLowering::resetOperationActions() { // some vselects for now. 
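The custom SEXTLOAD/EXTLOAD actions registered above are what allow, for example, a v4i8 extending load to be emitted as one scalar load followed by an in-register extend. A standalone intrinsics sketch of that shape, shown with SSE4.1's PMOVSXBD for brevity (compile with SSE4.1 enabled); without SSE4.1 the extend needs an unpack-and-shift sequence instead, and none of this is the legalization code itself:

#include <cstdint>
#include <cstdio>
#include <cstring>
#include <smmintrin.h>  // SSE4.1

// v4i8 -> v4i32 sign-extending load: one 32-bit scalar load, then PMOVSXBD.
static __m128i sextLoadV4i8(const int8_t *P) {
  int32_t Bits;
  std::memcpy(&Bits, P, sizeof(Bits));               // single scalar load
  return _mm_cvtepi8_epi32(_mm_cvtsi32_si128(Bits)); // extend in-register
}

int main() {
  const int8_t Src[4] = {-1, 2, -3, 4};
  int32_t Dst[4];
  _mm_storeu_si128(reinterpret_cast<__m128i *>(Dst), sextLoadV4i8(Src));
  std::printf("%d %d %d %d\n", Dst[0], Dst[1], Dst[2], Dst[3]); // -1 2 -3 4
}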
setOperationAction(ISD::VSELECT, MVT::v16i8, Legal); - // i8 and i16 vectors are custom , because the source register and source + // SSE41 brings specific instructions for doing vector sign extend even in + // cases where we don't have SRA. + setLoadExtAction(ISD::SEXTLOAD, MVT::v2i8, Custom); + setLoadExtAction(ISD::SEXTLOAD, MVT::v2i16, Custom); + setLoadExtAction(ISD::SEXTLOAD, MVT::v2i32, Custom); + + // i8 and i16 vectors are custom because the source register and source // source memory operand types are not the same width. f32 vectors are // custom since the immediate controlling the insert encodes additional // information. @@ -1104,7 +1132,7 @@ void X86TargetLowering::resetOperationActions() { setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom); - // FIXME: these should be Legal but thats only for the case where + // FIXME: these should be Legal, but that's only for the case where // the index is constant. For now custom expand to deal with that. if (Subtarget->is64Bit()) { setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Custom); @@ -1254,6 +1282,10 @@ void X86TargetLowering::resetOperationActions() { setOperationAction(ISD::VSELECT, MVT::v16i16, Custom); setOperationAction(ISD::VSELECT, MVT::v32i8, Legal); + + // The custom lowering for UINT_TO_FP for v8i32 becomes interesting + // when we have a 256bit-wide blend with immediate. + setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom); } else { setOperationAction(ISD::ADD, MVT::v4i64, Custom); setOperationAction(ISD::ADD, MVT::v8i32, Custom); @@ -1378,6 +1410,10 @@ void X86TargetLowering::resetOperationActions() { setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Legal); setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal); setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Legal); + setOperationAction(ISD::SINT_TO_FP, MVT::v8i1, Custom); + setOperationAction(ISD::SINT_TO_FP, MVT::v16i1, Custom); + setOperationAction(ISD::SINT_TO_FP, MVT::v16i8, Promote); + setOperationAction(ISD::SINT_TO_FP, MVT::v16i16, Promote); setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Legal); setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Legal); setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Legal); @@ -1489,6 +1525,43 @@ void X86TargetLowering::resetOperationActions() { } }// has AVX-512 + if (!TM.Options.UseSoftFloat && Subtarget->hasBWI()) { + addRegisterClass(MVT::v32i16, &X86::VR512RegClass); + addRegisterClass(MVT::v64i8, &X86::VR512RegClass); + + addRegisterClass(MVT::v32i1, &X86::VK32RegClass); + addRegisterClass(MVT::v64i1, &X86::VK64RegClass); + + setOperationAction(ISD::LOAD, MVT::v32i16, Legal); + setOperationAction(ISD::LOAD, MVT::v64i8, Legal); + setOperationAction(ISD::SETCC, MVT::v32i1, Custom); + setOperationAction(ISD::SETCC, MVT::v64i1, Custom); + + for (int i = MVT::v32i8; i != MVT::v8i64; ++i) { + const MVT VT = (MVT::SimpleValueType)i; + + const unsigned EltSize = VT.getVectorElementType().getSizeInBits(); + + // Do not attempt to promote non-256-bit vectors + if (!VT.is512BitVector()) + continue; + + if ( EltSize < 32) { + setOperationAction(ISD::BUILD_VECTOR, VT, Custom); + setOperationAction(ISD::VSELECT, VT, Legal); + } + } + } + + if (!TM.Options.UseSoftFloat && Subtarget->hasVLX()) { + addRegisterClass(MVT::v4i1, &X86::VK4RegClass); + addRegisterClass(MVT::v2i1, &X86::VK2RegClass); + + setOperationAction(ISD::SETCC, MVT::v4i1, Custom); + setOperationAction(ISD::SETCC, MVT::v2i1, Custom); + setOperationAction(ISD::INSERT_SUBVECTOR, 
MVT::v8i1, Legal); + } + // SIGN_EXTEND_INREGs are evaluated by the extend type. Handle the expansion // of this type with custom code. for (int VT = MVT::FIRST_VECTOR_VALUETYPE; @@ -1521,9 +1594,6 @@ void X86TargetLowering::resetOperationActions() { setOperationAction(ISD::UMULO, VT, Custom); } - // There are no 8-bit 3-address imul/mul instructions - setOperationAction(ISD::SMULO, MVT::i8, Expand); - setOperationAction(ISD::UMULO, MVT::i8, Expand); if (!Subtarget->is64Bit()) { // These libcalls are not available in 32-bit. @@ -1600,6 +1670,14 @@ void X86TargetLowering::resetOperationActions() { PredictableSelectIsExpensive = !Subtarget->isAtom(); setPrefFunctionAlignment(4); // 2^4 bytes. + + verifyIntrinsicTables(); +} + +// This has so far only been implemented for 64-bit MachO. +bool X86TargetLowering::useLoadStackGuardNode() const { + return Subtarget->getTargetTriple().getObjectFormat() == Triple::MachO && + Subtarget->is64Bit(); } TargetLoweringBase::LegalizeTypeAction @@ -1616,10 +1694,40 @@ EVT X86TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const { if (!VT.isVector()) return Subtarget->hasAVX512() ? MVT::i1: MVT::i8; - if (Subtarget->hasAVX512()) - switch(VT.getVectorNumElements()) { - case 8: return MVT::v8i1; - case 16: return MVT::v16i1; + const unsigned NumElts = VT.getVectorNumElements(); + const EVT EltVT = VT.getVectorElementType(); + if (VT.is512BitVector()) { + if (Subtarget->hasAVX512()) + if (EltVT == MVT::i32 || EltVT == MVT::i64 || + EltVT == MVT::f32 || EltVT == MVT::f64) + switch(NumElts) { + case 8: return MVT::v8i1; + case 16: return MVT::v16i1; + } + if (Subtarget->hasBWI()) + if (EltVT == MVT::i8 || EltVT == MVT::i16) + switch(NumElts) { + case 32: return MVT::v32i1; + case 64: return MVT::v64i1; + } + } + + if (VT.is256BitVector() || VT.is128BitVector()) { + if (Subtarget->hasVLX()) + if (EltVT == MVT::i32 || EltVT == MVT::i64 || + EltVT == MVT::f32 || EltVT == MVT::f64) + switch(NumElts) { + case 2: return MVT::v2i1; + case 4: return MVT::v4i1; + case 8: return MVT::v8i1; + } + if (Subtarget->hasBWI() && Subtarget->hasVLX()) + if (EltVT == MVT::i8 || EltVT == MVT::i16) + switch(NumElts) { + case 8: return MVT::v8i1; + case 16: return MVT::v16i1; + case 32: return MVT::v32i1; + } } return VT.changeVectorElementTypeToInteger(); @@ -1726,9 +1834,10 @@ bool X86TargetLowering::isSafeMemOpType(MVT VT) const { } bool -X86TargetLowering::allowsUnalignedMemoryAccesses(EVT VT, - unsigned, - bool *Fast) const { +X86TargetLowering::allowsMisalignedMemoryAccesses(EVT VT, + unsigned, + unsigned, + bool *Fast) const { if (Fast) *Fast = Subtarget->isUnalignedMemAccessFast(); return true; @@ -1794,9 +1903,7 @@ X86TargetLowering::findRepresentativeClass(MVT VT) const{ default: return TargetLowering::findRepresentativeClass(VT); case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64: - RRC = Subtarget->is64Bit() ? - (const TargetRegisterClass*)&X86::GR64RegClass : - (const TargetRegisterClass*)&X86::GR32RegClass; + RRC = Subtarget->is64Bit() ? 
&X86::GR64RegClass : &X86::GR32RegClass; break; case MVT::x86mmx: RRC = &X86::VR64RegClass; @@ -1851,8 +1958,7 @@ X86TargetLowering::CanLowerReturn(CallingConv::ID CallConv, const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const { SmallVector<CCValAssign, 16> RVLocs; - CCState CCInfo(CallConv, isVarArg, MF, MF.getTarget(), - RVLocs, Context); + CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context); return CCInfo.CheckReturn(Outs, RetCC_X86); } @@ -1871,8 +1977,7 @@ X86TargetLowering::LowerReturn(SDValue Chain, X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); SmallVector<CCValAssign, 16> RVLocs; - CCState CCInfo(CallConv, isVarArg, MF, DAG.getTarget(), - RVLocs, *DAG.getContext()); + CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext()); CCInfo.AnalyzeReturn(Outs, RetCC_X86); SDValue Flag; @@ -1918,8 +2023,8 @@ X86TargetLowering::LowerReturn(SDValue Chain, // Returns in ST0/ST1 are handled specially: these are pushed as operands to // the RET instruction and handled by the FP Stackifier. - if (VA.getLocReg() == X86::ST0 || - VA.getLocReg() == X86::ST1) { + if (VA.getLocReg() == X86::FP0 || + VA.getLocReg() == X86::FP1) { // If this is a copy from an xmm register to ST(0), use an FPExtend to // change the value to the FP stack register class. if (isScalarFPTypeInSSEReg(VA.getValVT())) @@ -2005,6 +2110,13 @@ bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const { UI != UE; ++UI) { if (UI->getOpcode() != X86ISD::RET_FLAG) return false; + // If we are returning more than one value, we can definitely + // not make a tail call see PR19530 + if (UI->getNumOperands() > 4) + return false; + if (UI->getNumOperands() == 4 && + UI->getOperand(UI->getNumOperands()-1).getValueType() != MVT::Glue) + return false; HasRet = true; } @@ -2015,8 +2127,8 @@ bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const { return true; } -MVT -X86TargetLowering::getTypeForExtArgOrReturn(MVT VT, +EVT +X86TargetLowering::getTypeForExtArgOrReturn(LLVMContext &Context, EVT VT, ISD::NodeType ExtendKind) const { MVT ReturnMVT; // TODO: Is this also valid on 32-bit? @@ -2025,7 +2137,7 @@ X86TargetLowering::getTypeForExtArgOrReturn(MVT VT, else ReturnMVT = MVT::i32; - MVT MinVT = getRegisterType(ReturnMVT); + EVT MinVT = getRegisterType(Context, ReturnMVT); return VT.bitsLT(MinVT) ? MinVT : VT; } @@ -2042,8 +2154,8 @@ X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag, // Assign locations to each value returned by this call. SmallVector<CCValAssign, 16> RVLocs; bool Is64Bit = Subtarget->is64Bit(); - CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), - DAG.getTarget(), RVLocs, *DAG.getContext()); + CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs, + *DAG.getContext()); CCInfo.AnalyzeCallResult(Ins, RetCC_X86); // Copy all of the result registers out of their specified physreg. @@ -2057,33 +2169,21 @@ X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag, report_fatal_error("SSE register return with SSE disabled"); } - SDValue Val; - - // If this is a call to a function that returns an fp value on the floating - // point stack, we must guarantee the value is popped from the stack, so - // a CopyFromReg is not good enough - the copy instruction may be eliminated - // if the return value is not used. We use the FpPOP_RETVAL instruction - // instead. 
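The getSetCCResultType hunk above picks an AVX-512 mask type when the subtarget features allow it and otherwise falls back to an element-sized integer vector. A standalone model of that decision table (type names as strings; element counts outside the listed cases take the fallback, as in the real code):

#include <cstdio>

static const char *setCCMaskType(unsigned NumElts, unsigned EltBits,
                                 bool HasAVX512, bool HasBWI, bool HasVLX) {
  const bool WideElt = EltBits == 32 || EltBits == 64;    // i32/i64/f32/f64
  const bool NarrowElt = EltBits == 8 || EltBits == 16;   // i8/i16
  const unsigned VecBits = NumElts * EltBits;

  if (VecBits == 512) {
    if (HasAVX512 && WideElt)
      return NumElts == 8 ? "v8i1" : "v16i1";
    if (HasBWI && NarrowElt)
      return NumElts == 32 ? "v32i1" : "v64i1";
  } else if (VecBits == 256 || VecBits == 128) {
    if (HasVLX && WideElt) {
      if (NumElts == 2) return "v2i1";
      if (NumElts == 4) return "v4i1";
      if (NumElts == 8) return "v8i1";
    }
    if (HasBWI && HasVLX && NarrowElt) {
      if (NumElts == 8)  return "v8i1";
      if (NumElts == 16) return "v16i1";
      if (NumElts == 32) return "v32i1";
    }
  }
  return "element-sized integer vector";  // changeVectorElementTypeToInteger fallback
}

int main() {
  std::printf("v16f32 setcc -> %s\n", setCCMaskType(16, 32, true, false, false));
  std::printf("v32i16 setcc -> %s\n", setCCMaskType(32, 16, true, true, false));
  std::printf("v4i32  setcc -> %s\n", setCCMaskType(4, 32, true, false, true));
  std::printf("v4i32  setcc (no VLX) -> %s\n", setCCMaskType(4, 32, true, false, false));
}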
- if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1) { - // If we prefer to use the value in xmm registers, copy it out as f80 and - // use a truncate to move it from fp stack reg to xmm reg. - if (isScalarFPTypeInSSEReg(VA.getValVT())) CopyVT = MVT::f80; - SDValue Ops[] = { Chain, InFlag }; - Chain = SDValue(DAG.getMachineNode(X86::FpPOP_RETVAL, dl, CopyVT, - MVT::Other, MVT::Glue, Ops), 1); - Val = Chain.getValue(0); - - // Round the f80 to the right size, which also moves it to the appropriate - // xmm register. - if (CopyVT != VA.getValVT()) - Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val, - // This truncation won't change the value. - DAG.getIntPtrConstant(1)); - } else { - Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), - CopyVT, InFlag).getValue(1); - Val = Chain.getValue(0); - } + // If we prefer to use the value in xmm registers, copy it out as f80 and + // use a truncate to move it from fp stack reg to xmm reg. + if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) && + isScalarFPTypeInSSEReg(VA.getValVT())) + CopyVT = MVT::f80; + + Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), + CopyVT, InFlag).getValue(1); + SDValue Val = Chain.getValue(0); + + if (CopyVT != VA.getValVT()) + Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val, + // This truncation won't change the value. + DAG.getIntPtrConstant(1)); + InFlag = Chain.getValue(2); InVals.push_back(Val); } @@ -2224,6 +2324,55 @@ X86TargetLowering::LowerMemArgument(SDValue Chain, } } +// FIXME: Get this from tablegen. +static ArrayRef<MCPhysReg> get64BitArgumentGPRs(CallingConv::ID CallConv, + const X86Subtarget *Subtarget) { + assert(Subtarget->is64Bit()); + + if (Subtarget->isCallingConvWin64(CallConv)) { + static const MCPhysReg GPR64ArgRegsWin64[] = { + X86::RCX, X86::RDX, X86::R8, X86::R9 + }; + return makeArrayRef(std::begin(GPR64ArgRegsWin64), std::end(GPR64ArgRegsWin64)); + } + + static const MCPhysReg GPR64ArgRegs64Bit[] = { + X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9 + }; + return makeArrayRef(std::begin(GPR64ArgRegs64Bit), std::end(GPR64ArgRegs64Bit)); +} + +// FIXME: Get this from tablegen. +static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF, + CallingConv::ID CallConv, + const X86Subtarget *Subtarget) { + assert(Subtarget->is64Bit()); + if (Subtarget->isCallingConvWin64(CallConv)) { + // The XMM registers which might contain var arg parameters are shadowed + // in their paired GPR. So we only need to save the GPR to their home + // slots. + // TODO: __vectorcall will change this. + return None; + } + + const Function *Fn = MF.getFunction(); + bool NoImplicitFloatOps = Fn->getAttributes(). + hasAttribute(AttributeSet::FunctionIndex, Attribute::NoImplicitFloat); + assert(!(MF.getTarget().Options.UseSoftFloat && NoImplicitFloatOps) && + "SSE register cannot be used when SSE is disabled!"); + if (MF.getTarget().Options.UseSoftFloat || NoImplicitFloatOps || + !Subtarget->hasSSE1()) + // Kernel mode asks for SSE to be disabled, so there are no XMM argument + // registers. + return None; + + static const MCPhysReg XMMArgRegs64Bit[] = { + X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3, + X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7 + }; + return makeArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit)); +} + SDValue X86TargetLowering::LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, @@ -2251,8 +2400,7 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, // Assign locations to all of the incoming arguments. 
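The new get64BitArgumentGPRs/get64BitArgumentXMMs helpers above spell out which registers a 64-bit varargs prologue may need to dump into the register save area: six GPRs plus eight XMMs on SysV, but only the four GPRs on Win64, whose XMM arguments are shadowed by their paired GPR slots. A trivial standalone version for reference:

#include <cstdio>
#include <string>
#include <vector>

static std::vector<std::string> varargSpillRegs(bool IsWin64) {
  if (IsWin64)
    return {"RCX", "RDX", "R8", "R9"};
  std::vector<std::string> Regs = {"RDI", "RSI", "RDX", "RCX", "R8", "R9"};
  for (int I = 0; I < 8; ++I)
    Regs.push_back("XMM" + std::to_string(I));
  return Regs;
}

int main() {
  for (const auto &R : varargSpillRegs(/*IsWin64=*/false))
    std::printf("%s ", R.c_str());
  std::printf("\n");
}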
SmallVector<CCValAssign, 16> ArgLocs; - CCState CCInfo(CallConv, isVarArg, MF, DAG.getTarget(), - ArgLocs, *DAG.getContext()); + CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext()); // Allocate shadow area for Win64 if (IsWin64) @@ -2296,6 +2444,10 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, RC = &X86::VK8RegClass; else if (RegVT == MVT::v16i1) RC = &X86::VK16RegClass; + else if (RegVT == MVT::v32i1) + RC = &X86::VK32RegClass; + else if (RegVT == MVT::v64i1) + RC = &X86::VK64RegClass; else llvm_unreachable("Unknown argument type!"); @@ -2362,60 +2514,53 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, StackSize = GetAlignedArgumentStackSize(StackSize, DAG); // If the function takes variable number of arguments, make a frame index for - // the start of the first vararg value... for expansion of llvm.va_start. - if (isVarArg) { - if (Is64Bit || (CallConv != CallingConv::X86_FastCall && - CallConv != CallingConv::X86_ThisCall)) { - FuncInfo->setVarArgsFrameIndex(MFI->CreateFixedObject(1, StackSize,true)); - } - if (Is64Bit) { - unsigned TotalNumIntRegs = 0, TotalNumXMMRegs = 0; - - // FIXME: We should really autogenerate these arrays - static const MCPhysReg GPR64ArgRegsWin64[] = { - X86::RCX, X86::RDX, X86::R8, X86::R9 - }; - static const MCPhysReg GPR64ArgRegs64Bit[] = { - X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9 - }; - static const MCPhysReg XMMArgRegs64Bit[] = { - X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3, - X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7 - }; - const MCPhysReg *GPR64ArgRegs; - unsigned NumXMMRegs = 0; - - if (IsWin64) { - // The XMM registers which might contain var arg parameters are shadowed - // in their paired GPR. So we only need to save the GPR to their home - // slots. - TotalNumIntRegs = 4; - GPR64ArgRegs = GPR64ArgRegsWin64; - } else { - TotalNumIntRegs = 6; TotalNumXMMRegs = 8; - GPR64ArgRegs = GPR64ArgRegs64Bit; - - NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs64Bit, - TotalNumXMMRegs); + // the start of the first vararg value... for expansion of llvm.va_start. We + // can skip this if there are no va_start calls. + if (MFI->hasVAStart() && + (Is64Bit || (CallConv != CallingConv::X86_FastCall && + CallConv != CallingConv::X86_ThisCall))) { + FuncInfo->setVarArgsFrameIndex( + MFI->CreateFixedObject(1, StackSize, true)); + } + + // 64-bit calling conventions support varargs and register parameters, so we + // have to do extra work to spill them in the prologue or forward them to + // musttail calls. + if (Is64Bit && isVarArg && + (MFI->hasVAStart() || MFI->hasMustTailInVarArgFunc())) { + // Find the first unallocated argument registers. + ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget); + ArrayRef<MCPhysReg> ArgXMMs = get64BitArgumentXMMs(MF, CallConv, Subtarget); + unsigned NumIntRegs = + CCInfo.getFirstUnallocated(ArgGPRs.data(), ArgGPRs.size()); + unsigned NumXMMRegs = + CCInfo.getFirstUnallocated(ArgXMMs.data(), ArgXMMs.size()); + assert(!(NumXMMRegs && !Subtarget->hasSSE1()) && + "SSE register cannot be used when SSE is disabled!"); + + // Gather all the live in physical registers. 
+ SmallVector<SDValue, 6> LiveGPRs; + SmallVector<SDValue, 8> LiveXMMRegs; + SDValue ALVal; + for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) { + unsigned GPR = MF.addLiveIn(Reg, &X86::GR64RegClass); + LiveGPRs.push_back( + DAG.getCopyFromReg(Chain, dl, GPR, MVT::i64)); + } + if (!ArgXMMs.empty()) { + unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass); + ALVal = DAG.getCopyFromReg(Chain, dl, AL, MVT::i8); + for (MCPhysReg Reg : ArgXMMs.slice(NumXMMRegs)) { + unsigned XMMReg = MF.addLiveIn(Reg, &X86::VR128RegClass); + LiveXMMRegs.push_back( + DAG.getCopyFromReg(Chain, dl, XMMReg, MVT::v4f32)); } - unsigned NumIntRegs = CCInfo.getFirstUnallocated(GPR64ArgRegs, - TotalNumIntRegs); - - bool NoImplicitFloatOps = Fn->getAttributes(). - hasAttribute(AttributeSet::FunctionIndex, Attribute::NoImplicitFloat); - assert(!(NumXMMRegs && !Subtarget->hasSSE1()) && - "SSE register cannot be used when SSE is disabled!"); - assert(!(NumXMMRegs && MF.getTarget().Options.UseSoftFloat && - NoImplicitFloatOps) && - "SSE register cannot be used when SSE is disabled!"); - if (MF.getTarget().Options.UseSoftFloat || NoImplicitFloatOps || - !Subtarget->hasSSE1()) - // Kernel mode asks for SSE to be disabled, so don't push them - // on the stack. - TotalNumXMMRegs = 0; + } + // Store them to the va_list returned by va_start. + if (MFI->hasVAStart()) { if (IsWin64) { - const TargetFrameLowering &TFI = *MF.getTarget().getFrameLowering(); + const TargetFrameLowering &TFI = *MF.getSubtarget().getFrameLowering(); // Get to the caller-allocated home save location. Add 8 to account // for the return address. int HomeOffset = TFI.getOffsetOfLocalArea() + 8; @@ -2429,10 +2574,9 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, // registers, then we must store them to their spots on the stack so // they may be loaded by deferencing the result of va_next. FuncInfo->setVarArgsGPOffset(NumIntRegs * 8); - FuncInfo->setVarArgsFPOffset(TotalNumIntRegs * 8 + NumXMMRegs * 16); - FuncInfo->setRegSaveFrameIndex( - MFI->CreateStackObject(TotalNumIntRegs * 8 + TotalNumXMMRegs * 16, 16, - false)); + FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16); + FuncInfo->setRegSaveFrameIndex(MFI->CreateStackObject( + ArgGPRs.size() * 8 + ArgXMMs.size() * 16, 16, false)); } // Store the integer parameter registers. @@ -2440,12 +2584,9 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), getPointerTy()); unsigned Offset = FuncInfo->getVarArgsGPOffset(); - for (; NumIntRegs != TotalNumIntRegs; ++NumIntRegs) { + for (SDValue Val : LiveGPRs) { SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN, DAG.getIntPtrConstant(Offset)); - unsigned VReg = MF.addLiveIn(GPR64ArgRegs[NumIntRegs], - &X86::GR64RegClass); - SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64); SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN, MachinePointerInfo::getFixedStack( @@ -2455,32 +2596,51 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, Offset += 8; } - if (TotalNumXMMRegs != 0 && NumXMMRegs != TotalNumXMMRegs) { + if (!ArgXMMs.empty() && NumXMMRegs != ArgXMMs.size()) { // Now store the XMM (fp + vector) parameter registers. 
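The offsets stored above follow the SysV x86-64 va_list layout: gp_offset and fp_offset index into a 176-byte register save area made of 6 x 8 bytes of GPR slots followed by 8 x 16 bytes of XMM slots, each offset pointing at the first unused slot. A standalone arithmetic check of the expressions used in the hunk:

#include <cassert>

int main() {
  const unsigned NumArgGPRs = 6, NumArgXMMs = 8;
  const unsigned NumIntRegsUsed = 2;   // e.g. two named integer args consume RDI, RSI
  const unsigned NumXMMRegsUsed = 0;

  unsigned GPOffset = NumIntRegsUsed * 8;                        // setVarArgsGPOffset
  unsigned FPOffset = NumArgGPRs * 8 + NumXMMRegsUsed * 16;      // setVarArgsFPOffset
  unsigned SaveAreaSize = NumArgGPRs * 8 + NumArgXMMs * 16;      // CreateStackObject size

  assert(GPOffset == 16);
  assert(FPOffset == 48);
  assert(SaveAreaSize == 176);
}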
- SmallVector<SDValue, 11> SaveXMMOps; + SmallVector<SDValue, 12> SaveXMMOps; SaveXMMOps.push_back(Chain); - - unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass); - SDValue ALVal = DAG.getCopyFromReg(DAG.getEntryNode(), dl, AL, MVT::i8); SaveXMMOps.push_back(ALVal); - SaveXMMOps.push_back(DAG.getIntPtrConstant( FuncInfo->getRegSaveFrameIndex())); SaveXMMOps.push_back(DAG.getIntPtrConstant( FuncInfo->getVarArgsFPOffset())); - - for (; NumXMMRegs != TotalNumXMMRegs; ++NumXMMRegs) { - unsigned VReg = MF.addLiveIn(XMMArgRegs64Bit[NumXMMRegs], - &X86::VR128RegClass); - SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::v4f32); - SaveXMMOps.push_back(Val); - } + SaveXMMOps.insert(SaveXMMOps.end(), LiveXMMRegs.begin(), + LiveXMMRegs.end()); MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl, MVT::Other, SaveXMMOps)); } if (!MemOps.empty()) Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps); + } else { + // Add all GPRs, al, and XMMs to the list of forwards. We will add then + // to the liveout set on a musttail call. + assert(MFI->hasMustTailInVarArgFunc()); + auto &Forwards = FuncInfo->getForwardedMustTailRegParms(); + typedef X86MachineFunctionInfo::Forward Forward; + + for (unsigned I = 0, E = LiveGPRs.size(); I != E; ++I) { + unsigned VReg = + MF.getRegInfo().createVirtualRegister(&X86::GR64RegClass); + Chain = DAG.getCopyToReg(Chain, dl, VReg, LiveGPRs[I]); + Forwards.push_back(Forward(VReg, ArgGPRs[NumIntRegs + I], MVT::i64)); + } + + if (!ArgXMMs.empty()) { + unsigned ALVReg = + MF.getRegInfo().createVirtualRegister(&X86::GR8RegClass); + Chain = DAG.getCopyToReg(Chain, dl, ALVReg, ALVal); + Forwards.push_back(Forward(ALVReg, X86::AL, MVT::i8)); + + for (unsigned I = 0, E = LiveXMMRegs.size(); I != E; ++I) { + unsigned VReg = + MF.getRegInfo().createVirtualRegister(&X86::VR128RegClass); + Chain = DAG.getCopyToReg(Chain, dl, VReg, LiveXMMRegs[I]); + Forwards.push_back( + Forward(VReg, ArgXMMs[NumXMMRegs + I], MVT::v4f32)); + } + } } } @@ -2583,6 +2743,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, bool IsWin64 = Subtarget->isCallingConvWin64(CallConv); StructReturnType SR = callIsStructReturn(Outs); bool IsSibcall = false; + X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>(); if (MF.getTarget().Options.DisableTailCalls) isTailCall = false; @@ -2614,8 +2775,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // Analyze operands of the call, assigning locations to each operand. SmallVector<CCValAssign, 16> ArgLocs; - CCState CCInfo(CallConv, isVarArg, MF, MF.getTarget(), - ArgLocs, *DAG.getContext()); + CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext()); // Allocate shadow area for Win64 if (IsWin64) @@ -2636,7 +2796,6 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, int FPDiff = 0; if (isTailCall && !IsSibcall && !IsMustTail) { // Lower arguments at fp - stackoffset + fpdiff. - X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>(); unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn(); FPDiff = NumBytesCallerPushed - NumBytes; @@ -2655,8 +2814,12 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // arguments passed in memory when using inalloca. 
if (!Outs.empty() && Outs.back().Flags.isInAlloca()) { NumBytesToPush = 0; - assert(ArgLocs.back().getLocMemOffset() == 0 && - "an inalloca argument must be the only memory argument"); + if (!ArgLocs.back().isMemLoc()) + report_fatal_error("cannot use inalloca attribute on a register " + "parameter"); + if (ArgLocs.back().getLocMemOffset() != 0) + report_fatal_error("any parameter with the inalloca attribute must be " + "the only memory argument"); } if (!IsSibcall) @@ -2675,8 +2838,8 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // Walk the register/memloc assignments, inserting copies/loads. In the case // of tail call optimization arguments are handle later. - const X86RegisterInfo *RegInfo = - static_cast<const X86RegisterInfo*>(DAG.getTarget().getRegisterInfo()); + const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>( + DAG.getSubtarget().getRegisterInfo()); for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { // Skip inalloca arguments, they have already been written. ISD::ArgFlagsTy Flags = Outs[i].Flags; @@ -2775,7 +2938,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, } } - if (Is64Bit && isVarArg && !IsWin64) { + if (Is64Bit && isVarArg && !IsWin64 && !IsMustTail) { // From AMD64 ABI document: // For calls that may call functions that use varargs or stdargs // (prototype-less calls or calls to functions containing ellipsis (...) in @@ -2797,6 +2960,14 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, DAG.getConstant(NumXMMRegs, MVT::i8))); } + if (Is64Bit && isVarArg && IsMustTail) { + const auto &Forwards = X86Info->getForwardedMustTailRegParms(); + for (const auto &F : Forwards) { + SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT); + RegsToPass.push_back(std::make_pair(unsigned(F.PReg), Val)); + } + } + // For tail calls lower the arguments to the 'real' stack slots. Sibcalls // don't need this because the eligibility check rejects calls that require // shuffling arguments passed in memory. @@ -2946,6 +3117,9 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy(), OpFlags); + } else if (Subtarget->isTarget64BitILP32() && Callee->getValueType(0) == MVT::i32) { + // Zero-extend the 32-bit Callee address into a 64-bit according to x32 ABI + Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee); } // Returns a chain & a flag for retval copy to use. @@ -2972,7 +3146,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, RegsToPass[i].second.getValueType())); // Add a register mask operand representing the call-preserved registers. - const TargetRegisterInfo *TRI = DAG.getTarget().getRegisterInfo(); + const TargetRegisterInfo *TRI = DAG.getSubtarget().getRegisterInfo(); const uint32_t *Mask = TRI->getCallPreservedMask(CallConv); assert(Mask && "Missing call preserved mask for calling convention"); Ops.push_back(DAG.getRegisterMask(Mask)); @@ -3043,7 +3217,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // If a tail called function callee has more arguments than the caller the // caller needs to make sure that there is room to move the RETADDR to. This is // achieved by reserving an area the size of the argument delta right after the -// original REtADDR, but before the saved framepointer or the spilled registers +// original RETADDR, but before the saved framepointer or the spilled registers // e.g. 
caller(arg1, arg2) calls callee(arg1, arg2,arg3,arg4) // stack layout: // arg1 @@ -3063,9 +3237,9 @@ X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize, SelectionDAG& DAG) const { MachineFunction &MF = DAG.getMachineFunction(); const TargetMachine &TM = MF.getTarget(); - const X86RegisterInfo *RegInfo = - static_cast<const X86RegisterInfo*>(TM.getRegisterInfo()); - const TargetFrameLowering &TFI = *TM.getFrameLowering(); + const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>( + TM.getSubtargetImpl()->getRegisterInfo()); + const TargetFrameLowering &TFI = *TM.getSubtargetImpl()->getFrameLowering(); unsigned StackAlignment = TFI.getStackAlignment(); uint64_t AlignMask = StackAlignment - 1; int64_t Offset = StackSize; @@ -3178,8 +3352,8 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to // emit a special epilogue. - const X86RegisterInfo *RegInfo = - static_cast<const X86RegisterInfo*>(DAG.getTarget().getRegisterInfo()); + const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>( + DAG.getSubtarget().getRegisterInfo()); if (RegInfo->needsStackRealignment(MF)) return false; @@ -3207,8 +3381,8 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, return false; SmallVector<CCValAssign, 16> ArgLocs; - CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), - DAG.getTarget(), ArgLocs, *DAG.getContext()); + CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), ArgLocs, + *DAG.getContext()); CCInfo.AnalyzeCallOperands(Outs, CC_X86); for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) @@ -3228,12 +3402,12 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, } if (Unused) { SmallVector<CCValAssign, 16> RVLocs; - CCState CCInfo(CalleeCC, false, DAG.getMachineFunction(), - DAG.getTarget(), RVLocs, *DAG.getContext()); + CCState CCInfo(CalleeCC, false, DAG.getMachineFunction(), RVLocs, + *DAG.getContext()); CCInfo.AnalyzeCallResult(Ins, RetCC_X86); for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) { CCValAssign &VA = RVLocs[i]; - if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1) + if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) return false; } } @@ -3242,13 +3416,13 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, // results are returned in the same way as what the caller expects. if (!CCMatch) { SmallVector<CCValAssign, 16> RVLocs1; - CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(), - DAG.getTarget(), RVLocs1, *DAG.getContext()); + CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(), RVLocs1, + *DAG.getContext()); CCInfo1.AnalyzeCallResult(Ins, RetCC_X86); SmallVector<CCValAssign, 16> RVLocs2; - CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(), - DAG.getTarget(), RVLocs2, *DAG.getContext()); + CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(), RVLocs2, + *DAG.getContext()); CCInfo2.AnalyzeCallResult(Ins, RetCC_X86); if (RVLocs1.size() != RVLocs2.size()) @@ -3274,8 +3448,8 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, // Check if stack adjustment is needed. For now, do not do this if any // argument is passed on the stack. 
SmallVector<CCValAssign, 16> ArgLocs; - CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), - DAG.getTarget(), ArgLocs, *DAG.getContext()); + CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), ArgLocs, + *DAG.getContext()); // Allocate shadow area for Win64 if (IsCalleeWin64) @@ -3292,7 +3466,7 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, MachineFrameInfo *MFI = MF.getFrameInfo(); const MachineRegisterInfo *MRI = &MF.getRegInfo(); const X86InstrInfo *TII = - static_cast<const X86InstrInfo *>(DAG.getTarget().getInstrInfo()); + static_cast<const X86InstrInfo *>(DAG.getSubtarget().getInstrInfo()); for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { CCValAssign &VA = ArgLocs[i]; SDValue Arg = OutVals[i]; @@ -3362,6 +3536,8 @@ static bool MayFoldIntoStore(SDValue Op) { static bool isTargetShuffle(unsigned Opcode) { switch(Opcode) { default: return false; + case X86ISD::BLENDI: + case X86ISD::PSHUFB: case X86ISD::PSHUFD: case X86ISD::PSHUFHW: case X86ISD::PSHUFLW: @@ -3379,7 +3555,7 @@ static bool isTargetShuffle(unsigned Opcode) { case X86ISD::MOVSD: case X86ISD::UNPCKL: case X86ISD::UNPCKH: - case X86ISD::VPERMILP: + case X86ISD::VPERMILPI: case X86ISD::VPERM2X128: case X86ISD::VPERMI: return true; @@ -3405,7 +3581,7 @@ static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT, case X86ISD::PSHUFD: case X86ISD::PSHUFHW: case X86ISD::PSHUFLW: - case X86ISD::VPERMILP: + case X86ISD::VPERMILPI: case X86ISD::VPERMI: return DAG.getNode(Opc, dl, VT, V1, DAG.getConstant(TargetMask, MVT::i8)); } @@ -3417,6 +3593,7 @@ static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT, switch(Opc) { default: llvm_unreachable("Unknown x86 shuffle node"); case X86ISD::PALIGNR: + case X86ISD::VALIGN: case X86ISD::SHUFP: case X86ISD::VPERM2X128: return DAG.getNode(Opc, dl, VT, V1, V2, @@ -3443,8 +3620,8 @@ static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT, SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); - const X86RegisterInfo *RegInfo = - static_cast<const X86RegisterInfo*>(DAG.getTarget().getRegisterInfo()); + const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>( + DAG.getSubtarget().getRegisterInfo()); X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); int ReturnAddrIndex = FuncInfo->getRAIndex(); @@ -3494,23 +3671,18 @@ bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M, /// own arguments. Callee pop is necessary to support tail calls. bool X86::isCalleePop(CallingConv::ID CallingConv, bool is64Bit, bool IsVarArg, bool TailCallOpt) { - if (IsVarArg) - return false; - switch (CallingConv) { default: return false; case CallingConv::X86_StdCall: - return !is64Bit; case CallingConv::X86_FastCall: - return !is64Bit; case CallingConv::X86_ThisCall: return !is64Bit; case CallingConv::Fast: - return TailCallOpt; case CallingConv::GHC: - return TailCallOpt; case CallingConv::HiPE: + if (IsVarArg) + return false; return TailCallOpt; } } @@ -3687,14 +3859,23 @@ static bool isSequentialOrUndefInRange(ArrayRef<int> Mask, } /// isPSHUFDMask - Return true if the node specifies a shuffle of elements that -/// is suitable for input to PSHUFD or PSHUFW. That is, it doesn't reference -/// the second operand. 
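// Illustrative sketch (not part of this patch): the callee-pop decision that
// the reorganized isCalleePop switch above encodes, restated over a
// hypothetical stand-in enum instead of CallingConv::ID.
enum class DemoCC { StdCall, FastCall, ThisCall, Fast, GHC, HiPE, Other };

bool demoIsCalleePop(DemoCC CC, bool Is64Bit, bool IsVarArg, bool TailCallOpt) {
  switch (CC) {
  case DemoCC::StdCall:
  case DemoCC::FastCall:
  case DemoCC::ThisCall:
    return !Is64Bit;                  // 32-bit-only callee-cleanup conventions
  case DemoCC::Fast:
  case DemoCC::GHC:
  case DemoCC::HiPE:
    return !IsVarArg && TailCallOpt;  // only under -tailcallopt, never for varargs
  default:
    return false;
  }
}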
-static bool isPSHUFDMask(ArrayRef<int> Mask, MVT VT) { - if (VT == MVT::v4f32 || VT == MVT::v4i32 ) - return (Mask[0] < 4 && Mask[1] < 4 && Mask[2] < 4 && Mask[3] < 4); - if (VT == MVT::v2f64 || VT == MVT::v2i64) - return (Mask[0] < 2 && Mask[1] < 2); - return false; +/// is suitable for input to PSHUFD. That is, it doesn't reference the other +/// operand - by default will match for first operand. +static bool isPSHUFDMask(ArrayRef<int> Mask, MVT VT, + bool TestSecondOperand = false) { + if (VT != MVT::v4f32 && VT != MVT::v4i32 && + VT != MVT::v2f64 && VT != MVT::v2i64) + return false; + + unsigned NumElems = VT.getVectorNumElements(); + unsigned Lo = TestSecondOperand ? NumElems : 0; + unsigned Hi = Lo + NumElems; + + for (unsigned i = 0; i < NumElems; ++i) + if (!isUndefOrInRange(Mask[i], (int)Lo, (int)Hi)) + return false; + + return true; } /// isPSHUFHWMask - Return true if the node specifies a shuffle of elements that @@ -3755,16 +3936,12 @@ static bool isPSHUFLWMask(ArrayRef<int> Mask, MVT VT, bool HasInt256) { return true; } -/// isPALIGNRMask - Return true if the node specifies a shuffle of elements that -/// is suitable for input to PALIGNR. -static bool isPALIGNRMask(ArrayRef<int> Mask, MVT VT, - const X86Subtarget *Subtarget) { - if ((VT.is128BitVector() && !Subtarget->hasSSSE3()) || - (VT.is256BitVector() && !Subtarget->hasInt256())) - return false; - +/// \brief Return true if the mask specifies a shuffle of elements that is +/// suitable for input to intralane (palignr) or interlane (valign) vector +/// right-shift. +static bool isAlignrMask(ArrayRef<int> Mask, MVT VT, bool InterLane) { unsigned NumElts = VT.getVectorNumElements(); - unsigned NumLanes = VT.is512BitVector() ? 1: VT.getSizeInBits()/128; + unsigned NumLanes = InterLane ? 1: VT.getSizeInBits()/128; unsigned NumLaneElts = NumElts/NumLanes; // Do not handle 64-bit element shuffles with palignr. @@ -3828,6 +4005,29 @@ static bool isPALIGNRMask(ArrayRef<int> Mask, MVT VT, return true; } +/// \brief Return true if the node specifies a shuffle of elements that is +/// suitable for input to PALIGNR. +static bool isPALIGNRMask(ArrayRef<int> Mask, MVT VT, + const X86Subtarget *Subtarget) { + if ((VT.is128BitVector() && !Subtarget->hasSSSE3()) || + (VT.is256BitVector() && !Subtarget->hasInt256()) || + VT.is512BitVector()) + // FIXME: Add AVX512BW. + return false; + + return isAlignrMask(Mask, VT, false); +} + +/// \brief Return true if the node specifies a shuffle of elements that is +/// suitable for input to VALIGN. +static bool isVALIGNMask(ArrayRef<int> Mask, MVT VT, + const X86Subtarget *Subtarget) { + // FIXME: Add AVX512VL. + if (!VT.is512BitVector() || !Subtarget->hasAVX512()) + return false; + return isAlignrMask(Mask, VT, true); +} + /// CommuteVectorShuffleMask - Change values in a shuffle permute mask assuming /// the two vector operands have swapped position. static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask, @@ -4070,43 +4270,34 @@ static bool isUNPCKLMask(ArrayRef<int> Mask, MVT VT, assert(VT.getSizeInBits() >= 128 && "Unsupported vector type for unpckl"); - // AVX defines UNPCK* to operate independently on 128-bit lanes. 
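// Illustrative sketch (not part of this patch): the shape test performed by
// the rewritten isPSHUFDMask above -- every index is either undef (-1) or
// falls in the half-open window belonging to the operand being tested.
#include <vector>

bool demoMaskUsesOnlyOneOperand(const std::vector<int> &Mask, unsigned NumElems,
                                bool TestSecondOperand) {
  int Lo = TestSecondOperand ? (int)NumElems : 0;
  int Hi = Lo + (int)NumElems;
  for (int M : Mask)
    if (M != -1 && (M < Lo || M >= Hi))
      return false;
  return true;
}
// For v4i32, {2, -1, 1, 0} passes for the first operand and {4, 5, 6, 7}
// passes only when TestSecondOperand is true.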
- unsigned NumLanes; - unsigned NumOf256BitLanes; unsigned NumElts = VT.getVectorNumElements(); - if (VT.is256BitVector()) { - if (NumElts != 4 && NumElts != 8 && - (!HasInt256 || (NumElts != 16 && NumElts != 32))) + if (VT.is256BitVector() && NumElts != 4 && NumElts != 8 && + (!HasInt256 || (NumElts != 16 && NumElts != 32))) return false; - NumLanes = 2; - NumOf256BitLanes = 1; - } else if (VT.is512BitVector()) { - assert(VT.getScalarType().getSizeInBits() >= 32 && - "Unsupported vector type for unpckh"); - NumLanes = 2; - NumOf256BitLanes = 2; - } else { - NumLanes = 1; - NumOf256BitLanes = 1; - } - unsigned NumEltsInStride = NumElts/NumOf256BitLanes; - unsigned NumLaneElts = NumEltsInStride/NumLanes; + assert((!VT.is512BitVector() || VT.getScalarType().getSizeInBits() >= 32) && + "Unsupported vector type for unpckh"); - for (unsigned l256 = 0; l256 < NumOf256BitLanes; l256 += 1) { - for (unsigned l = 0; l != NumEltsInStride; l += NumLaneElts) { - for (unsigned i = 0, j = l; i != NumLaneElts; i += 2, ++j) { - int BitI = Mask[l256*NumEltsInStride+l+i]; - int BitI1 = Mask[l256*NumEltsInStride+l+i+1]; - if (!isUndefOrEqual(BitI, j+l256*NumElts)) - return false; - if (V2IsSplat && !isUndefOrEqual(BitI1, NumElts)) + // AVX defines UNPCK* to operate independently on 128-bit lanes. + unsigned NumLanes = VT.getSizeInBits()/128; + unsigned NumLaneElts = NumElts/NumLanes; + + for (unsigned l = 0; l != NumElts; l += NumLaneElts) { + for (unsigned i = 0, j = l; i != NumLaneElts; i += 2, ++j) { + int BitI = Mask[l+i]; + int BitI1 = Mask[l+i+1]; + if (!isUndefOrEqual(BitI, j)) + return false; + if (V2IsSplat) { + if (!isUndefOrEqual(BitI1, NumElts)) return false; - if (!isUndefOrEqual(BitI1, j+l256*NumElts+NumEltsInStride)) + } else { + if (!isUndefOrEqual(BitI1, j + NumElts)) return false; } } } + return true; } @@ -4117,39 +4308,29 @@ static bool isUNPCKHMask(ArrayRef<int> Mask, MVT VT, assert(VT.getSizeInBits() >= 128 && "Unsupported vector type for unpckh"); - // AVX defines UNPCK* to operate independently on 128-bit lanes. - unsigned NumLanes; - unsigned NumOf256BitLanes; unsigned NumElts = VT.getVectorNumElements(); - if (VT.is256BitVector()) { - if (NumElts != 4 && NumElts != 8 && - (!HasInt256 || (NumElts != 16 && NumElts != 32))) + if (VT.is256BitVector() && NumElts != 4 && NumElts != 8 && + (!HasInt256 || (NumElts != 16 && NumElts != 32))) return false; - NumLanes = 2; - NumOf256BitLanes = 1; - } else if (VT.is512BitVector()) { - assert(VT.getScalarType().getSizeInBits() >= 32 && - "Unsupported vector type for unpckh"); - NumLanes = 2; - NumOf256BitLanes = 2; - } else { - NumLanes = 1; - NumOf256BitLanes = 1; - } - unsigned NumEltsInStride = NumElts/NumOf256BitLanes; - unsigned NumLaneElts = NumEltsInStride/NumLanes; + assert((!VT.is512BitVector() || VT.getScalarType().getSizeInBits() >= 32) && + "Unsupported vector type for unpckh"); - for (unsigned l256 = 0; l256 < NumOf256BitLanes; l256 += 1) { - for (unsigned l = 0; l != NumEltsInStride; l += NumLaneElts) { - for (unsigned i = 0, j = l+NumLaneElts/2; i != NumLaneElts; i += 2, ++j) { - int BitI = Mask[l256*NumEltsInStride+l+i]; - int BitI1 = Mask[l256*NumEltsInStride+l+i+1]; - if (!isUndefOrEqual(BitI, j+l256*NumElts)) - return false; - if (V2IsSplat && !isUndefOrEqual(BitI1, NumElts)) + // AVX defines UNPCK* to operate independently on 128-bit lanes. 
+ unsigned NumLanes = VT.getSizeInBits()/128; + unsigned NumLaneElts = NumElts/NumLanes; + + for (unsigned l = 0; l != NumElts; l += NumLaneElts) { + for (unsigned i = 0, j = l+NumLaneElts/2; i != NumLaneElts; i += 2, ++j) { + int BitI = Mask[l+i]; + int BitI1 = Mask[l+i+1]; + if (!isUndefOrEqual(BitI, j)) + return false; + if (V2IsSplat) { + if (isUndefOrEqual(BitI1, NumElts)) return false; - if (!isUndefOrEqual(BitI1, j+l256*NumElts+NumEltsInStride)) + } else { + if (!isUndefOrEqual(BitI1, j+NumElts)) return false; } } @@ -4652,11 +4833,13 @@ static unsigned getShufflePSHUFLWImmediate(ShuffleVectorSDNode *N) { return Mask; } -/// getShufflePALIGNRImmediate - Return the appropriate immediate to shuffle -/// the specified VECTOR_SHUFFLE mask with the PALIGNR instruction. -static unsigned getShufflePALIGNRImmediate(ShuffleVectorSDNode *SVOp) { +/// \brief Return the appropriate immediate to shuffle the specified +/// VECTOR_SHUFFLE mask with the PALIGNR (if InterLane is false) or with +/// VALIGN (if Interlane is true) instructions. +static unsigned getShuffleAlignrImmediate(ShuffleVectorSDNode *SVOp, + bool InterLane) { MVT VT = SVOp->getSimpleValueType(0); - unsigned EltSize = VT.is512BitVector() ? 1 : + unsigned EltSize = InterLane ? 1 : VT.getVectorElementType().getSizeInBits() >> 3; unsigned NumElts = VT.getVectorNumElements(); @@ -4677,6 +4860,19 @@ static unsigned getShufflePALIGNRImmediate(ShuffleVectorSDNode *SVOp) { return (Val - i) * EltSize; } +/// \brief Return the appropriate immediate to shuffle the specified +/// VECTOR_SHUFFLE mask with the PALIGNR instruction. +static unsigned getShufflePALIGNRImmediate(ShuffleVectorSDNode *SVOp) { + return getShuffleAlignrImmediate(SVOp, false); +} + +/// \brief Return the appropriate immediate to shuffle the specified +/// VECTOR_SHUFFLE mask with the VALIGN instruction. +static unsigned getShuffleVALIGNImmediate(ShuffleVectorSDNode *SVOp) { + return getShuffleAlignrImmediate(SVOp, true); +} + + static unsigned getExtractVEXTRACTImmediate(SDNode *N, unsigned vecWidth) { assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width"); if (!isa<ConstantSDNode>(N->getOperand(1).getNode())) @@ -4751,28 +4947,6 @@ bool X86::isZeroNode(SDValue Elt) { return false; } -/// CommuteVectorShuffle - Swap vector_shuffle operands as well as values in -/// their permute mask. -static SDValue CommuteVectorShuffle(ShuffleVectorSDNode *SVOp, - SelectionDAG &DAG) { - MVT VT = SVOp->getSimpleValueType(0); - unsigned NumElems = VT.getVectorNumElements(); - SmallVector<int, 8> MaskVec; - - for (unsigned i = 0; i != NumElems; ++i) { - int Idx = SVOp->getMaskElt(i); - if (Idx >= 0) { - if (Idx < (int)NumElems) - Idx += NumElems; - else - Idx -= NumElems; - } - MaskVec.push_back(Idx); - } - return DAG.getVectorShuffle(VT, SDLoc(SVOp), SVOp->getOperand(1), - SVOp->getOperand(0), &MaskVec[0]); -} - /// ShouldXformToMOVHLPS - Return true if the node should be transformed to /// match movhlps. 
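// Illustrative sketch (not part of this patch): the per-128-bit-lane
// interleave pattern that the simplified isUNPCKLMask/isUNPCKHMask checks
// accept, built explicitly (indices >= NumElts refer to the second input).
#include <vector>

std::vector<int> demoUnpackMask(unsigned NumElts, unsigned NumLaneElts, bool High) {
  std::vector<int> Mask;
  for (unsigned l = 0; l < NumElts; l += NumLaneElts)
    for (unsigned i = 0; i < NumLaneElts / 2; ++i) {
      unsigned j = l + i + (High ? NumLaneElts / 2 : 0);
      Mask.push_back((int)j);            // element from the first input
      Mask.push_back(int(j + NumElts));  // paired element from the second input
    }
  return Mask;
}
// v8i16 UNPCKL gives {0,8, 1,9, 2,10, 3,11}; v8i32 UNPCKH (two lanes) gives
// {2,10, 3,11, 6,14, 7,15}.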
The lower half elements should come from upper half of /// V1 (and in order), and the upper half elements should come from the upper @@ -4897,32 +5071,32 @@ static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget, SDValue Vec; if (VT.is128BitVector()) { // SSE if (Subtarget->hasSSE2()) { // SSE2 - SDValue Cst = DAG.getTargetConstant(0, MVT::i32); + SDValue Cst = DAG.getConstant(0, MVT::i32); Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst); } else { // SSE1 - SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32); + SDValue Cst = DAG.getConstantFP(+0.0, MVT::f32); Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst); } } else if (VT.is256BitVector()) { // AVX if (Subtarget->hasInt256()) { // AVX2 - SDValue Cst = DAG.getTargetConstant(0, MVT::i32); + SDValue Cst = DAG.getConstant(0, MVT::i32); SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops); } else { // 256-bit logic and arithmetic instructions in AVX are all // floating-point, no support for integer ops. Emit fp zeroed vectors. - SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32); + SDValue Cst = DAG.getConstantFP(+0.0, MVT::f32); SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops); } } else if (VT.is512BitVector()) { // AVX-512 - SDValue Cst = DAG.getTargetConstant(0, MVT::i32); + SDValue Cst = DAG.getConstant(0, MVT::i32); SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i32, Ops); } else if (VT.getScalarType() == MVT::i1) { assert(VT.getVectorNumElements() <= 16 && "Unexpected vector type"); - SDValue Cst = DAG.getTargetConstant(0, MVT::i1); + SDValue Cst = DAG.getConstant(0, MVT::i1); SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Cst); return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops); } else @@ -4939,7 +5113,7 @@ static SDValue getOnesVector(MVT VT, bool HasInt256, SelectionDAG &DAG, SDLoc dl) { assert(VT.isVector() && "Expected a vector type"); - SDValue Cst = DAG.getTargetConstant(~0U, MVT::i32); + SDValue Cst = DAG.getConstant(~0U, MVT::i32); SDValue Vec; if (VT.is256BitVector()) { if (HasInt256) { // AVX2 @@ -5109,37 +5283,49 @@ static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx, } /// getTargetShuffleMask - Calculates the shuffle mask corresponding to the -/// target specific opcode. Returns true if the Mask could be calculated. -/// Sets IsUnary to true if only uses one source. +/// target specific opcode. Returns true if the Mask could be calculated. Sets +/// IsUnary to true if only uses one source. Note that this will set IsUnary for +/// shuffles which use a single input multiple times, and in those cases it will +/// adjust the mask to only have indices within that single input. 
static bool getTargetShuffleMask(SDNode *N, MVT VT, SmallVectorImpl<int> &Mask, bool &IsUnary) { unsigned NumElems = VT.getVectorNumElements(); SDValue ImmN; IsUnary = false; + bool IsFakeUnary = false; switch(N->getOpcode()) { + case X86ISD::BLENDI: + ImmN = N->getOperand(N->getNumOperands()-1); + DecodeBLENDMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); + break; case X86ISD::SHUFP: ImmN = N->getOperand(N->getNumOperands()-1); DecodeSHUFPMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); + IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); break; case X86ISD::UNPCKH: DecodeUNPCKHMask(VT, Mask); + IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); break; case X86ISD::UNPCKL: DecodeUNPCKLMask(VT, Mask); + IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); break; case X86ISD::MOVHLPS: DecodeMOVHLPSMask(NumElems, Mask); + IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); break; case X86ISD::MOVLHPS: DecodeMOVLHPSMask(NumElems, Mask); + IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); break; case X86ISD::PALIGNR: ImmN = N->getOperand(N->getNumOperands()-1); DecodePALIGNRMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); break; case X86ISD::PSHUFD: - case X86ISD::VPERMILP: + case X86ISD::VPERMILPI: ImmN = N->getOperand(N->getNumOperands()-1); DecodePSHUFMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); IsUnary = true; @@ -5154,6 +5340,72 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, DecodePSHUFLWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); IsUnary = true; break; + case X86ISD::PSHUFB: { + IsUnary = true; + SDValue MaskNode = N->getOperand(1); + while (MaskNode->getOpcode() == ISD::BITCAST) + MaskNode = MaskNode->getOperand(0); + + if (MaskNode->getOpcode() == ISD::BUILD_VECTOR) { + // If we have a build-vector, then things are easy. + EVT VT = MaskNode.getValueType(); + assert(VT.isVector() && + "Can't produce a non-vector with a build_vector!"); + if (!VT.isInteger()) + return false; + + int NumBytesPerElement = VT.getVectorElementType().getSizeInBits() / 8; + + SmallVector<uint64_t, 32> RawMask; + for (int i = 0, e = MaskNode->getNumOperands(); i < e; ++i) { + SDValue Op = MaskNode->getOperand(i); + if (Op->getOpcode() == ISD::UNDEF) { + RawMask.push_back((uint64_t)SM_SentinelUndef); + continue; + } + auto *CN = dyn_cast<ConstantSDNode>(Op.getNode()); + if (!CN) + return false; + APInt MaskElement = CN->getAPIntValue(); + + // We now have to decode the element which could be any integer size and + // extract each byte of it. + for (int j = 0; j < NumBytesPerElement; ++j) { + // Note that this is x86 and so always little endian: the low byte is + // the first byte of the mask. + RawMask.push_back(MaskElement.getLoBits(8).getZExtValue()); + MaskElement = MaskElement.lshr(8); + } + } + DecodePSHUFBMask(RawMask, Mask); + break; + } + + auto *MaskLoad = dyn_cast<LoadSDNode>(MaskNode); + if (!MaskLoad) + return false; + + SDValue Ptr = MaskLoad->getBasePtr(); + if (Ptr->getOpcode() == X86ISD::Wrapper) + Ptr = Ptr->getOperand(0); + + auto *MaskCP = dyn_cast<ConstantPoolSDNode>(Ptr); + if (!MaskCP || MaskCP->isMachineConstantPoolEntry()) + return false; + + if (auto *C = dyn_cast<Constant>(MaskCP->getConstVal())) { + // FIXME: Support AVX-512 here. 
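// Illustrative sketch (not part of this patch): the little-endian byte
// expansion used above when the PSHUFB control vector is a build_vector of
// wider integer constants -- the low byte of each element is the first mask
// byte.
#include <cstdint>
#include <vector>

std::vector<uint8_t> demoSplitToMaskBytes(const std::vector<uint64_t> &Elems,
                                          unsigned BytesPerElem) {
  std::vector<uint8_t> RawMask;
  for (uint64_t E : Elems)
    for (unsigned j = 0; j < BytesPerElem; ++j) {
      RawMask.push_back(uint8_t(E & 0xff));
      E >>= 8;
    }
  return RawMask;
}
// A v2i64 control {0x0706050403020100, 0x0f0e0d0c0b0a0908} expands to the
// identity byte mask 0, 1, ..., 15.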
+ Type *Ty = C->getType(); + if (!Ty->isVectorTy() || (Ty->getVectorNumElements() != 16 && + Ty->getVectorNumElements() != 32)) + return false; + + DecodePSHUFBMask(C, Mask); + break; + } + + return false; + } case X86ISD::VPERMI: ImmN = N->getOperand(N->getNumOperands()-1); DecodeVPERMMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); @@ -5175,17 +5427,29 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, DecodeVPERM2X128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); if (Mask.empty()) return false; break; + case X86ISD::MOVSLDUP: + DecodeMOVSLDUPMask(VT, Mask); + break; + case X86ISD::MOVSHDUP: + DecodeMOVSHDUPMask(VT, Mask); + break; case X86ISD::MOVDDUP: case X86ISD::MOVLHPD: case X86ISD::MOVLPD: case X86ISD::MOVLPS: - case X86ISD::MOVSHDUP: - case X86ISD::MOVSLDUP: // Not yet implemented return false; default: llvm_unreachable("unknown target shuffle node"); } + // If we have a fake unary shuffle, the shuffle mask is spread across two + // inputs that are actually the same node. Re-map the mask to always point + // into the first input. + if (IsFakeUnary) + for (int &M : Mask) + if (M >= (int)Mask.size()) + M -= Mask.size(); + return true; } @@ -5476,76 +5740,109 @@ static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros, } /// LowerBuildVectorv4x32 - Custom lower build_vector of v4i32 or v4f32. -static SDValue LowerBuildVectorv4x32(SDValue Op, unsigned NumElems, - unsigned NonZeros, unsigned NumNonZero, - unsigned NumZero, SelectionDAG &DAG, +static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG, const X86Subtarget *Subtarget, const TargetLowering &TLI) { - // We know there's at least one non-zero element - unsigned FirstNonZeroIdx = 0; - SDValue FirstNonZero = Op->getOperand(FirstNonZeroIdx); - while (FirstNonZero.getOpcode() == ISD::UNDEF || - X86::isZeroNode(FirstNonZero)) { - ++FirstNonZeroIdx; - FirstNonZero = Op->getOperand(FirstNonZeroIdx); + // Find all zeroable elements. + bool Zeroable[4]; + for (int i=0; i < 4; ++i) { + SDValue Elt = Op->getOperand(i); + Zeroable[i] = (Elt.getOpcode() == ISD::UNDEF || X86::isZeroNode(Elt)); + } + assert(std::count_if(&Zeroable[0], &Zeroable[4], + [](bool M) { return !M; }) > 1 && + "We expect at least two non-zero elements!"); + + // We only know how to deal with build_vector nodes where elements are either + // zeroable or extract_vector_elt with constant index. + SDValue FirstNonZero; + for (int i=0; i < 4; ++i) { + if (Zeroable[i]) + continue; + SDValue Elt = Op->getOperand(i); + if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT || + !isa<ConstantSDNode>(Elt.getOperand(1))) + return SDValue(); + // Make sure that this node is extracting from a 128-bit vector. + MVT VT = Elt.getOperand(0).getSimpleValueType(); + if (!VT.is128BitVector()) + return SDValue(); + if (!FirstNonZero.getNode()) + FirstNonZero = Elt; } - if (FirstNonZero.getOpcode() != ISD::EXTRACT_VECTOR_ELT || - !isa<ConstantSDNode>(FirstNonZero.getOperand(1))) - return SDValue(); + assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!"); + SDValue V1 = FirstNonZero.getOperand(0); + MVT VT = V1.getSimpleValueType(); - SDValue V = FirstNonZero.getOperand(0); - MVT VVT = V.getSimpleValueType(); - if (!Subtarget->hasSSE41() || (VVT != MVT::v4f32 && VVT != MVT::v4i32)) - return SDValue(); + // See if this build_vector can be lowered as a blend with zero. 
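// Illustrative sketch (not part of this patch): the "fake unary" fix-up
// applied above -- when both shuffle operands are the same node, indices that
// point into the second copy are folded back into the first so callers see a
// truly unary mask.
#include <vector>

void demoCanonicalizeFakeUnaryMask(std::vector<int> &Mask) {
  int Size = (int)Mask.size();
  for (int &M : Mask)
    if (M >= Size)
      M -= Size;
}
// With identical inputs, {0, 5, 2, 7} becomes {0, 1, 2, 3}.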
+ SDValue Elt; + unsigned EltMaskIdx, EltIdx; + int Mask[4]; + for (EltIdx = 0; EltIdx < 4; ++EltIdx) { + if (Zeroable[EltIdx]) { + // The zero vector will be on the right hand side. + Mask[EltIdx] = EltIdx+4; + continue; + } - unsigned FirstNonZeroDst = - cast<ConstantSDNode>(FirstNonZero.getOperand(1))->getZExtValue(); - unsigned CorrectIdx = FirstNonZeroDst == FirstNonZeroIdx; - unsigned IncorrectIdx = CorrectIdx ? -1U : FirstNonZeroIdx; - unsigned IncorrectDst = CorrectIdx ? -1U : FirstNonZeroDst; + Elt = Op->getOperand(EltIdx); + // By construction, Elt is a EXTRACT_VECTOR_ELT with constant index. + EltMaskIdx = cast<ConstantSDNode>(Elt.getOperand(1))->getZExtValue(); + if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx) + break; + Mask[EltIdx] = EltIdx; + } - for (unsigned Idx = FirstNonZeroIdx + 1; Idx < NumElems; ++Idx) { - SDValue Elem = Op.getOperand(Idx); - if (Elem.getOpcode() == ISD::UNDEF || X86::isZeroNode(Elem)) - continue; + if (EltIdx == 4) { + // Let the shuffle legalizer deal with blend operations. + SDValue VZero = getZeroVector(VT, Subtarget, DAG, SDLoc(Op)); + if (V1.getSimpleValueType() != VT) + V1 = DAG.getNode(ISD::BITCAST, SDLoc(V1), VT, V1); + return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZero, &Mask[0]); + } - // TODO: What else can be here? Deal with it. - if (Elem.getOpcode() != ISD::EXTRACT_VECTOR_ELT) - return SDValue(); + // See if we can lower this build_vector to a INSERTPS. + if (!Subtarget->hasSSE41()) + return SDValue(); - // TODO: Some optimizations are still possible here - // ex: Getting one element from a vector, and the rest from another. - if (Elem.getOperand(0) != V) - return SDValue(); + SDValue V2 = Elt.getOperand(0); + if (Elt == FirstNonZero) + V1 = SDValue(); - unsigned Dst = cast<ConstantSDNode>(Elem.getOperand(1))->getZExtValue(); - if (Dst == Idx) - ++CorrectIdx; - else if (IncorrectIdx == -1U) { - IncorrectIdx = Idx; - IncorrectDst = Dst; - } else - // There was already one element with an incorrect index. - // We can't optimize this case to an insertps. - return SDValue(); + bool CanFold = true; + for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) { + if (Zeroable[i]) + continue; + + SDValue Current = Op->getOperand(i); + SDValue SrcVector = Current->getOperand(0); + if (!V1.getNode()) + V1 = SrcVector; + CanFold = SrcVector == V1 && + cast<ConstantSDNode>(Current.getOperand(1))->getZExtValue() == i; } - if (NumNonZero == CorrectIdx || NumNonZero == CorrectIdx + 1) { - SDLoc dl(Op); - EVT VT = Op.getSimpleValueType(); - unsigned ElementMoveMask = 0; - if (IncorrectIdx == -1U) - ElementMoveMask = FirstNonZeroIdx << 6 | FirstNonZeroIdx << 4; - else - ElementMoveMask = IncorrectDst << 6 | IncorrectIdx << 4; + if (!CanFold) + return SDValue(); - SDValue InsertpsMask = - DAG.getIntPtrConstant(ElementMoveMask | (~NonZeros & 0xf)); - return DAG.getNode(X86ISD::INSERTPS, dl, VT, V, V, InsertpsMask); - } + assert(V1.getNode() && "Expected at least two non-zero elements!"); + if (V1.getSimpleValueType() != MVT::v4f32) + V1 = DAG.getNode(ISD::BITCAST, SDLoc(V1), MVT::v4f32, V1); + if (V2.getSimpleValueType() != MVT::v4f32) + V2 = DAG.getNode(ISD::BITCAST, SDLoc(V2), MVT::v4f32, V2); - return SDValue(); + // Ok, we can emit an INSERTPS instruction. 
+ unsigned ZMask = 0; + for (int i = 0; i < 4; ++i) + if (Zeroable[i]) + ZMask |= 1 << i; + + unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask; + assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!"); + SDValue Result = DAG.getNode(X86ISD::INSERTPS, SDLoc(Op), MVT::v4f32, V1, V2, + DAG.getIntPtrConstant(InsertPSMask)); + return DAG.getNode(ISD::BITCAST, SDLoc(Op), VT, Result); } /// getVShift - Return a vector logical shift node. @@ -5748,7 +6045,10 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts, /// or SDValue() otherwise. static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget* Subtarget, SelectionDAG &DAG) { - if (!Subtarget->hasFp256()) + // VBROADCAST requires AVX. + // TODO: Splats could be generated for non-AVX CPUs using SSE + // instructions, but there's less potential gain for only 128-bit vectors. + if (!Subtarget->hasAVX()) return SDValue(); MVT VT = Op.getSimpleValueType(); @@ -5825,17 +6125,34 @@ static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget* Subtarget, } } + unsigned ScalarSize = Ld.getValueType().getSizeInBits(); bool IsGE256 = (VT.getSizeInBits() >= 256); - // Handle the broadcasting a single constant scalar from the constant pool - // into a vector. On Sandybridge it is still better to load a constant vector + // When optimizing for size, generate up to 5 extra bytes for a broadcast + // instruction to save 8 or more bytes of constant pool data. + // TODO: If multiple splats are generated to load the same constant, + // it may be detrimental to overall size. There needs to be a way to detect + // that condition to know if this is truly a size win. + const Function *F = DAG.getMachineFunction().getFunction(); + bool OptForSize = F->getAttributes(). + hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize); + + // Handle broadcasting a single constant scalar from the constant pool + // into a vector. + // On Sandybridge (no AVX2), it is still better to load a constant vector // from the constant pool and not to broadcast it from a scalar. - if (ConstSplatVal && Subtarget->hasInt256()) { + // But override that restriction when optimizing for size. + // TODO: Check if splatting is recommended for other AVX-capable CPUs. + if (ConstSplatVal && (Subtarget->hasAVX2() || OptForSize)) { EVT CVT = Ld.getValueType(); assert(!CVT.isVector() && "Must not broadcast a vector type"); - unsigned ScalarSize = CVT.getSizeInBits(); - if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)) { + // Splat f32, i32, v4f64, v4i64 in all cases with AVX2. + // For size optimization, also splat v2f64 and v2i64, and for size opt + // with AVX2, also splat i8 and i16. + // With pattern matching, the VBROADCAST node may become a VMOVDDUP. + if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) || + (OptForSize && (ScalarSize == 64 || Subtarget->hasAVX2()))) { const Constant *C = nullptr; if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld)) C = CI->getConstantIntValue(); @@ -5856,7 +6173,6 @@ static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget* Subtarget, } bool IsLoad = ISD::isNormalLoad(Ld.getNode()); - unsigned ScalarSize = Ld.getValueType().getSizeInBits(); // Handle AVX2 in-register broadcasts. 
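// Illustrative sketch (not part of this patch): the INSERTPS immediate built
// above packs three fields -- bits 7:6 pick the source element, bits 5:4 the
// destination slot, and bits 3:0 force result lanes to zero.
#include <cassert>

unsigned demoEncodeInsertPSImm(unsigned SrcElt, unsigned DstElt, unsigned ZMask) {
  assert(SrcElt < 4 && DstElt < 4 && ZMask < 16 && "field out of range");
  return (SrcElt << 6) | (DstElt << 4) | ZMask;
}
// demoEncodeInsertPSImm(2, 1, 0x8) == 0x98: take source lane 2, insert it into
// lane 1, and zero lane 3.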
if (!IsLoad && Subtarget->hasInt256() && @@ -6241,11 +6557,6 @@ static SDValue matchAddSub(const BuildVectorSDNode *BV, SelectionDAG &DAG, assert((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v4f32 || VT == MVT::v2f64) && "build_vector with an invalid type found!"); - // Don't try to emit a VSELECT that cannot be lowered into a blend. - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - if (!TLI.isOperationLegalOrCustom(ISD::VSELECT, VT)) - return SDValue(); - // Odd-numbered elements in the input build vector are obtained from // adding two integer/float elements. // Even-numbered elements in the input build vector are obtained from @@ -6257,14 +6568,14 @@ static SDValue matchAddSub(const BuildVectorSDNode *BV, SelectionDAG &DAG, for (unsigned i = 0, e = NumElts; i != e; i++) { SDValue Op = BV->getOperand(i); - + // Skip 'undef' values. unsigned Opcode = Op.getOpcode(); if (Opcode == ISD::UNDEF) { std::swap(ExpectedOpcode, NextExpectedOpcode); continue; } - + // Early exit if we found an unexpected opcode. if (Opcode != ExpectedOpcode) return SDValue(); @@ -6318,34 +6629,11 @@ static SDValue matchAddSub(const BuildVectorSDNode *BV, SelectionDAG &DAG, std::swap(ExpectedOpcode, NextExpectedOpcode); } - // Don't try to fold this build_vector into a VSELECT if it has - // too many UNDEF operands. + // Don't try to fold this build_vector into an ADDSUB if the inputs are undef. if (AddFound && SubFound && InVec0.getOpcode() != ISD::UNDEF && - InVec1.getOpcode() != ISD::UNDEF) { - // Emit a sequence of vector add and sub followed by a VSELECT. - // The new VSELECT will be lowered into a BLENDI. - // At ISel stage, we pattern-match the sequence 'add + sub + BLENDI' - // and emit a single ADDSUB instruction. - SDValue Sub = DAG.getNode(ExpectedOpcode, DL, VT, InVec0, InVec1); - SDValue Add = DAG.getNode(NextExpectedOpcode, DL, VT, InVec0, InVec1); - - // Construct the VSELECT mask. - EVT MaskVT = VT.changeVectorElementTypeToInteger(); - EVT SVT = MaskVT.getVectorElementType(); - unsigned SVTBits = SVT.getSizeInBits(); - SmallVector<SDValue, 8> Ops; - - for (unsigned i = 0, e = NumElts; i != e; ++i) { - APInt Value = i & 1 ? APInt::getNullValue(SVTBits) : - APInt::getAllOnesValue(SVTBits); - SDValue Constant = DAG.getConstant(Value, SVT); - Ops.push_back(Constant); - } + InVec1.getOpcode() != ISD::UNDEF) + return DAG.getNode(X86ISD::ADDSUB, DL, VT, InVec0, InVec1); - SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, DL, MaskVT, Ops); - return DAG.getSelect(DL, VT, Mask, Sub, Add); - } - return SDValue(); } @@ -6581,6 +6869,13 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { // convert it to a vector with movd (S2V+shuffle to zero extend). Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item); Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item); + + // If using the new shuffle lowering, just directly insert this. + if (ExperimentalVectorShuffleLowering) + return DAG.getNode( + ISD::BITCAST, dl, VT, + getShuffleVectorZeroOrUndef(Item, Idx * 2, true, Subtarget, DAG)); + Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG); // Now we have our 32-bit value zero extended in the low element of @@ -6654,6 +6949,10 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { if (EVTBits == 32) { Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); + // If using the new shuffle lowering, just directly insert this. 
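// Illustrative sketch (not part of this patch): the lane behaviour of the
// X86ISD::ADDSUB node that matchAddSub now emits directly -- even result
// lanes subtract, odd lanes add, matching an alternating fsub/fadd
// build_vector.
#include <cstddef>
#include <vector>

std::vector<float> demoAddSub(const std::vector<float> &A,
                              const std::vector<float> &B) {
  std::vector<float> R(A.size());
  for (std::size_t i = 0; i < A.size(); ++i)
    R[i] = (i & 1) ? A[i] + B[i] : A[i] - B[i];
  return R;
}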
+ if (ExperimentalVectorShuffleLowering) + return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG); + // Turn it into a shuffle of zero and zero-extended scalar to vector. Item = getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0, Subtarget, DAG); SmallVector<int, 8> MaskVec; @@ -6731,8 +7030,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS if (EVTBits == 32 && NumElems == 4) { - SDValue V = LowerBuildVectorv4x32(Op, NumElems, NonZeros, NumNonZero, - NumZero, DAG, Subtarget, *this); + SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget, *this); if (V.getNode()) return V; } @@ -6923,6 +7221,89 @@ static bool isSingleInputShuffleMask(ArrayRef<int> Mask) { return true; } +/// \brief Test whether there are elements crossing 128-bit lanes in this +/// shuffle mask. +/// +/// X86 divides up its shuffles into in-lane and cross-lane shuffle operations +/// and we routinely test for these. +static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) { + int LaneSize = 128 / VT.getScalarSizeInBits(); + int Size = Mask.size(); + for (int i = 0; i < Size; ++i) + if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize) + return true; + return false; +} + +/// \brief Test whether a shuffle mask is equivalent within each 128-bit lane. +/// +/// This checks a shuffle mask to see if it is performing the same +/// 128-bit lane-relative shuffle in each 128-bit lane. This trivially implies +/// that it is also not lane-crossing. It may however involve a blend from the +/// same lane of a second vector. +/// +/// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is +/// non-trivial to compute in the face of undef lanes. The representation is +/// *not* suitable for use with existing 128-bit shuffles as it will contain +/// entries from both V1 and V2 inputs to the wider mask. +static bool +is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask, + SmallVectorImpl<int> &RepeatedMask) { + int LaneSize = 128 / VT.getScalarSizeInBits(); + RepeatedMask.resize(LaneSize, -1); + int Size = Mask.size(); + for (int i = 0; i < Size; ++i) { + if (Mask[i] < 0) + continue; + if ((Mask[i] % Size) / LaneSize != i / LaneSize) + // This entry crosses lanes, so there is no way to model this shuffle. + return false; + + // Ok, handle the in-lane shuffles by detecting if and when they repeat. + if (RepeatedMask[i % LaneSize] == -1) + // This is the first non-undef entry in this slot of a 128-bit lane. + RepeatedMask[i % LaneSize] = + Mask[i] < Size ? Mask[i] % LaneSize : Mask[i] % LaneSize + Size; + else if (RepeatedMask[i % LaneSize] + (i / LaneSize) * LaneSize != Mask[i]) + // Found a mismatch with the repeated mask. + return false; + } + return true; +} + +// Hide this symbol with an anonymous namespace instead of 'static' so that MSVC +// 2013 will allow us to use it as a non-type template parameter. +namespace { + +/// \brief Implementation of the \c isShuffleEquivalent variadic functor. +/// +/// See its documentation for details. 
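// Illustrative sketch (not part of this patch): the lane-crossing test above
// in isolation -- an element crosses lanes when its source index lives in a
// different 128-bit lane than its destination slot.
#include <vector>

bool demoCrossesLanes(const std::vector<int> &Mask, int EltBits) {
  int LaneSize = 128 / EltBits;
  int Size = (int)Mask.size();
  for (int i = 0; i < Size; ++i)
    if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
      return true;
  return false;
}
// For 32-bit elements in a 256-bit vector, {0,1,2,3,4,5,6,7} stays in-lane,
// while {4,5,6,7,0,1,2,3} crosses.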
+bool isShuffleEquivalentImpl(ArrayRef<int> Mask, ArrayRef<const int *> Args) { + if (Mask.size() != Args.size()) + return false; + for (int i = 0, e = Mask.size(); i < e; ++i) { + assert(*Args[i] >= 0 && "Arguments must be positive integers!"); + if (Mask[i] != -1 && Mask[i] != *Args[i]) + return false; + } + return true; +} + +} // namespace + +/// \brief Checks whether a shuffle mask is equivalent to an explicit list of +/// arguments. +/// +/// This is a fast way to test a shuffle mask against a fixed pattern: +/// +/// if (isShuffleEquivalent(Mask, 3, 2, 1, 0)) { ... } +/// +/// It returns true if the mask is exactly as wide as the argument list, and +/// each element of the mask is either -1 (signifying undef) or the value given +/// in the argument. +static const VariadicFunction1< + bool, ArrayRef<int>, int, isShuffleEquivalentImpl> isShuffleEquivalent = {}; + /// \brief Get a 4-lane 8-bit shuffle immediate for a mask. /// /// This helper function produces an 8-bit shuffle immediate corresponding to @@ -6947,6 +7328,764 @@ static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, return DAG.getConstant(Imm, MVT::i8); } +/// \brief Try to emit a blend instruction for a shuffle. +/// +/// This doesn't do any checks for the availability of instructions for blending +/// these values. It relies on the availability of the X86ISD::BLENDI pattern to +/// be matched in the backend with the type given. What it does check for is +/// that the shuffle mask is in fact a blend. +static SDValue lowerVectorShuffleAsBlend(SDLoc DL, MVT VT, SDValue V1, + SDValue V2, ArrayRef<int> Mask, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + + unsigned BlendMask = 0; + for (int i = 0, Size = Mask.size(); i < Size; ++i) { + if (Mask[i] >= Size) { + if (Mask[i] != i + Size) + return SDValue(); // Shuffled V2 input! + BlendMask |= 1u << i; + continue; + } + if (Mask[i] >= 0 && Mask[i] != i) + return SDValue(); // Shuffled V1 input! + } + switch (VT.SimpleTy) { + case MVT::v2f64: + case MVT::v4f32: + case MVT::v4f64: + case MVT::v8f32: + return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2, + DAG.getConstant(BlendMask, MVT::i8)); + + case MVT::v4i64: + case MVT::v8i32: + assert(Subtarget->hasAVX2() && "256-bit integer blends require AVX2!"); + // FALLTHROUGH + case MVT::v2i64: + case MVT::v4i32: + // If we have AVX2 it is faster to use VPBLENDD when the shuffle fits into + // that instruction. + if (Subtarget->hasAVX2()) { + // Scale the blend by the number of 32-bit dwords per element. + int Scale = VT.getScalarSizeInBits() / 32; + BlendMask = 0; + for (int i = 0, Size = Mask.size(); i < Size; ++i) + if (Mask[i] >= Size) + for (int j = 0; j < Scale; ++j) + BlendMask |= 1u << (i * Scale + j); + + MVT BlendVT = VT.getSizeInBits() > 128 ? MVT::v8i32 : MVT::v4i32; + V1 = DAG.getNode(ISD::BITCAST, DL, BlendVT, V1); + V2 = DAG.getNode(ISD::BITCAST, DL, BlendVT, V2); + return DAG.getNode(ISD::BITCAST, DL, VT, + DAG.getNode(X86ISD::BLENDI, DL, BlendVT, V1, V2, + DAG.getConstant(BlendMask, MVT::i8))); + } + // FALLTHROUGH + case MVT::v8i16: { + // For integer shuffles we need to expand the mask and cast the inputs to + // v8i16s prior to blending. 
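// Illustrative sketch (not part of this patch): the 2-bits-per-lane immediate
// that getV4X86ShuffleImm8ForMask produces for PSHUFD/SHUFPS-style shuffles --
// result element i reads source element (Imm >> (2*i)) & 3.
#include <cassert>

unsigned demoEncodeShuffleImm8(const int Mask[4]) {
  unsigned Imm = 0;
  for (int i = 0; i < 4; ++i) {
    int M = Mask[i] < 0 ? 0 : Mask[i];  // an undef lane may use any value
    assert(M < 4 && "only a 4-element window fits into the immediate");
    Imm |= unsigned(M) << (2 * i);
  }
  return Imm;
}
// {3,2,1,0} encodes to 0x1B, the familiar "reverse all four lanes" immediate.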
+ int Scale = 8 / VT.getVectorNumElements(); + BlendMask = 0; + for (int i = 0, Size = Mask.size(); i < Size; ++i) + if (Mask[i] >= Size) + for (int j = 0; j < Scale; ++j) + BlendMask |= 1u << (i * Scale + j); + + V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1); + V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V2); + return DAG.getNode(ISD::BITCAST, DL, VT, + DAG.getNode(X86ISD::BLENDI, DL, MVT::v8i16, V1, V2, + DAG.getConstant(BlendMask, MVT::i8))); + } + + case MVT::v16i16: { + assert(Subtarget->hasAVX2() && "256-bit integer blends require AVX2!"); + SmallVector<int, 8> RepeatedMask; + if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) { + // We can lower these with PBLENDW which is mirrored across 128-bit lanes. + assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!"); + BlendMask = 0; + for (int i = 0; i < 8; ++i) + if (RepeatedMask[i] >= 16) + BlendMask |= 1u << i; + return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2, + DAG.getConstant(BlendMask, MVT::i8)); + } + } + // FALLTHROUGH + case MVT::v32i8: { + assert(Subtarget->hasAVX2() && "256-bit integer blends require AVX2!"); + // Scale the blend by the number of bytes per element. + int Scale = VT.getScalarSizeInBits() / 8; + assert(Mask.size() * Scale == 32 && "Not a 256-bit vector!"); + + // Compute the VSELECT mask. Note that VSELECT is really confusing in the + // mix of LLVM's code generator and the x86 backend. We tell the code + // generator that boolean values in the elements of an x86 vector register + // are -1 for true and 0 for false. We then use the LLVM semantics of 'true' + // mapping a select to operand #1, and 'false' mapping to operand #2. The + // reality in x86 is that vector masks (pre-AVX-512) use only the high bit + // of the element (the remaining are ignored) and 0 in that high bit would + // mean operand #1 while 1 in the high bit would mean operand #2. So while + // the LLVM model for boolean values in vector elements gets the relevant + // bit set, it is set backwards and over constrained relative to x86's + // actual model. + SDValue VSELECTMask[32]; + for (int i = 0, Size = Mask.size(); i < Size; ++i) + for (int j = 0; j < Scale; ++j) + VSELECTMask[Scale * i + j] = + Mask[i] < 0 ? DAG.getUNDEF(MVT::i8) + : DAG.getConstant(Mask[i] < Size ? -1 : 0, MVT::i8); + + V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, V1); + V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, V2); + return DAG.getNode( + ISD::BITCAST, DL, VT, + DAG.getNode(ISD::VSELECT, DL, MVT::v32i8, + DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, VSELECTMask), + V1, V2)); + } + + default: + llvm_unreachable("Not a supported integer vector type!"); + } +} + +/// \brief Generic routine to lower a shuffle and blend as a decomposed set of +/// unblended shuffles followed by an unshuffled blend. +/// +/// This matches the extremely common pattern for handling combined +/// shuffle+blend operations on newer X86 ISAs where we have very fast blend +/// operations. +static SDValue lowerVectorShuffleAsDecomposedShuffleBlend(SDLoc DL, MVT VT, + SDValue V1, + SDValue V2, + ArrayRef<int> Mask, + SelectionDAG &DAG) { + // Shuffle the input elements into the desired positions in V1 and V2 and + // blend them together. 
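// Illustrative sketch (not part of this patch): how the blend immediates above
// are formed -- bit i selects the second input for result element i, and
// retargeting to a finer-grained blend (e.g. VPBLENDD for 64-bit elements)
// just replicates each bit Scale times.
#include <vector>

unsigned demoComputeBlendMask(const std::vector<int> &Mask, int Scale) {
  unsigned Blend = 0;
  int Size = (int)Mask.size();
  for (int i = 0; i < Size; ++i)
    if (Mask[i] >= Size)                 // element taken from the second input
      for (int j = 0; j < Scale; ++j)
        Blend |= 1u << (i * Scale + j);
  return Blend;
}
// A v4i64 mask {0,5,2,7} with Scale = 2 yields 0xCC (0b11001100) for VPBLENDD.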
+ SmallVector<int, 32> V1Mask(Mask.size(), -1); + SmallVector<int, 32> V2Mask(Mask.size(), -1); + SmallVector<int, 32> BlendMask(Mask.size(), -1); + for (int i = 0, Size = Mask.size(); i < Size; ++i) + if (Mask[i] >= 0 && Mask[i] < Size) { + V1Mask[i] = Mask[i]; + BlendMask[i] = i; + } else if (Mask[i] >= Size) { + V2Mask[i] = Mask[i] - Size; + BlendMask[i] = i + Size; + } + + V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask); + V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask); + return DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask); +} + +/// \brief Try to lower a vector shuffle as a byte rotation. +/// +/// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary +/// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use +/// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will +/// try to generically lower a vector shuffle through such an pattern. It +/// does not check for the profitability of lowering either as PALIGNR or +/// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form. +/// This matches shuffle vectors that look like: +/// +/// v8i16 [11, 12, 13, 14, 15, 0, 1, 2] +/// +/// Essentially it concatenates V1 and V2, shifts right by some number of +/// elements, and takes the low elements as the result. Note that while this is +/// specified as a *right shift* because x86 is little-endian, it is a *left +/// rotate* of the vector lanes. +/// +/// Note that this only handles 128-bit vector widths currently. +static SDValue lowerVectorShuffleAsByteRotate(SDLoc DL, MVT VT, SDValue V1, + SDValue V2, + ArrayRef<int> Mask, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!"); + + // We need to detect various ways of spelling a rotation: + // [11, 12, 13, 14, 15, 0, 1, 2] + // [-1, 12, 13, 14, -1, -1, 1, -1] + // [-1, -1, -1, -1, -1, -1, 1, 2] + // [ 3, 4, 5, 6, 7, 8, 9, 10] + // [-1, 4, 5, 6, -1, -1, 9, -1] + // [-1, 4, 5, 6, -1, -1, -1, -1] + int Rotation = 0; + SDValue Lo, Hi; + for (int i = 0, Size = Mask.size(); i < Size; ++i) { + if (Mask[i] == -1) + continue; + assert(Mask[i] >= 0 && "Only -1 is a valid negative mask element!"); + + // Based on the mod-Size value of this mask element determine where + // a rotated vector would have started. + int StartIdx = i - (Mask[i] % Size); + if (StartIdx == 0) + // The identity rotation isn't interesting, stop. + return SDValue(); + + // If we found the tail of a vector the rotation must be the missing + // front. If we found the head of a vector, it must be how much of the head. + int CandidateRotation = StartIdx < 0 ? -StartIdx : Size - StartIdx; + + if (Rotation == 0) + Rotation = CandidateRotation; + else if (Rotation != CandidateRotation) + // The rotations don't match, so we can't match this mask. + return SDValue(); + + // Compute which value this mask is pointing at. + SDValue MaskV = Mask[i] < Size ? V1 : V2; + + // Compute which of the two target values this index should be assigned to. + // This reflects whether the high elements are remaining or the low elements + // are remaining. + SDValue &TargetV = StartIdx < 0 ? Hi : Lo; + + // Either set up this value if we've not encountered it before, or check + // that it remains consistent. + if (!TargetV) + TargetV = MaskV; + else if (TargetV != MaskV) + // This may be a rotation, but it pulls from the inputs in some + // unsupported interleaving. 
+ return SDValue(); + } + + // Check that we successfully analyzed the mask, and normalize the results. + assert(Rotation != 0 && "Failed to locate a viable rotation!"); + assert((Lo || Hi) && "Failed to find a rotated input vector!"); + if (!Lo) + Lo = Hi; + else if (!Hi) + Hi = Lo; + + assert(VT.getSizeInBits() == 128 && + "Rotate-based lowering only supports 128-bit lowering!"); + assert(Mask.size() <= 16 && + "Can shuffle at most 16 bytes in a 128-bit vector!"); + + // The actual rotate instruction rotates bytes, so we need to scale the + // rotation based on how many bytes are in the vector. + int Scale = 16 / Mask.size(); + + // SSSE3 targets can use the palignr instruction + if (Subtarget->hasSSSE3()) { + // Cast the inputs to v16i8 to match PALIGNR. + Lo = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Lo); + Hi = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Hi); + + return DAG.getNode(ISD::BITCAST, DL, VT, + DAG.getNode(X86ISD::PALIGNR, DL, MVT::v16i8, Hi, Lo, + DAG.getConstant(Rotation * Scale, MVT::i8))); + } + + // Default SSE2 implementation + int LoByteShift = 16 - Rotation * Scale; + int HiByteShift = Rotation * Scale; + + // Cast the inputs to v2i64 to match PSLLDQ/PSRLDQ. + Lo = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Lo); + Hi = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Hi); + + SDValue LoShift = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v2i64, Lo, + DAG.getConstant(8 * LoByteShift, MVT::i8)); + SDValue HiShift = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v2i64, Hi, + DAG.getConstant(8 * HiByteShift, MVT::i8)); + return DAG.getNode(ISD::BITCAST, DL, VT, + DAG.getNode(ISD::OR, DL, MVT::v2i64, LoShift, HiShift)); +} + +/// \brief Compute whether each element of a shuffle is zeroable. +/// +/// A "zeroable" vector shuffle element is one which can be lowered to zero. +/// Either it is an undef element in the shuffle mask, the element of the input +/// referenced is undef, or the element of the input referenced is known to be +/// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle +/// as many lanes with this technique as possible to simplify the remaining +/// shuffle. +static SmallBitVector computeZeroableShuffleElements(ArrayRef<int> Mask, + SDValue V1, SDValue V2) { + SmallBitVector Zeroable(Mask.size(), false); + + bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode()); + bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode()); + + for (int i = 0, Size = Mask.size(); i < Size; ++i) { + int M = Mask[i]; + // Handle the easy cases. + if (M < 0 || (M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) { + Zeroable[i] = true; + continue; + } + + // If this is an index into a build_vector node, dig out the input value and + // use it. + SDValue V = M < Size ? V1 : V2; + if (V.getOpcode() != ISD::BUILD_VECTOR) + continue; + + SDValue Input = V.getOperand(M % Size); + // The UNDEF opcode check really should be dead code here, but not quite + // worth asserting on (it isn't invalid, just unexpected). + if (Input.getOpcode() == ISD::UNDEF || X86::isZeroNode(Input)) + Zeroable[i] = true; + } + + return Zeroable; +} + +/// \brief Try to lower a vector shuffle as a byte shift (shifts in zeros). +/// +/// Attempts to match a shuffle mask against the PSRLDQ and PSLLDQ SSE2 +/// byte-shift instructions. The mask must consist of a shifted sequential +/// shuffle from one of the input vectors and zeroable elements for the +/// remaining 'shifted in' elements. +/// +/// Note that this only handles 128-bit vector widths currently. 
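// Illustrative sketch (not part of this patch): a byte-level model of the
// PALIGNR fallback chosen above -- the instruction shifts the 32-byte
// concatenation Hi:Lo right by the immediate and keeps the low 16 bytes,
// which is why an element-wise "rotate" mask maps onto it.
#include <array>
#include <cstdint>

std::array<uint8_t, 16> demoPalignr(const std::array<uint8_t, 16> &Hi,
                                    const std::array<uint8_t, 16> &Lo,
                                    unsigned ByteShift) {
  std::array<uint8_t, 16> R{};
  for (unsigned i = 0; i < 16; ++i) {
    unsigned Src = i + ByteShift;        // index into the Hi:Lo concatenation
    if (Src < 16)
      R[i] = Lo[Src];
    else if (Src < 32)
      R[i] = Hi[Src - 16];               // bytes past 31 stay zero
  }
  return R;
}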
+static SDValue lowerVectorShuffleAsByteShift(SDLoc DL, MVT VT, SDValue V1, + SDValue V2, ArrayRef<int> Mask, + SelectionDAG &DAG) { + assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!"); + + SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2); + + int Size = Mask.size(); + int Scale = 16 / Size; + + auto isSequential = [](int Base, int StartIndex, int EndIndex, int MaskOffset, + ArrayRef<int> Mask) { + for (int i = StartIndex; i < EndIndex; i++) { + if (Mask[i] < 0) + continue; + if (i + Base != Mask[i] - MaskOffset) + return false; + } + return true; + }; + + for (int Shift = 1; Shift < Size; Shift++) { + int ByteShift = Shift * Scale; + + // PSRLDQ : (little-endian) right byte shift + // [ 5, 6, 7, zz, zz, zz, zz, zz] + // [ -1, 5, 6, 7, zz, zz, zz, zz] + // [ 1, 2, -1, -1, -1, -1, zz, zz] + bool ZeroableRight = true; + for (int i = Size - Shift; i < Size; i++) { + ZeroableRight &= Zeroable[i]; + } + + if (ZeroableRight) { + bool ValidShiftRight1 = isSequential(Shift, 0, Size - Shift, 0, Mask); + bool ValidShiftRight2 = isSequential(Shift, 0, Size - Shift, Size, Mask); + + if (ValidShiftRight1 || ValidShiftRight2) { + // Cast the inputs to v2i64 to match PSRLDQ. + SDValue &TargetV = ValidShiftRight1 ? V1 : V2; + SDValue V = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, TargetV); + SDValue Shifted = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v2i64, V, + DAG.getConstant(ByteShift * 8, MVT::i8)); + return DAG.getNode(ISD::BITCAST, DL, VT, Shifted); + } + } + + // PSLLDQ : (little-endian) left byte shift + // [ zz, 0, 1, 2, 3, 4, 5, 6] + // [ zz, zz, -1, -1, 2, 3, 4, -1] + // [ zz, zz, zz, zz, zz, zz, -1, 1] + bool ZeroableLeft = true; + for (int i = 0; i < Shift; i++) { + ZeroableLeft &= Zeroable[i]; + } + + if (ZeroableLeft) { + bool ValidShiftLeft1 = isSequential(-Shift, Shift, Size, 0, Mask); + bool ValidShiftLeft2 = isSequential(-Shift, Shift, Size, Size, Mask); + + if (ValidShiftLeft1 || ValidShiftLeft2) { + // Cast the inputs to v2i64 to match PSLLDQ. + SDValue &TargetV = ValidShiftLeft1 ? V1 : V2; + SDValue V = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, TargetV); + SDValue Shifted = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v2i64, V, + DAG.getConstant(ByteShift * 8, MVT::i8)); + return DAG.getNode(ISD::BITCAST, DL, VT, Shifted); + } + } + } + + return SDValue(); +} + +/// \brief Lower a vector shuffle as a zero or any extension. +/// +/// Given a specific number of elements, element bit width, and extension +/// stride, produce either a zero or any extension based on the available +/// features of the subtarget. +static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend( + SDLoc DL, MVT VT, int NumElements, int Scale, bool AnyExt, SDValue InputV, + const X86Subtarget *Subtarget, SelectionDAG &DAG) { + assert(Scale > 1 && "Need a scale to extend."); + int EltBits = VT.getSizeInBits() / NumElements; + assert((EltBits == 8 || EltBits == 16 || EltBits == 32) && + "Only 8, 16, and 32 bit elements can be extended."); + assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits."); + + // Found a valid zext mask! Try various lowering strategies based on the + // input type and available ISA extensions. 
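// Illustrative sketch (not part of this patch): byte-level models of the two
// whole-vector shifts the mask is matched against above -- PSRLDQ discards
// low bytes and shifts zeros in at the top, PSLLDQ does the mirror image.
#include <array>
#include <cstdint>

std::array<uint8_t, 16> demoPsrldq(const std::array<uint8_t, 16> &V, unsigned Bytes) {
  std::array<uint8_t, 16> R{};
  for (unsigned i = 0; i + Bytes < 16; ++i)
    R[i] = V[i + Bytes];
  return R;
}

std::array<uint8_t, 16> demoPslldq(const std::array<uint8_t, 16> &V, unsigned Bytes) {
  std::array<uint8_t, 16> R{};
  for (unsigned i = Bytes; i < 16; ++i)
    R[i] = V[i - Bytes];
  return R;
}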
+ if (Subtarget->hasSSE41()) { + MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements); + MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale), + NumElements / Scale); + InputV = DAG.getNode(ISD::BITCAST, DL, InputVT, InputV); + return DAG.getNode(ISD::BITCAST, DL, VT, + DAG.getNode(X86ISD::VZEXT, DL, ExtVT, InputV)); + } + + // For any extends we can cheat for larger element sizes and use shuffle + // instructions that can fold with a load and/or copy. + if (AnyExt && EltBits == 32) { + int PSHUFDMask[4] = {0, -1, 1, -1}; + return DAG.getNode( + ISD::BITCAST, DL, VT, + DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, + DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, InputV), + getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG))); + } + if (AnyExt && EltBits == 16 && Scale > 2) { + int PSHUFDMask[4] = {0, -1, 0, -1}; + InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, + DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, InputV), + getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG)); + int PSHUFHWMask[4] = {1, -1, -1, -1}; + return DAG.getNode( + ISD::BITCAST, DL, VT, + DAG.getNode(X86ISD::PSHUFHW, DL, MVT::v8i16, + DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, InputV), + getV4X86ShuffleImm8ForMask(PSHUFHWMask, DAG))); + } + + // If this would require more than 2 unpack instructions to expand, use + // pshufb when available. We can only use more than 2 unpack instructions + // when zero extending i8 elements which also makes it easier to use pshufb. + if (Scale > 4 && EltBits == 8 && Subtarget->hasSSSE3()) { + assert(NumElements == 16 && "Unexpected byte vector width!"); + SDValue PSHUFBMask[16]; + for (int i = 0; i < 16; ++i) + PSHUFBMask[i] = + DAG.getConstant((i % Scale == 0) ? i / Scale : 0x80, MVT::i8); + InputV = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, InputV); + return DAG.getNode(ISD::BITCAST, DL, VT, + DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV, + DAG.getNode(ISD::BUILD_VECTOR, DL, + MVT::v16i8, PSHUFBMask))); + } + + // Otherwise emit a sequence of unpacks. + do { + MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements); + SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT) + : getZeroVector(InputVT, Subtarget, DAG, DL); + InputV = DAG.getNode(ISD::BITCAST, DL, InputVT, InputV); + InputV = DAG.getNode(X86ISD::UNPCKL, DL, InputVT, InputV, Ext); + Scale /= 2; + EltBits *= 2; + NumElements /= 2; + } while (Scale > 1); + return DAG.getNode(ISD::BITCAST, DL, VT, InputV); +} + +/// \brief Try to lower a vector shuffle as a zero extension on any micrarch. +/// +/// This routine will try to do everything in its power to cleverly lower +/// a shuffle which happens to match the pattern of a zero extend. It doesn't +/// check for the profitability of this lowering, it tries to aggressively +/// match this pattern. It will use all of the micro-architectural details it +/// can to emit an efficient lowering. It handles both blends with all-zero +/// inputs to explicitly zero-extend and undef-lanes (sometimes undef due to +/// masking out later). +/// +/// The reason we have dedicated lowering for zext-style shuffles is that they +/// are both incredibly common and often quite performance sensitive. 
+static SDValue lowerVectorShuffleAsZeroOrAnyExtend( + SDLoc DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask, + const X86Subtarget *Subtarget, SelectionDAG &DAG) { + SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2); + + int Bits = VT.getSizeInBits(); + int NumElements = Mask.size(); + + // Define a helper function to check a particular ext-scale and lower to it if + // valid. + auto Lower = [&](int Scale) -> SDValue { + SDValue InputV; + bool AnyExt = true; + for (int i = 0; i < NumElements; ++i) { + if (Mask[i] == -1) + continue; // Valid anywhere but doesn't tell us anything. + if (i % Scale != 0) { + // Each of the extend elements needs to be zeroable. + if (!Zeroable[i]) + return SDValue(); + + // We no lorger are in the anyext case. + AnyExt = false; + continue; + } + + // Each of the base elements needs to be consecutive indices into the + // same input vector. + SDValue V = Mask[i] < NumElements ? V1 : V2; + if (!InputV) + InputV = V; + else if (InputV != V) + return SDValue(); // Flip-flopping inputs. + + if (Mask[i] % NumElements != i / Scale) + return SDValue(); // Non-consecutive strided elemenst. + } + + // If we fail to find an input, we have a zero-shuffle which should always + // have already been handled. + // FIXME: Maybe handle this here in case during blending we end up with one? + if (!InputV) + return SDValue(); + + return lowerVectorShuffleAsSpecificZeroOrAnyExtend( + DL, VT, NumElements, Scale, AnyExt, InputV, Subtarget, DAG); + }; + + // The widest scale possible for extending is to a 64-bit integer. + assert(Bits % 64 == 0 && + "The number of bits in a vector must be divisible by 64 on x86!"); + int NumExtElements = Bits / 64; + + // Each iteration, try extending the elements half as much, but into twice as + // many elements. + for (; NumExtElements < NumElements; NumExtElements *= 2) { + assert(NumElements % NumExtElements == 0 && + "The input vector size must be divisble by the extended size."); + if (SDValue V = Lower(NumElements / NumExtElements)) + return V; + } + + // No viable ext lowering found. + return SDValue(); +} + +/// \brief Try to get a scalar value for a specific element of a vector. +/// +/// Looks through BUILD_VECTOR and SCALAR_TO_VECTOR nodes to find a scalar. +static SDValue getScalarValueForVectorElement(SDValue V, int Idx, + SelectionDAG &DAG) { + MVT VT = V.getSimpleValueType(); + MVT EltVT = VT.getVectorElementType(); + while (V.getOpcode() == ISD::BITCAST) + V = V.getOperand(0); + // If the bitcasts shift the element size, we can't extract an equivalent + // element from it. + MVT NewVT = V.getSimpleValueType(); + if (!NewVT.isVector() || NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits()) + return SDValue(); + + if (V.getOpcode() == ISD::BUILD_VECTOR || + (Idx == 0 && V.getOpcode() == ISD::SCALAR_TO_VECTOR)) + return DAG.getNode(ISD::BITCAST, SDLoc(V), EltVT, V.getOperand(Idx)); + + return SDValue(); +} + +/// \brief Helper to test for a load that can be folded with x86 shuffles. +/// +/// This is particularly important because the set of instructions varies +/// significantly based on whether the operand is a load or not. +static bool isShuffleFoldableLoad(SDValue V) { + while (V.getOpcode() == ISD::BITCAST) + V = V.getOperand(0); + + return ISD::isNON_EXTLoad(V.getNode()); +} + +/// \brief Try to lower insertion of a single element into a zero vector. +/// +/// This is a common pattern that we have especially efficient patterns to lower +/// across all subtarget feature sets. 
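The interesting part of the scan above is again pure mask arithmetic. A compact model of the Lower lambda (hypothetical names; a bool vector stands in for the Zeroable computation, and only the matched scale, input, and any-extend flag are reported):

    #include <cstdio>
    #include <vector>

    struct ZExtMatch {
      int Scale;   // how many result lanes each source element expands into
      int Input;   // 0 or 1: which operand feeds the extension
      bool AnyExt; // true if the stretched lanes are only ever undef
    };

    // Try to view Mask (two-input convention: values in [0, N) are input 0,
    // [N, 2N) are input 1, -1 is undef) as a zero/any extension at Scale.
    static bool matchZExt(const std::vector<int> &Mask,
                          const std::vector<bool> &Zeroable, int Scale,
                          ZExtMatch &M) {
      int NumElements = (int)Mask.size();
      int Input = -1;
      bool AnyExt = true;
      for (int i = 0; i < NumElements; ++i) {
        if (Mask[i] < 0)
          continue;                 // undef is fine anywhere
        if (i % Scale != 0) {
          if (!Zeroable[i])
            return false;           // stretched lanes must be zeroable
          AnyExt = false;           // explicit zeros, so not a pure any-extend
          continue;
        }
        int In = Mask[i] < NumElements ? 0 : 1;
        if (Input == -1)
          Input = In;
        else if (Input != In)
          return false;             // flip-flopping inputs
        if (Mask[i] % NumElements != i / Scale)
          return false;             // not a consecutive strided read
      }
      if (Input == -1)
        return false;               // all-zero shuffle, handled elsewhere
      M = {Scale, Input, AnyExt};
      return true;
    }

    int main() {
      // Shuffle of (zero vector, V) as v8i16 with mask <8, 0, 9, 1, 10, 2, 11, 3>:
      // V's low four words blended with zeros, i.e. PMOVZXWD of V (Scale == 2).
      std::vector<int> Mask = {8, 0, 9, 1, 10, 2, 11, 3};
      std::vector<bool> Zeroable = {false, true, false, true,
                                    false, true, false, true};
      ZExtMatch M;
      for (int Scale = 4; Scale >= 2; Scale /= 2) // widest extension first
        if (matchZExt(Mask, Zeroable, Scale, M)) {
          std::printf("scale %d, input %d, anyext %d\n", M.Scale, M.Input, M.AnyExt);
          break;
        }
    }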
+static SDValue lowerVectorShuffleAsElementInsertion( + MVT VT, SDLoc DL, SDValue V1, SDValue V2, ArrayRef<int> Mask, + const X86Subtarget *Subtarget, SelectionDAG &DAG) { + SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2); + MVT ExtVT = VT; + MVT EltVT = VT.getVectorElementType(); + + int V2Index = std::find_if(Mask.begin(), Mask.end(), + [&Mask](int M) { return M >= (int)Mask.size(); }) - + Mask.begin(); + bool IsV1Zeroable = true; + for (int i = 0, Size = Mask.size(); i < Size; ++i) + if (i != V2Index && !Zeroable[i]) { + IsV1Zeroable = false; + break; + } + + // Check for a single input from a SCALAR_TO_VECTOR node. + // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and + // all the smarts here sunk into that routine. However, the current + // lowering of BUILD_VECTOR makes that nearly impossible until the old + // vector shuffle lowering is dead. + if (SDValue V2S = getScalarValueForVectorElement( + V2, Mask[V2Index] - Mask.size(), DAG)) { + // We need to zext the scalar if it is smaller than an i32. + V2S = DAG.getNode(ISD::BITCAST, DL, EltVT, V2S); + if (EltVT == MVT::i8 || EltVT == MVT::i16) { + // Using zext to expand a narrow element won't work for non-zero + // insertions. + if (!IsV1Zeroable) + return SDValue(); + + // Zero-extend directly to i32. + ExtVT = MVT::v4i32; + V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S); + } + V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S); + } else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 || + EltVT == MVT::i16) { + // Either not inserting from the low element of the input or the input + // element size is too small to use VZEXT_MOVL to clear the high bits. + return SDValue(); + } + + if (!IsV1Zeroable) { + // If V1 can't be treated as a zero vector we have fewer options to lower + // this. We can't support integer vectors or non-zero targets cheaply, and + // the V1 elements can't be permuted in any way. + assert(VT == ExtVT && "Cannot change extended type when non-zeroable!"); + if (!VT.isFloatingPoint() || V2Index != 0) + return SDValue(); + SmallVector<int, 8> V1Mask(Mask.begin(), Mask.end()); + V1Mask[V2Index] = -1; + if (!isNoopShuffleMask(V1Mask)) + return SDValue(); + // This is essentially a special case blend operation, but if we have + // general purpose blend operations, they are always faster. Bail and let + // the rest of the lowering handle these as blends. + if (Subtarget->hasSSE41()) + return SDValue(); + + // Otherwise, use MOVSD or MOVSS. + assert((EltVT == MVT::f32 || EltVT == MVT::f64) && + "Only two types of floating point element types to handle!"); + return DAG.getNode(EltVT == MVT::f32 ? X86ISD::MOVSS : X86ISD::MOVSD, DL, + ExtVT, V1, V2); + } + + V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2); + if (ExtVT != VT) + V2 = DAG.getNode(ISD::BITCAST, DL, VT, V2); + + if (V2Index != 0) { + // If we have 4 or fewer lanes we can cheaply shuffle the element into + // the desired position. Otherwise it is more efficient to do a vector + // shift left. We know that we can do a vector shift left because all + // the inputs are zero. 
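The gating condition for this insertion path is one more mask property: exactly one lane reads from V2 and every other lane is undef or known zero. A trimmed sketch of that test follows (hypothetical name; the real routine additionally tolerates a non-zeroable V1 for the floating-point MOVSS/MOVSD case and widens i8/i16 scalars to i32 first):

    #include <cstdio>
    #include <vector>

    // Return the single lane that reads from the second input if every other
    // lane is undef or known zero, otherwise -1.
    static int findZeroInsertLane(const std::vector<int> &Mask,
                                  const std::vector<bool> &Zeroable) {
      int Size = (int)Mask.size();
      int V2Index = -1;
      for (int i = 0; i < Size; ++i) {
        if (Mask[i] >= Size) {
          if (V2Index != -1)
            return -1;        // more than one element taken from V2
          V2Index = i;
        } else if (!Zeroable[i]) {
          return -1;          // a live V1 lane blocks the zero-background form
        }
      }
      return V2Index;
    }

    int main() {
      // v4i32 mask <4, u, u, u>: V2's low element lands in lane 0 and
      // everything else is zero, the VZEXT_MOVL pattern.
      std::vector<int> Mask = {4, -1, -1, -1};
      std::vector<bool> Zeroable = {false, true, true, true};
      std::printf("insert lane = %d\n", findZeroInsertLane(Mask, Zeroable));
    }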
+ if (VT.isFloatingPoint() || VT.getVectorNumElements() <= 4) { + SmallVector<int, 4> V2Shuffle(Mask.size(), 1); + V2Shuffle[V2Index] = 0; + V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle); + } else { + V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, V2); + V2 = DAG.getNode( + X86ISD::VSHLDQ, DL, MVT::v2i64, V2, + DAG.getConstant( + V2Index * EltVT.getSizeInBits(), + DAG.getTargetLoweringInfo().getScalarShiftAmountTy(MVT::v2i64))); + V2 = DAG.getNode(ISD::BITCAST, DL, VT, V2); + } + } + return V2; +} + +/// \brief Try to lower broadcast of a single element. +/// +/// For convenience, this code also bundles all of the subtarget feature set +/// filtering. While a little annoying to re-dispatch on type here, there isn't +/// a convenient way to factor it out. +static SDValue lowerVectorShuffleAsBroadcast(MVT VT, SDLoc DL, SDValue V, + ArrayRef<int> Mask, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + if (!Subtarget->hasAVX()) + return SDValue(); + if (VT.isInteger() && !Subtarget->hasAVX2()) + return SDValue(); + + // Check that the mask is a broadcast. + int BroadcastIdx = -1; + for (int M : Mask) + if (M >= 0 && BroadcastIdx == -1) + BroadcastIdx = M; + else if (M >= 0 && M != BroadcastIdx) + return SDValue(); + + assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with " + "a sorted mask where the broadcast " + "comes from V1."); + + // Go up the chain of (vector) values to try and find a scalar load that + // we can combine with the broadcast. + for (;;) { + switch (V.getOpcode()) { + case ISD::CONCAT_VECTORS: { + int OperandSize = Mask.size() / V.getNumOperands(); + V = V.getOperand(BroadcastIdx / OperandSize); + BroadcastIdx %= OperandSize; + continue; + } + + case ISD::INSERT_SUBVECTOR: { + SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1); + auto ConstantIdx = dyn_cast<ConstantSDNode>(V.getOperand(2)); + if (!ConstantIdx) + break; + + int BeginIdx = (int)ConstantIdx->getZExtValue(); + int EndIdx = + BeginIdx + (int)VInner.getValueType().getVectorNumElements(); + if (BroadcastIdx >= BeginIdx && BroadcastIdx < EndIdx) { + BroadcastIdx -= BeginIdx; + V = VInner; + } else { + V = VOuter; + } + continue; + } + } + break; + } + + // Check if this is a broadcast of a scalar. We special case lowering + // for scalars so that we can more effectively fold with loads. + if (V.getOpcode() == ISD::BUILD_VECTOR || + (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0)) { + V = V.getOperand(BroadcastIdx); + + // If the scalar isn't a load we can't broadcast from it in AVX1, only with + // AVX2. + if (!Subtarget->hasAVX2() && !isShuffleFoldableLoad(V)) + return SDValue(); + } else if (BroadcastIdx != 0 || !Subtarget->hasAVX2()) { + // We can't broadcast from a vector register w/o AVX2, and we can only + // broadcast from the zero-element of a vector register. + return SDValue(); + } + + return DAG.getNode(X86ISD::VBROADCAST, DL, VT, V); +} + /// \brief Handle lowering of 2-lane 64-bit floating point shuffles. /// /// This is the basis function for the 2-lane 64-bit shuffles as we have full @@ -6969,12 +8108,56 @@ static SDValue lowerV2F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, // Straight shuffle of a single input vector. Simulate this by using the // single input as both of the "inputs" to this instruction.. unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1); + + if (Subtarget->hasAVX()) { + // If we have AVX, we can use VPERMILPS which will allow folding a load + // into the shuffle. 
+ return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1, + DAG.getConstant(SHUFPDMask, MVT::i8)); + } + return DAG.getNode(X86ISD::SHUFP, SDLoc(Op), MVT::v2f64, V1, V1, DAG.getConstant(SHUFPDMask, MVT::i8)); } assert(Mask[0] >= 0 && Mask[0] < 2 && "Non-canonicalized blend!"); assert(Mask[1] >= 2 && "Non-canonicalized blend!"); + // Use dedicated unpack instructions for masks that match their pattern. + if (isShuffleEquivalent(Mask, 0, 2)) + return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2f64, V1, V2); + if (isShuffleEquivalent(Mask, 1, 3)) + return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v2f64, V1, V2); + + // If we have a single input, insert that into V1 if we can do so cheaply. + if ((Mask[0] >= 2) + (Mask[1] >= 2) == 1) { + if (SDValue Insertion = lowerVectorShuffleAsElementInsertion( + MVT::v2f64, DL, V1, V2, Mask, Subtarget, DAG)) + return Insertion; + // Try inverting the insertion since for v2 masks it is easy to do and we + // can't reliably sort the mask one way or the other. + int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2), + Mask[1] < 0 ? -1 : (Mask[1] ^ 2)}; + if (SDValue Insertion = lowerVectorShuffleAsElementInsertion( + MVT::v2f64, DL, V2, V1, InverseMask, Subtarget, DAG)) + return Insertion; + } + + // Try to use one of the special instruction patterns to handle two common + // blend patterns if a zero-blend above didn't work. + if (isShuffleEquivalent(Mask, 0, 3) || isShuffleEquivalent(Mask, 1, 3)) + if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG)) + // We can either use a special instruction to load over the low double or + // to move just the low double. + return DAG.getNode( + isShuffleFoldableLoad(V1S) ? X86ISD::MOVLPD : X86ISD::MOVSD, + DL, MVT::v2f64, V2, + DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S)); + + if (Subtarget->hasSSE41()) + if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask, + Subtarget, DAG)) + return Blend; + unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1); return DAG.getNode(X86ISD::SHUFP, SDLoc(Op), MVT::v2f64, V1, V2, DAG.getConstant(SHUFPDMask, MVT::i8)); @@ -6998,6 +8181,11 @@ static SDValue lowerV2I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!"); if (isSingleInputShuffleMask(Mask)) { + // Check for being able to broadcast a single element. + if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v2i64, DL, V1, + Mask, Subtarget, DAG)) + return Broadcast; + // Straight shuffle of a single input vector. For everything from SSE2 // onward this has a single fast instruction with no scary immediates. // We have to map the mask as it is actually a v4i32 shuffle instruction. @@ -7011,6 +8199,44 @@ static SDValue lowerV2I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, getV4X86ShuffleImm8ForMask(WidenedMask, DAG))); } + // If we have a single input from V2 insert that into V1 if we can do so + // cheaply. + if ((Mask[0] >= 2) + (Mask[1] >= 2) == 1) { + if (SDValue Insertion = lowerVectorShuffleAsElementInsertion( + MVT::v2i64, DL, V1, V2, Mask, Subtarget, DAG)) + return Insertion; + // Try inverting the insertion since for v2 masks it is easy to do and we + // can't reliably sort the mask one way or the other. + int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2), + Mask[1] < 0 ? 
-1 : (Mask[1] ^ 2)}; + if (SDValue Insertion = lowerVectorShuffleAsElementInsertion( + MVT::v2i64, DL, V2, V1, InverseMask, Subtarget, DAG)) + return Insertion; + } + + // Use dedicated unpack instructions for masks that match their pattern. + if (isShuffleEquivalent(Mask, 0, 2)) + return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, V1, V2); + if (isShuffleEquivalent(Mask, 1, 3)) + return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v2i64, V1, V2); + + if (Subtarget->hasSSE41()) + if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask, + Subtarget, DAG)) + return Blend; + + // Try to use byte shift instructions. + if (SDValue Shift = lowerVectorShuffleAsByteShift( + DL, MVT::v2i64, V1, V2, Mask, DAG)) + return Shift; + + // Try to use byte rotation instructions. + // Its more profitable for pre-SSSE3 to use shuffles/unpacks. + if (Subtarget->hasSSSE3()) + if (SDValue Rotate = lowerVectorShuffleAsByteRotate( + DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG)) + return Rotate; + // We implement this with SHUFPD which is pretty lame because it will likely // incur 2 cycles of stall for integer vectors on Nehalem and older chips. // However, all the alternatives are still more cycles and newer chips don't @@ -7021,38 +8247,25 @@ static SDValue lowerV2I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask)); } -/// \brief Lower 4-lane 32-bit floating point shuffles. +/// \brief Lower a vector shuffle using the SHUFPS instruction. /// -/// Uses instructions exclusively from the floating point unit to minimize -/// domain crossing penalties, as these are sufficient to implement all v4f32 -/// shuffles. -static SDValue lowerV4F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, - const X86Subtarget *Subtarget, - SelectionDAG &DAG) { - SDLoc DL(Op); - assert(Op.getSimpleValueType() == MVT::v4f32 && "Bad shuffle type!"); - assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!"); - assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!"); - ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); - ArrayRef<int> Mask = SVOp->getMask(); - assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!"); - +/// This is a helper routine dedicated to lowering vector shuffles using SHUFPS. +/// It makes no assumptions about whether this is the *best* lowering, it simply +/// uses it. +static SDValue lowerVectorShuffleWithSHUFPS(SDLoc DL, MVT VT, + ArrayRef<int> Mask, SDValue V1, + SDValue V2, SelectionDAG &DAG) { SDValue LowV = V1, HighV = V2; int NewMask[4] = {Mask[0], Mask[1], Mask[2], Mask[3]}; int NumV2Elements = std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; }); - if (NumV2Elements == 0) - // Straight shuffle of a single input vector. We pass the input vector to - // both operands to simulate this with a SHUFPS. - return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1, - getV4X86ShuffleImm8ForMask(Mask, DAG)); - if (NumV2Elements == 1) { int V2Index = std::find_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; }) - Mask.begin(); + // Compute the index adjacent to V2Index and in the same half by toggling // the low bit. int V2AdjIndex = V2Index ^ 1; @@ -7069,7 +8282,7 @@ static SDValue lowerV4F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, // To make this work, blend them together as the first step. 
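Two small pieces of bit-twiddling in the two-lane lowerings above are worth seeing in isolation: the SHUFPD immediate is one selector bit per result lane, and the "inverted" insertion retry simply XORs each defined mask entry with 2 so that it names the other operand. A sketch with hypothetical helpers mirroring the expressions in the code:

    #include <cstdio>

    // Immediate for SHUFPD with a canonical two-lane blend mask: Mask[0]
    // selects within V1 (0 or 1) and Mask[1] selects within V2 (2 or 3),
    // mirroring (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1) above.
    static unsigned shufpdImm(int M0, int M1) {
      return (unsigned)(M0 == 1) | ((unsigned)((M1 - 2) == 1) << 1);
    }

    // Swap which operand a two-lane mask refers to: 0<->2, 1<->3, undef stays.
    static void invertV2Mask(int Mask[2]) {
      for (int i = 0; i < 2; ++i)
        if (Mask[i] >= 0)
          Mask[i] ^= 2;
    }

    int main() {
      // <1, 2>: high element of V1 into lane 0, low element of V2 into lane 1.
      std::printf("imm = %u\n", shufpdImm(1, 2));      // prints 1 (0b01)
      int Mask[2] = {3, 0};
      invertV2Mask(Mask);                              // becomes <1, 2>
      std::printf("inverted = <%d, %d>\n", Mask[0], Mask[1]);
    }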
int V1Index = V2AdjIndex; int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0}; - V2 = DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V2, V1, + V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1, getV4X86ShuffleImm8ForMask(BlendMask, DAG)); // Now proceed to reconstruct the final blend as we have the necessary @@ -7086,9 +8299,17 @@ static SDValue lowerV4F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, } else if (NumV2Elements == 2) { if (Mask[0] < 4 && Mask[1] < 4) { // Handle the easy case where we have V1 in the low lanes and V2 in the - // high lanes. We never see this reversed because we sort the shuffle. + // high lanes. NewMask[2] -= 4; NewMask[3] -= 4; + } else if (Mask[2] < 4 && Mask[3] < 4) { + // We also handle the reversed case because this utility may get called + // when we detect a SHUFPS pattern but can't easily commute the shuffle to + // arrange things in the right direction. + NewMask[0] -= 4; + NewMask[1] -= 4; + HighV = V1; + LowV = V2; } else { // We have a mixture of V1 and V2 in both low and high lanes. Rather than // trying to place elements directly, just blend them and set up the final @@ -7100,7 +8321,7 @@ static SDValue lowerV4F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, Mask[2] < 4 ? Mask[2] : Mask[3], (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4, (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4}; - V1 = DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V2, + V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2, getV4X86ShuffleImm8ForMask(BlendMask, DAG)); // Now we do a normal shuffle of V1 by giving V1 as both operands to @@ -7112,10 +8333,116 @@ static SDValue lowerV4F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, NewMask[3] = Mask[2] < 4 ? 3 : 1; } } - return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, LowV, HighV, + return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV, getV4X86ShuffleImm8ForMask(NewMask, DAG)); } +/// \brief Lower 4-lane 32-bit floating point shuffles. +/// +/// Uses instructions exclusively from the floating point unit to minimize +/// domain crossing penalties, as these are sufficient to implement all v4f32 +/// shuffles. +static SDValue lowerV4F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDLoc DL(Op); + assert(Op.getSimpleValueType() == MVT::v4f32 && "Bad shuffle type!"); + assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!"); + assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!"); + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); + ArrayRef<int> Mask = SVOp->getMask(); + assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!"); + + int NumV2Elements = + std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; }); + + if (NumV2Elements == 0) { + // Check for being able to broadcast a single element. + if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v4f32, DL, V1, + Mask, Subtarget, DAG)) + return Broadcast; + + if (Subtarget->hasAVX()) { + // If we have AVX, we can use VPERMILPS which will allow folding a load + // into the shuffle. + return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1, + getV4X86ShuffleImm8ForMask(Mask, DAG)); + } + + // Otherwise, use a straight shuffle of a single input vector. We pass the + // input vector to both operands to simulate this with a SHUFPS. + return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1, + getV4X86ShuffleImm8ForMask(Mask, DAG)); + } + + // Use dedicated unpack instructions for masks that match their pattern. 
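For context on why the helper above goes through the blend-then-shuffle dance: SHUFPS can only build its low two result lanes from its first operand and its high two lanes from its second. A quick illustrative check of whether a 4-lane two-input mask already fits that shape directly (hypothetical helper; undef lanes are not handled):

    #include <cstdio>

    // Return the 8-bit SHUFPS immediate if the mask takes its low two lanes
    // from the first operand and its high two from the second, or -1 if it
    // needs the blending and half-swapping performed above.
    static int directShufpsImm(const int Mask[4]) {
      if (Mask[0] >= 4 || Mask[1] >= 4 || Mask[2] < 4 || Mask[3] < 4)
        return -1;
      return (Mask[0] & 3) | ((Mask[1] & 3) << 2) |
             ((Mask[2] & 3) << 4) | ((Mask[3] & 3) << 6);
    }

    int main() {
      int Easy[4] = {2, 1, 7, 4}; // low lanes from V1, high lanes from V2
      int Hard[4] = {0, 5, 2, 7}; // mixed halves: needs the two-step lowering
      std::printf("%d %d\n", directShufpsImm(Easy), directShufpsImm(Hard));
    }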
+ if (isShuffleEquivalent(Mask, 0, 4, 1, 5)) + return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4f32, V1, V2); + if (isShuffleEquivalent(Mask, 2, 6, 3, 7)) + return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f32, V1, V2); + + // There are special ways we can lower some single-element blends. However, we + // have custom ways we can lower more complex single-element blends below that + // we defer to if both this and BLENDPS fail to match, so restrict this to + // when the V2 input is targeting element 0 of the mask -- that is the fast + // case here. + if (NumV2Elements == 1 && Mask[0] >= 4) + if (SDValue V = lowerVectorShuffleAsElementInsertion(MVT::v4f32, DL, V1, V2, + Mask, Subtarget, DAG)) + return V; + + if (Subtarget->hasSSE41()) + if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask, + Subtarget, DAG)) + return Blend; + + // Check for whether we can use INSERTPS to perform the blend. We only use + // INSERTPS when the V1 elements are already in the correct locations + // because otherwise we can just always use two SHUFPS instructions which + // are much smaller to encode than a SHUFPS and an INSERTPS. + if (NumV2Elements == 1 && Subtarget->hasSSE41()) { + int V2Index = + std::find_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; }) - + Mask.begin(); + + // When using INSERTPS we can zero any lane of the destination. Collect + // the zero inputs into a mask and drop them from the lanes of V1 which + // actually need to be present as inputs to the INSERTPS. + SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2); + + // Synthesize a shuffle mask for the non-zero and non-v2 inputs. + bool InsertNeedsShuffle = false; + unsigned ZMask = 0; + for (int i = 0; i < 4; ++i) + if (i != V2Index) { + if (Zeroable[i]) { + ZMask |= 1 << i; + } else if (Mask[i] != i) { + InsertNeedsShuffle = true; + break; + } + } + + // We don't want to use INSERTPS or other insertion techniques if it will + // require shuffling anyways. + if (!InsertNeedsShuffle) { + // If all of V1 is zeroable, replace it with undef. + if ((ZMask | 1 << V2Index) == 0xF) + V1 = DAG.getUNDEF(MVT::v4f32); + + unsigned InsertPSMask = (Mask[V2Index] - 4) << 6 | V2Index << 4 | ZMask; + assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!"); + + // Insert the V2 element into the desired position. + return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2, + DAG.getConstant(InsertPSMask, MVT::i8)); + } + } + + // Otherwise fall back to a SHUFPS lowering strategy. + return lowerVectorShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG); +} + /// \brief Lower 4-lane i32 vector shuffles. /// /// We try to handle these with integer-domain shuffles where we can, but for @@ -7131,11 +8458,66 @@ static SDValue lowerV4I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, ArrayRef<int> Mask = SVOp->getMask(); assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!"); - if (isSingleInputShuffleMask(Mask)) + // Whenever we can lower this as a zext, that instruction is strictly faster + // than any alternative. It also allows us to fold memory operands into the + // shuffle in many cases. + if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v4i32, V1, V2, + Mask, Subtarget, DAG)) + return ZExt; + + int NumV2Elements = + std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; }); + + if (NumV2Elements == 0) { + // Check for being able to broadcast a single element. 
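The INSERTPS immediate built above packs three fields: bits 7:6 select the source element of V2, bits 5:4 the destination lane, and bits 3:0 are a zero mask. A sketch of the encoding (hypothetical helper; it assumes the caller already verified that the remaining V1 lanes are in place, as the code above does):

    #include <cassert>
    #include <cstdio>

    // Encode an INSERTPS immediate: take element SrcIdx of the second source,
    // write it into lane DstIdx of the result, and clear the lanes set in ZMask.
    static unsigned insertpsImm(unsigned SrcIdx, unsigned DstIdx, unsigned ZMask) {
      assert(SrcIdx < 4 && DstIdx < 4 && ZMask < 16 && "fields are 2/2/4 bits");
      return (SrcIdx << 6) | (DstIdx << 4) | ZMask;
    }

    int main() {
      // Mask <4, 1, 2, u> with lane 3 zeroable and V1's lanes 1 and 2 already
      // in place: insert V2[0] into lane 0 and zero lane 3, giving 0x08.
      std::printf("0x%02X\n", insertpsImm(/*SrcIdx=*/0, /*DstIdx=*/0, /*ZMask=*/0x8));
    }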
+ if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v4i32, DL, V1, + Mask, Subtarget, DAG)) + return Broadcast; + // Straight shuffle of a single input vector. For everything from SSE2 // onward this has a single fast instruction with no scary immediates. + // We coerce the shuffle pattern to be compatible with UNPCK instructions + // but we aren't actually going to use the UNPCK instruction because doing + // so prevents folding a load into this instruction or making a copy. + const int UnpackLoMask[] = {0, 0, 1, 1}; + const int UnpackHiMask[] = {2, 2, 3, 3}; + if (isShuffleEquivalent(Mask, 0, 0, 1, 1)) + Mask = UnpackLoMask; + else if (isShuffleEquivalent(Mask, 2, 2, 3, 3)) + Mask = UnpackHiMask; + return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1, getV4X86ShuffleImm8ForMask(Mask, DAG)); + } + + // There are special ways we can lower some single-element blends. + if (NumV2Elements == 1) + if (SDValue V = lowerVectorShuffleAsElementInsertion(MVT::v4i32, DL, V1, V2, + Mask, Subtarget, DAG)) + return V; + + // Use dedicated unpack instructions for masks that match their pattern. + if (isShuffleEquivalent(Mask, 0, 4, 1, 5)) + return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i32, V1, V2); + if (isShuffleEquivalent(Mask, 2, 6, 3, 7)) + return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i32, V1, V2); + + if (Subtarget->hasSSE41()) + if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask, + Subtarget, DAG)) + return Blend; + + // Try to use byte shift instructions. + if (SDValue Shift = lowerVectorShuffleAsByteShift( + DL, MVT::v4i32, V1, V2, Mask, DAG)) + return Shift; + + // Try to use byte rotation instructions. + // Its more profitable for pre-SSSE3 to use shuffles/unpacks. + if (Subtarget->hasSSSE3()) + if (SDValue Rotate = lowerVectorShuffleAsByteRotate( + DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG)) + return Rotate; // We implement this with SHUFPS because it can blend from two vectors. // Because we're going to eventually use SHUFPS, we use SHUFPS even to build @@ -7188,6 +8570,27 @@ static SDValue lowerV8I16SingleInputVectorShuffle( MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL); MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH); + // Check for being able to broadcast a single element. + if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v8i16, DL, V, + Mask, Subtarget, DAG)) + return Broadcast; + + // Use dedicated unpack instructions for masks that match their pattern. + if (isShuffleEquivalent(Mask, 0, 0, 1, 1, 2, 2, 3, 3)) + return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, V, V); + if (isShuffleEquivalent(Mask, 4, 4, 5, 5, 6, 6, 7, 7)) + return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i16, V, V); + + // Try to use byte shift instructions. + if (SDValue Shift = lowerVectorShuffleAsByteShift( + DL, MVT::v8i16, V, V, Mask, DAG)) + return Shift; + + // Try to use byte rotation instructions. + if (SDValue Rotate = lowerVectorShuffleAsByteRotate( + DL, MVT::v8i16, V, V, Mask, Subtarget, DAG)) + return Rotate; + // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all // such inputs we can swap two of the dwords across the half mark and end up // with <=2 inputs to each half in each half. 
Once there, we can fall through @@ -7196,22 +8599,126 @@ static SDValue lowerV8I16SingleInputVectorShuffle( // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h] // Mask: [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5] // - // Before we had 3-1 in the low half and 3-1 in the high half. Afterward, 2-2 - // and 2-2. - auto balanceSides = [&](ArrayRef<int> ThreeInputs, int OneInput, - int ThreeInputHalfSum, int OneInputHalfOffset) { + // However in some very rare cases we have a 1-into-3 or 3-into-1 on one half + // and an existing 2-into-2 on the other half. In this case we may have to + // pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or + // 1-into-3 which could cause us to cycle endlessly fixing each side in turn. + // Fortunately, we don't have to handle anything but a 2-into-2 pattern + // because any other situation (including a 3-into-1 or 1-into-3 in the other + // half than the one we target for fixing) will be fixed when we re-enter this + // path. We will also combine away any sequence of PSHUFD instructions that + // result into a single instruction. Here is an example of the tricky case: + // + // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h] + // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3] + // + // This now has a 1-into-3 in the high half! Instead, we do two shuffles: + // + // Input: [a, b, c, d, e, f, g, h] PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h] + // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -----------------> [3, 7, 1, 0, 2, 7, 3, 6] + // + // Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h] + // Mask: [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6] + // + // The result is fine to be handled by the generic logic. + auto balanceSides = [&](ArrayRef<int> AToAInputs, ArrayRef<int> BToAInputs, + ArrayRef<int> BToBInputs, ArrayRef<int> AToBInputs, + int AOffset, int BOffset) { + assert((AToAInputs.size() == 3 || AToAInputs.size() == 1) && + "Must call this with A having 3 or 1 inputs from the A half."); + assert((BToAInputs.size() == 1 || BToAInputs.size() == 3) && + "Must call this with B having 1 or 3 inputs from the B half."); + assert(AToAInputs.size() + BToAInputs.size() == 4 && + "Must call this with either 3:1 or 1:3 inputs (summing to 4)."); + // Compute the index of dword with only one word among the three inputs in // a half by taking the sum of the half with three inputs and subtracting // the sum of the actual three inputs. The difference is the remaining // slot. - int DWordA = (ThreeInputHalfSum - - std::accumulate(ThreeInputs.begin(), ThreeInputs.end(), 0)) / - 2; - int DWordB = OneInputHalfOffset / 2 + (OneInput / 2 + 1) % 2; + int ADWord, BDWord; + int &TripleDWord = AToAInputs.size() == 3 ? ADWord : BDWord; + int &OneInputDWord = AToAInputs.size() == 3 ? BDWord : ADWord; + int TripleInputOffset = AToAInputs.size() == 3 ? AOffset : BOffset; + ArrayRef<int> TripleInputs = AToAInputs.size() == 3 ? AToAInputs : BToAInputs; + int OneInput = AToAInputs.size() == 3 ? BToAInputs[0] : AToAInputs[0]; + int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset); + int TripleNonInputIdx = + TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0); + TripleDWord = TripleNonInputIdx / 2; + + // We use xor with one to compute the adjacent DWord to whichever one the + // OneInput is in. 
+ OneInputDWord = (OneInput / 2) ^ 1; + + // Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA + // and BToA inputs. If there is also such a problem with the BToB and AToB + // inputs, we don't try to fix it necessarily -- we'll recurse and see it in + // the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it + // is essential that we don't *create* a 3<-1 as then we might oscillate. + if (BToBInputs.size() == 2 && AToBInputs.size() == 2) { + // Compute how many inputs will be flipped by swapping these DWords. We + // need + // to balance this to ensure we don't form a 3-1 shuffle in the other + // half. + int NumFlippedAToBInputs = + std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord) + + std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord + 1); + int NumFlippedBToBInputs = + std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord) + + std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord + 1); + if ((NumFlippedAToBInputs == 1 && + (NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) || + (NumFlippedBToBInputs == 1 && + (NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) { + // We choose whether to fix the A half or B half based on whether that + // half has zero flipped inputs. At zero, we may not be able to fix it + // with that half. We also bias towards fixing the B half because that + // will more commonly be the high half, and we have to bias one way. + auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord, + ArrayRef<int> Inputs) { + int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot. + bool IsFixIdxInput = std::find(Inputs.begin(), Inputs.end(), + PinnedIdx ^ 1) != Inputs.end(); + // Determine whether the free index is in the flipped dword or the + // unflipped dword based on where the pinned index is. We use this bit + // in an xor to conditionally select the adjacent dword. + int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord)); + bool IsFixFreeIdxInput = std::find(Inputs.begin(), Inputs.end(), + FixFreeIdx) != Inputs.end(); + if (IsFixIdxInput == IsFixFreeIdxInput) + FixFreeIdx += 1; + IsFixFreeIdxInput = std::find(Inputs.begin(), Inputs.end(), + FixFreeIdx) != Inputs.end(); + assert(IsFixIdxInput != IsFixFreeIdxInput && + "We need to be changing the number of flipped inputs!"); + int PSHUFHalfMask[] = {0, 1, 2, 3}; + std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]); + V = DAG.getNode(FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL, + MVT::v8i16, V, + getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DAG)); + + for (int &M : Mask) + if (M != -1 && M == FixIdx) + M = FixFreeIdx; + else if (M != -1 && M == FixFreeIdx) + M = FixIdx; + }; + if (NumFlippedBToBInputs != 0) { + int BPinnedIdx = + BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput; + FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs); + } else { + assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!"); + int APinnedIdx = + AToAInputs.size() == 3 ? TripleNonInputIdx : OneInput; + FixFlippedInputs(APinnedIdx, ADWord, AToBInputs); + } + } + } int PSHUFDMask[] = {0, 1, 2, 3}; - PSHUFDMask[DWordA] = DWordB; - PSHUFDMask[DWordB] = DWordA; + PSHUFDMask[ADWord] = BDWord; + PSHUFDMask[BDWord] = ADWord; V = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V), @@ -7219,24 +8726,20 @@ static SDValue lowerV8I16SingleInputVectorShuffle( // Adjust the mask to match the new locations of A and B. 
for (int &M : Mask) - if (M != -1 && M/2 == DWordA) - M = 2 * DWordB + M % 2; - else if (M != -1 && M/2 == DWordB) - M = 2 * DWordA + M % 2; + if (M != -1 && M/2 == ADWord) + M = 2 * BDWord + M % 2; + else if (M != -1 && M/2 == BDWord) + M = 2 * ADWord + M % 2; // Recurse back into this routine to re-compute state now that this isn't // a 3 and 1 problem. return DAG.getVectorShuffle(MVT::v8i16, DL, V, DAG.getUNDEF(MVT::v8i16), Mask); }; - if (NumLToL == 3 && NumHToL == 1) - return balanceSides(LToLInputs, HToLInputs[0], 0 + 1 + 2 + 3, 4); - else if (NumLToL == 1 && NumHToL == 3) - return balanceSides(HToLInputs, LToLInputs[0], 4 + 5 + 6 + 7, 0); - else if (NumLToH == 1 && NumHToH == 3) - return balanceSides(HToHInputs, LToHInputs[0], 4 + 5 + 6 + 7, 0); - else if (NumLToH == 3 && NumHToH == 1) - return balanceSides(LToHInputs, HToHInputs[0], 0 + 1 + 2 + 3, 4); + if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3)) + return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4); + else if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3)) + return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0); // At this point there are at most two inputs to the low and high halves from // each half. That means the inputs can always be grouped into dwords and @@ -7250,9 +8753,10 @@ static SDValue lowerV8I16SingleInputVectorShuffle( // First fix the masks for all the inputs that are staying in their // original halves. This will then dictate the targets of the cross-half // shuffles. - auto fixInPlaceInputs = [&PSHUFDMask]( - ArrayRef<int> InPlaceInputs, MutableArrayRef<int> SourceHalfMask, - MutableArrayRef<int> HalfMask, int HalfOffset) { + auto fixInPlaceInputs = + [&PSHUFDMask](ArrayRef<int> InPlaceInputs, ArrayRef<int> IncomingInputs, + MutableArrayRef<int> SourceHalfMask, + MutableArrayRef<int> HalfMask, int HalfOffset) { if (InPlaceInputs.empty()) return; if (InPlaceInputs.size() == 1) { @@ -7261,6 +8765,14 @@ static SDValue lowerV8I16SingleInputVectorShuffle( PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2; return; } + if (IncomingInputs.empty()) { + // Just fix all of the in place inputs. + for (int Input : InPlaceInputs) { + SourceHalfMask[Input - HalfOffset] = Input - HalfOffset; + PSHUFDMask[Input / 2] = Input / 2; + } + return; + } assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!"); SourceHalfMask[InPlaceInputs[0] - HalfOffset] = @@ -7272,10 +8784,8 @@ static SDValue lowerV8I16SingleInputVectorShuffle( std::replace(HalfMask.begin(), HalfMask.end(), InPlaceInputs[1], AdjIndex); PSHUFDMask[AdjIndex / 2] = AdjIndex / 2; }; - if (!HToLInputs.empty()) - fixInPlaceInputs(LToLInputs, PSHUFLMask, LoMask, 0); - if (!LToHInputs.empty()) - fixInPlaceInputs(HToHInputs, PSHUFHMask, HiMask, 4); + fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0); + fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4); // Now gather the cross-half inputs and place them into a free dword of // their target half. 
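The renumbering step after the PSHUFD dword swap is mechanical but easy to get backwards, so here it is in isolation, reproducing the example from the comment above (a sketch in v8i16 lane numbering; ADWord and BDWord are the two dword indices that were exchanged):

    #include <cstdio>

    // After exchanging dwords ADWord and BDWord of the vector with PSHUFD,
    // every mask entry that pointed into one of those dwords must be
    // redirected into the other, preserving its word position within the pair.
    static void renumberAfterDWordSwap(int Mask[8], int ADWord, int BDWord) {
      for (int i = 0; i < 8; ++i) {
        int &M = Mask[i];
        if (M != -1 && M / 2 == ADWord)
          M = 2 * BDWord + M % 2;
        else if (M != -1 && M / 2 == BDWord)
          M = 2 * ADWord + M % 2;
      }
    }

    int main() {
      // The example from the comment above: PSHUFD[0,2,1,3] swaps dwords 1 and 2,
      // turning mask [0, 1, 2, 7, 4, 5, 6, 3] into [0, 1, 4, 7, 2, 3, 6, 5].
      int Mask[8] = {0, 1, 2, 7, 4, 5, 6, 3};
      renumberAfterDWordSwap(Mask, 1, 2);
      for (int M : Mask)
        std::printf("%d ", M);
      std::printf("\n");
    }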
@@ -7284,7 +8794,8 @@ static SDValue lowerV8I16SingleInputVectorShuffle( auto moveInputsToRightHalf = [&PSHUFDMask]( MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs, MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask, - int SourceOffset, int DestOffset) { + MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset, + int DestOffset) { auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) { return SourceHalfMask[Word] != -1 && SourceHalfMask[Word] != Word; }; @@ -7310,7 +8821,7 @@ static SDValue lowerV8I16SingleInputVectorShuffle( Input - SourceOffset; // We have to swap the uses in our half mask in one sweep. for (int &M : HalfMask) - if (M == SourceHalfMask[Input - SourceOffset]) + if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset) M = Input; else if (M == Input) M = SourceHalfMask[Input - SourceOffset] + SourceOffset; @@ -7362,18 +8873,68 @@ static SDValue lowerV8I16SingleInputVectorShuffle( } else if (IncomingInputs.size() == 2) { if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 || isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) { - int SourceDWordBase = !isDWordClobbered(SourceHalfMask, 0) ? 0 : 2; - assert(!isDWordClobbered(SourceHalfMask, SourceDWordBase) && - "Not all dwords can be clobbered!"); - SourceHalfMask[SourceDWordBase] = IncomingInputs[0] - SourceOffset; - SourceHalfMask[SourceDWordBase + 1] = IncomingInputs[1] - SourceOffset; + // We have two non-adjacent or clobbered inputs we need to extract from + // the source half. To do this, we need to map them into some adjacent + // dword slot in the source mask. + int InputsFixed[2] = {IncomingInputs[0] - SourceOffset, + IncomingInputs[1] - SourceOffset}; + + // If there is a free slot in the source half mask adjacent to one of + // the inputs, place the other input in it. We use (Index XOR 1) to + // compute an adjacent index. + if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) && + SourceHalfMask[InputsFixed[0] ^ 1] == -1) { + SourceHalfMask[InputsFixed[0]] = InputsFixed[0]; + SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1]; + InputsFixed[1] = InputsFixed[0] ^ 1; + } else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) && + SourceHalfMask[InputsFixed[1] ^ 1] == -1) { + SourceHalfMask[InputsFixed[1]] = InputsFixed[1]; + SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0]; + InputsFixed[0] = InputsFixed[1] ^ 1; + } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] == -1 && + SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] == -1) { + // The two inputs are in the same DWord but it is clobbered and the + // adjacent DWord isn't used at all. Move both inputs to the free + // slot. + SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0]; + SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1]; + InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1); + InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1; + } else { + // The only way we hit this point is if there is no clobbering + // (because there are no off-half inputs to this half) and there is no + // free slot adjacent to one of the inputs. In this case, we have to + // swap an input with a non-input. 
+ for (int i = 0; i < 4; ++i) + assert((SourceHalfMask[i] == -1 || SourceHalfMask[i] == i) && + "We can't handle any clobbers here!"); + assert(InputsFixed[1] != (InputsFixed[0] ^ 1) && + "Cannot have adjacent inputs here!"); + + SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1]; + SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1; + + // We also have to update the final source mask in this case because + // it may need to undo the above swap. + for (int &M : FinalSourceHalfMask) + if (M == (InputsFixed[0] ^ 1) + SourceOffset) + M = InputsFixed[1] + SourceOffset; + else if (M == InputsFixed[1] + SourceOffset) + M = (InputsFixed[0] ^ 1) + SourceOffset; + + InputsFixed[1] = InputsFixed[0] ^ 1; + } + + // Point everything at the fixed inputs. for (int &M : HalfMask) if (M == IncomingInputs[0]) - M = SourceDWordBase + SourceOffset; + M = InputsFixed[0] + SourceOffset; else if (M == IncomingInputs[1]) - M = SourceDWordBase + 1 + SourceOffset; - IncomingInputs[0] = SourceDWordBase + SourceOffset; - IncomingInputs[1] = SourceDWordBase + 1 + SourceOffset; + M = InputsFixed[1] + SourceOffset; + + IncomingInputs[0] = InputsFixed[0] + SourceOffset; + IncomingInputs[1] = InputsFixed[1] + SourceOffset; } } else { llvm_unreachable("Unhandled input size!"); @@ -7383,13 +8944,14 @@ static SDValue lowerV8I16SingleInputVectorShuffle( int FreeDWord = (PSHUFDMask[DestOffset / 2] == -1 ? 0 : 1) + DestOffset / 2; assert(PSHUFDMask[FreeDWord] == -1 && "DWord not free"); PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2; - for (int Input : IncomingInputs) - std::replace(HalfMask.begin(), HalfMask.end(), Input, - FreeDWord * 2 + Input % 2); + for (int &M : HalfMask) + for (int Input : IncomingInputs) + if (M == Input) + M = FreeDWord * 2 + Input % 2; }; - moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, + moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask, /*SourceOffset*/ 4, /*DestOffset*/ 0); - moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, + moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask, /*SourceOffset*/ 0, /*DestOffset*/ 4); // Now enact all the shuffles we've computed to move the inputs into their @@ -7526,34 +9088,37 @@ static SDValue lowerV8I16BasicBlendVectorShuffle(SDLoc DL, SDValue V1, if (GoodInputs.size() == 2) { // If the low inputs are spread across two dwords, pack them into // a single dword. - MoveMask[Mask[GoodInputs[0]] % 2 + MoveOffset] = - Mask[GoodInputs[0]] - MaskOffset; - MoveMask[Mask[GoodInputs[1]] % 2 + MoveOffset] = - Mask[GoodInputs[1]] - MaskOffset; - Mask[GoodInputs[0]] = Mask[GoodInputs[0]] % 2 + MoveOffset + MaskOffset; - Mask[GoodInputs[1]] = Mask[GoodInputs[0]] % 2 + MoveOffset + MaskOffset; + MoveMask[MoveOffset] = Mask[GoodInputs[0]] - MaskOffset; + MoveMask[MoveOffset + 1] = Mask[GoodInputs[1]] - MaskOffset; + Mask[GoodInputs[0]] = MoveOffset + MaskOffset; + Mask[GoodInputs[1]] = MoveOffset + 1 + MaskOffset; } else { - // Otherwise pin the low inputs. + // Otherwise pin the good inputs. for (int GoodInput : GoodInputs) MoveMask[Mask[GoodInput] - MaskOffset] = Mask[GoodInput] - MaskOffset; } - int MoveMaskIdx = - std::find(std::begin(MoveMask) + MoveOffset, std::end(MoveMask), -1) - - std::begin(MoveMask); - assert(MoveMaskIdx >= MoveOffset && "Established above"); - if (BadInputs.size() == 2) { + // If we have two bad inputs then there may be either one or two good + // inputs fixed in place. Find a fixed input, and then find the *other* + // two adjacent indices by using modular arithmetic. 
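Two index idioms recur in the fix-up logic above and in the blend code that follows: Idx ^ 1 gives the other word of the same dword, and 2 * ((Idx / 2) ^ 1) gives the first word of the adjacent dword, which is also what the modular form below computes. A few self-checking lines (illustrative only, indices local to one 4-word half):

    #include <cassert>
    #include <cstdio>

    int main() {
      // Word indices 0..3 within one 4-word half of a v8i16 mask.
      for (int Idx = 0; Idx < 4; ++Idx) {
        int Partner = Idx ^ 1;                  // other word of the same dword
        int OtherDWordLo = 2 * ((Idx / 2) ^ 1); // first word of the other dword
        // The modular form used in the blend code computes the same thing.
        assert(OtherDWordLo == ((Idx & ~1) + 2) % 4);
        std::printf("word %d: dword partner %d, other dword starts at %d\n",
                    Idx, Partner, OtherDWordLo);
      }
    }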
+ int GoodMaskIdx = + std::find_if(std::begin(MoveMask) + MoveOffset, std::end(MoveMask), + [](int M) { return M >= 0; }) - + std::begin(MoveMask); + int MoveMaskIdx = + ((((GoodMaskIdx - MoveOffset) & ~1) + 2) % 4) + MoveOffset; assert(MoveMask[MoveMaskIdx] == -1 && "Expected empty slot"); assert(MoveMask[MoveMaskIdx + 1] == -1 && "Expected empty slot"); - MoveMask[MoveMaskIdx + Mask[BadInputs[0]] % 2] = - Mask[BadInputs[0]] - MaskOffset; - MoveMask[MoveMaskIdx + Mask[BadInputs[1]] % 2] = - Mask[BadInputs[1]] - MaskOffset; - Mask[BadInputs[0]] = MoveMaskIdx + Mask[BadInputs[0]] % 2 + MaskOffset; - Mask[BadInputs[1]] = MoveMaskIdx + Mask[BadInputs[1]] % 2 + MaskOffset; + MoveMask[MoveMaskIdx] = Mask[BadInputs[0]] - MaskOffset; + MoveMask[MoveMaskIdx + 1] = Mask[BadInputs[1]] - MaskOffset; + Mask[BadInputs[0]] = MoveMaskIdx + MaskOffset; + Mask[BadInputs[1]] = MoveMaskIdx + 1 + MaskOffset; } else { assert(BadInputs.size() == 1 && "All sizes handled"); + int MoveMaskIdx = std::find(std::begin(MoveMask) + MoveOffset, + std::end(MoveMask), -1) - + std::begin(MoveMask); MoveMask[MoveMaskIdx] = Mask[BadInputs[0]] - MaskOffset; Mask[BadInputs[0]] = MoveMaskIdx + MaskOffset; } @@ -7609,6 +9174,12 @@ static SDValue lowerV8I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2, assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!"); + // Whenever we can lower this as a zext, that instruction is strictly faster + // than any alternative. + if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend( + DL, MVT::v8i16, V1, V2, OrigMask, Subtarget, DAG)) + return ZExt; + auto isV1 = [](int M) { return M >= 0 && M < 8; }; auto isV2 = [](int M) { return M >= 8; }; @@ -7621,6 +9192,33 @@ static SDValue lowerV8I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2, assert(NumV1Inputs > 0 && "All single-input shuffles should be canonicalized " "to be V1-input shuffles."); + // There are special ways we can lower some single-element blends. + if (NumV2Inputs == 1) + if (SDValue V = lowerVectorShuffleAsElementInsertion(MVT::v8i16, DL, V1, V2, + Mask, Subtarget, DAG)) + return V; + + // Use dedicated unpack instructions for masks that match their pattern. + if (isShuffleEquivalent(Mask, 0, 8, 1, 9, 2, 10, 3, 11)) + return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, V1, V2); + if (isShuffleEquivalent(Mask, 4, 12, 5, 13, 6, 14, 7, 15)) + return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i16, V1, V2); + + if (Subtarget->hasSSE41()) + if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask, + Subtarget, DAG)) + return Blend; + + // Try to use byte shift instructions. + if (SDValue Shift = lowerVectorShuffleAsByteShift( + DL, MVT::v8i16, V1, V2, Mask, DAG)) + return Shift; + + // Try to use byte rotation instructions. + if (SDValue Rotate = lowerVectorShuffleAsByteRotate( + DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG)) + return Rotate; + if (NumV1Inputs + NumV2Inputs <= 4) return lowerV8I16BasicBlendVectorShuffle(DL, V1, V2, Mask, Subtarget, DAG); @@ -7664,6 +9262,74 @@ static SDValue lowerV8I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, LoV, HiV)); } +/// \brief Check whether a compaction lowering can be done by dropping even +/// elements and compute how many times even elements must be dropped. +/// +/// This handles shuffles which take every Nth element where N is a power of +/// two. 
Example shuffle masks: +/// +/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14 +/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 +/// N = 2: 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12 +/// N = 2: 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28 +/// N = 3: 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8 +/// N = 3: 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24 +/// +/// Any of these lanes can of course be undef. +/// +/// This routine only supports N <= 3. +/// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here +/// for larger N. +/// +/// \returns N above, or the number of times even elements must be dropped if +/// there is such a number. Otherwise returns zero. +static int canLowerByDroppingEvenElements(ArrayRef<int> Mask) { + // Figure out whether we're looping over two inputs or just one. + bool IsSingleInput = isSingleInputShuffleMask(Mask); + + // The modulus for the shuffle vector entries is based on whether this is + // a single input or not. + int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2); + assert(isPowerOf2_32((uint32_t)ShuffleModulus) && + "We should only be called with masks with a power-of-2 size!"); + + uint64_t ModMask = (uint64_t)ShuffleModulus - 1; + + // We track whether the input is viable for all power-of-2 strides 2^1, 2^2, + // and 2^3 simultaneously. This is because we may have ambiguity with + // partially undef inputs. + bool ViableForN[3] = {true, true, true}; + + for (int i = 0, e = Mask.size(); i < e; ++i) { + // Ignore undef lanes, we'll optimistically collapse them to the pattern we + // want. + if (Mask[i] == -1) + continue; + + bool IsAnyViable = false; + for (unsigned j = 0; j != array_lengthof(ViableForN); ++j) + if (ViableForN[j]) { + uint64_t N = j + 1; + + // The shuffle mask must be equal to (i * 2^N) % M. + if ((uint64_t)Mask[i] == (((uint64_t)i << N) & ModMask)) + IsAnyViable = true; + else + ViableForN[j] = false; + } + // Early exit if we exhaust the possible powers of two. + if (!IsAnyViable) + break; + } + + for (unsigned j = 0; j != array_lengthof(ViableForN); ++j) + if (ViableForN[j]) + return j + 1; + + // Return 0 as there is no viable power of two. + return 0; +} + /// \brief Generic lowering of v16i8 shuffles. /// /// This is a hybrid strategy to lower v16i8 vectors. It first attempts to @@ -7681,6 +9347,22 @@ static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2, ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); ArrayRef<int> OrigMask = SVOp->getMask(); assert(OrigMask.size() == 16 && "Unexpected mask size for v16 shuffle!"); + + // Try to use byte shift instructions. + if (SDValue Shift = lowerVectorShuffleAsByteShift( + DL, MVT::v16i8, V1, V2, OrigMask, DAG)) + return Shift; + + // Try to use byte rotation instructions. + if (SDValue Rotate = lowerVectorShuffleAsByteRotate( + DL, MVT::v16i8, V1, V2, OrigMask, Subtarget, DAG)) + return Rotate; + + // Try to use a zext lowering. 
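Since the even-element-dropping check is pure mask arithmetic, here is a compact standalone rendering to experiment with (the same checks, simplified naming, returning 0 when no power-of-two stride up to 3 fits):

    #include <cstdio>
    #include <vector>

    // Return N in {1, 2, 3} if Mask takes every 2^N-th element (modulo the
    // single- or double-input width), or 0 if no such stride fits.
    static int droppingEvenElementsStride(const std::vector<int> &Mask,
                                          bool IsSingleInput) {
      int Size = (int)Mask.size();
      unsigned long long ModMask =
          (unsigned long long)(Size * (IsSingleInput ? 1 : 2)) - 1;
      bool ViableForN[3] = {true, true, true};
      for (int i = 0; i < Size; ++i) {
        if (Mask[i] == -1)
          continue;                       // undef collapses to whatever we want
        bool AnyViable = false;
        for (int j = 0; j < 3; ++j)
          if (ViableForN[j]) {
            unsigned long long N = j + 1;
            if ((unsigned long long)Mask[i] ==
                (((unsigned long long)i << N) & ModMask))
              AnyViable = true;
            else
              ViableForN[j] = false;
          }
        if (!AnyViable)
          break;
      }
      for (int j = 0; j < 3; ++j)
        if (ViableForN[j])
          return j + 1;
      return 0;
    }

    int main() {
      // Two-input v16i8 mask taking every other byte, 0, 2, 4, ..., 30:
      // N == 1, i.e. one PACKUS after masking off the odd bytes.
      std::vector<int> M;
      for (int i = 0; i < 16; ++i)
        M.push_back(2 * i);
      std::printf("N = %d\n", droppingEvenElementsStride(M, /*IsSingleInput=*/false));
    }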
+ if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend( + DL, MVT::v16i8, V1, V2, OrigMask, Subtarget, DAG)) + return ZExt; + int MaskStorage[16] = { OrigMask[0], OrigMask[1], OrigMask[2], OrigMask[3], OrigMask[4], OrigMask[5], OrigMask[6], OrigMask[7], @@ -7690,8 +9372,16 @@ static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2, MutableArrayRef<int> LoMask = Mask.slice(0, 8); MutableArrayRef<int> HiMask = Mask.slice(8, 8); + int NumV2Elements = + std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 16; }); + // For single-input shuffles, there are some nicer lowering tricks we can use. - if (isSingleInputShuffleMask(Mask)) { + if (NumV2Elements == 0) { + // Check for being able to broadcast a single element. + if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v16i8, DL, V1, + Mask, Subtarget, DAG)) + return Broadcast; + // Check whether we can widen this to an i16 shuffle by duplicating bytes. // Notably, this handles splat and partial-splat shuffles more efficiently. // However, it only makes sense if the pre-duplication shuffle simplifies @@ -7701,10 +9391,10 @@ static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2, // FIXME: We should check for other patterns which can be widened into an // i16 shuffle as well. auto canWidenViaDuplication = [](ArrayRef<int> Mask) { - for (int i = 0; i < 16; i += 2) { - if (Mask[i] != Mask[i + 1]) + for (int i = 0; i < 16; i += 2) + if (Mask[i] != -1 && Mask[i + 1] != -1 && Mask[i] != Mask[i + 1]) return false; - } + return true; }; auto tryToWidenViaDuplication = [&]() -> SDValue { @@ -7765,11 +9455,16 @@ static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2, MVT::v16i8, V1, V1); int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1}; - for (int i = 0; i < 16; i += 2) { - if (Mask[i] != -1) - PostDupI16Shuffle[i / 2] = LaneMap[Mask[i]] - (TargetLo ? 0 : 8); - assert(PostDupI16Shuffle[i / 2] < 8 && "Invalid v8 shuffle mask!"); - } + for (int i = 0; i < 16; ++i) + if (Mask[i] != -1) { + int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8); + assert(MappedMask < 8 && "Invalid v8 shuffle mask!"); + if (PostDupI16Shuffle[i / 2] == -1) + PostDupI16Shuffle[i / 2] = MappedMask; + else + assert(PostDupI16Shuffle[i / 2] == MappedMask && + "Conflicting entrties in the original shuffle!"); + } return DAG.getNode( ISD::BITCAST, DL, MVT::v16i8, DAG.getVectorShuffle(MVT::v8i16, DL, @@ -7786,21 +9481,108 @@ static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2, // // FIXME: We need to handle other interleaving widths (i16, i32, ...). if (shouldLowerAsInterleaving(Mask)) { - // FIXME: Figure out whether we should pack these into the low or high - // halves. - - int EMask[16], OMask[16]; + int NumLoHalf = std::count_if(Mask.begin(), Mask.end(), [](int M) { + return (M >= 0 && M < 8) || (M >= 16 && M < 24); + }); + int NumHiHalf = std::count_if(Mask.begin(), Mask.end(), [](int M) { + return (M >= 8 && M < 16) || M >= 24; + }); + int EMask[16] = {-1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1}; + int OMask[16] = {-1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1}; + bool UnpackLo = NumLoHalf >= NumHiHalf; + MutableArrayRef<int> TargetEMask(UnpackLo ? EMask : EMask + 8, 8); + MutableArrayRef<int> TargetOMask(UnpackLo ? 
OMask : OMask + 8, 8); for (int i = 0; i < 8; ++i) { - EMask[i] = Mask[2*i]; - OMask[i] = Mask[2*i + 1]; - EMask[i + 8] = -1; - OMask[i + 8] = -1; + TargetEMask[i] = Mask[2 * i]; + TargetOMask[i] = Mask[2 * i + 1]; } SDValue Evens = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2, EMask); SDValue Odds = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2, OMask); - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, Evens, Odds); + return DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL, + MVT::v16i8, Evens, Odds); + } + + // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly + // with PSHUFB. It is important to do this before we attempt to generate any + // blends but after all of the single-input lowerings. If the single input + // lowerings can find an instruction sequence that is faster than a PSHUFB, we + // want to preserve that and we can DAG combine any longer sequences into + // a PSHUFB in the end. But once we start blending from multiple inputs, + // the complexity of DAG combining bad patterns back into PSHUFB is too high, + // and there are *very* few patterns that would actually be faster than the + // PSHUFB approach because of its ability to zero lanes. + // + // FIXME: The only exceptions to the above are blends which are exact + // interleavings with direct instructions supporting them. We currently don't + // handle those well here. + if (Subtarget->hasSSSE3()) { + SDValue V1Mask[16]; + SDValue V2Mask[16]; + for (int i = 0; i < 16; ++i) + if (Mask[i] == -1) { + V1Mask[i] = V2Mask[i] = DAG.getUNDEF(MVT::i8); + } else { + V1Mask[i] = DAG.getConstant(Mask[i] < 16 ? Mask[i] : 0x80, MVT::i8); + V2Mask[i] = + DAG.getConstant(Mask[i] < 16 ? 0x80 : Mask[i] - 16, MVT::i8); + } + V1 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, V1, + DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, V1Mask)); + if (isSingleInputShuffleMask(Mask)) + return V1; // Single inputs are easy. + + // Otherwise, blend the two. + V2 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, V2, + DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, V2Mask)); + return DAG.getNode(ISD::OR, DL, MVT::v16i8, V1, V2); + } + + // There are special ways we can lower some single-element blends. + if (NumV2Elements == 1) + if (SDValue V = lowerVectorShuffleAsElementInsertion(MVT::v16i8, DL, V1, V2, + Mask, Subtarget, DAG)) + return V; + + // Check whether a compaction lowering can be done. This handles shuffles + // which take every Nth element for some even N. See the helper function for + // details. + // + // We special case these as they can be particularly efficiently handled with + // the PACKUSB instruction on x86 and they show up in common patterns of + // rearranging bytes to truncate wide elements. + if (int NumEvenDrops = canLowerByDroppingEvenElements(Mask)) { + // NumEvenDrops is the power of two stride of the elements. Another way of + // thinking about it is that we need to drop the even elements this many + // times to get the original input. + bool IsSingleInput = isSingleInputShuffleMask(Mask); + + // First we need to zero all the dropped bytes. + assert(NumEvenDrops <= 3 && + "No support for dropping even elements more than 3 times."); + // We use the mask type to pick which bytes are preserved based on how many + // elements are dropped. 
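The SSSE3 path above relies on one PSHUFB property: any control byte with its high bit set writes a zero into that lane, so each input can be shuffled with the other input's lanes masked to 0x80 and the two results simply ORed together. A scalar model (illustrative only; plain byte arrays stand in for XMM registers):

    #include <cstdint>
    #include <cstdio>

    // Scalar model of PSHUFB: each result byte is Src[Control & 0x0F], or zero
    // when the control byte has its high bit set.
    static void pshufb(const uint8_t Src[16], const uint8_t Control[16],
                       uint8_t Out[16]) {
      for (int i = 0; i < 16; ++i)
        Out[i] = (Control[i] & 0x80) ? 0 : Src[Control[i] & 0x0F];
    }

    int main() {
      uint8_t V1[16], V2[16], M1[16], M2[16], T1[16], T2[16];
      for (int i = 0; i < 16; ++i) {
        V1[i] = (uint8_t)(i + 1);    // 1..16
        V2[i] = (uint8_t)(101 + i);  // 101..116
      }
      // Two-input mask alternating V1 and V2 bytes: 0, 16, 1, 17, ...
      // Lanes owned by the other input get 0x80 so PSHUFB writes zero there.
      for (int i = 0; i < 16; ++i) {
        int Lane = (i % 2 == 0) ? i / 2 : 16 + i / 2;
        M1[i] = Lane < 16 ? (uint8_t)Lane : 0x80;
        M2[i] = Lane < 16 ? 0x80 : (uint8_t)(Lane - 16);
      }
      pshufb(V1, M1, T1);
      pshufb(V2, M2, T2);
      for (int i = 0; i < 16; ++i)
        std::printf("%d ", T1[i] | T2[i]); // OR merges the two zero-filled halves
      std::printf("\n");
    }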
+ MVT MaskVTs[] = { MVT::v8i16, MVT::v4i32, MVT::v2i64 }; + SDValue ByteClearMask = + DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, + DAG.getConstant(0xFF, MaskVTs[NumEvenDrops - 1])); + V1 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V1, ByteClearMask); + if (!IsSingleInput) + V2 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V2, ByteClearMask); + + // Now pack things back together. + V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1); + V2 = IsSingleInput ? V1 : DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V2); + SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1, V2); + for (int i = 1; i < NumEvenDrops; ++i) { + Result = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, Result); + Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result); + } + + return Result; } int V1LoBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1}; @@ -7899,15 +9681,933 @@ static SDValue lower128BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2, } } -/// \brief Tiny helper function to test whether adjacent masks are sequential. -static bool areAdjacentMasksSequential(ArrayRef<int> Mask) { - for (int i = 0, Size = Mask.size(); i < Size; i += 2) - if (Mask[i] + 1 != Mask[i+1]) +/// \brief Helper function to test whether a shuffle mask could be +/// simplified by widening the elements being shuffled. +/// +/// Appends the mask for wider elements in WidenedMask if valid. Otherwise +/// leaves it in an unspecified state. +/// +/// NOTE: This must handle normal vector shuffle masks and *target* vector +/// shuffle masks. The latter have the special property of a '-2' representing +/// a zero-ed lane of a vector. +static bool canWidenShuffleElements(ArrayRef<int> Mask, + SmallVectorImpl<int> &WidenedMask) { + for (int i = 0, Size = Mask.size(); i < Size; i += 2) { + // If both elements are undef, its trivial. + if (Mask[i] == SM_SentinelUndef && Mask[i + 1] == SM_SentinelUndef) { + WidenedMask.push_back(SM_SentinelUndef); + continue; + } + + // Check for an undef mask and a mask value properly aligned to fit with + // a pair of values. If we find such a case, use the non-undef mask's value. + if (Mask[i] == SM_SentinelUndef && Mask[i + 1] >= 0 && Mask[i + 1] % 2 == 1) { + WidenedMask.push_back(Mask[i + 1] / 2); + continue; + } + if (Mask[i + 1] == SM_SentinelUndef && Mask[i] >= 0 && Mask[i] % 2 == 0) { + WidenedMask.push_back(Mask[i] / 2); + continue; + } + + // When zeroing, we need to spread the zeroing across both lanes to widen. + if (Mask[i] == SM_SentinelZero || Mask[i + 1] == SM_SentinelZero) { + if ((Mask[i] == SM_SentinelZero || Mask[i] == SM_SentinelUndef) && + (Mask[i + 1] == SM_SentinelZero || Mask[i + 1] == SM_SentinelUndef)) { + WidenedMask.push_back(SM_SentinelZero); + continue; + } return false; + } + + // Finally check if the two mask values are adjacent and aligned with + // a pair. + if (Mask[i] != SM_SentinelUndef && Mask[i] % 2 == 0 && Mask[i] + 1 == Mask[i + 1]) { + WidenedMask.push_back(Mask[i] / 2); + continue; + } + + // Otherwise we can't safely widen the elements used in this shuffle. + return false; + } + assert(WidenedMask.size() == Mask.size() / 2 && + "Incorrect size of mask after widening the elements!"); return true; } +/// \brief Generic routine to split ector shuffle into half-sized shuffles. +/// +/// This routine just extracts two subvectors, shuffles them independently, and +/// then concatenates them back together. This should work effectively with all +/// AVX vector shuffle types. 
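Ignoring the target-shuffle zero sentinel handled above, the widening test reduces to: each pair of lanes must either be undef or form an aligned (even, even + 1) pair out of the same wide element, with a lone defined lane allowed only on its correct side of the pair. A trimmed sketch (hypothetical name; -1 is the only sentinel):

    #include <cstdio>
    #include <vector>

    // Try to rewrite a mask over N lanes as a mask over N/2 lanes of twice the
    // width. Fails if any lane pair straddles or splits a wide element.
    static bool widenMask(const std::vector<int> &Mask,
                          std::vector<int> &Widened) {
      Widened.clear();
      for (size_t i = 0; i + 1 < Mask.size(); i += 2) {
        int Lo = Mask[i], Hi = Mask[i + 1];
        if (Lo == -1 && Hi == -1)
          Widened.push_back(-1);            // fully undef pair
        else if (Lo == -1 && Hi >= 0 && Hi % 2 == 1)
          Widened.push_back(Hi / 2);        // lone defined lane, correctly aligned
        else if (Hi == -1 && Lo >= 0 && Lo % 2 == 0)
          Widened.push_back(Lo / 2);
        else if (Lo >= 0 && Lo % 2 == 0 && Hi == Lo + 1)
          Widened.push_back(Lo / 2);        // aligned adjacent pair
        else
          return false;
      }
      return true;
    }

    int main() {
      std::vector<int> M = {2, 3, -1, 7, 4, 5, 0, 1}; // v8 mask -> v4 mask <1, 3, 2, 0>
      std::vector<int> W;
      if (widenMask(M, W))
        std::printf("<%d, %d, %d, %d>\n", W[0], W[1], W[2], W[3]);
    }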
+static SDValue splitAndLowerVectorShuffle(SDLoc DL, MVT VT, SDValue V1, + SDValue V2, ArrayRef<int> Mask, + SelectionDAG &DAG) { + assert(VT.getSizeInBits() >= 256 && + "Only for 256-bit or wider vector shuffles!"); + assert(V1.getSimpleValueType() == VT && "Bad operand type!"); + assert(V2.getSimpleValueType() == VT && "Bad operand type!"); + + ArrayRef<int> LoMask = Mask.slice(0, Mask.size() / 2); + ArrayRef<int> HiMask = Mask.slice(Mask.size() / 2); + + int NumElements = VT.getVectorNumElements(); + int SplitNumElements = NumElements / 2; + MVT ScalarVT = VT.getScalarType(); + MVT SplitVT = MVT::getVectorVT(ScalarVT, NumElements / 2); + + SDValue LoV1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, V1, + DAG.getIntPtrConstant(0)); + SDValue HiV1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, V1, + DAG.getIntPtrConstant(SplitNumElements)); + SDValue LoV2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, V2, + DAG.getIntPtrConstant(0)); + SDValue HiV2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, V2, + DAG.getIntPtrConstant(SplitNumElements)); + + // Now create two 4-way blends of these half-width vectors. + auto HalfBlend = [&](ArrayRef<int> HalfMask) { + bool UseLoV1 = false, UseHiV1 = false, UseLoV2 = false, UseHiV2 = false; + SmallVector<int, 32> V1BlendMask, V2BlendMask, BlendMask; + for (int i = 0; i < SplitNumElements; ++i) { + int M = HalfMask[i]; + if (M >= NumElements) { + if (M >= NumElements + SplitNumElements) + UseHiV2 = true; + else + UseLoV2 = true; + V2BlendMask.push_back(M - NumElements); + V1BlendMask.push_back(-1); + BlendMask.push_back(SplitNumElements + i); + } else if (M >= 0) { + if (M >= SplitNumElements) + UseHiV1 = true; + else + UseLoV1 = true; + V2BlendMask.push_back(-1); + V1BlendMask.push_back(M); + BlendMask.push_back(i); + } else { + V2BlendMask.push_back(-1); + V1BlendMask.push_back(-1); + BlendMask.push_back(-1); + } + } + + // Because the lowering happens after all combining takes place, we need to + // manually combine these blend masks as much as possible so that we create + // a minimal number of high-level vector shuffle nodes. + + // First try just blending the halves of V1 or V2. + if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2) + return DAG.getUNDEF(SplitVT); + if (!UseLoV2 && !UseHiV2) + return DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask); + if (!UseLoV1 && !UseHiV1) + return DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask); + + SDValue V1Blend, V2Blend; + if (UseLoV1 && UseHiV1) { + V1Blend = + DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask); + } else { + // We only use half of V1 so map the usage down into the final blend mask. + V1Blend = UseLoV1 ? LoV1 : HiV1; + for (int i = 0; i < SplitNumElements; ++i) + if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements) + BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements); + } + if (UseLoV2 && UseHiV2) { + V2Blend = + DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask); + } else { + // We only use half of V2 so map the usage down into the final blend mask. + V2Blend = UseLoV2 ? LoV2 : HiV2; + for (int i = 0; i < SplitNumElements; ++i) + if (BlendMask[i] >= SplitNumElements) + BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? 
SplitNumElements : 0); + } + return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask); + }; + SDValue Lo = HalfBlend(LoMask); + SDValue Hi = HalfBlend(HiMask); + return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi); +} + +/// \brief Either split a vector in halves or decompose the shuffles and the +/// blend. +/// +/// This is provided as a good fallback for many lowerings of non-single-input +/// shuffles with more than one 128-bit lane. In those cases, we want to select +/// between splitting the shuffle into 128-bit components and stitching those +/// back together vs. extracting the single-input shuffles and blending those +/// results. +static SDValue lowerVectorShuffleAsSplitOrBlend(SDLoc DL, MVT VT, SDValue V1, + SDValue V2, ArrayRef<int> Mask, + SelectionDAG &DAG) { + assert(!isSingleInputShuffleMask(Mask) && "This routine must not be used to " + "lower single-input shuffles as it " + "could then recurse on itself."); + int Size = Mask.size(); + + // If this can be modeled as a broadcast of two elements followed by a blend, + // prefer that lowering. This is especially important because broadcasts can + // often fold with memory operands. + auto DoBothBroadcast = [&] { + int V1BroadcastIdx = -1, V2BroadcastIdx = -1; + for (int M : Mask) + if (M >= Size) { + if (V2BroadcastIdx == -1) + V2BroadcastIdx = M - Size; + else if (M - Size != V2BroadcastIdx) + return false; + } else if (M >= 0) { + if (V1BroadcastIdx == -1) + V1BroadcastIdx = M; + else if (M != V1BroadcastIdx) + return false; + } + return true; + }; + if (DoBothBroadcast()) + return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, + DAG); + + // If the inputs all stem from a single 128-bit lane of each input, then we + // split them rather than blending because the split will decompose to + // unusually few instructions. + int LaneCount = VT.getSizeInBits() / 128; + int LaneSize = Size / LaneCount; + SmallBitVector LaneInputs[2]; + LaneInputs[0].resize(LaneCount, false); + LaneInputs[1].resize(LaneCount, false); + for (int i = 0; i < Size; ++i) + if (Mask[i] >= 0) + LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true; + if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1) + return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG); + + // Otherwise, just fall back to decomposed shuffles and a blend. This requires + // that the decomposed single-input shuffles don't end up here. + return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, DAG); +} + +/// \brief Lower a vector shuffle crossing multiple 128-bit lanes as +/// a permutation and blend of those lanes. +/// +/// This essentially blends the out-of-lane inputs to each lane into the lane +/// from a permuted copy of the vector. This lowering strategy results in four +/// instructions in the worst case for a single-input cross lane shuffle which +/// is lower than any other fully general cross-lane shuffle strategy I'm aware +/// of. Special cases for each particular shuffle pattern should be handled +/// prior to trying this lowering. +static SDValue lowerVectorShuffleAsLanePermuteAndBlend(SDLoc DL, MVT VT, + SDValue V1, SDValue V2, + ArrayRef<int> Mask, + SelectionDAG &DAG) { + // FIXME: This should probably be generalized for 512-bit vectors as well. + assert(VT.getSizeInBits() == 256 && "Only for 256-bit vector shuffles!"); + int LaneSize = Mask.size() / 2; + + // If there are only inputs from one 128-bit lane, splitting will in fact be + // less expensive. 
The flags track wether the given lane contains an element + // that crosses to another lane. + bool LaneCrossing[2] = {false, false}; + for (int i = 0, Size = Mask.size(); i < Size; ++i) + if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize) + LaneCrossing[(Mask[i] % Size) / LaneSize] = true; + if (!LaneCrossing[0] || !LaneCrossing[1]) + return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG); + + if (isSingleInputShuffleMask(Mask)) { + SmallVector<int, 32> FlippedBlendMask; + for (int i = 0, Size = Mask.size(); i < Size; ++i) + FlippedBlendMask.push_back( + Mask[i] < 0 ? -1 : (((Mask[i] % Size) / LaneSize == i / LaneSize) + ? Mask[i] + : Mask[i] % LaneSize + + (i / LaneSize) * LaneSize + Size)); + + // Flip the vector, and blend the results which should now be in-lane. The + // VPERM2X128 mask uses the low 2 bits for the low source and bits 4 and + // 5 for the high source. The value 3 selects the high half of source 2 and + // the value 2 selects the low half of source 2. We only use source 2 to + // allow folding it into a memory operand. + unsigned PERMMask = 3 | 2 << 4; + SDValue Flipped = DAG.getNode(X86ISD::VPERM2X128, DL, VT, DAG.getUNDEF(VT), + V1, DAG.getConstant(PERMMask, MVT::i8)); + return DAG.getVectorShuffle(VT, DL, V1, Flipped, FlippedBlendMask); + } + + // This now reduces to two single-input shuffles of V1 and V2 which at worst + // will be handled by the above logic and a blend of the results, much like + // other patterns in AVX. + return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, DAG); +} + +/// \brief Handle lowering 2-lane 128-bit shuffles. +static SDValue lowerV2X128VectorShuffle(SDLoc DL, MVT VT, SDValue V1, + SDValue V2, ArrayRef<int> Mask, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + // Blends are faster and handle all the non-lane-crossing cases. + if (SDValue Blend = lowerVectorShuffleAsBlend(DL, VT, V1, V2, Mask, + Subtarget, DAG)) + return Blend; + + MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), + VT.getVectorNumElements() / 2); + // Check for patterns which can be matched with a single insert of a 128-bit + // subvector. + if (isShuffleEquivalent(Mask, 0, 1, 0, 1) || + isShuffleEquivalent(Mask, 0, 1, 4, 5)) { + SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1, + DAG.getIntPtrConstant(0)); + SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, + Mask[2] < 4 ? V1 : V2, DAG.getIntPtrConstant(0)); + return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV); + } + if (isShuffleEquivalent(Mask, 0, 1, 6, 7)) { + SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1, + DAG.getIntPtrConstant(0)); + SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2, + DAG.getIntPtrConstant(2)); + return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV); + } + + // Otherwise form a 128-bit permutation. + // FIXME: Detect zero-vector inputs and use the VPERM2X128 to zero that half. + unsigned PermMask = Mask[0] / 2 | (Mask[2] / 2) << 4; + return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2, + DAG.getConstant(PermMask, MVT::i8)); +} + +/// \brief Handle lowering of 4-lane 64-bit floating point shuffles. +/// +/// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2 +/// isn't available. 
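The 2x128-bit path above funnels lane-sized moves into a VPERM2X128 node whose imm8 is assembled as Mask[0]/2 | (Mask[2]/2) << 4. A scalar sketch of what that immediate selects (the zeroing bits 3 and 7 of the real instruction are ignored here because the code above never sets them):

  #include <array>
  #include <cstdio>

  using Lane = std::array<double, 2>;   // one 128-bit half of a v4f64
  using Vec4 = std::array<Lane, 2>;     // a full 256-bit vector

  // Scalar model of VPERM2F128/VPERM2I128: selector 0/1 pick the low/high
  // lane of the first source, 2/3 pick the low/high lane of the second.
  static Vec4 perm2x128(const Vec4 &A, const Vec4 &B, unsigned Imm) {
    auto pick = [&](unsigned Sel) -> Lane {
      return Sel < 2 ? A[Sel] : B[Sel - 2];
    };
    return {pick(Imm & 3), pick((Imm >> 4) & 3)};
  }

  int main() {
    Vec4 V1 = {Lane{0, 1}, Lane{2, 3}};
    Vec4 V2 = {Lane{4, 5}, Lane{6, 7}};

    // The widened v4f64 mask <2,3,4,5> becomes the lane mask <1,2>, and the
    // immediate is built exactly as in the lowering above.
    int Mask[4] = {2, 3, 4, 5};
    unsigned PermMask = Mask[0] / 2 | (Mask[2] / 2) << 4;   // == 0x21

    Vec4 R = perm2x128(V1, V2, PermMask);
    printf("imm=0x%02x -> %g %g %g %g\n", PermMask,
           R[0][0], R[0][1], R[1][0], R[1][1]);   // expect: 2 3 4 5
    return 0;
  }
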
+static SDValue lowerV4F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDLoc DL(Op); + assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!"); + assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!"); + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); + ArrayRef<int> Mask = SVOp->getMask(); + assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!"); + + SmallVector<int, 4> WidenedMask; + if (canWidenShuffleElements(Mask, WidenedMask)) + return lowerV2X128VectorShuffle(DL, MVT::v4f64, V1, V2, Mask, Subtarget, + DAG); + + if (isSingleInputShuffleMask(Mask)) { + // Check for being able to broadcast a single element. + if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v4f64, DL, V1, + Mask, Subtarget, DAG)) + return Broadcast; + + if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) { + // Non-half-crossing single input shuffles can be lowerid with an + // interleaved permutation. + unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) | + ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3); + return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1, + DAG.getConstant(VPERMILPMask, MVT::i8)); + } + + // With AVX2 we have direct support for this permutation. + if (Subtarget->hasAVX2()) + return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1, + getV4X86ShuffleImm8ForMask(Mask, DAG)); + + // Otherwise, fall back. + return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v4f64, V1, V2, Mask, + DAG); + } + + // X86 has dedicated unpack instructions that can handle specific blend + // operations: UNPCKH and UNPCKL. + if (isShuffleEquivalent(Mask, 0, 4, 2, 6)) + return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4f64, V1, V2); + if (isShuffleEquivalent(Mask, 1, 5, 3, 7)) + return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f64, V1, V2); + + // If we have a single input to the zero element, insert that into V1 if we + // can do so cheaply. + int NumV2Elements = + std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; }); + if (NumV2Elements == 1 && Mask[0] >= 4) + if (SDValue Insertion = lowerVectorShuffleAsElementInsertion( + MVT::v4f64, DL, V1, V2, Mask, Subtarget, DAG)) + return Insertion; + + if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask, + Subtarget, DAG)) + return Blend; + + // Check if the blend happens to exactly fit that of SHUFPD. + if ((Mask[0] == -1 || Mask[0] < 2) && + (Mask[1] == -1 || (Mask[1] >= 4 && Mask[1] < 6)) && + (Mask[2] == -1 || (Mask[2] >= 2 && Mask[2] < 4)) && + (Mask[3] == -1 || Mask[3] >= 6)) { + unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 5) << 1) | + ((Mask[2] == 3) << 2) | ((Mask[3] == 7) << 3); + return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f64, V1, V2, + DAG.getConstant(SHUFPDMask, MVT::i8)); + } + if ((Mask[0] == -1 || (Mask[0] >= 4 && Mask[0] < 6)) && + (Mask[1] == -1 || Mask[1] < 2) && + (Mask[2] == -1 || Mask[2] >= 6) && + (Mask[3] == -1 || (Mask[3] >= 2 && Mask[3] < 4))) { + unsigned SHUFPDMask = (Mask[0] == 5) | ((Mask[1] == 1) << 1) | + ((Mask[2] == 7) << 2) | ((Mask[3] == 3) << 3); + return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f64, V2, V1, + DAG.getConstant(SHUFPDMask, MVT::i8)); + } + + // If we have AVX2 then we always want to lower with a blend because an v4 we + // can fully permute the elements. + if (Subtarget->hasAVX2()) + return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4f64, V1, V2, + Mask, DAG); + + // Otherwise fall back on generic lowering. 
+ return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask, DAG); +} + +/// \brief Handle lowering of 4-lane 64-bit integer shuffles. +/// +/// This routine is only called when we have AVX2 and thus a reasonable +/// instruction set for v4i64 shuffling.. +static SDValue lowerV4I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDLoc DL(Op); + assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!"); + assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!"); + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); + ArrayRef<int> Mask = SVOp->getMask(); + assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!"); + assert(Subtarget->hasAVX2() && "We can only lower v4i64 with AVX2!"); + + SmallVector<int, 4> WidenedMask; + if (canWidenShuffleElements(Mask, WidenedMask)) + return lowerV2X128VectorShuffle(DL, MVT::v4i64, V1, V2, Mask, Subtarget, + DAG); + + if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask, + Subtarget, DAG)) + return Blend; + + // Check for being able to broadcast a single element. + if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v4i64, DL, V1, + Mask, Subtarget, DAG)) + return Broadcast; + + // When the shuffle is mirrored between the 128-bit lanes of the unit, we can + // use lower latency instructions that will operate on both 128-bit lanes. + SmallVector<int, 2> RepeatedMask; + if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) { + if (isSingleInputShuffleMask(Mask)) { + int PSHUFDMask[] = {-1, -1, -1, -1}; + for (int i = 0; i < 2; ++i) + if (RepeatedMask[i] >= 0) { + PSHUFDMask[2 * i] = 2 * RepeatedMask[i]; + PSHUFDMask[2 * i + 1] = 2 * RepeatedMask[i] + 1; + } + return DAG.getNode( + ISD::BITCAST, DL, MVT::v4i64, + DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, + DAG.getNode(ISD::BITCAST, DL, MVT::v8i32, V1), + getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG))); + } + + // Use dedicated unpack instructions for masks that match their pattern. + if (isShuffleEquivalent(Mask, 0, 4, 2, 6)) + return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i64, V1, V2); + if (isShuffleEquivalent(Mask, 1, 5, 3, 7)) + return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i64, V1, V2); + } + + // AVX2 provides a direct instruction for permuting a single input across + // lanes. + if (isSingleInputShuffleMask(Mask)) + return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1, + getV4X86ShuffleImm8ForMask(Mask, DAG)); + + // Otherwise fall back on generic blend lowering. + return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i64, V1, V2, + Mask, DAG); +} + +/// \brief Handle lowering of 8-lane 32-bit floating point shuffles. +/// +/// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2 +/// isn't available. +static SDValue lowerV8F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDLoc DL(Op); + assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!"); + assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!"); + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); + ArrayRef<int> Mask = SVOp->getMask(); + assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!"); + + if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask, + Subtarget, DAG)) + return Blend; + + // Check for being able to broadcast a single element. 
+ if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v8f32, DL, V1, + Mask, Subtarget, DAG)) + return Broadcast; + + // If the shuffle mask is repeated in each 128-bit lane, we have many more + // options to efficiently lower the shuffle. + SmallVector<int, 4> RepeatedMask; + if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) { + assert(RepeatedMask.size() == 4 && + "Repeated masks must be half the mask width!"); + if (isSingleInputShuffleMask(Mask)) + return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1, + getV4X86ShuffleImm8ForMask(RepeatedMask, DAG)); + + // Use dedicated unpack instructions for masks that match their pattern. + if (isShuffleEquivalent(Mask, 0, 8, 1, 9, 4, 12, 5, 13)) + return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8f32, V1, V2); + if (isShuffleEquivalent(Mask, 2, 10, 3, 11, 6, 14, 7, 15)) + return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8f32, V1, V2); + + // Otherwise, fall back to a SHUFPS sequence. Here it is important that we + // have already handled any direct blends. We also need to squash the + // repeated mask into a simulated v4f32 mask. + for (int i = 0; i < 4; ++i) + if (RepeatedMask[i] >= 8) + RepeatedMask[i] -= 4; + return lowerVectorShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG); + } + + // If we have a single input shuffle with different shuffle patterns in the + // two 128-bit lanes use the variable mask to VPERMILPS. + if (isSingleInputShuffleMask(Mask)) { + SDValue VPermMask[8]; + for (int i = 0; i < 8; ++i) + VPermMask[i] = Mask[i] < 0 ? DAG.getUNDEF(MVT::i32) + : DAG.getConstant(Mask[i], MVT::i32); + if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask)) + return DAG.getNode( + X86ISD::VPERMILPV, DL, MVT::v8f32, V1, + DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i32, VPermMask)); + + if (Subtarget->hasAVX2()) + return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32, + DAG.getNode(ISD::BITCAST, DL, MVT::v8f32, + DAG.getNode(ISD::BUILD_VECTOR, DL, + MVT::v8i32, VPermMask)), + V1); + + // Otherwise, fall back. + return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v8f32, V1, V2, Mask, + DAG); + } + + // If we have AVX2 then we always want to lower with a blend because at v8 we + // can fully permute the elements. + if (Subtarget->hasAVX2()) + return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8f32, V1, V2, + Mask, DAG); + + // Otherwise fall back on generic lowering. + return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, DAG); +} + +/// \brief Handle lowering of 8-lane 32-bit integer shuffles. +/// +/// This routine is only called when we have AVX2 and thus a reasonable +/// instruction set for v8i32 shuffling.. +static SDValue lowerV8I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDLoc DL(Op); + assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!"); + assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!"); + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); + ArrayRef<int> Mask = SVOp->getMask(); + assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!"); + assert(Subtarget->hasAVX2() && "We can only lower v8i32 with AVX2!"); + + if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask, + Subtarget, DAG)) + return Blend; + + // Check for being able to broadcast a single element. 
+ if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v8i32, DL, V1, + Mask, Subtarget, DAG)) + return Broadcast; + + // If the shuffle mask is repeated in each 128-bit lane we can use more + // efficient instructions that mirror the shuffles across the two 128-bit + // lanes. + SmallVector<int, 4> RepeatedMask; + if (is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask)) { + assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!"); + if (isSingleInputShuffleMask(Mask)) + return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1, + getV4X86ShuffleImm8ForMask(RepeatedMask, DAG)); + + // Use dedicated unpack instructions for masks that match their pattern. + if (isShuffleEquivalent(Mask, 0, 8, 1, 9, 4, 12, 5, 13)) + return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i32, V1, V2); + if (isShuffleEquivalent(Mask, 2, 10, 3, 11, 6, 14, 7, 15)) + return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i32, V1, V2); + } + + // If the shuffle patterns aren't repeated but it is a single input, directly + // generate a cross-lane VPERMD instruction. + if (isSingleInputShuffleMask(Mask)) { + SDValue VPermMask[8]; + for (int i = 0; i < 8; ++i) + VPermMask[i] = Mask[i] < 0 ? DAG.getUNDEF(MVT::i32) + : DAG.getConstant(Mask[i], MVT::i32); + return DAG.getNode( + X86ISD::VPERMV, DL, MVT::v8i32, + DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i32, VPermMask), V1); + } + + // Otherwise fall back on generic blend lowering. + return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i32, V1, V2, + Mask, DAG); +} + +/// \brief Handle lowering of 16-lane 16-bit integer shuffles. +/// +/// This routine is only called when we have AVX2 and thus a reasonable +/// instruction set for v16i16 shuffling.. +static SDValue lowerV16I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDLoc DL(Op); + assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!"); + assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!"); + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); + ArrayRef<int> Mask = SVOp->getMask(); + assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!"); + assert(Subtarget->hasAVX2() && "We can only lower v16i16 with AVX2!"); + + // Check for being able to broadcast a single element. + if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v16i16, DL, V1, + Mask, Subtarget, DAG)) + return Broadcast; + + // There are no generalized cross-lane shuffle operations available on i16 + // element types. + if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask)) + return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v16i16, V1, V2, + Mask, DAG); + + if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask, + Subtarget, DAG)) + return Blend; + + // Use dedicated unpack instructions for masks that match their pattern. + if (isShuffleEquivalent(Mask, + // First 128-bit lane: + 0, 16, 1, 17, 2, 18, 3, 19, + // Second 128-bit lane: + 8, 24, 9, 25, 10, 26, 11, 27)) + return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i16, V1, V2); + if (isShuffleEquivalent(Mask, + // First 128-bit lane: + 4, 20, 5, 21, 6, 22, 7, 23, + // Second 128-bit lane: + 12, 28, 13, 29, 14, 30, 15, 31)) + return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i16, V1, V2); + + if (isSingleInputShuffleMask(Mask)) { + SDValue PSHUFBMask[32]; + for (int i = 0; i < 16; ++i) { + if (Mask[i] == -1) { + PSHUFBMask[2 * i] = PSHUFBMask[2 * i + 1] = DAG.getUNDEF(MVT::i8); + continue; + } + + int M = i < 8 ? 
Mask[i] : Mask[i] - 8; + assert(M >= 0 && M < 8 && "Invalid single-input mask!"); + PSHUFBMask[2 * i] = DAG.getConstant(2 * M, MVT::i8); + PSHUFBMask[2 * i + 1] = DAG.getConstant(2 * M + 1, MVT::i8); + } + return DAG.getNode( + ISD::BITCAST, DL, MVT::v16i16, + DAG.getNode( + X86ISD::PSHUFB, DL, MVT::v32i8, + DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, V1), + DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, PSHUFBMask))); + } + + // Otherwise fall back on generic lowering. + return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask, DAG); +} + +/// \brief Handle lowering of 32-lane 8-bit integer shuffles. +/// +/// This routine is only called when we have AVX2 and thus a reasonable +/// instruction set for v32i8 shuffling.. +static SDValue lowerV32I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDLoc DL(Op); + assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!"); + assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!"); + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); + ArrayRef<int> Mask = SVOp->getMask(); + assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!"); + assert(Subtarget->hasAVX2() && "We can only lower v32i8 with AVX2!"); + + // Check for being able to broadcast a single element. + if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v32i8, DL, V1, + Mask, Subtarget, DAG)) + return Broadcast; + + // There are no generalized cross-lane shuffle operations available on i8 + // element types. + if (is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask)) + return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v32i8, V1, V2, + Mask, DAG); + + if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask, + Subtarget, DAG)) + return Blend; + + // Use dedicated unpack instructions for masks that match their pattern. + // Note that these are repeated 128-bit lane unpacks, not unpacks across all + // 256-bit lanes. + if (isShuffleEquivalent( + Mask, + // First 128-bit lane: + 0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39, + // Second 128-bit lane: + 16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55)) + return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v32i8, V1, V2); + if (isShuffleEquivalent( + Mask, + // First 128-bit lane: + 8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47, + // Second 128-bit lane: + 24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63)) + return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v32i8, V1, V2); + + if (isSingleInputShuffleMask(Mask)) { + SDValue PSHUFBMask[32]; + for (int i = 0; i < 32; ++i) + PSHUFBMask[i] = + Mask[i] < 0 + ? DAG.getUNDEF(MVT::i8) + : DAG.getConstant(Mask[i] < 16 ? Mask[i] : Mask[i] - 16, MVT::i8); + + return DAG.getNode( + X86ISD::PSHUFB, DL, MVT::v32i8, V1, + DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, PSHUFBMask)); + } + + // Otherwise fall back on generic lowering. + return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask, DAG); +} + +/// \brief High-level routine to lower various 256-bit x86 vector shuffles. +/// +/// This routine either breaks down the specific type of a 256-bit x86 vector +/// shuffle or splits it into two 128-bit shuffles and fuses the results back +/// together based on the available instructions. 
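The SSSE3 and AVX2 byte-shuffle paths above all reduce to the same trick: shuffle each input with PSHUFB, force the lanes that belong to the other input to zero via the 0x80 control byte, and OR the two results. A minimal scalar model (undef lanes are simply zeroed here, whereas the lowering emits undef control bytes):

  #include <cstdint>
  #include <cstdio>

  // Scalar model of PSHUFB on one 16-byte register: a control byte with bit 7
  // set produces 0, otherwise its low 4 bits index into the source register.
  static void pshufb(const uint8_t Src[16], const uint8_t Ctl[16],
                     uint8_t Out[16]) {
    for (int i = 0; i < 16; ++i)
      Out[i] = (Ctl[i] & 0x80) ? 0 : Src[Ctl[i] & 0x0F];
  }

  int main() {
    uint8_t V1[16], V2[16];
    for (int i = 0; i < 16; ++i) { V1[i] = i; V2[i] = 100 + i; }

    // A two-input v16i8 mask: 0-15 read V1, 16-31 read V2, -1 is undef.
    int Mask[16] = {0, 16, 1, 17, 2, 18, 3, 19, -1, 20, 5, 21, 6, 22, 7, 23};

    // Build the two control vectors as in the SSSE3 path: lanes taken from
    // the other input are zeroed (0x80) so a plain OR merges the halves.
    uint8_t C1[16], C2[16];
    for (int i = 0; i < 16; ++i) {
      if (Mask[i] < 0) { C1[i] = C2[i] = 0x80; continue; }
      C1[i] = Mask[i] < 16 ? (uint8_t)Mask[i] : 0x80;
      C2[i] = Mask[i] < 16 ? 0x80 : (uint8_t)(Mask[i] - 16);
    }

    uint8_t S1[16], S2[16], R[16];
    pshufb(V1, C1, S1);
    pshufb(V2, C2, S2);
    for (int i = 0; i < 16; ++i)
      R[i] = S1[i] | S2[i];

    for (int i = 0; i < 16; ++i)
      printf("%d ", R[i]);   // bytes of V1 and V2 interleaved, 0 for the undef lane
    printf("\n");
    return 0;
  }
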
+static SDValue lower256BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2, + MVT VT, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDLoc DL(Op); + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); + ArrayRef<int> Mask = SVOp->getMask(); + + // There is a really nice hard cut-over between AVX1 and AVX2 that means we can + // check for those subtargets here and avoid much of the subtarget querying in + // the per-vector-type lowering routines. With AVX1 we have essentially *zero* + // ability to manipulate a 256-bit vector with integer types. Since we'll use + // floating point types there eventually, just immediately cast everything to + // a float and operate entirely in that domain. + if (VT.isInteger() && !Subtarget->hasAVX2()) { + int ElementBits = VT.getScalarSizeInBits(); + if (ElementBits < 32) + // No floating point type available, decompose into 128-bit vectors. + return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG); + + MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits), + VT.getVectorNumElements()); + V1 = DAG.getNode(ISD::BITCAST, DL, FpVT, V1); + V2 = DAG.getNode(ISD::BITCAST, DL, FpVT, V2); + return DAG.getNode(ISD::BITCAST, DL, VT, + DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask)); + } + + switch (VT.SimpleTy) { + case MVT::v4f64: + return lowerV4F64VectorShuffle(Op, V1, V2, Subtarget, DAG); + case MVT::v4i64: + return lowerV4I64VectorShuffle(Op, V1, V2, Subtarget, DAG); + case MVT::v8f32: + return lowerV8F32VectorShuffle(Op, V1, V2, Subtarget, DAG); + case MVT::v8i32: + return lowerV8I32VectorShuffle(Op, V1, V2, Subtarget, DAG); + case MVT::v16i16: + return lowerV16I16VectorShuffle(Op, V1, V2, Subtarget, DAG); + case MVT::v32i8: + return lowerV32I8VectorShuffle(Op, V1, V2, Subtarget, DAG); + + default: + llvm_unreachable("Not a valid 256-bit x86 vector type!"); + } +} + +/// \brief Handle lowering of 8-lane 64-bit floating point shuffles. +static SDValue lowerV8F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDLoc DL(Op); + assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!"); + assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!"); + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); + ArrayRef<int> Mask = SVOp->getMask(); + assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!"); + + // FIXME: Implement direct support for this type! + return splitAndLowerVectorShuffle(DL, MVT::v8f64, V1, V2, Mask, DAG); +} + +/// \brief Handle lowering of 16-lane 32-bit floating point shuffles. +static SDValue lowerV16F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDLoc DL(Op); + assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!"); + assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!"); + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); + ArrayRef<int> Mask = SVOp->getMask(); + assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!"); + + // FIXME: Implement direct support for this type! + return splitAndLowerVectorShuffle(DL, MVT::v16f32, V1, V2, Mask, DAG); +} + +/// \brief Handle lowering of 8-lane 64-bit integer shuffles. 
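Stepping back to the AVX1 branch of lower256BitVectorShuffle above: it relies on the fact that a bitcast is a pure reinterpretation, so shuffling 256-bit integer vectors in the v8f32/v4f64 domain cannot disturb the payload. A trivial scalar illustration of that round trip (memcpy plays the role of ISD::BITCAST):

  #include <cstdint>
  #include <cstdio>
  #include <cstring>

  // Reinterpret a 32-bit integer lane as a float lane and back, the scalar
  // analogue of the bitcast pair wrapped around the v8f32 shuffle above.
  static float asFloat(uint32_t Bits) { float F; std::memcpy(&F, &Bits, 4); return F; }
  static uint32_t asBits(float F) { uint32_t B; std::memcpy(&B, &F, 4); return B; }

  int main() {
    uint32_t V1[8];
    for (int i = 0; i < 8; ++i)
      V1[i] = 0xDEAD0000u + i;   // arbitrary integer payload

    // "Bitcast" to the float domain, apply a v8f32-style shuffle, cast back.
    int Mask[8] = {3, 2, 1, 0, 7, 6, 5, 4};
    uint32_t Out[8];
    for (int i = 0; i < 8; ++i)
      Out[i] = asBits(asFloat(V1[Mask[i]]));

    for (int i = 0; i < 8; ++i)
      printf("%08x ", Out[i]);   // payload survives: only bits are moved
    printf("\n");
    return 0;
  }
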
+static SDValue lowerV8I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDLoc DL(Op); + assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!"); + assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!"); + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); + ArrayRef<int> Mask = SVOp->getMask(); + assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!"); + + // FIXME: Implement direct support for this type! + return splitAndLowerVectorShuffle(DL, MVT::v8i64, V1, V2, Mask, DAG); +} + +/// \brief Handle lowering of 16-lane 32-bit integer shuffles. +static SDValue lowerV16I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDLoc DL(Op); + assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!"); + assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!"); + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); + ArrayRef<int> Mask = SVOp->getMask(); + assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!"); + + // FIXME: Implement direct support for this type! + return splitAndLowerVectorShuffle(DL, MVT::v16i32, V1, V2, Mask, DAG); +} + +/// \brief Handle lowering of 32-lane 16-bit integer shuffles. +static SDValue lowerV32I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDLoc DL(Op); + assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!"); + assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!"); + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); + ArrayRef<int> Mask = SVOp->getMask(); + assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!"); + assert(Subtarget->hasBWI() && "We can only lower v32i16 with AVX-512-BWI!"); + + // FIXME: Implement direct support for this type! + return splitAndLowerVectorShuffle(DL, MVT::v32i16, V1, V2, Mask, DAG); +} + +/// \brief Handle lowering of 64-lane 8-bit integer shuffles. +static SDValue lowerV64I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDLoc DL(Op); + assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!"); + assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!"); + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); + ArrayRef<int> Mask = SVOp->getMask(); + assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!"); + assert(Subtarget->hasBWI() && "We can only lower v64i8 with AVX-512-BWI!"); + + // FIXME: Implement direct support for this type! + return splitAndLowerVectorShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG); +} + +/// \brief High-level routine to lower various 512-bit x86 vector shuffles. +/// +/// This routine either breaks down the specific type of a 512-bit x86 vector +/// shuffle or splits it into two 256-bit shuffles and fuses the results back +/// together based on the available instructions. +static SDValue lower512BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2, + MVT VT, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDLoc DL(Op); + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); + ArrayRef<int> Mask = SVOp->getMask(); + assert(Subtarget->hasAVX512() && + "Cannot lower 512-bit vectors w/ basic ISA!"); + + // Check for being able to broadcast a single element. 
+ if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(VT.SimpleTy, DL, V1, + Mask, Subtarget, DAG)) + return Broadcast; + + // Dispatch to each element type for lowering. If we don't have supprot for + // specific element type shuffles at 512 bits, immediately split them and + // lower them. Each lowering routine of a given type is allowed to assume that + // the requisite ISA extensions for that element type are available. + switch (VT.SimpleTy) { + case MVT::v8f64: + return lowerV8F64VectorShuffle(Op, V1, V2, Subtarget, DAG); + case MVT::v16f32: + return lowerV16F32VectorShuffle(Op, V1, V2, Subtarget, DAG); + case MVT::v8i64: + return lowerV8I64VectorShuffle(Op, V1, V2, Subtarget, DAG); + case MVT::v16i32: + return lowerV16I32VectorShuffle(Op, V1, V2, Subtarget, DAG); + case MVT::v32i16: + if (Subtarget->hasBWI()) + return lowerV32I16VectorShuffle(Op, V1, V2, Subtarget, DAG); + break; + case MVT::v64i8: + if (Subtarget->hasBWI()) + return lowerV64I8VectorShuffle(Op, V1, V2, Subtarget, DAG); + break; + + default: + llvm_unreachable("Not a valid 512-bit x86 vector type!"); + } + + // Otherwise fall back on splitting. + return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG); +} + /// \brief Top-level lowering for x86 vector shuffles. /// /// This handles decomposition, canonicalization, and lowering of all x86 @@ -7936,7 +10636,7 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget *Subtarget, // but in some cases the first operand may be transformed to UNDEF. // In this case we should just commute the node. if (V1IsUndef) - return CommuteVectorShuffle(SVOp, DAG); + return DAG.getCommutedVectorShuffle(*SVOp); // Check for non-undef masks pointing at an undef vector and make the masks // undef as well. This makes it easier to match the shuffle based solely on @@ -7951,22 +10651,25 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget *Subtarget, return DAG.getVectorShuffle(VT, dl, V1, V2, NewMask); } - // For integer vector shuffles, try to collapse them into a shuffle of fewer - // lanes but wider integers. We cap this to not form integers larger than i64 - // but it might be interesting to form i128 integers to handle flipping the - // low and high halves of AVX 256-bit vectors. - if (VT.isInteger() && VT.getScalarSizeInBits() < 64 && - areAdjacentMasksSequential(Mask)) { - SmallVector<int, 8> NewMask; - for (int i = 0, Size = Mask.size(); i < Size; i += 2) - NewMask.push_back(Mask[i] / 2); - MVT NewVT = - MVT::getVectorVT(MVT::getIntegerVT(VT.getScalarSizeInBits() * 2), - VT.getVectorNumElements() / 2); - V1 = DAG.getNode(ISD::BITCAST, dl, NewVT, V1); - V2 = DAG.getNode(ISD::BITCAST, dl, NewVT, V2); - return DAG.getNode(ISD::BITCAST, dl, VT, - DAG.getVectorShuffle(NewVT, dl, V1, V2, NewMask)); + // Try to collapse shuffles into using a vector type with fewer elements but + // wider element types. We cap this to not form integers or floating point + // elements wider than 64 bits, but it might be interesting to form i128 + // integers to handle flipping the low and high halves of AVX 256-bit vectors. + SmallVector<int, 16> WidenedMask; + if (VT.getScalarSizeInBits() < 64 && + canWidenShuffleElements(Mask, WidenedMask)) { + MVT NewEltVT = VT.isFloatingPoint() + ? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2) + : MVT::getIntegerVT(VT.getScalarSizeInBits() * 2); + MVT NewVT = MVT::getVectorVT(NewEltVT, VT.getVectorNumElements() / 2); + // Make sure that the new vector type is legal. For example, v2f64 isn't + // legal on SSE1. 
+ if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) { + V1 = DAG.getNode(ISD::BITCAST, dl, NewVT, V1); + V2 = DAG.getNode(ISD::BITCAST, dl, NewVT, V2); + return DAG.getNode(ISD::BITCAST, dl, VT, + DAG.getVectorShuffle(NewVT, dl, V1, V2, WidenedMask)); + } } int NumV1Elements = 0, NumUndefElements = 0, NumV2Elements = 0; @@ -7982,10 +10685,12 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget *Subtarget, // V2. This allows us to match the shuffle pattern strictly on how many // elements come from V1 without handling the symmetric cases. if (NumV2Elements > NumV1Elements) - return CommuteVectorShuffle(SVOp, DAG); + return DAG.getCommutedVectorShuffle(*SVOp); // When the number of V1 and V2 elements are the same, try to minimize the - // number of uses of V2 in the low half of the vector. + // number of uses of V2 in the low half of the vector. When that is tied, + // ensure that the sum of indices for V1 is equal to or lower than the sum + // indices for V2. if (NumV1Elements == NumV2Elements) { int LowV1Elements = 0, LowV2Elements = 0; for (int M : SVOp->getMask().slice(0, NumElements / 2)) @@ -7993,14 +10698,32 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget *Subtarget, ++LowV2Elements; else if (M >= 0) ++LowV1Elements; - if (LowV2Elements > LowV1Elements) - return CommuteVectorShuffle(SVOp, DAG); + if (LowV2Elements > LowV1Elements) { + return DAG.getCommutedVectorShuffle(*SVOp); + } else if (LowV2Elements == LowV1Elements) { + int SumV1Indices = 0, SumV2Indices = 0; + for (int i = 0, Size = SVOp->getMask().size(); i < Size; ++i) + if (SVOp->getMask()[i] >= NumElements) + SumV2Indices += i; + else if (SVOp->getMask()[i] >= 0) + SumV1Indices += i; + if (SumV2Indices < SumV1Indices) + return DAG.getCommutedVectorShuffle(*SVOp); + } } // For each vector width, delegate to a specialized lowering routine. if (VT.getSizeInBits() == 128) return lower128BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG); + if (VT.getSizeInBits() == 256) + return lower256BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG); + + // Force AVX-512 vectors to be scalarized for now. + // FIXME: Implement AVX-512 support! + if (VT.getSizeInBits() == 512) + return lower512BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG); + llvm_unreachable("Unimplemented!"); } @@ -9060,6 +11783,20 @@ static SDValue getINSERTPS(ShuffleVectorSDNode *SVOp, SDLoc &dl, To = V2; DestIndex = std::find_if(Mask.begin(), Mask.end(), FromV1Predicate) - Mask.begin(); + + // If we have 1 element from each vector, we have to check if we're + // changing V1's element's place. If so, we're done. Otherwise, we + // should assume we're changing V2's element's place and behave + // accordingly. 
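Backing up to the commute heuristic added to lowerVectorShuffle above: counting how many, how low, and how early the V2 lanes appear keeps the per-width routines free of symmetric cases. A compact standalone restatement (indices >= Mask.size() denote V2 and negative values denote undef, as in the patch):

  #include <cstdio>
  #include <vector>

  // Decide whether to swap V1/V2 before lowering: prefer more, then lower,
  // then earlier V1 lanes.
  static bool shouldCommute(const std::vector<int> &Mask) {
    int N = (int)Mask.size();
    int NumV1 = 0, NumV2 = 0, LowV1 = 0, LowV2 = 0, SumV1 = 0, SumV2 = 0;
    for (int i = 0; i < N; ++i) {
      int M = Mask[i];
      if (M < 0)
        continue;                   // undef lanes do not vote
      if (M < N) {
        ++NumV1;
        if (i < N / 2) ++LowV1;
        SumV1 += i;
      } else {
        ++NumV2;
        if (i < N / 2) ++LowV2;
        SumV2 += i;
      }
    }
    if (NumV2 != NumV1)
      return NumV2 > NumV1;         // more elements should come from V1
    if (LowV2 != LowV1)
      return LowV2 > LowV1;         // V1 should dominate the low half
    return SumV2 < SumV1;           // tie-break on the sum of used positions
  }

  int main() {
    // <4,1,6,3>: two lanes from each input, one of each in the low half,
    // but V2's lanes sit at positions 0+2 < V1's 1+3, so commuting wins.
    std::vector<int> Mask = {4, 1, 6, 3};
    printf("commute: %s\n", shouldCommute(Mask) ? "yes" : "no");
    return 0;
  }
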
+ int FromV2 = std::count_if(Mask.begin(), Mask.end(), FromV2Predicate); + assert(DestIndex <= INT32_MAX && "truncated destination index"); + if (FromV1 == FromV2 && + static_cast<int>(DestIndex) == Mask[DestIndex] % 4) { + From = V2; + To = V1; + DestIndex = + std::find_if(Mask.begin(), Mask.end(), FromV2Predicate) - Mask.begin(); + } } else { assert(std::count_if(Mask.begin(), Mask.end(), FromV2Predicate) == 1 && "More than one element from V1 and from V2, or no elements from one " @@ -9071,6 +11808,8 @@ static SDValue getINSERTPS(ShuffleVectorSDNode *SVOp, SDLoc &dl, std::find_if(Mask.begin(), Mask.end(), FromV2Predicate) - Mask.begin(); } + // Get an index into the source vector in the range [0,4) (the mask is + // in the range [0,8) because it can address V1 and V2) unsigned SrcIndex = Mask[DestIndex] % 4; if (MayFoldLoad(From)) { // Trivial case, when From comes from a load and is only used by the @@ -9155,37 +11894,6 @@ static SDValue LowerVectorIntExtend(SDValue Op, const X86Subtarget *Subtarget, if (!DAG.getTargetLoweringInfo().isTypeLegal(NVT)) return SDValue(); - // Simplify the operand as it's prepared to be fed into shuffle. - unsigned SignificantBits = NVT.getSizeInBits() >> Shift; - if (V1.getOpcode() == ISD::BITCAST && - V1.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR && - V1.getOperand(0).getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT && - V1.getOperand(0).getOperand(0) - .getSimpleValueType().getSizeInBits() == SignificantBits) { - // (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast x) - SDValue V = V1.getOperand(0).getOperand(0).getOperand(0); - ConstantSDNode *CIdx = - dyn_cast<ConstantSDNode>(V1.getOperand(0).getOperand(0).getOperand(1)); - // If it's foldable, i.e. normal load with single use, we will let code - // selection to fold it. Otherwise, we will short the conversion sequence. - if (CIdx && CIdx->getZExtValue() == 0 && - (!ISD::isNormalLoad(V.getNode()) || !V.hasOneUse())) { - MVT FullVT = V.getSimpleValueType(); - MVT V1VT = V1.getSimpleValueType(); - if (FullVT.getSizeInBits() > V1VT.getSizeInBits()) { - // The "ext_vec_elt" node is wider than the result node. - // In this case we should extract subvector from V. - // (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast (extract_subvector x)). - unsigned Ratio = FullVT.getSizeInBits() / V1VT.getSizeInBits(); - MVT SubVecVT = MVT::getVectorVT(FullVT.getVectorElementType(), - FullVT.getVectorNumElements()/Ratio); - V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVecVT, V, - DAG.getIntPtrConstant(0)); - } - V1 = DAG.getNode(ISD::BITCAST, DL, V1VT, V); - } - } - return DAG.getNode(ISD::BITCAST, DL, VT, DAG.getNode(X86ISD::VZEXT, DL, NVT, V1)); } @@ -9278,7 +11986,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { // but in some cases the first operand may be transformed to UNDEF. // In this case we should just commute the node. 
if (V1IsUndef) - return CommuteVectorShuffle(SVOp, DAG); + return DAG.getCommutedVectorShuffle(*SVOp); // Vector shuffle lowering takes 3 steps: // @@ -9335,7 +12043,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1, TargetMask, DAG); if (HasFp256 && (VT == MVT::v4f32 || VT == MVT::v2f64)) - return getTargetShuffleNode(X86ISD::VPERMILP, dl, VT, V1, TargetMask, + return getTargetShuffleNode(X86ISD::VPERMILPI, dl, VT, V1, TargetMask, DAG); return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V1, V1, @@ -9347,6 +12055,11 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { getShufflePALIGNRImmediate(SVOp), DAG); + if (isVALIGNMask(M, VT, Subtarget)) + return getTargetShuffleNode(X86ISD::VALIGN, dl, VT, V1, V2, + getShuffleVALIGNImmediate(SVOp), + DAG); + // Check if this can be converted into a logical shift. bool isLeft = false; unsigned ShAmt = 0; @@ -9390,7 +12103,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { if (ShouldXformToMOVHLPS(M, VT) || ShouldXformToMOVLP(V1.getNode(), V2.getNode(), M, VT)) - return CommuteVectorShuffle(SVOp, DAG); + return DAG.getCommutedVectorShuffle(*SVOp); if (isShift) { // No better options. Use a vshldq / vsrldq. @@ -9462,7 +12175,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { // Normalize the node to match x86 shuffle ops if needed if (!V2IsUndef && (isSHUFPMask(M, VT, /* Commuted */ true))) - return CommuteVectorShuffle(SVOp, DAG); + return DAG.getCommutedVectorShuffle(*SVOp); // The checks below are all present in isShuffleMaskLegal, but they are // inlined here right now to enable us to directly emit target specific @@ -9512,7 +12225,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { if ((HasInt256 && VT == MVT::v8i32) || VT == MVT::v16i32) return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1, getShuffleSHUFImmediate(SVOp), DAG); - return getTargetShuffleNode(X86ISD::VPERMILP, dl, VT, V1, + return getTargetShuffleNode(X86ISD::VPERMILPI, dl, VT, V1, getShuffleSHUFImmediate(SVOp), DAG); } @@ -9631,9 +12344,10 @@ static bool BUILD_VECTORtoBlendMask(BuildVectorSDNode *BuildVector, return true; } -// Try to lower a vselect node into a simple blend instruction. -static SDValue LowerVSELECTtoBlend(SDValue Op, const X86Subtarget *Subtarget, - SelectionDAG &DAG) { +/// \brief Try to lower a VSELECT instruction to an immediate-controlled blend +/// instruction. +static SDValue lowerVSELECTtoBLENDI(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { SDValue Cond = Op.getOperand(0); SDValue LHS = Op.getOperand(1); SDValue RHS = Op.getOperand(2); @@ -9675,7 +12389,14 @@ static SDValue LowerVSELECTtoBlend(SDValue Op, const X86Subtarget *Subtarget, } SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const { - SDValue BlendOp = LowerVSELECTtoBlend(Op, Subtarget, DAG); + // A vselect where all conditions and data are constants can be optimized into + // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR(). 
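Aside from the constant-condition early-out that follows, the renamed lowerVSELECTtoBLENDI above targets the immediate-controlled BLEND instructions, where a constant condition vector collapses into a single imm8. A hedged scalar sketch of that instruction form (the lane count, float lanes, and the condition-to-immediate polarity are all illustrative assumptions, not taken from the patch):

  #include <cstdio>

  // Scalar model of an immediate-controlled blend such as BLENDPS: bit i of
  // the immediate set means lane i comes from the second source, otherwise
  // from the first.
  static void blendi(const float *A, const float *B, unsigned Imm, float *Out,
                     int NumLanes) {
    for (int i = 0; i < NumLanes; ++i)
      Out[i] = (Imm >> i) & 1 ? B[i] : A[i];
  }

  int main() {
    float LHS[4] = {1, 2, 3, 4};
    float RHS[4] = {10, 20, 30, 40};

    // vselect <1,0,0,1>, LHS, RHS keeps LHS in lanes 0 and 3 and takes RHS in
    // lanes 1 and 2, i.e. an immediate of 0x6 under the polarity above.
    unsigned Imm = 0x6;
    float Out[4];
    blendi(LHS, RHS, Imm, Out, 4);
    for (float F : Out)
      printf("%g ", F);   // expect: 1 20 30 4
    printf("\n");
    return 0;
  }
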
+ if (ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(0).getNode()) && + ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(1).getNode()) && + ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(2).getNode())) + return SDValue(); + + SDValue BlendOp = lowerVSELECTtoBLENDI(Op, Subtarget, DAG); if (BlendOp.getNode()) return BlendOp; @@ -9688,6 +12409,8 @@ SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const { break; case MVT::v8i16: case MVT::v16i16: + if (Subtarget->hasBWI() && Subtarget->hasVLX()) + break; return SDValue(); } @@ -9906,59 +12629,6 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, return SDValue(); } -static SDValue LowerINSERT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) { - MVT VT = Op.getSimpleValueType(); - MVT EltVT = VT.getVectorElementType(); - SDLoc dl(Op); - - SDValue N0 = Op.getOperand(0); - SDValue N1 = Op.getOperand(1); - SDValue N2 = Op.getOperand(2); - - if (!VT.is128BitVector()) - return SDValue(); - - if ((EltVT.getSizeInBits() == 8 || EltVT.getSizeInBits() == 16) && - isa<ConstantSDNode>(N2)) { - unsigned Opc; - if (VT == MVT::v8i16) - Opc = X86ISD::PINSRW; - else if (VT == MVT::v16i8) - Opc = X86ISD::PINSRB; - else - Opc = X86ISD::PINSRB; - - // Transform it so it match pinsr{b,w} which expects a GR32 as its second - // argument. - if (N1.getValueType() != MVT::i32) - N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1); - if (N2.getValueType() != MVT::i32) - N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue()); - return DAG.getNode(Opc, dl, VT, N0, N1, N2); - } - - if (EltVT == MVT::f32 && isa<ConstantSDNode>(N2)) { - // Bits [7:6] of the constant are the source select. This will always be - // zero here. The DAG Combiner may combine an extract_elt index into these - // bits. For example (insert (extract, 3), 2) could be matched by putting - // the '3' into bits [7:6] of X86ISD::INSERTPS. - // Bits [5:4] of the constant are the destination select. This is the - // value of the incoming immediate. - // Bits [3:0] of the constant are the zero mask. The DAG Combiner may - // combine either bitwise AND or insert of float 0.0 to set these bits. - N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue() << 4); - // Create this as a scalar to vector.. - N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1); - return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2); - } - - if ((EltVT == MVT::i32 || EltVT == MVT::i64) && isa<ConstantSDNode>(N2)) { - // PINSR* works with constant index. - return Op; - } - return SDValue(); -} - /// Insert one bit to mask vector, like v16i1 or v8i1. /// AVX-512 feature. 
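Looking back at the INSERTPS handling folded into LowerINSERT_VECTOR_ELT above and below: the immediate packs three fields into one byte, and the lowering only ever fills the destination-lane field (IdxVal << 4). A small scalar model of the instruction's semantics:

  #include <cstdio>

  // Scalar model of INSERTPS: imm[7:6] picks the source lane of Src, imm[5:4]
  // the destination lane of Dst, and imm[3:0] zeroes result lanes afterwards.
  static void insertps(const float Dst[4], const float Src[4], unsigned Imm,
                       float Out[4]) {
    unsigned SrcLane = (Imm >> 6) & 3;
    unsigned DstLane = (Imm >> 4) & 3;
    unsigned ZMask = Imm & 0xF;
    for (int i = 0; i < 4; ++i)
      Out[i] = Dst[i];
    Out[DstLane] = Src[SrcLane];
    for (int i = 0; i < 4; ++i)
      if ((ZMask >> i) & 1)
        Out[i] = 0.0f;
  }

  int main() {
    float A[4] = {1, 2, 3, 4};
    float B[4] = {9, 9, 9, 9};

    // The insert-element path encodes only the destination lane, IdxVal << 4,
    // leaving the source select and zero mask clear (the scalar operand has
    // already been placed in lane 0 via SCALAR_TO_VECTOR).
    unsigned IdxVal = 2;
    float Out[4];
    insertps(A, B, IdxVal << 4, Out);
    for (float F : Out)
      printf("%g ", F);   // expect: 1 2 9 4
    printf("\n");
    return 0;
  }
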
SDValue @@ -9993,11 +12663,12 @@ X86TargetLowering::InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG) const { DAG.getConstant(MaxSift - IdxVal, MVT::i8)); return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec); } -SDValue -X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const { + +SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, + SelectionDAG &DAG) const { MVT VT = Op.getSimpleValueType(); MVT EltVT = VT.getVectorElementType(); - + if (EltVT == MVT::i1) return InsertBitToMaskVector(Op, DAG); @@ -10005,20 +12676,20 @@ X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const { SDValue N0 = Op.getOperand(0); SDValue N1 = Op.getOperand(1); SDValue N2 = Op.getOperand(2); + if (!isa<ConstantSDNode>(N2)) + return SDValue(); + auto *N2C = cast<ConstantSDNode>(N2); + unsigned IdxVal = N2C->getZExtValue(); - // If this is a 256-bit vector result, first extract the 128-bit vector, - // insert the element into the extracted half and then place it back. + // If the vector is wider than 128 bits, extract the 128-bit subvector, insert + // into that, and then insert the subvector back into the result. if (VT.is256BitVector() || VT.is512BitVector()) { - if (!isa<ConstantSDNode>(N2)) - return SDValue(); - // Get the desired 128-bit vector half. - unsigned IdxVal = cast<ConstantSDNode>(N2)->getZExtValue(); SDValue V = Extract128BitVector(N0, IdxVal, DAG, dl); // Insert the element into the desired half. - unsigned NumEltsIn128 = 128/EltVT.getSizeInBits(); - unsigned IdxIn128 = IdxVal - (IdxVal/NumEltsIn128) * NumEltsIn128; + unsigned NumEltsIn128 = 128 / EltVT.getSizeInBits(); + unsigned IdxIn128 = IdxVal - (IdxVal / NumEltsIn128) * NumEltsIn128; V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1, DAG.getConstant(IdxIn128, MVT::i32)); @@ -10026,20 +12697,60 @@ X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const { // Insert the changed part back to the 256-bit vector return Insert128BitVector(N0, V, IdxVal, DAG, dl); } + assert(VT.is128BitVector() && "Only 128-bit vector types should be left!"); - if (Subtarget->hasSSE41()) - return LowerINSERT_VECTOR_ELT_SSE4(Op, DAG); + if (Subtarget->hasSSE41()) { + if (EltVT.getSizeInBits() == 8 || EltVT.getSizeInBits() == 16) { + unsigned Opc; + if (VT == MVT::v8i16) { + Opc = X86ISD::PINSRW; + } else { + assert(VT == MVT::v16i8); + Opc = X86ISD::PINSRB; + } + + // Transform it so it match pinsr{b,w} which expects a GR32 as its second + // argument. + if (N1.getValueType() != MVT::i32) + N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1); + if (N2.getValueType() != MVT::i32) + N2 = DAG.getIntPtrConstant(IdxVal); + return DAG.getNode(Opc, dl, VT, N0, N1, N2); + } + + if (EltVT == MVT::f32) { + // Bits [7:6] of the constant are the source select. This will always be + // zero here. The DAG Combiner may combine an extract_elt index into + // these + // bits. For example (insert (extract, 3), 2) could be matched by + // putting + // the '3' into bits [7:6] of X86ISD::INSERTPS. + // Bits [5:4] of the constant are the destination select. This is the + // value of the incoming immediate. + // Bits [3:0] of the constant are the zero mask. The DAG Combiner may + // combine either bitwise AND or insert of float 0.0 to set these bits. + N2 = DAG.getIntPtrConstant(IdxVal << 4); + // Create this as a scalar to vector.. 
+ N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1); + return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2); + } + + if (EltVT == MVT::i32 || EltVT == MVT::i64) { + // PINSR* works with constant index. + return Op; + } + } if (EltVT == MVT::i8) return SDValue(); - if (EltVT.getSizeInBits() == 16 && isa<ConstantSDNode>(N2)) { + if (EltVT.getSizeInBits() == 16) { // Transform it so it match pinsrw which expects a 16-bit value in a GR32 // as its second argument. if (N1.getValueType() != MVT::i32) N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1); if (N2.getValueType() != MVT::i32) - N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue()); + N2 = DAG.getIntPtrConstant(IdxVal); return DAG.getNode(X86ISD::PINSRW, dl, VT, N0, N1, N2); } return SDValue(); @@ -10352,6 +13063,7 @@ GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA, // TLSADDR will be codegen'ed as call. Inform MFI that function has calls. MFI->setAdjustsStack(true); + MFI->setHasCalls(true); SDValue Flag = Chain.getValue(1); return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag); @@ -10585,7 +13297,7 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { if (Subtarget->is64Bit()) IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, getPointerTy(), Chain, IDX, MachinePointerInfo(), MVT::i32, - false, false, 0); + false, false, false, 0); else IDX = DAG.getLoad(getPointerTy(), dl, Chain, IDX, MachinePointerInfo(), false, false, false, 0); @@ -10669,10 +13381,18 @@ static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) { SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) const { MVT SrcVT = Op.getOperand(0).getSimpleValueType(); + SDLoc dl(Op); - if (SrcVT.isVector()) + if (SrcVT.isVector()) { + if (SrcVT.getVectorElementType() == MVT::i1) { + MVT IntegerVT = MVT::getVectorVT(MVT::i32, SrcVT.getVectorNumElements()); + return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), + DAG.getNode(ISD::SIGN_EXTEND, dl, IntegerVT, + Op.getOperand(0))); + } return SDValue(); - + } + assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 && "Unknown SINT_TO_FP to lower!"); @@ -10685,7 +13405,6 @@ SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, return Op; } - SDLoc dl(Op); unsigned Size = SrcVT.getSizeInBits()/8; MachineFunction &MF = DAG.getMachineFunction(); int SSFI = MF.getFrameInfo()->CreateStackObject(Size, Size, false); @@ -10872,19 +13591,135 @@ SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op, return Sub; } +static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { + // The algorithm is the following: + // #ifdef __SSE4_1__ + // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa); + // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16), + // (uint4) 0x53000000, 0xaa); + // #else + // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000; + // uint4 hi = (v >> 16) | (uint4) 0x53000000; + // #endif + // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f); + // return (float4) lo + fhi; + + SDLoc DL(Op); + SDValue V = Op->getOperand(0); + EVT VecIntVT = V.getValueType(); + bool Is128 = VecIntVT == MVT::v4i32; + EVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32; + // If we convert to something else than the supported type, e.g., to v4f64, + // abort early. 
+ if (VecFloatVT != Op->getValueType(0)) + return SDValue(); + + unsigned NumElts = VecIntVT.getVectorNumElements(); + assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) && + "Unsupported custom type"); + assert(NumElts <= 8 && "The size of the constant array must be fixed"); + + // In the #idef/#else code, we have in common: + // - The vector of constants: + // -- 0x4b000000 + // -- 0x53000000 + // - A shift: + // -- v >> 16 + + // Create the splat vector for 0x4b000000. + SDValue CstLow = DAG.getConstant(0x4b000000, MVT::i32); + SDValue CstLowArray[] = {CstLow, CstLow, CstLow, CstLow, + CstLow, CstLow, CstLow, CstLow}; + SDValue VecCstLow = DAG.getNode(ISD::BUILD_VECTOR, DL, VecIntVT, + makeArrayRef(&CstLowArray[0], NumElts)); + // Create the splat vector for 0x53000000. + SDValue CstHigh = DAG.getConstant(0x53000000, MVT::i32); + SDValue CstHighArray[] = {CstHigh, CstHigh, CstHigh, CstHigh, + CstHigh, CstHigh, CstHigh, CstHigh}; + SDValue VecCstHigh = DAG.getNode(ISD::BUILD_VECTOR, DL, VecIntVT, + makeArrayRef(&CstHighArray[0], NumElts)); + + // Create the right shift. + SDValue CstShift = DAG.getConstant(16, MVT::i32); + SDValue CstShiftArray[] = {CstShift, CstShift, CstShift, CstShift, + CstShift, CstShift, CstShift, CstShift}; + SDValue VecCstShift = DAG.getNode(ISD::BUILD_VECTOR, DL, VecIntVT, + makeArrayRef(&CstShiftArray[0], NumElts)); + SDValue HighShift = DAG.getNode(ISD::SRL, DL, VecIntVT, V, VecCstShift); + + SDValue Low, High; + if (Subtarget.hasSSE41()) { + EVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16; + // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa); + SDValue VecCstLowBitcast = + DAG.getNode(ISD::BITCAST, DL, VecI16VT, VecCstLow); + SDValue VecBitcast = DAG.getNode(ISD::BITCAST, DL, VecI16VT, V); + // Low will be bitcasted right away, so do not bother bitcasting back to its + // original type. + Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast, + VecCstLowBitcast, DAG.getConstant(0xaa, MVT::i32)); + // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16), + // (uint4) 0x53000000, 0xaa); + SDValue VecCstHighBitcast = + DAG.getNode(ISD::BITCAST, DL, VecI16VT, VecCstHigh); + SDValue VecShiftBitcast = + DAG.getNode(ISD::BITCAST, DL, VecI16VT, HighShift); + // High will be bitcasted right away, so do not bother bitcasting back to + // its original type. + High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast, + VecCstHighBitcast, DAG.getConstant(0xaa, MVT::i32)); + } else { + SDValue CstMask = DAG.getConstant(0xffff, MVT::i32); + SDValue VecCstMask = DAG.getNode(ISD::BUILD_VECTOR, DL, VecIntVT, CstMask, + CstMask, CstMask, CstMask); + // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000; + SDValue LowAnd = DAG.getNode(ISD::AND, DL, VecIntVT, V, VecCstMask); + Low = DAG.getNode(ISD::OR, DL, VecIntVT, LowAnd, VecCstLow); + + // uint4 hi = (v >> 16) | (uint4) 0x53000000; + High = DAG.getNode(ISD::OR, DL, VecIntVT, HighShift, VecCstHigh); + } + + // Create the vector constant for -(0x1.0p39f + 0x1.0p23f). 
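The vectorized algorithm above is easiest to sanity-check in scalar form. The sketch below mirrors the #else branch (mask and shift rather than the SSE4.1 blends) and shows why the bias built next, 0xD3000080, is exactly -(0x1.0p39f + 0x1.0p23f):

  #include <cstdint>
  #include <cstdio>
  #include <cstring>

  static float bitsToFloat(uint32_t B) { float F; std::memcpy(&F, &B, 4); return F; }

  // Scalar restatement: split v into 16-bit halves, graft each half onto a
  // power-of-two exponent so the integer lands exactly in the mantissa, then
  // let FP adds recombine them.
  static float uintToFloat(uint32_t V) {
    // lo = (v & 0xffff) | 0x4b000000  ->  2^23 + low16   (exact)
    float Lo = bitsToFloat((V & 0xFFFFu) | 0x4B000000u);
    // hi = (v >> 16)    | 0x53000000  ->  2^39 + high16 * 2^16   (exact)
    float Hi = bitsToFloat((V >> 16) | 0x53000000u);
    // fhi = hi - (2^39 + 2^23); the biases cancel against lo's 2^23, so
    // lo + fhi rounds to (float)v.
    float FHi = Hi + bitsToFloat(0xD3000080u);
    return Lo + FHi;
  }

  int main() {
    uint32_t Tests[] = {0u, 1u, 0xFFFFu, 0x12345678u, 0xFFFFFFFFu};
    for (uint32_t V : Tests)
      printf("%u -> %.1f (reference %.1f)\n", V, uintToFloat(V), (float)V);
    return 0;
  }
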
+ SDValue CstFAdd = DAG.getConstantFP( + APFloat(APFloat::IEEEsingle, APInt(32, 0xD3000080)), MVT::f32); + SDValue CstFAddArray[] = {CstFAdd, CstFAdd, CstFAdd, CstFAdd, + CstFAdd, CstFAdd, CstFAdd, CstFAdd}; + SDValue VecCstFAdd = DAG.getNode(ISD::BUILD_VECTOR, DL, VecFloatVT, + makeArrayRef(&CstFAddArray[0], NumElts)); + + // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f); + SDValue HighBitcast = DAG.getNode(ISD::BITCAST, DL, VecFloatVT, High); + SDValue FHigh = + DAG.getNode(ISD::FADD, DL, VecFloatVT, HighBitcast, VecCstFAdd); + // return (float4) lo + fhi; + SDValue LowBitcast = DAG.getNode(ISD::BITCAST, DL, VecFloatVT, Low); + return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh); +} + SDValue X86TargetLowering::lowerUINT_TO_FP_vec(SDValue Op, SelectionDAG &DAG) const { SDValue N0 = Op.getOperand(0); MVT SVT = N0.getSimpleValueType(); SDLoc dl(Op); - assert((SVT == MVT::v4i8 || SVT == MVT::v4i16 || - SVT == MVT::v8i8 || SVT == MVT::v8i16) && - "Custom UINT_TO_FP is not supported!"); - - MVT NVT = MVT::getVectorVT(MVT::i32, SVT.getVectorNumElements()); - return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), - DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, N0)); + switch (SVT.SimpleTy) { + default: + llvm_unreachable("Custom UINT_TO_FP is not supported!"); + case MVT::v4i8: + case MVT::v4i16: + case MVT::v8i8: + case MVT::v8i16: { + MVT NVT = MVT::getVectorVT(MVT::i32, SVT.getVectorNumElements()); + return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), + DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, N0)); + } + case MVT::v4i32: + case MVT::v8i32: + return lowerUINT_TO_FP_vXi32(Op, DAG, *Subtarget); + } + llvm_unreachable(nullptr); } SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, @@ -10970,7 +13805,7 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, // FIXME: Avoid the extend by constructing the right constant pool? SDValue Fudge = DAG.getExtLoad(ISD::EXTLOAD, dl, MVT::f80, DAG.getEntryNode(), FudgePtr, MachinePointerInfo::getConstantPool(), - MVT::f32, false, false, 4); + MVT::f32, false, false, false, 4); // Extend everything to 80 bits to force it to be done on x87. SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge); return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add, DAG.getIntPtrConstant(0)); @@ -11184,12 +14019,9 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { if (VT == MVT::i1) { assert((InVT.isInteger() && (InVT.getSizeInBits() <= 64)) && "Invalid scalar TRUNCATE operation"); - if (InVT == MVT::i32) + if (InVT.getSizeInBits() >= 32) return SDValue(); - if (InVT.getSizeInBits() == 64) - In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::i32, In); - else if (InVT.getSizeInBits() < 32) - In = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, In); + In = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, In); return DAG.getNode(ISD::TRUNCATE, DL, VT, In); } assert(VT.getVectorNumElements() == InVT.getVectorNumElements() && @@ -11367,58 +14199,47 @@ static SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) { In, DAG.getUNDEF(SVT))); } -static SDValue LowerFABS(SDValue Op, SelectionDAG &DAG) { - LLVMContext *Context = DAG.getContext(); - SDLoc dl(Op); - MVT VT = Op.getSimpleValueType(); - MVT EltVT = VT; - unsigned NumElts = VT == MVT::f64 ? 
2 : 4; - if (VT.isVector()) { - EltVT = VT.getVectorElementType(); - NumElts = VT.getVectorNumElements(); - } - Constant *C; - if (EltVT == MVT::f64) - C = ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble, - APInt(64, ~(1ULL << 63)))); - else - C = ConstantFP::get(*Context, APFloat(APFloat::IEEEsingle, - APInt(32, ~(1U << 31)))); - C = ConstantVector::getSplat(NumElts, C); - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - SDValue CPIdx = DAG.getConstantPool(C, TLI.getPointerTy()); - unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment(); - SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, - MachinePointerInfo::getConstantPool(), - false, false, false, Alignment); - if (VT.isVector()) { - MVT ANDVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64; - return DAG.getNode(ISD::BITCAST, dl, VT, - DAG.getNode(ISD::AND, dl, ANDVT, - DAG.getNode(ISD::BITCAST, dl, ANDVT, - Op.getOperand(0)), - DAG.getNode(ISD::BITCAST, dl, ANDVT, Mask))); - } - return DAG.getNode(X86ISD::FAND, dl, VT, Op.getOperand(0), Mask); -} +/// The only differences between FABS and FNEG are the mask and the logic op. +/// FNEG also has a folding opportunity for FNEG(FABS(x)). +static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) { + assert((Op.getOpcode() == ISD::FABS || Op.getOpcode() == ISD::FNEG) && + "Wrong opcode for lowering FABS or FNEG."); + + bool IsFABS = (Op.getOpcode() == ISD::FABS); + + // If this is a FABS and it has an FNEG user, bail out to fold the combination + // into an FNABS. We'll lower the FABS after that if it is still in use. + if (IsFABS) + for (SDNode *User : Op->uses()) + if (User->getOpcode() == ISD::FNEG) + return Op; + + SDValue Op0 = Op.getOperand(0); + bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS); -static SDValue LowerFNEG(SDValue Op, SelectionDAG &DAG) { - LLVMContext *Context = DAG.getContext(); SDLoc dl(Op); MVT VT = Op.getSimpleValueType(); + // Assume scalar op for initialization; update for vector if needed. + // Note that there are no scalar bitwise logical SSE/AVX instructions, so we + // generate a 16-byte vector constant and logic op even for the scalar case. + // Using a 16-byte mask allows folding the load of the mask with + // the logic op, so it can save (~4 bytes) on code size. MVT EltVT = VT; unsigned NumElts = VT == MVT::f64 ? 2 : 4; + // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOpt::Level to + // decide if we should generate a 16-byte constant mask when we only need 4 or + // 8 bytes for the scalar case. if (VT.isVector()) { EltVT = VT.getVectorElementType(); NumElts = VT.getVectorNumElements(); } - Constant *C; - if (EltVT == MVT::f64) - C = ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble, - APInt(64, 1ULL << 63))); - else - C = ConstantFP::get(*Context, APFloat(APFloat::IEEEsingle, - APInt(32, 1U << 31))); + + unsigned EltBits = EltVT.getSizeInBits(); + LLVMContext *Context = DAG.getContext(); + // For FABS, mask is 0x7f...; for FNEG, mask is 0x80... + APInt MaskElt = + IsFABS ? 
APInt::getSignedMaxValue(EltBits) : APInt::getSignBit(EltBits); + Constant *C = ConstantInt::get(*Context, MaskElt); C = ConstantVector::getSplat(NumElts, C); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); SDValue CPIdx = DAG.getConstantPool(C, TLI.getPointerTy()); @@ -11426,16 +14247,24 @@ static SDValue LowerFNEG(SDValue Op, SelectionDAG &DAG) { SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, MachinePointerInfo::getConstantPool(), false, false, false, Alignment); + if (VT.isVector()) { - MVT XORVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits()/64); + // For a vector, cast operands to a vector type, perform the logic op, + // and cast the result back to the original value type. + MVT VecVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64); + SDValue MaskCasted = DAG.getNode(ISD::BITCAST, dl, VecVT, Mask); + SDValue Operand = IsFNABS ? + DAG.getNode(ISD::BITCAST, dl, VecVT, Op0.getOperand(0)) : + DAG.getNode(ISD::BITCAST, dl, VecVT, Op0); + unsigned BitOp = IsFABS ? ISD::AND : IsFNABS ? ISD::OR : ISD::XOR; return DAG.getNode(ISD::BITCAST, dl, VT, - DAG.getNode(ISD::XOR, dl, XORVT, - DAG.getNode(ISD::BITCAST, dl, XORVT, - Op.getOperand(0)), - DAG.getNode(ISD::BITCAST, dl, XORVT, Mask))); + DAG.getNode(BitOp, dl, VecVT, Operand, MaskCasted)); } - - return DAG.getNode(X86ISD::FXOR, dl, VT, Op.getOperand(0), Mask); + + // If not vector, then scalar. + unsigned BitOp = IsFABS ? X86ISD::FAND : IsFNABS ? X86ISD::FOR : X86ISD::FXOR; + SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0; + return DAG.getNode(BitOp, dl, VT, Operand, Mask); } static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) { @@ -11529,8 +14358,7 @@ static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) { return DAG.getNode(ISD::AND, dl, VT, xFGETSIGN, DAG.getConstant(1, VT)); } -// LowerVectorAllZeroTest - Check whether an OR'd tree is PTEST-able. -// +// Check whether an OR'd tree is PTEST-able. static SDValue LowerVectorAllZeroTest(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG) { assert(Op.getOpcode() == ISD::OR && "Only check OR'd tree."); @@ -11938,6 +14766,66 @@ SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp, return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, TruncSrl); } +/// The minimum architected relative accuracy is 2^-12. We need one +/// Newton-Raphson step to have a good float result (24 bits of precision). +SDValue X86TargetLowering::getRsqrtEstimate(SDValue Op, + DAGCombinerInfo &DCI, + unsigned &RefinementSteps, + bool &UseOneConstNR) const { + // FIXME: We should use instruction latency models to calculate the cost of + // each potential sequence, but this is very hard to do reliably because + // at least Intel's Core* chips have variable timing based on the number of + // significant digits in the divisor and/or sqrt operand. + if (!Subtarget->useSqrtEst()) + return SDValue(); + + EVT VT = Op.getValueType(); + + // SSE1 has rsqrtss and rsqrtps. + // TODO: Add support for AVX512 (v16f32). + // It is likely not profitable to do this for f64 because a double-precision + // rsqrt estimate with refinement on x86 prior to FMA requires at least 16 + // instructions: convert to single, rsqrtss, convert back to double, refine + // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA + // along with FMA, this could be a throughput win. 
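// [Editor's note; illustration only, not LLVM code.] The doc comment above relies
// on the standard Newton-Raphson refinement: starting from an estimate with about
// 2^-12 relative error (what rsqrtss/rcpss architecturally guarantee), one step
// roughly squares the accuracy, which is enough for a 24-bit float result. A
// minimal scalar sketch of the two refinement formulas (function names are ours):
#include <cassert>
#include <cmath>

// One Newton-Raphson step for y ~= 1/sqrt(a):  y' = y * (1.5 - 0.5 * a * y * y)
static float refineRsqrt(float A, float Est) {
  return Est * (1.5f - 0.5f * A * Est * Est);
}

// One Newton-Raphson step for y ~= 1/a:  y' = y * (2 - a * y)
static float refineRcp(float A, float Est) {
  return Est * (2.0f - A * Est);
}

static void demoOneRefinementStep() {
  float A = 3.0f;
  // Fake ~12-bit-accurate estimates instead of calling rsqrtss/rcpss directly.
  float RsqrtEst = (1.0f / std::sqrt(A)) * (1.0f + 1.0f / 4096.0f);
  float RcpEst = (1.0f / A) * (1.0f - 1.0f / 4096.0f);
  // After one step the relative error drops to roughly the square of 2^-12.
  assert(std::fabs(refineRsqrt(A, RsqrtEst) * std::sqrt(A) - 1.0f) < 1e-6f);
  assert(std::fabs(refineRcp(A, RcpEst) * A - 1.0f) < 1e-6f);
}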
+ if ((Subtarget->hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) || + (Subtarget->hasAVX() && VT == MVT::v8f32)) { + RefinementSteps = 1; + UseOneConstNR = false; + return DCI.DAG.getNode(X86ISD::FRSQRT, SDLoc(Op), VT, Op); + } + return SDValue(); +} + +/// The minimum architected relative accuracy is 2^-12. We need one +/// Newton-Raphson step to have a good float result (24 bits of precision). +SDValue X86TargetLowering::getRecipEstimate(SDValue Op, + DAGCombinerInfo &DCI, + unsigned &RefinementSteps) const { + // FIXME: We should use instruction latency models to calculate the cost of + // each potential sequence, but this is very hard to do reliably because + // at least Intel's Core* chips have variable timing based on the number of + // significant digits in the divisor. + if (!Subtarget->useReciprocalEst()) + return SDValue(); + + EVT VT = Op.getValueType(); + + // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps. + // TODO: Add support for AVX512 (v16f32). + // It is likely not profitable to do this for f64 because a double-precision + // reciprocal estimate with refinement on x86 prior to FMA requires + // 15 instructions: convert to single, rcpss, convert back to double, refine + // (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA + // along with FMA, this could be a throughput win. + if ((Subtarget->hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) || + (Subtarget->hasAVX() && VT == MVT::v8f32)) { + RefinementSteps = ReciprocalEstimateRefinementSteps; + return DCI.DAG.getNode(X86ISD::FRCP, SDLoc(Op), VT, Op); + } + return SDValue(); +} + static bool isAllOnes(SDValue V) { ConstantSDNode *C = dyn_cast<ConstantSDNode>(V); return C && C->isAllOnesValue(); @@ -12097,7 +14985,7 @@ static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG, MVT VT = Op.getSimpleValueType(); SDLoc dl(Op); - assert(Op0.getValueType().getVectorElementType().getSizeInBits() >= 32 && + assert(Op0.getValueType().getVectorElementType().getSizeInBits() >= 8 && Op.getValueType().getScalarType() == MVT::i1 && "Cannot set masked compare for this operation"); @@ -12211,11 +15099,12 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget, EVT OpVT = Op1.getValueType(); if (Subtarget->hasAVX512()) { if (Op1.getValueType().is512BitVector() || + (Subtarget->hasBWI() && Subtarget->hasVLX()) || (MaskResult && OpVT.getVectorElementType().getSizeInBits() >= 32)) return LowerIntVSETCC_AVX512(Op, DAG, Subtarget); // In AVX-512 architecture setcc returns mask with i1 elements, - // But there is no compare instruction for i8 and i16 elements. + // But there is no compare instruction for i8 and i16 elements in KNL. // We are not talking about 512-bit operands in this case, these // types are illegal. 
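// [Editor's illustration; not part of the patch.] "setcc returns mask with i1
// elements" above means an AVX-512 compare writes one bit per lane into a mask
// (k) register rather than a lane-wide 0/-1 value. A scalar model of that
// semantics for up to 32 lanes (names are ours):
#include <cstddef>
#include <cstdint>
#include <vector>

// Compare two vectors lane by lane and pack the results into a bitmask;
// bit I is set iff A[I] > B[I] (a v16i32 compare would yield a 16-bit mask).
static uint32_t cmpGtMask(const std::vector<int32_t> &A,
                          const std::vector<int32_t> &B) {
  uint32_t Mask = 0;
  for (std::size_t I = 0; I < A.size() && I < B.size(); ++I)
    if (A[I] > B[I])
      Mask |= 1u << I;
  return Mask;
}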
if (MaskResult && @@ -12721,18 +15610,40 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { return DAG.getNode(X86ISD::CMOV, DL, VTs, Ops); } -static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op, SelectionDAG &DAG) { +static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { MVT VT = Op->getSimpleValueType(0); SDValue In = Op->getOperand(0); MVT InVT = In.getSimpleValueType(); + MVT VTElt = VT.getVectorElementType(); + MVT InVTElt = InVT.getVectorElementType(); SDLoc dl(Op); + // SKX processor + if ((InVTElt == MVT::i1) && + (((Subtarget->hasBWI() && Subtarget->hasVLX() && + VT.getSizeInBits() <= 256 && VTElt.getSizeInBits() <= 16)) || + + ((Subtarget->hasBWI() && VT.is512BitVector() && + VTElt.getSizeInBits() <= 16)) || + + ((Subtarget->hasDQI() && Subtarget->hasVLX() && + VT.getSizeInBits() <= 256 && VTElt.getSizeInBits() >= 32)) || + + ((Subtarget->hasDQI() && VT.is512BitVector() && + VTElt.getSizeInBits() >= 32)))) + return DAG.getNode(X86ISD::VSEXT, dl, VT, In); + unsigned int NumElts = VT.getVectorNumElements(); + if (NumElts != 8 && NumElts != 16) return SDValue(); - if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1) + if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1) { + if (In.getOpcode() == X86ISD::VSEXT || In.getOpcode() == X86ISD::VZEXT) + return DAG.getNode(In.getOpcode(), dl, VT, In.getOperand(0)); return DAG.getNode(X86ISD::VSEXT, dl, VT, In); + } const TargetLowering &TLI = DAG.getTargetLoweringInfo(); assert (InVT.getVectorElementType() == MVT::i1 && "Unexpected vector type"); @@ -12760,7 +15671,7 @@ static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget *Subtarget, SDLoc dl(Op); if (VT.is512BitVector() || InVT.getVectorElementType() == MVT::i1) - return LowerSIGN_EXTEND_AVX512(Op, DAG); + return LowerSIGN_EXTEND_AVX512(Op, Subtarget, DAG); if ((VT != MVT::v4i64 || InVT != MVT::v4i32) && (VT != MVT::v8i32 || InVT != MVT::v8i16) && @@ -12803,6 +15714,210 @@ static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget *Subtarget, return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi); } +// Lower vector extended loads using a shuffle. If SSSE3 is not available we +// may emit an illegal shuffle but the expansion is still better than scalar +// code. We generate X86ISD::VSEXT for SEXTLOADs if it's available, otherwise +// we'll emit a shuffle and a arithmetic shift. +// TODO: It is possible to support ZExt by zeroing the undef values during +// the shuffle phase or after the shuffle. +static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + MVT RegVT = Op.getSimpleValueType(); + assert(RegVT.isVector() && "We only custom lower vector sext loads."); + assert(RegVT.isInteger() && + "We only custom lower integer vector sext loads."); + + // Nothing useful we can do without SSE2 shuffles. 
+ assert(Subtarget->hasSSE2() && "We only custom lower sext loads with SSE2."); + + LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode()); + SDLoc dl(Ld); + EVT MemVT = Ld->getMemoryVT(); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + unsigned RegSz = RegVT.getSizeInBits(); + + ISD::LoadExtType Ext = Ld->getExtensionType(); + + assert((Ext == ISD::EXTLOAD || Ext == ISD::SEXTLOAD) + && "Only anyext and sext are currently implemented."); + assert(MemVT != RegVT && "Cannot extend to the same type"); + assert(MemVT.isVector() && "Must load a vector from memory"); + + unsigned NumElems = RegVT.getVectorNumElements(); + unsigned MemSz = MemVT.getSizeInBits(); + assert(RegSz > MemSz && "Register size must be greater than the mem size"); + + if (Ext == ISD::SEXTLOAD && RegSz == 256 && !Subtarget->hasInt256()) { + // The only way in which we have a legal 256-bit vector result but not the + // integer 256-bit operations needed to directly lower a sextload is if we + // have AVX1 but not AVX2. In that case, we can always emit a sextload to + // a 128-bit vector and a normal sign_extend to 256-bits that should get + // correctly legalized. We do this late to allow the canonical form of + // sextload to persist throughout the rest of the DAG combiner -- it wants + // to fold together any extensions it can, and so will fuse a sign_extend + // of an sextload into a sextload targeting a wider value. + SDValue Load; + if (MemSz == 128) { + // Just switch this to a normal load. + assert(TLI.isTypeLegal(MemVT) && "If the memory type is a 128-bit type, " + "it must be a legal 128-bit vector " + "type!"); + Load = DAG.getLoad(MemVT, dl, Ld->getChain(), Ld->getBasePtr(), + Ld->getPointerInfo(), Ld->isVolatile(), Ld->isNonTemporal(), + Ld->isInvariant(), Ld->getAlignment()); + } else { + assert(MemSz < 128 && + "Can't extend a type wider than 128 bits to a 256 bit vector!"); + // Do an sext load to a 128-bit vector type. We want to use the same + // number of elements, but elements half as wide. This will end up being + // recursively lowered by this routine, but will succeed as we definitely + // have all the necessary features if we're using AVX1. + EVT HalfEltVT = + EVT::getIntegerVT(*DAG.getContext(), RegVT.getScalarSizeInBits() / 2); + EVT HalfVecVT = EVT::getVectorVT(*DAG.getContext(), HalfEltVT, NumElems); + Load = + DAG.getExtLoad(Ext, dl, HalfVecVT, Ld->getChain(), Ld->getBasePtr(), + Ld->getPointerInfo(), MemVT, Ld->isVolatile(), + Ld->isNonTemporal(), Ld->isInvariant(), + Ld->getAlignment()); + } + + // Replace chain users with the new chain. + assert(Load->getNumValues() == 2 && "Loads must carry a chain!"); + DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1)); + + // Finally, do a normal sign-extend to the desired register. + return DAG.getSExtOrTrunc(Load, dl, RegVT); + } + + // All sizes must be a power of two. + assert(isPowerOf2_32(RegSz * MemSz * NumElems) && + "Non-power-of-two elements are not custom lowered!"); + + // Attempt to load the original value using scalar loads. + // Find the largest scalar type that divides the total loaded size. + MVT SclrLoadTy = MVT::i8; + for (unsigned tp = MVT::FIRST_INTEGER_VALUETYPE; + tp < MVT::LAST_INTEGER_VALUETYPE; ++tp) { + MVT Tp = (MVT::SimpleValueType)tp; + if (TLI.isTypeLegal(Tp) && ((MemSz % Tp.getSizeInBits()) == 0)) { + SclrLoadTy = Tp; + } + } + + // On 32bit systems, we can't save 64bit integers. Try bitcasting to F64. 
+ if (TLI.isTypeLegal(MVT::f64) && SclrLoadTy.getSizeInBits() < 64 && + (64 <= MemSz)) + SclrLoadTy = MVT::f64; + + // Calculate the number of scalar loads that we need to perform + // in order to load our vector from memory. + unsigned NumLoads = MemSz / SclrLoadTy.getSizeInBits(); + + assert((Ext != ISD::SEXTLOAD || NumLoads == 1) && + "Can only lower sext loads with a single scalar load!"); + + unsigned loadRegZize = RegSz; + if (Ext == ISD::SEXTLOAD && RegSz == 256) + loadRegZize /= 2; + + // Represent our vector as a sequence of elements which are the + // largest scalar that we can load. + EVT LoadUnitVecVT = EVT::getVectorVT( + *DAG.getContext(), SclrLoadTy, loadRegZize / SclrLoadTy.getSizeInBits()); + + // Represent the data using the same element type that is stored in + // memory. In practice, we ''widen'' MemVT. + EVT WideVecVT = + EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(), + loadRegZize / MemVT.getScalarType().getSizeInBits()); + + assert(WideVecVT.getSizeInBits() == LoadUnitVecVT.getSizeInBits() && + "Invalid vector type"); + + // We can't shuffle using an illegal type. + assert(TLI.isTypeLegal(WideVecVT) && + "We only lower types that form legal widened vector types"); + + SmallVector<SDValue, 8> Chains; + SDValue Ptr = Ld->getBasePtr(); + SDValue Increment = + DAG.getConstant(SclrLoadTy.getSizeInBits() / 8, TLI.getPointerTy()); + SDValue Res = DAG.getUNDEF(LoadUnitVecVT); + + for (unsigned i = 0; i < NumLoads; ++i) { + // Perform a single load. + SDValue ScalarLoad = + DAG.getLoad(SclrLoadTy, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(), + Ld->isVolatile(), Ld->isNonTemporal(), Ld->isInvariant(), + Ld->getAlignment()); + Chains.push_back(ScalarLoad.getValue(1)); + // Create the first element type using SCALAR_TO_VECTOR in order to avoid + // another round of DAGCombining. + if (i == 0) + Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LoadUnitVecVT, ScalarLoad); + else + Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, LoadUnitVecVT, Res, + ScalarLoad, DAG.getIntPtrConstant(i)); + + Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment); + } + + SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains); + + // Bitcast the loaded value to a vector of the original element type, in + // the size of the target vector type. + SDValue SlicedVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Res); + unsigned SizeRatio = RegSz / MemSz; + + if (Ext == ISD::SEXTLOAD) { + // If we have SSE4.1, we can directly emit a VSEXT node. + if (Subtarget->hasSSE41()) { + SDValue Sext = DAG.getNode(X86ISD::VSEXT, dl, RegVT, SlicedVec); + DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF); + return Sext; + } + + // Otherwise we'll shuffle the small elements in the high bits of the + // larger type and perform an arithmetic shift. If the shift is not legal + // it's better to scalarize. + assert(TLI.isOperationLegalOrCustom(ISD::SRA, RegVT) && + "We can't implement a sext load without an arithmetic right shift!"); + + // Redistribute the loaded elements into the different locations. + SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1); + for (unsigned i = 0; i != NumElems; ++i) + ShuffleVec[i * SizeRatio + SizeRatio - 1] = i; + + SDValue Shuff = DAG.getVectorShuffle( + WideVecVT, dl, SlicedVec, DAG.getUNDEF(WideVecVT), &ShuffleVec[0]); + + Shuff = DAG.getNode(ISD::BITCAST, dl, RegVT, Shuff); + + // Build the arithmetic shift. 
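// [Editor's sketch; not LLVM code.] The shuffle above places each narrow element
// in the topmost bits of its wide lane, so the arithmetic right shift by
// (RegEltBits - MemEltBits) that follows produces a sign-extended value. The
// per-lane arithmetic for an i8 -> i32 sextload is just the following (assuming
// two's-complement conversion and an arithmetic >> on signed values, which holds
// for the x86 targets this lowering serves):
#include <cassert>
#include <cstdint>

static int32_t sextI8ToI32ViaShifts(uint8_t NarrowElt) {
  uint32_t Widened = static_cast<uint32_t>(NarrowElt) << 24; // the shuffle step
  return static_cast<int32_t>(Widened) >> 24;                // the VSRAI step
}

static void testSextViaShifts() {
  assert(sextI8ToI32ViaShifts(0x7f) == 127);
  assert(sextI8ToI32ViaShifts(0x80) == -128);
  assert(sextI8ToI32ViaShifts(0xff) == -1);
}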
+ unsigned Amt = RegVT.getVectorElementType().getSizeInBits() - + MemVT.getVectorElementType().getSizeInBits(); + Shuff = + DAG.getNode(ISD::SRA, dl, RegVT, Shuff, DAG.getConstant(Amt, RegVT)); + + DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF); + return Shuff; + } + + // Redistribute the loaded elements into the different locations. + SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1); + for (unsigned i = 0; i != NumElems; ++i) + ShuffleVec[i * SizeRatio] = i; + + SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec, + DAG.getUNDEF(WideVecVT), &ShuffleVec[0]); + + // Bitcast to the requested type. + Shuff = DAG.getNode(ISD::BITCAST, dl, RegVT, Shuff); + DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF); + return Shuff; +} + // isAndOrOfSingleUseSetCCs - Return true if node is an ISD::AND or // ISD::OR of two X86ISD::SETCC nodes each of which has no other use apart // from the AND / OR. @@ -13108,7 +16223,7 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { } // Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets. -// Calls to _alloca is needed to probe the stack when allocating more than 4k +// Calls to _alloca are needed to probe the stack when allocating more than 4k // bytes in one go. Touching the stack at 4K increments is necessary to ensure // that the guard pages used by the OS virtual memory manager are allocated in // correct sequence. @@ -13143,7 +16258,7 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT); Chain = SP.getValue(1); unsigned Align = cast<ConstantSDNode>(Tmp3)->getZExtValue(); - const TargetFrameLowering &TFI = *DAG.getTarget().getFrameLowering(); + const TargetFrameLowering &TFI = *DAG.getSubtarget().getFrameLowering(); unsigned StackAlign = TFI.getStackAlignment(); Tmp1 = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value if (Align > StackAlign) @@ -13166,7 +16281,7 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, EVT VT = Op.getNode()->getValueType(0); bool Is64Bit = Subtarget->is64Bit(); - EVT SPTy = Is64Bit ? MVT::i64 : MVT::i32; + EVT SPTy = getPointerTy(); if (SplitStack) { MachineRegisterInfo &MRI = MF.getRegInfo(); @@ -13184,7 +16299,7 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, } const TargetRegisterClass *AddrRegClass = - getRegClassFor(Subtarget->is64Bit() ? MVT::i64:MVT::i32); + getRegClassFor(getPointerTy()); unsigned Vreg = MRI.createVirtualRegister(AddrRegClass); Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size); SDValue Value = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain, @@ -13193,7 +16308,7 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, return DAG.getMergeValues(Ops1, dl); } else { SDValue Flag; - unsigned Reg = (Subtarget->is64Bit() ? X86::RAX : X86::EAX); + const unsigned Reg = (Subtarget->isTarget64BitLP64() ? 
X86::RAX : X86::EAX); Chain = DAG.getCopyToReg(Chain, dl, Reg, Size, Flag); Flag = Chain.getValue(1); @@ -13201,8 +16316,8 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Flag); - const X86RegisterInfo *RegInfo = - static_cast<const X86RegisterInfo*>(DAG.getTarget().getRegisterInfo()); + const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>( + DAG.getSubtarget().getRegisterInfo()); unsigned SPReg = RegInfo->getStackRegister(); SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy); Chain = SP.getValue(1); @@ -13475,112 +16590,178 @@ static SDValue getTargetVShiftNode(unsigned Opc, SDLoc dl, MVT VT, return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt); } -static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) { - SDLoc dl(Op); - unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); - switch (IntNo) { - default: return SDValue(); // Don't custom lower most intrinsics. - // Comparison intrinsics. - case Intrinsic::x86_sse_comieq_ss: - case Intrinsic::x86_sse_comilt_ss: - case Intrinsic::x86_sse_comile_ss: - case Intrinsic::x86_sse_comigt_ss: - case Intrinsic::x86_sse_comige_ss: - case Intrinsic::x86_sse_comineq_ss: - case Intrinsic::x86_sse_ucomieq_ss: - case Intrinsic::x86_sse_ucomilt_ss: - case Intrinsic::x86_sse_ucomile_ss: - case Intrinsic::x86_sse_ucomigt_ss: - case Intrinsic::x86_sse_ucomige_ss: - case Intrinsic::x86_sse_ucomineq_ss: - case Intrinsic::x86_sse2_comieq_sd: - case Intrinsic::x86_sse2_comilt_sd: - case Intrinsic::x86_sse2_comile_sd: - case Intrinsic::x86_sse2_comigt_sd: - case Intrinsic::x86_sse2_comige_sd: - case Intrinsic::x86_sse2_comineq_sd: - case Intrinsic::x86_sse2_ucomieq_sd: - case Intrinsic::x86_sse2_ucomilt_sd: - case Intrinsic::x86_sse2_ucomile_sd: - case Intrinsic::x86_sse2_ucomigt_sd: - case Intrinsic::x86_sse2_ucomige_sd: - case Intrinsic::x86_sse2_ucomineq_sd: { - unsigned Opc; - ISD::CondCode CC; +/// \brief Return (and \p Op, \p Mask) for compare instructions or +/// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the +/// necessary casting for \p Mask when lowering masking intrinsics. +static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask, + SDValue PreservedSrc, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + EVT VT = Op.getValueType(); + EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), + MVT::i1, VT.getVectorNumElements()); + EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, + Mask.getValueType().getSizeInBits()); + SDLoc dl(Op); + + assert(MaskVT.isSimple() && "invalid mask type"); + + if (isAllOnes(Mask)) + return Op; + + // In case when MaskVT equals v2i1 or v4i1, low 2 or 4 elements + // are extracted by EXTRACT_SUBVECTOR. + SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT, + DAG.getNode(ISD::BITCAST, dl, BitcastVT, Mask), + DAG.getIntPtrConstant(0)); + + switch (Op.getOpcode()) { + default: break; + case X86ISD::PCMPEQM: + case X86ISD::PCMPGTM: + case X86ISD::CMPM: + case X86ISD::CMPMU: + return DAG.getNode(ISD::AND, dl, VT, Op, VMask); + } + if (PreservedSrc.getOpcode() == ISD::UNDEF) + PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl); + return DAG.getNode(ISD::VSELECT, dl, VT, VMask, Op, PreservedSrc); +} + +static unsigned getOpcodeForFMAIntrinsic(unsigned IntNo) { switch (IntNo) { default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. 
- case Intrinsic::x86_sse_comieq_ss: - case Intrinsic::x86_sse2_comieq_sd: - Opc = X86ISD::COMI; - CC = ISD::SETEQ; - break; - case Intrinsic::x86_sse_comilt_ss: - case Intrinsic::x86_sse2_comilt_sd: - Opc = X86ISD::COMI; - CC = ISD::SETLT; - break; - case Intrinsic::x86_sse_comile_ss: - case Intrinsic::x86_sse2_comile_sd: - Opc = X86ISD::COMI; - CC = ISD::SETLE; - break; - case Intrinsic::x86_sse_comigt_ss: - case Intrinsic::x86_sse2_comigt_sd: - Opc = X86ISD::COMI; - CC = ISD::SETGT; - break; - case Intrinsic::x86_sse_comige_ss: - case Intrinsic::x86_sse2_comige_sd: - Opc = X86ISD::COMI; - CC = ISD::SETGE; - break; - case Intrinsic::x86_sse_comineq_ss: - case Intrinsic::x86_sse2_comineq_sd: - Opc = X86ISD::COMI; - CC = ISD::SETNE; - break; - case Intrinsic::x86_sse_ucomieq_ss: - case Intrinsic::x86_sse2_ucomieq_sd: - Opc = X86ISD::UCOMI; - CC = ISD::SETEQ; - break; - case Intrinsic::x86_sse_ucomilt_ss: - case Intrinsic::x86_sse2_ucomilt_sd: - Opc = X86ISD::UCOMI; - CC = ISD::SETLT; - break; - case Intrinsic::x86_sse_ucomile_ss: - case Intrinsic::x86_sse2_ucomile_sd: - Opc = X86ISD::UCOMI; - CC = ISD::SETLE; - break; - case Intrinsic::x86_sse_ucomigt_ss: - case Intrinsic::x86_sse2_ucomigt_sd: - Opc = X86ISD::UCOMI; - CC = ISD::SETGT; - break; - case Intrinsic::x86_sse_ucomige_ss: - case Intrinsic::x86_sse2_ucomige_sd: - Opc = X86ISD::UCOMI; - CC = ISD::SETGE; - break; - case Intrinsic::x86_sse_ucomineq_ss: - case Intrinsic::x86_sse2_ucomineq_sd: - Opc = X86ISD::UCOMI; - CC = ISD::SETNE; - break; + case Intrinsic::x86_fma_vfmadd_ps: + case Intrinsic::x86_fma_vfmadd_pd: + case Intrinsic::x86_fma_vfmadd_ps_256: + case Intrinsic::x86_fma_vfmadd_pd_256: + case Intrinsic::x86_fma_mask_vfmadd_ps_512: + case Intrinsic::x86_fma_mask_vfmadd_pd_512: + return X86ISD::FMADD; + case Intrinsic::x86_fma_vfmsub_ps: + case Intrinsic::x86_fma_vfmsub_pd: + case Intrinsic::x86_fma_vfmsub_ps_256: + case Intrinsic::x86_fma_vfmsub_pd_256: + case Intrinsic::x86_fma_mask_vfmsub_ps_512: + case Intrinsic::x86_fma_mask_vfmsub_pd_512: + return X86ISD::FMSUB; + case Intrinsic::x86_fma_vfnmadd_ps: + case Intrinsic::x86_fma_vfnmadd_pd: + case Intrinsic::x86_fma_vfnmadd_ps_256: + case Intrinsic::x86_fma_vfnmadd_pd_256: + case Intrinsic::x86_fma_mask_vfnmadd_ps_512: + case Intrinsic::x86_fma_mask_vfnmadd_pd_512: + return X86ISD::FNMADD; + case Intrinsic::x86_fma_vfnmsub_ps: + case Intrinsic::x86_fma_vfnmsub_pd: + case Intrinsic::x86_fma_vfnmsub_ps_256: + case Intrinsic::x86_fma_vfnmsub_pd_256: + case Intrinsic::x86_fma_mask_vfnmsub_ps_512: + case Intrinsic::x86_fma_mask_vfnmsub_pd_512: + return X86ISD::FNMSUB; + case Intrinsic::x86_fma_vfmaddsub_ps: + case Intrinsic::x86_fma_vfmaddsub_pd: + case Intrinsic::x86_fma_vfmaddsub_ps_256: + case Intrinsic::x86_fma_vfmaddsub_pd_256: + case Intrinsic::x86_fma_mask_vfmaddsub_ps_512: + case Intrinsic::x86_fma_mask_vfmaddsub_pd_512: + return X86ISD::FMADDSUB; + case Intrinsic::x86_fma_vfmsubadd_ps: + case Intrinsic::x86_fma_vfmsubadd_pd: + case Intrinsic::x86_fma_vfmsubadd_ps_256: + case Intrinsic::x86_fma_vfmsubadd_pd_256: + case Intrinsic::x86_fma_mask_vfmsubadd_ps_512: + case Intrinsic::x86_fma_mask_vfmsubadd_pd_512: + return X86ISD::FMSUBADD; } +} - SDValue LHS = Op.getOperand(1); - SDValue RHS = Op.getOperand(2); - unsigned X86CC = TranslateX86CC(CC, true, LHS, RHS, DAG); - assert(X86CC != X86::COND_INVALID && "Unexpected illegal condition!"); - SDValue Cond = DAG.getNode(Opc, dl, MVT::i32, LHS, RHS); - SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, - 
DAG.getConstant(X86CC, MVT::i8), Cond); - return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); +static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDLoc dl(Op); + unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); + EVT VT = Op.getValueType(); + const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo); + if (IntrData) { + switch(IntrData->Type) { + case INTR_TYPE_1OP: + return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1)); + case INTR_TYPE_2OP: + return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1), + Op.getOperand(2)); + case INTR_TYPE_3OP: + return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1), + Op.getOperand(2), Op.getOperand(3)); + case INTR_TYPE_1OP_MASK_RM: { + SDValue Src = Op.getOperand(1); + SDValue Src0 = Op.getOperand(2); + SDValue Mask = Op.getOperand(3); + SDValue RoundingMode = Op.getOperand(4); + return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src, + RoundingMode), + Mask, Src0, Subtarget, DAG); + } + + case CMP_MASK: + case CMP_MASK_CC: { + // Comparison intrinsics with masks. + // Example of transformation: + // (i8 (int_x86_avx512_mask_pcmpeq_q_128 + // (v2i64 %a), (v2i64 %b), (i8 %mask))) -> + // (i8 (bitcast + // (v8i1 (insert_subvector undef, + // (v2i1 (and (PCMPEQM %a, %b), + // (extract_subvector + // (v8i1 (bitcast %mask)), 0))), 0)))) + EVT VT = Op.getOperand(1).getValueType(); + EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, + VT.getVectorNumElements()); + SDValue Mask = Op.getOperand((IntrData->Type == CMP_MASK_CC) ? 4 : 3); + EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, + Mask.getValueType().getSizeInBits()); + SDValue Cmp; + if (IntrData->Type == CMP_MASK_CC) { + Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1), + Op.getOperand(2), Op.getOperand(3)); + } else { + assert(IntrData->Type == CMP_MASK && "Unexpected intrinsic type!"); + Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1), + Op.getOperand(2)); + } + SDValue CmpMask = getVectorMaskingNode(Cmp, Mask, + DAG.getTargetConstant(0, MaskVT), + Subtarget, DAG); + SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT, + DAG.getUNDEF(BitcastVT), CmpMask, + DAG.getIntPtrConstant(0)); + return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Res); + } + case COMI: { // Comparison intrinsics + ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1; + SDValue LHS = Op.getOperand(1); + SDValue RHS = Op.getOperand(2); + unsigned X86CC = TranslateX86CC(CC, true, LHS, RHS, DAG); + assert(X86CC != X86::COND_INVALID && "Unexpected illegal condition!"); + SDValue Cond = DAG.getNode(IntrData->Opc0, dl, MVT::i32, LHS, RHS); + SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, + DAG.getConstant(X86CC, MVT::i8), Cond); + return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); + } + case VSHIFT: + return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(), + Op.getOperand(1), Op.getOperand(2), DAG); + case VSHIFT_MASK: + return getVectorMaskingNode(getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(), + Op.getOperand(1), Op.getOperand(2), DAG), + Op.getOperand(4), Op.getOperand(3), Subtarget, DAG);; + default: + break; + } } + switch (IntNo) { + default: return SDValue(); // Don't custom lower most intrinsics. + // Arithmetic intrinsics. 
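// [Editor's illustration only.] getVectorMaskingNode above implements AVX-512
// merge masking for non-compare ops: the result is vselect(mask, op, src), so
// lanes whose mask bit is 0 keep the pass-through value. A scalar model
// (template and names are ours):
#include <array>
#include <cstddef>
#include <cstdint>

template <typename T, std::size_t N>
static std::array<T, N> mergeMask(uint32_t Mask, const std::array<T, N> &Op,
                                  const std::array<T, N> &PreservedSrc) {
  std::array<T, N> Result{};
  for (std::size_t I = 0; I < N; ++I)
    Result[I] = ((Mask >> I) & 1) ? Op[I] : PreservedSrc[I];
  return Result;
}
// Zero masking (the undef PreservedSrc case above, which is replaced by
// getZeroVector) is the same operation with PreservedSrc filled with zeros.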
case Intrinsic::x86_sse2_pmulu_dq: case Intrinsic::x86_avx2_pmulu_dq: @@ -13602,128 +16783,6 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) { return DAG.getNode(ISD::MULHS, dl, Op.getValueType(), Op.getOperand(1), Op.getOperand(2)); - // SSE2/AVX2 sub with unsigned saturation intrinsics - case Intrinsic::x86_sse2_psubus_b: - case Intrinsic::x86_sse2_psubus_w: - case Intrinsic::x86_avx2_psubus_b: - case Intrinsic::x86_avx2_psubus_w: - return DAG.getNode(X86ISD::SUBUS, dl, Op.getValueType(), - Op.getOperand(1), Op.getOperand(2)); - - // SSE3/AVX horizontal add/sub intrinsics - case Intrinsic::x86_sse3_hadd_ps: - case Intrinsic::x86_sse3_hadd_pd: - case Intrinsic::x86_avx_hadd_ps_256: - case Intrinsic::x86_avx_hadd_pd_256: - case Intrinsic::x86_sse3_hsub_ps: - case Intrinsic::x86_sse3_hsub_pd: - case Intrinsic::x86_avx_hsub_ps_256: - case Intrinsic::x86_avx_hsub_pd_256: - case Intrinsic::x86_ssse3_phadd_w_128: - case Intrinsic::x86_ssse3_phadd_d_128: - case Intrinsic::x86_avx2_phadd_w: - case Intrinsic::x86_avx2_phadd_d: - case Intrinsic::x86_ssse3_phsub_w_128: - case Intrinsic::x86_ssse3_phsub_d_128: - case Intrinsic::x86_avx2_phsub_w: - case Intrinsic::x86_avx2_phsub_d: { - unsigned Opcode; - switch (IntNo) { - default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. - case Intrinsic::x86_sse3_hadd_ps: - case Intrinsic::x86_sse3_hadd_pd: - case Intrinsic::x86_avx_hadd_ps_256: - case Intrinsic::x86_avx_hadd_pd_256: - Opcode = X86ISD::FHADD; - break; - case Intrinsic::x86_sse3_hsub_ps: - case Intrinsic::x86_sse3_hsub_pd: - case Intrinsic::x86_avx_hsub_ps_256: - case Intrinsic::x86_avx_hsub_pd_256: - Opcode = X86ISD::FHSUB; - break; - case Intrinsic::x86_ssse3_phadd_w_128: - case Intrinsic::x86_ssse3_phadd_d_128: - case Intrinsic::x86_avx2_phadd_w: - case Intrinsic::x86_avx2_phadd_d: - Opcode = X86ISD::HADD; - break; - case Intrinsic::x86_ssse3_phsub_w_128: - case Intrinsic::x86_ssse3_phsub_d_128: - case Intrinsic::x86_avx2_phsub_w: - case Intrinsic::x86_avx2_phsub_d: - Opcode = X86ISD::HSUB; - break; - } - return DAG.getNode(Opcode, dl, Op.getValueType(), - Op.getOperand(1), Op.getOperand(2)); - } - - // SSE2/SSE41/AVX2 integer max/min intrinsics. - case Intrinsic::x86_sse2_pmaxu_b: - case Intrinsic::x86_sse41_pmaxuw: - case Intrinsic::x86_sse41_pmaxud: - case Intrinsic::x86_avx2_pmaxu_b: - case Intrinsic::x86_avx2_pmaxu_w: - case Intrinsic::x86_avx2_pmaxu_d: - case Intrinsic::x86_sse2_pminu_b: - case Intrinsic::x86_sse41_pminuw: - case Intrinsic::x86_sse41_pminud: - case Intrinsic::x86_avx2_pminu_b: - case Intrinsic::x86_avx2_pminu_w: - case Intrinsic::x86_avx2_pminu_d: - case Intrinsic::x86_sse41_pmaxsb: - case Intrinsic::x86_sse2_pmaxs_w: - case Intrinsic::x86_sse41_pmaxsd: - case Intrinsic::x86_avx2_pmaxs_b: - case Intrinsic::x86_avx2_pmaxs_w: - case Intrinsic::x86_avx2_pmaxs_d: - case Intrinsic::x86_sse41_pminsb: - case Intrinsic::x86_sse2_pmins_w: - case Intrinsic::x86_sse41_pminsd: - case Intrinsic::x86_avx2_pmins_b: - case Intrinsic::x86_avx2_pmins_w: - case Intrinsic::x86_avx2_pmins_d: { - unsigned Opcode; - switch (IntNo) { - default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. 
- case Intrinsic::x86_sse2_pmaxu_b: - case Intrinsic::x86_sse41_pmaxuw: - case Intrinsic::x86_sse41_pmaxud: - case Intrinsic::x86_avx2_pmaxu_b: - case Intrinsic::x86_avx2_pmaxu_w: - case Intrinsic::x86_avx2_pmaxu_d: - Opcode = X86ISD::UMAX; - break; - case Intrinsic::x86_sse2_pminu_b: - case Intrinsic::x86_sse41_pminuw: - case Intrinsic::x86_sse41_pminud: - case Intrinsic::x86_avx2_pminu_b: - case Intrinsic::x86_avx2_pminu_w: - case Intrinsic::x86_avx2_pminu_d: - Opcode = X86ISD::UMIN; - break; - case Intrinsic::x86_sse41_pmaxsb: - case Intrinsic::x86_sse2_pmaxs_w: - case Intrinsic::x86_sse41_pmaxsd: - case Intrinsic::x86_avx2_pmaxs_b: - case Intrinsic::x86_avx2_pmaxs_w: - case Intrinsic::x86_avx2_pmaxs_d: - Opcode = X86ISD::SMAX; - break; - case Intrinsic::x86_sse41_pminsb: - case Intrinsic::x86_sse2_pmins_w: - case Intrinsic::x86_sse41_pminsd: - case Intrinsic::x86_avx2_pmins_b: - case Intrinsic::x86_avx2_pmins_w: - case Intrinsic::x86_avx2_pmins_d: - Opcode = X86ISD::SMIN; - break; - } - return DAG.getNode(Opcode, dl, Op.getValueType(), - Op.getOperand(1), Op.getOperand(2)); - } - // SSE/SSE2/AVX floating point max/min intrinsics. case Intrinsic::x86_sse_max_ps: case Intrinsic::x86_sse2_max_pd: @@ -13828,17 +16887,6 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) { return DAG.getNode(X86ISD::PSIGN, dl, Op.getValueType(), Op.getOperand(1), Op.getOperand(2)); - case Intrinsic::x86_sse41_insertps: - return DAG.getNode(X86ISD::INSERTPS, dl, Op.getValueType(), - Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); - - case Intrinsic::x86_avx_vperm2f128_ps_256: - case Intrinsic::x86_avx_vperm2f128_pd_256: - case Intrinsic::x86_avx_vperm2f128_si_256: - case Intrinsic::x86_avx2_vperm2i128: - return DAG.getNode(X86ISD::VPERM2X128, dl, Op.getValueType(), - Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); - case Intrinsic::x86_avx2_permd: case Intrinsic::x86_avx2_permps: // Operands intentionally swapped. Mask is last operand to intrinsic, @@ -13846,11 +16894,15 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) { return DAG.getNode(X86ISD::VPERMV, dl, Op.getValueType(), Op.getOperand(2), Op.getOperand(1)); - case Intrinsic::x86_sse_sqrt_ps: - case Intrinsic::x86_sse2_sqrt_pd: - case Intrinsic::x86_avx_sqrt_ps_256: - case Intrinsic::x86_avx_sqrt_pd_256: - return DAG.getNode(ISD::FSQRT, dl, Op.getValueType(), Op.getOperand(1)); + case Intrinsic::x86_avx512_mask_valign_q_512: + case Intrinsic::x86_avx512_mask_valign_d_512: + // Vector source operands are swapped. + return getVectorMaskingNode(DAG.getNode(X86ISD::VALIGN, dl, + Op.getValueType(), Op.getOperand(2), + Op.getOperand(1), + Op.getOperand(3)), + Op.getOperand(5), Op.getOperand(4), + Subtarget, DAG); // ptest and testp intrinsics. 
The intrinsic these come from are designed to // return an integer value, not just an instruction so lower it to the ptest @@ -13928,100 +16980,6 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) { return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); } - // SSE/AVX shift intrinsics - case Intrinsic::x86_sse2_psll_w: - case Intrinsic::x86_sse2_psll_d: - case Intrinsic::x86_sse2_psll_q: - case Intrinsic::x86_avx2_psll_w: - case Intrinsic::x86_avx2_psll_d: - case Intrinsic::x86_avx2_psll_q: - case Intrinsic::x86_sse2_psrl_w: - case Intrinsic::x86_sse2_psrl_d: - case Intrinsic::x86_sse2_psrl_q: - case Intrinsic::x86_avx2_psrl_w: - case Intrinsic::x86_avx2_psrl_d: - case Intrinsic::x86_avx2_psrl_q: - case Intrinsic::x86_sse2_psra_w: - case Intrinsic::x86_sse2_psra_d: - case Intrinsic::x86_avx2_psra_w: - case Intrinsic::x86_avx2_psra_d: { - unsigned Opcode; - switch (IntNo) { - default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. - case Intrinsic::x86_sse2_psll_w: - case Intrinsic::x86_sse2_psll_d: - case Intrinsic::x86_sse2_psll_q: - case Intrinsic::x86_avx2_psll_w: - case Intrinsic::x86_avx2_psll_d: - case Intrinsic::x86_avx2_psll_q: - Opcode = X86ISD::VSHL; - break; - case Intrinsic::x86_sse2_psrl_w: - case Intrinsic::x86_sse2_psrl_d: - case Intrinsic::x86_sse2_psrl_q: - case Intrinsic::x86_avx2_psrl_w: - case Intrinsic::x86_avx2_psrl_d: - case Intrinsic::x86_avx2_psrl_q: - Opcode = X86ISD::VSRL; - break; - case Intrinsic::x86_sse2_psra_w: - case Intrinsic::x86_sse2_psra_d: - case Intrinsic::x86_avx2_psra_w: - case Intrinsic::x86_avx2_psra_d: - Opcode = X86ISD::VSRA; - break; - } - return DAG.getNode(Opcode, dl, Op.getValueType(), - Op.getOperand(1), Op.getOperand(2)); - } - - // SSE/AVX immediate shift intrinsics - case Intrinsic::x86_sse2_pslli_w: - case Intrinsic::x86_sse2_pslli_d: - case Intrinsic::x86_sse2_pslli_q: - case Intrinsic::x86_avx2_pslli_w: - case Intrinsic::x86_avx2_pslli_d: - case Intrinsic::x86_avx2_pslli_q: - case Intrinsic::x86_sse2_psrli_w: - case Intrinsic::x86_sse2_psrli_d: - case Intrinsic::x86_sse2_psrli_q: - case Intrinsic::x86_avx2_psrli_w: - case Intrinsic::x86_avx2_psrli_d: - case Intrinsic::x86_avx2_psrli_q: - case Intrinsic::x86_sse2_psrai_w: - case Intrinsic::x86_sse2_psrai_d: - case Intrinsic::x86_avx2_psrai_w: - case Intrinsic::x86_avx2_psrai_d: { - unsigned Opcode; - switch (IntNo) { - default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. 
- case Intrinsic::x86_sse2_pslli_w: - case Intrinsic::x86_sse2_pslli_d: - case Intrinsic::x86_sse2_pslli_q: - case Intrinsic::x86_avx2_pslli_w: - case Intrinsic::x86_avx2_pslli_d: - case Intrinsic::x86_avx2_pslli_q: - Opcode = X86ISD::VSHLI; - break; - case Intrinsic::x86_sse2_psrli_w: - case Intrinsic::x86_sse2_psrli_d: - case Intrinsic::x86_sse2_psrli_q: - case Intrinsic::x86_avx2_psrli_w: - case Intrinsic::x86_avx2_psrli_d: - case Intrinsic::x86_avx2_psrli_q: - Opcode = X86ISD::VSRLI; - break; - case Intrinsic::x86_sse2_psrai_w: - case Intrinsic::x86_sse2_psrai_d: - case Intrinsic::x86_avx2_psrai_w: - case Intrinsic::x86_avx2_psrai_d: - Opcode = X86ISD::VSRAI; - break; - } - return getTargetVShiftNode(Opcode, dl, Op.getSimpleValueType(), - Op.getOperand(1), Op.getOperand(2), DAG); - } - case Intrinsic::x86_sse42_pcmpistria128: case Intrinsic::x86_sse42_pcmpestria128: case Intrinsic::x86_sse42_pcmpistric128: @@ -14098,6 +17056,32 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) { SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); return DAG.getNode(Opcode, dl, VTs, NewOps); } + + case Intrinsic::x86_fma_mask_vfmadd_ps_512: + case Intrinsic::x86_fma_mask_vfmadd_pd_512: + case Intrinsic::x86_fma_mask_vfmsub_ps_512: + case Intrinsic::x86_fma_mask_vfmsub_pd_512: + case Intrinsic::x86_fma_mask_vfnmadd_ps_512: + case Intrinsic::x86_fma_mask_vfnmadd_pd_512: + case Intrinsic::x86_fma_mask_vfnmsub_ps_512: + case Intrinsic::x86_fma_mask_vfnmsub_pd_512: + case Intrinsic::x86_fma_mask_vfmaddsub_ps_512: + case Intrinsic::x86_fma_mask_vfmaddsub_pd_512: + case Intrinsic::x86_fma_mask_vfmsubadd_ps_512: + case Intrinsic::x86_fma_mask_vfmsubadd_pd_512: { + auto *SAE = cast<ConstantSDNode>(Op.getOperand(5)); + if (SAE->getZExtValue() == X86::STATIC_ROUNDING::CUR_DIRECTION) + return getVectorMaskingNode(DAG.getNode(getOpcodeForFMAIntrinsic(IntNo), + dl, Op.getValueType(), + Op.getOperand(1), + Op.getOperand(2), + Op.getOperand(3)), + Op.getOperand(4), Op.getOperand(1), + Subtarget, DAG); + else + return SDValue(); + } + case Intrinsic::x86_fma_vfmadd_ps: case Intrinsic::x86_fma_vfmadd_pd: case Intrinsic::x86_fma_vfmsub_ps: @@ -14122,74 +17106,8 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) { case Intrinsic::x86_fma_vfmaddsub_pd_256: case Intrinsic::x86_fma_vfmsubadd_ps_256: case Intrinsic::x86_fma_vfmsubadd_pd_256: - case Intrinsic::x86_fma_vfmadd_ps_512: - case Intrinsic::x86_fma_vfmadd_pd_512: - case Intrinsic::x86_fma_vfmsub_ps_512: - case Intrinsic::x86_fma_vfmsub_pd_512: - case Intrinsic::x86_fma_vfnmadd_ps_512: - case Intrinsic::x86_fma_vfnmadd_pd_512: - case Intrinsic::x86_fma_vfnmsub_ps_512: - case Intrinsic::x86_fma_vfnmsub_pd_512: - case Intrinsic::x86_fma_vfmaddsub_ps_512: - case Intrinsic::x86_fma_vfmaddsub_pd_512: - case Intrinsic::x86_fma_vfmsubadd_ps_512: - case Intrinsic::x86_fma_vfmsubadd_pd_512: { - unsigned Opc; - switch (IntNo) { - default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. 
- case Intrinsic::x86_fma_vfmadd_ps: - case Intrinsic::x86_fma_vfmadd_pd: - case Intrinsic::x86_fma_vfmadd_ps_256: - case Intrinsic::x86_fma_vfmadd_pd_256: - case Intrinsic::x86_fma_vfmadd_ps_512: - case Intrinsic::x86_fma_vfmadd_pd_512: - Opc = X86ISD::FMADD; - break; - case Intrinsic::x86_fma_vfmsub_ps: - case Intrinsic::x86_fma_vfmsub_pd: - case Intrinsic::x86_fma_vfmsub_ps_256: - case Intrinsic::x86_fma_vfmsub_pd_256: - case Intrinsic::x86_fma_vfmsub_ps_512: - case Intrinsic::x86_fma_vfmsub_pd_512: - Opc = X86ISD::FMSUB; - break; - case Intrinsic::x86_fma_vfnmadd_ps: - case Intrinsic::x86_fma_vfnmadd_pd: - case Intrinsic::x86_fma_vfnmadd_ps_256: - case Intrinsic::x86_fma_vfnmadd_pd_256: - case Intrinsic::x86_fma_vfnmadd_ps_512: - case Intrinsic::x86_fma_vfnmadd_pd_512: - Opc = X86ISD::FNMADD; - break; - case Intrinsic::x86_fma_vfnmsub_ps: - case Intrinsic::x86_fma_vfnmsub_pd: - case Intrinsic::x86_fma_vfnmsub_ps_256: - case Intrinsic::x86_fma_vfnmsub_pd_256: - case Intrinsic::x86_fma_vfnmsub_ps_512: - case Intrinsic::x86_fma_vfnmsub_pd_512: - Opc = X86ISD::FNMSUB; - break; - case Intrinsic::x86_fma_vfmaddsub_ps: - case Intrinsic::x86_fma_vfmaddsub_pd: - case Intrinsic::x86_fma_vfmaddsub_ps_256: - case Intrinsic::x86_fma_vfmaddsub_pd_256: - case Intrinsic::x86_fma_vfmaddsub_ps_512: - case Intrinsic::x86_fma_vfmaddsub_pd_512: - Opc = X86ISD::FMADDSUB; - break; - case Intrinsic::x86_fma_vfmsubadd_ps: - case Intrinsic::x86_fma_vfmsubadd_pd: - case Intrinsic::x86_fma_vfmsubadd_ps_256: - case Intrinsic::x86_fma_vfmsubadd_pd_256: - case Intrinsic::x86_fma_vfmsubadd_ps_512: - case Intrinsic::x86_fma_vfmsubadd_pd_512: - Opc = X86ISD::FMSUBADD; - break; - } - - return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1), - Op.getOperand(2), Op.getOperand(3)); - } + return DAG.getNode(getOpcodeForFMAIntrinsic(IntNo), dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); } } @@ -14374,122 +17292,25 @@ static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget *Subtarget, return DAG.getMergeValues(Results, DL); } -enum IntrinsicType { - GATHER, SCATTER, PREFETCH, RDSEED, RDRAND, RDPMC, RDTSC, XTEST -}; - -struct IntrinsicData { - IntrinsicData(IntrinsicType IType, unsigned IOpc0, unsigned IOpc1) - :Type(IType), Opc0(IOpc0), Opc1(IOpc1) {} - IntrinsicType Type; - unsigned Opc0; - unsigned Opc1; -}; - -std::map < unsigned, IntrinsicData> IntrMap; -static void InitIntinsicsMap() { - static bool Initialized = false; - if (Initialized) - return; - IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gather_qps_512, - IntrinsicData(GATHER, X86::VGATHERQPSZrm, 0))); - IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gather_qps_512, - IntrinsicData(GATHER, X86::VGATHERQPSZrm, 0))); - IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gather_qpd_512, - IntrinsicData(GATHER, X86::VGATHERQPDZrm, 0))); - IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gather_dpd_512, - IntrinsicData(GATHER, X86::VGATHERDPDZrm, 0))); - IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gather_dps_512, - IntrinsicData(GATHER, X86::VGATHERDPSZrm, 0))); - IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gather_qpi_512, - IntrinsicData(GATHER, X86::VPGATHERQDZrm, 0))); - IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gather_qpq_512, - IntrinsicData(GATHER, X86::VPGATHERQQZrm, 0))); - IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gather_dpi_512, - IntrinsicData(GATHER, X86::VPGATHERDDZrm, 0))); - IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gather_dpq_512, - 
IntrinsicData(GATHER, X86::VPGATHERDQZrm, 0))); - - IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatter_qps_512, - IntrinsicData(SCATTER, X86::VSCATTERQPSZmr, 0))); - IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatter_qpd_512, - IntrinsicData(SCATTER, X86::VSCATTERQPDZmr, 0))); - IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatter_dpd_512, - IntrinsicData(SCATTER, X86::VSCATTERDPDZmr, 0))); - IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatter_dps_512, - IntrinsicData(SCATTER, X86::VSCATTERDPSZmr, 0))); - IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatter_qpi_512, - IntrinsicData(SCATTER, X86::VPSCATTERQDZmr, 0))); - IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatter_qpq_512, - IntrinsicData(SCATTER, X86::VPSCATTERQQZmr, 0))); - IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatter_dpi_512, - IntrinsicData(SCATTER, X86::VPSCATTERDDZmr, 0))); - IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatter_dpq_512, - IntrinsicData(SCATTER, X86::VPSCATTERDQZmr, 0))); - - IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gatherpf_qps_512, - IntrinsicData(PREFETCH, X86::VGATHERPF0QPSm, - X86::VGATHERPF1QPSm))); - IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gatherpf_qpd_512, - IntrinsicData(PREFETCH, X86::VGATHERPF0QPDm, - X86::VGATHERPF1QPDm))); - IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gatherpf_dpd_512, - IntrinsicData(PREFETCH, X86::VGATHERPF0DPDm, - X86::VGATHERPF1DPDm))); - IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gatherpf_dps_512, - IntrinsicData(PREFETCH, X86::VGATHERPF0DPSm, - X86::VGATHERPF1DPSm))); - IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatterpf_qps_512, - IntrinsicData(PREFETCH, X86::VSCATTERPF0QPSm, - X86::VSCATTERPF1QPSm))); - IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatterpf_qpd_512, - IntrinsicData(PREFETCH, X86::VSCATTERPF0QPDm, - X86::VSCATTERPF1QPDm))); - IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatterpf_dpd_512, - IntrinsicData(PREFETCH, X86::VSCATTERPF0DPDm, - X86::VSCATTERPF1DPDm))); - IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatterpf_dps_512, - IntrinsicData(PREFETCH, X86::VSCATTERPF0DPSm, - X86::VSCATTERPF1DPSm))); - IntrMap.insert(std::make_pair(Intrinsic::x86_rdrand_16, - IntrinsicData(RDRAND, X86ISD::RDRAND, 0))); - IntrMap.insert(std::make_pair(Intrinsic::x86_rdrand_32, - IntrinsicData(RDRAND, X86ISD::RDRAND, 0))); - IntrMap.insert(std::make_pair(Intrinsic::x86_rdrand_64, - IntrinsicData(RDRAND, X86ISD::RDRAND, 0))); - IntrMap.insert(std::make_pair(Intrinsic::x86_rdseed_16, - IntrinsicData(RDSEED, X86ISD::RDSEED, 0))); - IntrMap.insert(std::make_pair(Intrinsic::x86_rdseed_32, - IntrinsicData(RDSEED, X86ISD::RDSEED, 0))); - IntrMap.insert(std::make_pair(Intrinsic::x86_rdseed_64, - IntrinsicData(RDSEED, X86ISD::RDSEED, 0))); - IntrMap.insert(std::make_pair(Intrinsic::x86_xtest, - IntrinsicData(XTEST, X86ISD::XTEST, 0))); - IntrMap.insert(std::make_pair(Intrinsic::x86_rdtsc, - IntrinsicData(RDTSC, X86ISD::RDTSC_DAG, 0))); - IntrMap.insert(std::make_pair(Intrinsic::x86_rdtscp, - IntrinsicData(RDTSC, X86ISD::RDTSCP_DAG, 0))); - IntrMap.insert(std::make_pair(Intrinsic::x86_rdpmc, - IntrinsicData(RDPMC, X86ISD::RDPMC_DAG, 0))); - Initialized = true; -} static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG) { - InitIntinsicsMap(); unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); - std::map < unsigned, IntrinsicData>::const_iterator itr = IntrMap.find(IntNo); - if (itr == 
IntrMap.end()) + + const IntrinsicData* IntrData = getIntrinsicWithChain(IntNo); + if (!IntrData) return SDValue(); SDLoc dl(Op); - IntrinsicData Intr = itr->second; - switch(Intr.Type) { + switch(IntrData->Type) { + default: + llvm_unreachable("Unknown Intrinsic Type"); + break; case RDSEED: case RDRAND: { // Emit the node with the right value type. SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Glue, MVT::Other); - SDValue Result = DAG.getNode(Intr.Opc0, dl, VTs, Op.getOperand(0)); + SDValue Result = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0)); // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1. // Otherwise return the value from Rand, which is always 0, casted to i32. @@ -14513,7 +17334,7 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget, SDValue Index = Op.getOperand(4); SDValue Mask = Op.getOperand(5); SDValue Scale = Op.getOperand(6); - return getGatherNode(Intr.Opc0, Op, DAG, Src, Mask, Base, Index, Scale, Chain, + return getGatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index, Scale, Chain, Subtarget); } case SCATTER: { @@ -14524,7 +17345,7 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget, SDValue Index = Op.getOperand(4); SDValue Src = Op.getOperand(5); SDValue Scale = Op.getOperand(6); - return getScatterNode(Intr.Opc0, Op, DAG, Src, Mask, Base, Index, Scale, Chain); + return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index, Scale, Chain); } case PREFETCH: { SDValue Hint = Op.getOperand(6); @@ -14532,7 +17353,7 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget, if (dyn_cast<ConstantSDNode> (Hint) == nullptr || (HintVal = dyn_cast<ConstantSDNode> (Hint)->getZExtValue()) > 1) llvm_unreachable("Wrong prefetch hint in intrinsic: should be 0 or 1"); - unsigned Opcode = (HintVal ? Intr.Opc1 : Intr.Opc0); + unsigned Opcode = (HintVal ? IntrData->Opc1 : IntrData->Opc0); SDValue Chain = Op.getOperand(0); SDValue Mask = Op.getOperand(2); SDValue Index = Op.getOperand(3); @@ -14543,7 +17364,7 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget, // Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP). case RDTSC: { SmallVector<SDValue, 2> Results; - getReadTimeStampCounter(Op.getNode(), dl, Intr.Opc0, DAG, Subtarget, Results); + getReadTimeStampCounter(Op.getNode(), dl, IntrData->Opc0, DAG, Subtarget, Results); return DAG.getMergeValues(Results, dl); } // Read Performance Monitoring Counters. @@ -14555,7 +17376,7 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget, // XTEST intrinsics. 
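// [Editor's aside; usage sketch, not part of this patch.] The RDRAND/RDSEED
// lowering above exposes the instruction's carry flag as the intrinsic's success
// result, and callers are expected to retry on failure. A caller-side view using
// the compiler intrinsic (requires RDRAND hardware and, e.g., -mrdrnd):
#include <cstdint>
#include <immintrin.h>

static bool getHardwareRandom32(uint32_t &Out) {
  for (int Attempt = 0; Attempt < 10; ++Attempt) { // bounded retry
    unsigned int Value;
    if (_rdrand32_step(&Value)) { // returns 1 when CF=1, i.e. Value is valid
      Out = Value;
      return true;
    }
  }
  return false;
}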
case XTEST: { SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other); - SDValue InTrans = DAG.getNode(X86ISD::XTEST, dl, VTs, Op.getOperand(0)); + SDValue InTrans = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0)); SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, DAG.getConstant(X86::COND_NE, MVT::i8), InTrans); @@ -14563,8 +17384,26 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget, return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Ret, SDValue(InTrans.getNode(), 1)); } + // ADC/ADCX/SBB + case ADX: { + SmallVector<SDValue, 2> Results; + SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::Other); + SDVTList VTs = DAG.getVTList(Op.getOperand(3)->getValueType(0), MVT::Other); + SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(2), + DAG.getConstant(-1, MVT::i8)); + SDValue Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(3), + Op.getOperand(4), GenCF.getValue(1)); + SDValue Store = DAG.getStore(Op.getOperand(0), dl, Res.getValue(0), + Op.getOperand(5), MachinePointerInfo(), + false, false, 0); + SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, + DAG.getConstant(X86::COND_B, MVT::i8), + Res.getValue(1)); + Results.push_back(SetCC); + Results.push_back(Store); + return DAG.getMergeValues(Results, dl); + } } - llvm_unreachable("Unknown Intrinsic Type"); } SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op, @@ -14581,8 +17420,8 @@ SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op, if (Depth > 0) { SDValue FrameAddr = LowerFRAMEADDR(Op, DAG); - const X86RegisterInfo *RegInfo = - static_cast<const X86RegisterInfo*>(DAG.getTarget().getRegisterInfo()); + const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>( + DAG.getSubtarget().getRegisterInfo()); SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), PtrVT); return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getNode(ISD::ADD, dl, PtrVT, @@ -14603,8 +17442,8 @@ SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); SDLoc dl(Op); // FIXME probably not meaningful unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); - const X86RegisterInfo *RegInfo = - static_cast<const X86RegisterInfo*>(DAG.getTarget().getRegisterInfo()); + const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>( + DAG.getSubtarget().getRegisterInfo()); unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction()); assert(((FrameReg == X86::RBP && VT == MVT::i64) || (FrameReg == X86::EBP && VT == MVT::i32)) && @@ -14632,8 +17471,8 @@ unsigned X86TargetLowering::getRegisterByName(const char* RegName, SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op, SelectionDAG &DAG) const { - const X86RegisterInfo *RegInfo = - static_cast<const X86RegisterInfo*>(DAG.getTarget().getRegisterInfo()); + const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>( + DAG.getSubtarget().getRegisterInfo()); return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize()); } @@ -14644,8 +17483,8 @@ SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const { SDLoc dl (Op); EVT PtrVT = getPointerTy(); - const X86RegisterInfo *RegInfo = - static_cast<const X86RegisterInfo*>(DAG.getTarget().getRegisterInfo()); + const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>( + DAG.getSubtarget().getRegisterInfo()); unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction()); assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) || 
(FrameReg == X86::EBP && PtrVT == MVT::i32)) && @@ -14692,7 +17531,7 @@ SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op, SDLoc dl (Op); const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue(); - const TargetRegisterInfo* TRI = DAG.getTarget().getRegisterInfo(); + const TargetRegisterInfo *TRI = DAG.getSubtarget().getRegisterInfo(); if (Subtarget->is64Bit()) { SDValue OutChains[6]; @@ -14856,7 +17695,7 @@ SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op, MachineFunction &MF = DAG.getMachineFunction(); const TargetMachine &TM = MF.getTarget(); - const TargetFrameLowering &TFI = *TM.getFrameLowering(); + const TargetFrameLowering &TFI = *TM.getSubtargetImpl()->getFrameLowering(); unsigned StackAlignment = TFI.getStackAlignment(); MVT VT = Op.getSimpleValueType(); SDLoc DL(Op); @@ -15156,10 +17995,23 @@ static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget *Subtarget, assert((VT == MVT::v4i32 && Subtarget->hasSSE2()) || (VT == MVT::v8i32 && Subtarget->hasInt256())); - // Get the high parts. + // PMULxD operations multiply each even value (starting at 0) of LHS with + // the related value of RHS and produce a widen result. + // E.g., PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h> + // => <2 x i64> <ae|cg> + // + // In other word, to have all the results, we need to perform two PMULxD: + // 1. one with the even values. + // 2. one with the odd values. + // To achieve #2, with need to place the odd values at an even position. + // + // Place the odd value at an even position (basically, shift all values 1 + // step to the left): const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1}; - SDValue Hi0 = DAG.getVectorShuffle(VT, dl, Op0, Op0, Mask); - SDValue Hi1 = DAG.getVectorShuffle(VT, dl, Op1, Op1, Mask); + // <a|b|c|d> => <b|undef|d|undef> + SDValue Odd0 = DAG.getVectorShuffle(VT, dl, Op0, Op0, Mask); + // <e|f|g|h> => <f|undef|h|undef> + SDValue Odd1 = DAG.getVectorShuffle(VT, dl, Op1, Op1, Mask); // Emit two multiplies, one for the lower 2 ints and one for the higher 2 // ints. @@ -15167,10 +18019,14 @@ static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget *Subtarget, bool IsSigned = Op->getOpcode() == ISD::SMUL_LOHI; unsigned Opcode = (!IsSigned || !Subtarget->hasSSE41()) ? X86ISD::PMULUDQ : X86ISD::PMULDQ; + // PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h> + // => <2 x i64> <ae|cg> SDValue Mul1 = DAG.getNode(ISD::BITCAST, dl, VT, DAG.getNode(Opcode, dl, MulVT, Op0, Op1)); + // PMULUDQ <4 x i32> <b|undef|d|undef>, <4 x i32> <f|undef|h|undef> + // => <2 x i64> <bf|dh> SDValue Mul2 = DAG.getNode(ISD::BITCAST, dl, VT, - DAG.getNode(Opcode, dl, MulVT, Hi0, Hi1)); + DAG.getNode(Opcode, dl, MulVT, Odd0, Odd1)); // Shuffle it back into the right order. SDValue Highs, Lows; @@ -15200,7 +18056,10 @@ static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget *Subtarget, Highs = DAG.getNode(ISD::SUB, dl, VT, Highs, Fixup); } - return DAG.getNode(ISD::MERGE_VALUES, dl, Op.getValueType(), Highs, Lows); + // The first result of MUL_LOHI is actually the low value, followed by the + // high value. + SDValue Ops[] = {Lows, Highs}; + return DAG.getMergeValues(Ops, dl); } static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG, @@ -15811,10 +18670,15 @@ static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) { Cond = X86::COND_B; break; case ISD::SMULO: - BaseOp = X86ISD::SMUL; + BaseOp = N->getValueType(0) == MVT::i8 ? 
X86ISD::SMUL8 : X86ISD::SMUL; Cond = X86::COND_O; break; case ISD::UMULO: { // i64, i8 = umulo lhs, rhs --> i64, i64, i32 umul lhs,rhs + if (N->getValueType(0) == MVT::i8) { + BaseOp = X86ISD::UMUL8; + Cond = X86::COND_O; + break; + } SDVTList VTs = DAG.getVTList(N->getValueType(0), N->getValueType(0), MVT::i32); SDValue Sum = DAG.getNode(X86ISD::UMUL, DL, VTs, LHS, RHS); @@ -15840,6 +18704,11 @@ static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) { return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC); } +// Sign extension of the low part of vector elements. This may be used either +// when sign extend instructions are not available or if the vector element +// sizes already match the sign-extended size. If the vector elements are in +// their pre-extended size and sign extend instructions are available, that will +// be handled by LowerSIGN_EXTEND. SDValue X86TargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); @@ -15885,37 +18754,151 @@ SDValue X86TargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op, case MVT::v4i32: case MVT::v8i16: { SDValue Op0 = Op.getOperand(0); - SDValue Op00 = Op0.getOperand(0); - SDValue Tmp1; - // Hopefully, this VECTOR_SHUFFLE is just a VZEXT. - if (Op0.getOpcode() == ISD::BITCAST && - Op00.getOpcode() == ISD::VECTOR_SHUFFLE) { - // (sext (vzext x)) -> (vsext x) - Tmp1 = LowerVectorIntExtend(Op00, Subtarget, DAG); - if (Tmp1.getNode()) { - EVT ExtraEltVT = ExtraVT.getVectorElementType(); - // This folding is only valid when the in-reg type is a vector of i8, - // i16, or i32. - if (ExtraEltVT == MVT::i8 || ExtraEltVT == MVT::i16 || - ExtraEltVT == MVT::i32) { - SDValue Tmp1Op0 = Tmp1.getOperand(0); - assert(Tmp1Op0.getOpcode() == X86ISD::VZEXT && - "This optimization is invalid without a VZEXT."); - return DAG.getNode(X86ISD::VSEXT, dl, VT, Tmp1Op0.getOperand(0)); - } - Op0 = Tmp1; - } - } - // If the above didn't work, then just use Shift-Left + Shift-Right. - Tmp1 = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Op0, BitsDiff, - DAG); - return getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, Tmp1, BitsDiff, + // This is a sign extension of some low part of vector elements without + // changing the size of the vector elements themselves: + // Shift-Left + Shift-Right-Algebraic. + SDValue Shl = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Op0, + BitsDiff, DAG); + return getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, Shl, BitsDiff, DAG); } } } +/// Returns true if the operand type is exactly twice the native width, and +/// the corresponding cmpxchg8b or cmpxchg16b instruction is available. +/// Used to know whether to use cmpxchg8/16b when expanding atomic operations +/// (otherwise we leave them alone to become __sync_fetch_and_... calls). +bool X86TargetLowering::needsCmpXchgNb(const Type *MemType) const { + const X86Subtarget &Subtarget = + getTargetMachine().getSubtarget<X86Subtarget>(); + unsigned OpWidth = MemType->getPrimitiveSizeInBits(); + + if (OpWidth == 64) + return !Subtarget.is64Bit(); // FIXME this should be Subtarget.hasCmpxchg8b + else if (OpWidth == 128) + return Subtarget.hasCmpxchg16b(); + else + return false; +} + +bool X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const { + return needsCmpXchgNb(SI->getValueOperand()->getType()); +} + +// Note: this turns large loads into lock cmpxchg8b/16b. +// FIXME: On 32 bits x86, fild/movq might be faster than lock cmpxchg8b. 
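The rule shared by the shouldExpandAtomic*InIR hooks in this hunk is small enough to restate as a standalone predicate. A sketch under the assumption that the only inputs are the access width and two subtarget facts (the patch itself queries X86Subtarget; its FIXME also notes the 64-bit case should really be keyed on a cmpxchg8b feature bit rather than !is64Bit):

    // Does an atomic access of BitWidth bits need the double-width
    // cmpxchg8b/cmpxchg16b expansion, as opposed to a native operation or a
    // __sync_* libcall?
    static bool needsDoubleWidthCmpXchg(unsigned BitWidth, bool Is64Bit,
                                        bool HasCmpxchg16b) {
      if (BitWidth == 64)
        return !Is64Bit;        // a 64-bit target handles i64 natively;
                                // a 32-bit target falls back to cmpxchg8b
      if (BitWidth == 128)
        return HasCmpxchg16b;   // only possible when cmpxchg16b exists
      return false;             // native width or smaller: no expansion here
    }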
+bool X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const { + auto PTy = cast<PointerType>(LI->getPointerOperand()->getType()); + return needsCmpXchgNb(PTy->getElementType()); +} + +bool X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { + const X86Subtarget &Subtarget = + getTargetMachine().getSubtarget<X86Subtarget>(); + unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32; + const Type *MemType = AI->getType(); + + // If the operand is too big, we must see if cmpxchg8/16b is available + // and default to library calls otherwise. + if (MemType->getPrimitiveSizeInBits() > NativeWidth) + return needsCmpXchgNb(MemType); + + AtomicRMWInst::BinOp Op = AI->getOperation(); + switch (Op) { + default: + llvm_unreachable("Unknown atomic operation"); + case AtomicRMWInst::Xchg: + case AtomicRMWInst::Add: + case AtomicRMWInst::Sub: + // It's better to use xadd, xsub or xchg for these in all cases. + return false; + case AtomicRMWInst::Or: + case AtomicRMWInst::And: + case AtomicRMWInst::Xor: + // If the atomicrmw's result isn't actually used, we can just add a "lock" + // prefix to a normal instruction for these operations. + return !AI->use_empty(); + case AtomicRMWInst::Nand: + case AtomicRMWInst::Max: + case AtomicRMWInst::Min: + case AtomicRMWInst::UMax: + case AtomicRMWInst::UMin: + // These always require a non-trivial set of data operations on x86. We must + // use a cmpxchg loop. + return true; + } +} + +static bool hasMFENCE(const X86Subtarget& Subtarget) { + // Use mfence if we have SSE2 or we're on x86-64 (even if we asked for + // no-sse2). There isn't any reason to disable it if the target processor + // supports it. + return Subtarget.hasSSE2() || Subtarget.is64Bit(); +} + +LoadInst * +X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const { + const X86Subtarget &Subtarget = + getTargetMachine().getSubtarget<X86Subtarget>(); + unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32; + const Type *MemType = AI->getType(); + // Accesses larger than the native width are turned into cmpxchg/libcalls, so + // there is no benefit in turning such RMWs into loads, and it is actually + // harmful as it introduces a mfence. + if (MemType->getPrimitiveSizeInBits() > NativeWidth) + return nullptr; + + auto Builder = IRBuilder<>(AI); + Module *M = Builder.GetInsertBlock()->getParent()->getParent(); + auto SynchScope = AI->getSynchScope(); + // We must restrict the ordering to avoid generating loads with Release or + // ReleaseAcquire orderings. + auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering()); + auto Ptr = AI->getPointerOperand(); + + // Before the load we need a fence. Here is an example lifted from + // http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence + // is required: + // Thread 0: + // x.store(1, relaxed); + // r1 = y.fetch_add(0, release); + // Thread 1: + // y.fetch_add(42, acquire); + // r2 = x.load(relaxed); + // r1 = r2 = 0 is impossible, but becomes possible if the idempotent rmw is + // lowered to just a load without a fence. A mfence flushes the store buffer, + // making the optimization clearly correct. + // FIXME: it is required if isAtLeastRelease(Order) but it is not clear + // otherwise, we might be able to be more agressive on relaxed idempotent + // rmw. In practice, they do not look useful, so we don't try to be + // especially clever. 
+ if (SynchScope == SingleThread) { + // FIXME: we could just insert an X86ISD::MEMBARRIER here, except we are at + // the IR level, so we must wrap it in an intrinsic. + return nullptr; + } else if (hasMFENCE(Subtarget)) { + Function *MFence = llvm::Intrinsic::getDeclaration(M, + Intrinsic::x86_sse2_mfence); + Builder.CreateCall(MFence); + } else { + // FIXME: it might make sense to use a locked operation here but on a + // different cache-line to prevent cache-line bouncing. In practice it + // is probably a small win, and x86 processors without mfence are rare + // enough that we do not bother. + return nullptr; + } + + // Finally we can emit the atomic load. + LoadInst *Loaded = Builder.CreateAlignedLoad(Ptr, + AI->getType()->getPrimitiveSizeInBits()); + Loaded->setAtomic(Order, SynchScope); + AI->replaceAllUsesWith(Loaded); + AI->eraseFromParent(); + return Loaded; +} + static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG) { SDLoc dl(Op); @@ -15927,10 +18910,7 @@ static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget *Subtarget, // The only fence that needs an instruction is a sequentially-consistent // cross-thread fence. if (FenceOrdering == SequentiallyConsistent && FenceScope == CrossThread) { - // Use mfence if we have SSE2 or we're on x86-64 (even if we asked for - // no-sse2). There isn't any reason to disable it if the target processor - // supports it. - if (Subtarget->hasSSE2() || Subtarget->is64Bit()) + if (hasMFENCE(*Subtarget)) return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0)); SDValue Chain = Op.getOperand(0); @@ -16141,7 +19121,7 @@ static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget *Subtarget, SDValue Callee = DAG.getExternalSymbol(LibcallName, TLI.getPointerTy()); Type *RetTy = isF64 - ? (Type*)StructType::get(ArgTy, ArgTy, NULL) + ? 
(Type*)StructType::get(ArgTy, ArgTy, nullptr) : (Type*)VectorType::get(ArgTy, 4); TargetLowering::CallLoweringInfo CLI(DAG); @@ -16200,8 +19180,9 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG); case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG); case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG); - case ISD::FABS: return LowerFABS(Op, DAG); - case ISD::FNEG: return LowerFNEG(Op, DAG); + case ISD::LOAD: return LowerExtendedLoad(Op, Subtarget, DAG); + case ISD::FABS: + case ISD::FNEG: return LowerFABSorFNEG(Op, DAG); case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG); case ISD::FGETSIGN: return LowerFGETSIGN(Op, DAG); case ISD::SETCC: return LowerSETCC(Op, DAG); @@ -16211,7 +19192,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::VASTART: return LowerVASTART(Op, DAG); case ISD::VAARG: return LowerVAARG(Op, DAG); case ISD::VACOPY: return LowerVACOPY(Op, Subtarget, DAG); - case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG); + case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, Subtarget, DAG); case ISD::INTRINSIC_VOID: case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG); case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG); @@ -16252,29 +19233,6 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { } } -static void ReplaceATOMIC_LOAD(SDNode *Node, - SmallVectorImpl<SDValue> &Results, - SelectionDAG &DAG) { - SDLoc dl(Node); - EVT VT = cast<AtomicSDNode>(Node)->getMemoryVT(); - - // Convert wide load -> cmpxchg8b/cmpxchg16b - // FIXME: On 32-bit, load -> fild or movq would be more efficient - // (The only way to get a 16-byte load is cmpxchg16b) - // FIXME: 16-byte ATOMIC_CMP_SWAP isn't actually hooked up at the moment. - SDValue Zero = DAG.getConstant(0, VT); - SDVTList VTs = DAG.getVTList(VT, MVT::i1, MVT::Other); - SDValue Swap = - DAG.getAtomicCmpSwap(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, dl, VT, VTs, - Node->getOperand(0), Node->getOperand(1), Zero, Zero, - cast<AtomicSDNode>(Node)->getMemOperand(), - cast<AtomicSDNode>(Node)->getOrdering(), - cast<AtomicSDNode>(Node)->getOrdering(), - cast<AtomicSDNode>(Node)->getSynchScope()); - Results.push_back(Swap.getValue(0)); - Results.push_back(Swap.getValue(2)); -} - /// ReplaceNodeResults - Replace a node with an illegal result type /// with a new node built out of custom code. void X86TargetLowering::ReplaceNodeResults(SDNode *N, @@ -16433,12 +19391,10 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, case ISD::ATOMIC_LOAD_MAX: case ISD::ATOMIC_LOAD_UMIN: case ISD::ATOMIC_LOAD_UMAX: + case ISD::ATOMIC_LOAD: { // Delegate to generic TypeLegalization. Situations we can really handle - // should have already been dealt with by X86AtomicExpand.cpp. + // should have already been dealt with by AtomicExpandPass.cpp. 
break; - case ISD::ATOMIC_LOAD: { - ReplaceATOMIC_LOAD(N, Results, DAG); - return; } case ISD::BITCAST: { assert(Subtarget->hasSSE2() && "Requires at least SSE2!"); @@ -16521,8 +19477,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::PSHUFB: return "X86ISD::PSHUFB"; case X86ISD::ANDNP: return "X86ISD::ANDNP"; case X86ISD::PSIGN: return "X86ISD::PSIGN"; - case X86ISD::BLENDV: return "X86ISD::BLENDV"; case X86ISD::BLENDI: return "X86ISD::BLENDI"; + case X86ISD::SHRUNKBLEND: return "X86ISD::SHRUNKBLEND"; case X86ISD::SUBUS: return "X86ISD::SUBUS"; case X86ISD::HADD: return "X86ISD::HADD"; case X86ISD::HSUB: return "X86ISD::HSUB"; @@ -16578,6 +19534,10 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::SBB: return "X86ISD::SBB"; case X86ISD::SMUL: return "X86ISD::SMUL"; case X86ISD::UMUL: return "X86ISD::UMUL"; + case X86ISD::SMUL8: return "X86ISD::SMUL8"; + case X86ISD::UMUL8: return "X86ISD::UMUL8"; + case X86ISD::SDIVREM8_SEXT_HREG: return "X86ISD::SDIVREM8_SEXT_HREG"; + case X86ISD::UDIVREM8_ZEXT_HREG: return "X86ISD::UDIVREM8_ZEXT_HREG"; case X86ISD::INC: return "X86ISD::INC"; case X86ISD::DEC: return "X86ISD::DEC"; case X86ISD::OR: return "X86ISD::OR"; @@ -16593,6 +19553,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::PACKSS: return "X86ISD::PACKSS"; case X86ISD::PACKUS: return "X86ISD::PACKUS"; case X86ISD::PALIGNR: return "X86ISD::PALIGNR"; + case X86ISD::VALIGN: return "X86ISD::VALIGN"; case X86ISD::PSHUFD: return "X86ISD::PSHUFD"; case X86ISD::PSHUFHW: return "X86ISD::PSHUFHW"; case X86ISD::PSHUFLW: return "X86ISD::PSHUFLW"; @@ -16612,7 +19573,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::VBROADCAST: return "X86ISD::VBROADCAST"; case X86ISD::VBROADCASTM: return "X86ISD::VBROADCASTM"; case X86ISD::VEXTRACT: return "X86ISD::VEXTRACT"; - case X86ISD::VPERMILP: return "X86ISD::VPERMILP"; + case X86ISD::VPERMILPI: return "X86ISD::VPERMILPI"; case X86ISD::VPERM2X128: return "X86ISD::VPERM2X128"; case X86ISD::VPERMV: return "X86ISD::VPERMV"; case X86ISD::VPERMV3: return "X86ISD::VPERMV3"; @@ -16848,8 +19809,11 @@ X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M, return (SVT.getVectorNumElements() == 2 || ShuffleVectorSDNode::isSplatMask(&M[0], VT) || isMOVLMask(M, SVT) || + isMOVHLPSMask(M, SVT) || isSHUFPMask(M, SVT) || + isSHUFPMask(M, SVT, /* Commuted */ true) || isPSHUFDMask(M, SVT) || + isPSHUFDMask(M, SVT, /* SecondOperand */ true) || isPSHUFHWMask(M, SVT, Subtarget->hasInt256()) || isPSHUFLWMask(M, SVT, Subtarget->hasInt256()) || isPALIGNRMask(M, SVT, Subtarget) || @@ -16857,7 +19821,8 @@ X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M, isUNPCKHMask(M, SVT, Subtarget->hasInt256()) || isUNPCKL_v_undef_Mask(M, SVT, Subtarget->hasInt256()) || isUNPCKH_v_undef_Mask(M, SVT, Subtarget->hasInt256()) || - isBlendMask(M, SVT, Subtarget->hasSSE41(), Subtarget->hasInt256())); + isBlendMask(M, SVT, Subtarget->hasSSE41(), Subtarget->hasInt256()) || + (Subtarget->hasSSE41() && isINSERTPSMask(M, SVT))); } bool @@ -16875,7 +19840,9 @@ X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask, return (isMOVLMask(Mask, SVT) || isCommutedMOVLMask(Mask, SVT, true) || isSHUFPMask(Mask, SVT) || - isSHUFPMask(Mask, SVT, /* Commuted */ true)); + isSHUFPMask(Mask, SVT, /* Commuted */ true) || + isBlendMask(Mask, SVT, Subtarget->hasSSE41(), + Subtarget->hasInt256())); } return false; } 
@@ -17073,7 +20040,7 @@ X86TargetLowering::EmitVAARG64WithCustomInserter( MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end(); // Machine Information - const TargetInstrInfo *TII = MBB->getParent()->getTarget().getInstrInfo(); + const TargetInstrInfo *TII = MBB->getParent()->getSubtarget().getInstrInfo(); MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); const TargetRegisterClass *AddrRegClass = getRegClassFor(MVT::i64); const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32); @@ -17329,7 +20296,7 @@ X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter( XMMSaveMBB->addSuccessor(EndMBB); // Now add the instructions. - const TargetInstrInfo *TII = MBB->getParent()->getTarget().getInstrInfo(); + const TargetInstrInfo *TII = MBB->getParent()->getSubtarget().getInstrInfo(); DebugLoc DL = MI->getDebugLoc(); unsigned CountReg = MI->getOperand(0).getReg(); @@ -17412,7 +20379,7 @@ static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr, MachineBasicBlock * X86TargetLowering::EmitLoweredSelect(MachineInstr *MI, MachineBasicBlock *BB) const { - const TargetInstrInfo *TII = BB->getParent()->getTarget().getInstrInfo(); + const TargetInstrInfo *TII = BB->getParent()->getSubtarget().getInstrInfo(); DebugLoc DL = MI->getDebugLoc(); // To "insert" a SELECT_CC instruction, we actually have to insert the @@ -17438,7 +20405,8 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI, // If the EFLAGS register isn't dead in the terminator, then claim that it's // live into the sink and copy blocks. - const TargetRegisterInfo* TRI = BB->getParent()->getTarget().getRegisterInfo(); + const TargetRegisterInfo *TRI = + BB->getParent()->getSubtarget().getRegisterInfo(); if (!MI->killsRegister(X86::EFLAGS) && !checkAndUpdateEFLAGSKill(MI, BB, TRI)) { copy0MBB->addLiveIn(X86::EFLAGS); @@ -17477,17 +20445,20 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI, } MachineBasicBlock * -X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI, MachineBasicBlock *BB, - bool Is64Bit) const { +X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI, + MachineBasicBlock *BB) const { MachineFunction *MF = BB->getParent(); - const TargetInstrInfo *TII = MF->getTarget().getInstrInfo(); + const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo(); DebugLoc DL = MI->getDebugLoc(); const BasicBlock *LLVM_BB = BB->getBasicBlock(); assert(MF->shouldSplitStack()); - unsigned TlsReg = Is64Bit ? X86::FS : X86::GS; - unsigned TlsOffset = Is64Bit ? 0x70 : 0x30; + const bool Is64Bit = Subtarget->is64Bit(); + const bool IsLP64 = Subtarget->isTarget64BitLP64(); + + const unsigned TlsReg = Is64Bit ? X86::FS : X86::GS; + const unsigned TlsOffset = IsLP64 ? 0x70 : Is64Bit ? 0x40 : 0x30; // BB: // ... [Till the alloca] @@ -17511,14 +20482,14 @@ X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI, MachineBasicBlock *BB, MachineRegisterInfo &MRI = MF->getRegInfo(); const TargetRegisterClass *AddrRegClass = - getRegClassFor(Is64Bit ? MVT::i64:MVT::i32); + getRegClassFor(getPointerTy()); unsigned mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass), bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass), tmpSPVReg = MRI.createVirtualRegister(AddrRegClass), SPLimitVReg = MRI.createVirtualRegister(AddrRegClass), sizeVReg = MI->getOperand(1).getReg(), - physSPReg = Is64Bit ? X86::RSP : X86::ESP; + physSPReg = IsLP64 || Subtarget->isTargetNaCl64() ? 
X86::RSP : X86::ESP; MachineFunction::iterator MBBIter = BB; ++MBBIter; @@ -17534,9 +20505,9 @@ X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI, MachineBasicBlock *BB, // Add code to the main basic block to check if the stack limit has been hit, // and if so, jump to mallocMBB otherwise to bumpMBB. BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg); - BuildMI(BB, DL, TII->get(Is64Bit ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg) + BuildMI(BB, DL, TII->get(IsLP64 ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg) .addReg(tmpSPVReg).addReg(sizeVReg); - BuildMI(BB, DL, TII->get(Is64Bit ? X86::CMP64mr:X86::CMP32mr)) + BuildMI(BB, DL, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr)) .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg) .addReg(SPLimitVReg); BuildMI(BB, DL, TII->get(X86::JG_4)).addMBB(mallocMBB); @@ -17550,9 +20521,11 @@ X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI, MachineBasicBlock *BB, BuildMI(bumpMBB, DL, TII->get(X86::JMP_4)).addMBB(continueMBB); // Calls into a routine in libgcc to allocate more space from the heap. - const uint32_t *RegMask = - MF->getTarget().getRegisterInfo()->getCallPreservedMask(CallingConv::C); - if (Is64Bit) { + const uint32_t *RegMask = MF->getTarget() + .getSubtargetImpl() + ->getRegisterInfo() + ->getCallPreservedMask(CallingConv::C); + if (IsLP64) { BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI) .addReg(sizeVReg); BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32)) @@ -17560,6 +20533,14 @@ X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI, MachineBasicBlock *BB, .addRegMask(RegMask) .addReg(X86::RDI, RegState::Implicit) .addReg(X86::RAX, RegState::ImplicitDefine); + } else if (Is64Bit) { + BuildMI(mallocMBB, DL, TII->get(X86::MOV32rr), X86::EDI) + .addReg(sizeVReg); + BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32)) + .addExternalSymbol("__morestack_allocate_stack_space") + .addRegMask(RegMask) + .addReg(X86::EDI, RegState::Implicit) + .addReg(X86::EAX, RegState::ImplicitDefine); } else { BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg) .addImm(12); @@ -17575,7 +20556,7 @@ X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI, MachineBasicBlock *BB, .addImm(16); BuildMI(mallocMBB, DL, TII->get(TargetOpcode::COPY), mallocPtrVReg) - .addReg(Is64Bit ? X86::RAX : X86::EAX); + .addReg(IsLP64 ? X86::RAX : X86::EAX); BuildMI(mallocMBB, DL, TII->get(X86::JMP_4)).addMBB(continueMBB); // Set up the CFG correctly. @@ -17600,7 +20581,7 @@ X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI, MachineBasicBlock *BB, MachineBasicBlock * X86TargetLowering::EmitLoweredWinAlloca(MachineInstr *MI, MachineBasicBlock *BB) const { - const TargetInstrInfo *TII = BB->getParent()->getTarget().getInstrInfo(); + const TargetInstrInfo *TII = BB->getParent()->getSubtarget().getInstrInfo(); DebugLoc DL = MI->getDebugLoc(); assert(!Subtarget->isTargetMacho()); @@ -17633,8 +20614,10 @@ X86TargetLowering::EmitLoweredWinAlloca(MachineInstr *MI, .addReg(X86::RAX); } } else { - const char *StackProbeSymbol = - Subtarget->isTargetKnownWindowsMSVC() ? "_chkstk" : "_alloca"; + const char *StackProbeSymbol = (Subtarget->isTargetKnownWindowsMSVC() || + Subtarget->isTargetWindowsItanium()) + ? "_chkstk" + : "_alloca"; BuildMI(*BB, MI, DL, TII->get(X86::CALLpcrel32)) .addExternalSymbol(StackProbeSymbol) @@ -17657,8 +20640,8 @@ X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI, // or EAX and doing an indirect call. The return value will then // be in the normal return register. 
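The segmented-stack changes above stop keying everything off a single Is64Bit flag and instead distinguish LP64, x32 (64-bit ISA with 32-bit pointers) and ia32. A compressed sketch of the selection logic, mirroring the constants and call conventions in the hunk; the strings stand in for the real register enums and are illustrative only:

    #include <cstdio>

    struct SegAllocaABI {
      const char *TlsReg;   // segment register holding the stack-limit slot
      unsigned TlsOffset;   // offset of that slot within the segment
      const char *ArgReg;   // how the size reaches __morestack_allocate_stack_space
      const char *RetReg;   // where the new allocation comes back
    };

    static SegAllocaABI pickSegAllocaABI(bool IsLP64, bool Is64Bit) {
      if (IsLP64)  return {"fs", 0x70, "rdi", "rax"};
      if (Is64Bit) return {"fs", 0x40, "edi", "eax"};  // x32
      return {"gs", 0x30, "stack", "eax"};             // ia32 passes the size on the stack
    }

    int main() {
      SegAllocaABI A = pickSegAllocaABI(/*IsLP64=*/false, /*Is64Bit=*/true);
      std::printf("%s:0x%x arg=%s ret=%s\n", A.TlsReg, A.TlsOffset, A.ArgReg, A.RetReg);
    }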
MachineFunction *F = BB->getParent(); - const X86InstrInfo *TII - = static_cast<const X86InstrInfo*>(F->getTarget().getInstrInfo()); + const X86InstrInfo *TII = + static_cast<const X86InstrInfo *>(F->getSubtarget().getInstrInfo()); DebugLoc DL = MI->getDebugLoc(); assert(Subtarget->isTargetDarwin() && "Darwin only instr emitted?"); @@ -17667,8 +20650,10 @@ X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI, // Get a register mask for the lowered call. // FIXME: The 32-bit calls have non-standard calling conventions. Use a // proper register mask. - const uint32_t *RegMask = - F->getTarget().getRegisterInfo()->getCallPreservedMask(CallingConv::C); + const uint32_t *RegMask = F->getTarget() + .getSubtargetImpl() + ->getRegisterInfo() + ->getCallPreservedMask(CallingConv::C); if (Subtarget->is64Bit()) { MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(X86::MOV64rm), X86::RDI) @@ -17713,7 +20698,7 @@ X86TargetLowering::emitEHSjLjSetJmp(MachineInstr *MI, MachineBasicBlock *MBB) const { DebugLoc DL = MI->getDebugLoc(); MachineFunction *MF = MBB->getParent(); - const TargetInstrInfo *TII = MF->getTarget().getInstrInfo(); + const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo(); MachineRegisterInfo &MRI = MF->getRegInfo(); const BasicBlock *BB = MBB->getBasicBlock(); @@ -17819,8 +20804,8 @@ X86TargetLowering::emitEHSjLjSetJmp(MachineInstr *MI, MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup)) .addMBB(restoreMBB); - const X86RegisterInfo *RegInfo = - static_cast<const X86RegisterInfo*>(MF->getTarget().getRegisterInfo()); + const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>( + MF->getSubtarget().getRegisterInfo()); MIB.addRegMask(RegInfo->getNoPreservedMask()); thisMBB->addSuccessor(mainMBB); thisMBB->addSuccessor(restoreMBB); @@ -17850,7 +20835,7 @@ X86TargetLowering::emitEHSjLjLongJmp(MachineInstr *MI, MachineBasicBlock *MBB) const { DebugLoc DL = MI->getDebugLoc(); MachineFunction *MF = MBB->getParent(); - const TargetInstrInfo *TII = MF->getTarget().getInstrInfo(); + const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo(); MachineRegisterInfo &MRI = MF->getRegInfo(); // Memory Reference @@ -17865,8 +20850,8 @@ X86TargetLowering::emitEHSjLjLongJmp(MachineInstr *MI, (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass; unsigned Tmp = MRI.createVirtualRegister(RC); // Since FP is only updated here but NOT referenced, it's treated as GPR. - const X86RegisterInfo *RegInfo = - static_cast<const X86RegisterInfo*>(MF->getTarget().getRegisterInfo()); + const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>( + MF->getSubtarget().getRegisterInfo()); unsigned FP = (PVT == MVT::i64) ? 
X86::RBP : X86::EBP; unsigned SP = RegInfo->getStackRegister(); @@ -17965,6 +20950,11 @@ X86TargetLowering::emitFMA3Instr(MachineInstr *MI, case X86::VFNMSUBPSr213r: NewFMAOpc = X86::VFNMSUBPSr231r; break; case X86::VFNMSUBSDr213r: NewFMAOpc = X86::VFNMSUBSDr231r; break; case X86::VFNMSUBSSr213r: NewFMAOpc = X86::VFNMSUBSSr231r; break; + case X86::VFMADDSUBPDr213r: NewFMAOpc = X86::VFMADDSUBPDr231r; break; + case X86::VFMADDSUBPSr213r: NewFMAOpc = X86::VFMADDSUBPSr231r; break; + case X86::VFMSUBADDPDr213r: NewFMAOpc = X86::VFMSUBADDPDr231r; break; + case X86::VFMSUBADDPSr213r: NewFMAOpc = X86::VFMSUBADDPSr231r; break; + case X86::VFMADDPDr213rY: NewFMAOpc = X86::VFMADDPDr231rY; break; case X86::VFMADDPSr213rY: NewFMAOpc = X86::VFMADDPSr231rY; break; case X86::VFMSUBPDr213rY: NewFMAOpc = X86::VFMSUBPDr231rY; break; @@ -17973,10 +20963,14 @@ X86TargetLowering::emitFMA3Instr(MachineInstr *MI, case X86::VFNMADDPSr213rY: NewFMAOpc = X86::VFNMADDPSr231rY; break; case X86::VFNMSUBPDr213rY: NewFMAOpc = X86::VFNMSUBPDr231rY; break; case X86::VFNMSUBPSr213rY: NewFMAOpc = X86::VFNMSUBPSr231rY; break; + case X86::VFMADDSUBPDr213rY: NewFMAOpc = X86::VFMADDSUBPDr231rY; break; + case X86::VFMADDSUBPSr213rY: NewFMAOpc = X86::VFMADDSUBPSr231rY; break; + case X86::VFMSUBADDPDr213rY: NewFMAOpc = X86::VFMSUBADDPDr231rY; break; + case X86::VFMSUBADDPSr213rY: NewFMAOpc = X86::VFMSUBADDPSr231rY; break; default: llvm_unreachable("Unrecognized FMA variant."); } - const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo(); + const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(), TII.get(NewFMAOpc)) .addOperand(MI->getOperand(0)) @@ -18007,9 +21001,8 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, case X86::WIN_ALLOCA: return EmitLoweredWinAlloca(MI, BB); case X86::SEG_ALLOCA_32: - return EmitLoweredSegAlloca(MI, BB, false); case X86::SEG_ALLOCA_64: - return EmitLoweredSegAlloca(MI, BB, true); + return EmitLoweredSegAlloca(MI, BB); case X86::TLSCall_32: case X86::TLSCall_64: return EmitLoweredTLSCall(MI, BB); @@ -18042,7 +21035,7 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, case X86::FP80_TO_INT32_IN_MEM: case X86::FP80_TO_INT64_IN_MEM: { MachineFunction *F = BB->getParent(); - const TargetInstrInfo *TII = F->getTarget().getInstrInfo(); + const TargetInstrInfo *TII = F->getSubtarget().getInstrInfo(); DebugLoc DL = MI->getDebugLoc(); // Change the floating point control register to use "round towards zero" @@ -18126,7 +21119,7 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, case X86::VPCMPESTRM128MEM: assert(Subtarget->hasSSE42() && "Target must have SSE4.2 or AVX features enabled"); - return EmitPCMPSTRM(MI, BB, BB->getParent()->getTarget().getInstrInfo()); + return EmitPCMPSTRM(MI, BB, BB->getParent()->getSubtarget().getInstrInfo()); // String/text processing lowering. case X86::PCMPISTRIREG: @@ -18139,15 +21132,16 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, case X86::VPCMPESTRIMEM: assert(Subtarget->hasSSE42() && "Target must have SSE4.2 or AVX features enabled"); - return EmitPCMPSTRI(MI, BB, BB->getParent()->getTarget().getInstrInfo()); + return EmitPCMPSTRI(MI, BB, BB->getParent()->getSubtarget().getInstrInfo()); // Thread synchronization. 
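Earlier in this hunk, emitFMA3Instr gains the VFMADDSUB/VFMSUBADD 213 variants and remaps them to the matching 231 forms, exactly as it already did for the plain FMA opcodes. The digit suffix names which source operands feed the multiply and which is the addend, with operand 1 doubling as the destination. A scalar model of why the remap preserves the computed value (the operand-shuffling details that make it profitable live in the surrounding function and are not restated here):

    // Scalar model of the FMA3 operand forms; operand 1 is the destination.
    static double fma213(double op1, double op2, double op3) {
      return op2 * op1 + op3;   // ...213: multiply op2*op1, add op3
    }
    static double fma231(double op1, double op2, double op3) {
      return op2 * op3 + op1;   // ...231: multiply op2*op3, add op1
    }

    int main() {
      double a = 2, b = 3, c = 10;
      // Both forms can compute a*b + c; they differ only in which operand
      // slot (and therefore which register) holds the addend.
      return fma213(b, a, c) == fma231(c, a, b) ? 0 : 1;   // 16 == 16
    }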
case X86::MONITOR: - return EmitMonitor(MI, BB, BB->getParent()->getTarget().getInstrInfo(), Subtarget); + return EmitMonitor(MI, BB, BB->getParent()->getSubtarget().getInstrInfo(), + Subtarget); // xbegin case X86::XBEGIN: - return EmitXBegin(MI, BB, BB->getParent()->getTarget().getInstrInfo()); + return EmitXBegin(MI, BB, BB->getParent()->getSubtarget().getInstrInfo()); case X86::VASTART_SAVE_XMM_REGS: return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB); @@ -18183,6 +21177,10 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, case X86::VFNMSUBPSr213r: case X86::VFNMSUBSDr213r: case X86::VFNMSUBSSr213r: + case X86::VFMADDSUBPDr213r: + case X86::VFMADDSUBPSr213r: + case X86::VFMSUBADDPDr213r: + case X86::VFMSUBADDPSr213r: case X86::VFMADDPDr213rY: case X86::VFMADDPSr213rY: case X86::VFMSUBPDr213rY: @@ -18191,6 +21189,10 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, case X86::VFNMADDPSr213rY: case X86::VFNMSUBPDr213rY: case X86::VFNMSUBPSr213rY: + case X86::VFMADDSUBPDr213rY: + case X86::VFMADDSUBPSr213rY: + case X86::VFMSUBADDPDr213rY: + case X86::VFMSUBADDPSr213rY: return emitFMA3Instr(MI, BB); } } @@ -18420,6 +21422,329 @@ static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG, return SDValue(); } +/// \brief Combine an arbitrary chain of shuffles into a single instruction if +/// possible. +/// +/// This is the leaf of the recursive combinine below. When we have found some +/// chain of single-use x86 shuffle instructions and accumulated the combined +/// shuffle mask represented by them, this will try to pattern match that mask +/// into either a single instruction if there is a special purpose instruction +/// for this operation, or into a PSHUFB instruction which is a fully general +/// instruction but should only be used to replace chains over a certain depth. +static bool combineX86ShuffleChain(SDValue Op, SDValue Root, ArrayRef<int> Mask, + int Depth, bool HasPSHUFB, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget *Subtarget) { + assert(!Mask.empty() && "Cannot combine an empty shuffle mask!"); + + // Find the operand that enters the chain. Note that multiple uses are OK + // here, we're not going to remove the operand we find. + SDValue Input = Op.getOperand(0); + while (Input.getOpcode() == ISD::BITCAST) + Input = Input.getOperand(0); + + MVT VT = Input.getSimpleValueType(); + MVT RootVT = Root.getSimpleValueType(); + SDLoc DL(Root); + + // Just remove no-op shuffle masks. + if (Mask.size() == 1) { + DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Input), + /*AddTo*/ true); + return true; + } + + // Use the float domain if the operand type is a floating point type. + bool FloatDomain = VT.isFloatingPoint(); + + // For floating point shuffles, we don't have free copies in the shuffle + // instructions or the ability to load as part of the instruction, so + // canonicalize their shuffles to UNPCK or MOV variants. + // + // Note that even with AVX we prefer the PSHUFD form of shuffle for integer + // vectors because it can have a load folded into it that UNPCK cannot. This + // doesn't preclude something switching to the shorter encoding post-RA. + if (FloatDomain) { + if (Mask.equals(0, 0) || Mask.equals(1, 1)) { + bool Lo = Mask.equals(0, 0); + unsigned Shuffle; + MVT ShuffleVT; + // Check if we have SSE3 which will let us use MOVDDUP. That instruction + // is no slower than UNPCKLPD but has the option to fold the input operand + // into even an unaligned memory load. 
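To make the float-domain canonicalizations chosen by the branches that follow concrete, here is a scalar model of the element movements involved; the lane semantics are the standard SSE/SSE3 definitions, and with both operands equal they reproduce exactly the masks the code tests for, so the choice between them is purely an encoding and load-folding decision.

    #include <array>
    using V2 = std::array<double, 2>;
    using V4 = std::array<float, 4>;

    // Mask <0,0> on v2f64: duplicate the low element (SSE3).
    static V2 movddup(V2 a)        { return {a[0], a[0]}; }
    // Same result through the v4f32 view: movlhps with both operands equal.
    static V4 movlhps(V4 a, V4 b)  { return {a[0], a[1], b[0], b[1]}; }
    // Mask <1,1> on v2f64 through the v4f32 view: movhlps, operands equal.
    static V4 movhlps(V4 a, V4 b)  { return {b[2], b[3], a[2], a[3]}; }
    // SSE3 masks <0,0,2,2> and <1,1,3,3> on v4f32.
    static V4 movsldup(V4 a)       { return {a[0], a[0], a[2], a[2]}; }
    static V4 movshdup(V4 a)       { return {a[1], a[1], a[3], a[3]}; }
    // Masks <0,0,1,1> and <2,2,3,3>: unpcklps / unpckhps with a == b.
    static V4 unpcklps(V4 a, V4 b) { return {a[0], b[0], a[1], b[1]}; }
    static V4 unpckhps(V4 a, V4 b) { return {a[2], b[2], a[3], b[3]}; }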
+ if (Lo && Subtarget->hasSSE3()) { + Shuffle = X86ISD::MOVDDUP; + ShuffleVT = MVT::v2f64; + } else { + // We have MOVLHPS and MOVHLPS throughout SSE and they encode smaller + // than the UNPCK variants. + Shuffle = Lo ? X86ISD::MOVLHPS : X86ISD::MOVHLPS; + ShuffleVT = MVT::v4f32; + } + if (Depth == 1 && Root->getOpcode() == Shuffle) + return false; // Nothing to do! + Op = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Input); + DCI.AddToWorklist(Op.getNode()); + if (Shuffle == X86ISD::MOVDDUP) + Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op); + else + Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op, Op); + DCI.AddToWorklist(Op.getNode()); + DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op), + /*AddTo*/ true); + return true; + } + if (Subtarget->hasSSE3() && + (Mask.equals(0, 0, 2, 2) || Mask.equals(1, 1, 3, 3))) { + bool Lo = Mask.equals(0, 0, 2, 2); + unsigned Shuffle = Lo ? X86ISD::MOVSLDUP : X86ISD::MOVSHDUP; + MVT ShuffleVT = MVT::v4f32; + if (Depth == 1 && Root->getOpcode() == Shuffle) + return false; // Nothing to do! + Op = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Input); + DCI.AddToWorklist(Op.getNode()); + Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op); + DCI.AddToWorklist(Op.getNode()); + DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op), + /*AddTo*/ true); + return true; + } + if (Mask.equals(0, 0, 1, 1) || Mask.equals(2, 2, 3, 3)) { + bool Lo = Mask.equals(0, 0, 1, 1); + unsigned Shuffle = Lo ? X86ISD::UNPCKL : X86ISD::UNPCKH; + MVT ShuffleVT = MVT::v4f32; + if (Depth == 1 && Root->getOpcode() == Shuffle) + return false; // Nothing to do! + Op = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Input); + DCI.AddToWorklist(Op.getNode()); + Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op, Op); + DCI.AddToWorklist(Op.getNode()); + DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op), + /*AddTo*/ true); + return true; + } + } + + // We always canonicalize the 8 x i16 and 16 x i8 shuffles into their UNPCK + // variants as none of these have single-instruction variants that are + // superior to the UNPCK formulation. + if (!FloatDomain && + (Mask.equals(0, 0, 1, 1, 2, 2, 3, 3) || + Mask.equals(4, 4, 5, 5, 6, 6, 7, 7) || + Mask.equals(0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7) || + Mask.equals(8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, + 15))) { + bool Lo = Mask[0] == 0; + unsigned Shuffle = Lo ? X86ISD::UNPCKL : X86ISD::UNPCKH; + if (Depth == 1 && Root->getOpcode() == Shuffle) + return false; // Nothing to do! + MVT ShuffleVT; + switch (Mask.size()) { + case 8: + ShuffleVT = MVT::v8i16; + break; + case 16: + ShuffleVT = MVT::v16i8; + break; + default: + llvm_unreachable("Impossible mask size!"); + }; + Op = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Input); + DCI.AddToWorklist(Op.getNode()); + Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op, Op); + DCI.AddToWorklist(Op.getNode()); + DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op), + /*AddTo*/ true); + return true; + } + + // Don't try to re-form single instruction chains under any circumstances now + // that we've done encoding canonicalization for them. + if (Depth < 2) + return false; + + // If we have 3 or more shuffle instructions or a chain involving PSHUFB, we + // can replace them with a single PSHUFB instruction profitably. Intel's + // manuals suggest only using PSHUFB if doing so replacing 5 instructions, but + // in practice PSHUFB tends to be *very* fast so we're more aggressive. 
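The PSHUFB path implemented just below has to widen an element-level mask (2, 4, 8 or 16 entries) into a 16-entry byte mask. A worked sketch of that ratio expansion with plain integers, where -1 stands in for SM_SentinelUndef and -2 for SM_SentinelZero:

    #include <cstdio>
    #include <vector>

    // Expand an element mask over 16/N-byte elements into a PSHUFB byte mask.
    // Bytes with the high bit set make PSHUFB write zero, hence 255 for
    // zeroed lanes; -1 is kept to mean "undef byte" for printing purposes.
    static std::vector<int> expandToPshufbMask(const std::vector<int> &Mask) {
      int Ratio = 16 / (int)Mask.size();
      std::vector<int> Bytes;
      for (int i = 0; i < 16; ++i) {
        int M = Mask[i / Ratio];
        if (M == -1) { Bytes.push_back(-1);  continue; }  // undef lane
        if (M == -2) { Bytes.push_back(255); continue; }  // zero lane
        Bytes.push_back(Ratio * M + i % Ratio);  // byte j of source element M
      }
      return Bytes;
    }

    int main() {
      // v4i32 mask <2,0,-1,3> -> bytes 8..11, 0..3, four undefs, 12..15
      for (int B : expandToPshufbMask({2, 0, -1, 3}))
        std::printf("%d ", B);
      std::printf("\n");
    }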
+ if ((Depth >= 3 || HasPSHUFB) && Subtarget->hasSSSE3()) { + SmallVector<SDValue, 16> PSHUFBMask; + assert(Mask.size() <= 16 && "Can't shuffle elements smaller than bytes!"); + int Ratio = 16 / Mask.size(); + for (unsigned i = 0; i < 16; ++i) { + if (Mask[i / Ratio] == SM_SentinelUndef) { + PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8)); + continue; + } + int M = Mask[i / Ratio] != SM_SentinelZero + ? Ratio * Mask[i / Ratio] + i % Ratio + : 255; + PSHUFBMask.push_back(DAG.getConstant(M, MVT::i8)); + } + Op = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Input); + DCI.AddToWorklist(Op.getNode()); + SDValue PSHUFBMaskOp = + DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, PSHUFBMask); + DCI.AddToWorklist(PSHUFBMaskOp.getNode()); + Op = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, Op, PSHUFBMaskOp); + DCI.AddToWorklist(Op.getNode()); + DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op), + /*AddTo*/ true); + return true; + } + + // Failed to find any combines. + return false; +} + +/// \brief Fully generic combining of x86 shuffle instructions. +/// +/// This should be the last combine run over the x86 shuffle instructions. Once +/// they have been fully optimized, this will recursively consider all chains +/// of single-use shuffle instructions, build a generic model of the cumulative +/// shuffle operation, and check for simpler instructions which implement this +/// operation. We use this primarily for two purposes: +/// +/// 1) Collapse generic shuffles to specialized single instructions when +/// equivalent. In most cases, this is just an encoding size win, but +/// sometimes we will collapse multiple generic shuffles into a single +/// special-purpose shuffle. +/// 2) Look for sequences of shuffle instructions with 3 or more total +/// instructions, and replace them with the slightly more expensive SSSE3 +/// PSHUFB instruction if available. We do this as the last combining step +/// to ensure we avoid using PSHUFB if we can implement the shuffle with +/// a suitable short sequence of other instructions. The PHUFB will either +/// use a register or have to read from memory and so is slightly (but only +/// slightly) more expensive than the other shuffle instructions. +/// +/// Because this is inherently a quadratic operation (for each shuffle in +/// a chain, we recurse up the chain), the depth is limited to 8 instructions. +/// This should never be an issue in practice as the shuffle lowering doesn't +/// produce sequences of more than 8 instructions. +/// +/// FIXME: We will currently miss some cases where the redundant shuffling +/// would simplify under the threshold for PSHUFB formation because of +/// combine-ordering. To fix this, we should do the redundant instruction +/// combining in this recursive walk. +static bool combineX86ShufflesRecursively(SDValue Op, SDValue Root, + ArrayRef<int> RootMask, + int Depth, bool HasPSHUFB, + SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget *Subtarget) { + // Bound the depth of our recursive combine because this is ultimately + // quadratic in nature. + if (Depth > 8) + return false; + + // Directly rip through bitcasts to find the underlying operand. + while (Op.getOpcode() == ISD::BITCAST && Op.getOperand(0).hasOneUse()) + Op = Op.getOperand(0); + + MVT VT = Op.getSimpleValueType(); + if (!VT.isVector()) + return false; // Bail if we hit a non-vector. + // FIXME: This routine should be taught about 256-bit shuffles, or a 256-bit + // version should be added. 
+ if (VT.getSizeInBits() != 128) + return false; + + assert(Root.getSimpleValueType().isVector() && + "Shuffles operate on vector types!"); + assert(VT.getSizeInBits() == Root.getSimpleValueType().getSizeInBits() && + "Can only combine shuffles of the same vector register size."); + + if (!isTargetShuffle(Op.getOpcode())) + return false; + SmallVector<int, 16> OpMask; + bool IsUnary; + bool HaveMask = getTargetShuffleMask(Op.getNode(), VT, OpMask, IsUnary); + // We only can combine unary shuffles which we can decode the mask for. + if (!HaveMask || !IsUnary) + return false; + + assert(VT.getVectorNumElements() == OpMask.size() && + "Different mask size from vector size!"); + assert(((RootMask.size() > OpMask.size() && + RootMask.size() % OpMask.size() == 0) || + (OpMask.size() > RootMask.size() && + OpMask.size() % RootMask.size() == 0) || + OpMask.size() == RootMask.size()) && + "The smaller number of elements must divide the larger."); + int RootRatio = std::max<int>(1, OpMask.size() / RootMask.size()); + int OpRatio = std::max<int>(1, RootMask.size() / OpMask.size()); + assert(((RootRatio == 1 && OpRatio == 1) || + (RootRatio == 1) != (OpRatio == 1)) && + "Must not have a ratio for both incoming and op masks!"); + + SmallVector<int, 16> Mask; + Mask.reserve(std::max(OpMask.size(), RootMask.size())); + + // Merge this shuffle operation's mask into our accumulated mask. Note that + // this shuffle's mask will be the first applied to the input, followed by the + // root mask to get us all the way to the root value arrangement. The reason + // for this order is that we are recursing up the operation chain. + for (int i = 0, e = std::max(OpMask.size(), RootMask.size()); i < e; ++i) { + int RootIdx = i / RootRatio; + if (RootMask[RootIdx] < 0) { + // This is a zero or undef lane, we're done. + Mask.push_back(RootMask[RootIdx]); + continue; + } + + int RootMaskedIdx = RootMask[RootIdx] * RootRatio + i % RootRatio; + int OpIdx = RootMaskedIdx / OpRatio; + if (OpMask[OpIdx] < 0) { + // The incoming lanes are zero or undef, it doesn't matter which ones we + // are using. + Mask.push_back(OpMask[OpIdx]); + continue; + } + + // Ok, we have non-zero lanes, map them through. + Mask.push_back(OpMask[OpIdx] * OpRatio + + RootMaskedIdx % OpRatio); + } + + // See if we can recurse into the operand to combine more things. + switch (Op.getOpcode()) { + case X86ISD::PSHUFB: + HasPSHUFB = true; + case X86ISD::PSHUFD: + case X86ISD::PSHUFHW: + case X86ISD::PSHUFLW: + if (Op.getOperand(0).hasOneUse() && + combineX86ShufflesRecursively(Op.getOperand(0), Root, Mask, Depth + 1, + HasPSHUFB, DAG, DCI, Subtarget)) + return true; + break; + + case X86ISD::UNPCKL: + case X86ISD::UNPCKH: + assert(Op.getOperand(0) == Op.getOperand(1) && "We only combine unary shuffles!"); + // We can't check for single use, we have to check that this shuffle is the only user. + if (Op->isOnlyUserOf(Op.getOperand(0).getNode()) && + combineX86ShufflesRecursively(Op.getOperand(0), Root, Mask, Depth + 1, + HasPSHUFB, DAG, DCI, Subtarget)) + return true; + break; + } + + // Minor canonicalization of the accumulated shuffle mask to make it easier + // to match below. All this does is detect masks with squential pairs of + // elements, and shrink them to the half-width mask. It does this in a loop + // so it will reduce the size of the mask to the minimal width mask which + // performs an equivalent shuffle. 
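The mask-merging loop above composes the current shuffle's mask with the accumulated root mask even when the two describe the same 128 bits at different element widths. A standalone sketch of just that arithmetic, with a worked example; negative entries model the undef/zero sentinels:

    #include <algorithm>
    #include <cstdio>
    #include <vector>

    // Compose OpMask (applied first) with RootMask (applied on top of it),
    // mirroring the RootRatio/OpRatio arithmetic in the hunk above.
    static std::vector<int> mergeMasks(const std::vector<int> &RootMask,
                                       const std::vector<int> &OpMask) {
      int RootRatio = std::max<int>(1, OpMask.size() / RootMask.size());
      int OpRatio   = std::max<int>(1, RootMask.size() / OpMask.size());
      std::vector<int> Mask;
      int E = std::max(OpMask.size(), RootMask.size());
      for (int i = 0; i < E; ++i) {
        int RootIdx = i / RootRatio;
        if (RootMask[RootIdx] < 0) { Mask.push_back(RootMask[RootIdx]); continue; }
        int RootMaskedIdx = RootMask[RootIdx] * RootRatio + i % RootRatio;
        int OpIdx = RootMaskedIdx / OpRatio;
        if (OpMask[OpIdx] < 0) { Mask.push_back(OpMask[OpIdx]); continue; }
        Mask.push_back(OpMask[OpIdx] * OpRatio + RootMaskedIdx % OpRatio);
      }
      return Mask;
    }

    int main() {
      // Op: v8i16 shuffle <1,0,2,3,4,5,6,7> (swap the low two words),
      // Root: v4i32 shuffle <2,3,0,1> (swap the 64-bit halves).
      // Composition: <4,5,6,7,1,0,2,3> as a single v8i16 shuffle.
      for (int M : mergeMasks({2, 3, 0, 1}, {1, 0, 2, 3, 4, 5, 6, 7}))
        std::printf("%d ", M);
      std::printf("\n");
    }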
+ SmallVector<int, 16> WidenedMask; + while (Mask.size() > 1 && canWidenShuffleElements(Mask, WidenedMask)) { + Mask = std::move(WidenedMask); + WidenedMask.clear(); + } + + return combineX86ShuffleChain(Op, Root, Mask, Depth, HasPSHUFB, DAG, DCI, + Subtarget); +} + /// \brief Get the PSHUF-style mask from PSHUF node. /// /// This is a very minor wrapper around getTargetShuffleMask to easy forming v4 @@ -18452,19 +21777,23 @@ static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) { /// We walk up the chain and look for a combinable shuffle, skipping over /// shuffles that we could hoist this shuffle's transformation past without /// altering anything. -static bool combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask, - SelectionDAG &DAG, - TargetLowering::DAGCombinerInfo &DCI) { +static SDValue +combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask, + SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI) { assert(N.getOpcode() == X86ISD::PSHUFD && "Called with something other than an x86 128-bit half shuffle!"); SDLoc DL(N); - // Walk up a single-use chain looking for a combinable shuffle. + // Walk up a single-use chain looking for a combinable shuffle. Keep a stack + // of the shuffles in the chain so that we can form a fresh chain to replace + // this one. + SmallVector<SDValue, 8> Chain; SDValue V = N.getOperand(0); for (; V.hasOneUse(); V = V.getOperand(0)) { switch (V.getOpcode()) { default: - return false; // Nothing combined! + return SDValue(); // Nothing combined! case ISD::BITCAST: // Skip bitcasts as we always know the type for the target specific @@ -18480,8 +21809,9 @@ static bool combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask, // dword shuffle, and the high words are self-contained. if (Mask[0] != 0 || Mask[1] != 1 || !(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4)) - return false; + return SDValue(); + Chain.push_back(V); continue; case X86ISD::PSHUFHW: @@ -18489,8 +21819,9 @@ static bool combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask, // dword shuffle, and the low words are self-contained. if (Mask[2] != 2 || Mask[3] != 3 || !(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2)) - return false; + return SDValue(); + Chain.push_back(V); continue; case X86ISD::UNPCKL: @@ -18498,25 +21829,28 @@ static bool combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask, // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword // shuffle into a preceding word shuffle. if (V.getValueType() != MVT::v16i8 && V.getValueType() != MVT::v8i16) - return false; + return SDValue(); // Search for a half-shuffle which we can combine with. unsigned CombineOp = V.getOpcode() == X86ISD::UNPCKL ? X86ISD::PSHUFLW : X86ISD::PSHUFHW; if (V.getOperand(0) != V.getOperand(1) || !V->isOnlyUserOf(V.getOperand(0).getNode())) - return false; + return SDValue(); + Chain.push_back(V); V = V.getOperand(0); do { switch (V.getOpcode()) { default: - return false; // Nothing to combine. + return SDValue(); // Nothing to combine. case X86ISD::PSHUFLW: case X86ISD::PSHUFHW: if (V.getOpcode() == CombineOp) break; + Chain.push_back(V); + // Fallthrough! case ISD::BITCAST: V = V.getOperand(0); @@ -18532,10 +21866,7 @@ static bool combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask, if (!V.hasOneUse()) // We fell out of the loop without finding a viable combining instruction. - return false; - - // Record the old value to use in RAUW-ing. 
- SDValue Old = V; + return SDValue(); // Merge this node's mask and our incoming mask. SmallVector<int, 4> VMask = getPSHUFShuffleMask(V); @@ -18544,20 +21875,34 @@ static bool combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask, V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0), getV4X86ShuffleImm8ForMask(Mask, DAG)); - // It is possible that one of the combinable shuffles was completely absorbed - // by the other, just replace it and revisit all users in that case. - if (Old.getNode() == V.getNode()) { - DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo=*/true); - return true; - } + // Rebuild the chain around this new shuffle. + while (!Chain.empty()) { + SDValue W = Chain.pop_back_val(); - // Replace N with its operand as we're going to combine that shuffle away. - DAG.ReplaceAllUsesWith(N, N.getOperand(0)); + if (V.getValueType() != W.getOperand(0).getValueType()) + V = DAG.getNode(ISD::BITCAST, DL, W.getOperand(0).getValueType(), V); - // Replace the combinable shuffle with the combined one, updating all users - // so that we re-evaluate the chain here. - DCI.CombineTo(Old.getNode(), V, /*AddTo*/ true); - return true; + switch (W.getOpcode()) { + default: + llvm_unreachable("Only PSHUF and UNPCK instructions get here!"); + + case X86ISD::UNPCKL: + case X86ISD::UNPCKH: + V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, V); + break; + + case X86ISD::PSHUFD: + case X86ISD::PSHUFLW: + case X86ISD::PSHUFHW: + V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, W.getOperand(1)); + break; + } + } + if (V.getValueType() != N.getValueType()) + V = DAG.getNode(ISD::BITCAST, DL, N.getValueType(), V); + + // Return the new chain to replace N. + return V; } /// \brief Search for a combinable shuffle across a chain ending in pshuflw or pshufhw. @@ -18593,26 +21938,6 @@ static bool combineRedundantHalfShuffle(SDValue N, MutableArrayRef<int> Mask, // Other-half shuffles are no-ops. continue; - - case X86ISD::PSHUFD: { - // We can only handle pshufd if the half we are combining either stays in - // its half, or switches to the other half. Bail if one of these isn't - // true. - SmallVector<int, 4> VMask = getPSHUFShuffleMask(V); - int DOffset = CombineOpcode == X86ISD::PSHUFLW ? 0 : 2; - if (!((VMask[DOffset + 0] < 2 && VMask[DOffset + 1] < 2) || - (VMask[DOffset + 0] >= 2 && VMask[DOffset + 1] >= 2))) - return false; - - // Map the mask through the pshufd and keep walking up the chain. - for (int i = 0; i < 4; ++i) - Mask[i] = 2 * (VMask[DOffset + Mask[i] / 2] % 2) + Mask[i] % 2; - - // Switch halves if the pshufd does. - CombineOpcode = - VMask[DOffset + Mask[0] / 2] < 2 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW; - continue; - } } // Break out of the loop if we break out of the switch. break; @@ -18622,7 +21947,11 @@ static bool combineRedundantHalfShuffle(SDValue N, MutableArrayRef<int> Mask, // We fell out of the loop without finding a viable combining instruction. return false; - // Record the old value to use in RAUW-ing. + // Combine away the bottom node as its shuffle will be accumulated into + // a preceding shuffle. + DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true); + + // Record the old value. 
SDValue Old = V; // Merge this node's mask and our incoming mask (adjusted to account for all @@ -18633,12 +21962,13 @@ static bool combineRedundantHalfShuffle(SDValue N, MutableArrayRef<int> Mask, V = DAG.getNode(V.getOpcode(), DL, MVT::v8i16, V.getOperand(0), getV4X86ShuffleImm8ForMask(Mask, DAG)); - // Replace N with its operand as we're going to combine that shuffle away. - DAG.ReplaceAllUsesWith(N, N.getOperand(0)); + // Check that the shuffles didn't cancel each other out. If not, we need to + // combine to the new one. + if (Old != V) + // Replace the combinable shuffle with the combined one, updating all users + // so that we re-evaluate the chain here. + DCI.CombineTo(Old.getNode(), V, /*AddTo*/ true); - // Replace the combinable shuffle with the combined one, updating all users - // so that we re-evaluate the chain here. - DCI.CombineTo(Old.getNode(), V, /*AddTo*/ true); return true; } @@ -18679,13 +22009,13 @@ static SDValue PerformTargetShuffleCombine(SDValue N, SelectionDAG &DAG, return SDValue(); // We combined away this shuffle, so we're done. // See if this reduces to a PSHUFD which is no more expensive and can - // combine with more operations. - if (Mask[0] % 2 == 0 && Mask[2] % 2 == 0 && - areAdjacentMasksSequential(Mask)) { - int DMask[] = {-1, -1, -1, -1}; + // combine with more operations. Note that it has to at least flip the + // dwords as otherwise it would have been removed as a no-op. + if (Mask[0] == 2 && Mask[1] == 3 && Mask[2] == 0 && Mask[3] == 1) { + int DMask[] = {0, 1, 2, 3}; int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2; - DMask[DOffset + 0] = DOffset + Mask[0] / 2; - DMask[DOffset + 1] = DOffset + Mask[2] / 2; + DMask[DOffset + 0] = DOffset + 1; + DMask[DOffset + 1] = DOffset + 0; V = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V); DCI.AddToWorklist(V.getNode()); V = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V, @@ -18738,8 +22068,8 @@ static SDValue PerformTargetShuffleCombine(SDValue N, SelectionDAG &DAG, break; case X86ISD::PSHUFD: - if (combineRedundantDWordShuffle(N, Mask, DAG, DCI)) - return SDValue(); // We combined away this shuffle. + if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DAG, DCI)) + return NewN; break; } @@ -18747,6 +22077,61 @@ static SDValue PerformTargetShuffleCombine(SDValue N, SelectionDAG &DAG, return SDValue(); } +/// \brief Try to combine a shuffle into a target-specific add-sub node. +/// +/// We combine this directly on the abstract vector shuffle nodes so it is +/// easier to generically match. We also insert dummy vector shuffle nodes for +/// the operands which explicitly discard the lanes which are unused by this +/// operation to try to flow through the rest of the combiner the fact that +/// they're unused. +static SDValue combineShuffleToAddSub(SDNode *N, SelectionDAG &DAG) { + SDLoc DL(N); + EVT VT = N->getValueType(0); + + // We only handle target-independent shuffles. + // FIXME: It would be easy and harmless to use the target shuffle mask + // extraction tool to support more. + if (N->getOpcode() != ISD::VECTOR_SHUFFLE) + return SDValue(); + + auto *SVN = cast<ShuffleVectorSDNode>(N); + ArrayRef<int> Mask = SVN->getMask(); + SDValue V1 = N->getOperand(0); + SDValue V2 = N->getOperand(1); + + // We require the first shuffle operand to be the SUB node, and the second to + // be the ADD node. + // FIXME: We should support the commuted patterns. 
+ if (V1->getOpcode() != ISD::FSUB || V2->getOpcode() != ISD::FADD) + return SDValue(); + + // If there are other uses of these operations we can't fold them. + if (!V1->hasOneUse() || !V2->hasOneUse()) + return SDValue(); + + // Ensure that both operations have the same operands. Note that we can + // commute the FADD operands. + SDValue LHS = V1->getOperand(0), RHS = V1->getOperand(1); + if ((V2->getOperand(0) != LHS || V2->getOperand(1) != RHS) && + (V2->getOperand(0) != RHS || V2->getOperand(1) != LHS)) + return SDValue(); + + // We're looking for blends between FADD and FSUB nodes. We insist on these + // nodes being lined up in a specific expected pattern. + if (!(isShuffleEquivalent(Mask, 0, 3) || + isShuffleEquivalent(Mask, 0, 5, 2, 7) || + isShuffleEquivalent(Mask, 0, 9, 2, 11, 4, 13, 6, 15))) + return SDValue(); + + // Only specific types are legal at this point, assert so we notice if and + // when these change. + assert((VT == MVT::v4f32 || VT == MVT::v2f64 || VT == MVT::v8f32 || + VT == MVT::v4f64) && + "Unknown vector type encountered!"); + + return DAG.getNode(X86ISD::ADDSUB, DL, VT, LHS, RHS); +} + /// PerformShuffleCombine - Performs several different shuffle combines. static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, @@ -18756,54 +22141,17 @@ static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG, SDValue N1 = N->getOperand(1); EVT VT = N->getValueType(0); - // Canonicalize shuffles that perform 'addsub' on packed float vectors - // according to the rule: - // (shuffle (FADD A, B), (FSUB A, B), Mask) -> - // (shuffle (FSUB A, -B), (FADD A, -B), Mask) - // - // Where 'Mask' is: - // <0,5,2,7> -- for v4f32 and v4f64 shuffles; - // <0,3> -- for v2f64 shuffles; - // <0,9,2,11,4,13,6,15> -- for v8f32 shuffles. - // - // This helps pattern-matching more SSE3/AVX ADDSUB instructions - // during ISel stage. - if (N->getOpcode() == ISD::VECTOR_SHUFFLE && - ((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) || - (Subtarget->hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) && - N0->getOpcode() == ISD::FADD && N1->getOpcode() == ISD::FSUB && - // Operands to the FADD and FSUB must be the same. - ((N0->getOperand(0) == N1->getOperand(0) && - N0->getOperand(1) == N1->getOperand(1)) || - // FADD is commutable. See if by commuting the operands of the FADD - // we would still be able to match the operands of the FSUB dag node. - (N0->getOperand(1) == N1->getOperand(0) && - N0->getOperand(0) == N1->getOperand(1))) && - N0->getOperand(0)->getOpcode() != ISD::UNDEF && - N0->getOperand(1)->getOpcode() != ISD::UNDEF) { - - ShuffleVectorSDNode *SV = cast<ShuffleVectorSDNode>(N); - unsigned NumElts = VT.getVectorNumElements(); - ArrayRef<int> Mask = SV->getMask(); - bool CanFold = true; - - for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) - CanFold = Mask[i] == (int)((i & 1) ? i + NumElts : i); - - if (CanFold) { - SDValue Op0 = N1->getOperand(0); - SDValue Op1 = DAG.getNode(ISD::FNEG, dl, VT, N1->getOperand(1)); - SDValue Sub = DAG.getNode(ISD::FSUB, dl, VT, Op0, Op1); - SDValue Add = DAG.getNode(ISD::FADD, dl, VT, Op0, Op1); - return DAG.getVectorShuffle(VT, dl, Sub, Add, Mask); - } - } - // Don't create instructions with illegal types after legalize types has run. 
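The combineShuffleToAddSub helper added above recognizes a blend that takes the even lanes from an FSUB and the odd lanes from an FADD of the same operands, which is exactly the lane pattern of SSE3 addsubps/addsubpd. A scalar model of the X86ISD::ADDSUB node being formed:

    #include <array>
    #include <cstdio>

    // ADDSUB lane semantics (addsubps for v4f32): even lanes subtract,
    // odd lanes add.
    static std::array<float, 4> addsub(std::array<float, 4> A,
                                       std::array<float, 4> B) {
      return {A[0] - B[0], A[1] + B[1], A[2] - B[2], A[3] + B[3]};
    }

    int main() {
      // Equivalent to shuffling FSUB(A,B) and FADD(A,B) with mask <0,5,2,7>:
      // lanes 0 and 2 come from the subtract, lanes 5 and 7 (lanes 1 and 3
      // of the second operand) come from the add.
      auto R = addsub({1, 2, 3, 4}, {10, 20, 30, 40});
      std::printf("%g %g %g %g\n", R[0], R[1], R[2], R[3]);  // -9 22 -27 44
    }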
const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (!DCI.isBeforeLegalize() && !TLI.isTypeLegal(VT.getVectorElementType())) return SDValue(); + // If we have legalized the vector types, look for blends of FADD and FSUB + // nodes that we can fuse into an ADDSUB node. + if (TLI.isTypeLegal(VT) && Subtarget->hasSSE3()) + if (SDValue AddSub = combineShuffleToAddSub(N, DAG)) + return AddSub; + // Combine 256-bit vector shuffles. This is only profitable when in AVX mode if (Subtarget->hasFp256() && VT.is256BitVector() && N->getOpcode() == ISD::VECTOR_SHUFFLE) @@ -18880,6 +22228,18 @@ static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG, PerformTargetShuffleCombine(SDValue(N, 0), DAG, DCI, Subtarget); if (Shuffle.getNode()) return Shuffle; + + // Try recursively combining arbitrary sequences of x86 shuffle + // instructions into higher-order shuffles. We do this after combining + // specific PSHUF instruction sequences into their minimal form so that we + // can evaluate how many specialized shuffle instructions are involved in + // a particular chain. + SmallVector<int, 1> NonceMask; // Just a placeholder. + NonceMask.push_back(0); + if (combineX86ShufflesRecursively(SDValue(N, 0), SDValue(N, 0), NonceMask, + /*Depth*/ 1, /*HasPSHUFB*/ false, DAG, + DCI, Subtarget)) + return SDValue(); // This routine will use CombineTo to replace N. } return SDValue(); @@ -18897,7 +22257,7 @@ static SDValue PerformTruncateCombine(SDNode *N, SelectionDAG &DAG, /// XFormVExtractWithShuffleIntoLoad - Check if a vector extract from a target /// specific shuffle of a load can be folded into a single element load. /// Similar handling for VECTOR_SHUFFLE is performed by DAGCombiner, but -/// shuffles have been customed lowered so we need to handle those here. +/// shuffles have been custom lowered so we need to handle those here. static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI) { if (DCI.isBeforeLegalizeOps()) @@ -18909,20 +22269,20 @@ static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG, if (!isa<ConstantSDNode>(EltNo)) return SDValue(); - EVT VT = InVec.getValueType(); + EVT OriginalVT = InVec.getValueType(); - bool HasShuffleIntoBitcast = false; if (InVec.getOpcode() == ISD::BITCAST) { // Don't duplicate a load with other uses. if (!InVec.hasOneUse()) return SDValue(); EVT BCVT = InVec.getOperand(0).getValueType(); - if (BCVT.getVectorNumElements() != VT.getVectorNumElements()) + if (BCVT.getVectorNumElements() != OriginalVT.getVectorNumElements()) return SDValue(); InVec = InVec.getOperand(0); - HasShuffleIntoBitcast = true; } + EVT CurrentVT = InVec.getValueType(); + if (!isTargetShuffle(InVec.getOpcode())) return SDValue(); @@ -18932,12 +22292,12 @@ static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG, SmallVector<int, 16> ShuffleMask; bool UnaryShuffle; - if (!getTargetShuffleMask(InVec.getNode(), VT.getSimpleVT(), ShuffleMask, - UnaryShuffle)) + if (!getTargetShuffleMask(InVec.getNode(), CurrentVT.getSimpleVT(), + ShuffleMask, UnaryShuffle)) return SDValue(); // Select the input vector, guarding against out of range extract vector. - unsigned NumElems = VT.getVectorNumElements(); + unsigned NumElems = CurrentVT.getVectorNumElements(); int Elt = cast<ConstantSDNode>(EltNo)->getZExtValue(); int Idx = (Elt > (int)NumElems) ? -1 : ShuffleMask[Elt]; SDValue LdNode = (Idx < (int)NumElems) ? 
InVec.getOperand(0) @@ -18963,28 +22323,28 @@ static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG, if (!LN0 ||!LN0->hasNUsesOfValue(AllowedUses, 0) || LN0->isVolatile()) return SDValue(); - if (HasShuffleIntoBitcast) { - // If there's a bitcast before the shuffle, check if the load type and - // alignment is valid. - unsigned Align = LN0->getAlignment(); - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - unsigned NewAlign = TLI.getDataLayout()-> - getABITypeAlignment(VT.getTypeForEVT(*DAG.getContext())); + EVT EltVT = N->getValueType(0); + // If there's a bitcast before the shuffle, check if the load type and + // alignment is valid. + unsigned Align = LN0->getAlignment(); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + unsigned NewAlign = TLI.getDataLayout()->getABITypeAlignment( + EltVT.getTypeForEVT(*DAG.getContext())); - if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, VT)) - return SDValue(); - } + if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, EltVT)) + return SDValue(); // All checks match so transform back to vector_shuffle so that DAG combiner // can finish the job SDLoc dl(N); // Create shuffle node taking into account the case that its a unary shuffle - SDValue Shuffle = (UnaryShuffle) ? DAG.getUNDEF(VT) : InVec.getOperand(1); - Shuffle = DAG.getVectorShuffle(InVec.getValueType(), dl, + SDValue Shuffle = (UnaryShuffle) ? DAG.getUNDEF(CurrentVT) + : InVec.getOperand(1); + Shuffle = DAG.getVectorShuffle(CurrentVT, dl, InVec.getOperand(0), Shuffle, &ShuffleMask[0]); - Shuffle = DAG.getNode(ISD::BITCAST, dl, VT, Shuffle); + Shuffle = DAG.getNode(ISD::BITCAST, dl, OriginalVT, Shuffle); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0), Shuffle, EltNo); } @@ -19190,6 +22550,12 @@ TransformVSELECTtoBlendVECTOR_SHUFFLE(SDNode *N, SelectionDAG &DAG, if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) return SDValue(); + // A vselect where all conditions and data are constants can be optimized into + // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR(). + if (ISD::isBuildVectorOfConstantSDNodes(LHS.getNode()) && + ISD::isBuildVectorOfConstantSDNodes(RHS.getNode())) + return SDValue(); + unsigned MaskValue = 0; if (!BUILD_VECTORtoBlendMask(cast<BuildVectorSDNode>(Cond), MaskValue)) return SDValue(); @@ -19367,13 +22733,15 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, if (Subtarget->hasAVX512() && VT.isVector() && CondVT.isVector() && CondVT.getVectorElementType() == MVT::i1) { // v16i8 (select v16i1, v16i8, v16i8) does not have a proper - // lowering on AVX-512. In this case we convert it to + // lowering on KNL. In this case we convert it to // v16i8 (select v16i8, v16i8, v16i8) and use AVX instruction. - // The same situation for all 128 and 256-bit vectors of i8 and i16 + // The same situation for all 128 and 256-bit vectors of i8 and i16. + // Since SKX these selects have a proper lowering. 
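The sign-extension applied below works because an i1 lane extends to either all-zero or all-one bits, and the byte/word blend that the resulting vselect lowers to (PBLENDVB-style) keys off the sign bit of each condition element, so it selects the same lanes as the original v*i1 select. A small scalar check of that equivalence, purely illustrative (select_i1 and blend_by_sign are made-up helper names):

// Lane-wise model: selecting with a 1-bit mask and blending with the
// sign-extended mask (tested by its sign bit) give the same 8-bit result.
#include <cassert>
#include <cstdint>

static uint8_t select_i1(bool Cond, uint8_t T, uint8_t F) { return Cond ? T : F; }

static uint8_t blend_by_sign(uint8_t CondByte, uint8_t T, uint8_t F) {
  // PBLENDVB-style: the most significant bit of the condition byte decides.
  return (CondByte & 0x80) ? T : F;
}

int main() {
  for (int C = 0; C <= 1; ++C) {
    uint8_t Sext = C ? 0xFF : 0x00; // sign_extend i1 -> i8
    for (int T = 0; T < 256; ++T)
      for (int F = 0; F < 256; ++F)
        assert(select_i1(C != 0, (uint8_t)T, (uint8_t)F) ==
               blend_by_sign(Sext, (uint8_t)T, (uint8_t)F));
  }
  return 0;
}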
EVT OpVT = LHS.getValueType(); if ((OpVT.is128BitVector() || OpVT.is256BitVector()) && (OpVT.getVectorElementType() == MVT::i8 || - OpVT.getVectorElementType() == MVT::i16)) { + OpVT.getVectorElementType() == MVT::i16) && + !(Subtarget->hasBWI() && Subtarget->hasVLX())) { Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, OpVT, Cond); DCI.AddToWorklist(Cond.getNode()); return DAG.getNode(N->getOpcode(), DL, OpVT, Cond, LHS, RHS); @@ -19593,22 +22961,22 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, return DAG.getNode(Opc, DL, VT, LHS, RHS); } - // Simplify vector selection if the selector will be produced by CMPP*/PCMP*. - if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC && - // Check if SETCC has already been promoted - TLI.getSetCCResultType(*DAG.getContext(), VT) == CondVT && - // Check that condition value type matches vselect operand type - CondVT == VT) { - + // Simplify vector selection if condition value type matches vselect + // operand type + if (N->getOpcode() == ISD::VSELECT && CondVT == VT) { assert(Cond.getValueType().isVector() && "vector select expects a vector selector!"); bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode()); bool FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode()); - if (!TValIsAllOnes && !FValIsAllZeros) { - // Try invert the condition if true value is not all 1s and false value - // is not all 0s. + // Try invert the condition if true value is not all 1s and false value + // is not all 0s. + if (!TValIsAllOnes && !FValIsAllZeros && + // Check if the selector will be produced by CMPP*/PCMP* + Cond.getOpcode() == ISD::SETCC && + // Check if SETCC has already been promoted + TLI.getSetCCResultType(*DAG.getContext(), VT) == CondVT) { bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode()); bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode()); @@ -19726,22 +23094,17 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, // build_vector of constants. This will be taken care in a later // condition. (TLI.isOperationLegalOrCustom(ISD::VSELECT, VT) && VT != MVT::v16i16 && - VT != MVT::v8i16)) { + VT != MVT::v8i16) && + // Don't optimize vector of constants. Those are handled by + // the generic code and all the bits must be properly set for + // the generic optimizer. + !ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) { unsigned BitWidth = Cond.getValueType().getScalarType().getSizeInBits(); // Don't optimize vector selects that map to mask-registers. if (BitWidth == 1) return SDValue(); - // Check all uses of that condition operand to check whether it will be - // consumed by non-BLEND instructions, which may depend on all bits are set - // properly. - for (SDNode::use_iterator I = Cond->use_begin(), - E = Cond->use_end(); I != E; ++I) - if (I->getOpcode() != ISD::VSELECT) - // TODO: Add other opcodes eventually lowered into BLEND. 
- return SDValue(); - assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size"); APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 1); @@ -19749,8 +23112,45 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::TargetLoweringOpt TLO(DAG, DCI.isBeforeLegalize(), DCI.isBeforeLegalizeOps()); if (TLO.ShrinkDemandedConstant(Cond, DemandedMask) || - TLI.SimplifyDemandedBits(Cond, DemandedMask, KnownZero, KnownOne, TLO)) - DCI.CommitTargetLoweringOpt(TLO); + TLI.SimplifyDemandedBits(Cond, DemandedMask, KnownZero, KnownOne, + TLO)) { + // If we changed the computation somewhere in the DAG, this change + // will affect all users of Cond. + // Make sure it is fine and update all the nodes so that we do not + // use the generic VSELECT anymore. Otherwise, we may perform + // wrong optimizations as we messed up with the actual expectation + // for the vector boolean values. + if (Cond != TLO.Old) { + // Check all uses of that condition operand to check whether it will be + // consumed by non-BLEND instructions, which may depend on all bits are + // set properly. + for (SDNode::use_iterator I = Cond->use_begin(), E = Cond->use_end(); + I != E; ++I) + if (I->getOpcode() != ISD::VSELECT) + // TODO: Add other opcodes eventually lowered into BLEND. + return SDValue(); + + // Update all the users of the condition, before committing the change, + // so that the VSELECT optimizations that expect the correct vector + // boolean value will not be triggered. + for (SDNode::use_iterator I = Cond->use_begin(), E = Cond->use_end(); + I != E; ++I) + DAG.ReplaceAllUsesOfValueWith( + SDValue(*I, 0), + DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(*I), I->getValueType(0), + Cond, I->getOperand(1), I->getOperand(2))); + DCI.CommitTargetLoweringOpt(TLO); + return SDValue(); + } + // At this point, only Cond is changed. Change the condition + // just for N to keep the opportunity to optimize all other + // users their own way. + DAG.ReplaceAllUsesOfValueWith( + SDValue(N, 0), + DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(N), N->getValueType(0), + TLO.New, N->getOperand(1), N->getOperand(2))); + return SDValue(); + } } // We should generate an X86ISD::BLENDI from a vselect if its argument @@ -19764,7 +23164,9 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, // Iff we find this pattern and the build_vectors are built from // constants, we translate the vselect into a shuffle_vector that we // know will be matched by LowerVECTOR_SHUFFLEtoBlend. - if (N->getOpcode() == ISD::VSELECT && !DCI.isBeforeLegalize()) { + if ((N->getOpcode() == ISD::VSELECT || + N->getOpcode() == X86ISD::SHRUNKBLEND) && + !DCI.isBeforeLegalize()) { SDValue Shuffle = TransformVSELECTtoBlendVECTOR_SHUFFLE(N, DAG, Subtarget); if (Shuffle.getNode()) return Shuffle; @@ -20830,7 +24232,6 @@ static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG, EVT MemVT = Ld->getMemoryVT(); SDLoc dl(Ld); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - unsigned RegSz = RegVT.getSizeInBits(); // On Sandybridge unaligned 256bit loads are inefficient. ISD::LoadExtType Ext = Ld->getExtensionType(); @@ -20866,153 +24267,6 @@ static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG, return DCI.CombineTo(N, NewVec, TF, true); } - // If this is a vector EXT Load then attempt to optimize it using a - // shuffle. If SSSE3 is not available we may emit an illegal shuffle but the - // expansion is still better than scalar code. 
- // We generate X86ISD::VSEXT for SEXTLOADs if it's available, otherwise we'll - // emit a shuffle and a arithmetic shift. - // TODO: It is possible to support ZExt by zeroing the undef values - // during the shuffle phase or after the shuffle. - if (RegVT.isVector() && RegVT.isInteger() && Subtarget->hasSSE2() && - (Ext == ISD::EXTLOAD || Ext == ISD::SEXTLOAD)) { - assert(MemVT != RegVT && "Cannot extend to the same type"); - assert(MemVT.isVector() && "Must load a vector from memory"); - - unsigned NumElems = RegVT.getVectorNumElements(); - unsigned MemSz = MemVT.getSizeInBits(); - assert(RegSz > MemSz && "Register size must be greater than the mem size"); - - if (Ext == ISD::SEXTLOAD && RegSz == 256 && !Subtarget->hasInt256()) - return SDValue(); - - // All sizes must be a power of two. - if (!isPowerOf2_32(RegSz * MemSz * NumElems)) - return SDValue(); - - // Attempt to load the original value using scalar loads. - // Find the largest scalar type that divides the total loaded size. - MVT SclrLoadTy = MVT::i8; - for (unsigned tp = MVT::FIRST_INTEGER_VALUETYPE; - tp < MVT::LAST_INTEGER_VALUETYPE; ++tp) { - MVT Tp = (MVT::SimpleValueType)tp; - if (TLI.isTypeLegal(Tp) && ((MemSz % Tp.getSizeInBits()) == 0)) { - SclrLoadTy = Tp; - } - } - - // On 32bit systems, we can't save 64bit integers. Try bitcasting to F64. - if (TLI.isTypeLegal(MVT::f64) && SclrLoadTy.getSizeInBits() < 64 && - (64 <= MemSz)) - SclrLoadTy = MVT::f64; - - // Calculate the number of scalar loads that we need to perform - // in order to load our vector from memory. - unsigned NumLoads = MemSz / SclrLoadTy.getSizeInBits(); - if (Ext == ISD::SEXTLOAD && NumLoads > 1) - return SDValue(); - - unsigned loadRegZize = RegSz; - if (Ext == ISD::SEXTLOAD && RegSz == 256) - loadRegZize /= 2; - - // Represent our vector as a sequence of elements which are the - // largest scalar that we can load. - EVT LoadUnitVecVT = EVT::getVectorVT(*DAG.getContext(), SclrLoadTy, - loadRegZize/SclrLoadTy.getSizeInBits()); - - // Represent the data using the same element type that is stored in - // memory. In practice, we ''widen'' MemVT. - EVT WideVecVT = - EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(), - loadRegZize/MemVT.getScalarType().getSizeInBits()); - - assert(WideVecVT.getSizeInBits() == LoadUnitVecVT.getSizeInBits() && - "Invalid vector type"); - - // We can't shuffle using an illegal type. - if (!TLI.isTypeLegal(WideVecVT)) - return SDValue(); - - SmallVector<SDValue, 8> Chains; - SDValue Ptr = Ld->getBasePtr(); - SDValue Increment = DAG.getConstant(SclrLoadTy.getSizeInBits()/8, - TLI.getPointerTy()); - SDValue Res = DAG.getUNDEF(LoadUnitVecVT); - - for (unsigned i = 0; i < NumLoads; ++i) { - // Perform a single load. - SDValue ScalarLoad = DAG.getLoad(SclrLoadTy, dl, Ld->getChain(), - Ptr, Ld->getPointerInfo(), - Ld->isVolatile(), Ld->isNonTemporal(), - Ld->isInvariant(), Ld->getAlignment()); - Chains.push_back(ScalarLoad.getValue(1)); - // Create the first element type using SCALAR_TO_VECTOR in order to avoid - // another round of DAGCombining. - if (i == 0) - Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LoadUnitVecVT, ScalarLoad); - else - Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, LoadUnitVecVT, Res, - ScalarLoad, DAG.getIntPtrConstant(i)); - - Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment); - } - - SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains); - - // Bitcast the loaded value to a vector of the original element type, in - // the size of the target vector type. 
- SDValue SlicedVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Res); - unsigned SizeRatio = RegSz/MemSz; - - if (Ext == ISD::SEXTLOAD) { - // If we have SSE4.1 we can directly emit a VSEXT node. - if (Subtarget->hasSSE41()) { - SDValue Sext = DAG.getNode(X86ISD::VSEXT, dl, RegVT, SlicedVec); - return DCI.CombineTo(N, Sext, TF, true); - } - - // Otherwise we'll shuffle the small elements in the high bits of the - // larger type and perform an arithmetic shift. If the shift is not legal - // it's better to scalarize. - if (!TLI.isOperationLegalOrCustom(ISD::SRA, RegVT)) - return SDValue(); - - // Redistribute the loaded elements into the different locations. - SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1); - for (unsigned i = 0; i != NumElems; ++i) - ShuffleVec[i*SizeRatio + SizeRatio-1] = i; - - SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec, - DAG.getUNDEF(WideVecVT), - &ShuffleVec[0]); - - Shuff = DAG.getNode(ISD::BITCAST, dl, RegVT, Shuff); - - // Build the arithmetic shift. - unsigned Amt = RegVT.getVectorElementType().getSizeInBits() - - MemVT.getVectorElementType().getSizeInBits(); - Shuff = DAG.getNode(ISD::SRA, dl, RegVT, Shuff, - DAG.getConstant(Amt, RegVT)); - - return DCI.CombineTo(N, Shuff, TF, true); - } - - // Redistribute the loaded elements into the different locations. - SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1); - for (unsigned i = 0; i != NumElems; ++i) - ShuffleVec[i*SizeRatio] = i; - - SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec, - DAG.getUNDEF(WideVecVT), - &ShuffleVec[0]); - - // Bitcast to the requested type. - Shuff = DAG.getNode(ISD::BITCAST, dl, RegVT, Shuff); - // Replace the original load with the new sequence - // and return the new chain. - return DCI.CombineTo(N, Shuff, TF, true); - } - return SDValue(); } @@ -21535,13 +24789,29 @@ static SDValue PerformSIGN_EXTEND_INREGCombine(SDNode *N, SelectionDAG &DAG, static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget *Subtarget) { + SDValue N0 = N->getOperand(0); + EVT VT = N->getValueType(0); + + // (i8,i32 sext (sdivrem (i8 x, i8 y)) -> + // (i8,i32 (sdivrem_sext_hreg (i8 x, i8 y) + // This exposes the sext to the sdivrem lowering, so that it directly extends + // from AH (which we otherwise need to do contortions to access). + if (N0.getOpcode() == ISD::SDIVREM && N0.getResNo() == 1 && + N0.getValueType() == MVT::i8 && VT == MVT::i32) { + SDLoc dl(N); + SDVTList NodeTys = DAG.getVTList(MVT::i8, VT); + SDValue R = DAG.getNode(X86ISD::SDIVREM8_SEXT_HREG, dl, NodeTys, + N0.getOperand(0), N0.getOperand(1)); + DAG.ReplaceAllUsesOfValueWith(N0.getValue(0), R.getValue(0)); + return R.getValue(1); + } + if (!DCI.isBeforeLegalizeOps()) return SDValue(); if (!Subtarget->hasFp256()) return SDValue(); - EVT VT = N->getValueType(0); if (VT.isVector() && VT.getSizeInBits() == 256) { SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget); if (R.getNode()) @@ -21634,6 +24904,20 @@ static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG, return R; } + // (i8,i32 zext (udivrem (i8 x, i8 y)) -> + // (i8,i32 (udivrem_zext_hreg (i8 x, i8 y) + // This exposes the zext to the udivrem lowering, so that it directly extends + // from AH (which we otherwise need to do contortions to access). 
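Background for the combine below: 8-bit DIV/IDIV on x86 leaves the quotient in AL and the remainder in AH, so when the remainder is immediately zero- or sign-extended the extension can be read straight out of AH instead of first copying AH into a low register. A scalar model of what the zero-extending form computes (the function name udivrem8_zext_hreg here is only illustrative):

// Model of the (i8, i32) udivrem8 node whose remainder (the AH half on x86)
// comes back already zero-extended to 32 bits.
#include <cassert>
#include <cstdint>
#include <utility>

static std::pair<uint8_t, uint32_t> udivrem8_zext_hreg(uint8_t X, uint8_t Y) {
  uint8_t Quot = X / Y;               // would live in AL
  uint32_t RemZext = uint32_t(X % Y); // AH, zero-extended, no extra MOVZX
  return {Quot, RemZext};
}

int main() {
  std::pair<uint8_t, uint32_t> QR = udivrem8_zext_hreg(200, 23);
  assert(QR.first == 8 && QR.second == 16); // 200 == 8 * 23 + 16
  return 0;
}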
+ if (N0.getOpcode() == ISD::UDIVREM && + N0.getResNo() == 1 && N0.getValueType() == MVT::i8 && + (VT == MVT::i32 || VT == MVT::i64)) { + SDVTList NodeTys = DAG.getVTList(MVT::i8, VT); + SDValue R = DAG.getNode(X86ISD::UDIVREM8_ZEXT_HREG, dl, NodeTys, + N0.getOperand(0), N0.getOperand(1)); + DAG.ReplaceAllUsesOfValueWith(N0.getValue(0), R.getValue(0)); + return R.getValue(1); + } + return SDValue(); } @@ -21803,8 +25087,61 @@ static SDValue PerformBrCondCombine(SDNode *N, SelectionDAG &DAG, return SDValue(); } +static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N, + SelectionDAG &DAG) { + // Take advantage of vector comparisons producing 0 or -1 in each lane to + // optimize away operation when it's from a constant. + // + // The general transformation is: + // UNARYOP(AND(VECTOR_CMP(x,y), constant)) --> + // AND(VECTOR_CMP(x,y), constant2) + // constant2 = UNARYOP(constant) + + // Early exit if this isn't a vector operation, the operand of the + // unary operation isn't a bitwise AND, or if the sizes of the operations + // aren't the same. + EVT VT = N->getValueType(0); + if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND || + N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC || + VT.getSizeInBits() != N->getOperand(0)->getValueType(0).getSizeInBits()) + return SDValue(); + + // Now check that the other operand of the AND is a constant. We could + // make the transformation for non-constant splats as well, but it's unclear + // that would be a benefit as it would not eliminate any operations, just + // perform one more step in scalar code before moving to the vector unit. + if (BuildVectorSDNode *BV = + dyn_cast<BuildVectorSDNode>(N->getOperand(0)->getOperand(1))) { + // Bail out if the vector isn't a constant. + if (!BV->isConstant()) + return SDValue(); + + // Everything checks out. Build up the new and improved node. + SDLoc DL(N); + EVT IntVT = BV->getValueType(0); + // Create a new constant of the appropriate type for the transformed + // DAG. + SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0)); + // The AND node needs bitcasts to/from an integer vector type around it. + SDValue MaskConst = DAG.getNode(ISD::BITCAST, DL, IntVT, SourceConst); + SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT, + N->getOperand(0)->getOperand(0), MaskConst); + SDValue Res = DAG.getNode(ISD::BITCAST, DL, VT, NewAnd); + return Res; + } + + return SDValue(); +} + static SDValue PerformSINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG, const X86TargetLowering *XTLI) { + // First try to optimize away the conversion entirely when it's + // conditionally from a constant. Vectors only. + SDValue Res = performVectorCompareAndMaskUnaryOpCombine(N, DAG); + if (Res != SDValue()) + return Res; + + // Now move on to more general possibilities. 
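The lane-wise reasoning behind performVectorCompareAndMaskUnaryOpCombine: each compare lane is all-zero or all-one bits, so the AND yields either 0 or the constant, and for sint_to_fp the zero case still matches because +0.0 has an all-zero bit pattern. A scalar check of that identity for a single lane (illustrative only):

// Single-lane check: sint_to_fp(mask & c) has the same bits as
// bits(mask) & bits(sint_to_fp(c)) when mask is all-zeros or all-ones,
// because +0.0f is the all-zero bit pattern.
#include <cassert>
#include <cstdint>
#include <cstring>

static uint32_t BitsOf(float F) {
  uint32_t B;
  std::memcpy(&B, &F, sizeof(B));
  return B;
}

static float FloatOf(uint32_t B) {
  float F;
  std::memcpy(&F, &B, sizeof(F));
  return F;
}

int main() {
  const int32_t C = 42; // the constant lane value
  const uint32_t Masks[] = {0x00000000u, 0xFFFFFFFFu};
  for (uint32_t Mask : Masks) {
    float LHS = float(int32_t(Mask & uint32_t(C))); // UNARYOP(AND(cmp, c))
    float RHS = FloatOf(Mask & BitsOf(float(C)));   // AND(cmp, UNARYOP(c))
    assert(BitsOf(LHS) == BitsOf(RHS));
  }
  return 0;
}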
SDValue Op0 = N->getOperand(0); EVT InVT = Op0->getValueType(0); @@ -21950,18 +25287,68 @@ static SDValue PerformSubCombine(SDNode *N, SelectionDAG &DAG, /// performVZEXTCombine - Performs build vector combines static SDValue performVZEXTCombine(SDNode *N, SelectionDAG &DAG, - TargetLowering::DAGCombinerInfo &DCI, - const X86Subtarget *Subtarget) { + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget *Subtarget) { + SDLoc DL(N); + MVT VT = N->getSimpleValueType(0); + SDValue Op = N->getOperand(0); + MVT OpVT = Op.getSimpleValueType(); + MVT OpEltVT = OpVT.getVectorElementType(); + unsigned InputBits = OpEltVT.getSizeInBits() * VT.getVectorNumElements(); + // (vzext (bitcast (vzext (x)) -> (vzext x) - SDValue In = N->getOperand(0); - while (In.getOpcode() == ISD::BITCAST) - In = In.getOperand(0); + SDValue V = Op; + while (V.getOpcode() == ISD::BITCAST) + V = V.getOperand(0); - if (In.getOpcode() != X86ISD::VZEXT) - return SDValue(); + if (V != Op && V.getOpcode() == X86ISD::VZEXT) { + MVT InnerVT = V.getSimpleValueType(); + MVT InnerEltVT = InnerVT.getVectorElementType(); + + // If the element sizes match exactly, we can just do one larger vzext. This + // is always an exact type match as vzext operates on integer types. + if (OpEltVT == InnerEltVT) { + assert(OpVT == InnerVT && "Types must match for vzext!"); + return DAG.getNode(X86ISD::VZEXT, DL, VT, V.getOperand(0)); + } + + // The only other way we can combine them is if only a single element of the + // inner vzext is used in the input to the outer vzext. + if (InnerEltVT.getSizeInBits() < InputBits) + return SDValue(); + + // In this case, the inner vzext is completely dead because we're going to + // only look at bits inside of the low element. Just do the outer vzext on + // a bitcast of the input to the inner. + return DAG.getNode(X86ISD::VZEXT, DL, VT, + DAG.getNode(ISD::BITCAST, DL, OpVT, V)); + } + + // Check if we can bypass extracting and re-inserting an element of an input + // vector. Essentialy: + // (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast x) + if (V.getOpcode() == ISD::SCALAR_TO_VECTOR && + V.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT && + V.getOperand(0).getSimpleValueType().getSizeInBits() == InputBits) { + SDValue ExtractedV = V.getOperand(0); + SDValue OrigV = ExtractedV.getOperand(0); + if (auto *ExtractIdx = dyn_cast<ConstantSDNode>(ExtractedV.getOperand(1))) + if (ExtractIdx->getZExtValue() == 0) { + MVT OrigVT = OrigV.getSimpleValueType(); + // Extract a subvector if necessary... 
+ if (OrigVT.getSizeInBits() > OpVT.getSizeInBits()) { + int Ratio = OrigVT.getSizeInBits() / OpVT.getSizeInBits(); + OrigVT = MVT::getVectorVT(OrigVT.getVectorElementType(), + OrigVT.getVectorNumElements() / Ratio); + OrigV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigVT, OrigV, + DAG.getIntPtrConstant(0)); + } + Op = DAG.getNode(ISD::BITCAST, DL, OpVT, OrigV); + return DAG.getNode(X86ISD::VZEXT, DL, VT, Op); + } + } - return DAG.getNode(X86ISD::VZEXT, SDLoc(N), N->getValueType(0), - In.getOperand(0)); + return SDValue(); } SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, @@ -21972,7 +25359,9 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case ISD::EXTRACT_VECTOR_ELT: return PerformEXTRACT_VECTOR_ELTCombine(N, DAG, DCI); case ISD::VSELECT: - case ISD::SELECT: return PerformSELECTCombine(N, DAG, DCI, Subtarget); + case ISD::SELECT: + case X86ISD::SHRUNKBLEND: + return PerformSELECTCombine(N, DAG, DCI, Subtarget); case X86ISD::CMOV: return PerformCMOVCombine(N, DAG, DCI, Subtarget); case ISD::ADD: return PerformAddCombine(N, DAG, Subtarget); case ISD::SUB: return PerformSubCombine(N, DAG, Subtarget); @@ -22013,12 +25402,13 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case X86ISD::UNPCKL: case X86ISD::MOVHLPS: case X86ISD::MOVLHPS: + case X86ISD::PSHUFB: case X86ISD::PSHUFD: case X86ISD::PSHUFHW: case X86ISD::PSHUFLW: case X86ISD::MOVSS: case X86ISD::MOVSD: - case X86ISD::VPERMILP: + case X86ISD::VPERMILPI: case X86ISD::VPERM2X128: case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, DCI,Subtarget); case ISD::FMA: return PerformFMACombine(N, DAG, Subtarget); @@ -22668,14 +26058,14 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, Constraint[5] == ')' && Constraint[6] == '}') { - Res.first = X86::ST0+Constraint[4]-'0'; + Res.first = X86::FP0+Constraint[4]-'0'; Res.second = &X86::RFP80RegClass; return Res; } // GCC allows "st(0)" to be called just plain "st". if (StringRef("{st}").equals_lower(Constraint)) { - Res.first = X86::ST0; + Res.first = X86::FP0; Res.second = &X86::RFP80RegClass; return Res; } diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h index c8cdce7..7c6ffa2 100644 --- a/lib/Target/X86/X86ISelLowering.h +++ b/lib/Target/X86/X86ISelLowering.h @@ -12,8 +12,8 @@ // //===----------------------------------------------------------------------===// -#ifndef X86ISELLOWERING_H -#define X86ISELLOWERING_H +#ifndef LLVM_LIB_TARGET_X86_X86ISELLOWERING_H +#define LLVM_LIB_TARGET_X86_X86ISELLOWERING_H #include "llvm/CodeGen/CallingConvLower.h" #include "llvm/CodeGen/SelectionDAG.h" @@ -187,12 +187,17 @@ namespace llvm { /// PSIGN - Copy integer sign. PSIGN, - /// BLENDV - Blend where the selector is a register. - BLENDV, - /// BLENDI - Blend where the selector is an immediate. BLENDI, + /// SHRUNKBLEND - Blend where the condition has been shrunk. + /// This is used to emphasize that the condition mask is + /// no more valid for generic VSELECT optimizations. + SHRUNKBLEND, + + /// ADDSUB - Combined add and sub on an FP vector. + ADDSUB, + // SUBUS - Integer sub with unsigned saturation. SUBUS, @@ -301,6 +306,13 @@ namespace llvm { UMUL, // LOW, HI, FLAGS = umul LHS, RHS + // 8-bit SMUL/UMUL - AX, FLAGS = smul8/umul8 AL, RHS + SMUL8, UMUL8, + + // 8-bit divrem that zero-extend the high result (AH). + UDIVREM8_ZEXT_HREG, + SDIVREM8_SEXT_HREG, + // MUL_IMM - X86 specific multiply by immediate. 
MUL_IMM, @@ -320,7 +332,10 @@ namespace llvm { // Several flavors of instructions with vector shuffle behaviors. PACKSS, PACKUS, + // Intra-lane alignr PALIGNR, + // AVX512 inter-lane alignr + VALIGN, PSHUFD, PSHUFHW, PSHUFLW, @@ -337,7 +352,8 @@ namespace llvm { MOVSS, UNPCKL, UNPCKH, - VPERMILP, + VPERMILPV, + VPERMILPI, VPERMV, VPERMV3, VPERMIV3, @@ -350,9 +366,9 @@ namespace llvm { VINSERT, VEXTRACT, - // PMULUDQ - Vector multiply packed unsigned doubleword integers + // Vector multiply packed unsigned doubleword integers PMULUDQ, - // PMULUDQ - Vector multiply packed signed doubleword integers + // Vector multiply packed signed doubleword integers PMULDQ, // FMA nodes @@ -363,20 +379,19 @@ namespace llvm { FMADDSUB, FMSUBADD, - // VASTART_SAVE_XMM_REGS - Save xmm argument registers to the stack, - // according to %al. An operator is needed so that this can be expanded - // with control flow. + // Save xmm argument registers to the stack, according to %al. An operator + // is needed so that this can be expanded with control flow. VASTART_SAVE_XMM_REGS, - // WIN_ALLOCA - Windows's _chkstk call to do stack probing. + // Windows's _chkstk call to do stack probing. WIN_ALLOCA, - // SEG_ALLOCA - For allocating variable amounts of stack space when using + // For allocating variable amounts of stack space when using // segmented stacks. Check if the current stacklet has enough space, and // falls back to heap allocation if not. SEG_ALLOCA, - // WIN_FTOL - Windows's _ftol2 runtime routine to do fptoui. + // Windows's _ftol2 runtime routine to do fptoui. WIN_FTOL, // Memory barrier @@ -385,38 +400,40 @@ namespace llvm { SFENCE, LFENCE, - // FNSTSW16r - Store FP status word into i16 register. + // Store FP status word into i16 register. FNSTSW16r, - // SAHF - Store contents of %ah into %eflags. + // Store contents of %ah into %eflags. SAHF, - // RDRAND - Get a random integer and indicate whether it is valid in CF. + // Get a random integer and indicate whether it is valid in CF. RDRAND, - // RDSEED - Get a NIST SP800-90B & C compliant random integer and + // Get a NIST SP800-90B & C compliant random integer and // indicate whether it is valid in CF. RDSEED, - // PCMP*STRI PCMPISTRI, PCMPESTRI, - // XTEST - Test if in transactional execution. + // Test if in transactional execution. XTEST, - // LCMPXCHG_DAG, LCMPXCHG8_DAG, LCMPXCHG16_DAG - Compare and swap. + // ERI instructions + RSQRT28, RCP28, EXP2, + + // Compare and swap. LCMPXCHG_DAG = ISD::FIRST_TARGET_MEMORY_OPCODE, LCMPXCHG8_DAG, LCMPXCHG16_DAG, - // VZEXT_LOAD - Load, scalar_to_vector, and zero extend. + // Load, scalar_to_vector, and zero extend. VZEXT_LOAD, - // FNSTCW16m - Store FP control world into i16 memory. + // Store FP control world into i16 memory. FNSTCW16m, - /// FP_TO_INT*_IN_MEM - This instruction implements FP_TO_SINT with the + /// This instruction implements FP_TO_SINT with the /// integer destination in memory and a FP reg source. This corresponds /// to the X86::FIST*m instructions and the rounding mode change stuff. It /// has two inputs (token chain and address) and two outputs (int value @@ -425,7 +442,7 @@ namespace llvm { FP_TO_INT32_IN_MEM, FP_TO_INT64_IN_MEM, - /// FILD, FILD_FLAG - This instruction implements SINT_TO_FP with the + /// This instruction implements SINT_TO_FP with the /// integer source in memory and FP reg result. This corresponds to the /// X86::FILD*m instructions. It has three inputs (token chain, address, /// and source type) and two outputs (FP value and token chain). 
FILD_FLAG @@ -433,19 +450,19 @@ namespace llvm { FILD, FILD_FLAG, - /// FLD - This instruction implements an extending load to FP stack slots. + /// This instruction implements an extending load to FP stack slots. /// This corresponds to the X86::FLD32m / X86::FLD64m. It takes a chain /// operand, ptr to load from, and a ValueType node indicating the type /// to load to. FLD, - /// FST - This instruction implements a truncating store to FP stack + /// This instruction implements a truncating store to FP stack /// slots. This corresponds to the X86::FST32m / X86::FST64m. It takes a /// chain operand, value to store, address, and a ValueType to store it /// as. FST, - /// VAARG_64 - This instruction grabs the address of the next argument + /// This instruction grabs the address of the next argument /// from a va_list. (reads and modifies the va_list in memory) VAARG_64 @@ -457,67 +474,76 @@ namespace llvm { /// Define some predicates that are used for node matching. namespace X86 { - /// isVEXTRACT128Index - Return true if the specified + /// Return true if the specified /// EXTRACT_SUBVECTOR operand specifies a vector extract that is /// suitable for input to VEXTRACTF128, VEXTRACTI128 instructions. bool isVEXTRACT128Index(SDNode *N); - /// isVINSERT128Index - Return true if the specified + /// Return true if the specified /// INSERT_SUBVECTOR operand specifies a subvector insert that is /// suitable for input to VINSERTF128, VINSERTI128 instructions. bool isVINSERT128Index(SDNode *N); - /// isVEXTRACT256Index - Return true if the specified + /// Return true if the specified /// EXTRACT_SUBVECTOR operand specifies a vector extract that is /// suitable for input to VEXTRACTF64X4, VEXTRACTI64X4 instructions. bool isVEXTRACT256Index(SDNode *N); - /// isVINSERT256Index - Return true if the specified + /// Return true if the specified /// INSERT_SUBVECTOR operand specifies a subvector insert that is /// suitable for input to VINSERTF64X4, VINSERTI64X4 instructions. bool isVINSERT256Index(SDNode *N); - /// getExtractVEXTRACT128Immediate - Return the appropriate + /// Return the appropriate /// immediate to extract the specified EXTRACT_SUBVECTOR index /// with VEXTRACTF128, VEXTRACTI128 instructions. unsigned getExtractVEXTRACT128Immediate(SDNode *N); - /// getInsertVINSERT128Immediate - Return the appropriate + /// Return the appropriate /// immediate to insert at the specified INSERT_SUBVECTOR index /// with VINSERTF128, VINSERT128 instructions. unsigned getInsertVINSERT128Immediate(SDNode *N); - /// getExtractVEXTRACT256Immediate - Return the appropriate + /// Return the appropriate /// immediate to extract the specified EXTRACT_SUBVECTOR index /// with VEXTRACTF64X4, VEXTRACTI64x4 instructions. unsigned getExtractVEXTRACT256Immediate(SDNode *N); - /// getInsertVINSERT256Immediate - Return the appropriate + /// Return the appropriate /// immediate to insert at the specified INSERT_SUBVECTOR index /// with VINSERTF64x4, VINSERTI64x4 instructions. unsigned getInsertVINSERT256Immediate(SDNode *N); - /// isZeroNode - Returns true if Elt is a constant zero or a floating point - /// constant +0.0. + /// Returns true if Elt is a constant zero or floating point constant +0.0. bool isZeroNode(SDValue Elt); - /// isOffsetSuitableForCodeModel - Returns true of the given offset can be + /// Returns true of the given offset can be /// fit into displacement field of the instruction. 
bool isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M, bool hasSymbolicDisplacement = true); - /// isCalleePop - Determines whether the callee is required to pop its + /// Determines whether the callee is required to pop its /// own arguments. Callee pop is necessary to support tail calls. bool isCalleePop(CallingConv::ID CallingConv, bool is64Bit, bool IsVarArg, bool TailCallOpt); + + /// AVX512 static rounding constants. These need to match the values in + /// avx512fintrin.h. + enum STATIC_ROUNDING { + TO_NEAREST_INT = 0, + TO_NEG_INF = 1, + TO_POS_INF = 2, + TO_ZERO = 3, + CUR_DIRECTION = 4 + }; } //===--------------------------------------------------------------------===// - // X86TargetLowering - X86 Implementation of the TargetLowering interface + // X86 Implementation of the TargetLowering interface class X86TargetLowering final : public TargetLowering { public: - explicit X86TargetLowering(X86TargetMachine &TM); + explicit X86TargetLowering(const X86TargetMachine &TM); unsigned getJumpTableEncoding() const override; @@ -528,21 +554,20 @@ namespace llvm { const MachineBasicBlock *MBB, unsigned uid, MCContext &Ctx) const override; - /// getPICJumpTableRelocaBase - Returns relocation base for the given PIC - /// jumptable. + /// Returns relocation base for the given PIC jumptable. SDValue getPICJumpTableRelocBase(SDValue Table, SelectionDAG &DAG) const override; const MCExpr * getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI, MCContext &Ctx) const override; - /// getByValTypeAlignment - Return the desired alignment for ByVal aggregate + /// Return the desired alignment for ByVal aggregate /// function arguments in the caller parameter area. For X86, aggregates /// that contains are placed at 16-byte boundaries while the rest are at /// 4-byte boundaries. unsigned getByValTypeAlignment(Type *Ty) const override; - /// getOptimalMemOpType - Returns the target specific optimal type for load + /// Returns the target specific optimal type for load /// and store operations as a result of memset, memcpy, and memmove /// lowering. If DstAlign is zero that means it's safe to destination /// alignment can satisfy any constraint. Similarly if SrcAlign is zero it @@ -557,7 +582,7 @@ namespace llvm { bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc, MachineFunction &MF) const override; - /// isSafeMemOpType - Returns true if it's safe to use load / store of the + /// Returns true if it's safe to use load / store of the /// specified type to expand memcpy / memset inline. This is mostly true /// for all types except for some special cases. For example, on X86 /// targets without SSE2 f64 load / store are done with fldl / fstpl which @@ -565,17 +590,17 @@ namespace llvm { /// legal as the hook is used before type legalization. bool isSafeMemOpType(MVT VT) const override; - /// allowsUnalignedMemoryAccesses - Returns true if the target allows + /// Returns true if the target allows /// unaligned memory accesses. of the specified type. Returns whether it /// is "fast" by reference in the second argument. - bool allowsUnalignedMemoryAccesses(EVT VT, unsigned AS, + bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AS, unsigned Align, bool *Fast) const override; - /// LowerOperation - Provide custom lowering hooks for some operations. + /// Provide custom lowering hooks for some operations. 
/// SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override; - /// ReplaceNodeResults - Replace the results of node with an illegal result + /// Replace the results of node with an illegal result /// type with new values built out of custom code. /// void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue>&Results, @@ -584,13 +609,13 @@ namespace llvm { SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override; - /// isTypeDesirableForOp - Return true if the target has native support for + /// Return true if the target has native support for /// the specified value type and it is 'desirable' to use the type for the /// given node type. e.g. On x86 i16 is legal, but undesirable since i16 /// instruction encodings are longer and some i16 instructions are slow. bool isTypeDesirableForOp(unsigned Opc, EVT VT) const override; - /// isTypeDesirable - Return true if the target has native support for the + /// Return true if the target has native support for the /// specified value type and it is 'desirable' to use the type. e.g. On x86 /// i16 is legal, but undesirable since i16 instruction encodings are longer /// and some i16 instructions are slow. @@ -601,24 +626,21 @@ namespace llvm { MachineBasicBlock *MBB) const override; - /// getTargetNodeName - This method returns the name of a target specific - /// DAG node. + /// This method returns the name of a target specific DAG node. const char *getTargetNodeName(unsigned Opcode) const override; - /// getSetCCResultType - Return the value type to use for ISD::SETCC. + /// Return the value type to use for ISD::SETCC. EVT getSetCCResultType(LLVMContext &Context, EVT VT) const override; - /// computeKnownBitsForTargetNode - Determine which of the bits specified - /// in Mask are known to be either zero or one and return them in the - /// KnownZero/KnownOne bitsets. + /// Determine which of the bits specified in Mask are known to be either + /// zero or one and return them in the KnownZero/KnownOne bitsets. void computeKnownBitsForTargetNode(const SDValue Op, APInt &KnownZero, APInt &KnownOne, const SelectionDAG &DAG, unsigned Depth = 0) const override; - // ComputeNumSignBitsForTargetNode - Determine the number of bits in the - // operation that are sign bits. + /// Determine the number of bits in the operation that are sign bits. unsigned ComputeNumSignBitsForTargetNode(SDValue Op, const SelectionDAG &DAG, unsigned Depth) const override; @@ -641,16 +663,15 @@ namespace llvm { const char *LowerXConstraint(EVT ConstraintVT) const override; - /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops - /// vector. If it is invalid, don't add anything to Ops. If hasMemory is - /// true it means one of the asm constraint of the inline asm instruction - /// being processed is 'm'. + /// Lower the specified operand into the Ops vector. If it is invalid, don't + /// add anything to Ops. If hasMemory is true it means one of the asm + /// constraint of the inline asm instruction being processed is 'm'. void LowerAsmOperandForConstraint(SDValue Op, std::string &Constraint, std::vector<SDValue> &Ops, SelectionDAG &DAG) const override; - /// getRegForInlineAsmConstraint - Given a physical register constraint + /// Given a physical register constraint /// (e.g. {edx}), return the register number and the register class for the /// register. This should only be used for C_Register constraints. On /// error, this returns a register number of 0. 
@@ -658,17 +679,17 @@ namespace llvm { getRegForInlineAsmConstraint(const std::string &Constraint, MVT VT) const override; - /// isLegalAddressingMode - Return true if the addressing mode represented + /// Return true if the addressing mode represented /// by AM is legal for this target, for a load/store of the specified type. bool isLegalAddressingMode(const AddrMode &AM, Type *Ty) const override; - /// isLegalICmpImmediate - Return true if the specified immediate is legal + /// Return true if the specified immediate is legal /// icmp immediate, that is the target has icmp instructions which can /// compare a register against the immediate without having to materialize /// the immediate into a register. bool isLegalICmpImmediate(int64_t Imm) const override; - /// isLegalAddImmediate - Return true if the specified immediate is legal + /// Return true if the specified immediate is legal /// add immediate, that is the target has add instructions which can /// add a register and the immediate without having to materialize /// the immediate into a register. @@ -683,7 +704,7 @@ namespace llvm { bool isVectorShiftByScalarCheap(Type *Ty) const override; - /// isTruncateFree - Return true if it's free to truncate a value of + /// Return true if it's free to truncate a value of /// type Ty1 to type Ty2. e.g. On x86 it's free to truncate a i32 value in /// register EAX to i16 by referencing its sub-register AX. bool isTruncateFree(Type *Ty1, Type *Ty2) const override; @@ -691,7 +712,7 @@ namespace llvm { bool allowTruncateForTailCall(Type *Ty1, Type *Ty2) const override; - /// isZExtFree - Return true if any actual instruction that defines a + /// Return true if any actual instruction that defines a /// value of type Ty1 implicit zero-extends the value to Ty2 in the result /// register. This does not necessarily include registers defined in /// unknown ways, such as incoming arguments, or copies from unknown @@ -703,37 +724,35 @@ namespace llvm { bool isZExtFree(EVT VT1, EVT VT2) const override; bool isZExtFree(SDValue Val, EVT VT2) const override; - /// isFMAFasterThanFMulAndFAdd - Return true if an FMA operation is faster - /// than a pair of fmul and fadd instructions. fmuladd intrinsics will be - /// expanded to FMAs when this method returns true, otherwise fmuladd is - /// expanded to fmul + fadd. + /// Return true if an FMA operation is faster than a pair of fmul and fadd + /// instructions. fmuladd intrinsics will be expanded to FMAs when this + /// method returns true, otherwise fmuladd is expanded to fmul + fadd. bool isFMAFasterThanFMulAndFAdd(EVT VT) const override; - /// isNarrowingProfitable - Return true if it's profitable to narrow + /// Return true if it's profitable to narrow /// operations of type VT1 to VT2. e.g. on x86, it's profitable to narrow /// from i32 to i8 but not from i32 to i16. bool isNarrowingProfitable(EVT VT1, EVT VT2) const override; - /// isFPImmLegal - Returns true if the target can instruction select the + /// Returns true if the target can instruction select the /// specified FP immediate natively. If false, the legalizer will /// materialize the FP immediate as a load from a constant pool. bool isFPImmLegal(const APFloat &Imm, EVT VT) const override; - /// isShuffleMaskLegal - Targets can use this to indicate that they only - /// support *some* VECTOR_SHUFFLE operations, those with specific masks. - /// By default, if a target supports the VECTOR_SHUFFLE node, all mask - /// values are assumed to be legal. 
+ /// Targets can use this to indicate that they only support *some* + /// VECTOR_SHUFFLE operations, those with specific masks. By default, if a + /// target supports the VECTOR_SHUFFLE node, all mask values are assumed to + /// be legal. bool isShuffleMaskLegal(const SmallVectorImpl<int> &Mask, EVT VT) const override; - /// isVectorClearMaskLegal - Similar to isShuffleMaskLegal. This is - /// used by Targets can use this to indicate if there is a suitable - /// VECTOR_SHUFFLE that can be used to replace a VAND with a constant - /// pool entry. + /// Similar to isShuffleMaskLegal. This is used by Targets can use this to + /// indicate if there is a suitable VECTOR_SHUFFLE that can be used to + /// replace a VAND with a constant pool entry. bool isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask, EVT VT) const override; - /// ShouldShrinkFPConstant - If true, then instruction selection should + /// If true, then instruction selection should /// seek to shrink the FP constant of the specified type to a smaller type /// in order to save space and / or reduce runtime. bool ShouldShrinkFPConstant(EVT VT) const override { @@ -747,19 +766,18 @@ namespace llvm { return Subtarget; } - /// isScalarFPTypeInSSEReg - Return true if the specified scalar FP type is - /// computed in an SSE register, not on the X87 floating point stack. + /// Return true if the specified scalar FP type is computed in an SSE + /// register, not on the X87 floating point stack. bool isScalarFPTypeInSSEReg(EVT VT) const { return (VT == MVT::f64 && X86ScalarSSEf64) || // f64 is when SSE2 (VT == MVT::f32 && X86ScalarSSEf32); // f32 is when SSE1 } - /// isTargetFTOL - Return true if the target uses the MSVC _ftol2 routine - /// for fptoui. + /// Return true if the target uses the MSVC _ftol2 routine for fptoui. bool isTargetFTOL() const; - /// isIntegerTypeFTOL - Return true if the MSVC _ftol2 routine should be - /// used for fptoui to the given type. + /// Return true if the MSVC _ftol2 routine should be used for fptoui to the + /// given type. bool isIntegerTypeFTOL(EVT VT) const { return isTargetFTOL() && VT == MVT::i64; } @@ -776,15 +794,14 @@ namespace llvm { unsigned getRegisterByName(const char* RegName, EVT VT) const override; - /// createFastISel - This method returns a target specific FastISel object, + /// This method returns a target specific FastISel object, /// or null if the target does not support "fast" ISel. FastISel *createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo) const override; - /// getStackCookieLocation - Return true if the target stores stack - /// protector cookies at a fixed offset in some non-standard address - /// space, and populates the address space and offset as - /// appropriate. + /// Return true if the target stores stack protector cookies at a fixed + /// offset in some non-standard address space, and populates the address + /// space and offset as appropriate. bool getStackCookieLocation(unsigned &AddressSpace, unsigned &Offset) const override; @@ -796,6 +813,7 @@ namespace llvm { /// \brief Reset the operation actions based on target options. void resetOperationActions() override; + bool useLoadStackGuardNode() const override; /// \brief Customize the preferred legalization strategy for certain types. 
LegalizeTypeAction getPreferredVectorAction(EVT VT) const override; @@ -804,7 +822,7 @@ namespace llvm { findRepresentativeClass(MVT VT) const override; private: - /// Subtarget - Keep a pointer to the X86Subtarget around so that we can + /// Keep a pointer to the X86Subtarget around so that we can /// make the right decision when generating code for different targets. const X86Subtarget *Subtarget; const DataLayout *TD; @@ -813,17 +831,16 @@ namespace llvm { /// the operation actions unless we have to. TargetOptions TO; - /// X86ScalarSSEf32, X86ScalarSSEf64 - Select between SSE or x87 - /// floating point ops. + /// Select between SSE or x87 floating point ops. /// When SSE is available, use it for f32 operations. /// When SSE2 is available, use it for f64 operations. bool X86ScalarSSEf32; bool X86ScalarSSEf64; - /// LegalFPImmediates - A list of legal fp immediates. + /// A list of legal FP immediates. std::vector<APFloat> LegalFPImmediates; - /// addLegalFPImmediate - Indicate that this x86 target can instruction + /// Indicate that this x86 target can instruction /// select the specified FP immediate natively. void addLegalFPImmediate(const APFloat& Imm) { LegalFPImmediates.push_back(Imm); @@ -847,9 +864,8 @@ namespace llvm { // Call lowering helpers. - /// IsEligibleForTailCallOptimization - Check whether the call is eligible - /// for tail call optimization. Targets which want to do tail call - /// optimization should implement this function. + /// Check whether the call is eligible for tail call optimization. Targets + /// that want to do tail call optimization should implement this function. bool IsEligibleForTailCallOptimization(SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, @@ -936,7 +952,7 @@ namespace llvm { bool mayBeEmittedAsTailCall(CallInst *CI) const override; - MVT getTypeForExtArgOrReturn(MVT VT, + EVT getTypeForExtArgOrReturn(LLVMContext &Context, EVT VT, ISD::NodeType ExtendKind) const override; bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, @@ -946,6 +962,15 @@ namespace llvm { const MCPhysReg *getScratchRegisters(CallingConv::ID CC) const override; + bool shouldExpandAtomicLoadInIR(LoadInst *SI) const override; + bool shouldExpandAtomicStoreInIR(StoreInst *SI) const override; + bool shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override; + + LoadInst * + lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override; + + bool needsCmpXchgNb(const Type *MemType) const; + /// Utility function to emit atomic-load-arith operations (and, or, xor, /// nand, max, min, umax, umin). It takes the corresponding instruction to /// expand, the associated machine basic block, and the associated X86 @@ -975,8 +1000,7 @@ namespace llvm { MachineBasicBlock *BB) const; MachineBasicBlock *EmitLoweredSegAlloca(MachineInstr *MI, - MachineBasicBlock *BB, - bool Is64Bit) const; + MachineBasicBlock *BB) const; MachineBasicBlock *EmitLoweredTLSCall(MachineInstr *MI, MachineBasicBlock *BB) const; @@ -1005,6 +1029,15 @@ namespace llvm { /// Convert a comparison if required by the subtarget. SDValue ConvertCmpIfNecessary(SDValue Cmp, SelectionDAG &DAG) const; + + /// Use rsqrt* to speed up sqrt calculations. + SDValue getRsqrtEstimate(SDValue Operand, DAGCombinerInfo &DCI, + unsigned &RefinementSteps, + bool &UseOneConstNR) const override; + + /// Use rcp* to speed up fdiv calculations. 
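getRsqrtEstimate and getRecipEstimate let sqrt and fdiv be replaced by a hardware estimate (RSQRT*/RCP*) that the combiner then tightens with RefinementSteps rounds of Newton-Raphson iteration. A scalar sketch of one refinement step for each, with an arbitrary starting estimate standing in for the hardware instruction (helper names are illustrative):

// One Newton-Raphson refinement step for a reciprocal-square-root estimate
// and for a reciprocal estimate; RefinementSteps controls how many such
// steps are applied to the initial hardware estimate.
#include <cmath>
#include <cstdio>

static float RefineRsqrt(float A, float Est) {   // improves Est ~= 1/sqrt(A)
  return Est * (1.5f - 0.5f * A * Est * Est);
}

static float RefineRcp(float A, float Est) {     // improves Est ~= 1/A
  return Est * (2.0f - A * Est);
}

int main() {
  float A = 10.0f;
  float R = 0.3f;             // stand-in for the hardware RSQRTPS estimate
  for (int I = 0; I < 2; ++I) // two refinement steps
    R = RefineRsqrt(A, R);
  std::printf("1/sqrt(10) ~= %f (libm: %f)\n", R, 1.0f / std::sqrt(A));
  float Inv = RefineRcp(A, 0.09f); // refine a rough 1/10 estimate once
  std::printf("1/10 ~= %f\n", Inv);
  return 0;
}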
+ SDValue getRecipEstimate(SDValue Operand, DAGCombinerInfo &DCI, + unsigned &RefinementSteps) const override; }; namespace X86 { diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td index 41e900e..b188cd5 100644 --- a/lib/Target/X86/X86InstrAVX512.td +++ b/lib/Target/X86/X86InstrAVX512.td @@ -1,19 +1,277 @@ +// Group template arguments that can be derived from the vector type (EltNum x +// EltVT). These are things like the register class for the writemask, etc. +// The idea is to pass one of these as the template argument rather than the +// individual arguments. +class X86VectorVTInfo<int numelts, ValueType EltVT, RegisterClass rc, + string suffix = ""> { + RegisterClass RC = rc; + int NumElts = numelts; + + // Corresponding mask register class. + RegisterClass KRC = !cast<RegisterClass>("VK" # NumElts); + + // Corresponding write-mask register class. + RegisterClass KRCWM = !cast<RegisterClass>("VK" # NumElts # "WM"); + + // The GPR register class that can hold the write mask. Use GR8 for fewer + // than 8 elements. Use shift-right and equal to work around the lack of + // !lt in tablegen. + RegisterClass MRC = + !cast<RegisterClass>("GR" # + !if (!eq (!srl(NumElts, 3), 0), 8, NumElts)); + + // Suffix used in the instruction mnemonic. + string Suffix = suffix; + + string VTName = "v" # NumElts # EltVT; + + // The vector VT. + ValueType VT = !cast<ValueType>(VTName); + + string EltTypeName = !cast<string>(EltVT); + // Size of the element type in bits, e.g. 32 for v16i32. + string EltSizeName = !subst("i", "", !subst("f", "", EltTypeName)); + int EltSize = EltVT.Size; + + // "i" for integer types and "f" for floating-point types + string TypeVariantName = !subst(EltSizeName, "", EltTypeName); + + // Size of RC in bits, e.g. 512 for VR512. + int Size = VT.Size; + + // The corresponding memory operand, e.g. i512mem for VR512. + X86MemOperand MemOp = !cast<X86MemOperand>(TypeVariantName # Size # "mem"); + X86MemOperand ScalarMemOp = !cast<X86MemOperand>(EltVT # "mem"); + + // Load patterns + // Note: For 128/256-bit integer VT we choose loadv2i64/loadv4i64 + // due to load promotion during legalization + PatFrag LdFrag = !cast<PatFrag>("load" # + !if (!eq (TypeVariantName, "i"), + !if (!eq (Size, 128), "v2i64", + !if (!eq (Size, 256), "v4i64", + VTName)), VTName)); + PatFrag ScalarLdFrag = !cast<PatFrag>("load" # EltVT); + + // Load patterns used for memory operands. We only have this defined in + // case of i64 element types for sub-512 integer vectors. For now, keep + // MemOpFrag undefined in these cases. + PatFrag MemOpFrag = + !if (!eq (TypeVariantName, "f"), !cast<PatFrag>("memop" # VTName), + !if (!eq (EltTypeName, "i64"), !cast<PatFrag>("memop" # VTName), + !if (!eq (VTName, "v16i32"), !cast<PatFrag>("memop" # VTName), ?))); + + // The corresponding float type, e.g. v16f32 for v16i32 + // Note: For EltSize < 32, FloatVT is illegal and TableGen + // fails to compile, so we choose FloatVT = VT + ValueType FloatVT = !cast<ValueType>( + !if (!eq (!srl(EltSize,5),0), + VTName, + !if (!eq(TypeVariantName, "i"), + "v" # NumElts # "f" # EltSize, + VTName))); + + // The string to specify embedded broadcast in assembly. + string BroadcastStr = "{1to" # NumElts # "}"; + + // 8-bit compressed displacement tuple/subvector format. This is only + // defined for NumElts <= 8. 
+ CD8VForm CD8TupleForm = !if (!eq (!srl(NumElts, 4), 0), + !cast<CD8VForm>("CD8VT" # NumElts), ?); + + SubRegIndex SubRegIdx = !if (!eq (Size, 128), sub_xmm, + !if (!eq (Size, 256), sub_ymm, ?)); + + Domain ExeDomain = !if (!eq (EltTypeName, "f32"), SSEPackedSingle, + !if (!eq (EltTypeName, "f64"), SSEPackedDouble, + SSEPackedInt)); + + // A vector type of the same width with element type i32. This is used to + // create the canonical constant zero node ImmAllZerosV. + ValueType i32VT = !cast<ValueType>("v" # !srl(Size, 5) # "i32"); + dag ImmAllZerosV = (VT (bitconvert (i32VT immAllZerosV))); +} + +def v64i8_info : X86VectorVTInfo<64, i8, VR512, "b">; +def v32i16_info : X86VectorVTInfo<32, i16, VR512, "w">; +def v16i32_info : X86VectorVTInfo<16, i32, VR512, "d">; +def v8i64_info : X86VectorVTInfo<8, i64, VR512, "q">; +def v16f32_info : X86VectorVTInfo<16, f32, VR512, "ps">; +def v8f64_info : X86VectorVTInfo<8, f64, VR512, "pd">; + +// "x" in v32i8x_info means RC = VR256X +def v32i8x_info : X86VectorVTInfo<32, i8, VR256X, "b">; +def v16i16x_info : X86VectorVTInfo<16, i16, VR256X, "w">; +def v8i32x_info : X86VectorVTInfo<8, i32, VR256X, "d">; +def v4i64x_info : X86VectorVTInfo<4, i64, VR256X, "q">; +def v8f32x_info : X86VectorVTInfo<8, f32, VR256X, "ps">; +def v4f64x_info : X86VectorVTInfo<4, f64, VR256X, "pd">; + +def v16i8x_info : X86VectorVTInfo<16, i8, VR128X, "b">; +def v8i16x_info : X86VectorVTInfo<8, i16, VR128X, "w">; +def v4i32x_info : X86VectorVTInfo<4, i32, VR128X, "d">; +def v2i64x_info : X86VectorVTInfo<2, i64, VR128X, "q">; +def v4f32x_info : X86VectorVTInfo<4, f32, VR128X, "ps">; +def v2f64x_info : X86VectorVTInfo<2, f64, VR128X, "pd">; + +class AVX512VLVectorVTInfo<X86VectorVTInfo i512, X86VectorVTInfo i256, + X86VectorVTInfo i128> { + X86VectorVTInfo info512 = i512; + X86VectorVTInfo info256 = i256; + X86VectorVTInfo info128 = i128; +} + +def avx512vl_i8_info : AVX512VLVectorVTInfo<v64i8_info, v32i8x_info, + v16i8x_info>; +def avx512vl_i16_info : AVX512VLVectorVTInfo<v32i16_info, v16i16x_info, + v8i16x_info>; +def avx512vl_i32_info : AVX512VLVectorVTInfo<v16i32_info, v8i32x_info, + v4i32x_info>; +def avx512vl_i64_info : AVX512VLVectorVTInfo<v8i64_info, v4i64x_info, + v2i64x_info>; +def avx512vl_f32_info : AVX512VLVectorVTInfo<v16f32_info, v8f32x_info, + v4f32x_info>; +def avx512vl_f64_info : AVX512VLVectorVTInfo<v8f64_info, v4f64x_info, + v2f64x_info>; + +// This multiclass generates the masking variants from the non-masking +// variant. It only provides the assembly pieces for the masking variants. +// It assumes custom ISel patterns for masking which can be provided as +// template arguments. +multiclass AVX512_maskable_custom<bits<8> O, Format F, + dag Outs, + dag Ins, dag MaskingIns, dag ZeroMaskingIns, + string OpcodeStr, + string AttSrcAsm, string IntelSrcAsm, + list<dag> Pattern, + list<dag> MaskingPattern, + list<dag> ZeroMaskingPattern, + string Round = "", + string MaskingConstraint = "", + InstrItinClass itin = NoItinerary, + bit IsCommutable = 0> { + let isCommutable = IsCommutable in + def NAME: AVX512<O, F, Outs, Ins, + OpcodeStr#"\t{"#AttSrcAsm#", $dst "#Round#"|"# + "$dst "#Round#", "#IntelSrcAsm#"}", + Pattern, itin>; + + // Prefer over VMOV*rrk Pat<> + let AddedComplexity = 20 in + def NAME#k: AVX512<O, F, Outs, MaskingIns, + OpcodeStr#"\t{"#AttSrcAsm#", $dst {${mask}}"#Round#"|"# + "$dst {${mask}}"#Round#", "#IntelSrcAsm#"}", + MaskingPattern, itin>, + EVEX_K { + // In case of the 3src subclass this is overridden with a let. 
+ string Constraints = MaskingConstraint; + } + let AddedComplexity = 30 in // Prefer over VMOV*rrkz Pat<> + def NAME#kz: AVX512<O, F, Outs, ZeroMaskingIns, + OpcodeStr#"\t{"#AttSrcAsm#", $dst {${mask}} {z}"#Round#"|"# + "$dst {${mask}} {z}"#Round#", "#IntelSrcAsm#"}", + ZeroMaskingPattern, + itin>, + EVEX_KZ; +} + + +// Common base class of AVX512_maskable and AVX512_maskable_3src. +multiclass AVX512_maskable_common<bits<8> O, Format F, X86VectorVTInfo _, + dag Outs, + dag Ins, dag MaskingIns, dag ZeroMaskingIns, + string OpcodeStr, + string AttSrcAsm, string IntelSrcAsm, + dag RHS, dag MaskingRHS, + string Round = "", + string MaskingConstraint = "", + InstrItinClass itin = NoItinerary, + bit IsCommutable = 0> : + AVX512_maskable_custom<O, F, Outs, Ins, MaskingIns, ZeroMaskingIns, OpcodeStr, + AttSrcAsm, IntelSrcAsm, + [(set _.RC:$dst, RHS)], + [(set _.RC:$dst, MaskingRHS)], + [(set _.RC:$dst, + (vselect _.KRCWM:$mask, RHS, _.ImmAllZerosV))], + Round, MaskingConstraint, NoItinerary, IsCommutable>; + +// This multiclass generates the unconditional/non-masking, the masking and +// the zero-masking variant of the instruction. In the masking case, the +// perserved vector elements come from a new dummy input operand tied to $dst. +multiclass AVX512_maskable<bits<8> O, Format F, X86VectorVTInfo _, + dag Outs, dag Ins, string OpcodeStr, + string AttSrcAsm, string IntelSrcAsm, + dag RHS, string Round = "", + InstrItinClass itin = NoItinerary, + bit IsCommutable = 0> : + AVX512_maskable_common<O, F, _, Outs, Ins, + !con((ins _.RC:$src0, _.KRCWM:$mask), Ins), + !con((ins _.KRCWM:$mask), Ins), + OpcodeStr, AttSrcAsm, IntelSrcAsm, RHS, + (vselect _.KRCWM:$mask, RHS, _.RC:$src0), Round, + "$src0 = $dst", itin, IsCommutable>; + +// Similar to AVX512_maskable but in this case one of the source operands +// ($src1) is already tied to $dst so we just use that for the preserved +// vector elements. NOTE that the NonTiedIns (the ins dag) should exclude +// $src1. +multiclass AVX512_maskable_3src<bits<8> O, Format F, X86VectorVTInfo _, + dag Outs, dag NonTiedIns, string OpcodeStr, + string AttSrcAsm, string IntelSrcAsm, + dag RHS> : + AVX512_maskable_common<O, F, _, Outs, + !con((ins _.RC:$src1), NonTiedIns), + !con((ins _.RC:$src1, _.KRCWM:$mask), NonTiedIns), + !con((ins _.RC:$src1, _.KRCWM:$mask), NonTiedIns), + OpcodeStr, AttSrcAsm, IntelSrcAsm, RHS, + (vselect _.KRCWM:$mask, RHS, _.RC:$src1)>; + + +multiclass AVX512_maskable_in_asm<bits<8> O, Format F, X86VectorVTInfo _, + dag Outs, dag Ins, + string OpcodeStr, + string AttSrcAsm, string IntelSrcAsm, + list<dag> Pattern> : + AVX512_maskable_custom<O, F, Outs, Ins, + !con((ins _.RC:$src0, _.KRCWM:$mask), Ins), + !con((ins _.KRCWM:$mask), Ins), + OpcodeStr, AttSrcAsm, IntelSrcAsm, Pattern, [], [], "", + "$src0 = $dst">; + // Bitcasts between 512-bit vector types. 
Return the original type since // no instruction is needed for the conversion let Predicates = [HasAVX512] in { - def : Pat<(v8f64 (bitconvert (v16f32 VR512:$src))), (v8f64 VR512:$src)>; - def : Pat<(v8f64 (bitconvert (v16i32 VR512:$src))), (v8f64 VR512:$src)>; def : Pat<(v8f64 (bitconvert (v8i64 VR512:$src))), (v8f64 VR512:$src)>; - def : Pat<(v16f32 (bitconvert (v16i32 VR512:$src))), (v16f32 VR512:$src)>; + def : Pat<(v8f64 (bitconvert (v16i32 VR512:$src))), (v8f64 VR512:$src)>; + def : Pat<(v8f64 (bitconvert (v32i16 VR512:$src))), (v8f64 VR512:$src)>; + def : Pat<(v8f64 (bitconvert (v64i8 VR512:$src))), (v8f64 VR512:$src)>; + def : Pat<(v8f64 (bitconvert (v16f32 VR512:$src))), (v8f64 VR512:$src)>; def : Pat<(v16f32 (bitconvert (v8i64 VR512:$src))), (v16f32 VR512:$src)>; + def : Pat<(v16f32 (bitconvert (v16i32 VR512:$src))), (v16f32 VR512:$src)>; + def : Pat<(v16f32 (bitconvert (v32i16 VR512:$src))), (v16f32 VR512:$src)>; + def : Pat<(v16f32 (bitconvert (v64i8 VR512:$src))), (v16f32 VR512:$src)>; def : Pat<(v16f32 (bitconvert (v8f64 VR512:$src))), (v16f32 VR512:$src)>; - def : Pat<(v8i64 (bitconvert (v16f32 VR512:$src))), (v8i64 VR512:$src)>; def : Pat<(v8i64 (bitconvert (v16i32 VR512:$src))), (v8i64 VR512:$src)>; + def : Pat<(v8i64 (bitconvert (v32i16 VR512:$src))), (v8i64 VR512:$src)>; + def : Pat<(v8i64 (bitconvert (v64i8 VR512:$src))), (v8i64 VR512:$src)>; def : Pat<(v8i64 (bitconvert (v8f64 VR512:$src))), (v8i64 VR512:$src)>; + def : Pat<(v8i64 (bitconvert (v16f32 VR512:$src))), (v8i64 VR512:$src)>; + def : Pat<(v16i32 (bitconvert (v8i64 VR512:$src))), (v16i32 VR512:$src)>; def : Pat<(v16i32 (bitconvert (v16f32 VR512:$src))), (v16i32 VR512:$src)>; - def : Pat<(v16i32 (bitconvert (v8i64 VR512:$src))), (v16i32 VR512:$src)>; + def : Pat<(v16i32 (bitconvert (v32i16 VR512:$src))), (v16i32 VR512:$src)>; + def : Pat<(v16i32 (bitconvert (v64i8 VR512:$src))), (v16i32 VR512:$src)>; def : Pat<(v16i32 (bitconvert (v8f64 VR512:$src))), (v16i32 VR512:$src)>; - def : Pat<(v8f64 (bitconvert (v8i64 VR512:$src))), (v8f64 VR512:$src)>; + def : Pat<(v32i16 (bitconvert (v8i64 VR512:$src))), (v32i16 VR512:$src)>; + def : Pat<(v32i16 (bitconvert (v16i32 VR512:$src))), (v32i16 VR512:$src)>; + def : Pat<(v32i16 (bitconvert (v64i8 VR512:$src))), (v32i16 VR512:$src)>; + def : Pat<(v32i16 (bitconvert (v8f64 VR512:$src))), (v32i16 VR512:$src)>; + def : Pat<(v32i16 (bitconvert (v16f32 VR512:$src))), (v32i16 VR512:$src)>; + def : Pat<(v32i16 (bitconvert (v16f32 VR512:$src))), (v32i16 VR512:$src)>; + def : Pat<(v64i8 (bitconvert (v8i64 VR512:$src))), (v64i8 VR512:$src)>; + def : Pat<(v64i8 (bitconvert (v16i32 VR512:$src))), (v64i8 VR512:$src)>; + def : Pat<(v64i8 (bitconvert (v32i16 VR512:$src))), (v64i8 VR512:$src)>; + def : Pat<(v64i8 (bitconvert (v8f64 VR512:$src))), (v64i8 VR512:$src)>; + def : Pat<(v64i8 (bitconvert (v16f32 VR512:$src))), (v64i8 VR512:$src)>; def : Pat<(v2i64 (bitconvert (v4i32 VR128X:$src))), (v2i64 VR128X:$src)>; def : Pat<(v2i64 (bitconvert (v8i16 VR128X:$src))), (v2i64 VR128X:$src)>; @@ -99,120 +357,92 @@ def : Pat<(v8f64 immAllZerosV), (AVX512_512_SET0)>; //===----------------------------------------------------------------------===// // AVX-512 - VECTOR INSERT // -// -- 32x8 form -- -let hasSideEffects = 0, ExeDomain = SSEPackedSingle in { -def VINSERTF32x4rr : AVX512AIi8<0x18, MRMSrcReg, (outs VR512:$dst), - (ins VR512:$src1, VR128X:$src2, i8imm:$src3), - "vinsertf32x4\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", - []>, EVEX_4V, EVEX_V512; -let mayLoad = 1 in -def 
VINSERTF32x4rm : AVX512AIi8<0x18, MRMSrcMem, (outs VR512:$dst), - (ins VR512:$src1, f128mem:$src2, i8imm:$src3), - "vinsertf32x4\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", - []>, EVEX_4V, EVEX_V512, EVEX_CD8<32, CD8VT4>; -} - -// -- 64x4 fp form -- -let hasSideEffects = 0, ExeDomain = SSEPackedDouble in { -def VINSERTF64x4rr : AVX512AIi8<0x1a, MRMSrcReg, (outs VR512:$dst), - (ins VR512:$src1, VR256X:$src2, i8imm:$src3), - "vinsertf64x4\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", - []>, EVEX_4V, EVEX_V512, VEX_W; -let mayLoad = 1 in -def VINSERTF64x4rm : AVX512AIi8<0x1a, MRMSrcMem, (outs VR512:$dst), - (ins VR512:$src1, i256mem:$src2, i8imm:$src3), - "vinsertf64x4\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", - []>, EVEX_4V, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT4>; + +multiclass vinsert_for_size_no_alt<int Opcode, + X86VectorVTInfo From, X86VectorVTInfo To, + PatFrag vinsert_insert, + SDNodeXForm INSERT_get_vinsert_imm> { + let hasSideEffects = 0, ExeDomain = To.ExeDomain in { + def rr : AVX512AIi8<Opcode, MRMSrcReg, (outs VR512:$dst), + (ins VR512:$src1, From.RC:$src2, i8imm:$src3), + "vinsert" # From.EltTypeName # "x" # From.NumElts # + "\t{$src3, $src2, $src1, $dst|" + "$dst, $src1, $src2, $src3}", + [(set To.RC:$dst, (vinsert_insert:$src3 (To.VT VR512:$src1), + (From.VT From.RC:$src2), + (iPTR imm)))]>, + EVEX_4V, EVEX_V512; + + let mayLoad = 1 in + def rm : AVX512AIi8<Opcode, MRMSrcMem, (outs VR512:$dst), + (ins VR512:$src1, From.MemOp:$src2, i8imm:$src3), + "vinsert" # From.EltTypeName # "x" # From.NumElts # + "\t{$src3, $src2, $src1, $dst|" + "$dst, $src1, $src2, $src3}", + []>, + EVEX_4V, EVEX_V512, EVEX_CD8<From.EltSize, From.CD8TupleForm>; + } } -// -- 32x4 integer form -- -let hasSideEffects = 0 in { -def VINSERTI32x4rr : AVX512AIi8<0x38, MRMSrcReg, (outs VR512:$dst), - (ins VR512:$src1, VR128X:$src2, i8imm:$src3), - "vinserti32x4\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", - []>, EVEX_4V, EVEX_V512; -let mayLoad = 1 in -def VINSERTI32x4rm : AVX512AIi8<0x38, MRMSrcMem, (outs VR512:$dst), - (ins VR512:$src1, i128mem:$src2, i8imm:$src3), - "vinserti32x4\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", - []>, EVEX_4V, EVEX_V512, EVEX_CD8<32, CD8VT4>; +multiclass vinsert_for_size<int Opcode, + X86VectorVTInfo From, X86VectorVTInfo To, + X86VectorVTInfo AltFrom, X86VectorVTInfo AltTo, + PatFrag vinsert_insert, + SDNodeXForm INSERT_get_vinsert_imm> : + vinsert_for_size_no_alt<Opcode, From, To, + vinsert_insert, INSERT_get_vinsert_imm> { + // Codegen pattern with the alternative types, e.g. v2i64 -> v8i64 for + // vinserti32x4. Only add this if 64x2 and friends are not supported + // natively via AVX512DQ. 
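+  // For example, without DQI an insert of v2i64 into v8i64 reuses the 32x4
+  // instruction, just as the hand-written VINSERTI32x4rr patterns removed
+  // below used to do.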
+ let Predicates = [NoDQI] in + def : Pat<(vinsert_insert:$ins + (AltTo.VT VR512:$src1), (AltFrom.VT From.RC:$src2), (iPTR imm)), + (AltTo.VT (!cast<Instruction>(NAME # From.EltSize # "x4rr") + VR512:$src1, From.RC:$src2, + (INSERT_get_vinsert_imm VR512:$ins)))>; } -let hasSideEffects = 0 in { -// -- 64x4 form -- -def VINSERTI64x4rr : AVX512AIi8<0x3a, MRMSrcReg, (outs VR512:$dst), - (ins VR512:$src1, VR256X:$src2, i8imm:$src3), - "vinserti64x4\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", - []>, EVEX_4V, EVEX_V512, VEX_W; -let mayLoad = 1 in -def VINSERTI64x4rm : AVX512AIi8<0x3a, MRMSrcMem, (outs VR512:$dst), - (ins VR512:$src1, i256mem:$src2, i8imm:$src3), - "vinserti64x4\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", - []>, EVEX_4V, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT4>; -} - -def : Pat<(vinsert128_insert:$ins (v16f32 VR512:$src1), (v4f32 VR128X:$src2), - (iPTR imm)), (VINSERTF32x4rr VR512:$src1, VR128X:$src2, - (INSERT_get_vinsert128_imm VR512:$ins))>; -def : Pat<(vinsert128_insert:$ins (v8f64 VR512:$src1), (v2f64 VR128X:$src2), - (iPTR imm)), (VINSERTF32x4rr VR512:$src1, VR128X:$src2, - (INSERT_get_vinsert128_imm VR512:$ins))>; -def : Pat<(vinsert128_insert:$ins (v8i64 VR512:$src1), (v2i64 VR128X:$src2), - (iPTR imm)), (VINSERTI32x4rr VR512:$src1, VR128X:$src2, - (INSERT_get_vinsert128_imm VR512:$ins))>; -def : Pat<(vinsert128_insert:$ins (v16i32 VR512:$src1), (v4i32 VR128X:$src2), - (iPTR imm)), (VINSERTI32x4rr VR512:$src1, VR128X:$src2, - (INSERT_get_vinsert128_imm VR512:$ins))>; - -def : Pat<(vinsert128_insert:$ins (v16f32 VR512:$src1), (loadv4f32 addr:$src2), - (iPTR imm)), (VINSERTF32x4rm VR512:$src1, addr:$src2, - (INSERT_get_vinsert128_imm VR512:$ins))>; -def : Pat<(vinsert128_insert:$ins (v16i32 VR512:$src1), - (bc_v4i32 (loadv2i64 addr:$src2)), - (iPTR imm)), (VINSERTI32x4rm VR512:$src1, addr:$src2, - (INSERT_get_vinsert128_imm VR512:$ins))>; -def : Pat<(vinsert128_insert:$ins (v8f64 VR512:$src1), (loadv2f64 addr:$src2), - (iPTR imm)), (VINSERTF32x4rm VR512:$src1, addr:$src2, - (INSERT_get_vinsert128_imm VR512:$ins))>; -def : Pat<(vinsert128_insert:$ins (v8i64 VR512:$src1), (loadv2i64 addr:$src2), - (iPTR imm)), (VINSERTI32x4rm VR512:$src1, addr:$src2, - (INSERT_get_vinsert128_imm VR512:$ins))>; - -def : Pat<(vinsert256_insert:$ins (v16f32 VR512:$src1), (v8f32 VR256X:$src2), - (iPTR imm)), (VINSERTF64x4rr VR512:$src1, VR256X:$src2, - (INSERT_get_vinsert256_imm VR512:$ins))>; -def : Pat<(vinsert256_insert:$ins (v8f64 VR512:$src1), (v4f64 VR256X:$src2), - (iPTR imm)), (VINSERTF64x4rr VR512:$src1, VR256X:$src2, - (INSERT_get_vinsert256_imm VR512:$ins))>; -def : Pat<(vinsert128_insert:$ins (v8i64 VR512:$src1), (v4i64 VR256X:$src2), - (iPTR imm)), (VINSERTI64x4rr VR512:$src1, VR256X:$src2, - (INSERT_get_vinsert256_imm VR512:$ins))>; -def : Pat<(vinsert128_insert:$ins (v16i32 VR512:$src1), (v8i32 VR256X:$src2), - (iPTR imm)), (VINSERTI64x4rr VR512:$src1, VR256X:$src2, - (INSERT_get_vinsert256_imm VR512:$ins))>; - -def : Pat<(vinsert256_insert:$ins (v16f32 VR512:$src1), (loadv8f32 addr:$src2), - (iPTR imm)), (VINSERTF64x4rm VR512:$src1, addr:$src2, - (INSERT_get_vinsert256_imm VR512:$ins))>; -def : Pat<(vinsert256_insert:$ins (v8f64 VR512:$src1), (loadv4f64 addr:$src2), - (iPTR imm)), (VINSERTF64x4rm VR512:$src1, addr:$src2, - (INSERT_get_vinsert256_imm VR512:$ins))>; -def : Pat<(vinsert256_insert:$ins (v8i64 VR512:$src1), (loadv4i64 addr:$src2), - (iPTR imm)), (VINSERTI64x4rm VR512:$src1, addr:$src2, - (INSERT_get_vinsert256_imm VR512:$ins))>; -def : 
Pat<(vinsert256_insert:$ins (v16i32 VR512:$src1), - (bc_v8i32 (loadv4i64 addr:$src2)), - (iPTR imm)), (VINSERTI64x4rm VR512:$src1, addr:$src2, - (INSERT_get_vinsert256_imm VR512:$ins))>; +multiclass vinsert_for_type<ValueType EltVT32, int Opcode128, + ValueType EltVT64, int Opcode256> { + defm NAME # "32x4" : vinsert_for_size<Opcode128, + X86VectorVTInfo< 4, EltVT32, VR128X>, + X86VectorVTInfo<16, EltVT32, VR512>, + X86VectorVTInfo< 2, EltVT64, VR128X>, + X86VectorVTInfo< 8, EltVT64, VR512>, + vinsert128_insert, + INSERT_get_vinsert128_imm>; + let Predicates = [HasDQI] in + defm NAME # "64x2" : vinsert_for_size_no_alt<Opcode128, + X86VectorVTInfo< 2, EltVT64, VR128X>, + X86VectorVTInfo< 8, EltVT64, VR512>, + vinsert128_insert, + INSERT_get_vinsert128_imm>, VEX_W; + defm NAME # "64x4" : vinsert_for_size<Opcode256, + X86VectorVTInfo< 4, EltVT64, VR256X>, + X86VectorVTInfo< 8, EltVT64, VR512>, + X86VectorVTInfo< 8, EltVT32, VR256>, + X86VectorVTInfo<16, EltVT32, VR512>, + vinsert256_insert, + INSERT_get_vinsert256_imm>, VEX_W; + let Predicates = [HasDQI] in + defm NAME # "32x8" : vinsert_for_size_no_alt<Opcode256, + X86VectorVTInfo< 8, EltVT32, VR256X>, + X86VectorVTInfo<16, EltVT32, VR512>, + vinsert256_insert, + INSERT_get_vinsert256_imm>; +} + +defm VINSERTF : vinsert_for_type<f32, 0x18, f64, 0x1a>; +defm VINSERTI : vinsert_for_type<i32, 0x38, i64, 0x3a>; // vinsertps - insert f32 to XMM def VINSERTPSzrr : AVX512AIi8<0x21, MRMSrcReg, (outs VR128X:$dst), - (ins VR128X:$src1, VR128X:$src2, u32u8imm:$src3), + (ins VR128X:$src1, VR128X:$src2, i8imm:$src3), "vinsertps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", [(set VR128X:$dst, (X86insertps VR128X:$src1, VR128X:$src2, imm:$src3))]>, EVEX_4V; def VINSERTPSzrm: AVX512AIi8<0x21, MRMSrcMem, (outs VR128X:$dst), - (ins VR128X:$src1, f32mem:$src2, u32u8imm:$src3), + (ins VR128X:$src1, f32mem:$src2, i8imm:$src3), "vinsertps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", [(set VR128X:$dst, (X86insertps VR128X:$src1, (v4f32 (scalar_to_vector (loadf32 addr:$src2))), @@ -221,106 +451,90 @@ def VINSERTPSzrm: AVX512AIi8<0x21, MRMSrcMem, (outs VR128X:$dst), //===----------------------------------------------------------------------===// // AVX-512 VECTOR EXTRACT //--- -let hasSideEffects = 0, ExeDomain = SSEPackedSingle in { -// -- 32x4 form -- -def VEXTRACTF32x4rr : AVX512AIi8<0x19, MRMDestReg, (outs VR128X:$dst), - (ins VR512:$src1, i8imm:$src2), - "vextractf32x4\t{$src2, $src1, $dst|$dst, $src1, $src2}", - []>, EVEX, EVEX_V512; -def VEXTRACTF32x4mr : AVX512AIi8<0x19, MRMDestMem, (outs), - (ins f128mem:$dst, VR512:$src1, i8imm:$src2), - "vextractf32x4\t{$src2, $src1, $dst|$dst, $src1, $src2}", - []>, EVEX, EVEX_V512, EVEX_CD8<32, CD8VT4>; - -// -- 64x4 form -- -def VEXTRACTF64x4rr : AVX512AIi8<0x1b, MRMDestReg, (outs VR256X:$dst), - (ins VR512:$src1, i8imm:$src2), - "vextractf64x4\t{$src2, $src1, $dst|$dst, $src1, $src2}", - []>, EVEX, EVEX_V512, VEX_W; -let mayStore = 1 in -def VEXTRACTF64x4mr : AVX512AIi8<0x1b, MRMDestMem, (outs), - (ins f256mem:$dst, VR512:$src1, i8imm:$src2), - "vextractf64x4\t{$src2, $src1, $dst|$dst, $src1, $src2}", - []>, EVEX, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT4>; + +multiclass vextract_for_size<int Opcode, + X86VectorVTInfo From, X86VectorVTInfo To, + X86VectorVTInfo AltFrom, X86VectorVTInfo AltTo, + PatFrag vextract_extract, + SDNodeXForm EXTRACT_get_vextract_imm> { + let hasSideEffects = 0, ExeDomain = To.ExeDomain in { + defm rr : AVX512_maskable_in_asm<Opcode, MRMDestReg, To, (outs To.RC:$dst), + (ins 
VR512:$src1, i8imm:$idx), + "vextract" # To.EltTypeName # "x4", + "$idx, $src1", "$src1, $idx", + [(set To.RC:$dst, (vextract_extract:$idx (From.VT VR512:$src1), + (iPTR imm)))]>, + AVX512AIi8Base, EVEX, EVEX_V512; + let mayStore = 1 in + def rm : AVX512AIi8<Opcode, MRMDestMem, (outs), + (ins To.MemOp:$dst, VR512:$src1, i8imm:$src2), + "vextract" # To.EltTypeName # "x4\t{$src2, $src1, $dst|" + "$dst, $src1, $src2}", + []>, EVEX, EVEX_V512, EVEX_CD8<To.EltSize, CD8VT4>; + } + + // Codegen pattern with the alternative types, e.g. v8i64 -> v2i64 for + // vextracti32x4 + def : Pat<(vextract_extract:$ext (AltFrom.VT VR512:$src1), (iPTR imm)), + (AltTo.VT (!cast<Instruction>(NAME # To.EltSize # "x4rr") + VR512:$src1, + (EXTRACT_get_vextract_imm To.RC:$ext)))>; + + // A 128/256-bit subvector extract from the first 512-bit vector position is + // a subregister copy that needs no instruction. + def : Pat<(To.VT (extract_subvector (From.VT VR512:$src), (iPTR 0))), + (To.VT + (EXTRACT_SUBREG (From.VT VR512:$src), To.SubRegIdx))>; + + // And for the alternative types. + def : Pat<(AltTo.VT (extract_subvector (AltFrom.VT VR512:$src), (iPTR 0))), + (AltTo.VT + (EXTRACT_SUBREG (AltFrom.VT VR512:$src), AltTo.SubRegIdx))>; + + // Intrinsic call with masking. + def : Pat<(!cast<Intrinsic>("int_x86_avx512_mask_vextract" # To.EltTypeName # + "x4_512") + VR512:$src1, (iPTR imm:$idx), To.RC:$src0, GR8:$mask), + (!cast<Instruction>(NAME # To.EltSize # "x4rrk") To.RC:$src0, + (v4i1 (COPY_TO_REGCLASS GR8:$mask, VK4WM)), + VR512:$src1, imm:$idx)>; + + // Intrinsic call with zero-masking. + def : Pat<(!cast<Intrinsic>("int_x86_avx512_mask_vextract" # To.EltTypeName # + "x4_512") + VR512:$src1, (iPTR imm:$idx), To.ImmAllZerosV, GR8:$mask), + (!cast<Instruction>(NAME # To.EltSize # "x4rrkz") + (v4i1 (COPY_TO_REGCLASS GR8:$mask, VK4WM)), + VR512:$src1, imm:$idx)>; + + // Intrinsic call without masking. 
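+  // An all-ones mask, passed as (i8 -1), leaves every element unmasked, so
+  // the plain rr form is selected.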
+ def : Pat<(!cast<Intrinsic>("int_x86_avx512_mask_vextract" # To.EltTypeName # + "x4_512") + VR512:$src1, (iPTR imm:$idx), To.ImmAllZerosV, (i8 -1)), + (!cast<Instruction>(NAME # To.EltSize # "x4rr") + VR512:$src1, imm:$idx)>; } -let hasSideEffects = 0 in { -// -- 32x4 form -- -def VEXTRACTI32x4rr : AVX512AIi8<0x39, MRMDestReg, (outs VR128X:$dst), - (ins VR512:$src1, i8imm:$src2), - "vextracti32x4\t{$src2, $src1, $dst|$dst, $src1, $src2}", - []>, EVEX, EVEX_V512; -def VEXTRACTI32x4mr : AVX512AIi8<0x39, MRMDestMem, (outs), - (ins i128mem:$dst, VR512:$src1, i8imm:$src2), - "vextracti32x4\t{$src2, $src1, $dst|$dst, $src1, $src2}", - []>, EVEX, EVEX_V512, EVEX_CD8<32, CD8VT4>; - -// -- 64x4 form -- -def VEXTRACTI64x4rr : AVX512AIi8<0x3b, MRMDestReg, (outs VR256X:$dst), - (ins VR512:$src1, i8imm:$src2), - "vextracti64x4\t{$src2, $src1, $dst|$dst, $src1, $src2}", - []>, EVEX, EVEX_V512, VEX_W; -let mayStore = 1 in -def VEXTRACTI64x4mr : AVX512AIi8<0x3b, MRMDestMem, (outs), - (ins i256mem:$dst, VR512:$src1, i8imm:$src2), - "vextracti64x4\t{$src2, $src1, $dst|$dst, $src1, $src2}", - []>, EVEX, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT4>; -} - -def : Pat<(vextract128_extract:$ext (v16f32 VR512:$src1), (iPTR imm)), - (v4f32 (VEXTRACTF32x4rr VR512:$src1, - (EXTRACT_get_vextract128_imm VR128X:$ext)))>; - -def : Pat<(vextract128_extract:$ext VR512:$src1, (iPTR imm)), - (v4i32 (VEXTRACTF32x4rr VR512:$src1, - (EXTRACT_get_vextract128_imm VR128X:$ext)))>; - -def : Pat<(vextract128_extract:$ext (v8f64 VR512:$src1), (iPTR imm)), - (v2f64 (VEXTRACTF32x4rr VR512:$src1, - (EXTRACT_get_vextract128_imm VR128X:$ext)))>; - -def : Pat<(vextract128_extract:$ext (v8i64 VR512:$src1), (iPTR imm)), - (v2i64 (VEXTRACTI32x4rr VR512:$src1, - (EXTRACT_get_vextract128_imm VR128X:$ext)))>; - - -def : Pat<(vextract256_extract:$ext (v16f32 VR512:$src1), (iPTR imm)), - (v8f32 (VEXTRACTF64x4rr VR512:$src1, - (EXTRACT_get_vextract256_imm VR256X:$ext)))>; - -def : Pat<(vextract256_extract:$ext (v16i32 VR512:$src1), (iPTR imm)), - (v8i32 (VEXTRACTI64x4rr VR512:$src1, - (EXTRACT_get_vextract256_imm VR256X:$ext)))>; - -def : Pat<(vextract256_extract:$ext (v8f64 VR512:$src1), (iPTR imm)), - (v4f64 (VEXTRACTF64x4rr VR512:$src1, - (EXTRACT_get_vextract256_imm VR256X:$ext)))>; - -def : Pat<(vextract256_extract:$ext (v8i64 VR512:$src1), (iPTR imm)), - (v4i64 (VEXTRACTI64x4rr VR512:$src1, - (EXTRACT_get_vextract256_imm VR256X:$ext)))>; - -// A 256-bit subvector extract from the first 512-bit vector position -// is a subregister copy that needs no instruction. 
-def : Pat<(v8i32 (extract_subvector (v16i32 VR512:$src), (iPTR 0))), - (v8i32 (EXTRACT_SUBREG (v16i32 VR512:$src), sub_ymm))>; -def : Pat<(v8f32 (extract_subvector (v16f32 VR512:$src), (iPTR 0))), - (v8f32 (EXTRACT_SUBREG (v16f32 VR512:$src), sub_ymm))>; -def : Pat<(v4i64 (extract_subvector (v8i64 VR512:$src), (iPTR 0))), - (v4i64 (EXTRACT_SUBREG (v8i64 VR512:$src), sub_ymm))>; -def : Pat<(v4f64 (extract_subvector (v8f64 VR512:$src), (iPTR 0))), - (v4f64 (EXTRACT_SUBREG (v8f64 VR512:$src), sub_ymm))>; - -// zmm -> xmm -def : Pat<(v4i32 (extract_subvector (v16i32 VR512:$src), (iPTR 0))), - (v4i32 (EXTRACT_SUBREG (v16i32 VR512:$src), sub_xmm))>; -def : Pat<(v2i64 (extract_subvector (v8i64 VR512:$src), (iPTR 0))), - (v2i64 (EXTRACT_SUBREG (v8i64 VR512:$src), sub_xmm))>; -def : Pat<(v2f64 (extract_subvector (v8f64 VR512:$src), (iPTR 0))), - (v2f64 (EXTRACT_SUBREG (v8f64 VR512:$src), sub_xmm))>; -def : Pat<(v4f32 (extract_subvector (v16f32 VR512:$src), (iPTR 0))), - (v4f32 (EXTRACT_SUBREG (v16f32 VR512:$src), sub_xmm))>; +multiclass vextract_for_type<ValueType EltVT32, int Opcode32, + ValueType EltVT64, int Opcode64> { + defm NAME # "32x4" : vextract_for_size<Opcode32, + X86VectorVTInfo<16, EltVT32, VR512>, + X86VectorVTInfo< 4, EltVT32, VR128X>, + X86VectorVTInfo< 8, EltVT64, VR512>, + X86VectorVTInfo< 2, EltVT64, VR128X>, + vextract128_extract, + EXTRACT_get_vextract128_imm>; + defm NAME # "64x4" : vextract_for_size<Opcode64, + X86VectorVTInfo< 8, EltVT64, VR512>, + X86VectorVTInfo< 4, EltVT64, VR256X>, + X86VectorVTInfo<16, EltVT32, VR512>, + X86VectorVTInfo< 8, EltVT32, VR256>, + vextract256_extract, + EXTRACT_get_vextract256_imm>, VEX_W; +} +defm VEXTRACTF : vextract_for_type<f32, 0x19, f64, 0x1b>; +defm VEXTRACTI : vextract_for_type<i32, 0x39, i64, 0x3b>; // A 128-bit subvector insert to the first 512-bit vector position // is a subregister copy that needs no instruction. 
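// (The low 128/256 bits of a zmm register alias its xmm/ymm subregisters, so
// these inserts, like the EXTRACT_SUBREG-based extracts above, are plain
// subregister copies.)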
@@ -352,13 +566,13 @@ def : Pat<(insert_subvector undef, (v8f32 VR256X:$src), (iPTR 0)), // vextractps - extract 32 bits from XMM def VEXTRACTPSzrr : AVX512AIi8<0x17, MRMDestReg, (outs GR32:$dst), - (ins VR128X:$src1, u32u8imm:$src2), + (ins VR128X:$src1, i32i8imm:$src2), "vextractps\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set GR32:$dst, (extractelt (bc_v4i32 (v4f32 VR128X:$src1)), imm:$src2))]>, EVEX; def VEXTRACTPSzmr : AVX512AIi8<0x17, MRMDestMem, (outs), - (ins f32mem:$dst, VR128X:$src1, u32u8imm:$src2), + (ins f32mem:$dst, VR128X:$src1, i32i8imm:$src2), "vextractps\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(store (extractelt (bc_v4i32 (v4f32 VR128X:$src1)), imm:$src2), addr:$dst)]>, EVEX, EVEX_CD8<32, CD8VT1>; @@ -366,36 +580,57 @@ def VEXTRACTPSzmr : AVX512AIi8<0x17, MRMDestMem, (outs), //===---------------------------------------------------------------------===// // AVX-512 BROADCAST //--- -multiclass avx512_fp_broadcast<bits<8> opc, string OpcodeStr, - RegisterClass DestRC, - RegisterClass SrcRC, X86MemOperand x86memop> { - def rr : AVX5128I<opc, MRMSrcReg, (outs DestRC:$dst), (ins SrcRC:$src), - !strconcat(OpcodeStr, " \t{$src, $dst|$dst, $src}"), - []>, EVEX; - def rm : AVX5128I<opc, MRMSrcMem, (outs DestRC:$dst), (ins x86memop:$src), - !strconcat(OpcodeStr, " \t{$src, $dst|$dst, $src}"),[]>, EVEX; +multiclass avx512_fp_broadcast<bits<8> opc, SDNode OpNode, RegisterClass SrcRC, + ValueType svt, X86VectorVTInfo _> { + defm r : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins SrcRC:$src), "vbroadcast"## !subst("p", "s", _.Suffix), + "$src", "$src", (_.VT (OpNode (svt SrcRC:$src)))>, + T8PD, EVEX; + + let mayLoad = 1 in { + defm m : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.ScalarMemOp:$src), + "vbroadcast"##!subst("p", "s", _.Suffix), "$src", "$src", + (_.VT (OpNode (_.ScalarLdFrag addr:$src)))>, + T8PD, EVEX; + } } + +multiclass avx512_fp_broadcast_vl<bits<8> opc, SDNode OpNode, + AVX512VLVectorVTInfo _> { + defm Z : avx512_fp_broadcast<opc, OpNode, VR128X, _.info128.VT, _.info512>, + EVEX_V512; + + let Predicates = [HasVLX] in { + defm Z256 : avx512_fp_broadcast<opc, OpNode, VR128X, _.info128.VT, _.info256>, + EVEX_V256; + } +} + let ExeDomain = SSEPackedSingle in { - defm VBROADCASTSSZ : avx512_fp_broadcast<0x18, "vbroadcastss", VR512, - VR128X, f32mem>, - EVEX_V512, EVEX_CD8<32, CD8VT1>; + defm VBROADCASTSS : avx512_fp_broadcast_vl<0x18, X86VBroadcast, + avx512vl_f32_info>, EVEX_CD8<32, CD8VT1>; + let Predicates = [HasVLX] in { + defm VBROADCASTSSZ128 : avx512_fp_broadcast<0x18, X86VBroadcast, VR128X, + v4f32, v4f32x_info>, EVEX_V128, + EVEX_CD8<32, CD8VT1>; + } } let ExeDomain = SSEPackedDouble in { - defm VBROADCASTSDZ : avx512_fp_broadcast<0x19, "vbroadcastsd", VR512, - VR128X, f64mem>, - EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>; + defm VBROADCASTSD : avx512_fp_broadcast_vl<0x19, X86VBroadcast, + avx512vl_f64_info>, VEX_W, EVEX_CD8<64, CD8VT1>; } def : Pat<(v16f32 (X86VBroadcast (loadf32 addr:$src))), - (VBROADCASTSSZrm addr:$src)>; + (VBROADCASTSSZm addr:$src)>; def : Pat<(v8f64 (X86VBroadcast (loadf64 addr:$src))), - (VBROADCASTSDZrm addr:$src)>; + (VBROADCASTSDZm addr:$src)>; def : Pat<(int_x86_avx512_vbroadcast_ss_512 addr:$src), - (VBROADCASTSSZrm addr:$src)>; + (VBROADCASTSSZm addr:$src)>; def : Pat<(int_x86_avx512_vbroadcast_sd_512 addr:$src), - (VBROADCASTSDZrm addr:$src)>; + (VBROADCASTSDZm addr:$src)>; multiclass avx512_int_broadcast_reg<bits<8> opc, string OpcodeStr, RegisterClass SrcRC, RegisterClass KRC> { @@ -503,22 +738,27 
@@ def : Pat<(v16i32 (int_x86_avx512_pbroadcastd_512 (v4i32 VR128X:$src))), def : Pat<(v8i64 (int_x86_avx512_pbroadcastq_512 (v2i64 VR128X:$src))), (VPBROADCASTQZrr VR128X:$src)>; -def : Pat<(v16f32 (X86VBroadcast (v4f32 VR128X:$src))), - (VBROADCASTSSZrr VR128X:$src)>; -def : Pat<(v8f64 (X86VBroadcast (v2f64 VR128X:$src))), - (VBROADCASTSDZrr VR128X:$src)>; +def : Pat<(v16f32 (X86VBroadcast (v16f32 VR512:$src))), + (VBROADCASTSSZr (EXTRACT_SUBREG (v16f32 VR512:$src), sub_xmm))>; +def : Pat<(v8f64 (X86VBroadcast (v8f64 VR512:$src))), + (VBROADCASTSDZr (EXTRACT_SUBREG (v8f64 VR512:$src), sub_xmm))>; + +def : Pat<(v16i32 (X86VBroadcast (v16i32 VR512:$src))), + (VPBROADCASTDZrr (EXTRACT_SUBREG (v16i32 VR512:$src), sub_xmm))>; +def : Pat<(v8i64 (X86VBroadcast (v8i64 VR512:$src))), + (VPBROADCASTQZrr (EXTRACT_SUBREG (v8i64 VR512:$src), sub_xmm))>; def : Pat<(v16f32 (int_x86_avx512_vbroadcast_ss_ps_512 (v4f32 VR128X:$src))), - (VBROADCASTSSZrr VR128X:$src)>; + (VBROADCASTSSZr VR128X:$src)>; def : Pat<(v8f64 (int_x86_avx512_vbroadcast_sd_pd_512 (v2f64 VR128X:$src))), - (VBROADCASTSDZrr VR128X:$src)>; + (VBROADCASTSDZr VR128X:$src)>; // Provide fallback in case the load node that is used in the patterns above // is used by additional users, which prevents the pattern selection. def : Pat<(v16f32 (X86VBroadcast FR32X:$src)), - (VBROADCASTSSZrr (COPY_TO_REGCLASS FR32X:$src, VR128X))>; + (VBROADCASTSSZr (COPY_TO_REGCLASS FR32X:$src, VR128X))>; def : Pat<(v8f64 (X86VBroadcast FR64X:$src)), - (VBROADCASTSDZrr (COPY_TO_REGCLASS FR64X:$src, VR128X))>; + (VBROADCASTSDZr (COPY_TO_REGCLASS FR64X:$src, VR128X))>; let Predicates = [HasAVX512] in { @@ -532,48 +772,91 @@ def : Pat<(v8i32 (X86VBroadcastm (v8i1 VK8WM:$mask), (loadi32 addr:$src))), //--- multiclass avx512_mask_broadcast<bits<8> opc, string OpcodeStr, - RegisterClass DstRC, RegisterClass KRC, - ValueType OpVT, ValueType SrcVT> { -def rr : AVX512XS8I<opc, MRMDestReg, (outs DstRC:$dst), (ins KRC:$src), + RegisterClass KRC> { +let Predicates = [HasCDI] in +def Zrr : AVX512XS8I<opc, MRMSrcReg, (outs VR512:$dst), (ins KRC:$src), !strconcat(OpcodeStr, " \t{$src, $dst|$dst, $src}"), - []>, EVEX; + []>, EVEX, EVEX_V512; + +let Predicates = [HasCDI, HasVLX] in { +def Z128rr : AVX512XS8I<opc, MRMSrcReg, (outs VR128:$dst), (ins KRC:$src), + !strconcat(OpcodeStr, " \t{$src, $dst|$dst, $src}"), + []>, EVEX, EVEX_V128; +def Z256rr : AVX512XS8I<opc, MRMSrcReg, (outs VR256:$dst), (ins KRC:$src), + !strconcat(OpcodeStr, " \t{$src, $dst|$dst, $src}"), + []>, EVEX, EVEX_V256; +} } let Predicates = [HasCDI] in { -defm VPBROADCASTMW2D : avx512_mask_broadcast<0x3A, "vpbroadcastmw2d", VR512, - VK16, v16i32, v16i1>, EVEX_V512; -defm VPBROADCASTMB2Q : avx512_mask_broadcast<0x2A, "vpbroadcastmb2q", VR512, - VK8, v8i64, v8i1>, EVEX_V512, VEX_W; +defm VPBROADCASTMW2D : avx512_mask_broadcast<0x3A, "vpbroadcastmw2d", + VK16>; +defm VPBROADCASTMB2Q : avx512_mask_broadcast<0x2A, "vpbroadcastmb2q", + VK8>, VEX_W; } //===----------------------------------------------------------------------===// // AVX-512 - VPERM // // -- immediate form -- -multiclass avx512_perm_imm<bits<8> opc, string OpcodeStr, RegisterClass RC, - SDNode OpNode, PatFrag mem_frag, - X86MemOperand x86memop, ValueType OpVT> { - def ri : AVX512AIi8<opc, MRMSrcReg, (outs RC:$dst), - (ins RC:$src1, i8imm:$src2), +multiclass avx512_perm_imm<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _> { + let ExeDomain = _.ExeDomain in { + def ri : AVX512AIi8<opc, MRMSrcReg, (outs _.RC:$dst), + (ins _.RC:$src1, 
i8imm:$src2), !strconcat(OpcodeStr, " \t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set RC:$dst, - (OpVT (OpNode RC:$src1, (i8 imm:$src2))))]>, + [(set _.RC:$dst, + (_.VT (OpNode _.RC:$src1, (i8 imm:$src2))))]>, EVEX; - def mi : AVX512AIi8<opc, MRMSrcMem, (outs RC:$dst), - (ins x86memop:$src1, i8imm:$src2), + def mi : AVX512AIi8<opc, MRMSrcMem, (outs _.RC:$dst), + (ins _.MemOp:$src1, i8imm:$src2), !strconcat(OpcodeStr, " \t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set RC:$dst, - (OpVT (OpNode (mem_frag addr:$src1), - (i8 imm:$src2))))]>, EVEX; + [(set _.RC:$dst, + (_.VT (OpNode (_.MemOpFrag addr:$src1), + (i8 imm:$src2))))]>, + EVEX, EVEX_CD8<_.EltSize, CD8VF>; +} +} + +multiclass avx512_permil<bits<8> OpcImm, bits<8> OpcVar, X86VectorVTInfo _, + X86VectorVTInfo Ctrl> : + avx512_perm_imm<OpcImm, "vpermil" # _.Suffix, X86VPermilpi, _> { + let ExeDomain = _.ExeDomain in { + def rr : AVX5128I<OpcVar, MRMSrcReg, (outs _.RC:$dst), + (ins _.RC:$src1, _.RC:$src2), + !strconcat("vpermil" # _.Suffix, + " \t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set _.RC:$dst, + (_.VT (X86VPermilpv _.RC:$src1, + (Ctrl.VT Ctrl.RC:$src2))))]>, + EVEX_4V; + def rm : AVX5128I<OpcVar, MRMSrcMem, (outs _.RC:$dst), + (ins _.RC:$src1, Ctrl.MemOp:$src2), + !strconcat("vpermil" # _.Suffix, + " \t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set _.RC:$dst, + (_.VT (X86VPermilpv _.RC:$src1, + (Ctrl.VT (Ctrl.MemOpFrag addr:$src2)))))]>, + EVEX_4V; + } } -defm VPERMQZ : avx512_perm_imm<0x00, "vpermq", VR512, X86VPermi, memopv8i64, - i512mem, v8i64>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; -let ExeDomain = SSEPackedDouble in -defm VPERMPDZ : avx512_perm_imm<0x01, "vpermpd", VR512, X86VPermi, memopv8f64, - f512mem, v8f64>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; +defm VPERMQZ : avx512_perm_imm<0x00, "vpermq", X86VPermi, v8i64_info>, + EVEX_V512, VEX_W; +defm VPERMPDZ : avx512_perm_imm<0x01, "vpermpd", X86VPermi, v8f64_info>, + EVEX_V512, VEX_W; + +defm VPERMILPSZ : avx512_permil<0x04, 0x0C, v16f32_info, v16i32_info>, + EVEX_V512; +defm VPERMILPDZ : avx512_permil<0x05, 0x0D, v8f64_info, v8i64_info>, + EVEX_V512, VEX_W; + +def : Pat<(v16i32 (X86VPermilpi VR512:$src1, (i8 imm:$imm))), + (VPERMILPSZri VR512:$src1, imm:$imm)>; +def : Pat<(v8i64 (X86VPermilpi VR512:$src1, (i8 imm:$imm))), + (VPERMILPDZri VR512:$src1, imm:$imm)>; // -- VPERM - register form -- multiclass avx512_perm<bits<8> opc, string OpcodeStr, RegisterClass RC, @@ -834,98 +1117,295 @@ defm VCMPSDZ : avx512_cmp_scalar<FR64X, f64mem, AVXCC, X86cmpms, f64, loadf64, XD, VEX_W; } -multiclass avx512_icmp_packed<bits<8> opc, string OpcodeStr, RegisterClass KRC, - RegisterClass RC, X86MemOperand x86memop, PatFrag memop_frag, - SDNode OpNode, ValueType vt> { +multiclass avx512_icmp_packed<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _> { def rr : AVX512BI<opc, MRMSrcReg, - (outs KRC:$dst), (ins RC:$src1, RC:$src2), - !strconcat(OpcodeStr, " \t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set KRC:$dst, (OpNode (vt RC:$src1), (vt RC:$src2)))], + (outs _.KRC:$dst), (ins _.RC:$src1, _.RC:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set _.KRC:$dst, (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2)))], IIC_SSE_ALU_F32P_RR>, EVEX_4V; + let mayLoad = 1 in def rm : AVX512BI<opc, MRMSrcMem, - (outs KRC:$dst), (ins RC:$src1, x86memop:$src2), - !strconcat(OpcodeStr, " \t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set KRC:$dst, (OpNode (vt RC:$src1), (memop_frag addr:$src2)))], + (outs _.KRC:$dst), (ins _.RC:$src1, 
_.MemOp:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set _.KRC:$dst, (OpNode (_.VT _.RC:$src1), + (_.VT (bitconvert (_.LdFrag addr:$src2)))))], IIC_SSE_ALU_F32P_RM>, EVEX_4V; + def rrk : AVX512BI<opc, MRMSrcReg, + (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst {${mask}}|", + "$dst {${mask}}, $src1, $src2}"), + [(set _.KRC:$dst, (and _.KRCWM:$mask, + (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2))))], + IIC_SSE_ALU_F32P_RR>, EVEX_4V, EVEX_K; + let mayLoad = 1 in + def rmk : AVX512BI<opc, MRMSrcMem, + (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.MemOp:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst {${mask}}|", + "$dst {${mask}}, $src1, $src2}"), + [(set _.KRC:$dst, (and _.KRCWM:$mask, + (OpNode (_.VT _.RC:$src1), + (_.VT (bitconvert + (_.LdFrag addr:$src2))))))], + IIC_SSE_ALU_F32P_RM>, EVEX_4V, EVEX_K; } -defm VPCMPEQDZ : avx512_icmp_packed<0x76, "vpcmpeqd", VK16, VR512, i512mem, - memopv16i32, X86pcmpeqm, v16i32>, EVEX_V512, - EVEX_CD8<32, CD8VF>; -defm VPCMPEQQZ : avx512_icmp_packed<0x29, "vpcmpeqq", VK8, VR512, i512mem, - memopv8i64, X86pcmpeqm, v8i64>, T8PD, EVEX_V512, - VEX_W, EVEX_CD8<64, CD8VF>; +multiclass avx512_icmp_packed_rmb<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _> : + avx512_icmp_packed<opc, OpcodeStr, OpNode, _> { + let mayLoad = 1 in { + def rmb : AVX512BI<opc, MRMSrcMem, + (outs _.KRC:$dst), (ins _.RC:$src1, _.ScalarMemOp:$src2), + !strconcat(OpcodeStr, "\t{${src2}", _.BroadcastStr, ", $src1, $dst", + "|$dst, $src1, ${src2}", _.BroadcastStr, "}"), + [(set _.KRC:$dst, (OpNode (_.VT _.RC:$src1), + (X86VBroadcast (_.ScalarLdFrag addr:$src2))))], + IIC_SSE_ALU_F32P_RM>, EVEX_4V, EVEX_B; + def rmbk : AVX512BI<opc, MRMSrcMem, + (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, + _.ScalarMemOp:$src2), + !strconcat(OpcodeStr, + "\t{${src2}", _.BroadcastStr, ", $src1, $dst {${mask}}|", + "$dst {${mask}}, $src1, ${src2}", _.BroadcastStr, "}"), + [(set _.KRC:$dst, (and _.KRCWM:$mask, + (OpNode (_.VT _.RC:$src1), + (X86VBroadcast + (_.ScalarLdFrag addr:$src2)))))], + IIC_SSE_ALU_F32P_RM>, EVEX_4V, EVEX_K, EVEX_B; + } +} -defm VPCMPGTDZ : avx512_icmp_packed<0x66, "vpcmpgtd", VK16, VR512, i512mem, - memopv16i32, X86pcmpgtm, v16i32>, EVEX_V512, - EVEX_CD8<32, CD8VF>; -defm VPCMPGTQZ : avx512_icmp_packed<0x37, "vpcmpgtq", VK8, VR512, i512mem, - memopv8i64, X86pcmpgtm, v8i64>, T8PD, EVEX_V512, - VEX_W, EVEX_CD8<64, CD8VF>; +multiclass avx512_icmp_packed_vl<bits<8> opc, string OpcodeStr, SDNode OpNode, + AVX512VLVectorVTInfo VTInfo, Predicate prd> { + let Predicates = [prd] in + defm Z : avx512_icmp_packed<opc, OpcodeStr, OpNode, VTInfo.info512>, + EVEX_V512; + + let Predicates = [prd, HasVLX] in { + defm Z256 : avx512_icmp_packed<opc, OpcodeStr, OpNode, VTInfo.info256>, + EVEX_V256; + defm Z128 : avx512_icmp_packed<opc, OpcodeStr, OpNode, VTInfo.info128>, + EVEX_V128; + } +} + +multiclass avx512_icmp_packed_rmb_vl<bits<8> opc, string OpcodeStr, + SDNode OpNode, AVX512VLVectorVTInfo VTInfo, + Predicate prd> { + let Predicates = [prd] in + defm Z : avx512_icmp_packed_rmb<opc, OpcodeStr, OpNode, VTInfo.info512>, + EVEX_V512; + + let Predicates = [prd, HasVLX] in { + defm Z256 : avx512_icmp_packed_rmb<opc, OpcodeStr, OpNode, VTInfo.info256>, + EVEX_V256; + defm Z128 : avx512_icmp_packed_rmb<opc, OpcodeStr, OpNode, VTInfo.info128>, + EVEX_V128; + } +} + +defm VPCMPEQB : avx512_icmp_packed_vl<0x74, "vpcmpeqb", X86pcmpeqm, + avx512vl_i8_info, HasBWI>, + 
EVEX_CD8<8, CD8VF>; + +defm VPCMPEQW : avx512_icmp_packed_vl<0x75, "vpcmpeqw", X86pcmpeqm, + avx512vl_i16_info, HasBWI>, + EVEX_CD8<16, CD8VF>; + +defm VPCMPEQD : avx512_icmp_packed_rmb_vl<0x76, "vpcmpeqd", X86pcmpeqm, + avx512vl_i32_info, HasAVX512>, + EVEX_CD8<32, CD8VF>; + +defm VPCMPEQQ : avx512_icmp_packed_rmb_vl<0x29, "vpcmpeqq", X86pcmpeqm, + avx512vl_i64_info, HasAVX512>, + T8PD, VEX_W, EVEX_CD8<64, CD8VF>; + +defm VPCMPGTB : avx512_icmp_packed_vl<0x64, "vpcmpgtb", X86pcmpgtm, + avx512vl_i8_info, HasBWI>, + EVEX_CD8<8, CD8VF>; + +defm VPCMPGTW : avx512_icmp_packed_vl<0x65, "vpcmpgtw", X86pcmpgtm, + avx512vl_i16_info, HasBWI>, + EVEX_CD8<16, CD8VF>; + +defm VPCMPGTD : avx512_icmp_packed_rmb_vl<0x66, "vpcmpgtd", X86pcmpgtm, + avx512vl_i32_info, HasAVX512>, + EVEX_CD8<32, CD8VF>; + +defm VPCMPGTQ : avx512_icmp_packed_rmb_vl<0x37, "vpcmpgtq", X86pcmpgtm, + avx512vl_i64_info, HasAVX512>, + T8PD, VEX_W, EVEX_CD8<64, CD8VF>; def : Pat<(v8i1 (X86pcmpgtm (v8i32 VR256X:$src1), (v8i32 VR256X:$src2))), - (COPY_TO_REGCLASS (VPCMPGTDZrr + (COPY_TO_REGCLASS (VPCMPGTDZrr (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src1, sub_ymm)), (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src2, sub_ymm))), VK8)>; def : Pat<(v8i1 (X86pcmpeqm (v8i32 VR256X:$src1), (v8i32 VR256X:$src2))), - (COPY_TO_REGCLASS (VPCMPEQDZrr + (COPY_TO_REGCLASS (VPCMPEQDZrr (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src1, sub_ymm)), (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src2, sub_ymm))), VK8)>; -multiclass avx512_icmp_cc<bits<8> opc, RegisterClass WMRC, RegisterClass KRC, - RegisterClass RC, X86MemOperand x86memop, PatFrag memop_frag, - SDNode OpNode, ValueType vt, Operand CC, string Suffix> { +multiclass avx512_icmp_cc<bits<8> opc, string Suffix, SDNode OpNode, + X86VectorVTInfo _> { def rri : AVX512AIi8<opc, MRMSrcReg, - (outs KRC:$dst), (ins RC:$src1, RC:$src2, CC:$cc), + (outs _.KRC:$dst), (ins _.RC:$src1, _.RC:$src2, AVXCC:$cc), !strconcat("vpcmp${cc}", Suffix, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set KRC:$dst, (OpNode (vt RC:$src1), (vt RC:$src2), imm:$cc))], + [(set _.KRC:$dst, (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2), + imm:$cc))], IIC_SSE_ALU_F32P_RR>, EVEX_4V; + let mayLoad = 1 in def rmi : AVX512AIi8<opc, MRMSrcMem, - (outs KRC:$dst), (ins RC:$src1, x86memop:$src2, CC:$cc), + (outs _.KRC:$dst), (ins _.RC:$src1, _.MemOp:$src2, AVXCC:$cc), !strconcat("vpcmp${cc}", Suffix, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set KRC:$dst, (OpNode (vt RC:$src1), (memop_frag addr:$src2), - imm:$cc))], IIC_SSE_ALU_F32P_RM>, EVEX_4V; + [(set _.KRC:$dst, (OpNode (_.VT _.RC:$src1), + (_.VT (bitconvert (_.LdFrag addr:$src2))), + imm:$cc))], + IIC_SSE_ALU_F32P_RM>, EVEX_4V; + def rrik : AVX512AIi8<opc, MRMSrcReg, + (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2, + AVXCC:$cc), + !strconcat("vpcmp${cc}", Suffix, + "\t{$src2, $src1, $dst {${mask}}|", + "$dst {${mask}}, $src1, $src2}"), + [(set _.KRC:$dst, (and _.KRCWM:$mask, + (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2), + imm:$cc)))], + IIC_SSE_ALU_F32P_RR>, EVEX_4V, EVEX_K; + let mayLoad = 1 in + def rmik : AVX512AIi8<opc, MRMSrcMem, + (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.MemOp:$src2, + AVXCC:$cc), + !strconcat("vpcmp${cc}", Suffix, + "\t{$src2, $src1, $dst {${mask}}|", + "$dst {${mask}}, $src1, $src2}"), + [(set _.KRC:$dst, (and _.KRCWM:$mask, + (OpNode (_.VT _.RC:$src1), + (_.VT (bitconvert (_.LdFrag addr:$src2))), + imm:$cc)))], + IIC_SSE_ALU_F32P_RM>, EVEX_4V, EVEX_K; + // Accept explicit immediate argument form instead of comparison code. 
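+  // That is, the assembler also accepts e.g. "vpcmpd" with a literal $cc
+  // immediate operand, in addition to the vpcmp${cc}d spellings that encode
+  // the condition in the mnemonic.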
let isAsmParserOnly = 1, hasSideEffects = 0 in { def rri_alt : AVX512AIi8<opc, MRMSrcReg, - (outs KRC:$dst), (ins RC:$src1, RC:$src2, i8imm:$cc), - !strconcat("vpcmp", Suffix, - "\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}"), + (outs _.KRC:$dst), (ins _.RC:$src1, _.RC:$src2, i8imm:$cc), + !strconcat("vpcmp", Suffix, "\t{$cc, $src2, $src1, $dst|", + "$dst, $src1, $src2, $cc}"), [], IIC_SSE_ALU_F32P_RR>, EVEX_4V; + def rmi_alt : AVX512AIi8<opc, MRMSrcMem, + (outs _.KRC:$dst), (ins _.RC:$src1, _.MemOp:$src2, i8imm:$cc), + !strconcat("vpcmp", Suffix, "\t{$cc, $src2, $src1, $dst|", + "$dst, $src1, $src2, $cc}"), + [], IIC_SSE_ALU_F32P_RM>, EVEX_4V; def rrik_alt : AVX512AIi8<opc, MRMSrcReg, - (outs KRC:$dst), (ins WMRC:$mask, RC:$src1, RC:$src2, i8imm:$cc), + (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2, + i8imm:$cc), !strconcat("vpcmp", Suffix, - "\t{$cc, $src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2, $cc}"), + "\t{$cc, $src2, $src1, $dst {${mask}}|", + "$dst {${mask}}, $src1, $src2, $cc}"), [], IIC_SSE_ALU_F32P_RR>, EVEX_4V, EVEX_K; - def rmi_alt : AVX512AIi8<opc, MRMSrcMem, - (outs KRC:$dst), (ins RC:$src1, x86memop:$src2, i8imm:$cc), - !strconcat("vpcmp", Suffix, - "\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}"), - [], IIC_SSE_ALU_F32P_RM>, EVEX_4V; def rmik_alt : AVX512AIi8<opc, MRMSrcMem, - (outs KRC:$dst), (ins WMRC:$mask, RC:$src1, x86memop:$src2, i8imm:$cc), + (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.MemOp:$src2, + i8imm:$cc), !strconcat("vpcmp", Suffix, - "\t{$cc, $src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2, $cc}"), + "\t{$cc, $src2, $src1, $dst {${mask}}|", + "$dst {${mask}}, $src1, $src2, $cc}"), [], IIC_SSE_ALU_F32P_RM>, EVEX_4V, EVEX_K; } } -defm VPCMPDZ : avx512_icmp_cc<0x1F, VK16WM, VK16, VR512, i512mem, memopv16i32, - X86cmpm, v16i32, AVXCC, "d">, - EVEX_V512, EVEX_CD8<32, CD8VF>; -defm VPCMPUDZ : avx512_icmp_cc<0x1E, VK16WM, VK16, VR512, i512mem, memopv16i32, - X86cmpmu, v16i32, AVXCC, "ud">, - EVEX_V512, EVEX_CD8<32, CD8VF>; +multiclass avx512_icmp_cc_rmb<bits<8> opc, string Suffix, SDNode OpNode, + X86VectorVTInfo _> : + avx512_icmp_cc<opc, Suffix, OpNode, _> { + let mayLoad = 1 in { + def rmib : AVX512AIi8<opc, MRMSrcMem, + (outs _.KRC:$dst), (ins _.RC:$src1, _.ScalarMemOp:$src2, + AVXCC:$cc), + !strconcat("vpcmp${cc}", Suffix, + "\t{${src2}", _.BroadcastStr, ", $src1, $dst|", + "$dst, $src1, ${src2}", _.BroadcastStr, "}"), + [(set _.KRC:$dst, (OpNode (_.VT _.RC:$src1), + (X86VBroadcast (_.ScalarLdFrag addr:$src2)), + imm:$cc))], + IIC_SSE_ALU_F32P_RM>, EVEX_4V, EVEX_B; + def rmibk : AVX512AIi8<opc, MRMSrcMem, + (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, + _.ScalarMemOp:$src2, AVXCC:$cc), + !strconcat("vpcmp${cc}", Suffix, + "\t{${src2}", _.BroadcastStr, ", $src1, $dst {${mask}}|", + "$dst {${mask}}, $src1, ${src2}", _.BroadcastStr, "}"), + [(set _.KRC:$dst, (and _.KRCWM:$mask, + (OpNode (_.VT _.RC:$src1), + (X86VBroadcast (_.ScalarLdFrag addr:$src2)), + imm:$cc)))], + IIC_SSE_ALU_F32P_RM>, EVEX_4V, EVEX_K, EVEX_B; + } + + // Accept explicit immediate argument form instead of comparison code. 
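+  // (Likewise for these embedded-broadcast forms, whose memory operand is
+  // printed with the {1toN} suffix, e.g. {1to16} for v16i32.)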
+ let isAsmParserOnly = 1, hasSideEffects = 0 in { + def rmib_alt : AVX512AIi8<opc, MRMSrcMem, + (outs _.KRC:$dst), (ins _.RC:$src1, _.ScalarMemOp:$src2, + i8imm:$cc), + !strconcat("vpcmp", Suffix, + "\t{$cc, ${src2}", _.BroadcastStr, ", $src1, $dst|", + "$dst, $src1, ${src2}", _.BroadcastStr, ", $cc}"), + [], IIC_SSE_ALU_F32P_RM>, EVEX_4V, EVEX_B; + def rmibk_alt : AVX512AIi8<opc, MRMSrcMem, + (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, + _.ScalarMemOp:$src2, i8imm:$cc), + !strconcat("vpcmp", Suffix, + "\t{$cc, ${src2}", _.BroadcastStr, ", $src1, $dst {${mask}}|", + "$dst {${mask}}, $src1, ${src2}", _.BroadcastStr, ", $cc}"), + [], IIC_SSE_ALU_F32P_RM>, EVEX_4V, EVEX_K, EVEX_B; + } +} + +multiclass avx512_icmp_cc_vl<bits<8> opc, string Suffix, SDNode OpNode, + AVX512VLVectorVTInfo VTInfo, Predicate prd> { + let Predicates = [prd] in + defm Z : avx512_icmp_cc<opc, Suffix, OpNode, VTInfo.info512>, EVEX_V512; + + let Predicates = [prd, HasVLX] in { + defm Z256 : avx512_icmp_cc<opc, Suffix, OpNode, VTInfo.info256>, EVEX_V256; + defm Z128 : avx512_icmp_cc<opc, Suffix, OpNode, VTInfo.info128>, EVEX_V128; + } +} + +multiclass avx512_icmp_cc_rmb_vl<bits<8> opc, string Suffix, SDNode OpNode, + AVX512VLVectorVTInfo VTInfo, Predicate prd> { + let Predicates = [prd] in + defm Z : avx512_icmp_cc_rmb<opc, Suffix, OpNode, VTInfo.info512>, + EVEX_V512; + + let Predicates = [prd, HasVLX] in { + defm Z256 : avx512_icmp_cc_rmb<opc, Suffix, OpNode, VTInfo.info256>, + EVEX_V256; + defm Z128 : avx512_icmp_cc_rmb<opc, Suffix, OpNode, VTInfo.info128>, + EVEX_V128; + } +} + +defm VPCMPB : avx512_icmp_cc_vl<0x3F, "b", X86cmpm, avx512vl_i8_info, + HasBWI>, EVEX_CD8<8, CD8VF>; +defm VPCMPUB : avx512_icmp_cc_vl<0x3E, "ub", X86cmpmu, avx512vl_i8_info, + HasBWI>, EVEX_CD8<8, CD8VF>; -defm VPCMPQZ : avx512_icmp_cc<0x1F, VK8WM, VK8, VR512, i512mem, memopv8i64, - X86cmpm, v8i64, AVXCC, "q">, - VEX_W, EVEX_V512, EVEX_CD8<64, CD8VF>; -defm VPCMPUQZ : avx512_icmp_cc<0x1E, VK8WM, VK8, VR512, i512mem, memopv8i64, - X86cmpmu, v8i64, AVXCC, "uq">, - VEX_W, EVEX_V512, EVEX_CD8<64, CD8VF>; +defm VPCMPW : avx512_icmp_cc_vl<0x3F, "w", X86cmpm, avx512vl_i16_info, + HasBWI>, VEX_W, EVEX_CD8<16, CD8VF>; +defm VPCMPUW : avx512_icmp_cc_vl<0x3E, "uw", X86cmpmu, avx512vl_i16_info, + HasBWI>, VEX_W, EVEX_CD8<16, CD8VF>; + +defm VPCMPD : avx512_icmp_cc_rmb_vl<0x1F, "d", X86cmpm, avx512vl_i32_info, + HasAVX512>, EVEX_CD8<32, CD8VF>; +defm VPCMPUD : avx512_icmp_cc_rmb_vl<0x1E, "ud", X86cmpmu, avx512vl_i32_info, + HasAVX512>, EVEX_CD8<32, CD8VF>; + +defm VPCMPQ : avx512_icmp_cc_rmb_vl<0x1F, "q", X86cmpm, avx512vl_i64_info, + HasAVX512>, VEX_W, EVEX_CD8<64, CD8VF>; +defm VPCMPUQ : avx512_icmp_cc_rmb_vl<0x1E, "uq", X86cmpmu, avx512vl_i64_info, + HasAVX512>, VEX_W, EVEX_CD8<64, CD8VF>; // avx512_cmp_packed - compare packed instructions multiclass avx512_cmp_packed<RegisterClass KRC, RegisterClass RC, @@ -1015,14 +1495,14 @@ def : Pat<(i8 (int_x86_avx512_mask_cmp_pd_512 (v8f64 VR512:$src1), // multiclass avx512_mask_mov<bits<8> opc_kk, bits<8> opc_km, bits<8> opc_mk, string OpcodeStr, RegisterClass KRC, - ValueType vt, X86MemOperand x86memop> { + ValueType vvt, ValueType ivt, X86MemOperand x86memop> { let hasSideEffects = 0 in { def kk : I<opc_kk, MRMSrcReg, (outs KRC:$dst), (ins KRC:$src), !strconcat(OpcodeStr, " \t{$src, $dst|$dst, $src}"), []>; let mayLoad = 1 in def km : I<opc_km, MRMSrcMem, (outs KRC:$dst), (ins x86memop:$src), !strconcat(OpcodeStr, " \t{$src, $dst|$dst, $src}"), - [(set KRC:$dst, (vt (load addr:$src)))]>; + [(set 
KRC:$dst, (vvt (bitconvert (ivt (load addr:$src)))))]>; let mayStore = 1 in def mk : I<opc_mk, MRMDestMem, (outs), (ins x86memop:$dst, KRC:$src), !strconcat(OpcodeStr, " \t{$src, $dst|$dst, $src}"), []>; @@ -1040,32 +1520,82 @@ multiclass avx512_mask_mov_gpr<bits<8> opc_kr, bits<8> opc_rk, } } -let Predicates = [HasAVX512] in { - defm KMOVW : avx512_mask_mov<0x90, 0x90, 0x91, "kmovw", VK16, v16i1, i16mem>, - VEX, PS; - defm KMOVW : avx512_mask_mov_gpr<0x92, 0x93, "kmovw", VK16, GR32>, +let Predicates = [HasDQI] in + defm KMOVB : avx512_mask_mov<0x90, 0x90, 0x91, "kmovb", VK8, v8i1, i8, + i8mem>, + avx512_mask_mov_gpr<0x92, 0x93, "kmovb", VK8, GR32>, + VEX, PD; + +let Predicates = [HasAVX512] in + defm KMOVW : avx512_mask_mov<0x90, 0x90, 0x91, "kmovw", VK16, v16i1, i16, + i16mem>, + avx512_mask_mov_gpr<0x92, 0x93, "kmovw", VK16, GR32>, VEX, PS; + +let Predicates = [HasBWI] in { + defm KMOVD : avx512_mask_mov<0x90, 0x90, 0x91, "kmovd", VK32, v32i1, i32, + i32mem>, VEX, PD, VEX_W; + defm KMOVD : avx512_mask_mov_gpr<0x92, 0x93, "kmovd", VK32, GR32>, + VEX, XD; } +let Predicates = [HasBWI] in { + defm KMOVQ : avx512_mask_mov<0x90, 0x90, 0x91, "kmovq", VK64, v64i1, i64, + i64mem>, VEX, PS, VEX_W; + defm KMOVQ : avx512_mask_mov_gpr<0x92, 0x93, "kmovq", VK64, GR64>, + VEX, XD, VEX_W; +} + +// GR from/to mask register +let Predicates = [HasDQI] in { + def : Pat<(v8i1 (bitconvert (i8 GR8:$src))), + (KMOVBkr (SUBREG_TO_REG (i32 0), GR8:$src, sub_8bit))>; + def : Pat<(i8 (bitconvert (v8i1 VK8:$src))), + (EXTRACT_SUBREG (KMOVBrk VK8:$src), sub_8bit)>; +} let Predicates = [HasAVX512] in { - // GR16 from/to 16-bit mask def : Pat<(v16i1 (bitconvert (i16 GR16:$src))), (KMOVWkr (SUBREG_TO_REG (i32 0), GR16:$src, sub_16bit))>; def : Pat<(i16 (bitconvert (v16i1 VK16:$src))), (EXTRACT_SUBREG (KMOVWrk VK16:$src), sub_16bit)>; +} +let Predicates = [HasBWI] in { + def : Pat<(v32i1 (bitconvert (i32 GR32:$src))), (KMOVDkr GR32:$src)>; + def : Pat<(i32 (bitconvert (v32i1 VK32:$src))), (KMOVDrk VK32:$src)>; +} +let Predicates = [HasBWI] in { + def : Pat<(v64i1 (bitconvert (i64 GR64:$src))), (KMOVQkr GR64:$src)>; + def : Pat<(i64 (bitconvert (v64i1 VK64:$src))), (KMOVQrk VK64:$src)>; +} - // Store kreg in memory - def : Pat<(store (v16i1 VK16:$src), addr:$dst), +// Load/store kreg +let Predicates = [HasDQI] in { + def : Pat<(store (i8 (bitconvert (v8i1 VK8:$src))), addr:$dst), + (KMOVBmk addr:$dst, VK8:$src)>; +} +let Predicates = [HasAVX512] in { + def : Pat<(store (i16 (bitconvert (v16i1 VK16:$src))), addr:$dst), (KMOVWmk addr:$dst, VK16:$src)>; - - def : Pat<(store VK8:$src, addr:$dst), + def : Pat<(store (i8 (bitconvert (v8i1 VK8:$src))), addr:$dst), (KMOVWmk addr:$dst, (COPY_TO_REGCLASS VK8:$src, VK16))>; - def : Pat<(i1 (load addr:$src)), (COPY_TO_REGCLASS (KMOVWkm addr:$src), VK1)>; - - def : Pat<(v8i1 (load addr:$src)), + def : Pat<(v8i1 (bitconvert (i8 (load addr:$src)))), (COPY_TO_REGCLASS (KMOVWkm addr:$src), VK8)>; +} +let Predicates = [HasBWI] in { + def : Pat<(store (i32 (bitconvert (v32i1 VK32:$src))), addr:$dst), + (KMOVDmk addr:$dst, VK32:$src)>; +} +let Predicates = [HasBWI] in { + def : Pat<(store (i64 (bitconvert (v64i1 VK64:$src))), addr:$dst), + (KMOVQmk addr:$dst, VK64:$src)>; +} + +let Predicates = [HasAVX512] in { + def : Pat<(i1 (trunc (i64 GR64:$src))), + (COPY_TO_REGCLASS (KMOVWkr (AND32ri (EXTRACT_SUBREG $src, sub_32bit), + (i32 1))), VK1)>; def : Pat<(i1 (trunc (i32 GR32:$src))), (COPY_TO_REGCLASS (KMOVWkr (AND32ri $src, (i32 1))), VK1)>; @@ -1078,7 +1608,7 @@ let Predicates = [HasAVX512] 
in { (COPY_TO_REGCLASS (KMOVWkr (AND32ri (SUBREG_TO_REG (i32 0), $src, sub_16bit), (i32 1))), VK1)>; - + def : Pat<(i32 (zext VK1:$src)), (AND32ri (KMOVWrk (COPY_TO_REGCLASS VK1:$src, VK16)), (i32 1))>; def : Pat<(i8 (zext VK1:$src)), @@ -1097,6 +1627,14 @@ let Predicates = [HasAVX512] in { def : Pat<(v8i1 (scalar_to_vector VK1:$src)), (COPY_TO_REGCLASS VK1:$src, VK8)>; } +let Predicates = [HasBWI] in { + def : Pat<(v32i1 (scalar_to_vector VK1:$src)), + (COPY_TO_REGCLASS VK1:$src, VK32)>; + def : Pat<(v64i1 (scalar_to_vector VK1:$src)), + (COPY_TO_REGCLASS VK1:$src, VK64)>; +} + + // With AVX-512 only, 8-bit mask is promoted to 16-bit mask. let Predicates = [HasAVX512] in { // GR from/to 8-bit mask without native support @@ -1113,26 +1651,38 @@ let Predicates = [HasAVX512] in { (COPY_TO_REGCLASS VK16:$src, VK1)>; def : Pat<(i1 (X86Vextract VK8:$src, (iPTR 0))), (COPY_TO_REGCLASS VK8:$src, VK1)>; - +} +let Predicates = [HasBWI] in { + def : Pat<(i1 (X86Vextract VK32:$src, (iPTR 0))), + (COPY_TO_REGCLASS VK32:$src, VK1)>; + def : Pat<(i1 (X86Vextract VK64:$src, (iPTR 0))), + (COPY_TO_REGCLASS VK64:$src, VK1)>; } // Mask unary operation // - KNOT multiclass avx512_mask_unop<bits<8> opc, string OpcodeStr, - RegisterClass KRC, SDPatternOperator OpNode> { - let Predicates = [HasAVX512] in + RegisterClass KRC, SDPatternOperator OpNode, + Predicate prd> { + let Predicates = [prd] in def rr : I<opc, MRMSrcReg, (outs KRC:$dst), (ins KRC:$src), !strconcat(OpcodeStr, " \t{$src, $dst|$dst, $src}"), [(set KRC:$dst, (OpNode KRC:$src))]>; } -multiclass avx512_mask_unop_w<bits<8> opc, string OpcodeStr, - SDPatternOperator OpNode> { - defm W : avx512_mask_unop<opc, !strconcat(OpcodeStr, "w"), VK16, OpNode>, - VEX, PS; +multiclass avx512_mask_unop_all<bits<8> opc, string OpcodeStr, + SDPatternOperator OpNode> { + defm B : avx512_mask_unop<opc, !strconcat(OpcodeStr, "b"), VK8, OpNode, + HasDQI>, VEX, PD; + defm W : avx512_mask_unop<opc, !strconcat(OpcodeStr, "w"), VK16, OpNode, + HasAVX512>, VEX, PS; + defm D : avx512_mask_unop<opc, !strconcat(OpcodeStr, "d"), VK32, OpNode, + HasBWI>, VEX, PD, VEX_W; + defm Q : avx512_mask_unop<opc, !strconcat(OpcodeStr, "q"), VK64, OpNode, + HasBWI>, VEX, PS, VEX_W; } -defm KNOT : avx512_mask_unop_w<0x44, "knot", not>; +defm KNOT : avx512_mask_unop_all<0x44, "knot", not>; multiclass avx512_mask_unop_int<string IntName, string InstName> { let Predicates = [HasAVX512] in @@ -1143,43 +1693,60 @@ multiclass avx512_mask_unop_int<string IntName, string InstName> { } defm : avx512_mask_unop_int<"knot", "KNOT">; +let Predicates = [HasDQI] in +def : Pat<(xor VK8:$src1, (v8i1 immAllOnesV)), (KNOTBrr VK8:$src1)>; +let Predicates = [HasAVX512] in def : Pat<(xor VK16:$src1, (v16i1 immAllOnesV)), (KNOTWrr VK16:$src1)>; +let Predicates = [HasBWI] in +def : Pat<(xor VK32:$src1, (v32i1 immAllOnesV)), (KNOTDrr VK32:$src1)>; +let Predicates = [HasBWI] in +def : Pat<(xor VK64:$src1, (v64i1 immAllOnesV)), (KNOTQrr VK64:$src1)>; + +// KNL does not support KMOVB, 8-bit mask is promoted to 16-bit +let Predicates = [HasAVX512] in { def : Pat<(xor VK8:$src1, (v8i1 immAllOnesV)), (COPY_TO_REGCLASS (KNOTWrr (COPY_TO_REGCLASS VK8:$src1, VK16)), VK8)>; -// With AVX-512, 8-bit mask is promoted to 16-bit mask. 
def : Pat<(not VK8:$src), (COPY_TO_REGCLASS (KNOTWrr (COPY_TO_REGCLASS VK8:$src, VK16)), VK8)>; +} // Mask binary operation // - KAND, KANDN, KOR, KXNOR, KXOR multiclass avx512_mask_binop<bits<8> opc, string OpcodeStr, - RegisterClass KRC, SDPatternOperator OpNode> { - let Predicates = [HasAVX512] in + RegisterClass KRC, SDPatternOperator OpNode, + Predicate prd> { + let Predicates = [prd] in def rr : I<opc, MRMSrcReg, (outs KRC:$dst), (ins KRC:$src1, KRC:$src2), !strconcat(OpcodeStr, " \t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set KRC:$dst, (OpNode KRC:$src1, KRC:$src2))]>; } -multiclass avx512_mask_binop_w<bits<8> opc, string OpcodeStr, - SDPatternOperator OpNode> { - defm W : avx512_mask_binop<opc, !strconcat(OpcodeStr, "w"), VK16, OpNode>, - VEX_4V, VEX_L, PS; +multiclass avx512_mask_binop_all<bits<8> opc, string OpcodeStr, + SDPatternOperator OpNode> { + defm B : avx512_mask_binop<opc, !strconcat(OpcodeStr, "b"), VK8, OpNode, + HasDQI>, VEX_4V, VEX_L, PD; + defm W : avx512_mask_binop<opc, !strconcat(OpcodeStr, "w"), VK16, OpNode, + HasAVX512>, VEX_4V, VEX_L, PS; + defm D : avx512_mask_binop<opc, !strconcat(OpcodeStr, "d"), VK32, OpNode, + HasBWI>, VEX_4V, VEX_L, VEX_W, PD; + defm Q : avx512_mask_binop<opc, !strconcat(OpcodeStr, "q"), VK64, OpNode, + HasBWI>, VEX_4V, VEX_L, VEX_W, PS; } def andn : PatFrag<(ops node:$i0, node:$i1), (and (not node:$i0), node:$i1)>; def xnor : PatFrag<(ops node:$i0, node:$i1), (not (xor node:$i0, node:$i1))>; let isCommutable = 1 in { - defm KAND : avx512_mask_binop_w<0x41, "kand", and>; - let isCommutable = 0 in - defm KANDN : avx512_mask_binop_w<0x42, "kandn", andn>; - defm KOR : avx512_mask_binop_w<0x45, "kor", or>; - defm KXNOR : avx512_mask_binop_w<0x46, "kxnor", xnor>; - defm KXOR : avx512_mask_binop_w<0x47, "kxor", xor>; + defm KAND : avx512_mask_binop_all<0x41, "kand", and>; + defm KOR : avx512_mask_binop_all<0x45, "kor", or>; + defm KXNOR : avx512_mask_binop_all<0x46, "kxnor", xnor>; + defm KXOR : avx512_mask_binop_all<0x47, "kxor", xor>; } +let isCommutable = 0 in + defm KANDN : avx512_mask_binop_all<0x42, "kandn", andn>; def : Pat<(xor VK1:$src1, VK1:$src2), (COPY_TO_REGCLASS (KXORWrr (COPY_TO_REGCLASS VK1:$src1, VK16), @@ -1325,6 +1892,17 @@ def : Pat<(v16i1 (insert_subvector undef, (v8i1 VK8:$src), (iPTR 0))), def : Pat<(v8i1 (extract_subvector (v16i1 VK16:$src), (iPTR 8))), (v8i1 (COPY_TO_REGCLASS (KSHIFTRWri VK16:$src, (i8 8)), VK8))>; +let Predicates = [HasVLX] in { + def : Pat<(v8i1 (insert_subvector undef, (v4i1 VK4:$src), (iPTR 0))), + (v8i1 (COPY_TO_REGCLASS VK4:$src, VK8))>; + def : Pat<(v8i1 (insert_subvector undef, (v2i1 VK2:$src), (iPTR 0))), + (v8i1 (COPY_TO_REGCLASS VK2:$src, VK8))>; + def : Pat<(v4i1 (extract_subvector (v8i1 VK8:$src), (iPTR 0))), + (v4i1 (COPY_TO_REGCLASS VK8:$src, VK4))>; + def : Pat<(v2i1 (extract_subvector (v8i1 VK8:$src), (iPTR 0))), + (v2i1 (COPY_TO_REGCLASS VK8:$src, VK2))>; +} + def : Pat<(v8i1 (X86vshli VK8:$src, (i8 imm:$imm))), (v8i1 (COPY_TO_REGCLASS (KSHIFTLWri (COPY_TO_REGCLASS VK8:$src, VK16), (I8Imm $imm)), VK8))>; @@ -1334,104 +1912,176 @@ def : Pat<(v8i1 (X86vsrli VK8:$src, (i8 imm:$imm))), // AVX-512 - Aligned and unaligned load and store // -multiclass avx512_load<bits<8> opc, RegisterClass RC, RegisterClass KRC, - X86MemOperand x86memop, PatFrag ld_frag, - string asm, Domain d, - ValueType vt, bit IsReMaterializable = 1> { +multiclass avx512_load<bits<8> opc, string OpcodeStr, PatFrag ld_frag, + RegisterClass KRC, RegisterClass RC, + ValueType vt, ValueType zvt, X86MemOperand memop, + 
Domain d, bit IsReMaterializable = 1> { let hasSideEffects = 0 in { def rr : AVX512PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src), - !strconcat(asm, " \t{$src, $dst|$dst, $src}"), [], d>, - EVEX; + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [], + d>, EVEX; def rrkz : AVX512PI<opc, MRMSrcReg, (outs RC:$dst), (ins KRC:$mask, RC:$src), - !strconcat(asm, - " \t{$src, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src}"), - [], d>, EVEX, EVEX_KZ; + !strconcat(OpcodeStr, "\t{$src, ${dst} {${mask}} {z}|", + "${dst} {${mask}} {z}, $src}"), [], d>, EVEX, EVEX_KZ; } - let canFoldAsLoad = 1, isReMaterializable = IsReMaterializable in - def rm : AVX512PI<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), - !strconcat(asm, " \t{$src, $dst|$dst, $src}"), - [(set (vt RC:$dst), (ld_frag addr:$src))], d>, EVEX; - let Constraints = "$src1 = $dst", hasSideEffects = 0 in { - def rrk : AVX512PI<opc, MRMSrcReg, (outs RC:$dst), - (ins RC:$src1, KRC:$mask, RC:$src2), - !strconcat(asm, - " \t{$src2, ${dst} {${mask}}|${dst} {${mask}}, $src2}"), [], d>, - EVEX, EVEX_K; - let mayLoad = 1 in - def rmk : AVX512PI<opc, MRMSrcMem, (outs RC:$dst), - (ins RC:$src1, KRC:$mask, x86memop:$src2), - !strconcat(asm, - " \t{$src2, ${dst} {${mask}}|${dst} {${mask}}, $src2}"), - [], d>, EVEX, EVEX_K; + let canFoldAsLoad = 1, isReMaterializable = IsReMaterializable, + SchedRW = [WriteLoad] in + def rm : AVX512PI<opc, MRMSrcMem, (outs RC:$dst), (ins memop:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set RC:$dst, (vt (bitconvert (ld_frag addr:$src))))], + d>, EVEX; + + let AddedComplexity = 20 in { + let Constraints = "$src0 = $dst", hasSideEffects = 0 in { + let hasSideEffects = 0 in + def rrk : AVX512PI<opc, MRMSrcReg, (outs RC:$dst), + (ins RC:$src0, KRC:$mask, RC:$src1), + !strconcat(OpcodeStr, "\t{$src1, ${dst} {${mask}}|", + "${dst} {${mask}}, $src1}"), + [(set RC:$dst, (vt (vselect KRC:$mask, + (vt RC:$src1), + (vt RC:$src0))))], + d>, EVEX, EVEX_K; + let mayLoad = 1, SchedRW = [WriteLoad] in + def rmk : AVX512PI<opc, MRMSrcMem, (outs RC:$dst), + (ins RC:$src0, KRC:$mask, memop:$src1), + !strconcat(OpcodeStr, "\t{$src1, ${dst} {${mask}}|", + "${dst} {${mask}}, $src1}"), + [(set RC:$dst, (vt + (vselect KRC:$mask, + (vt (bitconvert (ld_frag addr:$src1))), + (vt RC:$src0))))], + d>, EVEX, EVEX_K; + } + let mayLoad = 1, SchedRW = [WriteLoad] in + def rmkz : AVX512PI<opc, MRMSrcMem, (outs RC:$dst), + (ins KRC:$mask, memop:$src), + !strconcat(OpcodeStr, "\t{$src, ${dst} {${mask}} {z}|", + "${dst} {${mask}} {z}, $src}"), + [(set RC:$dst, (vt + (vselect KRC:$mask, + (vt (bitconvert (ld_frag addr:$src))), + (vt (bitconvert (zvt immAllZerosV))))))], + d>, EVEX, EVEX_KZ; + } +} + +multiclass avx512_load_vl<bits<8> opc, string OpcodeStr, string ld_pat, + string elty, string elsz, string vsz512, + string vsz256, string vsz128, Domain d, + Predicate prd, bit IsReMaterializable = 1> { + let Predicates = [prd] in + defm Z : avx512_load<opc, OpcodeStr, + !cast<PatFrag>(ld_pat##"v"##vsz512##elty##elsz), + !cast<RegisterClass>("VK"##vsz512##"WM"), VR512, + !cast<ValueType>("v"##vsz512##elty##elsz), v16i32, + !cast<X86MemOperand>(elty##"512mem"), d, + IsReMaterializable>, EVEX_V512; + + let Predicates = [prd, HasVLX] in { + defm Z256 : avx512_load<opc, OpcodeStr, + !cast<PatFrag>(ld_pat##!if(!eq(elty,"f"), + "v"##vsz256##elty##elsz, "v4i64")), + !cast<RegisterClass>("VK"##vsz256##"WM"), VR256X, + !cast<ValueType>("v"##vsz256##elty##elsz), v8i32, + !cast<X86MemOperand>(elty##"256mem"), d, + IsReMaterializable>, 
EVEX_V256; + + defm Z128 : avx512_load<opc, OpcodeStr, + !cast<PatFrag>(ld_pat##!if(!eq(elty,"f"), + "v"##vsz128##elty##elsz, "v2i64")), + !cast<RegisterClass>("VK"##vsz128##"WM"), VR128X, + !cast<ValueType>("v"##vsz128##elty##elsz), v4i32, + !cast<X86MemOperand>(elty##"128mem"), d, + IsReMaterializable>, EVEX_V128; } - let mayLoad = 1 in - def rmkz : AVX512PI<opc, MRMSrcMem, (outs RC:$dst), - (ins KRC:$mask, x86memop:$src2), - !strconcat(asm, - " \t{$src2, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src2}"), - [], d>, EVEX, EVEX_KZ; } -multiclass avx512_store<bits<8> opc, RegisterClass RC, RegisterClass KRC, - X86MemOperand x86memop, PatFrag store_frag, - string asm, Domain d, ValueType vt> { + +multiclass avx512_store<bits<8> opc, string OpcodeStr, PatFrag st_frag, + ValueType OpVT, RegisterClass KRC, RegisterClass RC, + X86MemOperand memop, Domain d> { let isAsmParserOnly = 1, hasSideEffects = 0 in { def rr_alt : AVX512PI<opc, MRMDestReg, (outs RC:$dst), (ins RC:$src), - !strconcat(asm, " \t{$src, $dst|$dst, $src}"), [], d>, + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [], d>, EVEX; let Constraints = "$src1 = $dst" in - def alt_rrk : AVX512PI<opc, MRMDestReg, (outs RC:$dst), - (ins RC:$src1, KRC:$mask, RC:$src2), - !strconcat(asm, - " \t{$src2, ${dst} {${mask}}|${dst} {${mask}}, $src2}"), [], d>, + def rrk_alt : AVX512PI<opc, MRMDestReg, (outs RC:$dst), + (ins RC:$src1, KRC:$mask, RC:$src2), + !strconcat(OpcodeStr, + "\t{$src2, ${dst} {${mask}}|${dst} {${mask}}, $src2}"), [], d>, EVEX, EVEX_K; - def alt_rrkz : AVX512PI<opc, MRMDestReg, (outs RC:$dst), + def rrkz_alt : AVX512PI<opc, MRMDestReg, (outs RC:$dst), (ins KRC:$mask, RC:$src), - !strconcat(asm, - " \t{$src, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src}"), + !strconcat(OpcodeStr, + "\t{$src, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src}"), [], d>, EVEX, EVEX_KZ; } let mayStore = 1 in { - def mr : AVX512PI<opc, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src), - !strconcat(asm, " \t{$src, $dst|$dst, $src}"), - [(store_frag (vt RC:$src), addr:$dst)], d>, EVEX; + def mr : AVX512PI<opc, MRMDestMem, (outs), (ins memop:$dst, RC:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(st_frag (OpVT RC:$src), addr:$dst)], d>, EVEX; def mrk : AVX512PI<opc, MRMDestMem, (outs), - (ins x86memop:$dst, KRC:$mask, RC:$src), - !strconcat(asm, - " \t{$src, ${dst} {${mask}}|${dst} {${mask}}, $src}"), + (ins memop:$dst, KRC:$mask, RC:$src), + !strconcat(OpcodeStr, + "\t{$src, ${dst} {${mask}}|${dst} {${mask}}, $src}"), [], d>, EVEX, EVEX_K; - def mrkz : AVX512PI<opc, MRMDestMem, (outs), - (ins x86memop:$dst, KRC:$mask, RC:$src), - !strconcat(asm, - " \t{$src, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src}"), - [], d>, EVEX, EVEX_KZ; } } -defm VMOVAPSZ : avx512_load<0x28, VR512, VK16WM, f512mem, alignedloadv16f32, - "vmovaps", SSEPackedSingle, v16f32>, - avx512_store<0x29, VR512, VK16WM, f512mem, alignedstore512, - "vmovaps", SSEPackedSingle, v16f32>, - PS, EVEX_V512, EVEX_CD8<32, CD8VF>; -defm VMOVAPDZ : avx512_load<0x28, VR512, VK8WM, f512mem, alignedloadv8f64, - "vmovapd", SSEPackedDouble, v8f64>, - avx512_store<0x29, VR512, VK8WM, f512mem, alignedstore512, - "vmovapd", SSEPackedDouble, v8f64>, - PD, EVEX_V512, VEX_W, - EVEX_CD8<64, CD8VF>; -defm VMOVUPSZ : avx512_load<0x10, VR512, VK16WM, f512mem, loadv16f32, - "vmovups", SSEPackedSingle, v16f32>, - avx512_store<0x11, VR512, VK16WM, f512mem, store, - "vmovups", SSEPackedSingle, v16f32>, - PS, EVEX_V512, EVEX_CD8<32, CD8VF>; -defm VMOVUPDZ : avx512_load<0x10, VR512, 
VK8WM, f512mem, loadv8f64, - "vmovupd", SSEPackedDouble, v8f64, 0>, - avx512_store<0x11, VR512, VK8WM, f512mem, store, - "vmovupd", SSEPackedDouble, v8f64>, - PD, EVEX_V512, VEX_W, - EVEX_CD8<64, CD8VF>; + +multiclass avx512_store_vl<bits<8> opc, string OpcodeStr, string st_pat, + string st_suff_512, string st_suff_256, + string st_suff_128, string elty, string elsz, + string vsz512, string vsz256, string vsz128, + Domain d, Predicate prd> { + let Predicates = [prd] in + defm Z : avx512_store<opc, OpcodeStr, !cast<PatFrag>(st_pat##st_suff_512), + !cast<ValueType>("v"##vsz512##elty##elsz), + !cast<RegisterClass>("VK"##vsz512##"WM"), VR512, + !cast<X86MemOperand>(elty##"512mem"), d>, EVEX_V512; + + let Predicates = [prd, HasVLX] in { + defm Z256 : avx512_store<opc, OpcodeStr, !cast<PatFrag>(st_pat##st_suff_256), + !cast<ValueType>("v"##vsz256##elty##elsz), + !cast<RegisterClass>("VK"##vsz256##"WM"), VR256X, + !cast<X86MemOperand>(elty##"256mem"), d>, EVEX_V256; + + defm Z128 : avx512_store<opc, OpcodeStr, !cast<PatFrag>(st_pat##st_suff_128), + !cast<ValueType>("v"##vsz128##elty##elsz), + !cast<RegisterClass>("VK"##vsz128##"WM"), VR128X, + !cast<X86MemOperand>(elty##"128mem"), d>, EVEX_V128; + } +} + +defm VMOVAPS : avx512_load_vl<0x28, "vmovaps", "alignedload", "f", "32", + "16", "8", "4", SSEPackedSingle, HasAVX512>, + avx512_store_vl<0x29, "vmovaps", "alignedstore", + "512", "256", "", "f", "32", "16", "8", "4", + SSEPackedSingle, HasAVX512>, + PS, EVEX_CD8<32, CD8VF>; + +defm VMOVAPD : avx512_load_vl<0x28, "vmovapd", "alignedload", "f", "64", + "8", "4", "2", SSEPackedDouble, HasAVX512>, + avx512_store_vl<0x29, "vmovapd", "alignedstore", + "512", "256", "", "f", "64", "8", "4", "2", + SSEPackedDouble, HasAVX512>, + PD, VEX_W, EVEX_CD8<64, CD8VF>; + +defm VMOVUPS : avx512_load_vl<0x10, "vmovups", "load", "f", "32", + "16", "8", "4", SSEPackedSingle, HasAVX512>, + avx512_store_vl<0x11, "vmovups", "store", "", "", "", "f", "32", + "16", "8", "4", SSEPackedSingle, HasAVX512>, + PS, EVEX_CD8<32, CD8VF>; + +defm VMOVUPD : avx512_load_vl<0x10, "vmovupd", "load", "f", "64", + "8", "4", "2", SSEPackedDouble, HasAVX512, 0>, + avx512_store_vl<0x11, "vmovupd", "store", "", "", "", "f", "64", + "8", "4", "2", SSEPackedDouble, HasAVX512>, + PD, VEX_W, EVEX_CD8<64, CD8VF>; + def: Pat<(v8f64 (int_x86_avx512_mask_loadu_pd_512 addr:$ptr, - (bc_v8f64 (v16i32 immAllZerosV)), GR8:$mask)), + (bc_v8f64 (v16i32 immAllZerosV)), GR8:$mask)), (VMOVUPDZrmkz (v8i1 (COPY_TO_REGCLASS GR8:$mask, VK8WM)), addr:$ptr)>; def: Pat<(v16f32 (int_x86_avx512_mask_loadu_ps_512 addr:$ptr, @@ -1447,75 +2097,80 @@ def: Pat<(int_x86_avx512_mask_storeu_pd_512 addr:$ptr, (v8f64 VR512:$src), (VMOVUPDZmrk addr:$ptr, (v8i1 (COPY_TO_REGCLASS GR8:$mask, VK8WM)), VR512:$src)>; -defm VMOVDQA32: avx512_load<0x6F, VR512, VK16WM, i512mem, alignedloadv16i32, - "vmovdqa32", SSEPackedInt, v16i32>, - avx512_store<0x7F, VR512, VK16WM, i512mem, alignedstore512, - "vmovdqa32", SSEPackedInt, v16i32>, - PD, EVEX_V512, EVEX_CD8<32, CD8VF>; -defm VMOVDQA64: avx512_load<0x6F, VR512, VK8WM, i512mem, alignedloadv8i64, - "vmovdqa64", SSEPackedInt, v8i64>, - avx512_store<0x7F, VR512, VK8WM, i512mem, alignedstore512, - "vmovdqa64", SSEPackedInt, v8i64>, - PD, VEX_W, EVEX_V512, EVEX_CD8<64, CD8VF>; -defm VMOVDQU32: avx512_load<0x6F, VR512, VK16WM, i512mem, load, - "vmovdqu32", SSEPackedInt, v16i32>, - avx512_store<0x7F, VR512, VK16WM, i512mem, store, - "vmovdqu32", SSEPackedInt, v16i32>, - XS, EVEX_V512, EVEX_CD8<32, CD8VF>; -defm VMOVDQU64: avx512_load<0x6F, 
VR512, VK8WM, i512mem, load, - "vmovdqu64", SSEPackedInt, v8i64>, - avx512_store<0x7F, VR512, VK8WM, i512mem, store, - "vmovdqu64", SSEPackedInt, v8i64>, - XS, VEX_W, EVEX_V512, EVEX_CD8<64, CD8VF>; +defm VMOVDQA32 : avx512_load_vl<0x6F, "vmovdqa32", "alignedload", "i", "32", + "16", "8", "4", SSEPackedInt, HasAVX512>, + avx512_store_vl<0x7F, "vmovdqa32", "alignedstore", + "512", "256", "", "i", "32", "16", "8", "4", + SSEPackedInt, HasAVX512>, + PD, EVEX_CD8<32, CD8VF>; + +defm VMOVDQA64 : avx512_load_vl<0x6F, "vmovdqa64", "alignedload", "i", "64", + "8", "4", "2", SSEPackedInt, HasAVX512>, + avx512_store_vl<0x7F, "vmovdqa64", "alignedstore", + "512", "256", "", "i", "64", "8", "4", "2", + SSEPackedInt, HasAVX512>, + PD, VEX_W, EVEX_CD8<64, CD8VF>; + +defm VMOVDQU8 : avx512_load_vl<0x6F, "vmovdqu8", "load", "i", "8", + "64", "32", "16", SSEPackedInt, HasBWI>, + avx512_store_vl<0x7F, "vmovdqu8", "store", "", "", "", + "i", "8", "64", "32", "16", SSEPackedInt, + HasBWI>, XD, EVEX_CD8<8, CD8VF>; + +defm VMOVDQU16 : avx512_load_vl<0x6F, "vmovdqu16", "load", "i", "16", + "32", "16", "8", SSEPackedInt, HasBWI>, + avx512_store_vl<0x7F, "vmovdqu16", "store", "", "", "", + "i", "16", "32", "16", "8", SSEPackedInt, + HasBWI>, XD, VEX_W, EVEX_CD8<16, CD8VF>; + +defm VMOVDQU32 : avx512_load_vl<0x6F, "vmovdqu32", "load", "i", "32", + "16", "8", "4", SSEPackedInt, HasAVX512>, + avx512_store_vl<0x7F, "vmovdqu32", "store", "", "", "", + "i", "32", "16", "8", "4", SSEPackedInt, + HasAVX512>, XS, EVEX_CD8<32, CD8VF>; + +defm VMOVDQU64 : avx512_load_vl<0x6F, "vmovdqu64", "load", "i", "64", + "8", "4", "2", SSEPackedInt, HasAVX512>, + avx512_store_vl<0x7F, "vmovdqu64", "store", "", "", "", + "i", "64", "8", "4", "2", SSEPackedInt, + HasAVX512>, XS, VEX_W, EVEX_CD8<64, CD8VF>; def: Pat<(v16i32 (int_x86_avx512_mask_loadu_d_512 addr:$ptr, (v16i32 immAllZerosV), GR16:$mask)), - (VMOVDQU32rmkz (v16i1 (COPY_TO_REGCLASS GR16:$mask, VK16WM)), addr:$ptr)>; + (VMOVDQU32Zrmkz (v16i1 (COPY_TO_REGCLASS GR16:$mask, VK16WM)), addr:$ptr)>; def: Pat<(v8i64 (int_x86_avx512_mask_loadu_q_512 addr:$ptr, - (bc_v8i64 (v16i32 immAllZerosV)), GR8:$mask)), - (VMOVDQU64rmkz (v8i1 (COPY_TO_REGCLASS GR8:$mask, VK8WM)), addr:$ptr)>; + (bc_v8i64 (v16i32 immAllZerosV)), GR8:$mask)), + (VMOVDQU64Zrmkz (v8i1 (COPY_TO_REGCLASS GR8:$mask, VK8WM)), addr:$ptr)>; def: Pat<(int_x86_avx512_mask_storeu_d_512 addr:$ptr, (v16i32 VR512:$src), - GR16:$mask), - (VMOVDQU32mrk addr:$ptr, (v16i1 (COPY_TO_REGCLASS GR16:$mask, VK16WM)), + GR16:$mask), + (VMOVDQU32Zmrk addr:$ptr, (v16i1 (COPY_TO_REGCLASS GR16:$mask, VK16WM)), VR512:$src)>; def: Pat<(int_x86_avx512_mask_storeu_q_512 addr:$ptr, (v8i64 VR512:$src), - GR8:$mask), - (VMOVDQU64mrk addr:$ptr, (v8i1 (COPY_TO_REGCLASS GR8:$mask, VK8WM)), + GR8:$mask), + (VMOVDQU64Zmrk addr:$ptr, (v8i1 (COPY_TO_REGCLASS GR8:$mask, VK8WM)), VR512:$src)>; let AddedComplexity = 20 in { def : Pat<(v8i64 (vselect VK8WM:$mask, (v8i64 VR512:$src), - (bc_v8i64 (v16i32 immAllZerosV)))), - (VMOVDQU64rrkz VK8WM:$mask, VR512:$src)>; + (bc_v8i64 (v16i32 immAllZerosV)))), + (VMOVDQU64Zrrkz VK8WM:$mask, VR512:$src)>; def : Pat<(v8i64 (vselect VK8WM:$mask, (bc_v8i64 (v16i32 immAllZerosV)), - (v8i64 VR512:$src))), - (VMOVDQU64rrkz (COPY_TO_REGCLASS (KNOTWrr (COPY_TO_REGCLASS VK8:$mask, VK16)), + (v8i64 VR512:$src))), + (VMOVDQU64Zrrkz (COPY_TO_REGCLASS (KNOTWrr (COPY_TO_REGCLASS VK8:$mask, VK16)), VK8), VR512:$src)>; def : Pat<(v16i32 (vselect VK16WM:$mask, (v16i32 VR512:$src), (v16i32 immAllZerosV))), - (VMOVDQU32rrkz VK16WM:$mask, 
VR512:$src)>; + (VMOVDQU32Zrrkz VK16WM:$mask, VR512:$src)>; def : Pat<(v16i32 (vselect VK16WM:$mask, (v16i32 immAllZerosV), - (v16i32 VR512:$src))), - (VMOVDQU32rrkz (KNOTWrr VK16WM:$mask), VR512:$src)>; - -def : Pat<(v16f32 (vselect VK16WM:$mask, (v16f32 VR512:$src1), - (v16f32 VR512:$src2))), - (VMOVUPSZrrk VR512:$src2, VK16WM:$mask, VR512:$src1)>; -def : Pat<(v8f64 (vselect VK8WM:$mask, (v8f64 VR512:$src1), - (v8f64 VR512:$src2))), - (VMOVUPDZrrk VR512:$src2, VK8WM:$mask, VR512:$src1)>; -def : Pat<(v16i32 (vselect VK16WM:$mask, (v16i32 VR512:$src1), - (v16i32 VR512:$src2))), - (VMOVDQU32rrk VR512:$src2, VK16WM:$mask, VR512:$src1)>; -def : Pat<(v8i64 (vselect VK8WM:$mask, (v8i64 VR512:$src1), - (v8i64 VR512:$src2))), - (VMOVDQU64rrk VR512:$src2, VK8WM:$mask, VR512:$src1)>; + (v16i32 VR512:$src))), + (VMOVDQU32Zrrkz (KNOTWrr VK16WM:$mask), VR512:$src)>; } + // Move Int Doubleword to Packed Double Int // def VMOVDI2PDIZrr : AVX512BI<0x6E, MRMSrcReg, (outs VR128X:$dst), (ins GR32:$src), @@ -1641,10 +2296,16 @@ multiclass avx512_move_scalar <string asm, RegisterClass RC, !strconcat(asm, " \t{$src, $dst|$dst, $src}"), [(set RC:$dst, (mem_pat addr:$src))], IIC_SSE_MOV_S_RM>, EVEX, VEX_LIG; + let mayStore = 1 in { def mr: SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src), !strconcat(asm, " \t{$src, $dst|$dst, $src}"), [(store RC:$src, addr:$dst)], IIC_SSE_MOV_S_MR>, EVEX, VEX_LIG; + def mrk: SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, VK1WM:$mask, RC:$src), + !strconcat(asm, " \t{$src, $dst {${mask}}|$dst {${mask}}, $src}"), + [], IIC_SSE_MOV_S_MR>, + EVEX, VEX_LIG, EVEX_K; + } // mayStore } //hasSideEffects = 0 } @@ -1664,6 +2325,10 @@ def : Pat<(f64 (X86select VK1WM:$mask, (f64 FR64X:$src1), (f64 FR64X:$src2))), (COPY_TO_REGCLASS (VMOVSDZrrk (COPY_TO_REGCLASS FR64X:$src2, VR128X), VK1WM:$mask, (f64 (IMPLICIT_DEF)), FR64X:$src1), FR64X)>; +def : Pat<(int_x86_avx512_mask_store_ss addr:$dst, VR128X:$src, GR8:$mask), + (VMOVSSZmrk addr:$dst, (i1 (COPY_TO_REGCLASS GR8:$mask, VK1WM)), + (COPY_TO_REGCLASS VR128X:$src, FR32X))>; + // For the disassembler let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in { def VMOVSSZrr_REV : SI<0x11, MRMDestReg, (outs VR128X:$dst), @@ -1882,136 +2547,201 @@ def : Pat<(v8i64 (X86Vinsert undef, GR64:$src2, (iPTR 0))), //===----------------------------------------------------------------------===// // AVX-512 - Non-temporals //===----------------------------------------------------------------------===// +let SchedRW = [WriteLoad] in { + def VMOVNTDQAZrm : AVX512PI<0x2A, MRMSrcMem, (outs VR512:$dst), + (ins i512mem:$src), "vmovntdqa\t{$src, $dst|$dst, $src}", + [(set VR512:$dst, (int_x86_avx512_movntdqa addr:$src))], + SSEPackedInt>, EVEX, T8PD, EVEX_V512, + EVEX_CD8<64, CD8VF>; + + let Predicates = [HasAVX512, HasVLX] in { + def VMOVNTDQAZ256rm : AVX512PI<0x2A, MRMSrcMem, (outs VR256X:$dst), + (ins i256mem:$src), + "vmovntdqa\t{$src, $dst|$dst, $src}", [], + SSEPackedInt>, EVEX, T8PD, EVEX_V256, + EVEX_CD8<64, CD8VF>; + + def VMOVNTDQAZ128rm : AVX512PI<0x2A, MRMSrcMem, (outs VR128X:$dst), + (ins i128mem:$src), + "vmovntdqa\t{$src, $dst|$dst, $src}", [], + SSEPackedInt>, EVEX, T8PD, EVEX_V128, + EVEX_CD8<64, CD8VF>; + } +} + +multiclass avx512_movnt<bits<8> opc, string OpcodeStr, PatFrag st_frag, + ValueType OpVT, RegisterClass RC, X86MemOperand memop, + Domain d, InstrItinClass itin = IIC_SSE_MOVNT> { + let SchedRW = [WriteStore], mayStore = 1, + AddedComplexity = 400 in + def mr : AVX512PI<opc, MRMDestMem, (outs), (ins memop:$dst, 
RC:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(st_frag (OpVT RC:$src), addr:$dst)], d, itin>, EVEX; +} -def VMOVNTDQAZrm : AVX5128I<0x2A, MRMSrcMem, (outs VR512:$dst), - (ins i512mem:$src), - "vmovntdqa\t{$src, $dst|$dst, $src}", - [(set VR512:$dst, - (int_x86_avx512_movntdqa addr:$src))]>, - EVEX, EVEX_V512, EVEX_CD8<64, CD8VF>; - -// Prefer non-temporal over temporal versions -let AddedComplexity = 400, SchedRW = [WriteStore] in { - -def VMOVNTPSZmr : AVX512PSI<0x2B, MRMDestMem, (outs), - (ins f512mem:$dst, VR512:$src), - "vmovntps\t{$src, $dst|$dst, $src}", - [(alignednontemporalstore (v16f32 VR512:$src), - addr:$dst)], - IIC_SSE_MOVNT>, - EVEX, EVEX_V512, EVEX_CD8<32, CD8VF>; - -def VMOVNTPDZmr : AVX512PDI<0x2B, MRMDestMem, (outs), - (ins f512mem:$dst, VR512:$src), - "vmovntpd\t{$src, $dst|$dst, $src}", - [(alignednontemporalstore (v8f64 VR512:$src), - addr:$dst)], - IIC_SSE_MOVNT>, - EVEX, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; - - -def VMOVNTDQZmr : AVX512BI<0xE7, MRMDestMem, (outs), - (ins i512mem:$dst, VR512:$src), - "vmovntdq\t{$src, $dst|$dst, $src}", - [(alignednontemporalstore (v8i64 VR512:$src), - addr:$dst)], - IIC_SSE_MOVNT>, - EVEX, EVEX_V512, EVEX_CD8<64, CD8VF>; +multiclass avx512_movnt_vl<bits<8> opc, string OpcodeStr, PatFrag st_frag, + string elty, string elsz, string vsz512, + string vsz256, string vsz128, Domain d, + Predicate prd, InstrItinClass itin = IIC_SSE_MOVNT> { + let Predicates = [prd] in + defm Z : avx512_movnt<opc, OpcodeStr, st_frag, + !cast<ValueType>("v"##vsz512##elty##elsz), VR512, + !cast<X86MemOperand>(elty##"512mem"), d, itin>, + EVEX_V512; + + let Predicates = [prd, HasVLX] in { + defm Z256 : avx512_movnt<opc, OpcodeStr, st_frag, + !cast<ValueType>("v"##vsz256##elty##elsz), VR256X, + !cast<X86MemOperand>(elty##"256mem"), d, itin>, + EVEX_V256; + + defm Z128 : avx512_movnt<opc, OpcodeStr, st_frag, + !cast<ValueType>("v"##vsz128##elty##elsz), VR128X, + !cast<X86MemOperand>(elty##"128mem"), d, itin>, + EVEX_V128; + } } +defm VMOVNTDQ : avx512_movnt_vl<0xE7, "vmovntdq", alignednontemporalstore, + "i", "64", "8", "4", "2", SSEPackedInt, + HasAVX512>, PD, EVEX_CD8<64, CD8VF>; + +defm VMOVNTPD : avx512_movnt_vl<0x2B, "vmovntpd", alignednontemporalstore, + "f", "64", "8", "4", "2", SSEPackedDouble, + HasAVX512>, PD, VEX_W, EVEX_CD8<64, CD8VF>; + +defm VMOVNTPS : avx512_movnt_vl<0x2B, "vmovntps", alignednontemporalstore, + "f", "32", "16", "8", "4", SSEPackedSingle, + HasAVX512>, PS, EVEX_CD8<32, CD8VF>; + //===----------------------------------------------------------------------===// // AVX-512 - Integer arithmetic // multiclass avx512_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, - ValueType OpVT, RegisterClass KRC, - RegisterClass RC, PatFrag memop_frag, - X86MemOperand x86memop, PatFrag scalar_mfrag, - X86MemOperand x86scalar_mop, string BrdcstStr, - OpndItins itins, bit IsCommutable = 0> { - let isCommutable = IsCommutable in - def rr : AVX512BI<opc, MRMSrcReg, (outs RC:$dst), - (ins RC:$src1, RC:$src2), - !strconcat(OpcodeStr, " \t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set RC:$dst, (OpVT (OpNode (OpVT RC:$src1), (OpVT RC:$src2))))], - itins.rr>, EVEX_4V; - let AddedComplexity = 30 in { - let Constraints = "$src0 = $dst" in - def rrk : AVX512BI<opc, MRMSrcReg, (outs RC:$dst), - (ins RC:$src0, KRC:$mask, RC:$src1, RC:$src2), - !strconcat(OpcodeStr, - " \t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}"), - [(set RC:$dst, (OpVT (vselect KRC:$mask, - (OpNode (OpVT RC:$src1), (OpVT RC:$src2)), - 
RC:$src0)))], - itins.rr>, EVEX_4V, EVEX_K; - def rrkz : AVX512BI<opc, MRMSrcReg, (outs RC:$dst), - (ins KRC:$mask, RC:$src1, RC:$src2), - !strconcat(OpcodeStr, " \t{$src2, $src1, $dst {${mask}} {z}" , - "|$dst {${mask}} {z}, $src1, $src2}"), - [(set RC:$dst, (OpVT (vselect KRC:$mask, - (OpNode (OpVT RC:$src1), (OpVT RC:$src2)), - (OpVT immAllZerosV))))], - itins.rr>, EVEX_4V, EVEX_KZ; + X86VectorVTInfo _, OpndItins itins, + bit IsCommutable = 0> { + defm rr : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.RC:$src2), OpcodeStr##_.Suffix, + "$src2, $src1", "$src1, $src2", + (_.VT (OpNode _.RC:$src1, _.RC:$src2)), + "", itins.rr, IsCommutable>, + AVX512BIBase, EVEX_4V; + + let mayLoad = 1 in + defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr##_.Suffix, + "$src2, $src1", "$src1, $src2", + (_.VT (OpNode _.RC:$src1, + (bitconvert (_.LdFrag addr:$src2)))), + "", itins.rm>, + AVX512BIBase, EVEX_4V; +} + +multiclass avx512_binop_rmb<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _, OpndItins itins, + bit IsCommutable = 0> : + avx512_binop_rm<opc, OpcodeStr, OpNode, _, itins, IsCommutable> { + let mayLoad = 1 in + defm rmb : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr##_.Suffix, + "${src2}"##_.BroadcastStr##", $src1", + "$src1, ${src2}"##_.BroadcastStr, + (_.VT (OpNode _.RC:$src1, + (X86VBroadcast + (_.ScalarLdFrag addr:$src2)))), + "", itins.rm>, + AVX512BIBase, EVEX_4V, EVEX_B; +} + +multiclass avx512_binop_rm_vl<bits<8> opc, string OpcodeStr, SDNode OpNode, + AVX512VLVectorVTInfo VTInfo, OpndItins itins, + Predicate prd, bit IsCommutable = 0> { + let Predicates = [prd] in + defm Z : avx512_binop_rm<opc, OpcodeStr, OpNode, VTInfo.info512, itins, + IsCommutable>, EVEX_V512; + + let Predicates = [prd, HasVLX] in { + defm Z256 : avx512_binop_rm<opc, OpcodeStr, OpNode, VTInfo.info256, itins, + IsCommutable>, EVEX_V256; + defm Z128 : avx512_binop_rm<opc, OpcodeStr, OpNode, VTInfo.info128, itins, + IsCommutable>, EVEX_V128; } +} - let mayLoad = 1 in { - def rm : AVX512BI<opc, MRMSrcMem, (outs RC:$dst), - (ins RC:$src1, x86memop:$src2), - !strconcat(OpcodeStr, " \t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set RC:$dst, (OpVT (OpNode (OpVT RC:$src1), (memop_frag addr:$src2))))], - itins.rm>, EVEX_4V; - let AddedComplexity = 30 in { - let Constraints = "$src0 = $dst" in - def rmk : AVX512BI<opc, MRMSrcMem, (outs RC:$dst), - (ins RC:$src0, KRC:$mask, RC:$src1, x86memop:$src2), - !strconcat(OpcodeStr, - " \t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}"), - [(set RC:$dst, (OpVT (vselect KRC:$mask, - (OpNode (OpVT RC:$src1), (memop_frag addr:$src2)), - RC:$src0)))], - itins.rm>, EVEX_4V, EVEX_K; - def rmkz : AVX512BI<opc, MRMSrcMem, (outs RC:$dst), - (ins KRC:$mask, RC:$src1, x86memop:$src2), - !strconcat(OpcodeStr, - " \t{$src2, $src1, $dst {${mask}} {z}|$dst {${mask}} {z}, $src1, $src2}"), - [(set RC:$dst, (OpVT (vselect KRC:$mask, - (OpNode (OpVT RC:$src1), (memop_frag addr:$src2)), - (OpVT immAllZerosV))))], - itins.rm>, EVEX_4V, EVEX_KZ; - } - def rmb : AVX512BI<opc, MRMSrcMem, (outs RC:$dst), - (ins RC:$src1, x86scalar_mop:$src2), - !strconcat(OpcodeStr, " \t{${src2}", BrdcstStr, - ", $src1, $dst|$dst, $src1, ${src2}", BrdcstStr, "}"), - [(set RC:$dst, (OpNode RC:$src1, - (OpVT (X86VBroadcast (scalar_mfrag addr:$src2)))))], - itins.rm>, EVEX_4V, EVEX_B; - let AddedComplexity = 30 in { - let Constraints = "$src0 = $dst" in - 
def rmbk : AVX512BI<opc, MRMSrcMem, (outs RC:$dst), - (ins RC:$src0, KRC:$mask, RC:$src1, x86scalar_mop:$src2), - !strconcat(OpcodeStr, " \t{${src2}", BrdcstStr, - ", $src1, $dst {${mask}}|$dst {${mask}}, $src1, ${src2}", - BrdcstStr, "}"), - [(set RC:$dst, (OpVT (vselect KRC:$mask, - (OpNode (OpVT RC:$src1), - (OpVT (X86VBroadcast (scalar_mfrag addr:$src2)))), - RC:$src0)))], - itins.rm>, EVEX_4V, EVEX_B, EVEX_K; - def rmbkz : AVX512BI<opc, MRMSrcMem, (outs RC:$dst), - (ins KRC:$mask, RC:$src1, x86scalar_mop:$src2), - !strconcat(OpcodeStr, " \t{${src2}", BrdcstStr, - ", $src1, $dst {${mask}} {z}|$dst {${mask}} {z}, $src1, ${src2}", - BrdcstStr, "}"), - [(set RC:$dst, (OpVT (vselect KRC:$mask, - (OpNode (OpVT RC:$src1), - (OpVT (X86VBroadcast (scalar_mfrag addr:$src2)))), - (OpVT immAllZerosV))))], - itins.rm>, EVEX_4V, EVEX_B, EVEX_KZ; - } +multiclass avx512_binop_rmb_vl<bits<8> opc, string OpcodeStr, SDNode OpNode, + AVX512VLVectorVTInfo VTInfo, OpndItins itins, + Predicate prd, bit IsCommutable = 0> { + let Predicates = [prd] in + defm Z : avx512_binop_rmb<opc, OpcodeStr, OpNode, VTInfo.info512, itins, + IsCommutable>, EVEX_V512; + + let Predicates = [prd, HasVLX] in { + defm Z256 : avx512_binop_rmb<opc, OpcodeStr, OpNode, VTInfo.info256, itins, + IsCommutable>, EVEX_V256; + defm Z128 : avx512_binop_rmb<opc, OpcodeStr, OpNode, VTInfo.info128, itins, + IsCommutable>, EVEX_V128; } } +multiclass avx512_binop_rm_vl_q<bits<8> opc, string OpcodeStr, SDNode OpNode, + OpndItins itins, Predicate prd, + bit IsCommutable = 0> { + defm NAME : avx512_binop_rmb_vl<opc, OpcodeStr, OpNode, avx512vl_i64_info, + itins, prd, IsCommutable>, + VEX_W, EVEX_CD8<64, CD8VF>; +} + +multiclass avx512_binop_rm_vl_d<bits<8> opc, string OpcodeStr, SDNode OpNode, + OpndItins itins, Predicate prd, + bit IsCommutable = 0> { + defm NAME : avx512_binop_rmb_vl<opc, OpcodeStr, OpNode, avx512vl_i32_info, + itins, prd, IsCommutable>, EVEX_CD8<32, CD8VF>; +} + +multiclass avx512_binop_rm_vl_w<bits<8> opc, string OpcodeStr, SDNode OpNode, + OpndItins itins, Predicate prd, + bit IsCommutable = 0> { + defm NAME : avx512_binop_rm_vl<opc, OpcodeStr, OpNode, avx512vl_i16_info, + itins, prd, IsCommutable>, EVEX_CD8<16, CD8VF>; +} + +multiclass avx512_binop_rm_vl_b<bits<8> opc, string OpcodeStr, SDNode OpNode, + OpndItins itins, Predicate prd, + bit IsCommutable = 0> { + defm NAME : avx512_binop_rm_vl<opc, OpcodeStr, OpNode, avx512vl_i8_info, + itins, prd, IsCommutable>, EVEX_CD8<8, CD8VF>; +} + +multiclass avx512_binop_rm_vl_dq<bits<8> opc_d, bits<8> opc_q, string OpcodeStr, + SDNode OpNode, OpndItins itins, Predicate prd, + bit IsCommutable = 0> { + defm Q : avx512_binop_rm_vl_q<opc_q, OpcodeStr, OpNode, itins, prd, + IsCommutable>; + + defm D : avx512_binop_rm_vl_d<opc_d, OpcodeStr, OpNode, itins, prd, + IsCommutable>; +} + +multiclass avx512_binop_rm_vl_bw<bits<8> opc_b, bits<8> opc_w, string OpcodeStr, + SDNode OpNode, OpndItins itins, Predicate prd, + bit IsCommutable = 0> { + defm W : avx512_binop_rm_vl_w<opc_w, OpcodeStr, OpNode, itins, prd, + IsCommutable>; + + defm B : avx512_binop_rm_vl_b<opc_b, OpcodeStr, OpNode, itins, prd, + IsCommutable>; +} + +multiclass avx512_binop_rm_vl_all<bits<8> opc_b, bits<8> opc_w, + bits<8> opc_d, bits<8> opc_q, + string OpcodeStr, SDNode OpNode, + OpndItins itins, bit IsCommutable = 0> { + defm NAME : avx512_binop_rm_vl_dq<opc_d, opc_q, OpcodeStr, OpNode, + itins, HasAVX512, IsCommutable>, + avx512_binop_rm_vl_bw<opc_b, opc_w, OpcodeStr, OpNode, + itins, HasBWI, IsCommutable>; +} + 
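Aside on the semantics these multiclasses generate (not part of the diff itself): each avx512_binop_rm_vl_* instantiation expands into the plain, merge-masked ({k}), zero-masked ({k}{z}) and embedded-broadcast forms of an integer op at 512/256/128-bit width. The C++ sketch below illustrates those three behaviours for VPADDD through the standard immintrin.h intrinsics; it assumes a toolchain with AVX-512F enabled (-mavx512f), and the intrinsic names are Intel's, not anything introduced by this patch.

// Illustrative only: behaviour of the merge-masked, zero-masked and
// embedded-broadcast forms that avx512_binop_rm_vl_* generates for vpaddd.
#include <immintrin.h>
#include <cstdio>

int main() {
  alignas(64) int a[16], b[16], out[16];
  for (int i = 0; i < 16; ++i) { a[i] = i; b[i] = 100; }

  __m512i va = _mm512_load_si512(a);
  __m512i vb = _mm512_load_si512(b);
  __mmask16 k = 0x00FF;                       // write mask: low 8 lanes only

  // vpaddd zmm {k}{z} : masked-off lanes are zeroed (the rrkz/rmkz patterns,
  // whose vselect false operand is immAllZerosV).
  __m512i zero_masked = _mm512_maskz_add_epi32(k, va, vb);

  // vpaddd zmm {k}    : masked-off lanes keep $src0 (the $src0 = $dst
  // constrained rrk/rmk forms).
  __m512i merge_masked = _mm512_mask_add_epi32(va, k, va, vb);

  // Embedded broadcast (the rmb form): a scalar memory operand is broadcast
  // to all lanes before the add; modelled here with set1.
  __m512i bcast = _mm512_add_epi32(va, _mm512_set1_epi32(7));

  _mm512_store_si512(out, zero_masked);
  std::printf("lane0=%d lane15=%d\n", out[0], out[15]);  // prints 100 and 0
  (void)merge_masked; (void)bcast;
  return 0;
}

The zero-masked variant corresponds to the patterns whose vselect false operand is immAllZerosV, while the merge-masked variant ties the destination to $src0, matching the constrained rrk/rmk definitions shown above.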
multiclass avx512_binop_rm2<bits<8> opc, string OpcodeStr, ValueType DstVT, ValueType SrcVT, RegisterClass KRC, RegisterClass RC, PatFrag memop_frag, X86MemOperand x86memop, @@ -2069,25 +2799,16 @@ multiclass avx512_binop_rm2<bits<8> opc, string OpcodeStr, ValueType DstVT, } } -defm VPADDDZ : avx512_binop_rm<0xFE, "vpaddd", add, v16i32, VK16WM, VR512, - memopv16i32, i512mem, loadi32, i32mem, "{1to16}", - SSE_INTALU_ITINS_P, 1>, EVEX_V512, EVEX_CD8<32, CD8VF>; - -defm VPSUBDZ : avx512_binop_rm<0xFA, "vpsubd", sub, v16i32, VK16WM, VR512, - memopv16i32, i512mem, loadi32, i32mem, "{1to16}", - SSE_INTALU_ITINS_P, 0>, EVEX_V512, EVEX_CD8<32, CD8VF>; - -defm VPMULLDZ : avx512_binop_rm<0x40, "vpmulld", mul, v16i32, VK16WM, VR512, - memopv16i32, i512mem, loadi32, i32mem, "{1to16}", - SSE_INTALU_ITINS_P, 1>, T8PD, EVEX_V512, EVEX_CD8<32, CD8VF>; - -defm VPADDQZ : avx512_binop_rm<0xD4, "vpaddq", add, v8i64, VK8WM, VR512, - memopv8i64, i512mem, loadi64, i64mem, "{1to8}", - SSE_INTALU_ITINS_P, 1>, EVEX_CD8<64, CD8VF>, EVEX_V512, VEX_W; - -defm VPSUBQZ : avx512_binop_rm<0xFB, "vpsubq", sub, v8i64, VK8WM, VR512, - memopv8i64, i512mem, loadi64, i64mem, "{1to8}", - SSE_INTALU_ITINS_P, 0>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; +defm VPADD : avx512_binop_rm_vl_all<0xFC, 0xFD, 0xFE, 0xD4, "vpadd", add, + SSE_INTALU_ITINS_P, 1>; +defm VPSUB : avx512_binop_rm_vl_all<0xF8, 0xF9, 0xFA, 0xFB, "vpsub", sub, + SSE_INTALU_ITINS_P, 0>; +defm VPMULLD : avx512_binop_rm_vl_d<0x40, "vpmull", mul, + SSE_INTALU_ITINS_P, HasAVX512, 1>, T8PD; +defm VPMULLW : avx512_binop_rm_vl_w<0xD5, "vpmull", mul, + SSE_INTALU_ITINS_P, HasBWI, 1>; +defm VPMULLQ : avx512_binop_rm_vl_q<0x40, "vpmull", mul, + SSE_INTALU_ITINS_P, HasDQI, 1>, T8PD; defm VPMULDQZ : avx512_binop_rm2<0x28, "vpmuldq", v8i64, v16i32, VK8WM, VR512, memopv8i64, i512mem, loadi64, i64mem, "{1to8}", @@ -2108,41 +2829,33 @@ def : Pat<(v8i64 (int_x86_avx512_mask_pmul_dq_512 (v16i32 VR512:$src1), (v16i32 VR512:$src2), (bc_v8i64 (v16i32 immAllZerosV)), (i8 -1))), (VPMULDQZrr VR512:$src1, VR512:$src2)>; -defm VPMAXUDZ : avx512_binop_rm<0x3F, "vpmaxud", X86umax, v16i32, VK16WM, VR512, - memopv16i32, i512mem, loadi32, i32mem, "{1to16}", - SSE_INTALU_ITINS_P, 1>, - T8PD, EVEX_V512, EVEX_CD8<32, CD8VF>; -defm VPMAXUQZ : avx512_binop_rm<0x3F, "vpmaxuq", X86umax, v8i64, VK8WM, VR512, - memopv8i64, i512mem, loadi64, i64mem, "{1to8}", - SSE_INTALU_ITINS_P, 0>, - T8PD, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; - -defm VPMAXSDZ : avx512_binop_rm<0x3D, "vpmaxsd", X86smax, v16i32, VK16WM, VR512, - memopv16i32, i512mem, loadi32, i32mem, "{1to16}", - SSE_INTALU_ITINS_P, 1>, - T8PD, EVEX_V512, EVEX_CD8<32, CD8VF>; -defm VPMAXSQZ : avx512_binop_rm<0x3D, "vpmaxsq", X86smax, v8i64, VK8WM, VR512, - memopv8i64, i512mem, loadi64, i64mem, "{1to8}", - SSE_INTALU_ITINS_P, 0>, - T8PD, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; - -defm VPMINUDZ : avx512_binop_rm<0x3B, "vpminud", X86umin, v16i32, VK16WM, VR512, - memopv16i32, i512mem, loadi32, i32mem, "{1to16}", - SSE_INTALU_ITINS_P, 1>, - T8PD, EVEX_V512, EVEX_CD8<32, CD8VF>; -defm VPMINUQZ : avx512_binop_rm<0x3B, "vpminuq", X86umin, v8i64, VK8WM, VR512, - memopv8i64, i512mem, loadi64, i64mem, "{1to8}", - SSE_INTALU_ITINS_P, 0>, - T8PD, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; - -defm VPMINSDZ : avx512_binop_rm<0x39, "vpminsd", X86smin, v16i32, VK16WM, VR512, - memopv16i32, i512mem, loadi32, i32mem, "{1to16}", - SSE_INTALU_ITINS_P, 1>, - T8PD, EVEX_V512, EVEX_CD8<32, CD8VF>; -defm VPMINSQZ : avx512_binop_rm<0x39, "vpminsq", X86smin, v8i64, VK8WM, VR512, - 
memopv8i64, i512mem, loadi64, i64mem, "{1to8}", - SSE_INTALU_ITINS_P, 0>, - T8PD, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; +defm VPMAXSB : avx512_binop_rm_vl_b<0x3C, "vpmaxs", X86smax, + SSE_INTALU_ITINS_P, HasBWI, 1>, T8PD; +defm VPMAXSW : avx512_binop_rm_vl_w<0xEE, "vpmaxs", X86smax, + SSE_INTALU_ITINS_P, HasBWI, 1>; +defm VPMAXS : avx512_binop_rm_vl_dq<0x3D, 0x3D, "vpmaxs", X86smax, + SSE_INTALU_ITINS_P, HasAVX512, 1>, T8PD; + +defm VPMAXUB : avx512_binop_rm_vl_b<0xDE, "vpmaxu", X86umax, + SSE_INTALU_ITINS_P, HasBWI, 1>; +defm VPMAXUW : avx512_binop_rm_vl_w<0x3E, "vpmaxu", X86umax, + SSE_INTALU_ITINS_P, HasBWI, 1>, T8PD; +defm VPMAXU : avx512_binop_rm_vl_dq<0x3F, 0x3F, "vpmaxu", X86umax, + SSE_INTALU_ITINS_P, HasAVX512, 1>, T8PD; + +defm VPMINSB : avx512_binop_rm_vl_b<0x38, "vpmins", X86smin, + SSE_INTALU_ITINS_P, HasBWI, 1>, T8PD; +defm VPMINSW : avx512_binop_rm_vl_w<0xEA, "vpmins", X86smin, + SSE_INTALU_ITINS_P, HasBWI, 1>; +defm VPMINS : avx512_binop_rm_vl_dq<0x39, 0x39, "vpmins", X86smin, + SSE_INTALU_ITINS_P, HasAVX512, 1>, T8PD; + +defm VPMINUB : avx512_binop_rm_vl_b<0xDA, "vpminu", X86umin, + SSE_INTALU_ITINS_P, HasBWI, 1>; +defm VPMINUW : avx512_binop_rm_vl_w<0x3A, "vpminu", X86umin, + SSE_INTALU_ITINS_P, HasBWI, 1>, T8PD; +defm VPMINU : avx512_binop_rm_vl_dq<0x3B, 0x3B, "vpminu", X86umin, + SSE_INTALU_ITINS_P, HasAVX512, 1>, T8PD; def : Pat <(v16i32 (int_x86_avx512_mask_pmaxs_d_512 (v16i32 VR512:$src1), (v16i32 VR512:$src2), (v16i32 immAllZerosV), (i16 -1))), @@ -2255,48 +2968,18 @@ multiclass avx512_pshuf_imm<bits<8> opc, string OpcodeStr, RegisterClass RC, defm VPSHUFDZ : avx512_pshuf_imm<0x70, "vpshufd", VR512, X86PShufd, memopv16i32, i512mem, v16i32>, PD, EVEX_V512, EVEX_CD8<32, CD8VF>; -let ExeDomain = SSEPackedSingle in -defm VPERMILPSZ : avx512_pshuf_imm<0x04, "vpermilps", VR512, X86VPermilp, - memopv16f32, i512mem, v16f32>, TAPD, EVEX_V512, - EVEX_CD8<32, CD8VF>; -let ExeDomain = SSEPackedDouble in -defm VPERMILPDZ : avx512_pshuf_imm<0x05, "vpermilpd", VR512, X86VPermilp, - memopv8f64, i512mem, v8f64>, TAPD, EVEX_V512, - VEX_W, EVEX_CD8<32, CD8VF>; - -def : Pat<(v16i32 (X86VPermilp VR512:$src1, (i8 imm:$imm))), - (VPERMILPSZri VR512:$src1, imm:$imm)>; -def : Pat<(v8i64 (X86VPermilp VR512:$src1, (i8 imm:$imm))), - (VPERMILPDZri VR512:$src1, imm:$imm)>; - //===----------------------------------------------------------------------===// // AVX-512 Logical Instructions //===----------------------------------------------------------------------===// -defm VPANDDZ : avx512_binop_rm<0xDB, "vpandd", and, v16i32, VK16WM, VR512, memopv16i32, - i512mem, loadi32, i32mem, "{1to16}", SSE_BIT_ITINS_P, 1>, - EVEX_V512, EVEX_CD8<32, CD8VF>; -defm VPANDQZ : avx512_binop_rm<0xDB, "vpandq", and, v8i64, VK8WM, VR512, memopv8i64, - i512mem, loadi64, i64mem, "{1to8}", SSE_BIT_ITINS_P, 1>, - EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; -defm VPORDZ : avx512_binop_rm<0xEB, "vpord", or, v16i32, VK16WM, VR512, memopv16i32, - i512mem, loadi32, i32mem, "{1to16}", SSE_BIT_ITINS_P, 1>, - EVEX_V512, EVEX_CD8<32, CD8VF>; -defm VPORQZ : avx512_binop_rm<0xEB, "vporq", or, v8i64, VK8WM, VR512, memopv8i64, - i512mem, loadi64, i64mem, "{1to8}", SSE_BIT_ITINS_P, 1>, - EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; -defm VPXORDZ : avx512_binop_rm<0xEF, "vpxord", xor, v16i32, VK16WM, VR512, memopv16i32, - i512mem, loadi32, i32mem, "{1to16}", SSE_BIT_ITINS_P, 1>, - EVEX_V512, EVEX_CD8<32, CD8VF>; -defm VPXORQZ : avx512_binop_rm<0xEF, "vpxorq", xor, v8i64, VK8WM, VR512, memopv8i64, - i512mem, loadi64, i64mem, "{1to8}", 
SSE_BIT_ITINS_P, 1>, - EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; -defm VPANDNDZ : avx512_binop_rm<0xDF, "vpandnd", X86andnp, v16i32, VK16WM, VR512, - memopv16i32, i512mem, loadi32, i32mem, "{1to16}", - SSE_BIT_ITINS_P, 0>, EVEX_V512, EVEX_CD8<32, CD8VF>; -defm VPANDNQZ : avx512_binop_rm<0xDF, "vpandnq", X86andnp, v8i64, VK8WM, VR512, - memopv8i64, i512mem, loadi64, i64mem, "{1to8}", - SSE_BIT_ITINS_P, 0>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; +defm VPAND : avx512_binop_rm_vl_dq<0xDB, 0xDB, "vpand", and, + SSE_INTALU_ITINS_P, HasAVX512, 1>; +defm VPOR : avx512_binop_rm_vl_dq<0xEB, 0xEB, "vpor", or, + SSE_INTALU_ITINS_P, HasAVX512, 1>; +defm VPXOR : avx512_binop_rm_vl_dq<0xEF, 0xEF, "vpxor", xor, + SSE_INTALU_ITINS_P, HasAVX512, 1>; +defm VPANDN : avx512_binop_rm_vl_dq<0xDF, 0xDF, "vpandn", X86andnp, + SSE_INTALU_ITINS_P, HasAVX512, 1>; //===----------------------------------------------------------------------===// // AVX-512 FP arithmetic @@ -2324,118 +3007,58 @@ defm VDIV : avx512_binop_s<0x5E, "div", fdiv, SSE_ALU_ITINS_S>; } multiclass avx512_fp_packed<bits<8> opc, string OpcodeStr, SDNode OpNode, - RegisterClass KRC, - RegisterClass RC, ValueType vt, - X86MemOperand x86memop, PatFrag mem_frag, - X86MemOperand x86scalar_mop, PatFrag scalar_mfrag, - string BrdcstStr, - Domain d, OpndItins itins, bit commutable> { - let isCommutable = commutable in { - def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), - !strconcat(OpcodeStr, " \t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))], itins.rr, d>, - EVEX_4V; - - def rrk: PI<opc, MRMSrcReg, (outs RC:$dst), (ins KRC:$mask, RC:$src1, RC:$src2), - !strconcat(OpcodeStr, - " \t{$src2, $src1, $dst {${mask}} |$dst {${mask}}, $src1, $src2}"), - [], itins.rr, d>, EVEX_4V, EVEX_K; - - def rrkz: PI<opc, MRMSrcReg, (outs RC:$dst), (ins KRC:$mask, RC:$src1, RC:$src2), - !strconcat(OpcodeStr, - " \t{$src2, $src1, $dst {${mask}} {z}|$dst {${mask}} {z}, $src1, $src2}"), - [], itins.rr, d>, EVEX_4V, EVEX_KZ; - } - + X86VectorVTInfo _, bit IsCommutable> { + defm rr: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.RC:$src2), OpcodeStr##_.Suffix, + "$src2, $src1", "$src1, $src2", + (_.VT (OpNode _.RC:$src1, _.RC:$src2))>, EVEX_4V; let mayLoad = 1 in { - def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2), - !strconcat(OpcodeStr, " \t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set RC:$dst, (OpNode RC:$src1, (mem_frag addr:$src2)))], - itins.rm, d>, EVEX_4V; + defm rm: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr##_.Suffix, + "$src2, $src1", "$src1, $src2", + (OpNode _.RC:$src1, (_.LdFrag addr:$src2))>, EVEX_4V; + defm rmb: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr##_.Suffix, + "${src2}"##_.BroadcastStr##", $src1", + "$src1, ${src2}"##_.BroadcastStr, + (OpNode _.RC:$src1, (_.VT (X86VBroadcast + (_.ScalarLdFrag addr:$src2))))>, + EVEX_4V, EVEX_B; + }//let mayLoad = 1 +} - def rmb : PI<opc, MRMSrcMem, (outs RC:$dst), - (ins RC:$src1, x86scalar_mop:$src2), - !strconcat(OpcodeStr, " \t{${src2}", BrdcstStr, - ", $src1, $dst|$dst, $src1, ${src2}", BrdcstStr, "}"), - [(set RC:$dst, (OpNode RC:$src1, - (vt (X86VBroadcast (scalar_mfrag addr:$src2)))))], - itins.rm, d>, EVEX_4V, EVEX_B; - - def rmk : PI<opc, MRMSrcMem, (outs RC:$dst), - (ins KRC:$mask, RC:$src1, x86memop:$src2), !strconcat(OpcodeStr, - "\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, 
$src1, $src2}"), - [], itins.rm, d>, EVEX_4V, EVEX_K; - - def rmkz : PI<opc, MRMSrcMem, (outs RC:$dst), - (ins KRC:$mask, RC:$src1, x86memop:$src2), !strconcat(OpcodeStr, - "\t{$src2, $src1, $dst {${mask}} {z}|$dst {${mask}} {z}, $src1, $src2}"), - [], itins.rm, d>, EVEX_4V, EVEX_KZ; - - def rmbk : PI<opc, MRMSrcMem, (outs RC:$dst), - (ins KRC:$mask, RC:$src1, x86scalar_mop:$src2), !strconcat(OpcodeStr, - " \t{${src2}", BrdcstStr, - ", $src1, $dst {${mask}}|$dst {${mask}}, $src1, ${src2}", BrdcstStr, "}"), - [], itins.rm, d>, EVEX_4V, EVEX_B, EVEX_K; - - def rmbkz : PI<opc, MRMSrcMem, (outs RC:$dst), - (ins KRC:$mask, RC:$src1, x86scalar_mop:$src2), !strconcat(OpcodeStr, - " \t{${src2}", BrdcstStr, - ", $src1, $dst {${mask}} {z}|$dst {${mask}} {z}, $src1, ${src2}", - BrdcstStr, "}"), - [], itins.rm, d>, EVEX_4V, EVEX_B, EVEX_KZ; +multiclass avx512_fp_binop_p<bits<8> opc, string OpcodeStr, SDNode OpNode, + bit IsCommutable = 0> { + defm PSZ : avx512_fp_packed<opc, OpcodeStr, OpNode, v16f32_info, + IsCommutable>, EVEX_V512, PS, + EVEX_CD8<32, CD8VF>; + defm PDZ : avx512_fp_packed<opc, OpcodeStr, OpNode, v8f64_info, + IsCommutable>, EVEX_V512, PD, VEX_W, + EVEX_CD8<64, CD8VF>; + + // Define only if AVX512VL feature is present. + let Predicates = [HasVLX] in { + defm PSZ128 : avx512_fp_packed<opc, OpcodeStr, OpNode, v4f32x_info, + IsCommutable>, EVEX_V128, PS, + EVEX_CD8<32, CD8VF>; + defm PSZ256 : avx512_fp_packed<opc, OpcodeStr, OpNode, v8f32x_info, + IsCommutable>, EVEX_V256, PS, + EVEX_CD8<32, CD8VF>; + defm PDZ128 : avx512_fp_packed<opc, OpcodeStr, OpNode, v2f64x_info, + IsCommutable>, EVEX_V128, PD, VEX_W, + EVEX_CD8<64, CD8VF>; + defm PDZ256 : avx512_fp_packed<opc, OpcodeStr, OpNode, v4f64x_info, + IsCommutable>, EVEX_V256, PD, VEX_W, + EVEX_CD8<64, CD8VF>; } } -defm VADDPSZ : avx512_fp_packed<0x58, "addps", fadd, VK16WM, VR512, v16f32, f512mem, - memopv16f32, f32mem, loadf32, "{1to16}", SSEPackedSingle, - SSE_ALU_ITINS_P.s, 1>, EVEX_V512, PS, EVEX_CD8<32, CD8VF>; - -defm VADDPDZ : avx512_fp_packed<0x58, "addpd", fadd, VK8WM, VR512, v8f64, f512mem, - memopv8f64, f64mem, loadf64, "{1to8}", SSEPackedDouble, - SSE_ALU_ITINS_P.d, 1>, - EVEX_V512, PD, VEX_W, EVEX_CD8<64, CD8VF>; - -defm VMULPSZ : avx512_fp_packed<0x59, "mulps", fmul, VK16WM, VR512, v16f32, f512mem, - memopv16f32, f32mem, loadf32, "{1to16}", SSEPackedSingle, - SSE_ALU_ITINS_P.s, 1>, EVEX_V512, PS, EVEX_CD8<32, CD8VF>; -defm VMULPDZ : avx512_fp_packed<0x59, "mulpd", fmul, VK8WM, VR512, v8f64, f512mem, - memopv8f64, f64mem, loadf64, "{1to8}", SSEPackedDouble, - SSE_ALU_ITINS_P.d, 1>, - EVEX_V512, PD, VEX_W, EVEX_CD8<64, CD8VF>; - -defm VMINPSZ : avx512_fp_packed<0x5D, "minps", X86fmin, VK16WM, VR512, v16f32, f512mem, - memopv16f32, f32mem, loadf32, "{1to16}", SSEPackedSingle, - SSE_ALU_ITINS_P.s, 1>, - EVEX_V512, PS, EVEX_CD8<32, CD8VF>; -defm VMAXPSZ : avx512_fp_packed<0x5F, "maxps", X86fmax, VK16WM, VR512, v16f32, f512mem, - memopv16f32, f32mem, loadf32, "{1to16}", SSEPackedSingle, - SSE_ALU_ITINS_P.s, 1>, - EVEX_V512, PS, EVEX_CD8<32, CD8VF>; - -defm VMINPDZ : avx512_fp_packed<0x5D, "minpd", X86fmin, VK8WM, VR512, v8f64, f512mem, - memopv8f64, f64mem, loadf64, "{1to8}", SSEPackedDouble, - SSE_ALU_ITINS_P.d, 1>, - EVEX_V512, PD, VEX_W, EVEX_CD8<64, CD8VF>; -defm VMAXPDZ : avx512_fp_packed<0x5F, "maxpd", X86fmax, VK8WM, VR512, v8f64, f512mem, - memopv8f64, f64mem, loadf64, "{1to8}", SSEPackedDouble, - SSE_ALU_ITINS_P.d, 1>, - EVEX_V512, PD, VEX_W, EVEX_CD8<64, CD8VF>; - -defm VSUBPSZ : avx512_fp_packed<0x5C, "subps", fsub, 
VK16WM, VR512, v16f32, f512mem, - memopv16f32, f32mem, loadf32, "{1to16}", SSEPackedSingle, - SSE_ALU_ITINS_P.s, 0>, EVEX_V512, PS, EVEX_CD8<32, CD8VF>; -defm VDIVPSZ : avx512_fp_packed<0x5E, "divps", fdiv, VK16WM, VR512, v16f32, f512mem, - memopv16f32, f32mem, loadf32, "{1to16}", SSEPackedSingle, - SSE_ALU_ITINS_P.s, 0>, EVEX_V512, PS, EVEX_CD8<32, CD8VF>; - -defm VSUBPDZ : avx512_fp_packed<0x5C, "subpd", fsub, VK8WM, VR512, v8f64, f512mem, - memopv8f64, f64mem, loadf64, "{1to8}", SSEPackedDouble, - SSE_ALU_ITINS_P.d, 0>, - EVEX_V512, PD, VEX_W, EVEX_CD8<64, CD8VF>; -defm VDIVPDZ : avx512_fp_packed<0x5E, "divpd", fdiv, VK8WM, VR512, v8f64, f512mem, - memopv8f64, f64mem, loadf64, "{1to8}", SSEPackedDouble, - SSE_ALU_ITINS_P.d, 0>, - EVEX_V512, PD, VEX_W, EVEX_CD8<64, CD8VF>; +defm VADD : avx512_fp_binop_p<0x58, "vadd", fadd, 1>; +defm VMUL : avx512_fp_binop_p<0x59, "vmul", fmul, 1>; +defm VMIN : avx512_fp_binop_p<0x5D, "vmin", X86fmin, 1>; +defm VMAX : avx512_fp_binop_p<0x5F, "vmax", X86fmax, 1>; +defm VSUB : avx512_fp_binop_p<0x5C, "vsub", fsub>; +defm VDIV : avx512_fp_binop_p<0x5E, "vdiv", fdiv>; def : Pat<(v16f32 (int_x86_avx512_mask_max_ps_512 (v16f32 VR512:$src1), (v16f32 VR512:$src2), (bc_v16f32 (v16i32 immAllZerosV)), @@ -2502,29 +3125,17 @@ def : Pat <(i8 (int_x86_avx512_mask_ptestm_q_512 (v8i64 VR512:$src1), // AVX-512 Shift instructions //===----------------------------------------------------------------------===// multiclass avx512_shift_rmi<bits<8> opc, Format ImmFormR, Format ImmFormM, - string OpcodeStr, SDNode OpNode, RegisterClass RC, - ValueType vt, X86MemOperand x86memop, PatFrag mem_frag, - RegisterClass KRC> { - def ri : AVX512BIi8<opc, ImmFormR, (outs RC:$dst), - (ins RC:$src1, i8imm:$src2), - !strconcat(OpcodeStr, " \t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set RC:$dst, (vt (OpNode RC:$src1, (i8 imm:$src2))))], - SSE_INTSHIFT_ITINS_P.rr>, EVEX_4V; - def rik : AVX512BIi8<opc, ImmFormR, (outs RC:$dst), - (ins KRC:$mask, RC:$src1, i8imm:$src2), - !strconcat(OpcodeStr, - " \t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}"), - [], SSE_INTSHIFT_ITINS_P.rr>, EVEX_4V, EVEX_K; - def mi: AVX512BIi8<opc, ImmFormM, (outs RC:$dst), - (ins x86memop:$src1, i8imm:$src2), - !strconcat(OpcodeStr, " \t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set RC:$dst, (OpNode (mem_frag addr:$src1), - (i8 imm:$src2)))], SSE_INTSHIFT_ITINS_P.rm>, EVEX_4V; - def mik: AVX512BIi8<opc, ImmFormM, (outs RC:$dst), - (ins KRC:$mask, x86memop:$src1, i8imm:$src2), - !strconcat(OpcodeStr, - " \t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}"), - [], SSE_INTSHIFT_ITINS_P.rm>, EVEX_4V, EVEX_K; + string OpcodeStr, SDNode OpNode, X86VectorVTInfo _> { + defm ri : AVX512_maskable<opc, ImmFormR, _, (outs _.RC:$dst), + (ins _.RC:$src1, i8imm:$src2), OpcodeStr, + "$src2, $src1", "$src1, $src2", + (_.VT (OpNode _.RC:$src1, (i8 imm:$src2))), + " ", SSE_INTSHIFT_ITINS_P.rr>, AVX512BIi8Base, EVEX_4V; + defm mi : AVX512_maskable<opc, ImmFormM, _, (outs _.RC:$dst), + (ins _.MemOp:$src1, i8imm:$src2), OpcodeStr, + "$src2, $src1", "$src1, $src2", + (_.VT (OpNode (_.MemOpFrag addr:$src1), (i8 imm:$src2))), + " ", SSE_INTSHIFT_ITINS_P.rm>, AVX512BIi8Base, EVEX_4V; } multiclass avx512_shift_rrm<bits<8> opc, string OpcodeStr, SDNode OpNode, @@ -2555,42 +3166,42 @@ multiclass avx512_shift_rrm<bits<8> opc, string OpcodeStr, SDNode OpNode, } defm VPSRLDZ : avx512_shift_rmi<0x72, MRM2r, MRM2m, "vpsrld", X86vsrli, - VR512, v16i32, i512mem, memopv16i32, VK16WM>, + v16i32_info>, EVEX_V512, 
EVEX_CD8<32, CD8VF>; defm VPSRLDZ : avx512_shift_rrm<0xD2, "vpsrld", X86vsrl, VR512, v16i32, v4i32, bc_v4i32, VK16WM>, EVEX_V512, EVEX_CD8<32, CD8VQ>; defm VPSRLQZ : avx512_shift_rmi<0x73, MRM2r, MRM2m, "vpsrlq", X86vsrli, - VR512, v8i64, i512mem, memopv8i64, VK8WM>, EVEX_V512, + v8i64_info>, EVEX_V512, EVEX_CD8<64, CD8VF>, VEX_W; defm VPSRLQZ : avx512_shift_rrm<0xD3, "vpsrlq", X86vsrl, VR512, v8i64, v2i64, bc_v2i64, VK8WM>, EVEX_V512, EVEX_CD8<64, CD8VQ>, VEX_W; defm VPSLLDZ : avx512_shift_rmi<0x72, MRM6r, MRM6m, "vpslld", X86vshli, - VR512, v16i32, i512mem, memopv16i32, VK16WM>, EVEX_V512, + v16i32_info>, EVEX_V512, EVEX_CD8<32, CD8VF>; defm VPSLLDZ : avx512_shift_rrm<0xF2, "vpslld", X86vshl, VR512, v16i32, v4i32, bc_v4i32, VK16WM>, EVEX_V512, EVEX_CD8<32, CD8VQ>; defm VPSLLQZ : avx512_shift_rmi<0x73, MRM6r, MRM6m, "vpsllq", X86vshli, - VR512, v8i64, i512mem, memopv8i64, VK8WM>, EVEX_V512, + v8i64_info>, EVEX_V512, EVEX_CD8<64, CD8VF>, VEX_W; defm VPSLLQZ : avx512_shift_rrm<0xF3, "vpsllq", X86vshl, VR512, v8i64, v2i64, bc_v2i64, VK8WM>, EVEX_V512, EVEX_CD8<64, CD8VQ>, VEX_W; defm VPSRADZ : avx512_shift_rmi<0x72, MRM4r, MRM4m, "vpsrad", X86vsrai, - VR512, v16i32, i512mem, memopv16i32, VK16WM>, + v16i32_info>, EVEX_V512, EVEX_CD8<32, CD8VF>; defm VPSRADZ : avx512_shift_rrm<0xE2, "vpsrad", X86vsra, VR512, v16i32, v4i32, bc_v4i32, VK16WM>, EVEX_V512, EVEX_CD8<32, CD8VQ>; defm VPSRAQZ : avx512_shift_rmi<0x72, MRM4r, MRM4m, "vpsraq", X86vsrai, - VR512, v8i64, i512mem, memopv8i64, VK8WM>, EVEX_V512, + v8i64_info>, EVEX_V512, EVEX_CD8<64, CD8VF>, VEX_W; defm VPSRAQZ : avx512_shift_rrm<0xE2, "vpsraq", X86vsra, VR512, v8i64, v2i64, bc_v2i64, VK8WM>, EVEX_V512, @@ -2713,155 +3324,133 @@ let Predicates = [HasAVX512] in { //===----------------------------------------------------------------------===// // FMA - Fused Multiply Operations // + let Constraints = "$src1 = $dst" in { -multiclass avx512_fma3p_rm<bits<8> opc, string OpcodeStr, - RegisterClass RC, X86MemOperand x86memop, - PatFrag mem_frag, X86MemOperand x86scalar_mop, PatFrag scalar_mfrag, - string BrdcstStr, SDNode OpNode, ValueType OpVT> { - def r: AVX512FMA3<opc, MRMSrcReg, (outs RC:$dst), - (ins RC:$src1, RC:$src2, RC:$src3), - !strconcat(OpcodeStr," \t{$src3, $src2, $dst|$dst, $src2, $src3}"), - [(set RC:$dst, (OpVT(OpNode RC:$src1, RC:$src2, RC:$src3)))]>; +// Omitting the parameter OpNode (= null_frag) disables ISel pattern matching. 
+multiclass avx512_fma3p_rm<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, + SDPatternOperator OpNode = null_frag> { + defm r: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src2, _.RC:$src3), + OpcodeStr, "$src3, $src2", "$src2, $src3", + (_.VT (OpNode _.RC:$src1, _.RC:$src2, _.RC:$src3))>, + AVX512FMA3Base; let mayLoad = 1 in - def m: AVX512FMA3<opc, MRMSrcMem, (outs RC:$dst), - (ins RC:$src1, RC:$src2, x86memop:$src3), + def m: AVX512FMA3<opc, MRMSrcMem, (outs _.RC:$dst), + (ins _.RC:$src1, _.RC:$src2, _.MemOp:$src3), !strconcat(OpcodeStr, " \t{$src3, $src2, $dst|$dst, $src2, $src3}"), - [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, - (mem_frag addr:$src3))))]>; - def mb: AVX512FMA3<opc, MRMSrcMem, (outs RC:$dst), - (ins RC:$src1, RC:$src2, x86scalar_mop:$src3), - !strconcat(OpcodeStr, " \t{${src3}", BrdcstStr, - ", $src2, $dst|$dst, $src2, ${src3}", BrdcstStr, "}"), - [(set RC:$dst, (OpNode RC:$src1, RC:$src2, - (OpVT (X86VBroadcast (scalar_mfrag addr:$src3)))))]>, EVEX_B; + [(set _.RC:$dst, (_.VT (OpNode _.RC:$src1, _.RC:$src2, + (_.MemOpFrag addr:$src3))))]>; + def mb: AVX512FMA3<opc, MRMSrcMem, (outs _.RC:$dst), + (ins _.RC:$src1, _.RC:$src2, _.ScalarMemOp:$src3), + !strconcat(OpcodeStr, " \t{${src3}", _.BroadcastStr, + ", $src2, $dst|$dst, $src2, ${src3}", _.BroadcastStr, "}"), + [(set _.RC:$dst, (OpNode _.RC:$src1, _.RC:$src2, + (_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3)))))]>, EVEX_B; } } // Constraints = "$src1 = $dst" +multiclass avx512_fma3p_forms<bits<8> opc213, bits<8> opc231, + string OpcodeStr, X86VectorVTInfo VTI, + SDPatternOperator OpNode> { + defm v213 : avx512_fma3p_rm<opc213, !strconcat(OpcodeStr, "213", VTI.Suffix), + VTI, OpNode>, + EVEX_V512, EVEX_CD8<VTI.EltSize, CD8VF>; + + defm v231 : avx512_fma3p_rm<opc231, !strconcat(OpcodeStr, "231", VTI.Suffix), + VTI>, + EVEX_V512, EVEX_CD8<VTI.EltSize, CD8VF>; +} + let ExeDomain = SSEPackedSingle in { - defm VFMADD213PSZ : avx512_fma3p_rm<0xA8, "vfmadd213ps", VR512, f512mem, - memopv16f32, f32mem, loadf32, "{1to16}", - X86Fmadd, v16f32>, EVEX_V512, - EVEX_CD8<32, CD8VF>; - defm VFMSUB213PSZ : avx512_fma3p_rm<0xAA, "vfmsub213ps", VR512, f512mem, - memopv16f32, f32mem, loadf32, "{1to16}", - X86Fmsub, v16f32>, EVEX_V512, - EVEX_CD8<32, CD8VF>; - defm VFMADDSUB213PSZ : avx512_fma3p_rm<0xA6, "vfmaddsub213ps", VR512, f512mem, - memopv16f32, f32mem, loadf32, "{1to16}", - X86Fmaddsub, v16f32>, - EVEX_V512, EVEX_CD8<32, CD8VF>; - defm VFMSUBADD213PSZ : avx512_fma3p_rm<0xA7, "vfmsubadd213ps", VR512, f512mem, - memopv16f32, f32mem, loadf32, "{1to16}", - X86Fmsubadd, v16f32>, - EVEX_V512, EVEX_CD8<32, CD8VF>; - defm VFNMADD213PSZ : avx512_fma3p_rm<0xAC, "vfnmadd213ps", VR512, f512mem, - memopv16f32, f32mem, loadf32, "{1to16}", - X86Fnmadd, v16f32>, EVEX_V512, - EVEX_CD8<32, CD8VF>; - defm VFNMSUB213PSZ : avx512_fma3p_rm<0xAE, "vfnmsub213ps", VR512, f512mem, - memopv16f32, f32mem, loadf32, "{1to16}", - X86Fnmsub, v16f32>, EVEX_V512, - EVEX_CD8<32, CD8VF>; + defm VFMADDPSZ : avx512_fma3p_forms<0xA8, 0xB8, "vfmadd", + v16f32_info, X86Fmadd>; + defm VFMSUBPSZ : avx512_fma3p_forms<0xAA, 0xBA, "vfmsub", + v16f32_info, X86Fmsub>; + defm VFMADDSUBPSZ : avx512_fma3p_forms<0xA6, 0xB6, "vfmaddsub", + v16f32_info, X86Fmaddsub>; + defm VFMSUBADDPSZ : avx512_fma3p_forms<0xA7, 0xB7, "vfmsubadd", + v16f32_info, X86Fmsubadd>; + defm VFNMADDPSZ : avx512_fma3p_forms<0xAC, 0xBC, "vfnmadd", + v16f32_info, X86Fnmadd>; + defm VFNMSUBPSZ : avx512_fma3p_forms<0xAE, 0xBE, "vfnmsub", + v16f32_info, X86Fnmsub>; } let 
ExeDomain = SSEPackedDouble in { - defm VFMADD213PDZ : avx512_fma3p_rm<0xA8, "vfmadd213pd", VR512, f512mem, - memopv8f64, f64mem, loadf64, "{1to8}", - X86Fmadd, v8f64>, EVEX_V512, - VEX_W, EVEX_CD8<64, CD8VF>; - defm VFMSUB213PDZ : avx512_fma3p_rm<0xAA, "vfmsub213pd", VR512, f512mem, - memopv8f64, f64mem, loadf64, "{1to8}", - X86Fmsub, v8f64>, EVEX_V512, VEX_W, - EVEX_CD8<64, CD8VF>; - defm VFMADDSUB213PDZ : avx512_fma3p_rm<0xA6, "vfmaddsub213pd", VR512, f512mem, - memopv8f64, f64mem, loadf64, "{1to8}", - X86Fmaddsub, v8f64>, EVEX_V512, VEX_W, - EVEX_CD8<64, CD8VF>; - defm VFMSUBADD213PDZ : avx512_fma3p_rm<0xA7, "vfmsubadd213pd", VR512, f512mem, - memopv8f64, f64mem, loadf64, "{1to8}", - X86Fmsubadd, v8f64>, EVEX_V512, VEX_W, - EVEX_CD8<64, CD8VF>; - defm VFNMADD213PDZ : avx512_fma3p_rm<0xAC, "vfnmadd213pd", VR512, f512mem, - memopv8f64, f64mem, loadf64, "{1to8}", - X86Fnmadd, v8f64>, EVEX_V512, VEX_W, - EVEX_CD8<64, CD8VF>; - defm VFNMSUB213PDZ : avx512_fma3p_rm<0xAE, "vfnmsub213pd", VR512, f512mem, - memopv8f64, f64mem, loadf64, "{1to8}", - X86Fnmsub, v8f64>, EVEX_V512, VEX_W, - EVEX_CD8<64, CD8VF>; + defm VFMADDPDZ : avx512_fma3p_forms<0xA8, 0xB8, "vfmadd", + v8f64_info, X86Fmadd>, VEX_W; + defm VFMSUBPDZ : avx512_fma3p_forms<0xAA, 0xBA, "vfmsub", + v8f64_info, X86Fmsub>, VEX_W; + defm VFMADDSUBPDZ : avx512_fma3p_forms<0xA6, 0xB6, "vfmaddsub", + v8f64_info, X86Fmaddsub>, VEX_W; + defm VFMSUBADDPDZ : avx512_fma3p_forms<0xA7, 0xB7, "vfmsubadd", + v8f64_info, X86Fmsubadd>, VEX_W; + defm VFNMADDPDZ : avx512_fma3p_forms<0xAC, 0xBC, "vfnmadd", + v8f64_info, X86Fnmadd>, VEX_W; + defm VFNMSUBPDZ : avx512_fma3p_forms<0xAE, 0xBE, "vfnmsub", + v8f64_info, X86Fnmsub>, VEX_W; } let Constraints = "$src1 = $dst" in { -multiclass avx512_fma3p_m132<bits<8> opc, string OpcodeStr, - RegisterClass RC, X86MemOperand x86memop, - PatFrag mem_frag, X86MemOperand x86scalar_mop, PatFrag scalar_mfrag, - string BrdcstStr, SDNode OpNode, ValueType OpVT> { +multiclass avx512_fma3p_m132<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _> { let mayLoad = 1 in - def m: AVX512FMA3<opc, MRMSrcMem, (outs RC:$dst), - (ins RC:$src1, RC:$src3, x86memop:$src2), + def m: AVX512FMA3<opc, MRMSrcMem, (outs _.RC:$dst), + (ins _.RC:$src1, _.RC:$src3, _.MemOp:$src2), !strconcat(OpcodeStr, " \t{$src2, $src3, $dst|$dst, $src3, $src2}"), - [(set RC:$dst, (OpVT (OpNode RC:$src1, (mem_frag addr:$src2), RC:$src3)))]>; - def mb: AVX512FMA3<opc, MRMSrcMem, (outs RC:$dst), - (ins RC:$src1, RC:$src3, x86scalar_mop:$src2), - !strconcat(OpcodeStr, " \t{${src2}", BrdcstStr, - ", $src3, $dst|$dst, $src3, ${src2}", BrdcstStr, "}"), - [(set RC:$dst, (OpNode RC:$src1, - (OpVT (X86VBroadcast (scalar_mfrag addr:$src2))), RC:$src3))]>, EVEX_B; + [(set _.RC:$dst, (_.VT (OpNode _.RC:$src1, (_.MemOpFrag addr:$src2), + _.RC:$src3)))]>; + def mb: AVX512FMA3<opc, MRMSrcMem, (outs _.RC:$dst), + (ins _.RC:$src1, _.RC:$src3, _.ScalarMemOp:$src2), + !strconcat(OpcodeStr, " \t{${src2}", _.BroadcastStr, + ", $src3, $dst|$dst, $src3, ${src2}", _.BroadcastStr, "}"), + [(set _.RC:$dst, + (OpNode _.RC:$src1, (_.VT (X86VBroadcast + (_.ScalarLdFrag addr:$src2))), + _.RC:$src3))]>, EVEX_B; } } // Constraints = "$src1 = $dst" let ExeDomain = SSEPackedSingle in { - defm VFMADD132PSZ : avx512_fma3p_m132<0x98, "vfmadd132ps", VR512, f512mem, - memopv16f32, f32mem, loadf32, "{1to16}", - X86Fmadd, v16f32>, EVEX_V512, - EVEX_CD8<32, CD8VF>; - defm VFMSUB132PSZ : avx512_fma3p_m132<0x9A, "vfmsub132ps", VR512, f512mem, - memopv16f32, f32mem, loadf32, "{1to16}", - 
X86Fmsub, v16f32>, EVEX_V512, - EVEX_CD8<32, CD8VF>; - defm VFMADDSUB132PSZ : avx512_fma3p_m132<0x96, "vfmaddsub132ps", VR512, f512mem, - memopv16f32, f32mem, loadf32, "{1to16}", - X86Fmaddsub, v16f32>, - EVEX_V512, EVEX_CD8<32, CD8VF>; - defm VFMSUBADD132PSZ : avx512_fma3p_m132<0x97, "vfmsubadd132ps", VR512, f512mem, - memopv16f32, f32mem, loadf32, "{1to16}", - X86Fmsubadd, v16f32>, - EVEX_V512, EVEX_CD8<32, CD8VF>; - defm VFNMADD132PSZ : avx512_fma3p_m132<0x9C, "vfnmadd132ps", VR512, f512mem, - memopv16f32, f32mem, loadf32, "{1to16}", - X86Fnmadd, v16f32>, EVEX_V512, - EVEX_CD8<32, CD8VF>; - defm VFNMSUB132PSZ : avx512_fma3p_m132<0x9E, "vfnmsub132ps", VR512, f512mem, - memopv16f32, f32mem, loadf32, "{1to16}", - X86Fnmsub, v16f32>, EVEX_V512, - EVEX_CD8<32, CD8VF>; + defm VFMADD132PSZ : avx512_fma3p_m132<0x98, "vfmadd132ps", X86Fmadd, + v16f32_info>, + EVEX_V512, EVEX_CD8<32, CD8VF>; + defm VFMSUB132PSZ : avx512_fma3p_m132<0x9A, "vfmsub132ps", X86Fmsub, + v16f32_info>, + EVEX_V512, EVEX_CD8<32, CD8VF>; + defm VFMADDSUB132PSZ : avx512_fma3p_m132<0x96, "vfmaddsub132ps", X86Fmaddsub, + v16f32_info>, + EVEX_V512, EVEX_CD8<32, CD8VF>; + defm VFMSUBADD132PSZ : avx512_fma3p_m132<0x97, "vfmsubadd132ps", X86Fmsubadd, + v16f32_info>, + EVEX_V512, EVEX_CD8<32, CD8VF>; + defm VFNMADD132PSZ : avx512_fma3p_m132<0x9C, "vfnmadd132ps", X86Fnmadd, + v16f32_info>, + EVEX_V512, EVEX_CD8<32, CD8VF>; + defm VFNMSUB132PSZ : avx512_fma3p_m132<0x9E, "vfnmsub132ps", X86Fnmsub, + v16f32_info>, + EVEX_V512, EVEX_CD8<32, CD8VF>; } let ExeDomain = SSEPackedDouble in { - defm VFMADD132PDZ : avx512_fma3p_m132<0x98, "vfmadd132pd", VR512, f512mem, - memopv8f64, f64mem, loadf64, "{1to8}", - X86Fmadd, v8f64>, EVEX_V512, - VEX_W, EVEX_CD8<64, CD8VF>; - defm VFMSUB132PDZ : avx512_fma3p_m132<0x9A, "vfmsub132pd", VR512, f512mem, - memopv8f64, f64mem, loadf64, "{1to8}", - X86Fmsub, v8f64>, EVEX_V512, VEX_W, - EVEX_CD8<64, CD8VF>; - defm VFMADDSUB132PDZ : avx512_fma3p_m132<0x96, "vfmaddsub132pd", VR512, f512mem, - memopv8f64, f64mem, loadf64, "{1to8}", - X86Fmaddsub, v8f64>, EVEX_V512, VEX_W, - EVEX_CD8<64, CD8VF>; - defm VFMSUBADD132PDZ : avx512_fma3p_m132<0x97, "vfmsubadd132pd", VR512, f512mem, - memopv8f64, f64mem, loadf64, "{1to8}", - X86Fmsubadd, v8f64>, EVEX_V512, VEX_W, - EVEX_CD8<64, CD8VF>; - defm VFNMADD132PDZ : avx512_fma3p_m132<0x9C, "vfnmadd132pd", VR512, f512mem, - memopv8f64, f64mem, loadf64, "{1to8}", - X86Fnmadd, v8f64>, EVEX_V512, VEX_W, - EVEX_CD8<64, CD8VF>; - defm VFNMSUB132PDZ : avx512_fma3p_m132<0x9E, "vfnmsub132pd", VR512, f512mem, - memopv8f64, f64mem, loadf64, "{1to8}", - X86Fnmsub, v8f64>, EVEX_V512, VEX_W, - EVEX_CD8<64, CD8VF>; + defm VFMADD132PDZ : avx512_fma3p_m132<0x98, "vfmadd132pd", X86Fmadd, + v8f64_info>, + EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; + defm VFMSUB132PDZ : avx512_fma3p_m132<0x9A, "vfmsub132pd", X86Fmsub, + v8f64_info>, + EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; + defm VFMADDSUB132PDZ : avx512_fma3p_m132<0x96, "vfmaddsub132pd", X86Fmaddsub, + v8f64_info>, + EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; + defm VFMSUBADD132PDZ : avx512_fma3p_m132<0x97, "vfmsubadd132pd", X86Fmsubadd, + v8f64_info>, + EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; + defm VFNMADD132PDZ : avx512_fma3p_m132<0x9C, "vfnmadd132pd", X86Fnmadd, + v8f64_info>, + EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; + defm VFNMSUB132PDZ : avx512_fma3p_m132<0x9E, "vfnmsub132pd", X86Fnmsub, + v8f64_info>, + EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; } // Scalar FMA @@ -3482,26 +4071,49 @@ def : Pat <(v2f64 (int_x86_avx512_rsqrt14_sd (v2f64 
VR128X:$src1), /// avx512_fp14_p rcp14ps, rcp14pd, rsqrt14ps, rsqrt14pd multiclass avx512_fp14_p<bits<8> opc, string OpcodeStr, SDNode OpNode, - RegisterClass RC, X86MemOperand x86memop, - PatFrag mem_frag, ValueType OpVt> { - def r : AVX5128I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src), - !strconcat(OpcodeStr, - " \t{$src, $dst|$dst, $src}"), - [(set RC:$dst, (OpVt (OpNode RC:$src)))]>, - EVEX; - def m : AVX5128I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), - !strconcat(OpcodeStr, " \t{$src, $dst|$dst, $src}"), - [(set RC:$dst, (OpVt (OpNode (mem_frag addr:$src))))]>, - EVEX; -} -defm VRSQRT14PSZ : avx512_fp14_p<0x4E, "vrsqrt14ps", X86frsqrt, VR512, f512mem, - memopv16f32, v16f32>, EVEX_V512, EVEX_CD8<32, CD8VF>; -defm VRSQRT14PDZ : avx512_fp14_p<0x4E, "vrsqrt14pd", X86frsqrt, VR512, f512mem, - memopv8f64, v8f64>, VEX_W, EVEX_V512, EVEX_CD8<64, CD8VF>; -defm VRCP14PSZ : avx512_fp14_p<0x4C, "vrcp14ps", X86frcp, VR512, f512mem, - memopv16f32, v16f32>, EVEX_V512, EVEX_CD8<32, CD8VF>; -defm VRCP14PDZ : avx512_fp14_p<0x4C, "vrcp14pd", X86frcp, VR512, f512mem, - memopv8f64, v8f64>, VEX_W, EVEX_V512, EVEX_CD8<64, CD8VF>; + X86VectorVTInfo _> { + defm r: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src), OpcodeStr, "$src", "$src", + (_.FloatVT (OpNode _.RC:$src))>, EVEX, T8PD; + let mayLoad = 1 in { + defm m: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.MemOp:$src), OpcodeStr, "$src", "$src", + (OpNode (_.FloatVT + (bitconvert (_.LdFrag addr:$src))))>, EVEX, T8PD; + defm mb: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.ScalarMemOp:$src), OpcodeStr, + "${src}"##_.BroadcastStr, "${src}"##_.BroadcastStr, + (OpNode (_.FloatVT + (X86VBroadcast (_.ScalarLdFrag addr:$src))))>, + EVEX, T8PD, EVEX_B; + } +} + +multiclass avx512_fp14_p_vl_all<bits<8> opc, string OpcodeStr, SDNode OpNode> { + defm PSZ : avx512_fp14_p<opc, !strconcat(OpcodeStr, "ps"), OpNode, v16f32_info>, + EVEX_V512, EVEX_CD8<32, CD8VF>; + defm PDZ : avx512_fp14_p<opc, !strconcat(OpcodeStr, "pd"), OpNode, v8f64_info>, + EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; + + // Define only if AVX512VL feature is present. 
+ let Predicates = [HasVLX] in { + defm PSZ128 : avx512_fp14_p<opc, !strconcat(OpcodeStr, "ps"), + OpNode, v4f32x_info>, + EVEX_V128, EVEX_CD8<32, CD8VF>; + defm PSZ256 : avx512_fp14_p<opc, !strconcat(OpcodeStr, "ps"), + OpNode, v8f32x_info>, + EVEX_V256, EVEX_CD8<32, CD8VF>; + defm PDZ128 : avx512_fp14_p<opc, !strconcat(OpcodeStr, "pd"), + OpNode, v2f64x_info>, + EVEX_V128, VEX_W, EVEX_CD8<64, CD8VF>; + defm PDZ256 : avx512_fp14_p<opc, !strconcat(OpcodeStr, "pd"), + OpNode, v4f64x_info>, + EVEX_V256, VEX_W, EVEX_CD8<64, CD8VF>; + } +} + +defm VRSQRT14 : avx512_fp14_p_vl_all<0x4E, "vrsqrt14", X86frsqrt>; +defm VRCP14 : avx512_fp14_p_vl_all<0x4C, "vrcp14", X86frcp>; def : Pat <(v16f32 (int_x86_avx512_rsqrt14_ps_512 (v16f32 VR512:$src), (bc_v16f32 (v16i32 immAllZerosV)), (i16 -1))), @@ -3573,93 +4185,63 @@ def : Pat <(v2f64 (int_x86_avx512_rsqrt28_sd (v2f64 VR128X:$src1), (COPY_TO_REGCLASS VR128X:$src2, FR64X)), VR128X)>; /// avx512_fp28_p rcp28ps, rcp28pd, rsqrt28ps, rsqrt28pd -multiclass avx512_fp28_p<bits<8> opc, string OpcodeStr, - RegisterClass RC, X86MemOperand x86memop> { - let hasSideEffects = 0, Predicates = [HasERI] in { - def r : AVX5128I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src), - !strconcat(OpcodeStr, - " \t{$src, $dst|$dst, $src}"), - []>, EVEX; - def rb : AVX5128I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src), - !strconcat(OpcodeStr, - " \t{{sae}, $src, $dst|$dst, $src, {sae}}"), - []>, EVEX, EVEX_B; - def m : AVX5128I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), - !strconcat(OpcodeStr, " \t{$src, $dst|$dst, $src}"), - []>, EVEX; - } -} -defm VRSQRT28PSZ : avx512_fp28_p<0xCC, "vrsqrt28ps", VR512, f512mem>, - EVEX_V512, EVEX_CD8<32, CD8VF>; -defm VRSQRT28PDZ : avx512_fp28_p<0xCC, "vrsqrt28pd", VR512, f512mem>, - VEX_W, EVEX_V512, EVEX_CD8<64, CD8VF>; -defm VRCP28PSZ : avx512_fp28_p<0xCA, "vrcp28ps", VR512, f512mem>, - EVEX_V512, EVEX_CD8<32, CD8VF>; -defm VRCP28PDZ : avx512_fp28_p<0xCA, "vrcp28pd", VR512, f512mem>, - VEX_W, EVEX_V512, EVEX_CD8<64, CD8VF>; - -def : Pat <(v16f32 (int_x86_avx512_rsqrt28_ps (v16f32 VR512:$src), - (bc_v16f32 (v16i32 immAllZerosV)), (i16 -1), FROUND_NO_EXC)), - (VRSQRT28PSZrb VR512:$src)>; -def : Pat <(v8f64 (int_x86_avx512_rsqrt28_pd (v8f64 VR512:$src), - (bc_v8f64 (v16i32 immAllZerosV)), (i8 -1), FROUND_NO_EXC)), - (VRSQRT28PDZrb VR512:$src)>; - -def : Pat <(v16f32 (int_x86_avx512_rcp28_ps (v16f32 VR512:$src), - (bc_v16f32 (v16i32 immAllZerosV)), (i16 -1), FROUND_NO_EXC)), - (VRCP28PSZrb VR512:$src)>; -def : Pat <(v8f64 (int_x86_avx512_rcp28_pd (v8f64 VR512:$src), - (bc_v8f64 (v16i32 immAllZerosV)), (i8 -1), FROUND_NO_EXC)), - (VRCP28PDZrb VR512:$src)>; - -multiclass avx512_sqrt_packed<bits<8> opc, string OpcodeStr, SDNode OpNode, - Intrinsic V16F32Int, Intrinsic V8F64Int, - OpndItins itins_s, OpndItins itins_d> { - def PSZrr :AVX512PSI<opc, MRMSrcReg, (outs VR512:$dst), (ins VR512:$src), - !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"), - [(set VR512:$dst, (v16f32 (OpNode VR512:$src)))], itins_s.rr>, - EVEX, EVEX_V512; - let mayLoad = 1 in - def PSZrm : AVX512PSI<opc, MRMSrcMem, (outs VR512:$dst), (ins f512mem:$src), - !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"), - [(set VR512:$dst, - (OpNode (v16f32 (bitconvert (memopv16f32 addr:$src)))))], - itins_s.rm>, EVEX, EVEX_V512, EVEX_CD8<32, CD8VF>; +multiclass avx512_fp28_p<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, + SDNode OpNode> { - def PDZrr : AVX512PDI<opc, MRMSrcReg, (outs VR512:$dst), (ins VR512:$src), - !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"), - 
[(set VR512:$dst, (v8f64 (OpNode VR512:$src)))], itins_d.rr>, - EVEX, EVEX_V512; + defm r : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src), OpcodeStr, "$src", "$src", + (OpNode (_.VT _.RC:$src), (i32 FROUND_CURRENT))>; - let mayLoad = 1 in - def PDZrm : AVX512PDI<opc, MRMSrcMem, (outs VR512:$dst), (ins f512mem:$src), - !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"), - [(set VR512:$dst, (OpNode - (v8f64 (bitconvert (memopv16f32 addr:$src)))))], - itins_d.rm>, EVEX, EVEX_V512, EVEX_CD8<64, CD8VF>; + defm rb : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src), OpcodeStr, + "$src", "$src", + (OpNode (_.VT _.RC:$src), (i32 FROUND_NO_EXC)), "{sae}">, EVEX_B; -let isCodeGenOnly = 1 in { - def PSZr_Int : AVX512PSI<opc, MRMSrcReg, (outs VR512:$dst), (ins VR512:$src), - !strconcat(OpcodeStr, - "ps\t{$src, $dst|$dst, $src}"), - [(set VR512:$dst, (V16F32Int VR512:$src))]>, - EVEX, EVEX_V512; - def PSZm_Int : AVX512PSI<opc, MRMSrcMem, (outs VR512:$dst), (ins f512mem:$src), - !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"), - [(set VR512:$dst, - (V16F32Int (memopv16f32 addr:$src)))]>, EVEX, - EVEX_V512, EVEX_CD8<32, CD8VF>; - def PDZr_Int : AVX512PDI<opc, MRMSrcReg, (outs VR512:$dst), (ins VR512:$src), - !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"), - [(set VR512:$dst, (V8F64Int VR512:$src))]>, - EVEX, EVEX_V512, VEX_W; - def PDZm_Int : AVX512PDI<opc, MRMSrcMem, (outs VR512:$dst), (ins f512mem:$src), - !strconcat(OpcodeStr, - "pd\t{$src, $dst|$dst, $src}"), - [(set VR512:$dst, (V8F64Int (memopv8f64 addr:$src)))]>, - EVEX, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; -} // isCodeGenOnly = 1 + defm m : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.MemOp:$src), OpcodeStr, "$src", "$src", + (OpNode (_.FloatVT + (bitconvert (_.LdFrag addr:$src))), (i32 FROUND_CURRENT))>; + + defm mb : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.MemOp:$src), OpcodeStr, "$src", "$src", + (OpNode (_.FloatVT + (X86VBroadcast (_.ScalarLdFrag addr:$src))), + (i32 FROUND_CURRENT))>, EVEX_B; +} + +multiclass avx512_eri<bits<8> opc, string OpcodeStr, SDNode OpNode> { + defm PS : avx512_fp28_p<opc, OpcodeStr#"ps", v16f32_info, OpNode>, + EVEX_CD8<32, CD8VF>; + defm PD : avx512_fp28_p<opc, OpcodeStr#"pd", v8f64_info, OpNode>, + VEX_W, EVEX_CD8<32, CD8VF>; +} + +let Predicates = [HasERI], hasSideEffects = 0 in { + + defm VRSQRT28 : avx512_eri<0xCC, "vrsqrt28", X86rsqrt28>, EVEX, EVEX_V512, T8PD; + defm VRCP28 : avx512_eri<0xCA, "vrcp28", X86rcp28>, EVEX, EVEX_V512, T8PD; + defm VEXP2 : avx512_eri<0xC8, "vexp2", X86exp2>, EVEX, EVEX_V512, T8PD; +} + +multiclass avx512_sqrt_packed<bits<8> opc, string OpcodeStr, + SDNode OpNode, X86VectorVTInfo _>{ + defm r: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src), OpcodeStr, "$src", "$src", + (_.FloatVT (OpNode _.RC:$src))>, EVEX; + let mayLoad = 1 in { + defm m: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.MemOp:$src), OpcodeStr, "$src", "$src", + (OpNode (_.FloatVT + (bitconvert (_.LdFrag addr:$src))))>, EVEX; + + defm mb: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.ScalarMemOp:$src), OpcodeStr, + "${src}"##_.BroadcastStr, "${src}"##_.BroadcastStr, + (OpNode (_.FloatVT + (X86VBroadcast (_.ScalarLdFrag addr:$src))))>, + EVEX, EVEX_B; + } } multiclass avx512_sqrt_scalar<bits<8> opc, string OpcodeStr, @@ -3723,15 +4305,45 @@ multiclass avx512_sqrt_scalar<bits<8> opc, string OpcodeStr, } } +multiclass avx512_sqrt_packed_all<bits<8> opc, string 
OpcodeStr, + SDNode OpNode> { + defm PSZ : avx512_sqrt_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, + v16f32_info>, + EVEX_V512, PS, EVEX_CD8<32, CD8VF>; + defm PDZ : avx512_sqrt_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, + v8f64_info>, + EVEX_V512, VEX_W, PD, EVEX_CD8<64, CD8VF>; + // Define only if AVX512VL feature is present. + let Predicates = [HasVLX] in { + defm PSZ128 : avx512_sqrt_packed<opc, !strconcat(OpcodeStr, "ps"), + OpNode, v4f32x_info>, + EVEX_V128, PS, EVEX_CD8<32, CD8VF>; + defm PSZ256 : avx512_sqrt_packed<opc, !strconcat(OpcodeStr, "ps"), + OpNode, v8f32x_info>, + EVEX_V256, PS, EVEX_CD8<32, CD8VF>; + defm PDZ128 : avx512_sqrt_packed<opc, !strconcat(OpcodeStr, "pd"), + OpNode, v2f64x_info>, + EVEX_V128, VEX_W, PD, EVEX_CD8<64, CD8VF>; + defm PDZ256 : avx512_sqrt_packed<opc, !strconcat(OpcodeStr, "pd"), + OpNode, v4f64x_info>, + EVEX_V256, VEX_W, PD, EVEX_CD8<64, CD8VF>; + } +} + +defm VSQRT : avx512_sqrt_packed_all<0x51, "vsqrt", fsqrt>; defm VSQRT : avx512_sqrt_scalar<0x51, "sqrt", int_x86_avx512_sqrt_ss, int_x86_avx512_sqrt_sd, - SSE_SQRTSS, SSE_SQRTSD>, - avx512_sqrt_packed<0x51, "vsqrt", fsqrt, - int_x86_avx512_sqrt_ps_512, int_x86_avx512_sqrt_pd_512, - SSE_SQRTPS, SSE_SQRTPD>; + SSE_SQRTSS, SSE_SQRTSD>; let Predicates = [HasAVX512] in { + def : Pat<(v16f32 (int_x86_avx512_sqrt_ps_512 (v16f32 VR512:$src1), + (bc_v16f32 (v16i32 immAllZerosV)), (i16 -1), FROUND_CURRENT)), + (VSQRTPSZr VR512:$src1)>; + def : Pat<(v8f64 (int_x86_avx512_sqrt_pd_512 (v8f64 VR512:$src1), + (bc_v8f64 (v16i32 immAllZerosV)), (i8 -1), FROUND_CURRENT)), + (VSQRTPDZr VR512:$src1)>; + def : Pat<(f32 (fsqrt FR32X:$src)), (VSQRTSSZr (f32 (IMPLICIT_DEF)), FR32X:$src)>; def : Pat<(f32 (fsqrt (load addr:$src))), @@ -4301,33 +4913,29 @@ def : Pat<(v8i64 (X86Shufp VR512:$src1, (memopv8i64 addr:$src2), (i8 imm:$imm))), (VSHUFPDZrmi VR512:$src1, addr:$src2, imm:$imm)>; -multiclass avx512_alignr<string OpcodeStr, RegisterClass RC, - X86MemOperand x86memop> { - def rri : AVX512AIi8<0x03, MRMSrcReg, (outs RC:$dst), - (ins RC:$src1, RC:$src2, i8imm:$src3), - !strconcat(OpcodeStr, - " \t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), - []>, EVEX_4V; +multiclass avx512_valign<X86VectorVTInfo _> { + defm rri : AVX512_maskable<0x03, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.RC:$src2, i8imm:$src3), + "valign"##_.Suffix, + "$src3, $src2, $src1", "$src1, $src2, $src3", + (_.VT (X86VAlign _.RC:$src2, _.RC:$src1, + (i8 imm:$src3)))>, + AVX512AIi8Base, EVEX_4V; + + // Also match valign of packed floats. 
+ def : Pat<(_.FloatVT (X86VAlign _.RC:$src1, _.RC:$src2, (i8 imm:$imm))), + (!cast<Instruction>(NAME##rri) _.RC:$src2, _.RC:$src1, imm:$imm)>; + let mayLoad = 1 in - def rmi : AVX512AIi8<0x03, MRMSrcMem, (outs RC:$dst), - (ins RC:$src1, x86memop:$src2, i8imm:$src3), - !strconcat(OpcodeStr, - " \t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + def rmi : AVX512AIi8<0x03, MRMSrcMem, (outs _.RC:$dst), + (ins _.RC:$src1, _.MemOp:$src2, i8imm:$src3), + !strconcat("valign"##_.Suffix, + " \t{$src3, $src2, $src1, $dst|" + "$dst, $src1, $src2, $src3}"), []>, EVEX_4V; } -defm VALIGND : avx512_alignr<"valignd", VR512, i512mem>, - EVEX_V512, EVEX_CD8<32, CD8VF>; -defm VALIGNQ : avx512_alignr<"valignq", VR512, i512mem>, - VEX_W, EVEX_V512, EVEX_CD8<64, CD8VF>; - -def : Pat<(v16f32 (X86PAlignr VR512:$src1, VR512:$src2, (i8 imm:$imm))), - (VALIGNDrri VR512:$src2, VR512:$src1, imm:$imm)>; -def : Pat<(v8f64 (X86PAlignr VR512:$src1, VR512:$src2, (i8 imm:$imm))), - (VALIGNQrri VR512:$src2, VR512:$src1, imm:$imm)>; -def : Pat<(v16i32 (X86PAlignr VR512:$src1, VR512:$src2, (i8 imm:$imm))), - (VALIGNDrri VR512:$src2, VR512:$src1, imm:$imm)>; -def : Pat<(v8i64 (X86PAlignr VR512:$src1, VR512:$src2, (i8 imm:$imm))), - (VALIGNQrri VR512:$src2, VR512:$src1, imm:$imm)>; +defm VALIGND : avx512_valign<v16i32_info>, EVEX_V512, EVEX_CD8<32, CD8VF>; +defm VALIGNQ : avx512_valign<v8i64_info>, VEX_W, EVEX_V512, EVEX_CD8<64, CD8VF>; // Helper fragments to match sext vXi1 to vXiY. def v16i1sextv16i32 : PatLeaf<(v16i32 (X86vsrai VR512:$src, (i8 31)))>; @@ -4525,3 +5133,32 @@ def truncstorei1 : PatFrag<(ops node:$val, node:$ptr), def : Pat<(truncstorei1 GR8:$src, addr:$dst), (MOV8mr addr:$dst, GR8:$src)>; +multiclass cvt_by_vec_width<bits<8> opc, X86VectorVTInfo Vec, string OpcodeStr > { +def rr : AVX512XS8I<opc, MRMDestReg, (outs Vec.RC:$dst), (ins Vec.KRC:$src), + !strconcat(OpcodeStr##Vec.Suffix, " \t{$src, $dst|$dst, $src}"), + [(set Vec.RC:$dst, (Vec.VT (X86vsext Vec.KRC:$src)))]>, EVEX; +} + +multiclass cvt_mask_by_elt_width<bits<8> opc, AVX512VLVectorVTInfo VTInfo, + string OpcodeStr, Predicate prd> { +let Predicates = [prd] in + defm Z : cvt_by_vec_width<opc, VTInfo.info512, OpcodeStr>, EVEX_V512; + + let Predicates = [prd, HasVLX] in { + defm Z256 : cvt_by_vec_width<opc, VTInfo.info256, OpcodeStr>, EVEX_V256; + defm Z128 : cvt_by_vec_width<opc, VTInfo.info128, OpcodeStr>, EVEX_V128; + } +} + +multiclass avx512_convert_mask_to_vector<string OpcodeStr> { + defm NAME##B : cvt_mask_by_elt_width<0x28, avx512vl_i8_info, OpcodeStr, + HasBWI>; + defm NAME##W : cvt_mask_by_elt_width<0x28, avx512vl_i16_info, OpcodeStr, + HasBWI>, VEX_W; + defm NAME##D : cvt_mask_by_elt_width<0x38, avx512vl_i32_info, OpcodeStr, + HasDQI>; + defm NAME##Q : cvt_mask_by_elt_width<0x38, avx512vl_i64_info, OpcodeStr, + HasDQI>, VEX_W; +} + +defm VPMOVM2 : avx512_convert_mask_to_vector<"vpmovm2">; diff --git a/lib/Target/X86/X86InstrArithmetic.td b/lib/Target/X86/X86InstrArithmetic.td index f2574cc..25e1e80 100644 --- a/lib/Target/X86/X86InstrArithmetic.td +++ b/lib/Target/X86/X86InstrArithmetic.td @@ -1355,49 +1355,57 @@ let Predicates = [HasBMI2] in { //===----------------------------------------------------------------------===// // ADCX Instruction // -let hasSideEffects = 0, Predicates = [HasADX], Defs = [EFLAGS] in { +let hasSideEffects = 0, Defs = [EFLAGS], Uses = [EFLAGS], + Constraints = "$src0 = $dst", AddedComplexity = 10 in { let SchedRW = [WriteALU] in { - def ADCX32rr : I<0xF6, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src), - 
"adcx{l}\t{$src, $dst|$dst, $src}", - [], IIC_BIN_NONMEM>, T8PD; - - def ADCX64rr : RI<0xF6, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src), - "adcx{q}\t{$src, $dst|$dst, $src}", - [], IIC_BIN_NONMEM>, T8PD, Requires<[In64BitMode]>; + def ADCX32rr : I<0xF6, MRMSrcReg, (outs GR32:$dst), + (ins GR32:$src0, GR32:$src), "adcx{l}\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, EFLAGS, + (X86adc_flag GR32:$src0, GR32:$src, EFLAGS))], + IIC_BIN_CARRY_NONMEM>, T8PD, Requires<[HasADX]>; + def ADCX64rr : RI<0xF6, MRMSrcReg, (outs GR64:$dst), + (ins GR64:$src0, GR64:$src), "adcx{q}\t{$src, $dst|$dst, $src}", + [(set GR64:$dst, EFLAGS, + (X86adc_flag GR64:$src0, GR64:$src, EFLAGS))], + IIC_BIN_CARRY_NONMEM>, T8PD, Requires<[HasADX, In64BitMode]>; } // SchedRW let mayLoad = 1, SchedRW = [WriteALULd] in { - def ADCX32rm : I<0xF6, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src), - "adcx{l}\t{$src, $dst|$dst, $src}", - [], IIC_BIN_MEM>, T8PD; - - def ADCX64rm : RI<0xF6, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src), - "adcx{q}\t{$src, $dst|$dst, $src}", - [], IIC_BIN_MEM>, T8PD, Requires<[In64BitMode]>; + def ADCX32rm : I<0xF6, MRMSrcMem, (outs GR32:$dst), + (ins GR32:$src0, i32mem:$src), "adcx{l}\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, EFLAGS, + (X86adc_flag GR32:$src0, (loadi32 addr:$src), EFLAGS))], + IIC_BIN_CARRY_MEM>, T8PD, Requires<[HasADX]>; + + def ADCX64rm : RI<0xF6, MRMSrcMem, (outs GR64:$dst), + (ins GR64:$src0, i64mem:$src), "adcx{q}\t{$src, $dst|$dst, $src}", + [(set GR64:$dst, EFLAGS, + (X86adc_flag GR64:$src0, (loadi64 addr:$src), EFLAGS))], + IIC_BIN_CARRY_MEM>, T8PD, Requires<[HasADX, In64BitMode]>; } } //===----------------------------------------------------------------------===// // ADOX Instruction // -let hasSideEffects = 0, Predicates = [HasADX], Defs = [EFLAGS] in { +let hasSideEffects = 0, Defs = [EFLAGS], Uses = [EFLAGS] in { let SchedRW = [WriteALU] in { def ADOX32rr : I<0xF6, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src), "adox{l}\t{$src, $dst|$dst, $src}", - [], IIC_BIN_NONMEM>, T8XS; + [], IIC_BIN_NONMEM>, T8XS, Requires<[HasADX]>; def ADOX64rr : RI<0xF6, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src), "adox{q}\t{$src, $dst|$dst, $src}", - [], IIC_BIN_NONMEM>, T8XS, Requires<[In64BitMode]>; + [], IIC_BIN_NONMEM>, T8XS, Requires<[HasADX, In64BitMode]>; } // SchedRW let mayLoad = 1, SchedRW = [WriteALULd] in { def ADOX32rm : I<0xF6, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src), "adox{l}\t{$src, $dst|$dst, $src}", - [], IIC_BIN_MEM>, T8XS; + [], IIC_BIN_MEM>, T8XS, Requires<[HasADX]>; def ADOX64rm : RI<0xF6, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src), "adox{q}\t{$src, $dst|$dst, $src}", - [], IIC_BIN_MEM>, T8XS, Requires<[In64BitMode]>; + [], IIC_BIN_MEM>, T8XS, Requires<[HasADX, In64BitMode]>; } } diff --git a/lib/Target/X86/X86InstrBuilder.h b/lib/Target/X86/X86InstrBuilder.h index e421f8c..2056056 100644 --- a/lib/Target/X86/X86InstrBuilder.h +++ b/lib/Target/X86/X86InstrBuilder.h @@ -21,8 +21,8 @@ // //===----------------------------------------------------------------------===// -#ifndef X86INSTRBUILDER_H -#define X86INSTRBUILDER_H +#ifndef LLVM_LIB_TARGET_X86_X86INSTRBUILDER_H +#define LLVM_LIB_TARGET_X86_X86INSTRBUILDER_H #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineInstrBuilder.h" diff --git a/lib/Target/X86/X86InstrCompiler.td b/lib/Target/X86/X86InstrCompiler.td index ca4f608..117b6ff 100644 --- a/lib/Target/X86/X86InstrCompiler.td +++ b/lib/Target/X86/X86InstrCompiler.td @@ -46,11 +46,11 @@ let Defs = [ESP, EFLAGS], Uses = [ESP] in { 
def ADJCALLSTACKDOWN32 : I<0, Pseudo, (outs), (ins i32imm:$amt), "#ADJCALLSTACKDOWN", [(X86callseq_start timm:$amt)]>, - Requires<[Not64BitMode]>; + Requires<[NotLP64]>; def ADJCALLSTACKUP32 : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2), "#ADJCALLSTACKUP", [(X86callseq_end timm:$amt1, timm:$amt2)]>, - Requires<[Not64BitMode]>; + Requires<[NotLP64]>; } // ADJCALLSTACKDOWN/UP implicitly use/def RSP because they may be expanded into @@ -62,11 +62,11 @@ let Defs = [RSP, EFLAGS], Uses = [RSP] in { def ADJCALLSTACKDOWN64 : I<0, Pseudo, (outs), (ins i32imm:$amt), "#ADJCALLSTACKDOWN", [(X86callseq_start timm:$amt)]>, - Requires<[In64BitMode]>; + Requires<[IsLP64]>; def ADJCALLSTACKUP64 : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2), "#ADJCALLSTACKUP", [(X86callseq_end timm:$amt1, timm:$amt2)]>, - Requires<[In64BitMode]>; + Requires<[IsLP64]>; } @@ -118,7 +118,7 @@ def SEG_ALLOCA_32 : I<0, Pseudo, (outs GR32:$dst), (ins GR32:$size), "# variable sized alloca for segmented stacks", [(set GR32:$dst, (X86SegAlloca GR32:$size))]>, - Requires<[Not64BitMode]>; + Requires<[NotLP64]>; let Defs = [RAX, RSP, EFLAGS], Uses = [RSP] in def SEG_ALLOCA_64 : I<0, Pseudo, (outs GR64:$dst), (ins GR64:$size), @@ -214,6 +214,8 @@ let isPseudo = 1 in { "#SEH_PushFrame $mode", []>; def SEH_EndPrologue : I<0, Pseudo, (outs), (ins), "#SEH_EndPrologue", []>; + def SEH_Epilogue : I<0, Pseudo, (outs), (ins), + "#SEH_Epilogue", []>; } //===----------------------------------------------------------------------===// @@ -407,7 +409,8 @@ let Defs = [RCX,RDI], isCodeGenOnly = 1 in { // All calls clobber the non-callee saved registers. ESP is marked as // a use to prevent stack-pointer assignments that appear immediately // before calls from potentially appearing dead. -let Defs = [EAX, ECX, EDX, FP0, FP1, FP2, FP3, FP4, FP5, FP6, ST0, +let Defs = [EAX, ECX, EDX, FP0, FP1, FP2, FP3, FP4, FP5, FP6, FP7, + ST0, ST1, ST2, ST3, ST4, ST5, ST6, ST7, MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7, XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7, XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15, EFLAGS], @@ -426,7 +429,8 @@ def TLS_base_addr32 : I<0, Pseudo, (outs), (ins i32mem:$sym), // a use to prevent stack-pointer assignments that appear immediately // before calls from potentially appearing dead. 
let Defs = [RAX, RCX, RDX, RSI, RDI, R8, R9, R10, R11, - FP0, FP1, FP2, FP3, FP4, FP5, FP6, ST0, ST1, + FP0, FP1, FP2, FP3, FP4, FP5, FP6, FP7, + ST0, ST1, ST2, ST3, ST4, ST5, ST6, ST7, MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7, XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7, XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15, EFLAGS], @@ -747,18 +751,88 @@ defm LXADD : ATOMIC_LOAD_BINOP<0xc0, 0xc1, "xadd", "atomic_load_add", IIC_XADD_LOCK_MEM8, IIC_XADD_LOCK_MEM>, TB, LOCK; -def ACQUIRE_MOV8rm : I<0, Pseudo, (outs GR8 :$dst), (ins i8mem :$src), - "#ACQUIRE_MOV PSEUDO!", - [(set GR8:$dst, (atomic_load_8 addr:$src))]>; -def ACQUIRE_MOV16rm : I<0, Pseudo, (outs GR16:$dst), (ins i16mem:$src), - "#ACQUIRE_MOV PSEUDO!", - [(set GR16:$dst, (atomic_load_16 addr:$src))]>; -def ACQUIRE_MOV32rm : I<0, Pseudo, (outs GR32:$dst), (ins i32mem:$src), - "#ACQUIRE_MOV PSEUDO!", - [(set GR32:$dst, (atomic_load_32 addr:$src))]>; -def ACQUIRE_MOV64rm : I<0, Pseudo, (outs GR64:$dst), (ins i64mem:$src), - "#ACQUIRE_MOV PSEUDO!", - [(set GR64:$dst, (atomic_load_64 addr:$src))]>; +/* The following multiclass tries to make sure that in code like + * x.store (immediate op x.load(acquire), release) + * an operation directly on memory is generated instead of wasting a register. + * It is not automatic as atomic_store/load are only lowered to MOV instructions + * extremely late to prevent them from being accidentally reordered in the backend + * (see below the RELEASE_MOV* / ACQUIRE_MOV* pseudo-instructions) + */ +multiclass RELEASE_BINOP_MI<string op> { + def NAME#8mi : I<0, Pseudo, (outs), (ins i8mem:$dst, i8imm:$src), + "#RELEASE_BINOP PSEUDO!", + [(atomic_store_8 addr:$dst, (!cast<PatFrag>(op) + (atomic_load_8 addr:$dst), (i8 imm:$src)))]>; + // NAME#16 is not generated as 16-bit arithmetic instructions are considered + // costly and avoided as far as possible by this backend anyway + def NAME#32mi : I<0, Pseudo, (outs), (ins i32mem:$dst, i32imm:$src), + "#RELEASE_BINOP PSEUDO!", + [(atomic_store_32 addr:$dst, (!cast<PatFrag>(op) + (atomic_load_32 addr:$dst), (i32 imm:$src)))]>; + def NAME#64mi32 : I<0, Pseudo, (outs), (ins i64mem:$dst, i64i32imm:$src), + "#RELEASE_BINOP PSEUDO!", + [(atomic_store_64 addr:$dst, (!cast<PatFrag>(op) + (atomic_load_64 addr:$dst), (i64immSExt32:$src)))]>; +} +defm RELEASE_ADD : RELEASE_BINOP_MI<"add">; +defm RELEASE_AND : RELEASE_BINOP_MI<"and">; +defm RELEASE_OR : RELEASE_BINOP_MI<"or">; +defm RELEASE_XOR : RELEASE_BINOP_MI<"xor">; +// Note: we don't deal with sub, because substractions of constants are +// optimized into additions before this code can run + +multiclass RELEASE_UNOP<dag dag8, dag dag16, dag dag32, dag dag64> { + def NAME#8m : I<0, Pseudo, (outs), (ins i8mem:$dst), + "#RELEASE_UNOP PSEUDO!", + [(atomic_store_8 addr:$dst, dag8)]>; + def NAME#16m : I<0, Pseudo, (outs), (ins i16mem:$dst), + "#RELEASE_UNOP PSEUDO!", + [(atomic_store_16 addr:$dst, dag16)]>; + def NAME#32m : I<0, Pseudo, (outs), (ins i32mem:$dst), + "#RELEASE_UNOP PSEUDO!", + [(atomic_store_32 addr:$dst, dag32)]>; + def NAME#64m : I<0, Pseudo, (outs), (ins i64mem:$dst), + "#RELEASE_UNOP PSEUDO!", + [(atomic_store_64 addr:$dst, dag64)]>; +} + +defm RELEASE_INC : RELEASE_UNOP< + (add (atomic_load_8 addr:$dst), (i8 1)), + (add (atomic_load_16 addr:$dst), (i16 1)), + (add (atomic_load_32 addr:$dst), (i32 1)), + (add (atomic_load_64 addr:$dst), (i64 1))>, Requires<[NotSlowIncDec]>; +defm RELEASE_DEC : RELEASE_UNOP< + (add (atomic_load_8 addr:$dst), (i8 -1)), + (add (atomic_load_16 addr:$dst), (i16 -1)), + (add 
(atomic_load_32 addr:$dst), (i32 -1)), + (add (atomic_load_64 addr:$dst), (i64 -1))>, Requires<[NotSlowIncDec]>; +/* +TODO: These don't work because the type inference of TableGen fails. +TODO: find a way to fix it. +defm RELEASE_NEG : RELEASE_UNOP< + (ineg (atomic_load_8 addr:$dst)), + (ineg (atomic_load_16 addr:$dst)), + (ineg (atomic_load_32 addr:$dst)), + (ineg (atomic_load_64 addr:$dst))>; +defm RELEASE_NOT : RELEASE_UNOP< + (not (atomic_load_8 addr:$dst)), + (not (atomic_load_16 addr:$dst)), + (not (atomic_load_32 addr:$dst)), + (not (atomic_load_64 addr:$dst))>; +*/ + +def RELEASE_MOV8mi : I<0, Pseudo, (outs), (ins i8mem:$dst, i8imm:$src), + "#RELEASE_MOV PSEUDO !", + [(atomic_store_8 addr:$dst, (i8 imm:$src))]>; +def RELEASE_MOV16mi : I<0, Pseudo, (outs), (ins i16mem:$dst, i16imm:$src), + "#RELEASE_MOV PSEUDO !", + [(atomic_store_16 addr:$dst, (i16 imm:$src))]>; +def RELEASE_MOV32mi : I<0, Pseudo, (outs), (ins i32mem:$dst, i32imm:$src), + "#RELEASE_MOV PSEUDO !", + [(atomic_store_32 addr:$dst, (i32 imm:$src))]>; +def RELEASE_MOV64mi32 : I<0, Pseudo, (outs), (ins i64mem:$dst, i64i32imm:$src), + "#RELEASE_MOV PSEUDO !", + [(atomic_store_64 addr:$dst, i64immSExt32:$src)]>; def RELEASE_MOV8mr : I<0, Pseudo, (outs), (ins i8mem :$dst, GR8 :$src), "#RELEASE_MOV PSEUDO!", @@ -773,11 +847,22 @@ def RELEASE_MOV64mr : I<0, Pseudo, (outs), (ins i64mem:$dst, GR64:$src), "#RELEASE_MOV PSEUDO!", [(atomic_store_64 addr:$dst, GR64:$src)]>; +def ACQUIRE_MOV8rm : I<0, Pseudo, (outs GR8 :$dst), (ins i8mem :$src), + "#ACQUIRE_MOV PSEUDO!", + [(set GR8:$dst, (atomic_load_8 addr:$src))]>; +def ACQUIRE_MOV16rm : I<0, Pseudo, (outs GR16:$dst), (ins i16mem:$src), + "#ACQUIRE_MOV PSEUDO!", + [(set GR16:$dst, (atomic_load_16 addr:$src))]>; +def ACQUIRE_MOV32rm : I<0, Pseudo, (outs GR32:$dst), (ins i32mem:$src), + "#ACQUIRE_MOV PSEUDO!", + [(set GR32:$dst, (atomic_load_32 addr:$src))]>; +def ACQUIRE_MOV64rm : I<0, Pseudo, (outs GR64:$dst), (ins i64mem:$src), + "#ACQUIRE_MOV PSEUDO!", + [(set GR64:$dst, (atomic_load_64 addr:$src))]>; //===----------------------------------------------------------------------===// // Conditional Move Pseudo Instructions. //===----------------------------------------------------------------------===// - // CMOV* - Used to implement the SSE SELECT DAG operation. Expanded after // instruction selection into a branch sequence. 
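As an aside (not part of the commit), a minimal C++ illustration of the source-level pattern the RELEASE_BINOP/RELEASE_UNOP pseudo-instructions above are meant to catch; the variable name and the suggested assembly are only illustrative:

  // Illustrative only: the kind of code the RELEASE_* pseudos target.
  #include <atomic>

  std::atomic<int> counter{0};   // hypothetical global

  void bump() {
    // store(load(acquire) + 5, release) is not an atomic RMW, so it can be
    // lowered to one memory-destination add (e.g. "addl $5, counter(%rip)")
    // instead of a load, a register add, and a store.
    counter.store(counter.load(std::memory_order_acquire) + 5,
                  std::memory_order_release);
  }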
let Uses = [EFLAGS], usesCustomInserter = 1 in { @@ -1106,6 +1191,7 @@ def def32 : PatLeaf<(i32 GR32:$src), [{ return N->getOpcode() != ISD::TRUNCATE && N->getOpcode() != TargetOpcode::EXTRACT_SUBREG && N->getOpcode() != ISD::CopyFromReg && + N->getOpcode() != ISD::AssertSext && N->getOpcode() != X86ISD::CMOV; }]>; diff --git a/lib/Target/X86/X86InstrExtension.td b/lib/Target/X86/X86InstrExtension.td index 6be6a1f..b38129a 100644 --- a/lib/Target/X86/X86InstrExtension.td +++ b/lib/Target/X86/X86InstrExtension.td @@ -97,13 +97,23 @@ def MOVZX32rm16: I<0xB7, MRMSrcMem, (outs GR32:$dst), (ins i16mem:$src), let neverHasSideEffects = 1, isCodeGenOnly = 1 in { def MOVZX32_NOREXrr8 : I<0xB6, MRMSrcReg, (outs GR32_NOREX:$dst), (ins GR8_NOREX:$src), - "movz{bl|x}\t{$src, $dst|$dst, $src}", + "movz{bl|x}\t{$src, $dst|$dst, $src} # NOREX", [], IIC_MOVZX>, TB, Sched<[WriteALU]>; let mayLoad = 1 in def MOVZX32_NOREXrm8 : I<0xB6, MRMSrcMem, (outs GR32_NOREX:$dst), (ins i8mem_NOREX:$src), - "movz{bl|x}\t{$src, $dst|$dst, $src}", + "movz{bl|x}\t{$src, $dst|$dst, $src} # NOREX", [], IIC_MOVZX>, TB, Sched<[WriteALULd]>; + +def MOVSX32_NOREXrr8 : I<0xBE, MRMSrcReg, + (outs GR32_NOREX:$dst), (ins GR8_NOREX:$src), + "movs{bl|x}\t{$src, $dst|$dst, $src} # NOREX", + [], IIC_MOVSX>, TB, Sched<[WriteALU]>; +let mayLoad = 1 in +def MOVSX32_NOREXrm8 : I<0xBE, MRMSrcMem, + (outs GR32_NOREX:$dst), (ins i8mem_NOREX:$src), + "movs{bl|x}\t{$src, $dst|$dst, $src} # NOREX", + [], IIC_MOVSX>, TB, Sched<[WriteALULd]>; } // MOVSX64rr8 always has a REX prefix and it has an 8-bit register diff --git a/lib/Target/X86/X86InstrFPStack.td b/lib/Target/X86/X86InstrFPStack.td index 4ad7b7e..d9f173e 100644 --- a/lib/Target/X86/X86InstrFPStack.td +++ b/lib/Target/X86/X86InstrFPStack.td @@ -114,9 +114,6 @@ let usesCustomInserter = 1 in { // Expanded after instruction selection. // a pattern) and the FPI instruction should have emission info (e.g. opcode // encoding and asm printing info). -// Pseudo Instruction for FP stack return values. -def FpPOP_RETVAL : FpI_<(outs RFP80:$dst), (ins), SpecialFP, []>; - // FpIf32, FpIf64 - Floating Point Pseudo Instruction template. // f32 instructions can use SSE1 and are predicated on FPStackf32 == !SSE1. // f64 instructions can use SSE2 and are predicated on FPStackf64 == !SSE2. 
diff --git a/lib/Target/X86/X86InstrFormats.td b/lib/Target/X86/X86InstrFormats.td index cc30266..fe4ead1 100644 --- a/lib/Target/X86/X86InstrFormats.td +++ b/lib/Target/X86/X86InstrFormats.td @@ -36,20 +36,21 @@ def MRM6m : Format<30>; def MRM7m : Format<31>; def MRM_C0 : Format<32>; def MRM_C1 : Format<33>; def MRM_C2 : Format<34>; def MRM_C3 : Format<35>; def MRM_C4 : Format<36>; def MRM_C8 : Format<37>; def MRM_C9 : Format<38>; def MRM_CA : Format<39>; def MRM_CB : Format<40>; -def MRM_D0 : Format<41>; def MRM_D1 : Format<42>; def MRM_D4 : Format<43>; -def MRM_D5 : Format<44>; def MRM_D6 : Format<45>; def MRM_D8 : Format<46>; -def MRM_D9 : Format<47>; def MRM_DA : Format<48>; def MRM_DB : Format<49>; -def MRM_DC : Format<50>; def MRM_DD : Format<51>; def MRM_DE : Format<52>; -def MRM_DF : Format<53>; def MRM_E0 : Format<54>; def MRM_E1 : Format<55>; -def MRM_E2 : Format<56>; def MRM_E3 : Format<57>; def MRM_E4 : Format<58>; -def MRM_E5 : Format<59>; def MRM_E8 : Format<60>; def MRM_E9 : Format<61>; -def MRM_EA : Format<62>; def MRM_EB : Format<63>; def MRM_EC : Format<64>; -def MRM_ED : Format<65>; def MRM_EE : Format<66>; def MRM_F0 : Format<67>; -def MRM_F1 : Format<68>; def MRM_F2 : Format<69>; def MRM_F3 : Format<70>; -def MRM_F4 : Format<71>; def MRM_F5 : Format<72>; def MRM_F6 : Format<73>; -def MRM_F7 : Format<74>; def MRM_F8 : Format<75>; def MRM_F9 : Format<76>; -def MRM_FA : Format<77>; def MRM_FB : Format<78>; def MRM_FC : Format<79>; -def MRM_FD : Format<80>; def MRM_FE : Format<81>; def MRM_FF : Format<82>; +def MRM_CF : Format<41>; def MRM_D0 : Format<42>; def MRM_D1 : Format<43>; +def MRM_D4 : Format<44>; def MRM_D5 : Format<45>; def MRM_D6 : Format<46>; +def MRM_D7 : Format<47>; def MRM_D8 : Format<48>; def MRM_D9 : Format<49>; +def MRM_DA : Format<50>; def MRM_DB : Format<51>; def MRM_DC : Format<52>; +def MRM_DD : Format<53>; def MRM_DE : Format<54>; def MRM_DF : Format<55>; +def MRM_E0 : Format<56>; def MRM_E1 : Format<57>; def MRM_E2 : Format<58>; +def MRM_E3 : Format<59>; def MRM_E4 : Format<60>; def MRM_E5 : Format<61>; +def MRM_E8 : Format<62>; def MRM_E9 : Format<63>; def MRM_EA : Format<64>; +def MRM_EB : Format<65>; def MRM_EC : Format<66>; def MRM_ED : Format<67>; +def MRM_EE : Format<68>; def MRM_F0 : Format<69>; def MRM_F1 : Format<70>; +def MRM_F2 : Format<71>; def MRM_F3 : Format<72>; def MRM_F4 : Format<73>; +def MRM_F5 : Format<74>; def MRM_F6 : Format<75>; def MRM_F7 : Format<76>; +def MRM_F8 : Format<77>; def MRM_F9 : Format<78>; def MRM_FA : Format<79>; +def MRM_FB : Format<80>; def MRM_FC : Format<81>; def MRM_FD : Format<82>; +def MRM_FE : Format<83>; def MRM_FF : Format<84>; // ImmType - This specifies the immediate type used by an instruction. This is // part of the ad-hoc solution used to emit machine instruction encodings by our @@ -100,6 +101,7 @@ def CD8VF : CD8VForm<0>; // v := VL def CD8VH : CD8VForm<1>; // v := VL/2 def CD8VQ : CD8VForm<2>; // v := VL/4 def CD8VO : CD8VForm<3>; // v := VL/8 +// The tuple (subvector) forms. 
def CD8VT1 : CD8VForm<4>; // v := 1 def CD8VT2 : CD8VForm<5>; // v := 2 def CD8VT4 : CD8VForm<6>; // v := 4 @@ -184,13 +186,16 @@ class EVEX_KZ : EVEX_K { bit hasEVEX_Z = 1; } class EVEX_B { bit hasEVEX_B = 1; } class EVEX_RC { bit hasEVEX_RC = 1; } class EVEX_V512 { bit hasEVEX_L2 = 1; bit hasVEX_L = 0; } +class EVEX_V256 { bit hasEVEX_L2 = 0; bit hasVEX_L = 1; } +class EVEX_V128 { bit hasEVEX_L2 = 0; bit hasVEX_L = 0; } + +// Specify AVX512 8-bit compressed displacement encoding based on the vector +// element size in bits (8, 16, 32, 64) and the CDisp8 form. class EVEX_CD8<int esize, CD8VForm form> { - bits<2> EVEX_CD8E = !if(!eq(esize, 8), 0b00, - !if(!eq(esize, 16), 0b01, - !if(!eq(esize, 32), 0b10, - !if(!eq(esize, 64), 0b11, ?)))); - bits<3> EVEX_CD8V = form.Value; + int CD8_EltSize = !srl(esize, 3); + bits<3> CD8_Form = form.Value; } + class Has3DNow0F0FOpcode { bit has3DNow0F0FOpcode = 1; } class MemOp4 { bit hasMemOp4Prefix = 1; } class XOP { Encoding OpEnc = EncXOP; } @@ -253,12 +258,32 @@ class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins, bit hasEVEX_Z = 0; // Does this inst set the EVEX_Z field? bit hasEVEX_L2 = 0; // Does this inst set the EVEX_L2 field? bit hasEVEX_B = 0; // Does this inst set the EVEX_B field? - bits<2> EVEX_CD8E = 0; // Compressed disp8 form - element-size. - bits<3> EVEX_CD8V = 0; // Compressed disp8 form - vector-width. + bits<3> CD8_Form = 0; // Compressed disp8 form - vector-width. + // Declare it int rather than bits<4> so that all bits are defined when + // assigning to bits<7>. + int CD8_EltSize = 0; // Compressed disp8 form - element-size in bytes. bit has3DNow0F0FOpcode =0;// Wacky 3dNow! encoding? bit hasMemOp4Prefix = 0; // Same bit as VEX_W, but used for swapping operands bit hasEVEX_RC = 0; // Explicitly specified rounding control in FP instruction. + bits<2> EVEX_LL; + let EVEX_LL{0} = hasVEX_L; + let EVEX_LL{1} = hasEVEX_L2; + // Vector size in bytes. + bits<7> VectSize = !shl(16, EVEX_LL); + + // The scaling factor for AVX512's compressed displacement is either + // - the size of a power-of-two number of elements or + // - the size of a single element for broadcasts or + // - the total vector size divided by a power-of-two number. + // Possible values are: 0 (non-AVX512 inst), 1, 2, 4, 8, 16, 32 and 64. + bits<7> CD8_Scale = !if (!eq (OpEnc.Value, EncEVEX.Value), + !if (CD8_Form{2}, + !shl(CD8_EltSize, CD8_Form{1-0}), + !if (hasEVEX_B, + CD8_EltSize, + !srl(VectSize, CD8_Form{1-0}))), 0); + // TSFlags layout should be kept in sync with X86InstrInfo.h. let TSFlags{6-0} = FormBits; let TSFlags{8-7} = OpSizeBits; @@ -283,11 +308,11 @@ class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins, let TSFlags{45} = hasEVEX_Z; let TSFlags{46} = hasEVEX_L2; let TSFlags{47} = hasEVEX_B; - let TSFlags{49-48} = EVEX_CD8E; - let TSFlags{52-50} = EVEX_CD8V; - let TSFlags{53} = has3DNow0F0FOpcode; - let TSFlags{54} = hasMemOp4Prefix; - let TSFlags{55} = hasEVEX_RC; + // If we run out of TSFlags bits, it's possible to encode this in 3 bits. 
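A small standalone sketch (an assumption of mine, not code from the tree) of the CD8_Scale rule that the !if expression above encodes; eltBytes, form, broadcast and vecBytes stand in for CD8_EltSize, CD8_Form, hasEVEX_B and VectSize:

  // Sketch of the AVX-512 compressed-disp8 scale selection.
  #include <cstdio>

  unsigned cd8Scale(bool isEVEX, unsigned form, unsigned eltBytes,
                    bool broadcast, unsigned vecBytes) {
    if (!isEVEX)
      return 0;                          // non-AVX512 instruction
    if (form & 4)                        // tuple forms CD8VT1..CD8VT8
      return eltBytes << (form & 3);     // size of 1, 2, 4 or 8 elements
    if (broadcast)
      return eltBytes;                   // broadcast loads a single element
    return vecBytes >> (form & 3);       // CD8VF/VH/VQ/VO: VL, VL/2, VL/4, VL/8
  }

  int main() {
    // e.g. EVEX_CD8<32, CD8VF> on a 512-bit vector scales disp8 by 64 bytes.
    std::printf("%u\n", cd8Scale(true, /*CD8VF*/ 0, 4, false, 64));
  }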
+ let TSFlags{54-48} = CD8_Scale; + let TSFlags{55} = has3DNow0F0FOpcode; + let TSFlags{56} = hasMemOp4Prefix; + let TSFlags{57} = hasEVEX_RC; } class PseudoI<dag oops, dag iops, list<dag> pattern> @@ -690,14 +715,25 @@ class AVX512BI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern, InstrItinClass itin = NoItinerary> : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, PD, Requires<[HasAVX512]>; +class AVX512BIBase : PD { + Domain ExeDomain = SSEPackedInt; +} class AVX512BIi8<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern, InstrItinClass itin = NoItinerary> : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, PD, Requires<[HasAVX512]>; +class AVX512BIi8Base : PD { + Domain ExeDomain = SSEPackedInt; + ImmType ImmT = Imm8; +} class AVX512AIi8<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern, InstrItinClass itin = NoItinerary> : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TAPD, Requires<[HasAVX512]>; +class AVX512AIi8Base : TAPD { + Domain ExeDomain = SSEPackedInt; + ImmType ImmT = Imm8; +} class AVX512Ii8<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern, InstrItinClass itin = NoItinerary> : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, @@ -720,6 +756,11 @@ class AVX512FMA3<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag>pattern, InstrItinClass itin = NoItinerary> : I<o, F, outs, ins, asm, pattern, itin>, T8PD, EVEX_4V, Requires<[HasAVX512]>; +class AVX512FMA3Base : T8PD, EVEX_4V; + +class AVX512<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag>pattern, InstrItinClass itin = NoItinerary> + : I<o, F, outs, ins, asm, pattern, itin>, Requires<[HasAVX512]>; // AES Instruction Templates: // diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td index 6f0fa94..1c7215c 100644 --- a/lib/Target/X86/X86InstrFragmentsSIMD.td +++ b/lib/Target/X86/X86InstrFragmentsSIMD.td @@ -83,7 +83,7 @@ def X86pinsrw : SDNode<"X86ISD::PINSRW", SDTCisVT<2, i32>, SDTCisPtrTy<3>]>>; def X86insertps : SDNode<"X86ISD::INSERTPS", SDTypeProfile<1, 3, [SDTCisVT<0, v4f32>, SDTCisSameAs<0,1>, - SDTCisVT<2, v4f32>, SDTCisPtrTy<3>]>>; + SDTCisVT<2, v4f32>, SDTCisVT<3, i8>]>>; def X86vzmovl : SDNode<"X86ISD::VZEXT_MOVL", SDTypeProfile<1, 1, [SDTCisSameAs<0,1>]>>; @@ -188,6 +188,8 @@ def SDTShuff2Op : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, def SDTShuff3Op : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>, SDTCisSameAs<0,2>, SDTCisSameAs<0,3>]>; +def SDTShuff2OpM : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, + SDTCisVec<2>]>; def SDTShuff2OpI : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, SDTCisInt<2>]>; def SDTShuff3OpI : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>, @@ -197,12 +199,15 @@ def SDTVBroadcast : SDTypeProfile<1, 1, [SDTCisVec<0>]>; def SDTVBroadcastm : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>]>; def SDTBlend : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>, - SDTCisSameAs<1,2>, SDTCisVT<3, i32>]>; + SDTCisSameAs<1,2>, SDTCisVT<3, i8>]>; def SDTFma : SDTypeProfile<1, 3, [SDTCisSameAs<0,1>, SDTCisSameAs<1,2>, SDTCisSameAs<1,3>]>; +def STDFp1SrcRm : SDTypeProfile<1, 2, [SDTCisSameAs<0,1>, + SDTCisVec<0>, SDTCisInt<2>]>; def X86PAlignr : SDNode<"X86ISD::PALIGNR", SDTShuff3OpI>; +def X86VAlign : SDNode<"X86ISD::VALIGN", SDTShuff3OpI>; def X86PShufd : SDNode<"X86ISD::PSHUFD", SDTShuff2OpI>; def X86PShufhw : SDNode<"X86ISD::PSHUFHW", SDTShuff2OpI>; @@ -231,10 +236,11 @@ def X86Packus : 
SDNode<"X86ISD::PACKUS", SDTPack>; def X86Unpckl : SDNode<"X86ISD::UNPCKL", SDTShuff2Op>; def X86Unpckh : SDNode<"X86ISD::UNPCKH", SDTShuff2Op>; -def X86VPermilp : SDNode<"X86ISD::VPERMILP", SDTShuff2OpI>; -def X86VPermv : SDNode<"X86ISD::VPERMV", SDTShuff2Op>; -def X86VPermi : SDNode<"X86ISD::VPERMI", SDTShuff2OpI>; -def X86VPermv3 : SDNode<"X86ISD::VPERMV3", SDTShuff3Op>; +def X86VPermilpv : SDNode<"X86ISD::VPERMILPV", SDTShuff2OpM>; +def X86VPermilpi : SDNode<"X86ISD::VPERMILPI", SDTShuff2OpI>; +def X86VPermv : SDNode<"X86ISD::VPERMV", SDTShuff2Op>; +def X86VPermi : SDNode<"X86ISD::VPERMI", SDTShuff2OpI>; +def X86VPermv3 : SDNode<"X86ISD::VPERMV3", SDTShuff3Op>; def X86VPermiv3 : SDNode<"X86ISD::VPERMIV3", SDTShuff3Op>; def X86VPerm2x128 : SDNode<"X86ISD::VPERM2X128", SDTShuff3OpI>; @@ -247,6 +253,9 @@ def X86Vextract : SDNode<"X86ISD::VEXTRACT", SDTypeProfile<1, 2, [SDTCisVec<1>, SDTCisPtrTy<2>]>, []>; def X86Blendi : SDNode<"X86ISD::BLENDI", SDTBlend>; + +def X86Addsub : SDNode<"X86ISD::ADDSUB", SDTFPBinOp>; + def X86Fmadd : SDNode<"X86ISD::FMADD", SDTFma>; def X86Fnmadd : SDNode<"X86ISD::FNMADD", SDTFma>; def X86Fmsub : SDNode<"X86ISD::FMSUB", SDTFma>; @@ -254,6 +263,10 @@ def X86Fnmsub : SDNode<"X86ISD::FNMSUB", SDTFma>; def X86Fmaddsub : SDNode<"X86ISD::FMADDSUB", SDTFma>; def X86Fmsubadd : SDNode<"X86ISD::FMSUBADD", SDTFma>; +def X86rsqrt28 : SDNode<"X86ISD::RSQRT28", STDFp1SrcRm>; +def X86rcp28 : SDNode<"X86ISD::RCP28", STDFp1SrcRm>; +def X86exp2 : SDNode<"X86ISD::EXP2", STDFp1SrcRm>; + def SDT_PCMPISTRI : SDTypeProfile<2, 3, [SDTCisVT<0, i32>, SDTCisVT<1, i32>, SDTCisVT<2, v16i8>, SDTCisVT<3, v16i8>, SDTCisVT<4, i8>]>; @@ -311,6 +324,8 @@ def loadv4i64 : PatFrag<(ops node:$ptr), (v4i64 (load node:$ptr))>; // 512-bit load pattern fragments def loadv16f32 : PatFrag<(ops node:$ptr), (v16f32 (load node:$ptr))>; def loadv8f64 : PatFrag<(ops node:$ptr), (v8f64 (load node:$ptr))>; +def loadv64i8 : PatFrag<(ops node:$ptr), (v64i8 (load node:$ptr))>; +def loadv32i16 : PatFrag<(ops node:$ptr), (v32i16 (load node:$ptr))>; def loadv16i32 : PatFrag<(ops node:$ptr), (v16i32 (load node:$ptr))>; def loadv8i64 : PatFrag<(ops node:$ptr), (v8i64 (load node:$ptr))>; @@ -509,7 +524,9 @@ def I8Imm : SDNodeXForm<imm, [{ }]>; def FROUND_NO_EXC : ImmLeaf<i32, [{ return Imm == 8; }]>; -def FROUND_CURRENT : ImmLeaf<i32, [{ return Imm == 4; }]>; +def FROUND_CURRENT : ImmLeaf<i32, [{ + return Imm == X86::STATIC_ROUNDING::CUR_DIRECTION; +}]>; // BYTE_imm - Transform bit immediates into byte immediates. def BYTE_imm : SDNodeXForm<imm, [{ diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp index 0d3afc4..7f87bdd 100644 --- a/lib/Target/X86/X86InstrInfo.cpp +++ b/lib/Target/X86/X86InstrInfo.cpp @@ -26,6 +26,7 @@ #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/StackMaps.h" #include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Function.h" #include "llvm/IR/LLVMContext.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCExpr.h" @@ -100,8 +101,8 @@ void X86InstrInfo::anchor() {} X86InstrInfo::X86InstrInfo(X86Subtarget &STI) : X86GenInstrInfo( - (STI.is64Bit() ? X86::ADJCALLSTACKDOWN64 : X86::ADJCALLSTACKDOWN32), - (STI.is64Bit() ? X86::ADJCALLSTACKUP64 : X86::ADJCALLSTACKUP32)), + (STI.isTarget64BitLP64() ? X86::ADJCALLSTACKDOWN64 : X86::ADJCALLSTACKDOWN32), + (STI.isTarget64BitLP64() ? 
X86::ADJCALLSTACKUP64 : X86::ADJCALLSTACKUP32)), Subtarget(STI), RI(STI) { static const X86OpTblEntry OpTbl2Addr[] = { @@ -377,7 +378,39 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VMOVUPDYrr, X86::VMOVUPDYmr, TB_FOLDED_STORE }, { X86::VMOVUPSYrr, X86::VMOVUPSYmr, TB_FOLDED_STORE }, // AVX-512 foldable instructions - { X86::VMOVPDI2DIZrr,X86::VMOVPDI2DIZmr, TB_FOLDED_STORE } + { X86::VMOVPDI2DIZrr, X86::VMOVPDI2DIZmr, TB_FOLDED_STORE }, + { X86::VMOVAPDZrr, X86::VMOVAPDZmr, TB_FOLDED_STORE | TB_ALIGN_64 }, + { X86::VMOVAPSZrr, X86::VMOVAPSZmr, TB_FOLDED_STORE | TB_ALIGN_64 }, + { X86::VMOVDQA32Zrr, X86::VMOVDQA32Zmr, TB_FOLDED_STORE | TB_ALIGN_64 }, + { X86::VMOVDQA64Zrr, X86::VMOVDQA64Zmr, TB_FOLDED_STORE | TB_ALIGN_64 }, + { X86::VMOVUPDZrr, X86::VMOVUPDZmr, TB_FOLDED_STORE }, + { X86::VMOVUPSZrr, X86::VMOVUPSZmr, TB_FOLDED_STORE }, + { X86::VMOVDQU8Zrr, X86::VMOVDQU8Zmr, TB_FOLDED_STORE }, + { X86::VMOVDQU16Zrr, X86::VMOVDQU16Zmr, TB_FOLDED_STORE }, + { X86::VMOVDQU32Zrr, X86::VMOVDQU32Zmr, TB_FOLDED_STORE }, + { X86::VMOVDQU64Zrr, X86::VMOVDQU64Zmr, TB_FOLDED_STORE }, + // AVX-512 foldable instructions (256-bit versions) + { X86::VMOVAPDZ256rr, X86::VMOVAPDZ256mr, TB_FOLDED_STORE | TB_ALIGN_32 }, + { X86::VMOVAPSZ256rr, X86::VMOVAPSZ256mr, TB_FOLDED_STORE | TB_ALIGN_32 }, + { X86::VMOVDQA32Z256rr, X86::VMOVDQA32Z256mr, TB_FOLDED_STORE | TB_ALIGN_32 }, + { X86::VMOVDQA64Z256rr, X86::VMOVDQA64Z256mr, TB_FOLDED_STORE | TB_ALIGN_32 }, + { X86::VMOVUPDZ256rr, X86::VMOVUPDZ256mr, TB_FOLDED_STORE }, + { X86::VMOVUPSZ256rr, X86::VMOVUPSZ256mr, TB_FOLDED_STORE }, + { X86::VMOVDQU8Z256rr, X86::VMOVDQU8Z256mr, TB_FOLDED_STORE }, + { X86::VMOVDQU16Z256rr, X86::VMOVDQU16Z256mr, TB_FOLDED_STORE }, + { X86::VMOVDQU32Z256rr, X86::VMOVDQU32Z256mr, TB_FOLDED_STORE }, + { X86::VMOVDQU64Z256rr, X86::VMOVDQU64Z256mr, TB_FOLDED_STORE }, + // AVX-512 foldable instructions (128-bit versions) + { X86::VMOVAPDZ128rr, X86::VMOVAPDZ128mr, TB_FOLDED_STORE | TB_ALIGN_16 }, + { X86::VMOVAPSZ128rr, X86::VMOVAPSZ128mr, TB_FOLDED_STORE | TB_ALIGN_16 }, + { X86::VMOVDQA32Z128rr, X86::VMOVDQA32Z128mr, TB_FOLDED_STORE | TB_ALIGN_16 }, + { X86::VMOVDQA64Z128rr, X86::VMOVDQA64Z128mr, TB_FOLDED_STORE | TB_ALIGN_16 }, + { X86::VMOVUPDZ128rr, X86::VMOVUPDZ128mr, TB_FOLDED_STORE }, + { X86::VMOVUPSZ128rr, X86::VMOVUPSZ128mr, TB_FOLDED_STORE }, + { X86::VMOVDQU8Z128rr, X86::VMOVDQU8Z128mr, TB_FOLDED_STORE }, + { X86::VMOVDQU16Z128rr, X86::VMOVDQU16Z128mr, TB_FOLDED_STORE }, + { X86::VMOVDQU32Z128rr, X86::VMOVDQU32Z128mr, TB_FOLDED_STORE }, + { X86::VMOVDQU64Z128rr, X86::VMOVDQU64Z128mr, TB_FOLDED_STORE } }; for (unsigned i = 0, e = array_lengthof(OpTbl0); i != e; ++i) { @@ -415,6 +448,9 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::CVTSD2SIrr, X86::CVTSD2SIrm, 0 }, { X86::CVTSS2SI64rr, X86::CVTSS2SI64rm, 0 }, { X86::CVTSS2SIrr, X86::CVTSS2SIrm, 0 }, + { X86::CVTDQ2PSrr, X86::CVTDQ2PSrm, TB_ALIGN_16 }, + { X86::CVTPD2DQrr, X86::CVTPD2DQrm, TB_ALIGN_16 }, + { X86::CVTPS2DQrr, X86::CVTPS2DQrm, TB_ALIGN_16 }, { X86::CVTTPD2DQrr, X86::CVTTPD2DQrm, TB_ALIGN_16 }, { X86::CVTTPS2DQrr, X86::CVTTPS2DQrm, TB_ALIGN_16 }, { X86::Int_CVTTSD2SI64rr,X86::Int_CVTTSD2SI64rm, 0 }, @@ -493,6 +529,11 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VCVTSD2SIrr, X86::VCVTSD2SIrm, 0 }, { X86::VCVTSS2SI64rr, X86::VCVTSS2SI64rm, 0 }, { X86::VCVTSS2SIrr, X86::VCVTSS2SIrm, 0 }, + { X86::VCVTDQ2PSrr, X86::VCVTDQ2PSrm, 0 }, + { X86::VCVTPD2DQrr, X86::VCVTPD2DQXrm, 0 }, + { X86::VCVTPS2DQrr, X86::VCVTPS2DQrm, 0 }, + { 
X86::VCVTTPD2DQrr, X86::VCVTTPD2DQXrm, 0 }, + { X86::VCVTTPS2DQrr, X86::VCVTTPS2DQrm, 0 }, { X86::VMOV64toPQIrr, X86::VMOVQI2PQIrm, 0 }, { X86::VMOV64toSDrr, X86::VMOV64toSDrm, 0 }, { X86::VMOVAPDrr, X86::VMOVAPDrm, TB_ALIGN_16 }, @@ -526,6 +567,11 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VBROADCASTSSrr, X86::VBROADCASTSSrm, TB_NO_REVERSE }, // AVX 256-bit foldable instructions + { X86::VCVTDQ2PSYrr, X86::VCVTDQ2PSYrm, 0 }, + { X86::VCVTPD2DQYrr, X86::VCVTPD2DQYrm, 0 }, + { X86::VCVTPS2DQYrr, X86::VCVTPS2DQYrm, 0 }, + { X86::VCVTTPD2DQYrr, X86::VCVTTPD2DQYrm, 0 }, + { X86::VCVTTPS2DQYrr, X86::VCVTTPS2DQYrm, 0 }, { X86::VMOVAPDYrr, X86::VMOVAPDYrm, TB_ALIGN_32 }, { X86::VMOVAPSYrr, X86::VMOVAPSYrm, TB_ALIGN_32 }, { X86::VMOVDQAYrr, X86::VMOVDQAYrm, TB_ALIGN_32 }, @@ -533,6 +579,13 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VMOVUPSYrr, X86::VMOVUPSYrm, 0 }, { X86::VPERMILPDYri, X86::VPERMILPDYmi, 0 }, { X86::VPERMILPSYri, X86::VPERMILPSYmi, 0 }, + { X86::VRCPPSYr, X86::VRCPPSYm, 0 }, + { X86::VRCPPSYr_Int, X86::VRCPPSYm_Int, 0 }, + { X86::VRSQRTPSYr, X86::VRSQRTPSYm, 0 }, + { X86::VSQRTPDYr, X86::VSQRTPDYm, 0 }, + { X86::VSQRTPSYr, X86::VSQRTPSYm, 0 }, + { X86::VBROADCASTSSYrr, X86::VBROADCASTSSYrm, TB_NO_REVERSE }, + { X86::VBROADCASTSDYrr, X86::VBROADCASTSDYrm, TB_NO_REVERSE }, // AVX2 foldable instructions { X86::VPABSBrr256, X86::VPABSBrm256, 0 }, @@ -541,13 +594,6 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VPSHUFDYri, X86::VPSHUFDYmi, 0 }, { X86::VPSHUFHWYri, X86::VPSHUFHWYmi, 0 }, { X86::VPSHUFLWYri, X86::VPSHUFLWYmi, 0 }, - { X86::VRCPPSYr, X86::VRCPPSYm, 0 }, - { X86::VRCPPSYr_Int, X86::VRCPPSYm_Int, 0 }, - { X86::VRSQRTPSYr, X86::VRSQRTPSYm, 0 }, - { X86::VSQRTPDYr, X86::VSQRTPDYm, 0 }, - { X86::VSQRTPSYr, X86::VSQRTPSYm, 0 }, - { X86::VBROADCASTSSYrr, X86::VBROADCASTSSYrm, TB_NO_REVERSE }, - { X86::VBROADCASTSDYrr, X86::VBROADCASTSDYrm, TB_NO_REVERSE }, // BMI/BMI2/LZCNT/POPCNT/TBM foldable instructions { X86::BEXTR32rr, X86::BEXTR32rm, 0 }, @@ -601,18 +647,46 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) // AVX-512 foldable instructions { X86::VMOV64toPQIZrr, X86::VMOVQI2PQIZrm, 0 }, { X86::VMOVDI2SSZrr, X86::VMOVDI2SSZrm, 0 }, - { X86::VMOVDQA32rr, X86::VMOVDQA32rm, TB_ALIGN_64 }, - { X86::VMOVDQA64rr, X86::VMOVDQA64rm, TB_ALIGN_64 }, - { X86::VMOVDQU32rr, X86::VMOVDQU32rm, 0 }, - { X86::VMOVDQU64rr, X86::VMOVDQU64rm, 0 }, + { X86::VMOVAPDZrr, X86::VMOVAPDZrm, TB_ALIGN_64 }, + { X86::VMOVAPSZrr, X86::VMOVAPSZrm, TB_ALIGN_64 }, + { X86::VMOVDQA32Zrr, X86::VMOVDQA32Zrm, TB_ALIGN_64 }, + { X86::VMOVDQA64Zrr, X86::VMOVDQA64Zrm, TB_ALIGN_64 }, + { X86::VMOVDQU8Zrr, X86::VMOVDQU8Zrm, 0 }, + { X86::VMOVDQU16Zrr, X86::VMOVDQU16Zrm, 0 }, + { X86::VMOVDQU32Zrr, X86::VMOVDQU32Zrm, 0 }, + { X86::VMOVDQU64Zrr, X86::VMOVDQU64Zrm, 0 }, + { X86::VMOVUPDZrr, X86::VMOVUPDZrm, 0 }, + { X86::VMOVUPSZrr, X86::VMOVUPSZrm, 0 }, { X86::VPABSDZrr, X86::VPABSDZrm, 0 }, { X86::VPABSQZrr, X86::VPABSQZrm, 0 }, + // AVX-512 foldable instructions (256-bit versions) + { X86::VMOVAPDZ256rr, X86::VMOVAPDZ256rm, TB_ALIGN_32 }, + { X86::VMOVAPSZ256rr, X86::VMOVAPSZ256rm, TB_ALIGN_32 }, + { X86::VMOVDQA32Z256rr, X86::VMOVDQA32Z256rm, TB_ALIGN_32 }, + { X86::VMOVDQA64Z256rr, X86::VMOVDQA64Z256rm, TB_ALIGN_32 }, + { X86::VMOVDQU8Z256rr, X86::VMOVDQU8Z256rm, 0 }, + { X86::VMOVDQU16Z256rr, X86::VMOVDQU16Z256rm, 0 }, + { X86::VMOVDQU32Z256rr, X86::VMOVDQU32Z256rm, 0 }, + { X86::VMOVDQU64Z256rr, X86::VMOVDQU64Z256rm, 0 }, + { X86::VMOVUPDZ256rr, X86::VMOVUPDZ256rm, 0 }, + { 
X86::VMOVUPSZ256rr, X86::VMOVUPSZ256rm, 0 }, + // AVX-512 foldable instructions (256-bit versions) + { X86::VMOVAPDZ128rr, X86::VMOVAPDZ128rm, TB_ALIGN_16 }, + { X86::VMOVAPSZ128rr, X86::VMOVAPSZ128rm, TB_ALIGN_16 }, + { X86::VMOVDQA32Z128rr, X86::VMOVDQA32Z128rm, TB_ALIGN_16 }, + { X86::VMOVDQA64Z128rr, X86::VMOVDQA64Z128rm, TB_ALIGN_16 }, + { X86::VMOVDQU8Z128rr, X86::VMOVDQU8Z128rm, 0 }, + { X86::VMOVDQU16Z128rr, X86::VMOVDQU16Z128rm, 0 }, + { X86::VMOVDQU32Z128rr, X86::VMOVDQU32Z128rm, 0 }, + { X86::VMOVDQU64Z128rr, X86::VMOVDQU64Z128rm, 0 }, + { X86::VMOVUPDZ128rr, X86::VMOVUPDZ128rm, 0 }, + { X86::VMOVUPSZ128rr, X86::VMOVUPSZ128rm, 0 }, // AES foldable instructions { X86::AESIMCrr, X86::AESIMCrm, TB_ALIGN_16 }, { X86::AESKEYGENASSIST128rr, X86::AESKEYGENASSIST128rm, TB_ALIGN_16 }, { X86::VAESIMCrr, X86::VAESIMCrm, TB_ALIGN_16 }, - { X86::VAESKEYGENASSIST128rr, X86::VAESKEYGENASSIST128rm, TB_ALIGN_16 }, + { X86::VAESKEYGENASSIST128rr, X86::VAESKEYGENASSIST128rm, TB_ALIGN_16 } }; for (unsigned i = 0, e = array_lengthof(OpTbl1); i != e; ++i) { @@ -869,8 +943,6 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::Int_VCVTSI2SSrr, X86::Int_VCVTSI2SSrm, 0 }, { X86::VCVTSS2SDrr, X86::VCVTSS2SDrm, 0 }, { X86::Int_VCVTSS2SDrr, X86::Int_VCVTSS2SDrm, 0 }, - { X86::VCVTTPD2DQrr, X86::VCVTTPD2DQXrm, 0 }, - { X86::VCVTTPS2DQrr, X86::VCVTTPS2DQrm, 0 }, { X86::VRSQRTSSr, X86::VRSQRTSSm, 0 }, { X86::VSQRTSDr, X86::VSQRTSDm, 0 }, { X86::VSQRTSSr, X86::VSQRTSSm, 0 }, @@ -1543,8 +1615,11 @@ static bool isFrameLoadOpcode(int Opcode) { case X86::VMOVAPSrm: case X86::VMOVAPDrm: case X86::VMOVDQArm: + case X86::VMOVUPSYrm: case X86::VMOVAPSYrm: + case X86::VMOVUPDYrm: case X86::VMOVAPDYrm: + case X86::VMOVDQUYrm: case X86::VMOVDQAYrm: case X86::MMX_MOVD64rm: case X86::MMX_MOVQ64rm: @@ -1572,8 +1647,11 @@ static bool isFrameStoreOpcode(int Opcode) { case X86::VMOVAPSmr: case X86::VMOVAPDmr: case X86::VMOVDQAmr: + case X86::VMOVUPSYmr: case X86::VMOVAPSYmr: + case X86::VMOVUPDYmr: case X86::VMOVAPDYmr: + case X86::VMOVDQUYmr: case X86::VMOVDQAYmr: case X86::VMOVUPSZmr: case X86::VMOVAPSZmr: @@ -2078,34 +2156,6 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, unsigned MIOpc = MI->getOpcode(); switch (MIOpc) { - case X86::SHUFPSrri: { - assert(MI->getNumOperands() == 4 && "Unknown shufps instruction!"); - if (!Subtarget.hasSSE2()) return nullptr; - - unsigned B = MI->getOperand(1).getReg(); - unsigned C = MI->getOperand(2).getReg(); - if (B != C) return nullptr; - unsigned M = MI->getOperand(3).getImm(); - NewMI = BuildMI(MF, MI->getDebugLoc(), get(X86::PSHUFDri)) - .addOperand(Dest).addOperand(Src).addImm(M); - break; - } - case X86::SHUFPDrri: { - assert(MI->getNumOperands() == 4 && "Unknown shufpd instruction!"); - if (!Subtarget.hasSSE2()) return nullptr; - - unsigned B = MI->getOperand(1).getReg(); - unsigned C = MI->getOperand(2).getReg(); - if (B != C) return nullptr; - unsigned M = MI->getOperand(3).getImm(); - - // Convert to PSHUFD mask. 
- M = ((M & 1) << 1) | ((M & 1) << 3) | ((M & 2) << 4) | ((M & 2) << 6)| 0x44; - - NewMI = BuildMI(MF, MI->getDebugLoc(), get(X86::PSHUFDri)) - .addOperand(Dest).addOperand(Src).addImm(M); - break; - } case X86::SHL64ri: { assert(MI->getNumOperands() >= 3 && "Unknown shift instruction!"); unsigned ShAmt = getTruncatedShiftCount(MI, 2); @@ -2387,6 +2437,42 @@ X86InstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI) const { MI->getOperand(3).setImm(Size-Amt); return TargetInstrInfo::commuteInstruction(MI, NewMI); } + case X86::BLENDPDrri: + case X86::BLENDPSrri: + case X86::PBLENDWrri: + case X86::VBLENDPDrri: + case X86::VBLENDPSrri: + case X86::VBLENDPDYrri: + case X86::VBLENDPSYrri: + case X86::VPBLENDDrri: + case X86::VPBLENDWrri: + case X86::VPBLENDDYrri: + case X86::VPBLENDWYrri:{ + unsigned Mask; + switch (MI->getOpcode()) { + default: llvm_unreachable("Unreachable!"); + case X86::BLENDPDrri: Mask = 0x03; break; + case X86::BLENDPSrri: Mask = 0x0F; break; + case X86::PBLENDWrri: Mask = 0xFF; break; + case X86::VBLENDPDrri: Mask = 0x03; break; + case X86::VBLENDPSrri: Mask = 0x0F; break; + case X86::VBLENDPDYrri: Mask = 0x0F; break; + case X86::VBLENDPSYrri: Mask = 0xFF; break; + case X86::VPBLENDDrri: Mask = 0x0F; break; + case X86::VPBLENDWrri: Mask = 0xFF; break; + case X86::VPBLENDDYrri: Mask = 0xFF; break; + case X86::VPBLENDWYrri: Mask = 0xFF; break; + } + // Only the least significant bits of Imm are used. + unsigned Imm = MI->getOperand(3).getImm() & Mask; + if (NewMI) { + MachineFunction &MF = *MI->getParent()->getParent(); + MI = MF.CloneMachineInstr(MI); + NewMI = false; + } + MI->getOperand(3).setImm(Mask ^ Imm); + return TargetInstrInfo::commuteInstruction(MI, NewMI); + } case X86::CMOVB16rr: case X86::CMOVB32rr: case X86::CMOVB64rr: case X86::CMOVAE16rr: case X86::CMOVAE32rr: case X86::CMOVAE64rr: case X86::CMOVE16rr: case X86::CMOVE32rr: case X86::CMOVE64rr: @@ -2471,6 +2557,20 @@ X86InstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI) const { bool X86InstrInfo::findCommutedOpIndices(MachineInstr *MI, unsigned &SrcOpIdx1, unsigned &SrcOpIdx2) const { switch (MI->getOpcode()) { + case X86::BLENDPDrri: + case X86::BLENDPSrri: + case X86::PBLENDWrri: + case X86::VBLENDPDrri: + case X86::VBLENDPSrri: + case X86::VBLENDPDYrri: + case X86::VBLENDPSYrri: + case X86::VPBLENDDrri: + case X86::VPBLENDDYrri: + case X86::VPBLENDWrri: + case X86::VPBLENDWYrri: + SrcOpIdx1 = 1; + SrcOpIdx2 = 2; + return true; case X86::VFMADDPDr231r: case X86::VFMADDPSr231r: case X86::VFMADDSDr231r: @@ -3067,6 +3167,8 @@ static unsigned CopyToFromAsymmetricReg(unsigned DestReg, unsigned SrcReg, inline static bool MaskRegClassContains(unsigned Reg) { return X86::VK8RegClass.contains(Reg) || X86::VK16RegClass.contains(Reg) || + X86::VK32RegClass.contains(Reg) || + X86::VK64RegClass.contains(Reg) || X86::VK1RegClass.contains(Reg); } static @@ -3143,7 +3245,7 @@ void X86InstrInfo::copyPhysReg(MachineBasicBlock &MBB, // Moving EFLAGS to / from another register requires a push and a pop. // Notice that we have to adjust the stack if we don't want to clobber the - // first frame index. See X86FrameLowering.cpp - colobbersTheStack. + // first frame index. See X86FrameLowering.cpp - clobbersTheStack. 
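A self-contained sketch (mine, not from the patch) of why the commuteInstruction change above can commute the BLEND* instructions simply by XOR-ing the immediate with the instruction's lane mask: each set bit selects the second source, so swapping the sources is the same as inverting the used bits of the mask.

  // blendps semantics and the commuted-mask identity.
  #include <array>
  #include <cassert>

  static std::array<float, 4> blendps(std::array<float, 4> a,
                                      std::array<float, 4> b, unsigned imm) {
    std::array<float, 4> r{};
    for (int i = 0; i < 4; ++i)
      r[i] = ((imm >> i) & 1) ? b[i] : a[i];   // bit set -> take lane from b
    return r;
  }

  int main() {
    std::array<float, 4> a{0, 1, 2, 3}, b{4, 5, 6, 7};
    unsigned mask = 0xF;                       // BLENDPSrri uses 4 lane bits
    for (unsigned imm = 0; imm <= mask; ++imm)
      assert(blendps(a, b, imm) == blendps(b, a, mask ^ imm));
  }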
if (SrcReg == X86::EFLAGS) { if (X86::GR64RegClass.contains(DestReg)) { BuildMI(MBB, MI, DL, get(X86::PUSHF64)); @@ -3287,9 +3389,11 @@ void X86InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, assert(MF.getFrameInfo()->getObjectSize(FrameIdx) >= RC->getSize() && "Stack slot too small for store"); unsigned Alignment = std::max<uint32_t>(RC->getSize(), 16); - bool isAligned = - (MF.getTarget().getFrameLowering()->getStackAlignment() >= Alignment) || - RI.canRealignStack(MF); + bool isAligned = (MF.getTarget() + .getSubtargetImpl() + ->getFrameLowering() + ->getStackAlignment() >= Alignment) || + RI.canRealignStack(MF); unsigned Opc = getStoreRegOpcode(SrcReg, RC, isAligned, Subtarget); DebugLoc DL = MBB.findDebugLoc(MI); addFrameReference(BuildMI(MBB, MI, DL, get(Opc)), FrameIdx) @@ -3324,9 +3428,11 @@ void X86InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, const TargetRegisterInfo *TRI) const { const MachineFunction &MF = *MBB.getParent(); unsigned Alignment = std::max<uint32_t>(RC->getSize(), 16); - bool isAligned = - (MF.getTarget().getFrameLowering()->getStackAlignment() >= Alignment) || - RI.canRealignStack(MF); + bool isAligned = (MF.getTarget() + .getSubtargetImpl() + ->getFrameLowering() + ->getStackAlignment() >= Alignment) || + RI.canRealignStack(MF); unsigned Opc = getLoadRegOpcode(DestReg, RC, isAligned, Subtarget); DebugLoc DL = MBB.findDebugLoc(MI); addFrameReference(BuildMI(MBB, MI, DL, get(Opc), DestReg), FrameIdx); @@ -3868,10 +3974,10 @@ optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2, /// operand at the use. We fold the load instructions if load defines a virtual /// register, the virtual register is used once in the same BB, and the /// instructions in-between do not load or store, and have no side effects. -MachineInstr* X86InstrInfo:: -optimizeLoadInstr(MachineInstr *MI, const MachineRegisterInfo *MRI, - unsigned &FoldAsLoadDefReg, - MachineInstr *&DefMI) const { +MachineInstr *X86InstrInfo::optimizeLoadInstr(MachineInstr *MI, + const MachineRegisterInfo *MRI, + unsigned &FoldAsLoadDefReg, + MachineInstr *&DefMI) const { if (FoldAsLoadDefReg == 0) return nullptr; // To be conservative, if there exists another load, clear the load candidate. @@ -3887,55 +3993,35 @@ optimizeLoadInstr(MachineInstr *MI, const MachineRegisterInfo *MRI, if (!DefMI->isSafeToMove(this, nullptr, SawStore)) return nullptr; - // We try to commute MI if possible. - unsigned IdxEnd = (MI->isCommutable()) ? 2 : 1; - for (unsigned Idx = 0; Idx < IdxEnd; Idx++) { - // Collect information about virtual register operands of MI. - unsigned SrcOperandId = 0; - bool FoundSrcOperand = false; - for (unsigned i = 0, e = MI->getDesc().getNumOperands(); i != e; ++i) { - MachineOperand &MO = MI->getOperand(i); - if (!MO.isReg()) - continue; - unsigned Reg = MO.getReg(); - if (Reg != FoldAsLoadDefReg) - continue; - // Do not fold if we have a subreg use or a def or multiple uses. - if (MO.getSubReg() || MO.isDef() || FoundSrcOperand) - return nullptr; - - SrcOperandId = i; - FoundSrcOperand = true; - } - if (!FoundSrcOperand) return nullptr; - - // Check whether we can fold the def into SrcOperandId. - SmallVector<unsigned, 8> Ops; - Ops.push_back(SrcOperandId); - MachineInstr *FoldMI = foldMemoryOperand(MI, Ops, DefMI); - if (FoldMI) { - FoldAsLoadDefReg = 0; - return FoldMI; - } - - if (Idx == 1) { - // MI was changed but it didn't help, commute it back! - commuteInstruction(MI, false); + // Collect information about virtual register operands of MI. 
+ unsigned SrcOperandId = 0; + bool FoundSrcOperand = false; + for (unsigned i = 0, e = MI->getDesc().getNumOperands(); i != e; ++i) { + MachineOperand &MO = MI->getOperand(i); + if (!MO.isReg()) + continue; + unsigned Reg = MO.getReg(); + if (Reg != FoldAsLoadDefReg) + continue; + // Do not fold if we have a subreg use or a def or multiple uses. + if (MO.getSubReg() || MO.isDef() || FoundSrcOperand) return nullptr; - } - // Check whether we can commute MI and enable folding. - if (MI->isCommutable()) { - MachineInstr *NewMI = commuteInstruction(MI, false); - // Unable to commute. - if (!NewMI) return nullptr; - if (NewMI != MI) { - // New instruction. It doesn't need to be kept. - NewMI->eraseFromParent(); - return nullptr; - } - } + SrcOperandId = i; + FoundSrcOperand = true; + } + if (!FoundSrcOperand) + return nullptr; + + // Check whether we can fold the def into SrcOperandId. + SmallVector<unsigned, 8> Ops; + Ops.push_back(SrcOperandId); + MachineInstr *FoldMI = foldMemoryOperand(MI, Ops, DefMI); + if (FoldMI) { + FoldAsLoadDefReg = 0; + return FoldMI; } + return nullptr; } @@ -3961,6 +4047,28 @@ static bool Expand2AddrUndef(MachineInstrBuilder &MIB, return true; } +// LoadStackGuard has so far only been implemented for 64-bit MachO. Different +// code sequence is needed for other targets. +static void expandLoadStackGuard(MachineInstrBuilder &MIB, + const TargetInstrInfo &TII) { + MachineBasicBlock &MBB = *MIB->getParent(); + DebugLoc DL = MIB->getDebugLoc(); + unsigned Reg = MIB->getOperand(0).getReg(); + const GlobalValue *GV = + cast<GlobalValue>((*MIB->memoperands_begin())->getValue()); + unsigned Flag = MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant; + MachineMemOperand *MMO = MBB.getParent()-> + getMachineMemOperand(MachinePointerInfo::getGOT(), Flag, 8, 8); + MachineBasicBlock::iterator I = MIB.getInstr(); + + BuildMI(MBB, I, DL, TII.get(X86::MOV64rm), Reg).addReg(X86::RIP).addImm(1) + .addReg(0).addGlobalAddress(GV, 0, X86II::MO_GOTPCREL).addReg(0) + .addMemOperand(MMO); + MIB->setDebugLoc(DL); + MIB->setDesc(TII.get(X86::MOV64rm)); + MIB.addReg(Reg, RegState::Kill).addImm(1).addReg(0).addImm(0).addReg(0); +} + bool X86InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { bool HasAVX = Subtarget.hasAVX(); MachineInstrBuilder MIB(*MI->getParent()->getParent(), MI); @@ -3995,6 +4103,9 @@ bool X86InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { case X86::KSET0W: return Expand2AddrUndef(MIB, get(X86::KXORWrr)); case X86::KSET1B: case X86::KSET1W: return Expand2AddrUndef(MIB, get(X86::KXNORWrr)); + case TargetOpcode::LOAD_STACK_GUARD: + expandLoadStackGuard(MIB, *this); + return true; } return false; } @@ -4070,7 +4181,8 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI, unsigned i, const SmallVectorImpl<MachineOperand> &MOs, - unsigned Size, unsigned Align) const { + unsigned Size, unsigned Align, + bool AllowCommute) const { const DenseMap<unsigned, std::pair<unsigned,unsigned> > *OpcodeTablePtr = nullptr; bool isCallRegIndirect = Subtarget.callRegIndirect(); @@ -4138,8 +4250,8 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, if (Opcode != X86::MOV64rm || RCSize != 8 || Size != 4) return nullptr; // If this is a 64-bit load, but the spill slot is 32, then we can do - // a 32-bit load which is implicitly zero-extended. This likely is due - // to liveintervalanalysis remat'ing a load from stack slot. + // a 32-bit load which is implicitly zero-extended. 
This likely is + // due to live interval analysis remat'ing a load from stack slot. if (MI->getOperand(0).getSubReg() || MI->getOperand(1).getSubReg()) return nullptr; Opcode = X86::MOV32rm; @@ -4158,8 +4270,7 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, // to a 32-bit one. unsigned DstReg = NewMI->getOperand(0).getReg(); if (TargetRegisterInfo::isPhysicalRegister(DstReg)) - NewMI->getOperand(0).setReg(RI.getSubReg(DstReg, - X86::sub_32bit)); + NewMI->getOperand(0).setReg(RI.getSubReg(DstReg, X86::sub_32bit)); else NewMI->getOperand(0).setSubReg(X86::sub_32bit); } @@ -4167,6 +4278,65 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, } } + // If the instruction and target operand are commutable, commute the + // instruction and try again. + if (AllowCommute) { + unsigned OriginalOpIdx = i, CommuteOpIdx1, CommuteOpIdx2; + if (findCommutedOpIndices(MI, CommuteOpIdx1, CommuteOpIdx2)) { + bool HasDef = MI->getDesc().getNumDefs(); + unsigned Reg0 = HasDef ? MI->getOperand(0).getReg() : 0; + unsigned Reg1 = MI->getOperand(CommuteOpIdx1).getReg(); + unsigned Reg2 = MI->getOperand(CommuteOpIdx2).getReg(); + bool Tied0 = + 0 == MI->getDesc().getOperandConstraint(CommuteOpIdx1, MCOI::TIED_TO); + bool Tied1 = + 0 == MI->getDesc().getOperandConstraint(CommuteOpIdx2, MCOI::TIED_TO); + + // If either of the commutable operands are tied to the destination + // then we can not commute + fold. + if ((HasDef && Reg0 == Reg1 && Tied0) || + (HasDef && Reg0 == Reg2 && Tied1)) + return nullptr; + + if ((CommuteOpIdx1 == OriginalOpIdx) || + (CommuteOpIdx2 == OriginalOpIdx)) { + MachineInstr *CommutedMI = commuteInstruction(MI, false); + if (!CommutedMI) { + // Unable to commute. + return nullptr; + } + if (CommutedMI != MI) { + // New instruction. We can't fold from this. + CommutedMI->eraseFromParent(); + return nullptr; + } + + // Attempt to fold with the commuted version of the instruction. + unsigned CommuteOp = + (CommuteOpIdx1 == OriginalOpIdx ? CommuteOpIdx2 : CommuteOpIdx1); + NewMI = foldMemoryOperandImpl(MF, MI, CommuteOp, MOs, Size, Align, + /*AllowCommute=*/false); + if (NewMI) + return NewMI; + + // Folding failed again - undo the commute before returning. + MachineInstr *UncommutedMI = commuteInstruction(MI, false); + if (!UncommutedMI) { + // Unable to commute. + return nullptr; + } + if (UncommutedMI != MI) { + // New instruction. It doesn't need to be kept. + UncommutedMI->eraseFromParent(); + return nullptr; + } + + // Return here to prevent duplicate fuse failure report. + return nullptr; + } + } + } + // No fusion if (PrintFailedFusing && !MI->isCopy()) dbgs() << "We failed to fuse operand " << i << " in " << *MI; @@ -4350,8 +4520,10 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI, // If the function stack isn't realigned we don't want to fold instructions // that need increased alignment. 
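The AllowCommute path added above follows a try/commute/retry/undo discipline: attempt the fold on the requested operand; if that operand is one of the commutable pair, commute the instruction in place, retry the fold on the other index, and commute back when the second attempt also fails so the instruction is left exactly as it was found. A schematic sketch of that control flow, using simplified placeholder types rather than the LLVM API:

    #include <functional>
    #include <iostream>
    #include <optional>
    #include <utility>

    struct Instr { int Op0, Op1; };  // two commutable source operands

    static std::optional<Instr>
    foldWithCommute(Instr &I,
                    const std::function<std::optional<Instr>(const Instr &)> &TryFold) {
      if (auto Folded = TryFold(I))
        return Folded;                  // folded without commuting
      std::swap(I.Op0, I.Op1);          // commute in place
      if (auto Folded = TryFold(I))
        return Folded;                  // folded with the commuted form
      std::swap(I.Op0, I.Op1);          // undo the commute; report failure
      return std::nullopt;
    }

    int main() {
      // Toy fold rule: only operand 0 can take the memory operand (value 42).
      auto TryFold = [](const Instr &I) -> std::optional<Instr> {
        if (I.Op0 == 42)
          return I;
        return std::nullopt;
      };
      Instr MI{7, 42};
      auto R = foldWithCommute(MI, TryFold);
      std::cout << (R ? "folded after commute\n" : "not folded\n");
      return 0;
    }

The early `return nullptr` after the commuted retry in the real code serves the same purpose as the final failure path here: the caller never observes a half-transformed instruction.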
if (!RI.needsStackRealignment(MF)) - Alignment = std::min( - Alignment, MF.getTarget().getFrameLowering()->getStackAlignment()); + Alignment = std::min(Alignment, MF.getTarget() + .getSubtargetImpl() + ->getFrameLowering() + ->getStackAlignment()); if (Ops.size() == 2 && Ops[0] == 0 && Ops[1] == 1) { unsigned NewOpc = 0; unsigned RCSize = 0; @@ -4374,7 +4546,27 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI, SmallVector<MachineOperand,4> MOs; MOs.push_back(MachineOperand::CreateFI(FrameIndex)); - return foldMemoryOperandImpl(MF, MI, Ops[0], MOs, Size, Alignment); + return foldMemoryOperandImpl(MF, MI, Ops[0], MOs, + Size, Alignment, /*AllowCommute=*/true); +} + +static bool isPartialRegisterLoad(const MachineInstr &LoadMI, + const MachineFunction &MF) { + unsigned Opc = LoadMI.getOpcode(); + unsigned RegSize = + MF.getRegInfo().getRegClass(LoadMI.getOperand(0).getReg())->getSize(); + + if ((Opc == X86::MOVSSrm || Opc == X86::VMOVSSrm) && RegSize > 4) + // These instructions only load 32 bits, we can't fold them if the + // destination register is wider than 32 bits (4 bytes). + return true; + + if ((Opc == X86::MOVSDrm || Opc == X86::VMOVSDrm) && RegSize > 8) + // These instructions only load 64 bits, we can't fold them if the + // destination register is wider than 64 bits (8 bytes). + return true; + + return false; } MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, @@ -4384,8 +4576,11 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, // If loading from a FrameIndex, fold directly from the FrameIndex. unsigned NumOps = LoadMI->getDesc().getNumOperands(); int FrameIndex; - if (isLoadFromStackSlot(LoadMI, FrameIndex)) + if (isLoadFromStackSlot(LoadMI, FrameIndex)) { + if (isPartialRegisterLoad(*LoadMI, MF)) + return nullptr; return foldMemoryOperandImpl(MF, MI, Ops, FrameIndex); + } // Check switch flag if (NoFusing) return nullptr; @@ -4496,19 +4691,7 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, break; } default: { - if ((LoadMI->getOpcode() == X86::MOVSSrm || - LoadMI->getOpcode() == X86::VMOVSSrm) && - MF.getRegInfo().getRegClass(LoadMI->getOperand(0).getReg())->getSize() - > 4) - // These instructions only load 32 bits, we can't fold them if the - // destination register is wider than 32 bits (4 bytes). - return nullptr; - if ((LoadMI->getOpcode() == X86::MOVSDrm || - LoadMI->getOpcode() == X86::VMOVSDrm) && - MF.getRegInfo().getRegClass(LoadMI->getOperand(0).getReg())->getSize() - > 8) - // These instructions only load 64 bits, we can't fold them if the - // destination register is wider than 64 bits (8 bytes). + if (isPartialRegisterLoad(*LoadMI, MF)) return nullptr; // Folding a normal load. Just copy the load's address operands. @@ -4517,7 +4700,8 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, break; } } - return foldMemoryOperandImpl(MF, MI, Ops[0], MOs, 0, Alignment); + return foldMemoryOperandImpl(MF, MI, Ops[0], MOs, + /*Size=*/0, Alignment, /*AllowCommute=*/true); } @@ -5299,16 +5483,32 @@ void X86InstrInfo::getNoopForMachoTarget(MCInst &NopInst) const { NopInst.setOpcode(X86::NOOP); } +// This code must remain in sync with getJumpInstrTableEntryBound in this class! +// In particular, getJumpInstrTableEntryBound must always return an upper bound +// on the encoding lengths of the instructions generated by +// getUnconditionalBranch and getTrap. 
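The isPartialRegisterLoad helper introduced above centralizes a size check: MOVSS/VMOVSS move only 32 bits and MOVSD/VMOVSD only 64, so folding them is refused whenever the destination register class is wider than the loaded value. A sketch of the same predicate over plain byte counts; the enum and sizes are illustrative stand-ins for the opcode and register-class queries:

    #include <cassert>

    enum class LoadKind { MovSS, MovSD, Other };  // stand-in for the opcode check

    // True when the load writes fewer bytes than the destination register
    // holds, i.e. folding it would lose the implicit handling of the upper bits.
    static bool isPartialRegisterLoad(LoadKind K, unsigned DestRegBytes) {
      if (K == LoadKind::MovSS && DestRegBytes > 4)
        return true;   // loads 32 bits into a wider register
      if (K == LoadKind::MovSD && DestRegBytes > 8)
        return true;   // loads 64 bits into a wider register
      return false;
    }

    int main() {
      assert(isPartialRegisterLoad(LoadKind::MovSS, 16));   // XMM-sized destination
      assert(!isPartialRegisterLoad(LoadKind::MovSS, 4));   // FR32 destination
      assert(!isPartialRegisterLoad(LoadKind::MovSD, 8));   // FR64 destination
      return 0;
    }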
void X86InstrInfo::getUnconditionalBranch( MCInst &Branch, const MCSymbolRefExpr *BranchTarget) const { Branch.setOpcode(X86::JMP_4); Branch.addOperand(MCOperand::CreateExpr(BranchTarget)); } +// This code must remain in sync with getJumpInstrTableEntryBound in this class! +// In particular, getJumpInstrTableEntryBound must always return an upper bound +// on the encoding lengths of the instructions generated by +// getUnconditionalBranch and getTrap. void X86InstrInfo::getTrap(MCInst &MI) const { MI.setOpcode(X86::TRAP); } +// See getTrap and getUnconditionalBranch for conditions on the value returned +// by this function. +unsigned X86InstrInfo::getJumpInstrTableEntryBound() const { + // 5 bytes suffice: JMP_4 Symbol@PLT uses 1 byte (E9) for the JMP_4 and 4 + // bytes for the symbol offset. And TRAP is ud2, which is two bytes (0F 0B). + return 5; +} + bool X86InstrInfo::isHighLatencyDef(int opc) const { switch (opc) { default: return false; @@ -5351,10 +5551,10 @@ bool X86InstrInfo::isHighLatencyDef(int opc) const { case X86::VSQRTSSm: case X86::VSQRTSSm_Int: case X86::VSQRTSSr: - case X86::VSQRTPDZrm: - case X86::VSQRTPDZrr: - case X86::VSQRTPSZrm: - case X86::VSQRTPSZrr: + case X86::VSQRTPDZm: + case X86::VSQRTPDZr: + case X86::VSQRTPSZm: + case X86::VSQRTPSZr: case X86::VSQRTSDZm: case X86::VSQRTSDZm_Int: case X86::VSQRTSDZr: @@ -5426,7 +5626,7 @@ namespace { MachineBasicBlock::iterator MBBI = FirstMBB.begin(); DebugLoc DL = FirstMBB.findDebugLoc(MBBI); MachineRegisterInfo &RegInfo = MF.getRegInfo(); - const X86InstrInfo *TII = TM->getInstrInfo(); + const X86InstrInfo *TII = TM->getSubtargetImpl()->getInstrInfo(); unsigned PC; if (TM->getSubtarget<X86Subtarget>().isPICStyleGOT()) @@ -5524,7 +5724,7 @@ namespace { const X86TargetMachine *TM = static_cast<const X86TargetMachine *>(&MF->getTarget()); const bool is64Bit = TM->getSubtarget<X86Subtarget>().is64Bit(); - const X86InstrInfo *TII = TM->getInstrInfo(); + const X86InstrInfo *TII = TM->getSubtargetImpl()->getInstrInfo(); // Insert a Copy from TLSBaseAddrReg to RAX/EAX. MachineInstr *Copy = BuildMI(*I->getParent(), I, I->getDebugLoc(), @@ -5545,7 +5745,7 @@ namespace { const X86TargetMachine *TM = static_cast<const X86TargetMachine *>(&MF->getTarget()); const bool is64Bit = TM->getSubtarget<X86Subtarget>().is64Bit(); - const X86InstrInfo *TII = TM->getInstrInfo(); + const X86InstrInfo *TII = TM->getSubtargetImpl()->getInstrInfo(); // Create a virtual register for the TLS base address.
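The 5-byte bound returned above comes straight from the encodings involved: a rel32 jump is one opcode byte (E9) plus a four-byte displacement, and ud2 is the two-byte sequence 0F 0B, so the larger of the two possible entry bodies is five bytes. A tiny compile-time check of that arithmetic, with illustrative names:

    #include <algorithm>
    #include <cstddef>

    constexpr std::size_t Jmp32Bytes = 1 + 4;  // E9 opcode + rel32 displacement
    constexpr std::size_t TrapBytes  = 2;      // ud2 = 0F 0B

    // The jump-table entry bound must cover the longest instruction emitted.
    constexpr std::size_t JumpTableEntryBound = std::max(Jmp32Bytes, TrapBytes);
    static_assert(JumpTableEntryBound == 5,
                  "matches the value returned by getJumpInstrTableEntryBound");

    int main() { return 0; }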
MachineRegisterInfo &RegInfo = MF->getRegInfo(); diff --git a/lib/Target/X86/X86InstrInfo.h b/lib/Target/X86/X86InstrInfo.h index c177e3a..57b1958 100644 --- a/lib/Target/X86/X86InstrInfo.h +++ b/lib/Target/X86/X86InstrInfo.h @@ -11,8 +11,8 @@ // //===----------------------------------------------------------------------===// -#ifndef X86INSTRUCTIONINFO_H -#define X86INSTRUCTIONINFO_H +#ifndef LLVM_LIB_TARGET_X86_X86INSTRINFO_H +#define LLVM_LIB_TARGET_X86_X86INSTRINFO_H #include "MCTargetDesc/X86BaseInfo.h" #include "X86RegisterInfo.h" @@ -404,7 +404,8 @@ public: MachineInstr* MI, unsigned OpNum, const SmallVectorImpl<MachineOperand> &MOs, - unsigned Size, unsigned Alignment) const; + unsigned Size, unsigned Alignment, + bool AllowCommute) const; void getUnconditionalBranch(MCInst &Branch, @@ -412,6 +413,8 @@ public: void getTrap(MCInst &MI) const override; + unsigned getJumpInstrTableEntryBound() const override; + bool isHighLatencyDef(int opc) const override; bool hasHighOperandLatency(const InstrItineraryData *ItinData, diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td index e7b532c..3dbf819 100644 --- a/lib/Target/X86/X86InstrInfo.td +++ b/lib/Target/X86/X86InstrInfo.td @@ -551,11 +551,6 @@ class ImmSExtAsmOperandClass : AsmOperandClass { let RenderMethod = "addImmOperands"; } -class ImmZExtAsmOperandClass : AsmOperandClass { - let SuperClasses = [ImmAsmOperand]; - let RenderMethod = "addImmOperands"; -} - def X86GR32orGR64AsmOperand : AsmOperandClass { let Name = "GR32orGR64"; } @@ -568,6 +563,7 @@ def AVX512RC : Operand<i32> { let PrintMethod = "printRoundingControl"; let OperandType = "OPERAND_IMMEDIATE"; } + // Sign-extended immediate classes. We don't need to define the full lattice // here because there is no instruction with an ambiguity between ImmSExti64i32 // and ImmSExti32i8. @@ -595,12 +591,6 @@ def ImmSExti32i8AsmOperand : ImmSExtAsmOperandClass { let Name = "ImmSExti32i8"; } -// [0, 0x000000FF] -def ImmZExtu32u8AsmOperand : ImmZExtAsmOperandClass { - let Name = "ImmZExtu32u8"; -} - - // [0, 0x0000007F] | // [0xFFFFFFFFFFFFFF80, 0xFFFFFFFFFFFFFFFF] def ImmSExti64i8AsmOperand : ImmSExtAsmOperandClass { @@ -620,11 +610,6 @@ def i32i8imm : Operand<i32> { let ParserMatchClass = ImmSExti32i8AsmOperand; let OperandType = "OPERAND_IMMEDIATE"; } -// 32-bits but only 8 bits are significant, and those 8 bits are unsigned. -def u32u8imm : Operand<i32> { - let ParserMatchClass = ImmZExtu32u8AsmOperand; - let OperandType = "OPERAND_IMMEDIATE"; -} // 64-bits but only 32 bits are significant. 
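The sign-extended immediate classes kept above are defined by ranges such as [0, 0x7F] together with [0xFFFFFFFFFFFFFF80, 0xFFFFFFFFFFFFFFFF] for ImmSExti64i8; these are exactly the 64-bit values that survive a round trip through a sign-extended 8-bit field. A small sketch of that membership test (the helper is illustrative, not the assembler's own predicate):

    #include <cassert>
    #include <cstdint>

    // A 64-bit immediate fits a sign-extended imm8 field iff it lies in
    // [-128, 127] when viewed as signed. Shifting the window by 0x80 maps
    // that range onto [0, 0xFF]; unsigned wraparound handles the negative half.
    static bool fitsSExti64i8(uint64_t Imm) {
      return Imm + 0x80 <= 0xFF;
    }

    int main() {
      assert(fitsSExti64i8(0x7F));                   // top of the positive range
      assert(fitsSExti64i8(0xFFFFFFFFFFFFFF80ULL));  // bottom of the negative range
      assert(!fitsSExti64i8(0x80));                  // needs zero-extension, rejected
      return 0;
    }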
def i64i32imm : Operand<i64> { @@ -708,6 +693,7 @@ def UseSSE3 : Predicate<"Subtarget->hasSSE3() && !Subtarget->hasAVX()">; def HasSSSE3 : Predicate<"Subtarget->hasSSSE3()">; def UseSSSE3 : Predicate<"Subtarget->hasSSSE3() && !Subtarget->hasAVX()">; def HasSSE41 : Predicate<"Subtarget->hasSSE41()">; +def NoSSE41 : Predicate<"!Subtarget->hasSSE41()">; def UseSSE41 : Predicate<"Subtarget->hasSSE41() && !Subtarget->hasAVX()">; def HasSSE42 : Predicate<"Subtarget->hasSSE42()">; def UseSSE42 : Predicate<"Subtarget->hasSSE42() && !Subtarget->hasAVX()">; @@ -719,10 +705,16 @@ def HasAVX512 : Predicate<"Subtarget->hasAVX512()">, AssemblerPredicate<"FeatureAVX512", "AVX-512 ISA">; def UseAVX : Predicate<"Subtarget->hasAVX() && !Subtarget->hasAVX512()">; def UseAVX2 : Predicate<"Subtarget->hasAVX2() && !Subtarget->hasAVX512()">; -def NoAVX512 : Predicate<"!Subtarget->hasAVX512()">; +def NoAVX512 : Predicate<"!Subtarget->hasAVX512()">; def HasCDI : Predicate<"Subtarget->hasCDI()">; def HasPFI : Predicate<"Subtarget->hasPFI()">; def HasERI : Predicate<"Subtarget->hasERI()">; +def HasDQI : Predicate<"Subtarget->hasDQI()">; +def NoDQI : Predicate<"!Subtarget->hasDQI()">; +def HasBWI : Predicate<"Subtarget->hasBWI()">; +def HasVLX : Predicate<"Subtarget->hasVLX()">, + AssemblerPredicate<"FeatureVLX", "AVX-512 VLX ISA">; +def NoVLX : Predicate<"!Subtarget->hasVLX()">; def HasPOPCNT : Predicate<"Subtarget->hasPOPCNT()">; def HasAES : Predicate<"Subtarget->hasAES()">; @@ -744,8 +736,10 @@ def HasHLE : Predicate<"Subtarget->hasHLE()">; def HasTSX : Predicate<"Subtarget->hasRTM() || Subtarget->hasHLE()">; def HasADX : Predicate<"Subtarget->hasADX()">; def HasSHA : Predicate<"Subtarget->hasSHA()">; +def HasSGX : Predicate<"Subtarget->hasSGX()">; def HasPRFCHW : Predicate<"Subtarget->hasPRFCHW()">; def HasRDSEED : Predicate<"Subtarget->hasRDSEED()">; +def HasSMAP : Predicate<"Subtarget->hasSMAP()">; def HasPrefetchW : Predicate<"Subtarget->hasPRFCHW()">; def FPStackf32 : Predicate<"!Subtarget->hasSSE1()">; def FPStackf64 : Predicate<"!Subtarget->hasSSE2()">; @@ -754,6 +748,8 @@ def Not64BitMode : Predicate<"!Subtarget->is64Bit()">, AssemblerPredicate<"!Mode64Bit", "Not 64-bit mode">; def In64BitMode : Predicate<"Subtarget->is64Bit()">, AssemblerPredicate<"Mode64Bit", "64-bit mode">; +def IsLP64 : Predicate<"Subtarget->isTarget64BitLP64()">; +def NotLP64 : Predicate<"!Subtarget->isTarget64BitLP64()">; def In16BitMode : Predicate<"Subtarget->is16Bit()">, AssemblerPredicate<"Mode16Bit", "16-bit mode">; def Not16BitMode : Predicate<"!Subtarget->is16Bit()">, @@ -2396,6 +2392,7 @@ include "X86InstrVMX.td" include "X86InstrSVM.td" include "X86InstrTSX.td" +include "X86InstrSGX.td" // System instructions. 
include "X86InstrSystem.td" @@ -2514,7 +2511,7 @@ def : MnemonicAlias<"fldcww", "fldcw", "att">; def : MnemonicAlias<"fnstcww", "fnstcw", "att">; def : MnemonicAlias<"fnstsww", "fnstsw", "att">; def : MnemonicAlias<"fucomip", "fucompi", "att">; -def : MnemonicAlias<"fwait", "wait", "att">; +def : MnemonicAlias<"fwait", "wait">; class CondCodeAlias<string Prefix,string Suffix, string OldCond, string NewCond, diff --git a/lib/Target/X86/X86InstrMMX.td b/lib/Target/X86/X86InstrMMX.td index ecf80a1..9001fba 100644 --- a/lib/Target/X86/X86InstrMMX.td +++ b/lib/Target/X86/X86InstrMMX.td @@ -38,12 +38,17 @@ def MMX_PHADDSUBD : OpndItins< >; } +let Sched = WriteVecLogic in +def MMX_INTALU_ITINS_VECLOGICSCHED : OpndItins< + IIC_MMX_ALU_RR, IIC_MMX_ALU_RM +>; + let Sched = WriteVecIMul in def MMX_PMUL_ITINS : OpndItins< IIC_MMX_PMUL, IIC_MMX_PMUL >; -let Sched = WriteVecALU in { +let Sched = WriteVecIMul in { def MMX_PSADBW_ITINS : OpndItins< IIC_MMX_PSADBW, IIC_MMX_PSADBW >; @@ -167,12 +172,14 @@ multiclass ssse3_palign_mm<string asm, Intrinsic IntId> { def R64irr : MMXSS3AI<0x0F, MRMSrcReg, (outs VR64:$dst), (ins VR64:$src1, VR64:$src2, i8imm:$src3), !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), - [(set VR64:$dst, (IntId VR64:$src1, VR64:$src2, (i8 imm:$src3)))]>; + [(set VR64:$dst, (IntId VR64:$src1, VR64:$src2, (i8 imm:$src3)))]>, + Sched<[WriteShuffle]>; def R64irm : MMXSS3AI<0x0F, MRMSrcMem, (outs VR64:$dst), (ins VR64:$src1, i64mem:$src2, i8imm:$src3), !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), [(set VR64:$dst, (IntId VR64:$src1, - (bitconvert (load_mmx addr:$src2)), (i8 imm:$src3)))]>; + (bitconvert (load_mmx addr:$src2)), (i8 imm:$src3)))]>, + Sched<[WriteShuffleLd, ReadAfterLd]>; } multiclass sse12_cvt_pint<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC, @@ -192,11 +199,11 @@ multiclass sse12_cvt_pint_3addr<bits<8> opc, RegisterClass SrcRC, def irr : MMXPI<opc, MRMSrcReg, (outs DstRC:$dst), (ins DstRC:$src1, SrcRC:$src2), asm, [(set DstRC:$dst, (Int DstRC:$src1, SrcRC:$src2))], - NoItinerary, d>; + NoItinerary, d>, Sched<[WriteCvtI2F]>; def irm : MMXPI<opc, MRMSrcMem, (outs DstRC:$dst), (ins DstRC:$src1, x86memop:$src2), asm, [(set DstRC:$dst, (Int DstRC:$src1, (ld_frag addr:$src2)))], - NoItinerary, d>; + NoItinerary, d>, Sched<[WriteCvtI2FLd]>; } //===----------------------------------------------------------------------===// @@ -427,13 +434,13 @@ let Constraints = "$src1 = $dst" in // Logical Instructions defm MMX_PAND : MMXI_binop_rm_int<0xDB, "pand", int_x86_mmx_pand, - MMX_INTALU_ITINS, 1>; + MMX_INTALU_ITINS_VECLOGICSCHED, 1>; defm MMX_POR : MMXI_binop_rm_int<0xEB, "por" , int_x86_mmx_por, - MMX_INTALU_ITINS, 1>; + MMX_INTALU_ITINS_VECLOGICSCHED, 1>; defm MMX_PXOR : MMXI_binop_rm_int<0xEF, "pxor", int_x86_mmx_pxor, - MMX_INTALU_ITINS, 1>; + MMX_INTALU_ITINS_VECLOGICSCHED, 1>; defm MMX_PANDN : MMXI_binop_rm_int<0xDF, "pandn", int_x86_mmx_pandn, - MMX_INTALU_ITINS>; + MMX_INTALU_ITINS_VECLOGICSCHED>; // Shift Instructions defm MMX_PSRLW : MMXI_binop_rmi_int<0xD1, 0x71, MRM2r, "psrlw", diff --git a/lib/Target/X86/X86InstrSGX.td b/lib/Target/X86/X86InstrSGX.td new file mode 100644 index 0000000..47c5dc5 --- /dev/null +++ b/lib/Target/X86/X86InstrSGX.td @@ -0,0 +1,24 @@ +//===-- X86InstrSGX.td - SGX Instruction Set Extension -----*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +// This file describes the instructions that make up the Intel SGX instruction +// set. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// SGX instructions + +// ENCLS - Execute an Enclave System Function of Specified Leaf Number +def ENCLS : I<0x01, MRM_CF, (outs), (ins), + "encls", []>, TB, Requires<[HasSGX]>; + +// ENCLU - Execute an Enclave User Function of Specified Leaf Number +def ENCLU : I<0x01, MRM_D7, (outs), (ins), + "enclu", []>, TB, Requires<[HasSGX]>; diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td index f9a5ae1..cc896f0 100644 --- a/lib/Target/X86/X86InstrSSE.td +++ b/lib/Target/X86/X86InstrSSE.td @@ -181,6 +181,7 @@ def SSE_MPSADBW_ITINS : OpndItins< IIC_SSE_MPSADBW_RR, IIC_SSE_MPSADBW_RM >; +let Sched = WriteVecIMul in def SSE_PMULLD_ITINS : OpndItins< IIC_SSE_PMULLD_RR, IIC_SSE_PMULLD_RM >; @@ -218,11 +219,21 @@ def DEFAULT_ITINS_BLENDSCHED : OpndItins< IIC_ALU_NONMEM, IIC_ALU_MEM >; +let Sched = WriteVarBlend in +def DEFAULT_ITINS_VARBLENDSCHED : OpndItins< + IIC_ALU_NONMEM, IIC_ALU_MEM +>; + let Sched = WriteFBlend in def SSE_INTALU_ITINS_FBLEND_P : OpndItins< IIC_SSE_INTALU_P_RR, IIC_SSE_INTALU_P_RM >; +let Sched = WriteBlend in +def SSE_INTALU_ITINS_BLEND_P : OpndItins< + IIC_SSE_INTALU_P_RR, IIC_SSE_INTALU_P_RM +>; + //===----------------------------------------------------------------------===// // SSE 1 & 2 Instructions Classes //===----------------------------------------------------------------------===// @@ -601,29 +612,6 @@ let canFoldAsLoad = 1, isReMaterializable = 1 in { // Patterns let Predicates = [UseAVX] in { - let AddedComplexity = 15 in { - // Move scalar to XMM zero-extended, zeroing a VR128 then do a - // MOVS{S,D} to the lower bits. - def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector FR32:$src)))), - (VMOVSSrr (v4f32 (V_SET0)), FR32:$src)>; - def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))), - (VMOVSSrr (v4f32 (V_SET0)), (COPY_TO_REGCLASS VR128:$src, FR32))>; - def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))), - (VMOVSSrr (v4i32 (V_SET0)), (COPY_TO_REGCLASS VR128:$src, FR32))>; - def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector FR64:$src)))), - (VMOVSDrr (v2f64 (V_SET0)), FR64:$src)>; - - // Move low f32 and clear high bits. - def : Pat<(v8f32 (X86vzmovl (v8f32 VR256:$src))), - (SUBREG_TO_REG (i32 0), - (VMOVSSrr (v4f32 (V_SET0)), - (EXTRACT_SUBREG (v8f32 VR256:$src), sub_xmm)), sub_xmm)>; - def : Pat<(v8i32 (X86vzmovl (v8i32 VR256:$src))), - (SUBREG_TO_REG (i32 0), - (VMOVSSrr (v4i32 (V_SET0)), - (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm)), sub_xmm)>; - } - let AddedComplexity = 20 in { // MOVSSrm zeros the high parts of the register; represent this // with SUBREG_TO_REG. 
The AVX versions also write: DST[255:128] <- 0 @@ -659,31 +647,10 @@ let Predicates = [UseAVX] in { (v2f64 (scalar_to_vector (loadf64 addr:$src))), (iPTR 0)))), (SUBREG_TO_REG (i32 0), (VMOVSDrm addr:$src), sub_xmm)>; } - def : Pat<(v8f32 (X86vzmovl (insert_subvector undef, - (v4f32 (scalar_to_vector FR32:$src)), (iPTR 0)))), - (SUBREG_TO_REG (i32 0), - (v4f32 (VMOVSSrr (v4f32 (V_SET0)), FR32:$src)), - sub_xmm)>; - def : Pat<(v4f64 (X86vzmovl (insert_subvector undef, - (v2f64 (scalar_to_vector FR64:$src)), (iPTR 0)))), - (SUBREG_TO_REG (i64 0), - (v2f64 (VMOVSDrr (v2f64 (V_SET0)), FR64:$src)), - sub_xmm)>; def : Pat<(v4i64 (X86vzmovl (insert_subvector undef, (v2i64 (scalar_to_vector (loadi64 addr:$src))), (iPTR 0)))), (SUBREG_TO_REG (i64 0), (VMOVSDrm addr:$src), sub_xmm)>; - // Move low f64 and clear high bits. - def : Pat<(v4f64 (X86vzmovl (v4f64 VR256:$src))), - (SUBREG_TO_REG (i32 0), - (VMOVSDrr (v2f64 (V_SET0)), - (EXTRACT_SUBREG (v4f64 VR256:$src), sub_xmm)), sub_xmm)>; - - def : Pat<(v4i64 (X86vzmovl (v4i64 VR256:$src))), - (SUBREG_TO_REG (i32 0), - (VMOVSDrr (v2i64 (V_SET0)), - (EXTRACT_SUBREG (v4i64 VR256:$src), sub_xmm)), sub_xmm)>; - // Extract and store. def : Pat<(store (f32 (vector_extract (v4f32 VR128:$src), (iPTR 0))), addr:$dst), @@ -734,7 +701,6 @@ let Predicates = [UseAVX] in { (EXTRACT_SUBREG (v4f64 VR256:$src2), sub_xmm)), sub_xmm)>; - // FIXME: Instead of a X86Movlps there should be a X86Movsd here, the problem // is during lowering, where it's not possible to recognize the fold cause // it has two uses through a bitcast. One use disappears at isel time and the @@ -750,7 +716,7 @@ let Predicates = [UseAVX] in { } let Predicates = [UseSSE1] in { - let AddedComplexity = 15 in { + let Predicates = [NoSSE41], AddedComplexity = 15 in { // Move scalar to XMM zero-extended, zeroing a VR128 then do a // MOVSS to the lower bits. def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector FR32:$src)))), @@ -784,7 +750,7 @@ let Predicates = [UseSSE1] in { } let Predicates = [UseSSE2] in { - let AddedComplexity = 15 in { + let Predicates = [NoSSE41], AddedComplexity = 15 in { // Move scalar to XMM zero-extended, zeroing a VR128 then do a // MOVSD to the lower bits. 
def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector FR64:$src)))), @@ -854,6 +820,7 @@ let canFoldAsLoad = 1, isReMaterializable = IsReMaterializable in Sched<[WriteLoad]>; } +let Predicates = [HasAVX, NoVLX] in { defm VMOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32, "movaps", SSEPackedSingle, SSE_MOVA_ITINS>, PS, VEX; @@ -879,20 +846,26 @@ defm VMOVUPSY : sse12_mov_packed<0x10, VR256, f256mem, loadv8f32, defm VMOVUPDY : sse12_mov_packed<0x10, VR256, f256mem, loadv4f64, "movupd", SSEPackedDouble, SSE_MOVU_ITINS, 0>, PD, VEX, VEX_L; +} + +let Predicates = [UseSSE1] in { defm MOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32, "movaps", SSEPackedSingle, SSE_MOVA_ITINS>, PS; -defm MOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64, - "movapd", SSEPackedDouble, SSE_MOVA_ITINS>, - PD; defm MOVUPS : sse12_mov_packed<0x10, VR128, f128mem, loadv4f32, "movups", SSEPackedSingle, SSE_MOVU_ITINS>, PS; +} +let Predicates = [UseSSE2] in { +defm MOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64, + "movapd", SSEPackedDouble, SSE_MOVA_ITINS>, + PD; defm MOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64, "movupd", SSEPackedDouble, SSE_MOVU_ITINS, 0>, PD; +} -let SchedRW = [WriteStore] in { +let SchedRW = [WriteStore], Predicates = [HasAVX, NoVLX] in { def VMOVAPSmr : VPSI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), "movaps\t{$src, $dst|$dst, $src}", [(alignedstore (v4f32 VR128:$src), addr:$dst)], @@ -1006,7 +979,7 @@ def MOVUPDmr : PDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), // For disassembler let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, - SchedRW = [WriteMove] in { + SchedRW = [WriteFShuffle] in { def MOVAPSrr_REV : PSI<0x29, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), "movaps\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RR>; @@ -1036,7 +1009,7 @@ let Predicates = [UseSSE2] in (MOVUPDmr addr:$dst, VR128:$src)>; // Use vmovaps/vmovups for AVX integer load/store. -let Predicates = [HasAVX] in { +let Predicates = [HasAVX, NoVLX] in { // 128-bit load/store def : Pat<(alignedloadv2i64 addr:$src), (VMOVAPSrm addr:$src)>; @@ -1251,6 +1224,9 @@ let Predicates = [HasAVX] in { (VMOVLPDrm VR128:$src1, addr:$src2)>; def : Pat<(v2i64 (X86Movlpd VR128:$src1, (load addr:$src2))), (VMOVLPDrm VR128:$src1, addr:$src2)>; + def : Pat<(v2f64 (X86Movsd VR128:$src1, + (v2f64 (scalar_to_vector (loadf64 addr:$src2))))), + (VMOVLPDrm VR128:$src1, addr:$src2)>; // Store patterns def : Pat<(store (v4f32 (X86Movlps (load addr:$src1), VR128:$src2)), @@ -1298,6 +1274,9 @@ let Predicates = [UseSSE2] in { (MOVLPDrm VR128:$src1, addr:$src2)>; def : Pat<(v2i64 (X86Movlpd VR128:$src1, (load addr:$src2))), (MOVLPDrm VR128:$src1, addr:$src2)>; + def : Pat<(v2f64 (X86Movsd VR128:$src1, + (v2f64 (scalar_to_vector (loadf64 addr:$src2))))), + (MOVLPDrm VR128:$src1, addr:$src2)>; // Store patterns def : Pat<(store (v2f64 (X86Movlpd (load addr:$src1), VR128:$src2)), @@ -1360,6 +1339,11 @@ let Predicates = [HasAVX] in { def : Pat<(v2f64 (X86Unpckl VR128:$src1, (scalar_to_vector (loadf64 addr:$src2)))), (VMOVHPDrm VR128:$src1, addr:$src2)>; + // Also handle an i64 load because that may get selected as a faster way to + // load the data. 
+ def : Pat<(v2f64 (X86Unpckl VR128:$src1, + (bc_v2f64 (v2i64 (scalar_to_vector (loadi64 addr:$src2)))))), + (VMOVHPDrm VR128:$src1, addr:$src2)>; } let Predicates = [UseSSE1] in { @@ -1380,6 +1364,11 @@ let Predicates = [UseSSE2] in { def : Pat<(v2f64 (X86Unpckl VR128:$src1, (scalar_to_vector (loadf64 addr:$src2)))), (MOVHPDrm VR128:$src1, addr:$src2)>; + // Also handle an i64 load because that may get selected as a faster way to + // load the data. + def : Pat<(v2f64 (X86Unpckl VR128:$src1, + (bc_v2f64 (v2i64 (scalar_to_vector (loadi64 addr:$src2)))))), + (MOVHPDrm VR128:$src1, addr:$src2)>; } //===----------------------------------------------------------------------===// @@ -2577,18 +2566,17 @@ def : Pat<(v2i64 (X86cmpp (v2f64 VR128:$src1), (memop addr:$src2), imm:$cc)), /// sse12_shuffle - sse 1 & 2 fp shuffle instructions multiclass sse12_shuffle<RegisterClass RC, X86MemOperand x86memop, ValueType vt, string asm, PatFrag mem_frag, - Domain d, bit IsConvertibleToThreeAddress = 0> { + Domain d> { def rmi : PIi8<0xC6, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2, i8imm:$src3), asm, [(set RC:$dst, (vt (X86Shufp RC:$src1, (mem_frag addr:$src2), (i8 imm:$src3))))], IIC_SSE_SHUFP, d>, Sched<[WriteFShuffleLd, ReadAfterLd]>; - let isConvertibleToThreeAddress = IsConvertibleToThreeAddress in - def rri : PIi8<0xC6, MRMSrcReg, (outs RC:$dst), - (ins RC:$src1, RC:$src2, i8imm:$src3), asm, - [(set RC:$dst, (vt (X86Shufp RC:$src1, RC:$src2, - (i8 imm:$src3))))], IIC_SSE_SHUFP, d>, - Sched<[WriteFShuffle]>; + def rri : PIi8<0xC6, MRMSrcReg, (outs RC:$dst), + (ins RC:$src1, RC:$src2, i8imm:$src3), asm, + [(set RC:$dst, (vt (X86Shufp RC:$src1, RC:$src2, + (i8 imm:$src3))))], IIC_SSE_SHUFP, d>, + Sched<[WriteFShuffle]>; } defm VSHUFPS : sse12_shuffle<VR128, f128mem, v4f32, @@ -2607,10 +2595,10 @@ defm VSHUFPDY : sse12_shuffle<VR256, f256mem, v4f64, let Constraints = "$src1 = $dst" in { defm SHUFPS : sse12_shuffle<VR128, f128mem, v4f32, "shufps\t{$src3, $src2, $dst|$dst, $src2, $src3}", - memopv4f32, SSEPackedSingle, 1 /* cvt to pshufd */>, PS; + memopv4f32, SSEPackedSingle>, PS; defm SHUFPD : sse12_shuffle<VR128, f128mem, v2f64, "shufpd\t{$src3, $src2, $dst|$dst, $src2, $src3}", - memopv2f64, SSEPackedDouble, 1 /* cvt to pshufd */>, PD; + memopv2f64, SSEPackedDouble>, PD; } let Predicates = [HasAVX] in { @@ -3136,7 +3124,6 @@ let Predicates = [UseSSE1] in { let Predicates = [UseSSE2] in { // SSE2 patterns to select scalar double-precision fp arithmetic instructions - def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fadd (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), FR64:$src))))), @@ -3156,10 +3143,10 @@ let Predicates = [UseSSE2] in { } let Predicates = [UseSSE41] in { - // If the subtarget has SSE4.1 but not AVX, the vector insert - // instruction is lowered into a X86insertps rather than a X86Movss. - // When selecting SSE scalar single-precision fp arithmetic instructions, - // make sure that we correctly match the X86insertps. + // If the subtarget has SSE4.1 but not AVX, the vector insert instruction is + // lowered into a X86insertps or a X86Blendi rather than a X86Movss. When + // selecting SSE scalar single-precision fp arithmetic instructions, make + // sure that we correctly match them. 
def : Pat<(v4f32 (X86insertps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fadd (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), @@ -3177,6 +3164,57 @@ let Predicates = [UseSSE41] in { (fdiv (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), FR32:$src))), (iPTR 0))), (DIVSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; + + def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fadd + (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), + FR32:$src))), (i8 1))), + (ADDSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; + def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fsub + (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), + FR32:$src))), (i8 1))), + (SUBSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; + def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fmul + (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), + FR32:$src))), (i8 1))), + (MULSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; + def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fdiv + (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), + FR32:$src))), (i8 1))), + (DIVSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; + + def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fadd + (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), + FR64:$src))), (i8 1))), + (ADDSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; + def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fsub + (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), + FR64:$src))), (i8 1))), + (SUBSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; + def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fmul + (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), + FR64:$src))), (i8 1))), + (MULSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; + def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fdiv + (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), + FR64:$src))), (i8 1))), + (DIVSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; + + def : Pat<(v2f64 (X86Blendi (v2f64 (scalar_to_vector (fadd + (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), + FR64:$src))), (v2f64 VR128:$dst), (i8 2))), + (ADDSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; + def : Pat<(v2f64 (X86Blendi (v2f64 (scalar_to_vector (fsub + (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), + FR64:$src))), (v2f64 VR128:$dst), (i8 2))), + (SUBSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; + def : Pat<(v2f64 (X86Blendi (v2f64 (scalar_to_vector (fmul + (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), + FR64:$src))), (v2f64 VR128:$dst), (i8 2))), + (MULSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; + def : Pat<(v2f64 (X86Blendi (v2f64 (scalar_to_vector (fdiv + (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), + FR64:$src))), (v2f64 VR128:$dst), (i8 2))), + (DIVSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; } let Predicates = [HasAVX] in { @@ -3215,6 +3253,57 @@ let Predicates = [HasAVX] in { (fdiv (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), FR32:$src))), (iPTR 0))), (VDIVSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; + + def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fadd + (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), + FR32:$src))), (i8 1))), + (VADDSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; + def 
: Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fsub + (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), + FR32:$src))), (i8 1))), + (VSUBSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; + def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fmul + (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), + FR32:$src))), (i8 1))), + (VMULSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; + def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fdiv + (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), + FR32:$src))), (i8 1))), + (VDIVSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; + + def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fadd + (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), + FR64:$src))), (i8 1))), + (VADDSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; + def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fsub + (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), + FR64:$src))), (i8 1))), + (VSUBSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; + def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fmul + (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), + FR64:$src))), (i8 1))), + (VMULSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; + def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fdiv + (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), + FR64:$src))), (i8 1))), + (VDIVSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; + + def : Pat<(v2f64 (X86Blendi (v2f64 (scalar_to_vector (fadd + (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), + FR64:$src))), (v2f64 VR128:$dst), (i8 2))), + (VADDSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; + def : Pat<(v2f64 (X86Blendi (v2f64 (scalar_to_vector (fsub + (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), + FR64:$src))), (v2f64 VR128:$dst), (i8 2))), + (VSUBSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; + def : Pat<(v2f64 (X86Blendi (v2f64 (scalar_to_vector (fmul + (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), + FR64:$src))), (v2f64 VR128:$dst), (i8 2))), + (VMULSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; + def : Pat<(v2f64 (X86Blendi (v2f64 (scalar_to_vector (fdiv + (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), + FR64:$src))), (v2f64 VR128:$dst), (i8 2))), + (VDIVSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; } // Patterns used to select SSE scalar fp arithmetic instructions from @@ -3269,6 +3358,49 @@ let Predicates = [UseSSE2] in { (DIVSDrr_Int v2f64:$dst, v2f64:$src)>; } +let Predicates = [UseSSE41] in { + // With SSE4.1 we may see these operations using X86Blendi rather than + // X86Movs{s,d}. 
+ def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), + (fadd (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))), + (ADDSSrr_Int v4f32:$dst, v4f32:$src)>; + def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), + (fsub (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))), + (SUBSSrr_Int v4f32:$dst, v4f32:$src)>; + def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), + (fmul (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))), + (MULSSrr_Int v4f32:$dst, v4f32:$src)>; + def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), + (fdiv (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))), + (DIVSSrr_Int v4f32:$dst, v4f32:$src)>; + + def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), + (fadd (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))), + (ADDSDrr_Int v2f64:$dst, v2f64:$src)>; + def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), + (fsub (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))), + (SUBSDrr_Int v2f64:$dst, v2f64:$src)>; + def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), + (fmul (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))), + (MULSDrr_Int v2f64:$dst, v2f64:$src)>; + def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), + (fdiv (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))), + (DIVSDrr_Int v2f64:$dst, v2f64:$src)>; + + def : Pat<(v2f64 (X86Blendi (fadd (v2f64 VR128:$dst), (v2f64 VR128:$src)), + (v2f64 VR128:$dst), (i8 2))), + (ADDSDrr_Int v2f64:$dst, v2f64:$src)>; + def : Pat<(v2f64 (X86Blendi (fsub (v2f64 VR128:$dst), (v2f64 VR128:$src)), + (v2f64 VR128:$dst), (i8 2))), + (SUBSDrr_Int v2f64:$dst, v2f64:$src)>; + def : Pat<(v2f64 (X86Blendi (fmul (v2f64 VR128:$dst), (v2f64 VR128:$src)), + (v2f64 VR128:$dst), (i8 2))), + (MULSDrr_Int v2f64:$dst, v2f64:$src)>; + def : Pat<(v2f64 (X86Blendi (fdiv (v2f64 VR128:$dst), (v2f64 VR128:$src)), + (v2f64 VR128:$dst), (i8 2))), + (DIVSDrr_Int v2f64:$dst, v2f64:$src)>; +} + let Predicates = [HasAVX] in { // The following patterns select AVX Scalar single/double precision fp // arithmetic instructions from a packed single precision fp instruction @@ -3298,6 +3430,46 @@ let Predicates = [HasAVX] in { def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (fdiv (v2f64 VR128:$dst), (v2f64 VR128:$src)))), (VDIVSDrr_Int v2f64:$dst, v2f64:$src)>; + + // Also handle X86Blendi-based patterns. 
+ def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), + (fadd (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))), + (VADDSSrr_Int v4f32:$dst, v4f32:$src)>; + def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), + (fsub (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))), + (VSUBSSrr_Int v4f32:$dst, v4f32:$src)>; + def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), + (fmul (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))), + (VMULSSrr_Int v4f32:$dst, v4f32:$src)>; + def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), + (fdiv (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))), + (VDIVSSrr_Int v4f32:$dst, v4f32:$src)>; + + def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), + (fadd (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))), + (VADDSDrr_Int v2f64:$dst, v2f64:$src)>; + def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), + (fsub (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))), + (VSUBSDrr_Int v2f64:$dst, v2f64:$src)>; + def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), + (fmul (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))), + (VMULSDrr_Int v2f64:$dst, v2f64:$src)>; + def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), + (fdiv (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))), + (VDIVSDrr_Int v2f64:$dst, v2f64:$src)>; + + def : Pat<(v2f64 (X86Blendi (fadd (v2f64 VR128:$dst), (v2f64 VR128:$src)), + (v2f64 VR128:$dst), (i8 2))), + (VADDSDrr_Int v2f64:$dst, v2f64:$src)>; + def : Pat<(v2f64 (X86Blendi (fsub (v2f64 VR128:$dst), (v2f64 VR128:$src)), + (v2f64 VR128:$dst), (i8 2))), + (VSUBSDrr_Int v2f64:$dst, v2f64:$src)>; + def : Pat<(v2f64 (X86Blendi (fmul (v2f64 VR128:$dst), (v2f64 VR128:$src)), + (v2f64 VR128:$dst), (i8 2))), + (VMULSDrr_Int v2f64:$dst, v2f64:$src)>; + def : Pat<(v2f64 (X86Blendi (fdiv (v2f64 VR128:$dst), (v2f64 VR128:$src)), + (v2f64 VR128:$dst), (i8 2))), + (VDIVSDrr_Int v2f64:$dst, v2f64:$src)>; } /// Unop Arithmetic @@ -3326,6 +3498,16 @@ def SSE_SQRTSD : OpndItins< >; } +let Sched = WriteFRsqrt in { +def SSE_RSQRTPS : OpndItins< + IIC_SSE_RSQRTPS_RR, IIC_SSE_RSQRTPS_RM +>; + +def SSE_RSQRTSS : OpndItins< + IIC_SSE_RSQRTSS_RR, IIC_SSE_RSQRTSS_RM +>; +} + let Sched = WriteFRcp in { def SSE_RCPP : OpndItins< IIC_SSE_RCPP_RR, IIC_SSE_RCPP_RM @@ -3604,10 +3786,10 @@ defm SQRT : sse1_fp_unop_s<0x51, "sqrt", fsqrt, int_x86_sse_sqrt_ss, // Reciprocal approximations. Note that these typically require refinement // in order to obtain suitable precision. 
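The two immediates used in the v2f64 X86Blendi patterns above are interchangeable once the operands are swapped: for a two-lane blend, bit i of the immediate picks lane i from the second operand, so blending (dst, result, 1) and (result, dst, 2) both keep the computed value in lane 0 and dst's value in lane 1, which is exactly the ADDSD/SUBSD/MULSD/DIVSD behaviour being matched. A small model of that equivalence, using plain arrays rather than the ISel node types:

    #include <cassert>

    // blendpd-style select over two f64 lanes: immediate bit i == 1 takes
    // lane i from the second operand, otherwise from the first.
    static void blend2(const double A[2], const double B[2], unsigned Imm,
                       double Out[2]) {
      for (unsigned i = 0; i != 2; ++i)
        Out[i] = (Imm & (1u << i)) ? B[i] : A[i];
    }

    int main() {
      const double Dst[2] = {1.0, 2.0};
      const double Sum[2] = {1.0 + 3.0, 0.0};  // scalar_to_vector of the fadd result
      double R1[2], R2[2];

      blend2(Dst, Sum, /*Imm=*/1, R1);  // pattern form (dst, result, 1)
      blend2(Sum, Dst, /*Imm=*/2, R2);  // pattern form (result, dst, 2)

      // Both forms behave like ADDSD: lane 0 is the sum, lane 1 is dst's lane 1.
      assert(R1[0] == 4.0 && R1[1] == 2.0);
      assert(R2[0] == 4.0 && R2[1] == 2.0);
      return 0;
    }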
-defm RSQRT : sse1_fp_unop_rw<0x52, "rsqrt", X86frsqrt, SSE_SQRTSS>, - sse1_fp_unop_p<0x52, "rsqrt", X86frsqrt, SSE_SQRTPS>, +defm RSQRT : sse1_fp_unop_rw<0x52, "rsqrt", X86frsqrt, SSE_RSQRTSS>, + sse1_fp_unop_p<0x52, "rsqrt", X86frsqrt, SSE_RSQRTPS>, sse1_fp_unop_p_int<0x52, "rsqrt", int_x86_sse_rsqrt_ps, - int_x86_avx_rsqrt_ps_256, SSE_SQRTPS>; + int_x86_avx_rsqrt_ps_256, SSE_RSQRTPS>; defm RCP : sse1_fp_unop_rw<0x53, "rcp", X86frcp, SSE_RCPS>, sse1_fp_unop_p<0x53, "rcp", X86frcp, SSE_RCPP>, sse1_fp_unop_p_int<0x53, "rcp", int_x86_sse_rcp_ps, @@ -3686,6 +3868,7 @@ let Predicates = [UseSSE1] in { let AddedComplexity = 400 in { // Prefer non-temporal versions let SchedRW = [WriteStore] in { +let Predicates = [HasAVX, NoVLX] in { def VMOVNTPSmr : VPSI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), "movntps\t{$src, $dst|$dst, $src}", @@ -3726,6 +3909,7 @@ def VMOVNTDQYmr : VPDI<0xE7, MRMDestMem, (outs), [(alignednontemporalstore (v4i64 VR256:$src), addr:$dst)], IIC_SSE_MOVNT>, VEX, VEX_L; +} def MOVNTPSmr : PSI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), "movntps\t{$src, $dst|$dst, $src}", @@ -3755,6 +3939,14 @@ def MOVNTI_64mr : RI<0xC3, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src), PS, Requires<[HasSSE2]>; } // SchedRW = [WriteStore] +let Predicates = [HasAVX, NoVLX] in { + def : Pat<(alignednontemporalstore (v4i32 VR128:$src), addr:$dst), + (VMOVNTPSmr addr:$dst, VR128:$src)>; +} + +def : Pat<(alignednontemporalstore (v4i32 VR128:$src), addr:$dst), + (MOVNTPSmr addr:$dst, VR128:$src)>; + } // AddedComplexity //===----------------------------------------------------------------------===// @@ -5277,6 +5469,13 @@ let Predicates = [HasAVX] in { (VMOVDDUPYrr VR256:$src)>; } +let Predicates = [UseAVX, OptForSize] in { + def : Pat<(v2f64 (X86VBroadcast (loadf64 addr:$src))), + (VMOVDDUPrm addr:$src)>; + def : Pat<(v2i64 (X86VBroadcast (loadi64 addr:$src))), + (VMOVDDUPrm addr:$src)>; +} + let Predicates = [UseSSE3] in { def : Pat<(X86Movddup (memopv2f64 addr:$src)), (MOVDDUPrm addr:$src)>; @@ -5357,56 +5556,34 @@ let Constraints = "$src1 = $dst", Predicates = [UseSSE3] in { // Patterns used to select 'addsub' instructions. let Predicates = [HasAVX] in { - // Constant 170 corresponds to the binary mask '10101010'. - // When used as a blend mask, it allows selecting eight elements from two - // input vectors as follow: - // - Even-numbered values in the destination are copied from - // the corresponding elements in the first input vector; - // - Odd-numbered values in the destination are copied from - // the corresponding elements in the second input vector. - - def : Pat<(v8f32 (X86Blendi (v8f32 (fsub VR256:$lhs, VR256:$rhs)), - (v8f32 (fadd VR256:$lhs, VR256:$rhs)), (i32 170))), - (VADDSUBPSYrr VR256:$lhs, VR256:$rhs)>; - - // Constant 10 corresponds to the binary mask '1010'. - // In the two pattens below, constant 10 is used as a blend mask to select - // - the 1st and 3rd element from the first input vector (the 'fsub' node); - // - the 2nd and 4th element from the second input vector (the 'fadd' node). 
- - def : Pat<(v4f64 (X86Blendi (v4f64 (fsub VR256:$lhs, VR256:$rhs)), - (v4f64 (fadd VR256:$lhs, VR256:$rhs)), (i32 10))), - (VADDSUBPDYrr VR256:$lhs, VR256:$rhs)>; - def : Pat<(v4f64 (X86Blendi (v4f64 (fsub VR256:$lhs, VR256:$rhs)), - (v4f64 (fadd VR256:$lhs, VR256:$rhs)), (i32 10))), - (VADDSUBPDYrr VR256:$lhs, VR256:$rhs)>; - def : Pat<(v4f32 (X86Blendi (v4f32 (fsub VR128:$lhs, VR128:$rhs)), - (v4f32 (fadd VR128:$lhs, VR128:$rhs)), (i32 10))), + def : Pat<(v4f32 (X86Addsub (v4f32 VR128:$lhs), (v4f32 VR128:$rhs))), (VADDSUBPSrr VR128:$lhs, VR128:$rhs)>; - def : Pat<(v2f64 (X86Blendi (v2f64 (fsub VR128:$lhs, VR128:$rhs)), - (v2f64 (fadd VR128:$lhs, VR128:$rhs)), (i32 2))), - (VADDSUBPDrr VR128:$lhs, VR128:$rhs)>; - def : Pat<(v2f64 (X86Movsd (v2f64 (fadd VR128:$lhs, VR128:$rhs)), - (v2f64 (fsub VR128:$lhs, VR128:$rhs)))), + def : Pat<(v4f32 (X86Addsub (v4f32 VR128:$lhs), (v4f32 (memop addr:$rhs)))), + (VADDSUBPSrm VR128:$lhs, f128mem:$rhs)>; + def : Pat<(v2f64 (X86Addsub (v2f64 VR128:$lhs), (v2f64 VR128:$rhs))), (VADDSUBPDrr VR128:$lhs, VR128:$rhs)>; + def : Pat<(v2f64 (X86Addsub (v2f64 VR128:$lhs), (v2f64 (memop addr:$rhs)))), + (VADDSUBPDrm VR128:$lhs, f128mem:$rhs)>; + + def : Pat<(v8f32 (X86Addsub (v8f32 VR256:$lhs), (v8f32 VR256:$rhs))), + (VADDSUBPSYrr VR256:$lhs, VR256:$rhs)>; + def : Pat<(v8f32 (X86Addsub (v8f32 VR256:$lhs), (v8f32 (memop addr:$rhs)))), + (VADDSUBPSYrm VR256:$lhs, f256mem:$rhs)>; + def : Pat<(v4f64 (X86Addsub (v4f64 VR256:$lhs), (v4f64 VR256:$rhs))), + (VADDSUBPDYrr VR256:$lhs, VR256:$rhs)>; + def : Pat<(v4f64 (X86Addsub (v4f64 VR256:$lhs), (v4f64 (memop addr:$rhs)))), + (VADDSUBPDYrm VR256:$lhs, f256mem:$rhs)>; } let Predicates = [UseSSE3] in { - // Constant 10 corresponds to the binary mask '1010'. - // In the pattern below, it is used as a blend mask to select: - // - the 1st and 3rd element from the first input vector (the fsub node); - // - the 2nd and 4th element from the second input vector (the fadd node). 
- - def : Pat<(v4f32 (X86Blendi (v4f32 (fsub VR128:$lhs, VR128:$rhs)), - (v4f32 (fadd VR128:$lhs, VR128:$rhs)), (i32 10))), + def : Pat<(v4f32 (X86Addsub (v4f32 VR128:$lhs), (v4f32 VR128:$rhs))), (ADDSUBPSrr VR128:$lhs, VR128:$rhs)>; - - def : Pat<(v2f64 (X86Blendi (v2f64 (fsub VR128:$lhs, VR128:$rhs)), - (v2f64 (fadd VR128:$lhs, VR128:$rhs)), (i32 2))), - (ADDSUBPDrr VR128:$lhs, VR128:$rhs)>; - def : Pat<(v2f64 (X86Movsd (v2f64 (fadd VR128:$lhs, VR128:$rhs)), - (v2f64 (fsub VR128:$lhs, VR128:$rhs)))), + def : Pat<(v4f32 (X86Addsub (v4f32 VR128:$lhs), (v4f32 (memop addr:$rhs)))), + (ADDSUBPSrm VR128:$lhs, f128mem:$rhs)>; + def : Pat<(v2f64 (X86Addsub (v2f64 VR128:$lhs), (v2f64 VR128:$rhs))), (ADDSUBPDrr VR128:$lhs, VR128:$rhs)>; + def : Pat<(v2f64 (X86Addsub (v2f64 VR128:$lhs), (v2f64 (memop addr:$rhs)))), + (ADDSUBPDrm VR128:$lhs, f128mem:$rhs)>; } //===---------------------------------------------------------------------===// @@ -6692,7 +6869,7 @@ let Constraints = "$src1 = $dst" in multiclass SS41I_insertf32<bits<8> opc, string asm, bit Is2Addr = 1, OpndItins itins = DEFAULT_ITINS> { def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst), - (ins VR128:$src1, VR128:$src2, u32u8imm:$src3), + (ins VR128:$src1, VR128:$src2, i8imm:$src3), !if(Is2Addr, !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), !strconcat(asm, @@ -6701,7 +6878,7 @@ multiclass SS41I_insertf32<bits<8> opc, string asm, bit Is2Addr = 1, (X86insertps VR128:$src1, VR128:$src2, imm:$src3))], itins.rr>, Sched<[WriteFShuffle]>; def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst), - (ins VR128:$src1, f32mem:$src2, u32u8imm:$src3), + (ins VR128:$src1, f32mem:$src2, i8imm:$src3), !if(Is2Addr, !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), !strconcat(asm, @@ -7308,7 +7485,7 @@ let Constraints = "$src1 = $dst" in { let Predicates = [HasAVX] in { defm VPMULLD : SS48I_binop_rm<0x40, "vpmulld", mul, v4i32, VR128, - memopv2i64, i128mem, 0, SSE_INTALU_ITINS_P>, + memopv2i64, i128mem, 0, SSE_PMULLD_ITINS>, VEX_4V; defm VPCMPEQQ : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v2i64, VR128, memopv2i64, i128mem, 0, SSE_INTALU_ITINS_P>, @@ -7316,7 +7493,7 @@ let Predicates = [HasAVX] in { } let Predicates = [HasAVX2] in { defm VPMULLDY : SS48I_binop_rm<0x40, "vpmulld", mul, v8i32, VR256, - memopv4i64, i256mem, 0, SSE_INTALU_ITINS_P>, + memopv4i64, i256mem, 0, SSE_PMULLD_ITINS>, VEX_4V, VEX_L; defm VPCMPEQQY : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v4i64, VR256, memopv4i64, i256mem, 0, SSE_INTALU_ITINS_P>, @@ -7337,7 +7514,7 @@ multiclass SS41I_binop_rmi_int<bits<8> opc, string OpcodeStr, OpndItins itins = DEFAULT_ITINS> { let isCommutable = 1 in def rri : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst), - (ins RC:$src1, RC:$src2, u32u8imm:$src3), + (ins RC:$src1, RC:$src2, i8imm:$src3), !if(Is2Addr, !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), @@ -7346,7 +7523,7 @@ multiclass SS41I_binop_rmi_int<bits<8> opc, string OpcodeStr, [(set RC:$dst, (IntId RC:$src1, RC:$src2, imm:$src3))], itins.rr>, Sched<[itins.Sched]>; def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst), - (ins RC:$src1, x86memop:$src2, u32u8imm:$src3), + (ins RC:$src1, x86memop:$src2, i8imm:$src3), !if(Is2Addr, !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), @@ -7360,31 +7537,33 @@ multiclass SS41I_binop_rmi_int<bits<8> opc, string OpcodeStr, let Predicates = [HasAVX] in { let isCommutable = 0 in { - let ExeDomain = SSEPackedSingle in { - defm VBLENDPS : SS41I_binop_rmi_int<0x0C, "vblendps", int_x86_sse41_blendps, - VR128, 
loadv4f32, f128mem, 0, - DEFAULT_ITINS_FBLENDSCHED>, VEX_4V; - defm VBLENDPSY : SS41I_binop_rmi_int<0x0C, "vblendps", - int_x86_avx_blend_ps_256, VR256, loadv8f32, - f256mem, 0, DEFAULT_ITINS_FBLENDSCHED>, - VEX_4V, VEX_L; - } - let ExeDomain = SSEPackedDouble in { - defm VBLENDPD : SS41I_binop_rmi_int<0x0D, "vblendpd", int_x86_sse41_blendpd, - VR128, loadv2f64, f128mem, 0, - DEFAULT_ITINS_FBLENDSCHED>, VEX_4V; - defm VBLENDPDY : SS41I_binop_rmi_int<0x0D, "vblendpd", - int_x86_avx_blend_pd_256,VR256, loadv4f64, - f256mem, 0, DEFAULT_ITINS_FBLENDSCHED>, - VEX_4V, VEX_L; - } + defm VMPSADBW : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_sse41_mpsadbw, + VR128, loadv2i64, i128mem, 0, + DEFAULT_ITINS_MPSADSCHED>, VEX_4V; + } + + let ExeDomain = SSEPackedSingle in { + defm VBLENDPS : SS41I_binop_rmi_int<0x0C, "vblendps", int_x86_sse41_blendps, + VR128, loadv4f32, f128mem, 0, + DEFAULT_ITINS_FBLENDSCHED>, VEX_4V; + defm VBLENDPSY : SS41I_binop_rmi_int<0x0C, "vblendps", + int_x86_avx_blend_ps_256, VR256, loadv8f32, + f256mem, 0, DEFAULT_ITINS_FBLENDSCHED>, + VEX_4V, VEX_L; + } + let ExeDomain = SSEPackedDouble in { + defm VBLENDPD : SS41I_binop_rmi_int<0x0D, "vblendpd", int_x86_sse41_blendpd, + VR128, loadv2f64, f128mem, 0, + DEFAULT_ITINS_FBLENDSCHED>, VEX_4V; + defm VBLENDPDY : SS41I_binop_rmi_int<0x0D, "vblendpd", + int_x86_avx_blend_pd_256,VR256, loadv4f64, + f256mem, 0, DEFAULT_ITINS_FBLENDSCHED>, + VEX_4V, VEX_L; + } defm VPBLENDW : SS41I_binop_rmi_int<0x0E, "vpblendw", int_x86_sse41_pblendw, VR128, loadv2i64, i128mem, 0, DEFAULT_ITINS_BLENDSCHED>, VEX_4V; - defm VMPSADBW : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_sse41_mpsadbw, - VR128, loadv2i64, i128mem, 0, - DEFAULT_ITINS_MPSADSCHED>, VEX_4V; - } + let ExeDomain = SSEPackedSingle in defm VDPPS : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_sse41_dpps, VR128, loadv4f32, f128mem, 0, @@ -7412,6 +7591,10 @@ let Predicates = [HasAVX2] in { let Constraints = "$src1 = $dst" in { let isCommutable = 0 in { + defm MPSADBW : SS41I_binop_rmi_int<0x42, "mpsadbw", int_x86_sse41_mpsadbw, + VR128, memopv2i64, i128mem, + 1, SSE_MPSADBW_ITINS>; + } let ExeDomain = SSEPackedSingle in defm BLENDPS : SS41I_binop_rmi_int<0x0C, "blendps", int_x86_sse41_blendps, VR128, memopv4f32, f128mem, @@ -7422,11 +7605,7 @@ let Constraints = "$src1 = $dst" in { 1, SSE_INTALU_ITINS_FBLEND_P>; defm PBLENDW : SS41I_binop_rmi_int<0x0E, "pblendw", int_x86_sse41_pblendw, VR128, memopv2i64, i128mem, - 1, SSE_INTALU_ITINS_FBLEND_P>; - defm MPSADBW : SS41I_binop_rmi_int<0x42, "mpsadbw", int_x86_sse41_mpsadbw, - VR128, memopv2i64, i128mem, - 1, SSE_MPSADBW_ITINS>; - } + 1, SSE_INTALU_ITINS_BLEND_P>; let ExeDomain = SSEPackedSingle in defm DPPS : SS41I_binop_rmi_int<0x40, "dpps", int_x86_sse41_dpps, VR128, memopv4f32, f128mem, 1, @@ -7545,6 +7724,57 @@ let Predicates = [HasAVX2] in { (VPBLENDWYrri VR256:$src1, VR256:$src2, imm:$mask)>; } +// Patterns +let Predicates = [UseAVX] in { + let AddedComplexity = 15 in { + // Move scalar to XMM zero-extended, zeroing a VR128 then do a + // MOVS{S,D} to the lower bits. 
+ def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector FR32:$src)))), + (VMOVSSrr (v4f32 (V_SET0)), FR32:$src)>; + def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))), + (VBLENDPSrri (v4f32 (V_SET0)), VR128:$src, (i8 1))>; + def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))), + (VBLENDPSrri (v4i32 (V_SET0)), VR128:$src, (i8 1))>; + def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector FR64:$src)))), + (VMOVSDrr (v2f64 (V_SET0)), FR64:$src)>; + + // Move low f32 and clear high bits. + def : Pat<(v8f32 (X86vzmovl (v8f32 VR256:$src))), + (VBLENDPSYrri (v8f32 (AVX_SET0)), VR256:$src, (i8 1))>; + def : Pat<(v8i32 (X86vzmovl (v8i32 VR256:$src))), + (VBLENDPSYrri (v8i32 (AVX_SET0)), VR256:$src, (i8 1))>; + } + + def : Pat<(v8f32 (X86vzmovl (insert_subvector undef, + (v4f32 (scalar_to_vector FR32:$src)), (iPTR 0)))), + (SUBREG_TO_REG (i32 0), + (v4f32 (VMOVSSrr (v4f32 (V_SET0)), FR32:$src)), + sub_xmm)>; + def : Pat<(v4f64 (X86vzmovl (insert_subvector undef, + (v2f64 (scalar_to_vector FR64:$src)), (iPTR 0)))), + (SUBREG_TO_REG (i64 0), + (v2f64 (VMOVSDrr (v2f64 (V_SET0)), FR64:$src)), + sub_xmm)>; + + // Move low f64 and clear high bits. + def : Pat<(v4f64 (X86vzmovl (v4f64 VR256:$src))), + (VBLENDPDYrri (v4f64 (AVX_SET0)), VR256:$src, (i8 1))>; + + def : Pat<(v4i64 (X86vzmovl (v4i64 VR256:$src))), + (VBLENDPDYrri (v4i64 (AVX_SET0)), VR256:$src, (i8 1))>; +} + +let Predicates = [UseSSE41] in { + // With SSE41 we can use blends for these patterns. + def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))), + (BLENDPSrri (v4f32 (V_SET0)), VR128:$src, (i8 1))>; + def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))), + (BLENDPSrri (v4f32 (V_SET0)), VR128:$src, (i8 1))>; + def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))), + (BLENDPDrri (v2f64 (V_SET0)), VR128:$src, (i8 1))>; +} + + /// SS41I_ternary_int - SSE 4.1 ternary operator let Uses = [XMM0], Constraints = "$src1 = $dst" in { multiclass SS41I_ternary_int<bits<8> opc, string OpcodeStr, PatFrag mem_frag, @@ -7555,7 +7785,7 @@ let Uses = [XMM0], Constraints = "$src1 = $dst" in { !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), [(set VR128:$dst, (IntId VR128:$src1, VR128:$src2, XMM0))], - itins.rr>; + itins.rr>, Sched<[itins.Sched]>; def rm0 : SS48I<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, x86memop:$src2), @@ -7564,18 +7794,21 @@ let Uses = [XMM0], Constraints = "$src1 = $dst" in { [(set VR128:$dst, (IntId VR128:$src1, (bitconvert (mem_frag addr:$src2)), XMM0))], - itins.rm>; + itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>; } } let ExeDomain = SSEPackedDouble in defm BLENDVPD : SS41I_ternary_int<0x15, "blendvpd", memopv2f64, f128mem, - int_x86_sse41_blendvpd>; + int_x86_sse41_blendvpd, + DEFAULT_ITINS_FBLENDSCHED>; let ExeDomain = SSEPackedSingle in defm BLENDVPS : SS41I_ternary_int<0x14, "blendvps", memopv4f32, f128mem, - int_x86_sse41_blendvps>; + int_x86_sse41_blendvps, + DEFAULT_ITINS_FBLENDSCHED>; defm PBLENDVB : SS41I_ternary_int<0x10, "pblendvb", memopv2i64, i128mem, - int_x86_sse41_pblendvb>; + int_x86_sse41_pblendvb, + DEFAULT_ITINS_VARBLENDSCHED>; // Aliases with the implicit xmm0 argument def : InstAlias<"blendvpd\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}", @@ -8393,13 +8626,13 @@ multiclass avx_permil<bits<8> opc_rm, bits<8> opc_rmi, string OpcodeStr, def ri : AVXAIi8<opc_rmi, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, i8imm:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set RC:$dst, (vt (X86VPermilp RC:$src1, (i8 imm:$src2))))]>, VEX, + [(set RC:$dst, (vt (X86VPermilpi RC:$src1, (i8 imm:$src2))))]>, 
VEX, Sched<[WriteFShuffle]>; def mi : AVXAIi8<opc_rmi, MRMSrcMem, (outs RC:$dst), (ins x86memop_f:$src1, i8imm:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set RC:$dst, - (vt (X86VPermilp (memop addr:$src1), (i8 imm:$src2))))]>, VEX, + (vt (X86VPermilpi (memop addr:$src1), (i8 imm:$src2))))]>, VEX, Sched<[WriteFShuffleLd]>; } @@ -8417,19 +8650,37 @@ let ExeDomain = SSEPackedDouble in { } let Predicates = [HasAVX] in { -def : Pat<(v8i32 (X86VPermilp VR256:$src1, (i8 imm:$imm))), +def : Pat<(v8f32 (X86VPermilpv VR256:$src1, (v8i32 VR256:$src2))), + (VPERMILPSYrr VR256:$src1, VR256:$src2)>; +def : Pat<(v8f32 (X86VPermilpv VR256:$src1, (bc_v8i32 (loadv4i64 addr:$src2)))), + (VPERMILPSYrm VR256:$src1, addr:$src2)>; +def : Pat<(v4f64 (X86VPermilpv VR256:$src1, (v4i64 VR256:$src2))), + (VPERMILPDYrr VR256:$src1, VR256:$src2)>; +def : Pat<(v4f64 (X86VPermilpv VR256:$src1, (loadv4i64 addr:$src2))), + (VPERMILPDYrm VR256:$src1, addr:$src2)>; + +def : Pat<(v8i32 (X86VPermilpi VR256:$src1, (i8 imm:$imm))), (VPERMILPSYri VR256:$src1, imm:$imm)>; -def : Pat<(v4i64 (X86VPermilp VR256:$src1, (i8 imm:$imm))), +def : Pat<(v4i64 (X86VPermilpi VR256:$src1, (i8 imm:$imm))), (VPERMILPDYri VR256:$src1, imm:$imm)>; -def : Pat<(v8i32 (X86VPermilp (bc_v8i32 (loadv4i64 addr:$src1)), +def : Pat<(v8i32 (X86VPermilpi (bc_v8i32 (loadv4i64 addr:$src1)), (i8 imm:$imm))), (VPERMILPSYmi addr:$src1, imm:$imm)>; -def : Pat<(v4i64 (X86VPermilp (loadv4i64 addr:$src1), (i8 imm:$imm))), +def : Pat<(v4i64 (X86VPermilpi (loadv4i64 addr:$src1), (i8 imm:$imm))), (VPERMILPDYmi addr:$src1, imm:$imm)>; -def : Pat<(v2i64 (X86VPermilp VR128:$src1, (i8 imm:$imm))), +def : Pat<(v4f32 (X86VPermilpv VR128:$src1, (v4i32 VR128:$src2))), + (VPERMILPSrr VR128:$src1, VR128:$src2)>; +def : Pat<(v4f32 (X86VPermilpv VR128:$src1, (bc_v4i32 (loadv2i64 addr:$src2)))), + (VPERMILPSrm VR128:$src1, addr:$src2)>; +def : Pat<(v2f64 (X86VPermilpv VR128:$src1, (v2i64 VR128:$src2))), + (VPERMILPDrr VR128:$src1, VR128:$src2)>; +def : Pat<(v2f64 (X86VPermilpv VR128:$src1, (loadv2i64 addr:$src2))), + (VPERMILPDrm VR128:$src1, addr:$src2)>; + +def : Pat<(v2i64 (X86VPermilpi VR128:$src1, (i8 imm:$imm))), (VPERMILPDri VR128:$src1, imm:$imm)>; -def : Pat<(v2i64 (X86VPermilp (loadv2i64 addr:$src1), (i8 imm:$imm))), +def : Pat<(v2i64 (X86VPermilpi (loadv2i64 addr:$src1), (i8 imm:$imm))), (VPERMILPDmi addr:$src1, imm:$imm)>; } @@ -8540,15 +8791,15 @@ let Predicates = [HasF16C] in { // Patterns for matching conversions from float to half-float and vice versa. 
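The HasF16C patterns below map the renamed fp_to_f16 / f16_to_fp nodes onto VCVTPS2PH / VCVTPH2PS, including the explicit float-to-half-to-float round trip. At the source level the same round trip is reachable through the F16C intrinsics; a minimal sketch, assuming a compiler and target with F16C enabled (e.g. -mf16c) and using an illustrative function name:

#include <immintrin.h>

// float -> 16-bit half bits -> float, via VCVTPS2PH / VCVTPH2PS.
// _MM_FROUND_TO_NEAREST_INT selects round-to-nearest-even for the
// narrowing conversion.
float roundtrip_through_half(float x) {
  unsigned short h = _cvtss_sh(x, _MM_FROUND_TO_NEAREST_INT);
  return _cvtsh_ss(h);
}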
let Predicates = [HasF16C] in { - def : Pat<(f32_to_f16 FR32:$src), + def : Pat<(fp_to_f16 FR32:$src), (i16 (EXTRACT_SUBREG (VMOVPDI2DIrr (VCVTPS2PHrr (COPY_TO_REGCLASS FR32:$src, VR128), 0)), sub_16bit))>; - def : Pat<(f16_to_f32 GR16:$src), + def : Pat<(f16_to_fp GR16:$src), (f32 (COPY_TO_REGCLASS (VCVTPH2PSrr (COPY_TO_REGCLASS (MOVSX32rr16 GR16:$src), VR128)), FR32)) >; - def : Pat<(f16_to_f32 (i16 (f32_to_f16 FR32:$src))), + def : Pat<(f16_to_fp (i16 (fp_to_f16 FR32:$src))), (f32 (COPY_TO_REGCLASS (VCVTPH2PSrr (VCVTPS2PHrr (COPY_TO_REGCLASS FR32:$src, VR128), 0)), FR32)) >; } @@ -8563,13 +8814,13 @@ multiclass AVX2_binop_rmi_int<bits<8> opc, string OpcodeStr, X86MemOperand x86memop> { let isCommutable = 1 in def rri : AVX2AIi8<opc, MRMSrcReg, (outs RC:$dst), - (ins RC:$src1, RC:$src2, u32u8imm:$src3), + (ins RC:$src1, RC:$src2, i8imm:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), [(set RC:$dst, (IntId RC:$src1, RC:$src2, imm:$src3))]>, Sched<[WriteBlend]>, VEX_4V; def rmi : AVX2AIi8<opc, MRMSrcMem, (outs RC:$dst), - (ins RC:$src1, x86memop:$src2, u32u8imm:$src3), + (ins RC:$src1, x86memop:$src2, i8imm:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), [(set RC:$dst, @@ -8578,12 +8829,10 @@ multiclass AVX2_binop_rmi_int<bits<8> opc, string OpcodeStr, Sched<[WriteBlendLd, ReadAfterLd]>, VEX_4V; } -let isCommutable = 0 in { defm VPBLENDD : AVX2_binop_rmi_int<0x02, "vpblendd", int_x86_avx2_pblendd_128, VR128, loadv2i64, i128mem>; defm VPBLENDDY : AVX2_binop_rmi_int<0x02, "vpblendd", int_x86_avx2_pblendd_256, VR256, loadv4i64, i256mem>, VEX_L; -} def : Pat<(v4i32 (X86Blendi (v4i32 VR128:$src1), (v4i32 VR128:$src2), imm:$mask)), @@ -8675,6 +8924,27 @@ let Predicates = [HasAVX2] in { def : Pat<(v4f64 (X86VBroadcast (v2f64 VR128:$src))), (VBROADCASTSDYrr VR128:$src)>; + // Provide aliases for broadcast from the same regitser class that + // automatically does the extract. + def : Pat<(v32i8 (X86VBroadcast (v32i8 VR256:$src))), + (VPBROADCASTBYrr (v16i8 (EXTRACT_SUBREG (v32i8 VR256:$src), + sub_xmm)))>; + def : Pat<(v16i16 (X86VBroadcast (v16i16 VR256:$src))), + (VPBROADCASTWYrr (v8i16 (EXTRACT_SUBREG (v16i16 VR256:$src), + sub_xmm)))>; + def : Pat<(v8i32 (X86VBroadcast (v8i32 VR256:$src))), + (VPBROADCASTDYrr (v4i32 (EXTRACT_SUBREG (v8i32 VR256:$src), + sub_xmm)))>; + def : Pat<(v4i64 (X86VBroadcast (v4i64 VR256:$src))), + (VPBROADCASTQYrr (v2i64 (EXTRACT_SUBREG (v4i64 VR256:$src), + sub_xmm)))>; + def : Pat<(v8f32 (X86VBroadcast (v8f32 VR256:$src))), + (VBROADCASTSSYrr (v4f32 (EXTRACT_SUBREG (v8f32 VR256:$src), + sub_xmm)))>; + def : Pat<(v4f64 (X86VBroadcast (v4f64 VR256:$src))), + (VBROADCASTSDYrr (v2f64 (EXTRACT_SUBREG (v4f64 VR256:$src), + sub_xmm)))>; + // Provide fallback in case the load node that is used in the patterns above // is used by additional users, which prevents the pattern selection. 
let AddedComplexity = 20 in { @@ -8756,6 +9026,9 @@ let Predicates = [HasAVX] in { (VPSHUFDri (COPY_TO_REGCLASS GR64:$src, VR128), 0x44), sub_xmm), (VPSHUFDri (COPY_TO_REGCLASS GR64:$src, VR128), 0x44), 1)>; } + + def : Pat<(v2f64 (X86VBroadcast f64:$src)), + (VMOVDDUPrr (COPY_TO_REGCLASS FR64:$src, VR128))>; } //===----------------------------------------------------------------------===// @@ -8763,14 +9036,14 @@ let Predicates = [HasAVX] in { // multiclass avx2_perm<bits<8> opc, string OpcodeStr, PatFrag mem_frag, - ValueType OpVT> { + ValueType OpVT, X86FoldableSchedWrite Sched> { def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src1, VR256:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR256:$dst, (OpVT (X86VPermv VR256:$src1, VR256:$src2)))]>, - Sched<[WriteFShuffle256]>, VEX_4V, VEX_L; + Sched<[Sched]>, VEX_4V, VEX_L; def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst), (ins VR256:$src1, i256mem:$src2), !strconcat(OpcodeStr, @@ -8778,22 +9051,22 @@ multiclass avx2_perm<bits<8> opc, string OpcodeStr, PatFrag mem_frag, [(set VR256:$dst, (OpVT (X86VPermv VR256:$src1, (bitconvert (mem_frag addr:$src2)))))]>, - Sched<[WriteFShuffle256Ld, ReadAfterLd]>, VEX_4V, VEX_L; + Sched<[Sched.Folded, ReadAfterLd]>, VEX_4V, VEX_L; } -defm VPERMD : avx2_perm<0x36, "vpermd", loadv4i64, v8i32>; +defm VPERMD : avx2_perm<0x36, "vpermd", loadv4i64, v8i32, WriteShuffle256>; let ExeDomain = SSEPackedSingle in -defm VPERMPS : avx2_perm<0x16, "vpermps", loadv8f32, v8f32>; +defm VPERMPS : avx2_perm<0x16, "vpermps", loadv8f32, v8f32, WriteFShuffle256>; multiclass avx2_perm_imm<bits<8> opc, string OpcodeStr, PatFrag mem_frag, - ValueType OpVT> { + ValueType OpVT, X86FoldableSchedWrite Sched> { def Yri : AVX2AIi8<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src1, i8imm:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR256:$dst, (OpVT (X86VPermi VR256:$src1, (i8 imm:$src2))))]>, - Sched<[WriteShuffle256]>, VEX, VEX_L; + Sched<[Sched]>, VEX, VEX_L; def Ymi : AVX2AIi8<opc, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src1, i8imm:$src2), !strconcat(OpcodeStr, @@ -8801,12 +9074,14 @@ multiclass avx2_perm_imm<bits<8> opc, string OpcodeStr, PatFrag mem_frag, [(set VR256:$dst, (OpVT (X86VPermi (mem_frag addr:$src1), (i8 imm:$src2))))]>, - Sched<[WriteShuffle256Ld, ReadAfterLd]>, VEX, VEX_L; + Sched<[Sched.Folded, ReadAfterLd]>, VEX, VEX_L; } -defm VPERMQ : avx2_perm_imm<0x00, "vpermq", loadv4i64, v4i64>, VEX_W; +defm VPERMQ : avx2_perm_imm<0x00, "vpermq", loadv4i64, v4i64, + WriteShuffle256>, VEX_W; let ExeDomain = SSEPackedDouble in -defm VPERMPD : avx2_perm_imm<0x01, "vpermpd", loadv4f64, v4f64>, VEX_W; +defm VPERMPD : avx2_perm_imm<0x01, "vpermpd", loadv4f64, v4f64, + WriteFShuffle256>, VEX_W; //===----------------------------------------------------------------------===// // VPERM2I128 - Permute Floating-Point Values in 128-bit chunks diff --git a/lib/Target/X86/X86InstrSystem.td b/lib/Target/X86/X86InstrSystem.td index 5402780..8cabdd0 100644 --- a/lib/Target/X86/X86InstrSystem.td +++ b/lib/Target/X86/X86InstrSystem.td @@ -462,11 +462,7 @@ def LMSW16m : I<0x01, MRM6m, (outs), (ins i16mem:$src), "lmsw{w}\t$src", [], IIC_LMSW_REG>, TB; let Defs = [EAX, EBX, ECX, EDX], Uses = [EAX, ECX] in - def CPUID32 : I<0xA2, RawFrm, (outs), (ins), "cpuid", [], IIC_CPUID>, TB, - Requires<[Not64BitMode]>; -let Defs = [RAX, RBX, RCX, RDX], Uses = [RAX, RCX] in - def CPUID64 : I<0xA2, RawFrm, (outs), (ins), "cpuid", [], IIC_CPUID>, TB, - 
Requires<[In64BitMode]>; + def CPUID : I<0xA2, RawFrm, (outs), (ins), "cpuid", [], IIC_CPUID>, TB; } // SchedRW //===----------------------------------------------------------------------===// @@ -479,10 +475,10 @@ def WBINVD : I<0x09, RawFrm, (outs), (ins), "wbinvd", [], IIC_INVD>, TB; //===----------------------------------------------------------------------===// // XSAVE instructions let SchedRW = [WriteSystem] in { -let Defs = [RDX, RAX], Uses = [RCX] in +let Defs = [EDX, EAX], Uses = [ECX] in def XGETBV : I<0x01, MRM_D0, (outs), (ins), "xgetbv", []>, TB; -let Uses = [RDX, RAX, RCX] in +let Uses = [EDX, EAX, ECX] in def XSETBV : I<0x01, MRM_D1, (outs), (ins), "xsetbv", []>, TB; let Uses = [RDX, RAX] in { @@ -563,7 +559,7 @@ def INVPCID64 : I<0x82, MRMSrcMem, (outs), (ins GR64:$src1, i128mem:$src2), //===----------------------------------------------------------------------===// // SMAP Instruction -let Defs = [EFLAGS], Uses = [EFLAGS] in { +let Predicates = [HasSMAP], Defs = [EFLAGS] in { def CLAC : I<0x01, MRM_CA, (outs), (ins), "clac", []>, TB; def STAC : I<0x01, MRM_CB, (outs), (ins), "stac", []>, TB; } diff --git a/lib/Target/X86/X86IntrinsicsInfo.h b/lib/Target/X86/X86IntrinsicsInfo.h new file mode 100644 index 0000000..d252f72 --- /dev/null +++ b/lib/Target/X86/X86IntrinsicsInfo.h @@ -0,0 +1,320 @@ +//===-- X86IntinsicsInfo.h - X86 Instrinsics ------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the details for lowering X86 intrinsics +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_X86_X86INTRINSICSINFO_H +#define LLVM_LIB_TARGET_X86_X86INTRINSICSINFO_H + +namespace llvm { + +enum IntrinsicType { + INTR_NO_TYPE, + GATHER, SCATTER, PREFETCH, RDSEED, RDRAND, RDPMC, RDTSC, XTEST, ADX, + INTR_TYPE_1OP, INTR_TYPE_2OP, INTR_TYPE_3OP, + CMP_MASK, CMP_MASK_CC, VSHIFT, VSHIFT_MASK, COMI, + INTR_TYPE_1OP_MASK_RM +}; + +struct IntrinsicData { + + unsigned Id; + IntrinsicType Type; + unsigned Opc0; + unsigned Opc1; + + bool operator<(const IntrinsicData &RHS) const { + return Id < RHS.Id; + } + bool operator==(const IntrinsicData &RHS) const { + return RHS.Id == Id; + } +}; + +#define X86_INTRINSIC_DATA(id, type, op0, op1) \ + { Intrinsic::x86_##id, type, op0, op1 } + +/* + * IntrinsicsWithChain - the table should be sorted by Intrinsic ID - in + * the alphabetical order. 
+ */ +static const IntrinsicData IntrinsicsWithChain[] = { + X86_INTRINSIC_DATA(addcarry_u32, ADX, X86ISD::ADC, 0), + X86_INTRINSIC_DATA(addcarry_u64, ADX, X86ISD::ADC, 0), + X86_INTRINSIC_DATA(addcarryx_u32, ADX, X86ISD::ADC, 0), + X86_INTRINSIC_DATA(addcarryx_u64, ADX, X86ISD::ADC, 0), + + X86_INTRINSIC_DATA(avx512_gather_dpd_512, GATHER, X86::VGATHERDPDZrm, 0), + X86_INTRINSIC_DATA(avx512_gather_dpi_512, GATHER, X86::VPGATHERDDZrm, 0), + X86_INTRINSIC_DATA(avx512_gather_dpq_512, GATHER, X86::VPGATHERDQZrm, 0), + X86_INTRINSIC_DATA(avx512_gather_dps_512, GATHER, X86::VGATHERDPSZrm, 0), + X86_INTRINSIC_DATA(avx512_gather_qpd_512, GATHER, X86::VGATHERQPDZrm, 0), + X86_INTRINSIC_DATA(avx512_gather_qpi_512, GATHER, X86::VPGATHERQDZrm, 0), + X86_INTRINSIC_DATA(avx512_gather_qpq_512, GATHER, X86::VPGATHERQQZrm, 0), + X86_INTRINSIC_DATA(avx512_gather_qps_512, GATHER, X86::VGATHERQPSZrm, 0), + + X86_INTRINSIC_DATA(avx512_gatherpf_dpd_512, PREFETCH, + X86::VGATHERPF0DPDm, X86::VGATHERPF1DPDm), + X86_INTRINSIC_DATA(avx512_gatherpf_dps_512, PREFETCH, + X86::VGATHERPF0DPSm, X86::VGATHERPF1DPSm), + X86_INTRINSIC_DATA(avx512_gatherpf_qpd_512, PREFETCH, + X86::VGATHERPF0QPDm, X86::VGATHERPF1QPDm), + X86_INTRINSIC_DATA(avx512_gatherpf_qps_512, PREFETCH, + X86::VGATHERPF0QPSm, X86::VGATHERPF1QPSm), + + X86_INTRINSIC_DATA(avx512_scatter_dpd_512, SCATTER, X86::VSCATTERDPDZmr, 0), + X86_INTRINSIC_DATA(avx512_scatter_dpi_512, SCATTER, X86::VPSCATTERDDZmr, 0), + X86_INTRINSIC_DATA(avx512_scatter_dpq_512, SCATTER, X86::VPSCATTERDQZmr, 0), + X86_INTRINSIC_DATA(avx512_scatter_dps_512, SCATTER, X86::VSCATTERDPSZmr, 0), + X86_INTRINSIC_DATA(avx512_scatter_qpd_512, SCATTER, X86::VSCATTERQPDZmr, 0), + X86_INTRINSIC_DATA(avx512_scatter_qpi_512, SCATTER, X86::VPSCATTERQDZmr, 0), + X86_INTRINSIC_DATA(avx512_scatter_qpq_512, SCATTER, X86::VPSCATTERQQZmr, 0), + X86_INTRINSIC_DATA(avx512_scatter_qps_512, SCATTER, X86::VSCATTERQPSZmr, 0), + + X86_INTRINSIC_DATA(avx512_scatterpf_dpd_512, PREFETCH, + X86::VSCATTERPF0DPDm, X86::VSCATTERPF1DPDm), + X86_INTRINSIC_DATA(avx512_scatterpf_dps_512, PREFETCH, + X86::VSCATTERPF0DPSm, X86::VSCATTERPF1DPSm), + X86_INTRINSIC_DATA(avx512_scatterpf_qpd_512, PREFETCH, + X86::VSCATTERPF0QPDm, X86::VSCATTERPF1QPDm), + X86_INTRINSIC_DATA(avx512_scatterpf_qps_512, PREFETCH, + X86::VSCATTERPF0QPSm, X86::VSCATTERPF1QPSm), + + X86_INTRINSIC_DATA(rdpmc, RDPMC, X86ISD::RDPMC_DAG, 0), + X86_INTRINSIC_DATA(rdrand_16, RDRAND, X86ISD::RDRAND, 0), + X86_INTRINSIC_DATA(rdrand_32, RDRAND, X86ISD::RDRAND, 0), + X86_INTRINSIC_DATA(rdrand_64, RDRAND, X86ISD::RDRAND, 0), + X86_INTRINSIC_DATA(rdseed_16, RDSEED, X86ISD::RDSEED, 0), + X86_INTRINSIC_DATA(rdseed_32, RDSEED, X86ISD::RDSEED, 0), + X86_INTRINSIC_DATA(rdseed_64, RDSEED, X86ISD::RDSEED, 0), + X86_INTRINSIC_DATA(rdtsc, RDTSC, X86ISD::RDTSC_DAG, 0), + X86_INTRINSIC_DATA(rdtscp, RDTSC, X86ISD::RDTSCP_DAG, 0), + + X86_INTRINSIC_DATA(subborrow_u32, ADX, X86ISD::SBB, 0), + X86_INTRINSIC_DATA(subborrow_u64, ADX, X86ISD::SBB, 0), + X86_INTRINSIC_DATA(xtest, XTEST, X86ISD::XTEST, 0), +}; + +/* + * Find Intrinsic data by intrinsic ID + */ +static const IntrinsicData* getIntrinsicWithChain(unsigned IntNo) { + + IntrinsicData IntrinsicToFind = {IntNo, INTR_NO_TYPE, 0, 0 }; + const IntrinsicData *Data = std::lower_bound(std::begin(IntrinsicsWithChain), + std::end(IntrinsicsWithChain), + IntrinsicToFind); + if (Data != std::end(IntrinsicsWithChain) && *Data == IntrinsicToFind) + return Data; + return nullptr; +} + +/* + * IntrinsicsWithoutChain - the table should be 
sorted by Intrinsic ID - in + * the alphabetical order. + */ +static const IntrinsicData IntrinsicsWithoutChain[] = { + X86_INTRINSIC_DATA(avx2_phadd_d, INTR_TYPE_2OP, X86ISD::HADD, 0), + X86_INTRINSIC_DATA(avx2_phadd_w, INTR_TYPE_2OP, X86ISD::HADD, 0), + X86_INTRINSIC_DATA(avx2_phsub_d, INTR_TYPE_2OP, X86ISD::HSUB, 0), + X86_INTRINSIC_DATA(avx2_phsub_w, INTR_TYPE_2OP, X86ISD::HSUB, 0), + X86_INTRINSIC_DATA(avx2_pmaxs_b, INTR_TYPE_2OP, X86ISD::SMAX, 0), + X86_INTRINSIC_DATA(avx2_pmaxs_d, INTR_TYPE_2OP, X86ISD::SMAX, 0), + X86_INTRINSIC_DATA(avx2_pmaxs_w, INTR_TYPE_2OP, X86ISD::SMAX, 0), + X86_INTRINSIC_DATA(avx2_pmaxu_b, INTR_TYPE_2OP, X86ISD::UMAX, 0), + X86_INTRINSIC_DATA(avx2_pmaxu_d, INTR_TYPE_2OP, X86ISD::UMAX, 0), + X86_INTRINSIC_DATA(avx2_pmaxu_w, INTR_TYPE_2OP, X86ISD::UMAX, 0), + X86_INTRINSIC_DATA(avx2_pmins_b, INTR_TYPE_2OP, X86ISD::SMIN, 0), + X86_INTRINSIC_DATA(avx2_pmins_d, INTR_TYPE_2OP, X86ISD::SMIN, 0), + X86_INTRINSIC_DATA(avx2_pmins_w, INTR_TYPE_2OP, X86ISD::SMIN, 0), + X86_INTRINSIC_DATA(avx2_pminu_b, INTR_TYPE_2OP, X86ISD::UMIN, 0), + X86_INTRINSIC_DATA(avx2_pminu_d, INTR_TYPE_2OP, X86ISD::UMIN, 0), + X86_INTRINSIC_DATA(avx2_pminu_w, INTR_TYPE_2OP, X86ISD::UMIN, 0), + X86_INTRINSIC_DATA(avx2_psll_d, INTR_TYPE_2OP, X86ISD::VSHL, 0), + X86_INTRINSIC_DATA(avx2_psll_q, INTR_TYPE_2OP, X86ISD::VSHL, 0), + X86_INTRINSIC_DATA(avx2_psll_w, INTR_TYPE_2OP, X86ISD::VSHL, 0), + X86_INTRINSIC_DATA(avx2_pslli_d, VSHIFT, X86ISD::VSHLI, 0), + X86_INTRINSIC_DATA(avx2_pslli_q, VSHIFT, X86ISD::VSHLI, 0), + X86_INTRINSIC_DATA(avx2_pslli_w, VSHIFT, X86ISD::VSHLI, 0), + X86_INTRINSIC_DATA(avx2_psra_d, INTR_TYPE_2OP, X86ISD::VSRA, 0), + X86_INTRINSIC_DATA(avx2_psra_w, INTR_TYPE_2OP, X86ISD::VSRA, 0), + X86_INTRINSIC_DATA(avx2_psrai_d, VSHIFT, X86ISD::VSRAI, 0), + X86_INTRINSIC_DATA(avx2_psrai_w, VSHIFT, X86ISD::VSRAI, 0), + X86_INTRINSIC_DATA(avx2_psrl_d, INTR_TYPE_2OP, X86ISD::VSRL, 0), + X86_INTRINSIC_DATA(avx2_psrl_q, INTR_TYPE_2OP, X86ISD::VSRL, 0), + X86_INTRINSIC_DATA(avx2_psrl_w, INTR_TYPE_2OP, X86ISD::VSRL, 0), + X86_INTRINSIC_DATA(avx2_psrli_d, VSHIFT, X86ISD::VSRLI, 0), + X86_INTRINSIC_DATA(avx2_psrli_q, VSHIFT, X86ISD::VSRLI, 0), + X86_INTRINSIC_DATA(avx2_psrli_w, VSHIFT, X86ISD::VSRLI, 0), + X86_INTRINSIC_DATA(avx2_psubus_b, INTR_TYPE_2OP, X86ISD::SUBUS, 0), + X86_INTRINSIC_DATA(avx2_psubus_w, INTR_TYPE_2OP, X86ISD::SUBUS, 0), + X86_INTRINSIC_DATA(avx2_vperm2i128, INTR_TYPE_3OP, X86ISD::VPERM2X128, 0), + X86_INTRINSIC_DATA(avx512_exp2_pd, INTR_TYPE_1OP_MASK_RM,X86ISD::EXP2, 0), + X86_INTRINSIC_DATA(avx512_exp2_ps, INTR_TYPE_1OP_MASK_RM,X86ISD::EXP2, 0), + X86_INTRINSIC_DATA(avx512_mask_cmp_b_128, CMP_MASK_CC, X86ISD::CMPM, 0), + X86_INTRINSIC_DATA(avx512_mask_cmp_b_256, CMP_MASK_CC, X86ISD::CMPM, 0), + X86_INTRINSIC_DATA(avx512_mask_cmp_b_512, CMP_MASK_CC, X86ISD::CMPM, 0), + X86_INTRINSIC_DATA(avx512_mask_cmp_d_128, CMP_MASK_CC, X86ISD::CMPM, 0), + X86_INTRINSIC_DATA(avx512_mask_cmp_d_256, CMP_MASK_CC, X86ISD::CMPM, 0), + X86_INTRINSIC_DATA(avx512_mask_cmp_d_512, CMP_MASK_CC, X86ISD::CMPM, 0), + X86_INTRINSIC_DATA(avx512_mask_cmp_q_128, CMP_MASK_CC, X86ISD::CMPM, 0), + X86_INTRINSIC_DATA(avx512_mask_cmp_q_256, CMP_MASK_CC, X86ISD::CMPM, 0), + X86_INTRINSIC_DATA(avx512_mask_cmp_q_512, CMP_MASK_CC, X86ISD::CMPM, 0), + X86_INTRINSIC_DATA(avx512_mask_cmp_w_128, CMP_MASK_CC, X86ISD::CMPM, 0), + X86_INTRINSIC_DATA(avx512_mask_cmp_w_256, CMP_MASK_CC, X86ISD::CMPM, 0), + X86_INTRINSIC_DATA(avx512_mask_cmp_w_512, CMP_MASK_CC, X86ISD::CMPM, 0), + 
X86_INTRINSIC_DATA(avx512_mask_pcmpeq_b_128, CMP_MASK, X86ISD::PCMPEQM, 0), + X86_INTRINSIC_DATA(avx512_mask_pcmpeq_b_256, CMP_MASK, X86ISD::PCMPEQM, 0), + X86_INTRINSIC_DATA(avx512_mask_pcmpeq_b_512, CMP_MASK, X86ISD::PCMPEQM, 0), + X86_INTRINSIC_DATA(avx512_mask_pcmpeq_d_128, CMP_MASK, X86ISD::PCMPEQM, 0), + X86_INTRINSIC_DATA(avx512_mask_pcmpeq_d_256, CMP_MASK, X86ISD::PCMPEQM, 0), + X86_INTRINSIC_DATA(avx512_mask_pcmpeq_d_512, CMP_MASK, X86ISD::PCMPEQM, 0), + X86_INTRINSIC_DATA(avx512_mask_pcmpeq_q_128, CMP_MASK, X86ISD::PCMPEQM, 0), + X86_INTRINSIC_DATA(avx512_mask_pcmpeq_q_256, CMP_MASK, X86ISD::PCMPEQM, 0), + X86_INTRINSIC_DATA(avx512_mask_pcmpeq_q_512, CMP_MASK, X86ISD::PCMPEQM, 0), + X86_INTRINSIC_DATA(avx512_mask_pcmpeq_w_128, CMP_MASK, X86ISD::PCMPEQM, 0), + X86_INTRINSIC_DATA(avx512_mask_pcmpeq_w_256, CMP_MASK, X86ISD::PCMPEQM, 0), + X86_INTRINSIC_DATA(avx512_mask_pcmpeq_w_512, CMP_MASK, X86ISD::PCMPEQM, 0), + X86_INTRINSIC_DATA(avx512_mask_pcmpgt_b_128, CMP_MASK, X86ISD::PCMPGTM, 0), + X86_INTRINSIC_DATA(avx512_mask_pcmpgt_b_256, CMP_MASK, X86ISD::PCMPGTM, 0), + X86_INTRINSIC_DATA(avx512_mask_pcmpgt_b_512, CMP_MASK, X86ISD::PCMPGTM, 0), + X86_INTRINSIC_DATA(avx512_mask_pcmpgt_d_128, CMP_MASK, X86ISD::PCMPGTM, 0), + X86_INTRINSIC_DATA(avx512_mask_pcmpgt_d_256, CMP_MASK, X86ISD::PCMPGTM, 0), + X86_INTRINSIC_DATA(avx512_mask_pcmpgt_d_512, CMP_MASK, X86ISD::PCMPGTM, 0), + X86_INTRINSIC_DATA(avx512_mask_pcmpgt_q_128, CMP_MASK, X86ISD::PCMPGTM, 0), + X86_INTRINSIC_DATA(avx512_mask_pcmpgt_q_256, CMP_MASK, X86ISD::PCMPGTM, 0), + X86_INTRINSIC_DATA(avx512_mask_pcmpgt_q_512, CMP_MASK, X86ISD::PCMPGTM, 0), + X86_INTRINSIC_DATA(avx512_mask_pcmpgt_w_128, CMP_MASK, X86ISD::PCMPGTM, 0), + X86_INTRINSIC_DATA(avx512_mask_pcmpgt_w_256, CMP_MASK, X86ISD::PCMPGTM, 0), + X86_INTRINSIC_DATA(avx512_mask_pcmpgt_w_512, CMP_MASK, X86ISD::PCMPGTM, 0), + X86_INTRINSIC_DATA(avx512_mask_pslli_d, VSHIFT_MASK, X86ISD::VSHLI, 0), + X86_INTRINSIC_DATA(avx512_mask_pslli_q, VSHIFT_MASK, X86ISD::VSHLI, 0), + X86_INTRINSIC_DATA(avx512_mask_psrai_d, VSHIFT_MASK, X86ISD::VSRAI, 0), + X86_INTRINSIC_DATA(avx512_mask_psrai_q, VSHIFT_MASK, X86ISD::VSRAI, 0), + X86_INTRINSIC_DATA(avx512_mask_psrli_d, VSHIFT_MASK, X86ISD::VSRLI, 0), + X86_INTRINSIC_DATA(avx512_mask_psrli_q, VSHIFT_MASK, X86ISD::VSRLI, 0), + X86_INTRINSIC_DATA(avx512_mask_ucmp_b_128, CMP_MASK_CC, X86ISD::CMPMU, 0), + X86_INTRINSIC_DATA(avx512_mask_ucmp_b_256, CMP_MASK_CC, X86ISD::CMPMU, 0), + X86_INTRINSIC_DATA(avx512_mask_ucmp_b_512, CMP_MASK_CC, X86ISD::CMPMU, 0), + X86_INTRINSIC_DATA(avx512_mask_ucmp_d_128, CMP_MASK_CC, X86ISD::CMPMU, 0), + X86_INTRINSIC_DATA(avx512_mask_ucmp_d_256, CMP_MASK_CC, X86ISD::CMPMU, 0), + X86_INTRINSIC_DATA(avx512_mask_ucmp_d_512, CMP_MASK_CC, X86ISD::CMPMU, 0), + X86_INTRINSIC_DATA(avx512_mask_ucmp_q_128, CMP_MASK_CC, X86ISD::CMPMU, 0), + X86_INTRINSIC_DATA(avx512_mask_ucmp_q_256, CMP_MASK_CC, X86ISD::CMPMU, 0), + X86_INTRINSIC_DATA(avx512_mask_ucmp_q_512, CMP_MASK_CC, X86ISD::CMPMU, 0), + X86_INTRINSIC_DATA(avx512_mask_ucmp_w_128, CMP_MASK_CC, X86ISD::CMPMU, 0), + X86_INTRINSIC_DATA(avx512_mask_ucmp_w_256, CMP_MASK_CC, X86ISD::CMPMU, 0), + X86_INTRINSIC_DATA(avx512_mask_ucmp_w_512, CMP_MASK_CC, X86ISD::CMPMU, 0), + X86_INTRINSIC_DATA(avx512_rcp28_pd, INTR_TYPE_1OP_MASK_RM,X86ISD::RCP28, 0), + X86_INTRINSIC_DATA(avx512_rcp28_ps, INTR_TYPE_1OP_MASK_RM,X86ISD::RCP28, 0), + X86_INTRINSIC_DATA(avx512_rsqrt28_pd, INTR_TYPE_1OP_MASK_RM,X86ISD::RSQRT28, 0), + X86_INTRINSIC_DATA(avx512_rsqrt28_ps, 
INTR_TYPE_1OP_MASK_RM,X86ISD::RSQRT28, 0), + X86_INTRINSIC_DATA(avx_hadd_pd_256, INTR_TYPE_2OP, X86ISD::FHADD, 0), + X86_INTRINSIC_DATA(avx_hadd_ps_256, INTR_TYPE_2OP, X86ISD::FHADD, 0), + X86_INTRINSIC_DATA(avx_hsub_pd_256, INTR_TYPE_2OP, X86ISD::FHSUB, 0), + X86_INTRINSIC_DATA(avx_hsub_ps_256, INTR_TYPE_2OP, X86ISD::FHSUB, 0), + X86_INTRINSIC_DATA(avx_sqrt_pd_256, INTR_TYPE_1OP, ISD::FSQRT, 0), + X86_INTRINSIC_DATA(avx_sqrt_ps_256, INTR_TYPE_1OP, ISD::FSQRT, 0), + X86_INTRINSIC_DATA(avx_vperm2f128_pd_256, INTR_TYPE_3OP, X86ISD::VPERM2X128, 0), + X86_INTRINSIC_DATA(avx_vperm2f128_ps_256, INTR_TYPE_3OP, X86ISD::VPERM2X128, 0), + X86_INTRINSIC_DATA(avx_vperm2f128_si_256, INTR_TYPE_3OP, X86ISD::VPERM2X128, 0), + X86_INTRINSIC_DATA(sse2_comieq_sd, COMI, X86ISD::COMI, ISD::SETEQ), + X86_INTRINSIC_DATA(sse2_comige_sd, COMI, X86ISD::COMI, ISD::SETGE), + X86_INTRINSIC_DATA(sse2_comigt_sd, COMI, X86ISD::COMI, ISD::SETGT), + X86_INTRINSIC_DATA(sse2_comile_sd, COMI, X86ISD::COMI, ISD::SETLE), + X86_INTRINSIC_DATA(sse2_comilt_sd, COMI, X86ISD::COMI, ISD::SETLT), + X86_INTRINSIC_DATA(sse2_comineq_sd, COMI, X86ISD::COMI, ISD::SETNE), + X86_INTRINSIC_DATA(sse2_pmaxs_w, INTR_TYPE_2OP, X86ISD::SMAX, 0), + X86_INTRINSIC_DATA(sse2_pmaxu_b, INTR_TYPE_2OP, X86ISD::UMAX, 0), + X86_INTRINSIC_DATA(sse2_pmins_w, INTR_TYPE_2OP, X86ISD::SMIN, 0), + X86_INTRINSIC_DATA(sse2_pminu_b, INTR_TYPE_2OP, X86ISD::UMIN, 0), + X86_INTRINSIC_DATA(sse2_psll_d, INTR_TYPE_2OP, X86ISD::VSHL, 0), + X86_INTRINSIC_DATA(sse2_psll_q, INTR_TYPE_2OP, X86ISD::VSHL, 0), + X86_INTRINSIC_DATA(sse2_psll_w, INTR_TYPE_2OP, X86ISD::VSHL, 0), + X86_INTRINSIC_DATA(sse2_pslli_d, VSHIFT, X86ISD::VSHLI, 0), + X86_INTRINSIC_DATA(sse2_pslli_q, VSHIFT, X86ISD::VSHLI, 0), + X86_INTRINSIC_DATA(sse2_pslli_w, VSHIFT, X86ISD::VSHLI, 0), + X86_INTRINSIC_DATA(sse2_psra_d, INTR_TYPE_2OP, X86ISD::VSRA, 0), + X86_INTRINSIC_DATA(sse2_psra_w, INTR_TYPE_2OP, X86ISD::VSRA, 0), + X86_INTRINSIC_DATA(sse2_psrai_d, VSHIFT, X86ISD::VSRAI, 0), + X86_INTRINSIC_DATA(sse2_psrai_w, VSHIFT, X86ISD::VSRAI, 0), + X86_INTRINSIC_DATA(sse2_psrl_d, INTR_TYPE_2OP, X86ISD::VSRL, 0), + X86_INTRINSIC_DATA(sse2_psrl_q, INTR_TYPE_2OP, X86ISD::VSRL, 0), + X86_INTRINSIC_DATA(sse2_psrl_w, INTR_TYPE_2OP, X86ISD::VSRL, 0), + X86_INTRINSIC_DATA(sse2_psrli_d, VSHIFT, X86ISD::VSRLI, 0), + X86_INTRINSIC_DATA(sse2_psrli_q, VSHIFT, X86ISD::VSRLI, 0), + X86_INTRINSIC_DATA(sse2_psrli_w, VSHIFT, X86ISD::VSRLI, 0), + X86_INTRINSIC_DATA(sse2_psubus_b, INTR_TYPE_2OP, X86ISD::SUBUS, 0), + X86_INTRINSIC_DATA(sse2_psubus_w, INTR_TYPE_2OP, X86ISD::SUBUS, 0), + X86_INTRINSIC_DATA(sse2_sqrt_pd, INTR_TYPE_1OP, ISD::FSQRT, 0), + X86_INTRINSIC_DATA(sse2_ucomieq_sd, COMI, X86ISD::UCOMI, ISD::SETEQ), + X86_INTRINSIC_DATA(sse2_ucomige_sd, COMI, X86ISD::UCOMI, ISD::SETGE), + X86_INTRINSIC_DATA(sse2_ucomigt_sd, COMI, X86ISD::UCOMI, ISD::SETGT), + X86_INTRINSIC_DATA(sse2_ucomile_sd, COMI, X86ISD::UCOMI, ISD::SETLE), + X86_INTRINSIC_DATA(sse2_ucomilt_sd, COMI, X86ISD::UCOMI, ISD::SETLT), + X86_INTRINSIC_DATA(sse2_ucomineq_sd, COMI, X86ISD::UCOMI, ISD::SETNE), + X86_INTRINSIC_DATA(sse3_hadd_pd, INTR_TYPE_2OP, X86ISD::FHADD, 0), + X86_INTRINSIC_DATA(sse3_hadd_ps, INTR_TYPE_2OP, X86ISD::FHADD, 0), + X86_INTRINSIC_DATA(sse3_hsub_pd, INTR_TYPE_2OP, X86ISD::FHSUB, 0), + X86_INTRINSIC_DATA(sse3_hsub_ps, INTR_TYPE_2OP, X86ISD::FHSUB, 0), + X86_INTRINSIC_DATA(sse41_insertps, INTR_TYPE_3OP, X86ISD::INSERTPS, 0), + X86_INTRINSIC_DATA(sse41_pmaxsb, INTR_TYPE_2OP, X86ISD::SMAX, 0), + X86_INTRINSIC_DATA(sse41_pmaxsd, INTR_TYPE_2OP, 
X86ISD::SMAX, 0), + X86_INTRINSIC_DATA(sse41_pmaxud, INTR_TYPE_2OP, X86ISD::UMAX, 0), + X86_INTRINSIC_DATA(sse41_pmaxuw, INTR_TYPE_2OP, X86ISD::UMAX, 0), + X86_INTRINSIC_DATA(sse41_pminsb, INTR_TYPE_2OP, X86ISD::SMIN, 0), + X86_INTRINSIC_DATA(sse41_pminsd, INTR_TYPE_2OP, X86ISD::SMIN, 0), + X86_INTRINSIC_DATA(sse41_pminud, INTR_TYPE_2OP, X86ISD::UMIN, 0), + X86_INTRINSIC_DATA(sse41_pminuw, INTR_TYPE_2OP, X86ISD::UMIN, 0), + X86_INTRINSIC_DATA(sse_comieq_ss, COMI, X86ISD::COMI, ISD::SETEQ), + X86_INTRINSIC_DATA(sse_comige_ss, COMI, X86ISD::COMI, ISD::SETGE), + X86_INTRINSIC_DATA(sse_comigt_ss, COMI, X86ISD::COMI, ISD::SETGT), + X86_INTRINSIC_DATA(sse_comile_ss, COMI, X86ISD::COMI, ISD::SETLE), + X86_INTRINSIC_DATA(sse_comilt_ss, COMI, X86ISD::COMI, ISD::SETLT), + X86_INTRINSIC_DATA(sse_comineq_ss, COMI, X86ISD::COMI, ISD::SETNE), + X86_INTRINSIC_DATA(sse_sqrt_ps, INTR_TYPE_1OP, ISD::FSQRT, 0), + X86_INTRINSIC_DATA(sse_ucomieq_ss, COMI, X86ISD::UCOMI, ISD::SETEQ), + X86_INTRINSIC_DATA(sse_ucomige_ss, COMI, X86ISD::UCOMI, ISD::SETGE), + X86_INTRINSIC_DATA(sse_ucomigt_ss, COMI, X86ISD::UCOMI, ISD::SETGT), + X86_INTRINSIC_DATA(sse_ucomile_ss, COMI, X86ISD::UCOMI, ISD::SETLE), + X86_INTRINSIC_DATA(sse_ucomilt_ss, COMI, X86ISD::UCOMI, ISD::SETLT), + X86_INTRINSIC_DATA(sse_ucomineq_ss, COMI, X86ISD::UCOMI, ISD::SETNE), + X86_INTRINSIC_DATA(ssse3_phadd_d_128, INTR_TYPE_2OP, X86ISD::HADD, 0), + X86_INTRINSIC_DATA(ssse3_phadd_w_128, INTR_TYPE_2OP, X86ISD::HADD, 0), + X86_INTRINSIC_DATA(ssse3_phsub_d_128, INTR_TYPE_2OP, X86ISD::HSUB, 0), + X86_INTRINSIC_DATA(ssse3_phsub_w_128, INTR_TYPE_2OP, X86ISD::HSUB, 0) +}; + +/* + * Retrieve data for Intrinsic without chain. + * Return nullptr if intrinsic is not defined in the table. + */ +static const IntrinsicData* getIntrinsicWithoutChain(unsigned IntNo) { + IntrinsicData IntrinsicToFind = { IntNo, INTR_NO_TYPE, 0, 0 }; + const IntrinsicData *Data = std::lower_bound(std::begin(IntrinsicsWithoutChain), + std::end(IntrinsicsWithoutChain), + IntrinsicToFind); + if (Data != std::end(IntrinsicsWithoutChain) && *Data == IntrinsicToFind) + return Data; + return nullptr; +} + +static void verifyIntrinsicTables() { + assert(std::is_sorted(std::begin(IntrinsicsWithoutChain), + std::end(IntrinsicsWithoutChain)) && + std::is_sorted(std::begin(IntrinsicsWithChain), + std::end(IntrinsicsWithChain)) && + "Intrinsic data tables should be sorted by Intrinsic ID"); +} + +} // End llvm namespace + +#endif diff --git a/lib/Target/X86/X86JITInfo.cpp b/lib/Target/X86/X86JITInfo.cpp deleted file mode 100644 index a082c4f..0000000 --- a/lib/Target/X86/X86JITInfo.cpp +++ /dev/null @@ -1,588 +0,0 @@ -//===-- X86JITInfo.cpp - Implement the JIT interfaces for the X86 target --===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file implements the JIT interfaces for the X86 target. 
-// -//===----------------------------------------------------------------------===// - -#include "X86JITInfo.h" -#include "X86Relocations.h" -#include "X86Subtarget.h" -#include "X86TargetMachine.h" -#include "llvm/IR/Function.h" -#include "llvm/Support/Compiler.h" -#include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/Valgrind.h" -#include <cstdlib> -#include <cstring> -using namespace llvm; - -#define DEBUG_TYPE "jit" - -// Determine the platform we're running on -#if defined (__x86_64__) || defined (_M_AMD64) || defined (_M_X64) -# define X86_64_JIT -#elif defined(__i386__) || defined(i386) || defined(_M_IX86) -# define X86_32_JIT -#endif - -void X86JITInfo::replaceMachineCodeForFunction(void *Old, void *New) { - unsigned char *OldByte = (unsigned char *)Old; - *OldByte++ = 0xE9; // Emit JMP opcode. - unsigned *OldWord = (unsigned *)OldByte; - unsigned NewAddr = (intptr_t)New; - unsigned OldAddr = (intptr_t)OldWord; - *OldWord = NewAddr - OldAddr - 4; // Emit PC-relative addr of New code. - - // X86 doesn't need to invalidate the processor cache, so just invalidate - // Valgrind's cache directly. - sys::ValgrindDiscardTranslations(Old, 5); -} - - -/// JITCompilerFunction - This contains the address of the JIT function used to -/// compile a function lazily. -static TargetJITInfo::JITCompilerFn JITCompilerFunction; - -// Get the ASMPREFIX for the current host. This is often '_'. -#ifndef __USER_LABEL_PREFIX__ -#define __USER_LABEL_PREFIX__ -#endif -#define GETASMPREFIX2(X) #X -#define GETASMPREFIX(X) GETASMPREFIX2(X) -#define ASMPREFIX GETASMPREFIX(__USER_LABEL_PREFIX__) - -// For ELF targets, use a .size and .type directive, to let tools -// know the extent of functions defined in assembler. -#if defined(__ELF__) -# define SIZE(sym) ".size " #sym ", . - " #sym "\n" -# define TYPE_FUNCTION(sym) ".type " #sym ", @function\n" -#else -# define SIZE(sym) -# define TYPE_FUNCTION(sym) -#endif - -// Provide a convenient way for disabling usage of CFI directives. -// This is needed for old/broken assemblers (for example, gas on -// Darwin is pretty old and doesn't support these directives) -#if defined(__APPLE__) -# define CFI(x) -#else -// FIXME: Disable this until we really want to use it. Also, we will -// need to add some workarounds for compilers, which support -// only subset of these directives. -# define CFI(x) -#endif - -// Provide a wrapper for LLVMX86CompilationCallback2 that saves non-traditional -// callee saved registers, for the fastcc calling convention. -extern "C" { -#if defined(X86_64_JIT) -# ifndef _MSC_VER - // No need to save EAX/EDX for X86-64. - void X86CompilationCallback(void); - asm( - ".text\n" - ".align 8\n" - ".globl " ASMPREFIX "X86CompilationCallback\n" - TYPE_FUNCTION(X86CompilationCallback) - ASMPREFIX "X86CompilationCallback:\n" - CFI(".cfi_startproc\n") - // Save RBP - "pushq %rbp\n" - CFI(".cfi_def_cfa_offset 16\n") - CFI(".cfi_offset %rbp, -16\n") - // Save RSP - "movq %rsp, %rbp\n" - CFI(".cfi_def_cfa_register %rbp\n") - // Save all int arg registers - "pushq %rdi\n" - CFI(".cfi_rel_offset %rdi, 0\n") - "pushq %rsi\n" - CFI(".cfi_rel_offset %rsi, 8\n") - "pushq %rdx\n" - CFI(".cfi_rel_offset %rdx, 16\n") - "pushq %rcx\n" - CFI(".cfi_rel_offset %rcx, 24\n") - "pushq %r8\n" - CFI(".cfi_rel_offset %r8, 32\n") - "pushq %r9\n" - CFI(".cfi_rel_offset %r9, 40\n") - // Align stack on 16-byte boundary. ESP might not be properly aligned - // (8 byte) if this is called from an indirect stub. 
- "andq $-16, %rsp\n" - // Save all XMM arg registers - "subq $128, %rsp\n" - "movaps %xmm0, (%rsp)\n" - "movaps %xmm1, 16(%rsp)\n" - "movaps %xmm2, 32(%rsp)\n" - "movaps %xmm3, 48(%rsp)\n" - "movaps %xmm4, 64(%rsp)\n" - "movaps %xmm5, 80(%rsp)\n" - "movaps %xmm6, 96(%rsp)\n" - "movaps %xmm7, 112(%rsp)\n" - // JIT callee -#if defined(_WIN64) || defined(__CYGWIN__) - "subq $32, %rsp\n" - "movq %rbp, %rcx\n" // Pass prev frame and return address - "movq 8(%rbp), %rdx\n" - "call " ASMPREFIX "LLVMX86CompilationCallback2\n" - "addq $32, %rsp\n" -#else - "movq %rbp, %rdi\n" // Pass prev frame and return address - "movq 8(%rbp), %rsi\n" - "call " ASMPREFIX "LLVMX86CompilationCallback2\n" -#endif - // Restore all XMM arg registers - "movaps 112(%rsp), %xmm7\n" - "movaps 96(%rsp), %xmm6\n" - "movaps 80(%rsp), %xmm5\n" - "movaps 64(%rsp), %xmm4\n" - "movaps 48(%rsp), %xmm3\n" - "movaps 32(%rsp), %xmm2\n" - "movaps 16(%rsp), %xmm1\n" - "movaps (%rsp), %xmm0\n" - // Restore RSP - "movq %rbp, %rsp\n" - CFI(".cfi_def_cfa_register %rsp\n") - // Restore all int arg registers - "subq $48, %rsp\n" - CFI(".cfi_adjust_cfa_offset 48\n") - "popq %r9\n" - CFI(".cfi_adjust_cfa_offset -8\n") - CFI(".cfi_restore %r9\n") - "popq %r8\n" - CFI(".cfi_adjust_cfa_offset -8\n") - CFI(".cfi_restore %r8\n") - "popq %rcx\n" - CFI(".cfi_adjust_cfa_offset -8\n") - CFI(".cfi_restore %rcx\n") - "popq %rdx\n" - CFI(".cfi_adjust_cfa_offset -8\n") - CFI(".cfi_restore %rdx\n") - "popq %rsi\n" - CFI(".cfi_adjust_cfa_offset -8\n") - CFI(".cfi_restore %rsi\n") - "popq %rdi\n" - CFI(".cfi_adjust_cfa_offset -8\n") - CFI(".cfi_restore %rdi\n") - // Restore RBP - "popq %rbp\n" - CFI(".cfi_adjust_cfa_offset -8\n") - CFI(".cfi_restore %rbp\n") - "ret\n" - CFI(".cfi_endproc\n") - SIZE(X86CompilationCallback) - ); -# else - // No inline assembler support on this platform. The routine is in external - // file. - void X86CompilationCallback(); - -# endif -#elif defined (X86_32_JIT) -# ifndef _MSC_VER - void X86CompilationCallback(void); - asm( - ".text\n" - ".align 8\n" - ".globl " ASMPREFIX "X86CompilationCallback\n" - TYPE_FUNCTION(X86CompilationCallback) - ASMPREFIX "X86CompilationCallback:\n" - CFI(".cfi_startproc\n") - "pushl %ebp\n" - CFI(".cfi_def_cfa_offset 8\n") - CFI(".cfi_offset %ebp, -8\n") - "movl %esp, %ebp\n" // Standard prologue - CFI(".cfi_def_cfa_register %ebp\n") - "pushl %eax\n" - CFI(".cfi_rel_offset %eax, 0\n") - "pushl %edx\n" // Save EAX/EDX/ECX - CFI(".cfi_rel_offset %edx, 4\n") - "pushl %ecx\n" - CFI(".cfi_rel_offset %ecx, 8\n") -# if defined(__APPLE__) - "andl $-16, %esp\n" // Align ESP on 16-byte boundary -# endif - "subl $16, %esp\n" - "movl 4(%ebp), %eax\n" // Pass prev frame and return address - "movl %eax, 4(%esp)\n" - "movl %ebp, (%esp)\n" - "call " ASMPREFIX "LLVMX86CompilationCallback2\n" - "movl %ebp, %esp\n" // Restore ESP - CFI(".cfi_def_cfa_register %esp\n") - "subl $12, %esp\n" - CFI(".cfi_adjust_cfa_offset 12\n") - "popl %ecx\n" - CFI(".cfi_adjust_cfa_offset -4\n") - CFI(".cfi_restore %ecx\n") - "popl %edx\n" - CFI(".cfi_adjust_cfa_offset -4\n") - CFI(".cfi_restore %edx\n") - "popl %eax\n" - CFI(".cfi_adjust_cfa_offset -4\n") - CFI(".cfi_restore %eax\n") - "popl %ebp\n" - CFI(".cfi_adjust_cfa_offset -4\n") - CFI(".cfi_restore %ebp\n") - "ret\n" - CFI(".cfi_endproc\n") - SIZE(X86CompilationCallback) - ); - - // Same as X86CompilationCallback but also saves XMM argument registers. 
- void X86CompilationCallback_SSE(void); - asm( - ".text\n" - ".align 8\n" - ".globl " ASMPREFIX "X86CompilationCallback_SSE\n" - TYPE_FUNCTION(X86CompilationCallback_SSE) - ASMPREFIX "X86CompilationCallback_SSE:\n" - CFI(".cfi_startproc\n") - "pushl %ebp\n" - CFI(".cfi_def_cfa_offset 8\n") - CFI(".cfi_offset %ebp, -8\n") - "movl %esp, %ebp\n" // Standard prologue - CFI(".cfi_def_cfa_register %ebp\n") - "pushl %eax\n" - CFI(".cfi_rel_offset %eax, 0\n") - "pushl %edx\n" // Save EAX/EDX/ECX - CFI(".cfi_rel_offset %edx, 4\n") - "pushl %ecx\n" - CFI(".cfi_rel_offset %ecx, 8\n") - "andl $-16, %esp\n" // Align ESP on 16-byte boundary - // Save all XMM arg registers - "subl $64, %esp\n" - // FIXME: provide frame move information for xmm registers. - // This can be tricky, because CFA register is ebp (unaligned) - // and we need to produce offsets relative to it. - "movaps %xmm0, (%esp)\n" - "movaps %xmm1, 16(%esp)\n" - "movaps %xmm2, 32(%esp)\n" - "movaps %xmm3, 48(%esp)\n" - "subl $16, %esp\n" - "movl 4(%ebp), %eax\n" // Pass prev frame and return address - "movl %eax, 4(%esp)\n" - "movl %ebp, (%esp)\n" - "call " ASMPREFIX "LLVMX86CompilationCallback2\n" - "addl $16, %esp\n" - "movaps 48(%esp), %xmm3\n" - CFI(".cfi_restore %xmm3\n") - "movaps 32(%esp), %xmm2\n" - CFI(".cfi_restore %xmm2\n") - "movaps 16(%esp), %xmm1\n" - CFI(".cfi_restore %xmm1\n") - "movaps (%esp), %xmm0\n" - CFI(".cfi_restore %xmm0\n") - "movl %ebp, %esp\n" // Restore ESP - CFI(".cfi_def_cfa_register esp\n") - "subl $12, %esp\n" - CFI(".cfi_adjust_cfa_offset 12\n") - "popl %ecx\n" - CFI(".cfi_adjust_cfa_offset -4\n") - CFI(".cfi_restore %ecx\n") - "popl %edx\n" - CFI(".cfi_adjust_cfa_offset -4\n") - CFI(".cfi_restore %edx\n") - "popl %eax\n" - CFI(".cfi_adjust_cfa_offset -4\n") - CFI(".cfi_restore %eax\n") - "popl %ebp\n" - CFI(".cfi_adjust_cfa_offset -4\n") - CFI(".cfi_restore %ebp\n") - "ret\n" - CFI(".cfi_endproc\n") - SIZE(X86CompilationCallback_SSE) - ); -# else - void LLVMX86CompilationCallback2(intptr_t *StackPtr, intptr_t RetAddr); - - _declspec(naked) void X86CompilationCallback(void) { - __asm { - push ebp - mov ebp, esp - push eax - push edx - push ecx - and esp, -16 - sub esp, 16 - mov eax, dword ptr [ebp+4] - mov dword ptr [esp+4], eax - mov dword ptr [esp], ebp - call LLVMX86CompilationCallback2 - mov esp, ebp - sub esp, 12 - pop ecx - pop edx - pop eax - pop ebp - ret - } - } - -# endif // _MSC_VER - -#else // Not an i386 host - void X86CompilationCallback() { - llvm_unreachable("Cannot call X86CompilationCallback() on a non-x86 arch!"); - } -#endif -} - -/// This is the target-specific function invoked by the -/// function stub when we did not know the real target of a call. This function -/// must locate the start of the stub or call site and pass it into the JIT -/// compiler function. -extern "C" { -LLVM_ATTRIBUTE_USED // Referenced from inline asm. -LLVM_LIBRARY_VISIBILITY void LLVMX86CompilationCallback2(intptr_t *StackPtr, - intptr_t RetAddr) { - intptr_t *RetAddrLoc = &StackPtr[1]; - // We are reading raw stack data here. Tell MemorySanitizer that it is - // sufficiently initialized. - __msan_unpoison(RetAddrLoc, sizeof(*RetAddrLoc)); - assert(*RetAddrLoc == RetAddr && - "Could not find return address on the stack!"); - - // It's a stub if there is an interrupt marker after the call. - bool isStub = ((unsigned char*)RetAddr)[0] == 0xCE; - - // The call instruction should have pushed the return value onto the stack... -#if defined (X86_64_JIT) - RetAddr--; // Backtrack to the reference itself... 
-#else - RetAddr -= 4; // Backtrack to the reference itself... -#endif - -#if 0 - DEBUG(dbgs() << "In callback! Addr=" << (void*)RetAddr - << " ESP=" << (void*)StackPtr - << ": Resolving call to function: " - << TheVM->getFunctionReferencedName((void*)RetAddr) << "\n"); -#endif - - // Sanity check to make sure this really is a call instruction. -#if defined (X86_64_JIT) - assert(((unsigned char*)RetAddr)[-2] == 0x41 &&"Not a call instr!"); - assert(((unsigned char*)RetAddr)[-1] == 0xFF &&"Not a call instr!"); -#else - assert(((unsigned char*)RetAddr)[-1] == 0xE8 &&"Not a call instr!"); -#endif - - intptr_t NewVal = (intptr_t)JITCompilerFunction((void*)RetAddr); - - // Rewrite the call target... so that we don't end up here every time we - // execute the call. -#if defined (X86_64_JIT) - assert(isStub && - "X86-64 doesn't support rewriting non-stub lazy compilation calls:" - " the call instruction varies too much."); -#else - *(intptr_t *)RetAddr = (intptr_t)(NewVal-RetAddr-4); -#endif - - if (isStub) { - // If this is a stub, rewrite the call into an unconditional branch - // instruction so that two return addresses are not pushed onto the stack - // when the requested function finally gets called. This also makes the - // 0xCE byte (interrupt) dead, so the marker doesn't effect anything. -#if defined (X86_64_JIT) - // If the target address is within 32-bit range of the stub, use a - // PC-relative branch instead of loading the actual address. (This is - // considerably shorter than the 64-bit immediate load already there.) - // We assume here intptr_t is 64 bits. - intptr_t diff = NewVal-RetAddr+7; - if (diff >= -2147483648LL && diff <= 2147483647LL) { - *(unsigned char*)(RetAddr-0xc) = 0xE9; - *(intptr_t *)(RetAddr-0xb) = diff & 0xffffffff; - } else { - *(intptr_t *)(RetAddr - 0xa) = NewVal; - ((unsigned char*)RetAddr)[0] = (2 | (4 << 3) | (3 << 6)); - } - sys::ValgrindDiscardTranslations((void*)(RetAddr-0xc), 0xd); -#else - ((unsigned char*)RetAddr)[-1] = 0xE9; - sys::ValgrindDiscardTranslations((void*)(RetAddr-1), 5); -#endif - } - - // Change the return address to reexecute the call instruction... -#if defined (X86_64_JIT) - *RetAddrLoc -= 0xd; -#else - *RetAddrLoc -= 5; -#endif -} -} - -TargetJITInfo::LazyResolverFn -X86JITInfo::getLazyResolverFunction(JITCompilerFn F) { - TsanIgnoreWritesBegin(); - JITCompilerFunction = F; - TsanIgnoreWritesEnd(); - -#if defined (X86_32_JIT) && !defined (_MSC_VER) -#if defined(__SSE__) - // SSE Callback should be called for SSE-enabled LLVM. 
- return X86CompilationCallback_SSE; -#else - if (useSSE) - return X86CompilationCallback_SSE; -#endif -#endif - - return X86CompilationCallback; -} - -X86JITInfo::X86JITInfo(bool UseSSE) { - useSSE = UseSSE; - useGOT = 0; - TLSOffset = nullptr; -} - -void *X86JITInfo::emitGlobalValueIndirectSym(const GlobalValue* GV, void *ptr, - JITCodeEmitter &JCE) { -#if defined (X86_64_JIT) - const unsigned Alignment = 8; - uint8_t Buffer[8]; - uint8_t *Cur = Buffer; - MachineCodeEmitter::emitWordLEInto(Cur, (unsigned)(intptr_t)ptr); - MachineCodeEmitter::emitWordLEInto(Cur, (unsigned)(((intptr_t)ptr) >> 32)); -#else - const unsigned Alignment = 4; - uint8_t Buffer[4]; - uint8_t *Cur = Buffer; - MachineCodeEmitter::emitWordLEInto(Cur, (intptr_t)ptr); -#endif - return JCE.allocIndirectGV(GV, Buffer, sizeof(Buffer), Alignment); -} - -TargetJITInfo::StubLayout X86JITInfo::getStubLayout() { - // The 64-bit stub contains: - // movabs r10 <- 8-byte-target-address # 10 bytes - // call|jmp *r10 # 3 bytes - // The 32-bit stub contains a 5-byte call|jmp. - // If the stub is a call to the compilation callback, an extra byte is added - // to mark it as a stub. - StubLayout Result = {14, 4}; - return Result; -} - -void *X86JITInfo::emitFunctionStub(const Function* F, void *Target, - JITCodeEmitter &JCE) { - // Note, we cast to intptr_t here to silence a -pedantic warning that - // complains about casting a function pointer to a normal pointer. -#if defined (X86_32_JIT) && !defined (_MSC_VER) - bool NotCC = (Target != (void*)(intptr_t)X86CompilationCallback && - Target != (void*)(intptr_t)X86CompilationCallback_SSE); -#else - bool NotCC = Target != (void*)(intptr_t)X86CompilationCallback; -#endif - JCE.emitAlignment(4); - void *Result = (void*)JCE.getCurrentPCValue(); - if (NotCC) { -#if defined (X86_64_JIT) - JCE.emitByte(0x49); // REX prefix - JCE.emitByte(0xB8+2); // movabsq r10 - JCE.emitWordLE((unsigned)(intptr_t)Target); - JCE.emitWordLE((unsigned)(((intptr_t)Target) >> 32)); - JCE.emitByte(0x41); // REX prefix - JCE.emitByte(0xFF); // jmpq *r10 - JCE.emitByte(2 | (4 << 3) | (3 << 6)); -#else - JCE.emitByte(0xE9); - JCE.emitWordLE((intptr_t)Target-JCE.getCurrentPCValue()-4); -#endif - return Result; - } - -#if defined (X86_64_JIT) - JCE.emitByte(0x49); // REX prefix - JCE.emitByte(0xB8+2); // movabsq r10 - JCE.emitWordLE((unsigned)(intptr_t)Target); - JCE.emitWordLE((unsigned)(((intptr_t)Target) >> 32)); - JCE.emitByte(0x41); // REX prefix - JCE.emitByte(0xFF); // callq *r10 - JCE.emitByte(2 | (2 << 3) | (3 << 6)); -#else - JCE.emitByte(0xE8); // Call with 32 bit pc-rel destination... - - JCE.emitWordLE((intptr_t)Target-JCE.getCurrentPCValue()-4); -#endif - - // This used to use 0xCD, but that value is used by JITMemoryManager to - // initialize the buffer with garbage, which means it may follow a - // noreturn function call, confusing LLVMX86CompilationCallback2. PR 4929. - JCE.emitByte(0xCE); // Interrupt - Just a marker identifying the stub! - return Result; -} - -/// getPICJumpTableEntry - Returns the value of the jumptable entry for the -/// specific basic block. 
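For reference, the {14, 4} returned by getStubLayout above is exactly the byte sequence emitFunctionStub lays down for the 64-bit lazy-compilation stub. Spelled out as data (illustration only; the eight zero bytes stand in for the target address patched in at emit time):

// 13 bytes of code plus the trailing marker byte = 14, aligned to 4.
static const unsigned char Stub64[14] = {
  0x49, 0xBA,              // movabsq $target, %r10
  0x00, 0x00, 0x00, 0x00,  // imm64 target address,
  0x00, 0x00, 0x00, 0x00,  //   filled in by the emitter
  0x41, 0xFF, 0xD2,        // callq *%r10 (0xE2 = jmpq *%r10 for plain stubs)
  0xCE                     // interrupt marker identifying a callback stub
};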
-uintptr_t X86JITInfo::getPICJumpTableEntry(uintptr_t BB, uintptr_t Entry) { -#if defined(X86_64_JIT) - return BB - Entry; -#else - return BB - PICBase; -#endif -} - -template<typename T> static void addUnaligned(void *Pos, T Delta) { - T Value; - std::memcpy(reinterpret_cast<char*>(&Value), reinterpret_cast<char*>(Pos), - sizeof(T)); - Value += Delta; - std::memcpy(reinterpret_cast<char*>(Pos), reinterpret_cast<char*>(&Value), - sizeof(T)); -} - -/// relocate - Before the JIT can run a block of code that has been emitted, -/// it must rewrite the code to contain the actual addresses of any -/// referenced global symbols. -void X86JITInfo::relocate(void *Function, MachineRelocation *MR, - unsigned NumRelocs, unsigned char* GOTBase) { - for (unsigned i = 0; i != NumRelocs; ++i, ++MR) { - void *RelocPos = (char*)Function + MR->getMachineCodeOffset(); - intptr_t ResultPtr = (intptr_t)MR->getResultPointer(); - switch ((X86::RelocationType)MR->getRelocationType()) { - case X86::reloc_pcrel_word: { - // PC relative relocation, add the relocated value to the value already in - // memory, after we adjust it for where the PC is. - ResultPtr = ResultPtr -(intptr_t)RelocPos - 4 - MR->getConstantVal(); - addUnaligned<unsigned>(RelocPos, ResultPtr); - break; - } - case X86::reloc_picrel_word: { - // PIC base relative relocation, add the relocated value to the value - // already in memory, after we adjust it for where the PIC base is. - ResultPtr = ResultPtr - ((intptr_t)Function + MR->getConstantVal()); - addUnaligned<unsigned>(RelocPos, ResultPtr); - break; - } - case X86::reloc_absolute_word: - case X86::reloc_absolute_word_sext: - // Absolute relocation, just add the relocated value to the value already - // in memory. - addUnaligned<unsigned>(RelocPos, ResultPtr); - break; - case X86::reloc_absolute_dword: - addUnaligned<intptr_t>(RelocPos, ResultPtr); - break; - } - } -} - -char* X86JITInfo::allocateThreadLocalMemory(size_t size) { -#if defined(X86_32_JIT) && !defined(__APPLE__) && !defined(_MSC_VER) - TLSOffset -= size; - return TLSOffset; -#else - llvm_unreachable("Cannot allocate thread local storage on this arch!"); -#endif -} diff --git a/lib/Target/X86/X86JITInfo.h b/lib/Target/X86/X86JITInfo.h deleted file mode 100644 index 564343f..0000000 --- a/lib/Target/X86/X86JITInfo.h +++ /dev/null @@ -1,79 +0,0 @@ -//===-- X86JITInfo.h - X86 implementation of the JIT interface --*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file contains the X86 implementation of the TargetJITInfo class. -// -//===----------------------------------------------------------------------===// - -#ifndef X86JITINFO_H -#define X86JITINFO_H - -#include "llvm/CodeGen/JITCodeEmitter.h" -#include "llvm/IR/Function.h" -#include "llvm/Target/TargetJITInfo.h" - -namespace llvm { - class X86Subtarget; - - class X86JITInfo : public TargetJITInfo { - uintptr_t PICBase; - char *TLSOffset; - bool useSSE; - public: - explicit X86JITInfo(bool UseSSE); - - /// replaceMachineCodeForFunction - Make it so that calling the function - /// whose machine code is at OLD turns into a call to NEW, perhaps by - /// overwriting OLD with a branch to NEW. This is used for self-modifying - /// code. 
- /// - void replaceMachineCodeForFunction(void *Old, void *New) override; - - /// emitGlobalValueIndirectSym - Use the specified JITCodeEmitter object - /// to emit an indirect symbol which contains the address of the specified - /// ptr. - void *emitGlobalValueIndirectSym(const GlobalValue* GV, void *ptr, - JITCodeEmitter &JCE) override; - - // getStubLayout - Returns the size and alignment of the largest call stub - // on X86. - StubLayout getStubLayout() override; - - /// emitFunctionStub - Use the specified JITCodeEmitter object to emit a - /// small native function that simply calls the function at the specified - /// address. - void *emitFunctionStub(const Function* F, void *Target, - JITCodeEmitter &JCE) override; - - /// getPICJumpTableEntry - Returns the value of the jumptable entry for the - /// specific basic block. - uintptr_t getPICJumpTableEntry(uintptr_t BB, uintptr_t JTBase) override; - - /// getLazyResolverFunction - Expose the lazy resolver to the JIT. - LazyResolverFn getLazyResolverFunction(JITCompilerFn) override; - - /// relocate - Before the JIT can run a block of code that has been emitted, - /// it must rewrite the code to contain the actual addresses of any - /// referenced global symbols. - void relocate(void *Function, MachineRelocation *MR, - unsigned NumRelocs, unsigned char* GOTBase) override; - - /// allocateThreadLocalMemory - Each target has its own way of - /// handling thread local variables. This method returns a value only - /// meaningful to the target. - char* allocateThreadLocalMemory(size_t size) override; - - /// setPICBase / getPICBase - Getter / setter of PICBase, used to compute - /// PIC jumptable entry. - void setPICBase(uintptr_t Base) { PICBase = Base; } - uintptr_t getPICBase() const { return PICBase; } - }; -} - -#endif diff --git a/lib/Target/X86/X86MCInstLower.cpp b/lib/Target/X86/X86MCInstLower.cpp index 2bd70a9..4e0d594 100644 --- a/lib/Target/X86/X86MCInstLower.cpp +++ b/lib/Target/X86/X86MCInstLower.cpp @@ -16,20 +16,25 @@ #include "X86RegisterInfo.h" #include "InstPrinter/X86ATTInstPrinter.h" #include "MCTargetDesc/X86BaseInfo.h" +#include "Utils/X86ShuffleDecode.h" #include "llvm/ADT/SmallString.h" #include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineConstantPool.h" +#include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineModuleInfoImpls.h" #include "llvm/CodeGen/StackMaps.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/GlobalValue.h" #include "llvm/IR/Mangler.h" #include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCCodeEmitter.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/MCInstBuilder.h" #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSymbol.h" +#include "llvm/Support/TargetRegistry.h" using namespace llvm; namespace { @@ -58,6 +63,53 @@ private: } // end anonymous namespace +// Emit a minimal sequence of nops spanning NumBytes bytes. 
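EmitNops, declared below and defined further down, covers a requested byte count with as few NOPs as possible. The underlying loop is greedy: keep emitting the largest NOP encoding that still fits until the count is exhausted. A schematic version (emitNopsSketch and EmitNopOfLength are illustrative names; the real function maps each length onto concrete NOOP/NOOPW/NOOPL encodings rather than taking a callback):

// Greedily cover NumBytes with NOPs of at most MaxNopLen bytes each.
static void emitNopsSketch(unsigned NumBytes, unsigned MaxNopLen,
                           void (*EmitNopOfLength)(unsigned)) {
  while (NumBytes) {
    unsigned ThisNop = NumBytes < MaxNopLen ? NumBytes : MaxNopLen;
    EmitNopOfLength(ThisNop);
    NumBytes -= ThisNop;
  }
}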
+static void EmitNops(MCStreamer &OS, unsigned NumBytes, bool Is64Bit, + const MCSubtargetInfo &STI); + +namespace llvm { + X86AsmPrinter::StackMapShadowTracker::StackMapShadowTracker(TargetMachine &TM) + : TM(TM), InShadow(false), RequiredShadowSize(0), CurrentShadowSize(0) {} + + X86AsmPrinter::StackMapShadowTracker::~StackMapShadowTracker() {} + + void + X86AsmPrinter::StackMapShadowTracker::startFunction(MachineFunction &MF) { + CodeEmitter.reset(TM.getTarget().createMCCodeEmitter( + *TM.getSubtargetImpl()->getInstrInfo(), + *TM.getSubtargetImpl()->getRegisterInfo(), *TM.getSubtargetImpl(), + MF.getContext())); + } + + void X86AsmPrinter::StackMapShadowTracker::count(MCInst &Inst, + const MCSubtargetInfo &STI) { + if (InShadow) { + SmallString<256> Code; + SmallVector<MCFixup, 4> Fixups; + raw_svector_ostream VecOS(Code); + CodeEmitter->EncodeInstruction(Inst, VecOS, Fixups, STI); + VecOS.flush(); + CurrentShadowSize += Code.size(); + if (CurrentShadowSize >= RequiredShadowSize) + InShadow = false; // The shadow is big enough. Stop counting. + } + } + + void X86AsmPrinter::StackMapShadowTracker::emitShadowPadding( + MCStreamer &OutStreamer, const MCSubtargetInfo &STI) { + if (InShadow && CurrentShadowSize < RequiredShadowSize) { + InShadow = false; + EmitNops(OutStreamer, RequiredShadowSize - CurrentShadowSize, + TM.getSubtarget<X86Subtarget>().is64Bit(), STI); + } + } + + void X86AsmPrinter::EmitAndCountInstruction(MCInst &Inst) { + OutStreamer.EmitInstruction(Inst, getSubtargetInfo()); + SMShadowTracker.count(Inst, getSubtargetInfo()); + } +} // end llvm namespace + X86MCInstLower::X86MCInstLower(const MachineFunction &mf, X86AsmPrinter &asmprinter) : Ctx(mf.getContext()), MF(mf), TM(mf.getTarget()), @@ -72,7 +124,7 @@ MachineModuleInfoMachO &X86MCInstLower::getMachOMMI() const { /// operand to an MCSymbol. MCSymbol *X86MCInstLower:: GetSymbolFromOperand(const MachineOperand &MO) const { - const DataLayout *DL = TM.getDataLayout(); + const DataLayout *DL = TM.getSubtargetImpl()->getDataLayout(); assert((MO.isGlobal() || MO.isSymbol() || MO.isMBB()) && "Isn't a symbol reference"); SmallString<128> Name; @@ -212,7 +264,8 @@ MCOperand X86MCInstLower::LowerSymbolOperand(const MachineOperand &MO, Expr = MCBinaryExpr::CreateSub(Expr, MCSymbolRefExpr::Create(MF.getPICBaseSymbol(), Ctx), Ctx); - if (MO.isJTI() && MAI.hasSetDirective()) { + if (MO.isJTI()) { + assert(MAI.doesSetDirectiveSuppressesReloc()); // If .set directive is supported, use it to reduce the number of // relocations the assembler will generate for differences between // local labels. 
This is only safe when the symbols are in the same @@ -531,14 +584,38 @@ ReSimplify: // Atomic load and store require a separate pseudo-inst because Acquire // implies mayStore and Release implies mayLoad; fix these to regular MOV // instructions here - case X86::ACQUIRE_MOV8rm: OutMI.setOpcode(X86::MOV8rm); goto ReSimplify; - case X86::ACQUIRE_MOV16rm: OutMI.setOpcode(X86::MOV16rm); goto ReSimplify; - case X86::ACQUIRE_MOV32rm: OutMI.setOpcode(X86::MOV32rm); goto ReSimplify; - case X86::ACQUIRE_MOV64rm: OutMI.setOpcode(X86::MOV64rm); goto ReSimplify; - case X86::RELEASE_MOV8mr: OutMI.setOpcode(X86::MOV8mr); goto ReSimplify; - case X86::RELEASE_MOV16mr: OutMI.setOpcode(X86::MOV16mr); goto ReSimplify; - case X86::RELEASE_MOV32mr: OutMI.setOpcode(X86::MOV32mr); goto ReSimplify; - case X86::RELEASE_MOV64mr: OutMI.setOpcode(X86::MOV64mr); goto ReSimplify; + case X86::ACQUIRE_MOV8rm: OutMI.setOpcode(X86::MOV8rm); goto ReSimplify; + case X86::ACQUIRE_MOV16rm: OutMI.setOpcode(X86::MOV16rm); goto ReSimplify; + case X86::ACQUIRE_MOV32rm: OutMI.setOpcode(X86::MOV32rm); goto ReSimplify; + case X86::ACQUIRE_MOV64rm: OutMI.setOpcode(X86::MOV64rm); goto ReSimplify; + case X86::RELEASE_MOV8mr: OutMI.setOpcode(X86::MOV8mr); goto ReSimplify; + case X86::RELEASE_MOV16mr: OutMI.setOpcode(X86::MOV16mr); goto ReSimplify; + case X86::RELEASE_MOV32mr: OutMI.setOpcode(X86::MOV32mr); goto ReSimplify; + case X86::RELEASE_MOV64mr: OutMI.setOpcode(X86::MOV64mr); goto ReSimplify; + case X86::RELEASE_MOV8mi: OutMI.setOpcode(X86::MOV8mi); goto ReSimplify; + case X86::RELEASE_MOV16mi: OutMI.setOpcode(X86::MOV16mi); goto ReSimplify; + case X86::RELEASE_MOV32mi: OutMI.setOpcode(X86::MOV32mi); goto ReSimplify; + case X86::RELEASE_MOV64mi32: OutMI.setOpcode(X86::MOV64mi32); goto ReSimplify; + case X86::RELEASE_ADD8mi: OutMI.setOpcode(X86::ADD8mi); goto ReSimplify; + case X86::RELEASE_ADD32mi: OutMI.setOpcode(X86::ADD32mi); goto ReSimplify; + case X86::RELEASE_ADD64mi32: OutMI.setOpcode(X86::ADD64mi32); goto ReSimplify; + case X86::RELEASE_AND8mi: OutMI.setOpcode(X86::AND8mi); goto ReSimplify; + case X86::RELEASE_AND32mi: OutMI.setOpcode(X86::AND32mi); goto ReSimplify; + case X86::RELEASE_AND64mi32: OutMI.setOpcode(X86::AND64mi32); goto ReSimplify; + case X86::RELEASE_OR8mi: OutMI.setOpcode(X86::OR8mi); goto ReSimplify; + case X86::RELEASE_OR32mi: OutMI.setOpcode(X86::OR32mi); goto ReSimplify; + case X86::RELEASE_OR64mi32: OutMI.setOpcode(X86::OR64mi32); goto ReSimplify; + case X86::RELEASE_XOR8mi: OutMI.setOpcode(X86::XOR8mi); goto ReSimplify; + case X86::RELEASE_XOR32mi: OutMI.setOpcode(X86::XOR32mi); goto ReSimplify; + case X86::RELEASE_XOR64mi32: OutMI.setOpcode(X86::XOR64mi32); goto ReSimplify; + case X86::RELEASE_INC8m: OutMI.setOpcode(X86::INC8m); goto ReSimplify; + case X86::RELEASE_INC16m: OutMI.setOpcode(X86::INC16m); goto ReSimplify; + case X86::RELEASE_INC32m: OutMI.setOpcode(X86::INC32m); goto ReSimplify; + case X86::RELEASE_INC64m: OutMI.setOpcode(X86::INC64m); goto ReSimplify; + case X86::RELEASE_DEC8m: OutMI.setOpcode(X86::DEC8m); goto ReSimplify; + case X86::RELEASE_DEC16m: OutMI.setOpcode(X86::DEC16m); goto ReSimplify; + case X86::RELEASE_DEC32m: OutMI.setOpcode(X86::DEC32m); goto ReSimplify; + case X86::RELEASE_DEC64m: OutMI.setOpcode(X86::DEC64m); goto ReSimplify; // We don't currently select the correct instruction form for instructions // which have a short %eax, etc. form. 
Handle this by custom lowering, for @@ -602,10 +679,8 @@ ReSimplify: } } -static void LowerTlsAddr(MCStreamer &OutStreamer, - X86MCInstLower &MCInstLowering, - const MachineInstr &MI, - const MCSubtargetInfo& STI) { +void X86AsmPrinter::LowerTlsAddr(X86MCInstLower &MCInstLowering, + const MachineInstr &MI) { bool is64Bits = MI.getOpcode() == X86::TLS_addr64 || MI.getOpcode() == X86::TLS_base_addr64; @@ -615,7 +690,7 @@ static void LowerTlsAddr(MCStreamer &OutStreamer, MCContext &context = OutStreamer.getContext(); if (needsPadding) - OutStreamer.EmitInstruction(MCInstBuilder(X86::DATA16_PREFIX), STI); + EmitAndCountInstruction(MCInstBuilder(X86::DATA16_PREFIX)); MCSymbolRefExpr::VariantKind SRVK; switch (MI.getOpcode()) { @@ -662,12 +737,12 @@ static void LowerTlsAddr(MCStreamer &OutStreamer, LEA.addOperand(MCOperand::CreateExpr(symRef)); // disp LEA.addOperand(MCOperand::CreateReg(0)); // seg } - OutStreamer.EmitInstruction(LEA, STI); + EmitAndCountInstruction(LEA); if (needsPadding) { - OutStreamer.EmitInstruction(MCInstBuilder(X86::DATA16_PREFIX), STI); - OutStreamer.EmitInstruction(MCInstBuilder(X86::DATA16_PREFIX), STI); - OutStreamer.EmitInstruction(MCInstBuilder(X86::REX64_PREFIX), STI); + EmitAndCountInstruction(MCInstBuilder(X86::DATA16_PREFIX)); + EmitAndCountInstruction(MCInstBuilder(X86::DATA16_PREFIX)); + EmitAndCountInstruction(MCInstBuilder(X86::REX64_PREFIX)); } StringRef name = is64Bits ? "__tls_get_addr" : "___tls_get_addr"; @@ -677,9 +752,9 @@ static void LowerTlsAddr(MCStreamer &OutStreamer, MCSymbolRefExpr::VK_PLT, context); - OutStreamer.EmitInstruction(MCInstBuilder(is64Bits ? X86::CALL64pcrel32 - : X86::CALLpcrel32) - .addExpr(tlsRef), STI); + EmitAndCountInstruction(MCInstBuilder(is64Bits ? X86::CALL64pcrel32 + : X86::CALLpcrel32) + .addExpr(tlsRef)); } /// \brief Emit the optimal amount of multi-byte nops on X86. @@ -725,10 +800,9 @@ static void EmitNops(MCStreamer &OS, unsigned NumBytes, bool Is64Bit, const MCSu break; case X86::NOOPL: case X86::NOOPW: - OS.EmitInstruction(MCInstBuilder(Opc).addReg(BaseReg).addImm(ScaleVal) - .addReg(IndexReg) - .addImm(Displacement) - .addReg(SegmentReg), STI); + OS.EmitInstruction(MCInstBuilder(Opc).addReg(BaseReg) + .addImm(ScaleVal).addReg(IndexReg) + .addImm(Displacement).addReg(SegmentReg), STI); break; } } // while (NumBytes) @@ -736,22 +810,20 @@ static void EmitNops(MCStreamer &OS, unsigned NumBytes, bool Is64Bit, const MCSu // Lower a stackmap of the form: // <id>, <shadowBytes>, ... -static void LowerSTACKMAP(MCStreamer &OS, StackMaps &SM, - const MachineInstr &MI, bool Is64Bit, const MCSubtargetInfo& STI) { - unsigned NumBytes = MI.getOperand(1).getImm(); +void X86AsmPrinter::LowerSTACKMAP(const MachineInstr &MI) { + SMShadowTracker.emitShadowPadding(OutStreamer, getSubtargetInfo()); SM.recordStackMap(MI); - // Emit padding. - // FIXME: These nops ensure that the stackmap's shadow is covered by - // instructions from the same basic block, but the nops should not be - // necessary if instructions from the same block follow the stackmap. - EmitNops(OS, NumBytes, Is64Bit, STI); + unsigned NumShadowBytes = MI.getOperand(1).getImm(); + SMShadowTracker.reset(NumShadowBytes); } // Lower a patchpoint of the form: // [<def>], <id>, <numBytes>, <target>, <numArgs>, <cc>, ... 
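The rewritten LowerSTACKMAP above no longer emits its NOP padding eagerly: it flushes any padding still owed from a previous stackmap, records the new stackmap, and then arms SMShadowTracker with the requested shadow size so that the bytes of whatever real instructions follow are counted against it (the same tracker is flushed again at the start of LowerPATCHPOINT below and before calls). A minimal self-contained sketch of that bookkeeping, with invented names and byte counts and none of the MC machinery the real StackMapShadowTracker uses:

#include <cstdio>

struct ShadowTrackerSketch {
  bool InShadow = false;
  unsigned Required = 0;
  unsigned Current = 0;

  // A STACKMAP with <shadowBytes> was lowered: start (re)counting.
  void reset(unsigned RequiredShadowSize) {
    InShadow = true;
    Required = RequiredShadowSize;
    Current = 0;
  }

  // An ordinary instruction of EncodedBytes was emitted after the stackmap.
  void count(unsigned EncodedBytes) {
    if (!InShadow)
      return;
    Current += EncodedBytes;
    if (Current >= Required)
      InShadow = false; // Shadow already covered by real instructions.
  }

  // Reaching a point the shadow must not cross (the next stackmap, a call):
  // report how many NOP bytes still have to be inserted.
  unsigned missingPadding() {
    if (!InShadow)
      return 0;
    InShadow = false;
    return Required > Current ? Required - Current : 0;
  }
};

int main() {
  ShadowTrackerSketch T;
  T.reset(8);  // stackmap requests an 8-byte shadow
  T.count(5);  // one 5-byte instruction follows it
  std::printf("%u NOP bytes needed\n", T.missingPadding()); // prints 3
}

The real tracker measures each lowered MCInst with an MCCodeEmitter, which is why startFunction above creates one per machine function.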
-static void LowerPATCHPOINT(MCStreamer &OS, StackMaps &SM, - const MachineInstr &MI, bool Is64Bit, const MCSubtargetInfo& STI) { - assert(Is64Bit && "Patchpoint currently only supports X86-64"); +void X86AsmPrinter::LowerPATCHPOINT(const MachineInstr &MI) { + assert(Subtarget->is64Bit() && "Patchpoint currently only supports X86-64"); + + SMShadowTracker.emitShadowPadding(OutStreamer, getSubtargetInfo()); + SM.recordPatchPoint(MI); PatchPointOpers opers(&MI); @@ -766,22 +838,111 @@ static void LowerPATCHPOINT(MCStreamer &OS, StackMaps &SM, EncodedBytes = 13; else EncodedBytes = 12; - OS.EmitInstruction(MCInstBuilder(X86::MOV64ri).addReg(ScratchReg) - .addImm(CallTarget), STI); - OS.EmitInstruction(MCInstBuilder(X86::CALL64r).addReg(ScratchReg), STI); + EmitAndCountInstruction(MCInstBuilder(X86::MOV64ri).addReg(ScratchReg) + .addImm(CallTarget)); + EmitAndCountInstruction(MCInstBuilder(X86::CALL64r).addReg(ScratchReg)); } // Emit padding. unsigned NumBytes = opers.getMetaOper(PatchPointOpers::NBytesPos).getImm(); assert(NumBytes >= EncodedBytes && "Patchpoint can't request size less than the length of a call."); - EmitNops(OS, NumBytes - EncodedBytes, Is64Bit, STI); + EmitNops(OutStreamer, NumBytes - EncodedBytes, Subtarget->is64Bit(), + getSubtargetInfo()); +} + +// Returns instruction preceding MBBI in MachineFunction. +// If MBBI is the first instruction of the first basic block, returns null. +static MachineBasicBlock::const_iterator +PrevCrossBBInst(MachineBasicBlock::const_iterator MBBI) { + const MachineBasicBlock *MBB = MBBI->getParent(); + while (MBBI == MBB->begin()) { + if (MBB == MBB->getParent()->begin()) + return nullptr; + MBB = MBB->getPrevNode(); + MBBI = MBB->end(); + } + return --MBBI; +} + +static const Constant *getConstantFromPool(const MachineInstr &MI, + const MachineOperand &Op) { + if (!Op.isCPI()) + return nullptr; + + ArrayRef<MachineConstantPoolEntry> Constants = + MI.getParent()->getParent()->getConstantPool()->getConstants(); + const MachineConstantPoolEntry &ConstantEntry = + Constants[Op.getIndex()]; + + // Bail if this is a machine constant pool entry, we won't be able to dig out + // anything useful. + if (ConstantEntry.isMachineConstantPoolEntry()) + return nullptr; + + auto *C = dyn_cast<Constant>(ConstantEntry.Val.ConstVal); + assert((!C || ConstantEntry.getType() == C->getType()) && + "Expected a constant of the same type!"); + return C; +} + +static std::string getShuffleComment(const MachineOperand &DstOp, + const MachineOperand &SrcOp, + ArrayRef<int> Mask) { + std::string Comment; + + // Compute the name for a register. This is really goofy because we have + // multiple instruction printers that could (in theory) use different + // names. Fortunately most people use the ATT style (outside of Windows) + // and they actually agree on register naming here. Ultimately, this is + // a comment, and so its OK if it isn't perfect. + auto GetRegisterName = [](unsigned RegNum) -> StringRef { + return X86ATTInstPrinter::getRegisterName(RegNum); + }; + + StringRef DstName = DstOp.isReg() ? GetRegisterName(DstOp.getReg()) : "mem"; + StringRef SrcName = SrcOp.isReg() ? GetRegisterName(SrcOp.getReg()) : "mem"; + + raw_string_ostream CS(Comment); + CS << DstName << " = "; + bool NeedComma = false; + bool InSrc = false; + for (int M : Mask) { + // Wrap up any prior entry... + if (M == SM_SentinelZero && InSrc) { + InSrc = false; + CS << "]"; + } + if (NeedComma) + CS << ","; + else + NeedComma = true; + + // Print this shuffle... 
+ if (M == SM_SentinelZero) { + CS << "zero"; + } else { + if (!InSrc) { + InSrc = true; + CS << SrcName << "["; + } + if (M == SM_SentinelUndef) + CS << "u"; + else + CS << M; + } + } + if (InSrc) + CS << "]"; + CS.flush(); + + return Comment; } void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { X86MCInstLower MCInstLowering(*MF, *this); - const X86RegisterInfo *RI = - static_cast<const X86RegisterInfo *>(TM.getRegisterInfo()); + const X86RegisterInfo *RI = static_cast<const X86RegisterInfo *>( + TM.getSubtargetImpl()->getRegisterInfo()); switch (MI->getOpcode()) { case TargetOpcode::DBG_VALUE: @@ -812,7 +973,7 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { case X86::TLS_addr64: case X86::TLS_base_addr32: case X86::TLS_base_addr64: - return LowerTlsAddr(OutStreamer, MCInstLowering, *MI, getSubtargetInfo()); + return LowerTlsAddr(MCInstLowering, *MI); case X86::MOVPC32r: { // This is a pseudo op for a two instruction sequence with a label, which @@ -825,15 +986,15 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { MCSymbol *PICBase = MF->getPICBaseSymbol(); // FIXME: We would like an efficient form for this, so we don't have to do a // lot of extra uniquing. - EmitToStreamer(OutStreamer, MCInstBuilder(X86::CALLpcrel32) + EmitAndCountInstruction(MCInstBuilder(X86::CALLpcrel32) .addExpr(MCSymbolRefExpr::Create(PICBase, OutContext))); // Emit the label. OutStreamer.EmitLabel(PICBase); // popl $reg - EmitToStreamer(OutStreamer, MCInstBuilder(X86::POP32r) - .addReg(MI->getOperand(0).getReg())); + EmitAndCountInstruction(MCInstBuilder(X86::POP32r) + .addReg(MI->getOperand(0).getReg())); return; } @@ -863,7 +1024,7 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { DotExpr = MCBinaryExpr::CreateAdd(MCSymbolRefExpr::Create(OpSym,OutContext), DotExpr, OutContext); - EmitToStreamer(OutStreamer, MCInstBuilder(X86::ADD32ri) + EmitAndCountInstruction(MCInstBuilder(X86::ADD32ri) .addReg(MI->getOperand(0).getReg()) .addReg(MI->getOperand(1).getReg()) .addExpr(DotExpr)); @@ -871,21 +1032,21 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { } case TargetOpcode::STACKMAP: - return LowerSTACKMAP(OutStreamer, SM, *MI, Subtarget->is64Bit(), getSubtargetInfo()); + return LowerSTACKMAP(*MI); case TargetOpcode::PATCHPOINT: - return LowerPATCHPOINT(OutStreamer, SM, *MI, Subtarget->is64Bit(), getSubtargetInfo()); + return LowerPATCHPOINT(*MI); case X86::MORESTACK_RET: - EmitToStreamer(OutStreamer, MCInstBuilder(getRetOpcode(*Subtarget))); + EmitAndCountInstruction(MCInstBuilder(getRetOpcode(*Subtarget))); return; case X86::MORESTACK_RET_RESTORE_R10: // Return, then restore R10. - EmitToStreamer(OutStreamer, MCInstBuilder(getRetOpcode(*Subtarget))); - EmitToStreamer(OutStreamer, MCInstBuilder(X86::MOV64rr) - .addReg(X86::R10) - .addReg(X86::RAX)); + EmitAndCountInstruction(MCInstBuilder(getRetOpcode(*Subtarget))); + EmitAndCountInstruction(MCInstBuilder(X86::MOV64rr) + .addReg(X86::R10) + .addReg(X86::RAX)); return; case X86::SEH_PushReg: @@ -918,9 +1079,151 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { case X86::SEH_EndPrologue: OutStreamer.EmitWinCFIEndProlog(); return; + + case X86::SEH_Epilogue: { + MachineBasicBlock::const_iterator MBBI(MI); + // Check if preceded by a call and emit nop if so. + for (MBBI = PrevCrossBBInst(MBBI); MBBI; MBBI = PrevCrossBBInst(MBBI)) { + // Conservatively assume that pseudo instructions don't emit code and keep + // looking for a call. We may emit an unnecessary nop in some cases. 
+ if (!MBBI->isPseudo()) { + if (MBBI->isCall()) + EmitAndCountInstruction(MCInstBuilder(X86::NOOP)); + break; + } + } + return; + } + + // Lower PSHUFB and VPERMILP normally but add a comment if we can find + // a constant shuffle mask. We won't be able to do this at the MC layer + // because the mask isn't an immediate. + case X86::PSHUFBrm: + case X86::VPSHUFBrm: + case X86::VPSHUFBYrm: { + if (!OutStreamer.isVerboseAsm()) + break; + assert(MI->getNumOperands() > 5 && + "We should always have at least 5 operands!"); + const MachineOperand &DstOp = MI->getOperand(0); + const MachineOperand &SrcOp = MI->getOperand(1); + const MachineOperand &MaskOp = MI->getOperand(5); + + if (auto *C = getConstantFromPool(*MI, MaskOp)) { + SmallVector<int, 16> Mask; + DecodePSHUFBMask(C, Mask); + if (!Mask.empty()) + OutStreamer.AddComment(getShuffleComment(DstOp, SrcOp, Mask)); + } + break; + } + case X86::VPERMILPSrm: + case X86::VPERMILPDrm: + case X86::VPERMILPSYrm: + case X86::VPERMILPDYrm: { + if (!OutStreamer.isVerboseAsm()) + break; + assert(MI->getNumOperands() > 5 && + "We should always have at least 5 operands!"); + const MachineOperand &DstOp = MI->getOperand(0); + const MachineOperand &SrcOp = MI->getOperand(1); + const MachineOperand &MaskOp = MI->getOperand(5); + + if (auto *C = getConstantFromPool(*MI, MaskOp)) { + SmallVector<int, 16> Mask; + DecodeVPERMILPMask(C, Mask); + if (!Mask.empty()) + OutStreamer.AddComment(getShuffleComment(DstOp, SrcOp, Mask)); + } + break; + } + + // For loads from a constant pool to a vector register, print the constant + // loaded. + case X86::MOVAPDrm: + case X86::VMOVAPDrm: + case X86::VMOVAPDYrm: + case X86::MOVUPDrm: + case X86::VMOVUPDrm: + case X86::VMOVUPDYrm: + case X86::MOVAPSrm: + case X86::VMOVAPSrm: + case X86::VMOVAPSYrm: + case X86::MOVUPSrm: + case X86::VMOVUPSrm: + case X86::VMOVUPSYrm: + case X86::MOVDQArm: + case X86::VMOVDQArm: + case X86::VMOVDQAYrm: + case X86::MOVDQUrm: + case X86::VMOVDQUrm: + case X86::VMOVDQUYrm: + if (!OutStreamer.isVerboseAsm()) + break; + if (MI->getNumOperands() > 4) + if (auto *C = getConstantFromPool(*MI, MI->getOperand(4))) { + std::string Comment; + raw_string_ostream CS(Comment); + const MachineOperand &DstOp = MI->getOperand(0); + CS << X86ATTInstPrinter::getRegisterName(DstOp.getReg()) << " = "; + if (auto *CDS = dyn_cast<ConstantDataSequential>(C)) { + CS << "["; + for (int i = 0, NumElements = CDS->getNumElements(); i < NumElements; ++i) { + if (i != 0) + CS << ","; + if (CDS->getElementType()->isIntegerTy()) + CS << CDS->getElementAsInteger(i); + else if (CDS->getElementType()->isFloatTy()) + CS << CDS->getElementAsFloat(i); + else if (CDS->getElementType()->isDoubleTy()) + CS << CDS->getElementAsDouble(i); + else + CS << "?"; + } + CS << "]"; + OutStreamer.AddComment(CS.str()); + } else if (auto *CV = dyn_cast<ConstantVector>(C)) { + CS << "<"; + for (int i = 0, NumOperands = CV->getNumOperands(); i < NumOperands; ++i) { + if (i != 0) + CS << ","; + Constant *COp = CV->getOperand(i); + if (isa<UndefValue>(COp)) { + CS << "u"; + } else if (auto *CI = dyn_cast<ConstantInt>(COp)) { + CS << CI->getZExtValue(); + } else if (auto *CF = dyn_cast<ConstantFP>(COp)) { + SmallString<32> Str; + CF->getValueAPF().toString(Str); + CS << Str; + } else { + CS << "?"; + } + } + CS << ">"; + OutStreamer.AddComment(CS.str()); + } + } + break; } MCInst TmpInst; MCInstLowering.Lower(MI, TmpInst); - EmitToStreamer(OutStreamer, TmpInst); + + // Stackmap shadows cannot include branch targets, so we can count the bytes + // 
in a call towards the shadow, but must ensure that no thread returns + into the stackmap shadow. The only way to achieve this is if the call + is at the end of the shadow. + if (MI->isCall()) { + // Count the size of the call towards the shadow + SMShadowTracker.count(TmpInst, getSubtargetInfo()); + // Then flush the shadow so that we fill with nops before the call, not + // after it. + SMShadowTracker.emitShadowPadding(OutStreamer, getSubtargetInfo()); + // Then emit the call + OutStreamer.EmitInstruction(TmpInst, getSubtargetInfo()); + return; + } + + EmitAndCountInstruction(TmpInst); } diff --git a/lib/Target/X86/X86MachineFunctionInfo.h b/lib/Target/X86/X86MachineFunctionInfo.h index 78d20ce..79a51b3 100644 --- a/lib/Target/X86/X86MachineFunctionInfo.h +++ b/lib/Target/X86/X86MachineFunctionInfo.h @@ -11,10 +11,12 @@ // //===----------------------------------------------------------------------===// -#ifndef X86MACHINEFUNCTIONINFO_H -#define X86MACHINEFUNCTIONINFO_H +#ifndef LLVM_LIB_TARGET_X86_X86MACHINEFUNCTIONINFO_H +#define LLVM_LIB_TARGET_X86_X86MACHINEFUNCTIONINFO_H #include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineValueType.h" +#include <vector> namespace llvm { @@ -70,6 +72,22 @@ class X86MachineFunctionInfo : public MachineFunctionInfo { unsigned NumLocalDynamics; public: + /// Describes a register that needs to be forwarded from the prologue to a + /// musttail call. + struct Forward { + Forward(unsigned VReg, MCPhysReg PReg, MVT VT) + : VReg(VReg), PReg(PReg), VT(VT) {} + unsigned VReg; + MCPhysReg PReg; + MVT VT; + }; + +private: + /// ForwardedMustTailRegParms - A list of virtual and physical registers + /// that must be forwarded to every musttail call. + std::vector<Forward> ForwardedMustTailRegParms; + +public: X86MachineFunctionInfo() : ForceFramePointer(false), CalleeSavedFrameSize(0), BytesToPopOnReturn(0), @@ -138,6 +156,9 @@ public: unsigned getNumLocalDynamicTLSAccesses() const { return NumLocalDynamics; } void incNumLocalDynamicTLSAccesses() { ++NumLocalDynamics; } + std::vector<Forward> &getForwardedMustTailRegParms() { + return ForwardedMustTailRegParms; + } }; } // End llvm namespace diff --git a/lib/Target/X86/X86PadShortFunction.cpp b/lib/Target/X86/X86PadShortFunction.cpp index 6639875..adc05b2 100644 --- a/lib/Target/X86/X86PadShortFunction.cpp +++ b/lib/Target/X86/X86PadShortFunction.cpp @@ -105,7 +105,7 @@ bool PadShortFunc::runOnMachineFunction(MachineFunction &MF) { if (!TM->getSubtarget<X86Subtarget>().padShortFunctions()) return false; - TII = TM->getInstrInfo(); + TII = TM->getSubtargetImpl()->getInstrInfo(); // Search through basic blocks and mark the ones that have early returns ReturnBBs.clear(); @@ -195,7 +195,8 @@ bool PadShortFunc::cyclesUntilReturn(MachineBasicBlock *MBB, return true; } - CyclesToEnd += TII->getInstrLatency(TM->getInstrItineraryData(), MI); + CyclesToEnd += TII->getInstrLatency( + TM->getSubtargetImpl()->getInstrItineraryData(), MI); } VisitedBBs[MBB] = VisitedBBInfo(false, CyclesToEnd); diff --git a/lib/Target/X86/X86RegisterInfo.cpp b/lib/Target/X86/X86RegisterInfo.cpp index e8a7e84..a4a366d 100644 --- a/lib/Target/X86/X86RegisterInfo.cpp +++ b/lib/Target/X86/X86RegisterInfo.cpp @@ -68,8 +68,10 @@ X86RegisterInfo::X86RegisterInfo(const X86Subtarget &STI) if (Is64Bit) { SlotSize = 8; - StackPtr = X86::RSP; - FramePtr = X86::RBP; + StackPtr = (Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64()) ?
+ X86::RSP : X86::ESP; + FramePtr = (Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64()) ? + X86::RBP : X86::EBP; } else { SlotSize = 4; StackPtr = X86::ESP; @@ -120,7 +122,7 @@ X86RegisterInfo::getMatchingSuperRegClass(const TargetRegisterClass *A, const TargetRegisterClass* X86RegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC) const{ // Don't allow super-classes of GR8_NOREX. This class is only used after - // extrating sub_8bit_hi sub-registers. The H sub-registers cannot be copied + // extracting sub_8bit_hi sub-registers. The H sub-registers cannot be copied // to the full GR8 register class in 64-bit mode, so we cannot allow the // reigster class inflation. // @@ -196,7 +198,7 @@ X86RegisterInfo::getCrossCopyRegClass(const TargetRegisterClass *RC) const { unsigned X86RegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, MachineFunction &MF) const { - const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering(); + const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering(); unsigned FPDiff = TFI->hasFP(MF) ? 1 : 0; switch (RC->getID()) { @@ -324,7 +326,7 @@ X86RegisterInfo::getNoPreservedMask() const { BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const { BitVector Reserved(getNumRegs()); - const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering(); + const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering(); // Set the stack-pointer register and its aliases as reserved. for (MCSubRegIterator I(X86::RSP, this, /*IncludeSelf=*/true); I.isValid(); @@ -441,7 +443,8 @@ bool X86RegisterInfo::canRealignStack(const MachineFunction &MF) const { bool X86RegisterInfo::needsStackRealignment(const MachineFunction &MF) const { const MachineFrameInfo *MFI = MF.getFrameInfo(); const Function *F = MF.getFunction(); - unsigned StackAlign = MF.getTarget().getFrameLowering()->getStackAlignment(); + unsigned StackAlign = + MF.getSubtarget().getFrameLowering()->getStackAlignment(); bool requiresRealignment = ((MFI->getMaxAlignment() > StackAlign) || F->getAttributes().hasAttribute(AttributeSet::FunctionIndex, @@ -456,13 +459,9 @@ bool X86RegisterInfo::needsStackRealignment(const MachineFunction &MF) const { bool X86RegisterInfo::hasReservedSpillSlot(const MachineFunction &MF, unsigned Reg, int &FrameIdx) const { - const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering(); - - if (Reg == FramePtr && TFI->hasFP(MF)) { - FrameIdx = MF.getFrameInfo()->getObjectIndexBegin(); - return true; - } - return false; + // Since X86 defines assignCalleeSavedSpillSlots which always return true + // this function neither used nor tested. + llvm_unreachable("Unused function on X86. Otherwise need a test case."); } void @@ -473,7 +472,7 @@ X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, MachineInstr &MI = *II; MachineFunction &MF = *MI.getParent()->getParent(); - const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering(); + const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering(); int FrameIndex = MI.getOperand(FIOperandNum).getIndex(); unsigned BasePtr; @@ -488,6 +487,12 @@ X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, else BasePtr = (TFI->hasFP(MF) ? FramePtr : StackPtr); + // For LEA64_32r when BasePtr is 32-bits (X32) we can use full-size 64-bit + // register as source operand, semantic is the same and destination is + // 32-bits. It saves one byte per lea in code since 0x67 prefix is avoided. 
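A quick illustration of why the rewrite that follows is safe (hypothetical values, not taken from the patch): LEA64_32r writes a 32-bit destination, so the address computation is truncated modulo 2^32 and any upper bits of the base register cannot affect the result; using the 64-bit super-register merely drops the 0x67 address-size override, e.g. lea 16(%rbp), %eax instead of lea 16(%ebp), %eax.

#include <cstdint>
#include <cstdio>

int main() {
  // A 64-bit and a 32-bit base with the same low 32 bits give the same
  // 32-bit lea result, because the destination truncates mod 2^32.
  uint64_t Base64 = 0x0000000180001000ull;          // arbitrary upper bits
  uint32_t Base32 = static_cast<uint32_t>(Base64);  // what the 0x67 form would use
  int32_t Disp = 16;
  uint32_t A = static_cast<uint32_t>(Base64 + Disp); // lea r32, [r64 + disp]
  uint32_t B = Base32 + Disp;                        // lea r32, [r32 + disp], needs 0x67
  std::printf("%08x %08x %s\n", A, B, A == B ? "equal" : "differ"); // prints "equal"
}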
+ if (Opc == X86::LEA64_32r && X86::GR32RegClass.contains(BasePtr)) + BasePtr = getX86SubSuperRegister(BasePtr, MVT::i64, false); + // This must be part of a four operand memory reference. Replace the // FrameIndex with base register with EBP. Add an offset to the offset. MI.getOperand(FIOperandNum).ChangeToRegister(BasePtr, false); @@ -526,7 +531,7 @@ X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, } unsigned X86RegisterInfo::getFrameRegister(const MachineFunction &MF) const { - const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering(); + const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering(); return TFI->hasFP(MF) ? FramePtr : StackPtr; } diff --git a/lib/Target/X86/X86RegisterInfo.h b/lib/Target/X86/X86RegisterInfo.h index 74efd1f..cc0a7b2 100644 --- a/lib/Target/X86/X86RegisterInfo.h +++ b/lib/Target/X86/X86RegisterInfo.h @@ -11,8 +11,8 @@ // //===----------------------------------------------------------------------===// -#ifndef X86REGISTERINFO_H -#define X86REGISTERINFO_H +#ifndef LLVM_LIB_TARGET_X86_X86REGISTERINFO_H +#define LLVM_LIB_TARGET_X86_X86REGISTERINFO_H #include "llvm/Target/TargetRegisterInfo.h" diff --git a/lib/Target/X86/X86RegisterInfo.td b/lib/Target/X86/X86RegisterInfo.td index 33c402b..311a717 100644 --- a/lib/Target/X86/X86RegisterInfo.td +++ b/lib/Target/X86/X86RegisterInfo.td @@ -166,6 +166,7 @@ def FP3 : X86Reg<"fp3", 0>; def FP4 : X86Reg<"fp4", 0>; def FP5 : X86Reg<"fp5", 0>; def FP6 : X86Reg<"fp6", 0>; +def FP7 : X86Reg<"fp7", 0>; // XMM Registers, used by the various SSE instruction set extensions. def XMM0: X86Reg<"xmm0", 0>, DwarfRegNum<[17, 21, 21]>; @@ -234,22 +235,18 @@ let SubRegIndices = [sub_ymm] in { def K6 : X86Reg<"k6", 6>, DwarfRegNum<[124, -2, -2]>; def K7 : X86Reg<"k7", 7>, DwarfRegNum<[125, -2, -2]>; -class STRegister<string n, bits<16> Enc, list<Register> A> : X86Reg<n, Enc> { - let Aliases = A; -} - // Floating point stack registers. These don't map one-to-one to the FP // pseudo registers, but we still mark them as aliasing FP registers. That // way both kinds can be live without exceeding the stack depth. ST registers // are only live around inline assembly. -def ST0 : STRegister<"st(0)", 0, []>, DwarfRegNum<[33, 12, 11]>; -def ST1 : STRegister<"st(1)", 1, [FP6]>, DwarfRegNum<[34, 13, 12]>; -def ST2 : STRegister<"st(2)", 2, [FP5]>, DwarfRegNum<[35, 14, 13]>; -def ST3 : STRegister<"st(3)", 3, [FP4]>, DwarfRegNum<[36, 15, 14]>; -def ST4 : STRegister<"st(4)", 4, [FP3]>, DwarfRegNum<[37, 16, 15]>; -def ST5 : STRegister<"st(5)", 5, [FP2]>, DwarfRegNum<[38, 17, 16]>; -def ST6 : STRegister<"st(6)", 6, [FP1]>, DwarfRegNum<[39, 18, 17]>; -def ST7 : STRegister<"st(7)", 7, [FP0]>, DwarfRegNum<[40, 19, 18]>; +def ST0 : X86Reg<"st(0)", 0>, DwarfRegNum<[33, 12, 11]>; +def ST1 : X86Reg<"st(1)", 1>, DwarfRegNum<[34, 13, 12]>; +def ST2 : X86Reg<"st(2)", 2>, DwarfRegNum<[35, 14, 13]>; +def ST3 : X86Reg<"st(3)", 3>, DwarfRegNum<[36, 15, 14]>; +def ST4 : X86Reg<"st(4)", 4>, DwarfRegNum<[37, 16, 15]>; +def ST5 : X86Reg<"st(5)", 5>, DwarfRegNum<[38, 17, 16]>; +def ST6 : X86Reg<"st(6)", 6>, DwarfRegNum<[39, 18, 17]>; +def ST7 : X86Reg<"st(7)", 7>, DwarfRegNum<[40, 19, 18]>; // Floating-point status word def FPSW : X86Reg<"fpsw", 0>; @@ -449,7 +446,7 @@ def FPCCR : RegisterClass<"X86", [i16], 16, (add FPSW)> { } // AVX-512 vector/mask registers. 
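Two details worth noting in the register-class changes that follow: VR512 gains the v64i8 and v32i16 types, with matching VK32/VK64 mask classes (and VK2/VK4 filling in the small element counts), presumably in preparation for AVX-512BW byte/word vectors; and each "WM" writemask class is the plain class minus K0 because an EVEX writemask field of 0 encodes "no masking", so k0 can hold a mask value but can never be named as the writemask operand.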
-def VR512 : RegisterClass<"X86", [v16f32, v8f64, v16i32, v8i64], 512, +def VR512 : RegisterClass<"X86", [v16f32, v8f64, v64i8, v32i16, v16i32, v8i64], 512, (sequence "ZMM%u", 0, 31)>; // Scalar AVX-512 floating point registers. @@ -463,13 +460,19 @@ def VR128X : RegisterClass<"X86", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], def VR256X : RegisterClass<"X86", [v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], 256, (sequence "YMM%u", 0, 31)>; -// The size of the all masked registers is 16 bit because we have only one -// KMOVW istruction that can store this register in memory, and it writes 2 bytes -def VK1 : RegisterClass<"X86", [i1], 16, (sequence "K%u", 0, 7)>; -def VK8 : RegisterClass<"X86", [v8i1], 16, (add VK1)> {let Size = 16;} +// Mask registers +def VK1 : RegisterClass<"X86", [i1], 16, (sequence "K%u", 0, 7)> {let Size = 16;} +def VK2 : RegisterClass<"X86", [v2i1], 16, (add VK1)> {let Size = 16;} +def VK4 : RegisterClass<"X86", [v4i1], 16, (add VK2)> {let Size = 16;} +def VK8 : RegisterClass<"X86", [v8i1], 16, (add VK4)> {let Size = 16;} def VK16 : RegisterClass<"X86", [v16i1], 16, (add VK8)> {let Size = 16;} +def VK32 : RegisterClass<"X86", [v32i1], 32, (add VK16)> {let Size = 32;} +def VK64 : RegisterClass<"X86", [v64i1], 64, (add VK32)> {let Size = 64;} def VK1WM : RegisterClass<"X86", [i1], 16, (sub VK1, K0)> {let Size = 16;} +def VK2WM : RegisterClass<"X86", [v2i1], 16, (sub VK2, K0)> {let Size = 16;} +def VK4WM : RegisterClass<"X86", [v4i1], 16, (sub VK4, K0)> {let Size = 16;} def VK8WM : RegisterClass<"X86", [v8i1], 16, (sub VK8, K0)> {let Size = 16;} -def VK16WM : RegisterClass<"X86", [v16i1], 16, (add VK8WM)>; - +def VK16WM : RegisterClass<"X86", [v16i1], 16, (add VK8WM)> {let Size = 16;} +def VK32WM : RegisterClass<"X86", [v32i1], 32, (add VK16WM)> {let Size = 32;} +def VK64WM : RegisterClass<"X86", [v64i1], 64, (add VK32WM)> {let Size = 64;} diff --git a/lib/Target/X86/X86Relocations.h b/lib/Target/X86/X86Relocations.h deleted file mode 100644 index 0333056..0000000 --- a/lib/Target/X86/X86Relocations.h +++ /dev/null @@ -1,52 +0,0 @@ -//===-- X86Relocations.h - X86 Code Relocations -----------------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file defines the X86 target-specific relocation types. -// -//===----------------------------------------------------------------------===// - -#ifndef X86RELOCATIONS_H -#define X86RELOCATIONS_H - -#include "llvm/CodeGen/MachineRelocation.h" - -namespace llvm { - namespace X86 { - /// RelocationType - An enum for the x86 relocation codes. Note that - /// the terminology here doesn't follow x86 convention - word means - /// 32-bit and dword means 64-bit. The relocations will be treated - /// by JIT or ObjectCode emitters, this is transparent to the x86 code - /// emitter but JIT and ObjectCode will treat them differently - enum RelocationType { - /// reloc_pcrel_word - PC relative relocation, add the relocated value to - /// the value already in memory, after we adjust it for where the PC is. - reloc_pcrel_word = 0, - - /// reloc_picrel_word - PIC base relative relocation, add the relocated - /// value to the value already in memory, after we adjust it for where the - /// PIC base is. 
- reloc_picrel_word = 1, - - /// reloc_absolute_word - absolute relocation, just add the relocated - /// value to the value already in memory. - reloc_absolute_word = 2, - - /// reloc_absolute_word_sext - absolute relocation, just add the relocated - /// value to the value already in memory. In object files, it represents a - /// value which must be sign-extended when resolving the relocation. - reloc_absolute_word_sext = 3, - - /// reloc_absolute_dword - absolute relocation, just add the relocated - /// value to the value already in memory. - reloc_absolute_dword = 4 - }; - } -} - -#endif diff --git a/lib/Target/X86/X86SchedHaswell.td b/lib/Target/X86/X86SchedHaswell.td index 6966d61..73a3230 100644 --- a/lib/Target/X86/X86SchedHaswell.td +++ b/lib/Target/X86/X86SchedHaswell.td @@ -48,13 +48,17 @@ def HWPort6 : ProcResource<1>; def HWPort7 : ProcResource<1>; // Many micro-ops are capable of issuing on multiple ports. +def HWPort01 : ProcResGroup<[HWPort0, HWPort1]>; def HWPort23 : ProcResGroup<[HWPort2, HWPort3]>; def HWPort237 : ProcResGroup<[HWPort2, HWPort3, HWPort7]>; +def HWPort04 : ProcResGroup<[HWPort0, HWPort4]>; def HWPort05 : ProcResGroup<[HWPort0, HWPort5]>; -def HWPort06 : ProcResGroup<[HWPort0, HWPort6]>; +def HWPort06 : ProcResGroup<[HWPort0, HWPort6]>; def HWPort15 : ProcResGroup<[HWPort1, HWPort5]>; def HWPort16 : ProcResGroup<[HWPort1, HWPort6]>; +def HWPort56 : ProcResGroup<[HWPort5, HWPort6]>; def HWPort015 : ProcResGroup<[HWPort0, HWPort1, HWPort5]>; +def HWPort056 : ProcResGroup<[HWPort0, HWPort5, HWPort6]>; def HWPort0156: ProcResGroup<[HWPort0, HWPort1, HWPort5, HWPort6]>; // 60 Entry Unified Scheduler @@ -125,6 +129,7 @@ defm : HWWriteResPair<WriteFAdd, HWPort1, 3>; defm : HWWriteResPair<WriteFMul, HWPort0, 5>; defm : HWWriteResPair<WriteFDiv, HWPort0, 12>; // 10-14 cycles. defm : HWWriteResPair<WriteFRcp, HWPort0, 5>; +defm : HWWriteResPair<WriteFRsqrt, HWPort0, 5>; defm : HWWriteResPair<WriteFSqrt, HWPort0, 15>; defm : HWWriteResPair<WriteCvtF2I, HWPort1, 3>; defm : HWWriteResPair<WriteCvtI2F, HWPort1, 4>; @@ -261,4 +266,1882 @@ def : WriteRes<WriteSystem, [HWPort0156]> { let Latency = 100; } def : WriteRes<WriteMicrocoded, [HWPort0156]> { let Latency = 100; } def : WriteRes<WriteFence, [HWPort23, HWPort4]>; def : WriteRes<WriteNop, []>; + +//================ Exceptions ================// + +//-- Specific Scheduling Models --// + +// Starting with P0. 
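For reading the large block of Haswell entries that follows (this is standard LLVM scheduling TableGen, not something introduced by this patch): a SchedWriteRes names the ports or port groups its micro-ops can issue on, Latency gives the result latency in cycles, NumMicroOps the micro-op count, and ResourceCycles how many cycles each listed resource is held; an InstRW record then binds such a write to every instruction whose opcode name matches one of its instregex patterns.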
+def WriteP0 : SchedWriteRes<[HWPort0]>; + +def WriteP0_P1_Lat4 : SchedWriteRes<[HWPort0, HWPort1]> { + let Latency = 4; + let NumMicroOps = 2; + let ResourceCycles = [1, 1]; +} + +def WriteP0_P1_Lat4Ld : SchedWriteRes<[HWPort0, HWPort1, HWPort23]> { + let Latency = 8; + let NumMicroOps = 3; + let ResourceCycles = [1, 1, 1]; +} + +def WriteP01 : SchedWriteRes<[HWPort01]>; + +def Write2P01 : SchedWriteRes<[HWPort01]> { + let NumMicroOps = 2; +} +def Write3P01 : SchedWriteRes<[HWPort01]> { + let NumMicroOps = 3; +} + +def WriteP015 : SchedWriteRes<[HWPort015]>; + +def WriteP01_P5 : SchedWriteRes<[HWPort01, HWPort5]> { + let NumMicroOps = 2; +} +def WriteP06 : SchedWriteRes<[HWPort06]>; + +def Write2P06 : SchedWriteRes<[HWPort06]> { + let Latency = 1; + let NumMicroOps = 2; + let ResourceCycles = [2]; +} + +def Write3P06_Lat2 : SchedWriteRes<[HWPort06]> { + let Latency = 2; + let NumMicroOps = 3; + let ResourceCycles = [3]; +} + +def WriteP0156_P23 : SchedWriteRes<[HWPort0156, HWPort23]> { + let NumMicroOps = 2; +} + +def Write2P0156_P23 : SchedWriteRes<[HWPort0156, HWPort23]> { + let NumMicroOps = 3; + let ResourceCycles = [2, 1]; +} + +def Write2P0156_Lat2 : SchedWriteRes<[HWPort0156]> { + let Latency = 2; + let ResourceCycles = [2]; +} +def Write2P0156_Lat2Ld : SchedWriteRes<[HWPort0156, HWPort23]> { + let Latency = 6; + let ResourceCycles = [2, 1]; +} + +def Write5P0156 : SchedWriteRes<[HWPort0156]> { + let NumMicroOps = 5; + let ResourceCycles = [5]; +} + +def WriteP0156_2P237_P4 : SchedWriteRes<[HWPort0156, HWPort237, HWPort4]> { + let Latency = 1; + let ResourceCycles = [1, 2, 1]; +} + +def Write2P0156_2P237_P4 : SchedWriteRes<[HWPort0156, HWPort237, HWPort4]> { + let Latency = 1; + let ResourceCycles = [2, 2, 1]; +} + +def Write3P0156_2P237_P4 : SchedWriteRes<[HWPort0156, HWPort237, HWPort4]> { + let Latency = 1; + let ResourceCycles = [3, 2, 1]; +} + +// Starting with P1. +def WriteP1 : SchedWriteRes<[HWPort1]>; + +def WriteP1_P23 : SchedWriteRes<[HWPort1, HWPort23]> { + let NumMicroOps = 2; +} +def WriteP1_Lat3 : SchedWriteRes<[HWPort1]> { + let Latency = 3; +} +def WriteP1_Lat3Ld : SchedWriteRes<[HWPort1, HWPort23]> { + let Latency = 7; +} + +def Write2P1 : SchedWriteRes<[HWPort1]> { + let NumMicroOps = 2; + let ResourceCycles = [2]; +} +def Write2P1_P23 : SchedWriteRes<[HWPort1, HWPort23]> { + let NumMicroOps = 3; + let ResourceCycles = [2, 1]; +} +def WriteP15 : SchedWriteRes<[HWPort15]>; +def WriteP15Ld : SchedWriteRes<[HWPort15, HWPort23]> { + let Latency = 4; +} + +def WriteP1_P5_Lat4 : SchedWriteRes<[HWPort1, HWPort5]> { + let Latency = 4; + let NumMicroOps = 2; + let ResourceCycles = [1, 1]; +} + +def WriteP1_P5_Lat4Ld : SchedWriteRes<[HWPort1, HWPort5, HWPort23]> { + let Latency = 8; + let NumMicroOps = 3; + let ResourceCycles = [1, 1, 1]; +} + +def WriteP1_P5_Lat6 : SchedWriteRes<[HWPort1, HWPort5]> { + let Latency = 6; + let NumMicroOps = 2; + let ResourceCycles = [1, 1]; +} + +def WriteP1_P5_Lat6Ld : SchedWriteRes<[HWPort1, HWPort5, HWPort23]> { + let Latency = 10; + let NumMicroOps = 3; + let ResourceCycles = [1, 1, 1]; +} + +// Starting with P2. +def Write2P237_P4 : SchedWriteRes<[HWPort237, HWPort4]> { + let Latency = 1; + let ResourceCycles = [2, 1]; +} + +// Starting with P5. +def WriteP5 : SchedWriteRes<[HWPort5]>; +def WriteP5Ld : SchedWriteRes<[HWPort5, HWPort23]> { + let Latency = 5; + let NumMicroOps = 2; + let ResourceCycles = [1, 1]; +} + +// Notation: +// - r: register. +// - mm: 64 bit mmx register. +// - x = 128 bit xmm register. 
+// - (x)mm = mmx or xmm register. +// - y = 256 bit ymm register. +// - v = any vector register. +// - m = memory. + +//=== Integer Instructions ===// +//-- Move instructions --// + +// MOV. +// r16,m. +def : InstRW<[WriteALULd], (instregex "MOV16rm")>; + +// MOVSX, MOVZX. +// r,m. +def : InstRW<[WriteLoad], (instregex "MOV(S|Z)X32rm(8|16)")>; + +// CMOVcc. +// r,r. +def : InstRW<[Write2P0156_Lat2], + (instregex "CMOV(O|NO|B|AE|E|NE|BE|A|S|NS|P|NP|L|GE|LE|G)(16|32|64)rr")>; +// r,m. +def : InstRW<[Write2P0156_Lat2Ld, ReadAfterLd], + (instregex "CMOV(O|NO|B|AE|E|NE|BE|A|S|NS|P|NP|L|GE|LE|G)(16|32|64)rm")>; + +// XCHG. +// r,r. +def WriteXCHG : SchedWriteRes<[HWPort0156]> { + let Latency = 2; + let ResourceCycles = [3]; +} + +def : InstRW<[WriteXCHG], (instregex "XCHG(8|16|32|64)rr", "XCHG(16|32|64)ar")>; + +// r,m. +def WriteXCHGrm : SchedWriteRes<[]> { + let Latency = 21; + let NumMicroOps = 8; +} +def : InstRW<[WriteXCHGrm], (instregex "XCHG(8|16|32|64)rm")>; + +// XLAT. +def WriteXLAT : SchedWriteRes<[]> { + let Latency = 7; + let NumMicroOps = 3; +} +def : InstRW<[WriteXLAT], (instregex "XLAT")>; + +// PUSH. +// m. +def : InstRW<[Write2P237_P4], (instregex "PUSH(16|32)rmm")>; + +// PUSHF. +def WritePushF : SchedWriteRes<[HWPort1, HWPort4, HWPort237, HWPort06]> { + let NumMicroOps = 4; +} +def : InstRW<[WritePushF], (instregex "PUSHF(16|32)")>; + +// PUSHA. +def WritePushA : SchedWriteRes<[]> { + let NumMicroOps = 19; +} +def : InstRW<[WritePushA], (instregex "PUSHA(16|32)")>; + +// POP. +// m. +def : InstRW<[Write2P237_P4], (instregex "POP(16|32)rmm")>; + +// POPF. +def WritePopF : SchedWriteRes<[]> { + let NumMicroOps = 9; +} +def : InstRW<[WritePopF], (instregex "POPF(16|32)")>; + +// POPA. +def WritePopA : SchedWriteRes<[]> { + let NumMicroOps = 18; +} +def : InstRW<[WritePopA], (instregex "POPA(16|32)")>; + +// LAHF SAHF. +def : InstRW<[WriteP06], (instregex "(S|L)AHF")>; + +// BSWAP. +// r32. +def WriteBSwap32 : SchedWriteRes<[HWPort15]>; +def : InstRW<[WriteBSwap32], (instregex "BSWAP32r")>; + +// r64. +def WriteBSwap64 : SchedWriteRes<[HWPort06, HWPort15]> { + let NumMicroOps = 2; +} +def : InstRW<[WriteBSwap64], (instregex "BSWAP64r")>; + +// MOVBE. +// r16,m16 / r64,m64. +def : InstRW<[Write2P0156_Lat2Ld], (instregex "MOVBE(16|64)rm")>; + +// r32, m32. +def WriteMoveBE32rm : SchedWriteRes<[HWPort15, HWPort23]> { + let NumMicroOps = 2; +} +def : InstRW<[WriteMoveBE32rm], (instregex "MOVBE32rm")>; + +// m16,r16. +def WriteMoveBE16mr : SchedWriteRes<[HWPort06, HWPort237, HWPort4]> { + let NumMicroOps = 3; +} +def : InstRW<[WriteMoveBE16mr], (instregex "MOVBE16mr")>; + +// m32,r32. +def WriteMoveBE32mr : SchedWriteRes<[HWPort15, HWPort237, HWPort4]> { + let NumMicroOps = 3; +} +def : InstRW<[WriteMoveBE32mr], (instregex "MOVBE32mr")>; + +// m64,r64. +def WriteMoveBE64mr : SchedWriteRes<[HWPort06, HWPort15, HWPort237, HWPort4]> { + let NumMicroOps = 4; +} +def : InstRW<[WriteMoveBE64mr], (instregex "MOVBE64mr")>; + +//-- Arithmetic instructions --// + +// ADD SUB. +// m,r/i. +def : InstRW<[Write2P0156_2P237_P4], + (instregex "(ADD|SUB)(8|16|32|64)m(r|i)", + "(ADD|SUB)(8|16|32|64)mi8", "(ADD|SUB)64mi32")>; + +// ADC SBB. +// r,r/i. +def : InstRW<[Write2P0156_Lat2], (instregex "(ADC|SBB)(8|16|32|64)r(r|i)", + "(ADC|SBB)(16|32|64)ri8", + "(ADC|SBB)64ri32", + "(ADC|SBB)(8|16|32|64)rr_REV")>; + +// r,m. +def : InstRW<[Write2P0156_Lat2Ld, ReadAfterLd], (instregex "(ADC|SBB)(8|16|32|64)rm")>; + +// m,r/i. 
+def : InstRW<[Write3P0156_2P237_P4], + (instregex "(ADC|SBB)(8|16|32|64)m(r|i)", + "(ADC|SBB)(16|32|64)mi8", + "(ADC|SBB)64mi32")>; + +// INC DEC NOT NEG. +// m. +def : InstRW<[WriteP0156_2P237_P4], + (instregex "(INC|DEC|NOT|NEG)(8|16|32|64)m", + "(INC|DEC)64(16|32)m")>; + +// MUL IMUL. +// r16. +def WriteMul16 : SchedWriteRes<[HWPort1, HWPort0156]> { + let Latency = 4; + let NumMicroOps = 4; +} +def : InstRW<[WriteMul16], (instregex "IMUL16r", "MUL16r")>; + +// m16. +def WriteMul16Ld : SchedWriteRes<[HWPort1, HWPort0156, HWPort23]> { + let Latency = 8; + let NumMicroOps = 5; +} +def : InstRW<[WriteMul16Ld], (instregex "IMUL16m", "MUL16m")>; + +// r32. +def WriteMul32 : SchedWriteRes<[HWPort1, HWPort0156]> { + let Latency = 4; + let NumMicroOps = 3; +} +def : InstRW<[WriteMul32], (instregex "IMUL32r", "MUL32r")>; + +// m32. +def WriteMul32Ld : SchedWriteRes<[HWPort1, HWPort0156, HWPort23]> { + let Latency = 8; + let NumMicroOps = 4; +} +def : InstRW<[WriteMul32Ld], (instregex "IMUL32m", "MUL32m")>; + +// r64. +def WriteMul64 : SchedWriteRes<[HWPort1, HWPort6]> { + let Latency = 3; + let NumMicroOps = 2; +} +def : InstRW<[WriteMul64], (instregex "IMUL64r", "MUL64r")>; + +// m64. +def WriteMul64Ld : SchedWriteRes<[HWPort1, HWPort6, HWPort23]> { + let Latency = 7; + let NumMicroOps = 3; +} +def : InstRW<[WriteMul64Ld], (instregex "IMUL64m", "MUL64m")>; + +// r16,r16. +def WriteMul16rri : SchedWriteRes<[HWPort1, HWPort0156]> { + let Latency = 4; + let NumMicroOps = 2; +} +def : InstRW<[WriteMul16rri], (instregex "IMUL16rri", "IMUL16rri8")>; + +// r16,m16. +def WriteMul16rmi : SchedWriteRes<[HWPort1, HWPort0156, HWPort23]> { + let Latency = 8; + let NumMicroOps = 3; +} +def : InstRW<[WriteMul16rmi], (instregex "IMUL16rmi", "IMUL16rmi8")>; + +// MULX. +// r32,r32,r32. +def WriteMulX32 : SchedWriteRes<[HWPort1, HWPort056]> { + let Latency = 4; + let NumMicroOps = 3; + let ResourceCycles = [1, 2]; +} +def : InstRW<[WriteMulX32], (instregex "MULX32rr")>; + +// r32,r32,m32. +def WriteMulX32Ld : SchedWriteRes<[HWPort1, HWPort056, HWPort23]> { + let Latency = 8; + let NumMicroOps = 4; + let ResourceCycles = [1, 2, 1]; +} +def : InstRW<[WriteMulX32Ld], (instregex "MULX32rm")>; + +// r64,r64,r64. +def WriteMulX64 : SchedWriteRes<[HWPort1, HWPort6]> { + let Latency = 4; + let NumMicroOps = 2; +} +def : InstRW<[WriteMulX64], (instregex "MULX64rr")>; + +// r64,r64,m64. +def WriteMulX64Ld : SchedWriteRes<[HWPort1, HWPort6, HWPort23]> { + let Latency = 8; + let NumMicroOps = 3; +} +def : InstRW<[WriteMulX64Ld], (instregex "MULX64rm")>; + +// DIV. +// r8. +def WriteDiv8 : SchedWriteRes<[HWPort0, HWPort1, HWPort5, HWPort6]> { + let Latency = 22; + let NumMicroOps = 9; +} +def : InstRW<[WriteDiv8], (instregex "DIV8r")>; + +// r16. +def WriteDiv16 : SchedWriteRes<[HWPort0, HWPort1, HWPort5, HWPort6]> { + let Latency = 23; + let NumMicroOps = 10; +} +def : InstRW<[WriteDiv16], (instregex "DIV16r")>; + +// r32. +def WriteDiv32 : SchedWriteRes<[HWPort0, HWPort1, HWPort5, HWPort6]> { + let Latency = 22; + let NumMicroOps = 10; +} +def : InstRW<[WriteDiv32], (instregex "DIV32r")>; + +// r64. +def WriteDiv64 : SchedWriteRes<[HWPort0, HWPort1, HWPort5, HWPort6]> { + let Latency = 32; + let NumMicroOps = 36; +} +def : InstRW<[WriteDiv64], (instregex "DIV64r")>; + +// IDIV. +// r8. +def WriteIDiv8 : SchedWriteRes<[HWPort0, HWPort1, HWPort5, HWPort6]> { + let Latency = 23; + let NumMicroOps = 9; +} +def : InstRW<[WriteIDiv8], (instregex "IDIV8r")>; + +// r16. 
+def WriteIDiv16 : SchedWriteRes<[HWPort0, HWPort1, HWPort5, HWPort6]> { + let Latency = 23; + let NumMicroOps = 10; +} +def : InstRW<[WriteIDiv16], (instregex "IDIV16r")>; + +// r32. +def WriteIDiv32 : SchedWriteRes<[HWPort0, HWPort1, HWPort5, HWPort6]> { + let Latency = 22; + let NumMicroOps = 9; +} +def : InstRW<[WriteIDiv32], (instregex "IDIV32r")>; + +// r64. +def WriteIDiv64 : SchedWriteRes<[HWPort0, HWPort1, HWPort5, HWPort6]> { + let Latency = 39; + let NumMicroOps = 59; +} +def : InstRW<[WriteIDiv64], (instregex "IDIV64r")>; + +//-- Logic instructions --// + +// AND OR XOR. +// m,r/i. +def : InstRW<[Write2P0156_2P237_P4], + (instregex "(AND|OR|XOR)(8|16|32|64)m(r|i)", + "(AND|OR|XOR)(8|16|32|64)mi8", "(AND|OR|XOR)64mi32")>; + +// SHR SHL SAR. +// m,i. +def WriteShiftRMW : SchedWriteRes<[HWPort06, HWPort237, HWPort4]> { + let NumMicroOps = 4; + let ResourceCycles = [2, 1, 1]; +} +def : InstRW<[WriteShiftRMW], (instregex "S(A|H)(R|L)(8|16|32|64)m(i|1)")>; + +// r,cl. +def : InstRW<[Write3P06_Lat2], (instregex "S(A|H)(R|L)(8|16|32|64)rCL")>; + +// m,cl. +def WriteShiftClLdRMW : SchedWriteRes<[HWPort06, HWPort23, HWPort4]> { + let NumMicroOps = 6; + let ResourceCycles = [3, 2, 1]; +} +def : InstRW<[WriteShiftClLdRMW], (instregex "S(A|H)(R|L)(8|16|32|64)mCL")>; + +// ROR ROL. +// r,1. +def : InstRW<[Write2P06], (instregex "RO(R|L)(8|16|32|64)r1")>; + +// m,i. +def WriteRotateRMW : SchedWriteRes<[HWPort06, HWPort237, HWPort4]> { + let NumMicroOps = 5; + let ResourceCycles = [2, 2, 1]; +} +def : InstRW<[WriteRotateRMW], (instregex "RO(R|L)(8|16|32|64)mi")>; + +// r,cl. +def : InstRW<[Write3P06_Lat2], (instregex "RO(R|L)(8|16|32|64)rCL")>; + +// m,cl. +def WriteRotateRMWCL : SchedWriteRes<[]> { + let NumMicroOps = 6; +} +def : InstRW<[WriteRotateRMWCL], (instregex "RO(R|L)(8|16|32|64)mCL")>; + +// RCR RCL. +// r,1. +def WriteRCr1 : SchedWriteRes<[HWPort06, HWPort0156]> { + let Latency = 2; + let NumMicroOps = 3; + let ResourceCycles = [2, 1]; +} +def : InstRW<[WriteRCr1], (instregex "RC(R|L)(8|16|32|64)r1")>; + +// m,1. +def WriteRCm1 : SchedWriteRes<[]> { + let NumMicroOps = 6; +} +def : InstRW<[WriteRCm1], (instregex "RC(R|L)(8|16|32|64)m1")>; + +// r,i. +def WriteRCri : SchedWriteRes<[HWPort0156]> { + let Latency = 6; + let NumMicroOps = 8; +} +def : InstRW<[WriteRCri], (instregex "RC(R|L)(8|16|32|64)r(i|CL)")>; + +// m,i. +def WriteRCmi : SchedWriteRes<[]> { + let NumMicroOps = 11; +} +def : InstRW<[WriteRCmi], (instregex "RC(R|L)(8|16|32|64)m(i|CL)")>; + +// SHRD SHLD. +// r,r,i. +def WriteShDrr : SchedWriteRes<[HWPort1]> { + let Latency = 3; +} +def : InstRW<[WriteShDrr], (instregex "SH(R|L)D(16|32|64)rri8")>; + +// m,r,i. +def WriteShDmr : SchedWriteRes<[]> { + let NumMicroOps = 5; +} +def : InstRW<[WriteShDmr], (instregex "SH(R|L)D(16|32|64)mri8")>; + +// r,r,cl. +def WriteShlDCL : SchedWriteRes<[HWPort0156]> { + let Latency = 3; + let NumMicroOps = 4; +} +def : InstRW<[WriteShlDCL], (instregex "SHLD(16|32|64)rrCL")>; + +// r,r,cl. +def WriteShrDCL : SchedWriteRes<[HWPort0156]> { + let Latency = 4; + let NumMicroOps = 4; +} +def : InstRW<[WriteShrDCL], (instregex "SHRD(16|32|64)rrCL")>; + +// m,r,cl. +def WriteShDmrCL : SchedWriteRes<[]> { + let NumMicroOps = 7; +} +def : InstRW<[WriteShDmrCL], (instregex "SH(R|L)D(16|32|64)mrCL")>; + +// BT. +// r,r/i. +def : InstRW<[WriteShift], (instregex "BT(16|32|64)r(r|i8)")>; + +// m,r. +def WriteBTmr : SchedWriteRes<[]> { + let NumMicroOps = 10; +} +def : InstRW<[WriteBTmr], (instregex "BT(16|32|64)mr")>; + +// m,i. 
+def : InstRW<[WriteShiftLd], (instregex "BT(16|32|64)mi8")>; + +// BTR BTS BTC. +// r,r,i. +def : InstRW<[WriteShift], (instregex "BT(R|S|C)(16|32|64)r(r|i8)")>; + +// m,r. +def WriteBTRSCmr : SchedWriteRes<[]> { + let NumMicroOps = 11; +} +def : InstRW<[WriteBTRSCmr], (instregex "BT(R|S|C)(16|32|64)mr")>; + +// m,i. +def : InstRW<[WriteShiftLd], (instregex "BT(R|S|C)(16|32|64)mi8")>; + +// BSF BSR. +// r,r. +def : InstRW<[WriteP1_Lat3], (instregex "BS(R|F)(16|32|64)rr")>; +// r,m. +def : InstRW<[WriteP1_Lat3Ld], (instregex "BS(R|F)(16|32|64)rm")>; + +// SETcc. +// r. +def : InstRW<[WriteShift], + (instregex "SET(O|NO|B|AE|E|NE|BE|A|S|NS|P|NP|L|GE|LE|G)r")>; +// m. +def WriteSetCCm : SchedWriteRes<[HWPort06, HWPort237, HWPort4]> { + let NumMicroOps = 3; +} +def : InstRW<[WriteSetCCm], + (instregex "SET(O|NO|B|AE|E|NE|BE|A|S|NS|P|NP|L|GE|LE|G)m")>; + +// CLD STD. +def WriteCldStd : SchedWriteRes<[HWPort15, HWPort6]> { + let NumMicroOps = 3; +} +def : InstRW<[WriteCldStd], (instregex "STD", "CLD")>; + +// LZCNT TZCNT. +// r,r. +def : InstRW<[WriteP1_Lat3], (instregex "(L|TZCNT)(16|32|64)rr")>; +// r,m. +def : InstRW<[WriteP1_Lat3Ld], (instregex "(L|TZCNT)(16|32|64)rm")>; + +// ANDN. +// r,r. +def : InstRW<[WriteP15], (instregex "ANDN(32|64)rr")>; +// r,m. +def : InstRW<[WriteP15Ld], (instregex "ANDN(32|64)rm")>; + +// BLSI BLSMSK BLSR. +// r,r. +def : InstRW<[WriteP15], (instregex "BLS(I|MSK|R)(32|64)rr")>; +// r,m. +def : InstRW<[WriteP15Ld], (instregex "BLS(I|MSK|R)(32|64)rm")>; + +// BEXTR. +// r,r,r. +def : InstRW<[Write2P0156_Lat2], (instregex "BEXTR(32|64)rr")>; +// r,m,r. +def : InstRW<[Write2P0156_Lat2Ld], (instregex "BEXTR(32|64)rm")>; + +// BZHI. +// r,r,r. +def : InstRW<[WriteP15], (instregex "BZHI(32|64)rr")>; +// r,m,r. +def : InstRW<[WriteP15Ld], (instregex "BZHI(32|64)rm")>; + +// PDEP PEXT. +// r,r,r. +def : InstRW<[WriteP1_Lat3], (instregex "PDEP(32|64)rr", "PEXT(32|64)rr")>; +// r,m,r. +def : InstRW<[WriteP1_Lat3Ld], (instregex "PDEP(32|64)rm", "PEXT(32|64)rm")>; + +//-- Control transfer instructions --// + +// J(E|R)CXZ. +def WriteJCXZ : SchedWriteRes<[HWPort0156, HWPort6]> { + let NumMicroOps = 2; +} +def : InstRW<[WriteJCXZ], (instregex "JCXZ", "JECXZ_(32|64)", "JRCXZ")>; + +// LOOP. +def WriteLOOP : SchedWriteRes<[]> { + let NumMicroOps = 7; +} +def : InstRW<[WriteLOOP], (instregex "LOOP")>; + +// LOOP(N)E +def WriteLOOPE : SchedWriteRes<[]> { + let NumMicroOps = 11; +} +def : InstRW<[WriteLOOPE], (instregex "LOOPE", "LOOPNE")>; + +// CALL. +// r. +def WriteCALLr : SchedWriteRes<[HWPort237, HWPort4, HWPort6]> { + let NumMicroOps = 3; +} +def : InstRW<[WriteCALLr], (instregex "CALL(16|32)r")>; + +// m. +def WriteCALLm : SchedWriteRes<[HWPort237, HWPort4, HWPort6]> { + let NumMicroOps = 4; + let ResourceCycles = [2, 1, 1]; +} +def : InstRW<[WriteCALLm], (instregex "CALL(16|32)m")>; + +// RET. +def WriteRET : SchedWriteRes<[HWPort237, HWPort6]> { + let NumMicroOps = 2; +} +def : InstRW<[WriteRET], (instregex "RET(L|Q|W)", "LRET(L|Q|W)")>; + +// i. +def WriteRETI : SchedWriteRes<[HWPort23, HWPort6, HWPort015]> { + let NumMicroOps = 4; + let ResourceCycles = [1, 2, 1]; +} +def : InstRW<[WriteRETI], (instregex "RETI(L|Q|W)", "LRETI(L|Q|W)")>; + +// BOUND. +// r,m. +def WriteBOUND : SchedWriteRes<[]> { + let NumMicroOps = 15; +} +def : InstRW<[WriteBOUND], (instregex "BOUNDS(16|32)rm")>; + +// INTO. +def WriteINTO : SchedWriteRes<[]> { + let NumMicroOps = 4; +} +def : InstRW<[WriteINTO], (instregex "INTO")>; + +//-- String instructions --// + +// LODSB/W. 
+def : InstRW<[Write2P0156_P23], (instregex "LODS(B|W)")>; + +// LODSD/Q. +def : InstRW<[WriteP0156_P23], (instregex "LODS(L|Q)")>; + +// STOS. +def WriteSTOS : SchedWriteRes<[HWPort23, HWPort0156, HWPort4]> { + let NumMicroOps = 3; +} +def : InstRW<[WriteSTOS], (instregex "STOS(B|L|Q|W)")>; + +// MOVS. +def WriteMOVS : SchedWriteRes<[HWPort23, HWPort4, HWPort0156]> { + let Latency = 4; + let NumMicroOps = 5; + let ResourceCycles = [2, 1, 2]; +} +def : InstRW<[WriteMOVS], (instregex "MOVS(B|L|Q|W)")>; + +// SCAS. +def : InstRW<[Write2P0156_P23], (instregex "SCAS(B|W|L|Q)")>; + +// CMPS. +def WriteCMPS : SchedWriteRes<[HWPort23, HWPort0156]> { + let Latency = 4; + let NumMicroOps = 5; + let ResourceCycles = [2, 3]; +} +def : InstRW<[WriteCMPS], (instregex "CMPS(B|L|Q|W)")>; + +//-- Synchronization instructions --// + +// XADD. +def WriteXADD : SchedWriteRes<[]> { + let NumMicroOps = 5; +} +def : InstRW<[WriteXADD], (instregex "XADD(8|16|32|64)rm")>; + +// CMPXCHG. +def WriteCMPXCHG : SchedWriteRes<[]> { + let NumMicroOps = 6; +} +def : InstRW<[WriteCMPXCHG], (instregex "CMPXCHG(8|16|32|64)rm")>; + +// CMPXCHG8B. +def WriteCMPXCHG8B : SchedWriteRes<[]> { + let NumMicroOps = 15; +} +def : InstRW<[WriteCMPXCHG8B], (instregex "CMPXCHG8B")>; + +// CMPXCHG16B. +def WriteCMPXCHG16B : SchedWriteRes<[]> { + let NumMicroOps = 22; +} +def : InstRW<[WriteCMPXCHG16B], (instregex "CMPXCHG16B")>; + +//-- Other --// + +// PAUSE. +def WritePAUSE : SchedWriteRes<[HWPort05, HWPort6]> { + let NumMicroOps = 5; + let ResourceCycles = [1, 3]; +} +def : InstRW<[WritePAUSE], (instregex "PAUSE")>; + +// LEAVE. +def : InstRW<[Write2P0156_P23], (instregex "LEAVE")>; + +// XGETBV. +def WriteXGETBV : SchedWriteRes<[]> { + let NumMicroOps = 8; +} +def : InstRW<[WriteXGETBV], (instregex "XGETBV")>; + +// RDTSC. +def WriteRDTSC : SchedWriteRes<[]> { + let NumMicroOps = 15; +} +def : InstRW<[WriteRDTSC], (instregex "RDTSC")>; + +// RDPMC. +def WriteRDPMC : SchedWriteRes<[]> { + let NumMicroOps = 34; +} +def : InstRW<[WriteRDPMC], (instregex "RDPMC")>; + +// RDRAND. +def WriteRDRAND : SchedWriteRes<[HWPort23, HWPort015]> { + let NumMicroOps = 17; + let ResourceCycles = [1, 16]; +} +def : InstRW<[WriteRDRAND], (instregex "RDRAND(16|32|64)r")>; + +//=== Floating Point x87 Instructions ===// +//-- Move instructions --// + +// FLD. +// m80. +def : InstRW<[WriteP01], (instregex "LD_Frr")>; + +def WriteLD_F80m : SchedWriteRes<[HWPort01, HWPort23]> { + let Latency = 4; + let NumMicroOps = 4; + let ResourceCycles = [2, 2]; +} +def : InstRW<[WriteLD_F80m], (instregex "LD_F80m")>; + +// FBLD. +// m80. +def WriteFBLD : SchedWriteRes<[]> { + let Latency = 47; + let NumMicroOps = 43; +} +def : InstRW<[WriteFBLD], (instregex "FBLDm")>; + +// FST(P). +// r. +def : InstRW<[WriteP01], (instregex "ST_(F|FP)rr")>; + +// m80. +def WriteST_FP80m : SchedWriteRes<[HWPort0156, HWPort23, HWPort4]> { + let NumMicroOps = 7; + let ResourceCycles = [3, 2, 2]; +} +def : InstRW<[WriteST_FP80m], (instregex "ST_FP80m")>; + +// FBSTP. +// m80. +def WriteFBSTP : SchedWriteRes<[]> { + let NumMicroOps = 226; +} +def : InstRW<[WriteFBSTP], (instregex "FBSTPm")>; + +// FXCHG. +def : InstRW<[WriteNop], (instregex "XCH_F")>; + +// FILD. +def WriteFILD : SchedWriteRes<[HWPort01, HWPort23]> { + let Latency = 6; + let NumMicroOps = 2; +} +def : InstRW<[WriteFILD], (instregex "ILD_F(16|32|64)m")>; + +// FIST(P) FISTTP. 
+def WriteFIST : SchedWriteRes<[HWPort1, HWPort23, HWPort4]> { + let Latency = 7; + let NumMicroOps = 3; +} +def : InstRW<[WriteFIST], (instregex "IST_(F|FP)(16|32)m")>; + +// FLDZ. +def : InstRW<[WriteP01], (instregex "LD_F0")>; + +// FLD1. +def : InstRW<[Write2P01], (instregex "LD_F1")>; + +// FLDPI FLDL2E etc. +def : InstRW<[Write2P01], (instregex "FLDPI", "FLDL2(T|E)" "FLDL(G|N)2")>; + +// FCMOVcc. +def WriteFCMOVcc : SchedWriteRes<[HWPort0, HWPort5]> { + let Latency = 2; + let NumMicroOps = 3; + let ResourceCycles = [2, 1]; +} +def : InstRW<[WriteFCMOVcc], (instregex "CMOV(B|BE|P|NB|NBE|NE|NP)_F")>; + +// FNSTSW. +// AX. +def WriteFNSTSW : SchedWriteRes<[HWPort0, HWPort0156]> { + let NumMicroOps = 2; +} +def : InstRW<[WriteFNSTSW], (instregex "FNSTSW16r")>; + +// m16. +def WriteFNSTSWm : SchedWriteRes<[HWPort0, HWPort4, HWPort237]> { + let Latency = 6; + let NumMicroOps = 3; +} +def : InstRW<[WriteFNSTSWm], (instregex "FNSTSWm")>; + +// FLDCW. +def WriteFLDCW : SchedWriteRes<[HWPort01, HWPort23, HWPort6]> { + let Latency = 7; + let NumMicroOps = 3; +} +def : InstRW<[WriteFLDCW], (instregex "FLDCW16m")>; + +// FNSTCW. +def WriteFNSTCW : SchedWriteRes<[HWPort237, HWPort4, HWPort6]> { + let NumMicroOps = 3; +} +def : InstRW<[WriteFNSTCW], (instregex "FNSTCW16m")>; + +// FINCSTP FDECSTP. +def : InstRW<[WriteP01], (instregex "FINCSTP", "FDECSTP")>; + +// FFREE. +def : InstRW<[WriteP01], (instregex "FFREE")>; + +// FNSAVE. +def WriteFNSAVE : SchedWriteRes<[]> { + let NumMicroOps = 147; +} +def : InstRW<[WriteFNSAVE], (instregex "FSAVEm")>; + +// FRSTOR. +def WriteFRSTOR : SchedWriteRes<[]> { + let NumMicroOps = 90; +} +def : InstRW<[WriteFRSTOR], (instregex "FRSTORm")>; + +//-- Arithmetic instructions --// + +// FABS. +def : InstRW<[WriteP0], (instregex "ABS_F")>; + +// FCHS. +def : InstRW<[WriteP0], (instregex "CHS_F")>; + +// FCOM(P) FUCOM(P). +// r. +def : InstRW<[WriteP1], (instregex "COM_FST0r", "COMP_FST0r", "UCOM_Fr", + "UCOM_FPr")>; +// m. +def : InstRW<[WriteP1_P23], (instregex "FCOM(32|64)m", "FCOMP(32|64)m")>; + +// FCOMPP FUCOMPP. +// r. +def : InstRW<[Write2P01], (instregex "FCOMPP", "UCOM_FPPr")>; + +// FCOMI(P) FUCOMI(P). +// m. +def : InstRW<[Write3P01], (instregex "COM_FIr", "COM_FIPr", "UCOM_FIr", + "UCOM_FIPr")>; + +// FICOM(P). +def : InstRW<[Write2P1_P23], (instregex "FICOM(16|32)m", "FICOMP(16|32)m")>; + +// FTST. +def : InstRW<[WriteP1], (instregex "TST_F")>; + +// FXAM. +def : InstRW<[Write2P1], (instregex "FXAM")>; + +// FPREM. +def WriteFPREM : SchedWriteRes<[]> { + let Latency = 19; + let NumMicroOps = 28; +} +def : InstRW<[WriteFPREM], (instregex "FPREM")>; + +// FPREM1. +def WriteFPREM1 : SchedWriteRes<[]> { + let Latency = 27; + let NumMicroOps = 41; +} +def : InstRW<[WriteFPREM1], (instregex "FPREM1")>; + +// FRNDINT. +def WriteFRNDINT : SchedWriteRes<[]> { + let Latency = 11; + let NumMicroOps = 17; +} +def : InstRW<[WriteFRNDINT], (instregex "FRNDINT")>; + +//-- Math instructions --// + +// FSCALE. +def WriteFSCALE : SchedWriteRes<[]> { + let Latency = 75; // 49-125 + let NumMicroOps = 50; // 25-75 +} +def : InstRW<[WriteFSCALE], (instregex "FSCALE")>; + +// FXTRACT. +def WriteFXTRACT : SchedWriteRes<[]> { + let Latency = 15; + let NumMicroOps = 17; +} +def : InstRW<[WriteFXTRACT], (instregex "FXTRACT")>; + +//-- Other instructions --// + +// FNOP. +def : InstRW<[WriteP01], (instregex "FNOP")>; + +// WAIT. +def : InstRW<[Write2P01], (instregex "WAIT")>; + +// FNCLEX. +def : InstRW<[Write5P0156], (instregex "FNCLEX")>; + +// FNINIT. 
+def WriteFNINIT : SchedWriteRes<[]> { + let NumMicroOps = 26; +} +def : InstRW<[WriteFNINIT], (instregex "FNINIT")>; + +//=== Integer MMX and XMM Instructions ===// +//-- Move instructions --// + +// MOVD. +// r32/64 <- (x)mm. +def : InstRW<[WriteP0], (instregex "MMX_MOVD64grr", "MMX_MOVD64from64rr", + "VMOVPDI2DIrr", "MOVPDI2DIrr")>; + +// (x)mm <- r32/64. +def : InstRW<[WriteP5], (instregex "MMX_MOVD64rr", "MMX_MOVD64to64rr", + "VMOVDI2PDIrr", "MOVDI2PDIrr")>; + +// MOVQ. +// r64 <- (x)mm. +def : InstRW<[WriteP0], (instregex "VMOVPQIto64rr")>; + +// (x)mm <- r64. +def : InstRW<[WriteP5], (instregex "VMOV64toPQIrr", "VMOVZQI2PQIrr")>; + +// (x)mm <- (x)mm. +def : InstRW<[WriteP015], (instregex "MMX_MOVQ64rr")>; + +// (V)MOVDQA/U. +// x <- x. +def : InstRW<[WriteP015], (instregex "MOVDQ(A|U)rr", "VMOVDQ(A|U)rr", + "MOVDQ(A|U)rr_REV", "VMOVDQ(A|U)rr_REV", + "VMOVDQ(A|U)Yrr", "VMOVDQ(A|U)Yrr_REV")>; + +// MOVDQ2Q. +def : InstRW<[WriteP01_P5], (instregex "MMX_MOVDQ2Qrr")>; + +// MOVQ2DQ. +def : InstRW<[WriteP015], (instregex "MMX_MOVQ2DQrr")>; + + +// PACKSSWB/DW. +// mm <- mm. +def WriteMMXPACKSSrr : SchedWriteRes<[HWPort5]> { + let Latency = 2; + let NumMicroOps = 3; + let ResourceCycles = [3]; +} +def : InstRW<[WriteMMXPACKSSrr], (instregex "MMX_PACKSSDWirr", + "MMX_PACKSSWBirr", "MMX_PACKUSWBirr")>; + +// mm <- m64. +def WriteMMXPACKSSrm : SchedWriteRes<[HWPort23, HWPort5]> { + let Latency = 4; + let NumMicroOps = 3; + let ResourceCycles = [1, 3]; +} +def : InstRW<[WriteMMXPACKSSrm], (instregex "MMX_PACKSSDWirm", + "MMX_PACKSSWBirm", "MMX_PACKUSWBirm")>; + +// VPMOVSX/ZX BW BD BQ DW DQ. +// y <- x. +def WriteVPMOVSX : SchedWriteRes<[HWPort5]> { + let Latency = 3; + let NumMicroOps = 1; +} +def : InstRW<[WriteVPMOVSX], (instregex "VPMOV(SX|ZX)(BW|BQ|DW|DQ)Yrr")>; + +// PBLENDW. +// x,x,i / v,v,v,i +def WritePBLENDWr : SchedWriteRes<[HWPort5]>; +def : InstRW<[WritePBLENDWr], (instregex "(V?)PBLENDW(Y?)rri")>; + +// x,m,i / v,v,m,i +def WritePBLENDWm : SchedWriteRes<[HWPort5, HWPort23]> { + let NumMicroOps = 2; + let Latency = 4; + let ResourceCycles = [1, 1]; +} +def : InstRW<[WritePBLENDWm, ReadAfterLd], (instregex "(V?)PBLENDW(Y?)rmi")>; + +// VPBLENDD. +// v,v,v,i. +def WriteVPBLENDDr : SchedWriteRes<[HWPort015]>; +def : InstRW<[WriteVPBLENDDr], (instregex "VPBLENDD(Y?)rri")>; + +// v,v,m,i +def WriteVPBLENDDm : SchedWriteRes<[HWPort015, HWPort23]> { + let NumMicroOps = 2; + let Latency = 4; + let ResourceCycles = [1, 1]; +} +def : InstRW<[WriteVPBLENDDm, ReadAfterLd], (instregex "VPBLENDD(Y?)rmi")>; + +// MASKMOVQ. +def WriteMASKMOVQ : SchedWriteRes<[HWPort0, HWPort4, HWPort23]> { + let Latency = 13; + let NumMicroOps = 4; + let ResourceCycles = [1, 1, 2]; +} +def : InstRW<[WriteMASKMOVQ], (instregex "MMX_MASKMOVQ(64)?")>; + +// MASKMOVDQU. +def WriteMASKMOVDQU : SchedWriteRes<[HWPort04, HWPort56, HWPort23]> { + let Latency = 14; + let NumMicroOps = 10; + let ResourceCycles = [4, 2, 4]; +} +def : InstRW<[WriteMASKMOVDQU], (instregex "(V?)MASKMOVDQU(64)?")>; + +// VPMASKMOV D/Q. +// v,v,m. +def WriteVPMASKMOVr : SchedWriteRes<[HWPort5, HWPort23]> { + let Latency = 4; + let NumMicroOps = 3; + let ResourceCycles = [2, 1]; +} +def : InstRW<[WriteVPMASKMOVr, ReadAfterLd], + (instregex "VPMASKMOV(D|Q)(Y?)rm")>; + +// m, v,v. +def WriteVPMASKMOVm : SchedWriteRes<[HWPort0, HWPort1, HWPort4, HWPort23]> { + let Latency = 13; + let NumMicroOps = 4; + let ResourceCycles = [1, 1, 1, 1]; +} +def : InstRW<[WriteVPMASKMOVm], (instregex "VPMASKMOV(D|Q)(Y?)mr")>; + +// PMOVMSKB. 
+def WritePMOVMSKB : SchedWriteRes<[HWPort0]> { + let Latency = 3; +} +def : InstRW<[WritePMOVMSKB], (instregex "(V|MMX_)?PMOVMSKB(Y?)rr")>; + +// PEXTR B/W/D/Q. +// r32,x,i. +def WritePEXTRr : SchedWriteRes<[HWPort0, HWPort5]> { + let Latency = 2; + let NumMicroOps = 2; + let ResourceCycles = [1, 1]; +} +def : InstRW<[WritePEXTRr], (instregex "PEXTR(B|W|D|Q)rr", "MMX_PEXTRWirri")>; + +// m8,x,i. +def WritePEXTRm : SchedWriteRes<[HWPort23, HWPort4, HWPort5]> { + let NumMicroOps = 3; + let ResourceCycles = [1, 1, 1]; +} +def : InstRW<[WritePEXTRm], (instregex "PEXTR(B|W|D|Q)mr")>; + +// VPBROADCAST B/W. +// x, m8/16. +def WriteVPBROADCAST128Ld : SchedWriteRes<[HWPort01, HWPort23, HWPort5]> { + let Latency = 5; + let NumMicroOps = 3; + let ResourceCycles = [1, 1, 1]; +} +def : InstRW<[WriteVPBROADCAST128Ld, ReadAfterLd], + (instregex "VPBROADCAST(B|W)rm")>; + +// y, m8/16 +def WriteVPBROADCAST256Ld : SchedWriteRes<[HWPort01, HWPort23, HWPort5]> { + let Latency = 7; + let NumMicroOps = 3; + let ResourceCycles = [1, 1, 1]; +} +def : InstRW<[WriteVPBROADCAST256Ld, ReadAfterLd], + (instregex "VPBROADCAST(B|W)Yrm")>; + +// VPGATHERDD. +// x. +def WriteVPGATHERDD128 : SchedWriteRes<[]> { + let NumMicroOps = 20; +} +def : InstRW<[WriteVPGATHERDD128, ReadAfterLd], (instregex "VPGATHERDDrm")>; + +// y. +def WriteVPGATHERDD256 : SchedWriteRes<[]> { + let NumMicroOps = 34; +} +def : InstRW<[WriteVPGATHERDD256, ReadAfterLd], (instregex "VPGATHERDDYrm")>; + +// VPGATHERQD. +// x. +def WriteVPGATHERQD128 : SchedWriteRes<[]> { + let NumMicroOps = 15; +} +def : InstRW<[WriteVPGATHERQD128, ReadAfterLd], (instregex "VPGATHERQDrm")>; + +// y. +def WriteVPGATHERQD256 : SchedWriteRes<[]> { + let NumMicroOps = 22; +} +def : InstRW<[WriteVPGATHERQD256, ReadAfterLd], (instregex "VPGATHERQDYrm")>; + +// VPGATHERDQ. +// x. +def WriteVPGATHERDQ128 : SchedWriteRes<[]> { + let NumMicroOps = 12; +} +def : InstRW<[WriteVPGATHERDQ128, ReadAfterLd], (instregex "VPGATHERDQrm")>; + +// y. +def WriteVPGATHERDQ256 : SchedWriteRes<[]> { + let NumMicroOps = 20; +} +def : InstRW<[WriteVPGATHERDQ256, ReadAfterLd], (instregex "VPGATHERDQYrm")>; + +// VPGATHERQQ. +// x. +def WriteVPGATHERQQ128 : SchedWriteRes<[]> { + let NumMicroOps = 14; +} +def : InstRW<[WriteVPGATHERQQ128, ReadAfterLd], (instregex "VPGATHERQQrm")>; + +// y. +def WriteVPGATHERQQ256 : SchedWriteRes<[]> { + let NumMicroOps = 22; +} +def : InstRW<[WriteVPGATHERQQ256, ReadAfterLd], (instregex "VPGATHERQQYrm")>; + +//-- Arithmetic instructions --// + +// PHADD|PHSUB (S) W/D. +// v <- v,v. +def WritePHADDSUBr : SchedWriteRes<[HWPort1, HWPort5]> { + let Latency = 3; + let NumMicroOps = 3; + let ResourceCycles = [1, 2]; +} +def : InstRW<[WritePHADDSUBr], (instregex "MMX_PHADD(W?)rr64", + "MMX_PHADDSWrr64", + "MMX_PHSUB(W|D)rr64", + "MMX_PHSUBSWrr64", + "(V?)PH(ADD|SUB)(W|D)(Y?)rr", + "(V?)PH(ADD|SUB)SWrr(256)?")>; + +// v <- v,m. +def WritePHADDSUBm : SchedWriteRes<[HWPort1, HWPort5, HWPort23]> { + let Latency = 6; + let NumMicroOps = 3; + let ResourceCycles = [1, 2, 1]; +} +def : InstRW<[WritePHADDSUBm, ReadAfterLd], + (instregex "MMX_PHADD(W?)rm64", + "MMX_PHADDSWrm64", + "MMX_PHSUB(W|D)rm64", + "MMX_PHSUBSWrm64", + "(V?)PH(ADD|SUB)(W|D)(Y?)rm", + "(V?)PH(ADD|SUB)SWrm(128|256)?")>; + +// PCMPGTQ. +// v <- v,v. +def WritePCMPGTQr : SchedWriteRes<[HWPort0]> { + let Latency = 5; + let NumMicroOps = 1; +} +def : InstRW<[WritePCMPGTQr], (instregex "(V?)PCMPGTQ(Y?)rr")>; + +// v <- v,m. 
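The VPGATHER entries above are modeled purely by micro-op count, with no port breakdown, so the useful comparison is micro-ops per gathered element. The table below restates those counts; the element counts per encoding are architectural facts added here for illustration and are not part of the scheduling model itself.

#include <cstdio>

struct Gather { const char *Name; unsigned MicroOps; unsigned Elements; };

int main() {
  // uop counts from the Haswell entries above; elements per gather form.
  const Gather G[] = {
      {"VPGATHERDDrm",  20, 4}, {"VPGATHERDDYrm", 34, 8},
      {"VPGATHERQDrm",  15, 2}, {"VPGATHERQDYrm", 22, 4},
      {"VPGATHERDQrm",  12, 2}, {"VPGATHERDQYrm", 20, 4},
      {"VPGATHERQQrm",  14, 2}, {"VPGATHERQQYrm", 22, 4},
  };
  for (const Gather &X : G)
    std::printf("%-15s %2u uops for %u elements (%.1f uops/element)\n",
                X.Name, X.MicroOps, X.Elements,
                double(X.MicroOps) / X.Elements);
}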
+def WritePCMPGTQm : SchedWriteRes<[HWPort0, HWPort23]> { + let Latency = 5; + let NumMicroOps = 2; + let ResourceCycles = [1, 1]; +} +def : InstRW<[WritePCMPGTQm, ReadAfterLd], (instregex "(V?)PCMPGTQ(Y?)rm")>; + +// PMULLD. +// x,x / y,y,y. +def WritePMULLDr : SchedWriteRes<[HWPort0]> { + let Latency = 10; + let NumMicroOps = 2; + let ResourceCycles = [2]; +} +def : InstRW<[WritePMULLDr], (instregex "(V?)PMULLD(Y?)rr")>; + +// x,m / y,y,m. +def WritePMULLDm : SchedWriteRes<[HWPort0, HWPort23]> { + let Latency = 10; + let NumMicroOps = 3; + let ResourceCycles = [2, 1]; +} +def : InstRW<[WritePMULLDm, ReadAfterLd], (instregex "(V?)PMULLD(Y?)rm")>; + +//-- Logic instructions --// + +// PTEST. +// v,v. +def WritePTESTr : SchedWriteRes<[HWPort0, HWPort5]> { + let Latency = 2; + let NumMicroOps = 2; + let ResourceCycles = [1, 1]; +} +def : InstRW<[WritePTESTr], (instregex "(V?)PTEST(Y?)rr")>; + +// v,m. +def WritePTESTm : SchedWriteRes<[HWPort0, HWPort5, HWPort23]> { + let Latency = 6; + let NumMicroOps = 3; + let ResourceCycles = [1, 1, 1]; +} +def : InstRW<[WritePTESTr], (instregex "(V?)PTEST(Y?)rm")>; + +// PSLL,PSRL,PSRA W/D/Q. +// x,x / v,v,x. +def WritePShift : SchedWriteRes<[HWPort0, HWPort5]> { + let Latency = 2; + let NumMicroOps = 2; + let ResourceCycles = [1, 1]; +} +def : InstRW<[WritePShift], (instregex "(V?)PS(LL|RL|RA)(W|D|Q)(Y?)rr")>; + +// PSLL,PSRL DQ. +def : InstRW<[WriteP5], (instregex "(V?)PS(R|L)LDQ(Y?)ri")>; + +//-- Other --// + +// EMMS. +def WriteEMMS : SchedWriteRes<[]> { + let Latency = 13; + let NumMicroOps = 31; +} +def : InstRW<[WriteEMMS], (instregex "MMX_EMMS")>; + +//=== Floating Point XMM and YMM Instructions ===// +//-- Move instructions --// + +// MOVMSKP S/D. +// r32 <- x. +def WriteMOVMSKPr : SchedWriteRes<[HWPort0]> { + let Latency = 3; +} +def : InstRW<[WriteMOVMSKPr], (instregex "(V?)MOVMSKP(S|D)rr")>; + +// r32 <- y. +def WriteVMOVMSKPYr : SchedWriteRes<[HWPort0]> { + let Latency = 2; +} +def : InstRW<[WriteVMOVMSKPYr], (instregex "VMOVMSKP(S|D)Yrr")>; + +// VPERM2F128. +def : InstRW<[WriteFShuffle256], (instregex "VPERM2F128rr")>; +def : InstRW<[WriteFShuffle256Ld, ReadAfterLd], (instregex "VPERM2F128rm")>; + +// BLENDVP S/D. +def : InstRW<[WriteFVarBlend], (instregex "BLENDVP(S|D)rr0")>; +def : InstRW<[WriteFVarBlendLd, ReadAfterLd], (instregex "BLENDVP(S|D)rm0")>; + +// VBROADCASTF128. +def : InstRW<[WriteLoad], (instregex "VBROADCASTF128")>; + +// EXTRACTPS. +// r32,x,i. +def WriteEXTRACTPSr : SchedWriteRes<[HWPort0, HWPort5]> { + let NumMicroOps = 2; + let ResourceCycles = [1, 1]; +} +def : InstRW<[WriteEXTRACTPSr], (instregex "(V?)EXTRACTPSrr")>; + +// m32,x,i. +def WriteEXTRACTPSm : SchedWriteRes<[HWPort0, HWPort5, HWPort23]> { + let Latency = 4; + let NumMicroOps = 3; + let ResourceCycles = [1, 1, 1]; +} +def : InstRW<[WriteEXTRACTPSm], (instregex "(V?)EXTRACTPSmr")>; + +// VEXTRACTF128. +// x,y,i. +def : InstRW<[WriteFShuffle256], (instregex "VEXTRACTF128rr")>; + +// m128,y,i. +def WriteVEXTRACTF128m : SchedWriteRes<[HWPort23, HWPort4]> { + let Latency = 4; + let NumMicroOps = 2; + let ResourceCycles = [1, 1]; +} +def : InstRW<[WriteVEXTRACTF128m], (instregex "VEXTRACTF128mr")>; + +// VINSERTF128. +// y,y,x,i. +def : InstRW<[WriteFShuffle256], (instregex "VINSERTF128rr")>; + +// y,y,m128,i. +def WriteVINSERTF128m : SchedWriteRes<[HWPort015, HWPort23]> { + let Latency = 4; + let NumMicroOps = 2; + let ResourceCycles = [1, 1]; +} +def : InstRW<[WriteFShuffle256, ReadAfterLd], (instregex "VINSERTF128rm")>; + +// VMASKMOVP S/D. +// v,v,m. 
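Taken together, per-opcode tables like the ones above let the machine scheduler estimate which execution port saturates first for a block of code. The sketch below is a simplified, stand-alone version of that bookkeeping over a made-up instruction mix; the per-instruction port usage is copied from the Haswell entries above (PMULLD holds port 0 for two cycles, PTEST and the packed shifts use ports 0 and 5).

#include <cstdio>
#include <map>
#include <string>
#include <utility>
#include <vector>

using Ports = std::vector<std::pair<std::string, unsigned>>; // port -> cycles

int main() {
  std::map<std::string, Ports> Usage = {
      {"PMULLDrr", {{"HWPort0", 2}}},
      {"PTESTrr",  {{"HWPort0", 1}, {"HWPort5", 1}}},
      {"PSLLDrr",  {{"HWPort0", 1}, {"HWPort5", 1}}},
  };
  // A hypothetical straight-line sequence.
  std::vector<std::string> Block = {"PMULLDrr", "PTESTrr", "PSLLDrr", "PMULLDrr"};

  std::map<std::string, unsigned> Pressure;
  for (const std::string &I : Block)
    for (const auto &PC : Usage[I])
      Pressure[PC.first] += PC.second;

  // The busiest port bounds how fast the block can repeat in steady state.
  for (const auto &P : Pressure)
    std::printf("%s: %u cycle(s) of work per iteration\n", P.first.c_str(),
                P.second);
}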
+def WriteVMASKMOVPrm : SchedWriteRes<[HWPort5, HWPort23]> { + let Latency = 4; + let NumMicroOps = 3; + let ResourceCycles = [2, 1]; +} +def : InstRW<[WriteVMASKMOVPrm], (instregex "VMASKMOVP(S|D)(Y?)rm")>; + +// m128,x,x. +def WriteVMASKMOVPmr : SchedWriteRes<[HWPort0, HWPort1, HWPort4, HWPort23]> { + let Latency = 13; + let NumMicroOps = 4; + let ResourceCycles = [1, 1, 1, 1]; +} +def : InstRW<[WriteVMASKMOVPmr], (instregex "VMASKMOVP(S|D)mr")>; + +// m256,y,y. +def WriteVMASKMOVPYmr : SchedWriteRes<[HWPort0, HWPort1, HWPort4, HWPort23]> { + let Latency = 14; + let NumMicroOps = 4; + let ResourceCycles = [1, 1, 1, 1]; +} +def : InstRW<[WriteVMASKMOVPYmr], (instregex "VMASKMOVP(S|D)Ymr")>; + +// VGATHERDPS. +// x. +def WriteVGATHERDPS128 : SchedWriteRes<[]> { + let NumMicroOps = 20; +} +def : InstRW<[WriteVGATHERDPS128, ReadAfterLd], (instregex "VGATHERDPSrm")>; + +// y. +def WriteVGATHERDPS256 : SchedWriteRes<[]> { + let NumMicroOps = 34; +} +def : InstRW<[WriteVGATHERDPS256, ReadAfterLd], (instregex "VGATHERDPSYrm")>; + +// VGATHERQPS. +// x. +def WriteVGATHERQPS128 : SchedWriteRes<[]> { + let NumMicroOps = 15; +} +def : InstRW<[WriteVGATHERQPS128, ReadAfterLd], (instregex "VGATHERQPSrm")>; + +// y. +def WriteVGATHERQPS256 : SchedWriteRes<[]> { + let NumMicroOps = 22; +} +def : InstRW<[WriteVGATHERQPS256, ReadAfterLd], (instregex "VGATHERQPSYrm")>; + +// VGATHERDPD. +// x. +def WriteVGATHERDPD128 : SchedWriteRes<[]> { + let NumMicroOps = 12; +} +def : InstRW<[WriteVGATHERDPD128, ReadAfterLd], (instregex "VGATHERDPDrm")>; + +// y. +def WriteVGATHERDPD256 : SchedWriteRes<[]> { + let NumMicroOps = 20; +} +def : InstRW<[WriteVGATHERDPD256, ReadAfterLd], (instregex "VGATHERDPDYrm")>; + +// VGATHERQPD. +// x. +def WriteVGATHERQPD128 : SchedWriteRes<[]> { + let NumMicroOps = 14; +} +def : InstRW<[WriteVGATHERQPD128, ReadAfterLd], (instregex "VGATHERQPDrm")>; + +// y. +def WriteVGATHERQPD256 : SchedWriteRes<[]> { + let NumMicroOps = 22; +} +def : InstRW<[WriteVGATHERQPD256, ReadAfterLd], (instregex "VGATHERQPDYrm")>; + +//-- Conversion instructions --// + +// CVTPD2PS. +// x,x. +def : InstRW<[WriteP1_P5_Lat4], (instregex "(V?)CVTPD2PSrr")>; + +// x,m128. +def : InstRW<[WriteP1_P5_Lat4Ld], (instregex "(V?)CVTPD2PS(X?)rm")>; + +// x,y. +def WriteCVTPD2PSYrr : SchedWriteRes<[HWPort1, HWPort5]> { + let Latency = 5; + let NumMicroOps = 2; + let ResourceCycles = [1, 1]; +} +def : InstRW<[WriteCVTPD2PSYrr], (instregex "(V?)CVTPD2PSYrr")>; + +// x,m256. +def WriteCVTPD2PSYrm : SchedWriteRes<[HWPort1, HWPort5, HWPort23]> { + let Latency = 9; + let NumMicroOps = 3; + let ResourceCycles = [1, 1, 1]; +} +def : InstRW<[WriteCVTPD2PSYrm], (instregex "(V?)CVTPD2PSYrm")>; + +// CVTSD2SS. +// x,x. +def : InstRW<[WriteP1_P5_Lat4], (instregex "(Int_)?(V)?CVTSD2SSrr")>; + +// x,m64. +def : InstRW<[WriteP1_P5_Lat4Ld], (instregex "(Int_)?(V)?CVTSD2SSrm")>; + +// CVTPS2PD. +// x,x. +def WriteCVTPS2PDrr : SchedWriteRes<[HWPort0, HWPort5]> { + let Latency = 2; + let NumMicroOps = 2; + let ResourceCycles = [1, 1]; +} +def : InstRW<[WriteCVTPS2PDrr], (instregex "(V?)CVTPS2PDrr")>; + +// x,m64. +// y,m128. +def WriteCVTPS2PDrm : SchedWriteRes<[HWPort0, HWPort23]> { + let Latency = 5; + let NumMicroOps = 2; + let ResourceCycles = [1, 1]; +} +def : InstRW<[WriteCVTPS2PDrm], (instregex "(V?)CVTPS2PD(Y?)rm")>; + +// y,x. 
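The conversion entries above come in register/memory pairs, with the memory form's latency raised to cover the folded load; note that the delta is not one fixed constant in this table. The quick check below only restates numbers already listed above (3 extra cycles for the 128-bit CVTPS2PD pair, 4 for the 256-bit CVTPD2PSY pair).

#include <cstdio>

struct Pair { const char *Name; unsigned RegLat, MemLat; };

int main() {
  // Latencies copied from the Haswell entries above.
  const Pair P[] = {
      {"CVTPD2PSYrr/rm", 5, 9},
      {"CVTPS2PDrr/rm",  2, 5},
  };
  for (const Pair &X : P)
    std::printf("%-16s reg %u, mem %u, load adds %u cycle(s)\n",
                X.Name, X.RegLat, X.MemLat, X.MemLat - X.RegLat);
}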
+def WriteVCVTPS2PDYrr : SchedWriteRes<[HWPort0, HWPort5]> { + let Latency = 5; + let NumMicroOps = 2; + let ResourceCycles = [1, 1]; +} +def : InstRW<[WriteVCVTPS2PDYrr], (instregex "VCVTPS2PDYrr")>; + +// CVTSS2SD. +// x,x. +def WriteCVTSS2SDrr : SchedWriteRes<[HWPort0, HWPort5]> { + let Latency = 2; + let NumMicroOps = 2; + let ResourceCycles = [1, 1]; +} +def : InstRW<[WriteCVTSS2SDrr], (instregex "(Int_)?(V?)CVTSS2SDrr")>; + +// x,m32. +def WriteCVTSS2SDrm : SchedWriteRes<[HWPort0, HWPort23]> { + let Latency = 5; + let NumMicroOps = 2; + let ResourceCycles = [1, 1]; +} +def : InstRW<[WriteCVTSS2SDrm], (instregex "(Int_)?(V?)CVTSS2SDrm")>; + +// CVTDQ2PD. +// x,x. +def : InstRW<[WriteP1_P5_Lat4], (instregex "(V)?CVTDQ2PDrr")>; + +// y,x. +def : InstRW<[WriteP1_P5_Lat6], (instregex "VCVTDQ2PDYrr")>; + +// CVT(T)PD2DQ. +// x,x. +def : InstRW<[WriteP1_P5_Lat4], (instregex "(V?)CVT(T?)PD2DQrr")>; +// x,m128. +def : InstRW<[WriteP1_P5_Lat4Ld], (instregex "(V?)CVT(T?)PD2DQrm")>; +// x,y. +def : InstRW<[WriteP1_P5_Lat6], (instregex "VCVT(T?)PD2DQYrr")>; +// x,m256. +def : InstRW<[WriteP1_P5_Lat6Ld], (instregex "VCVT(T?)PD2DQYrm")>; + +// CVT(T)PS2PI. +// mm,x. +def : InstRW<[WriteP1_P5_Lat4], (instregex "MMX_CVT(T?)PS2PIirr")>; + +// CVTPI2PD. +// x,mm. +def : InstRW<[WriteP1_P5_Lat4], (instregex "MMX_CVT(T?)PI2PDirr")>; + +// CVT(T)PD2PI. +// mm,x. +def : InstRW<[WriteP1_P5_Lat4], (instregex "MMX_CVT(T?)PD2PIirr")>; + +// CVSTSI2SS. +// x,r32. +def : InstRW<[WriteP1_P5_Lat4], (instregex "(Int_)?(V?)CVT(T?)SI2SS(64)?rr")>; + +// CVT(T)SS2SI. +// r32,x. +def : InstRW<[WriteP0_P1_Lat4], (instregex "(Int_)?(V?)CVT(T?)SS2SI(64)?rr")>; +// r32,m32. +def : InstRW<[WriteP0_P1_Lat4Ld], (instregex "(Int_)?(V?)CVT(T?)SS2SI(64)?rm")>; + +// CVTSI2SD. +// x,r32/64. +def : InstRW<[WriteP0_P1_Lat4], (instregex "(Int_)?(V?)CVTSI2SS(64)?rr")>; + +// CVTSD2SI. +// r32/64 +def : InstRW<[WriteP0_P1_Lat4], (instregex "(Int_)?(V?)CVT(T?)SD2SI(64)?rr")>; +// r32,m32. +def : InstRW<[WriteP0_P1_Lat4Ld], (instregex "(Int_)?(V?)CVT(T?)SD2SI(64)?rm")>; + +// VCVTPS2PH. +// x,v,i. +def : InstRW<[WriteP1_P5_Lat4], (instregex "VCVTPS2PH(Y?)rr")>; +// m,v,i. +def : InstRW<[WriteP1_P5_Lat4Ld, WriteRMW], (instregex "VCVTPS2PH(Y?)mr")>; + +// VCVTPH2PS. +// v,x. +def : InstRW<[WriteP1_P5_Lat4], (instregex "VCVTPH2PS(Y?)rr")>; + +//-- Arithmetic instructions --// + +// HADD, HSUB PS/PD +// x,x / v,v,v. +def WriteHADDSUBPr : SchedWriteRes<[HWPort1, HWPort5]> { + let Latency = 5; + let NumMicroOps = 3; + let ResourceCycles = [1, 2]; +} +def : InstRW<[WriteHADDSUBPr], (instregex "(V?)H(ADD|SUB)P(S|D)(Y?)rr")>; + +// x,m / v,v,m. +def WriteHADDSUBPm : SchedWriteRes<[HWPort1, HWPort5, HWPort23]> { + let Latency = 9; + let NumMicroOps = 4; + let ResourceCycles = [1, 2, 1]; +} +def : InstRW<[WriteHADDSUBPm], (instregex "(V?)H(ADD|SUB)P(S|D)(Y?)rm")>; + +// MULL SS/SD PS/PD. +// x,x / v,v,v. +def WriteMULr : SchedWriteRes<[HWPort01]> { + let Latency = 5; +} +def : InstRW<[WriteMULr], (instregex "(V?)MUL(P|S)(S|D)rr")>; + +// x,m / v,v,m. +def WriteMULm : SchedWriteRes<[HWPort01, HWPort23]> { + let Latency = 4; + let NumMicroOps = 2; + let ResourceCycles = [1, 1]; +} +def : InstRW<[WriteMULm], (instregex "(V?)MUL(P|S)(S|D)rm")>; + +// VDIVPS. +// y,y,y. +def WriteVDIVPSYrr : SchedWriteRes<[HWPort0, HWPort15]> { + let Latency = 19; // 18-21 cycles. + let NumMicroOps = 3; + let ResourceCycles = [2, 1]; +} +def : InstRW<[WriteVDIVPSYrr], (instregex "VDIVPSYrr")>; + +// y,y,m256. 
+def WriteVDIVPSYrm : SchedWriteRes<[HWPort0, HWPort15, HWPort23]> { + let Latency = 23; // 18-21 + 4 cycles. + let NumMicroOps = 4; + let ResourceCycles = [2, 1, 1]; +} +def : InstRW<[WriteVDIVPSYrm, ReadAfterLd], (instregex "VDIVPSYrm")>; + +// VDIVPD. +// y,y,y. +def WriteVDIVPDYrr : SchedWriteRes<[HWPort0, HWPort15]> { + let Latency = 27; // 19-35 cycles. + let NumMicroOps = 3; + let ResourceCycles = [2, 1]; +} +def : InstRW<[WriteVDIVPDYrr], (instregex "VDIVPDYrr")>; + +// y,y,m256. +def WriteVDIVPDYrm : SchedWriteRes<[HWPort0, HWPort15, HWPort23]> { + let Latency = 31; // 19-35 + 4 cycles. + let NumMicroOps = 4; + let ResourceCycles = [2, 1, 1]; +} +def : InstRW<[WriteVDIVPDYrm, ReadAfterLd], (instregex "VDIVPDYrm")>; + +// VRCPPS. +// y,y. +def WriteVRCPPSr : SchedWriteRes<[HWPort0, HWPort15]> { + let Latency = 7; + let NumMicroOps = 3; + let ResourceCycles = [2, 1]; +} +def : InstRW<[WriteVRCPPSr], (instregex "VRCPPSYr(_Int)?")>; + +// y,m256. +def WriteVRCPPSm : SchedWriteRes<[HWPort0, HWPort15, HWPort23]> { + let Latency = 11; + let NumMicroOps = 4; + let ResourceCycles = [2, 1, 1]; +} +def : InstRW<[WriteVRCPPSm], (instregex "VRCPPSYm(_Int)?")>; + +// ROUND SS/SD PS/PD. +// v,v,i. +def WriteROUNDr : SchedWriteRes<[HWPort1]> { + let Latency = 6; + let NumMicroOps = 2; + let ResourceCycles = [2]; +} +def : InstRW<[WriteROUNDr], (instregex "(V?)ROUND(Y?)(S|P)(S|D)r(_Int)?")>; + +// v,m,i. +def WriteROUNDm : SchedWriteRes<[HWPort1, HWPort23]> { + let Latency = 10; + let NumMicroOps = 3; + let ResourceCycles = [2, 1]; +} +def : InstRW<[WriteROUNDm], (instregex "(V?)ROUND(Y?)(S|P)(S|D)m(_Int)?")>; + +// DPPS. +// x,x,i / v,v,v,i. +def WriteDPPSr : SchedWriteRes<[HWPort0, HWPort1, HWPort5]> { + let Latency = 14; + let NumMicroOps = 4; + let ResourceCycles = [2, 1, 1]; +} +def : InstRW<[WriteDPPSr], (instregex "(V?)DPPS(Y?)rri")>; + +// x,m,i / v,v,m,i. +def WriteDPPSm : SchedWriteRes<[HWPort0, HWPort1, HWPort5, HWPort23, HWPort6]> { + let Latency = 18; + let NumMicroOps = 6; + let ResourceCycles = [2, 1, 1, 1, 1]; +} +def : InstRW<[WriteDPPSm, ReadAfterLd], (instregex "(V?)DPPS(Y?)rmi")>; + +// DPPD. +// x,x,i. +def WriteDPPDr : SchedWriteRes<[HWPort0, HWPort1, HWPort5]> { + let Latency = 9; + let NumMicroOps = 3; + let ResourceCycles = [1, 1, 1]; +} +def : InstRW<[WriteDPPDr], (instregex "(V?)DPPDrri")>; + +// x,m,i. +def WriteDPPDm : SchedWriteRes<[HWPort0, HWPort1, HWPort5, HWPort23]> { + let Latency = 13; + let NumMicroOps = 4; + let ResourceCycles = [1, 1, 1, 1]; +} +def : InstRW<[WriteDPPDm], (instregex "(V?)DPPDrmi")>; + +// VFMADD. +// v,v,v. +def WriteFMADDr : SchedWriteRes<[HWPort01]> { + let Latency = 5; + let NumMicroOps = 1; +} +def : InstRW<[WriteFMADDr], + (instregex + // 3p forms. + "VF(N?)M(ADD|SUB|ADDSUB|SUBADD)P(S|D)(r213|r132|r231)r(Y)?", + // 3s forms. + "VF(N?)M(ADD|SUB)S(S|D)(r132|231|213)r", + // 4s/4s_int forms. + "VF(N?)M(ADD|SUB)S(S|D)4rr(_REV|_Int)?", + // 4p forms. + "VF(N?)M(ADD|SUB)P(S|D)4rr(Y)?(_REV)?")>; + +// v,v,m. +def WriteFMADDm : SchedWriteRes<[HWPort01, HWPort23]> { + let Latency = 9; + let NumMicroOps = 2; + let ResourceCycles = [1, 1]; +} +def : InstRW<[WriteFMADDm], + (instregex + // 3p forms. + "VF(N?)M(ADD|SUB|ADDSUB|SUBADD)P(S|D)(r213|r132|r231)m(Y)?", + // 3s forms. + "VF(N?)M(ADD|SUB)S(S|D)(r132|231|213)m", + // 4s/4s_int forms. + "VF(N?)M(ADD|SUB)S(S|D)4(rm|mr)(_Int)?", + // 4p forms. + "VF(N?)M(ADD|SUB)P(S|D)4(rm|mr)(Y)?")>; + +//-- Math instructions --// + +// VSQRTPS. +// y,y. 
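Before the square-root entries that follow, a note on the VFMADD entries above: they dispatch to HWPort01 with a 5-cycle latency, which implies two FMA-capable ports. The back-of-the-envelope calculation below (ports x lanes x flops, and the number of independent accumulators needed to cover the latency) is derived here for illustration; only the port group and latency come from the table.

#include <cstdio>

int main() {
  const unsigned FMAPorts   = 2; // HWPort01 in the WriteFMADDr entry above
  const unsigned SPLanes    = 8; // 256-bit vectors, single precision
  const unsigned FlopsPerOp = 2; // multiply + add
  const unsigned FMALatency = 5; // from WriteFMADDr

  std::printf("peak: %u SP flops/cycle\n", FMAPorts * SPLanes * FlopsPerOp);
  // To reach that peak, at least Latency x Ports independent FMA chains
  // must be in flight to hide the 5-cycle latency.
  std::printf("need >= %u independent accumulators\n", FMALatency * FMAPorts);
}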
+def WriteVSQRTPSYr : SchedWriteRes<[HWPort0, HWPort15]> { + let Latency = 19; + let NumMicroOps = 3; + let ResourceCycles = [2, 1]; +} +def : InstRW<[WriteVSQRTPSYr], (instregex "VSQRTPSYr")>; + +// y,m256. +def WriteVSQRTPSYm : SchedWriteRes<[HWPort0, HWPort15, HWPort23]> { + let Latency = 23; + let NumMicroOps = 4; + let ResourceCycles = [2, 1, 1]; +} +def : InstRW<[WriteVSQRTPSYm], (instregex "VSQRTPSYm")>; + +// VSQRTPD. +// y,y. +def WriteVSQRTPDYr : SchedWriteRes<[HWPort0, HWPort15]> { + let Latency = 28; + let NumMicroOps = 3; + let ResourceCycles = [2, 1]; +} +def : InstRW<[WriteVSQRTPDYr], (instregex "VSQRTPDYr")>; + +// y,m256. +def WriteVSQRTPDYm : SchedWriteRes<[HWPort0, HWPort15, HWPort23]> { + let Latency = 32; + let NumMicroOps = 4; + let ResourceCycles = [2, 1, 1]; +} +def : InstRW<[WriteVSQRTPDYm], (instregex "VSQRTPDYm")>; + +// RSQRT SS/PS. +// x,x. +def WriteRSQRTr : SchedWriteRes<[HWPort0]> { + let Latency = 5; +} +def : InstRW<[WriteRSQRTr], (instregex "(V?)RSQRT(SS|PS)r(_Int)?")>; + +// x,m128. +def WriteRSQRTm : SchedWriteRes<[HWPort0, HWPort23]> { + let Latency = 9; + let NumMicroOps = 2; + let ResourceCycles = [1, 1]; +} +def : InstRW<[WriteRSQRTm], (instregex "(V?)RSQRT(SS|PS)m(_Int)?")>; + +// RSQRTPS 256. +// y,y. +def WriteRSQRTPSYr : SchedWriteRes<[HWPort0, HWPort15]> { + let Latency = 7; + let NumMicroOps = 3; + let ResourceCycles = [2, 1]; +} +def : InstRW<[WriteRSQRTPSYr], (instregex "VRSQRTPSYr(_Int)?")>; + +// y,m256. +def WriteRSQRTPSYm : SchedWriteRes<[HWPort0, HWPort15, HWPort23]> { + let Latency = 11; + let NumMicroOps = 4; + let ResourceCycles = [2, 1, 1]; +} +def : InstRW<[WriteRSQRTPSYm], (instregex "VRSQRTPSYm(_Int)?")>; + +//-- Logic instructions --// + +// AND, ANDN, OR, XOR PS/PD. +// x,x / v,v,v. +def : InstRW<[WriteP5], (instregex "(V?)(AND|ANDN|OR|XOR)P(S|D)(Y?)rr")>; +// x,m / v,v,m. +def : InstRW<[WriteP5Ld, ReadAfterLd], + (instregex "(V?)(AND|ANDN|OR|XOR)P(S|D)(Y?)rm")>; + +//-- Other instructions --// + +// VZEROUPPER. +def WriteVZEROUPPER : SchedWriteRes<[]> { + let NumMicroOps = 4; +} +def : InstRW<[WriteVZEROUPPER], (instregex "VZEROUPPER")>; + +// VZEROALL. +def WriteVZEROALL : SchedWriteRes<[]> { + let NumMicroOps = 12; +} +def : InstRW<[WriteVZEROALL], (instregex "VZEROALL")>; + +// LDMXCSR. +def WriteLDMXCSR : SchedWriteRes<[HWPort0, HWPort6, HWPort23]> { + let Latency = 6; + let NumMicroOps = 3; + let ResourceCycles = [1, 1, 1]; +} +def : InstRW<[WriteLDMXCSR], (instregex "(V)?LDMXCSR")>; + +// STMXCSR. +def WriteSTMXCSR : SchedWriteRes<[HWPort0, HWPort4, HWPort6, HWPort237]> { + let Latency = 7; + let NumMicroOps = 4; + let ResourceCycles = [1, 1, 1, 1]; +} +def : InstRW<[WriteSTMXCSR], (instregex "(V)?STMXCSR")>; + } // SchedModel diff --git a/lib/Target/X86/X86SchedSandyBridge.td b/lib/Target/X86/X86SchedSandyBridge.td index 83f0534..eca65c2 100644 --- a/lib/Target/X86/X86SchedSandyBridge.td +++ b/lib/Target/X86/X86SchedSandyBridge.td @@ -117,6 +117,7 @@ defm : SBWriteResPair<WriteFAdd, SBPort1, 3>; defm : SBWriteResPair<WriteFMul, SBPort0, 5>; defm : SBWriteResPair<WriteFDiv, SBPort0, 12>; // 10-14 cycles. 
defm : SBWriteResPair<WriteFRcp, SBPort0, 5>; +defm : SBWriteResPair<WriteFRsqrt, SBPort0, 5>; defm : SBWriteResPair<WriteFSqrt, SBPort0, 15>; defm : SBWriteResPair<WriteCvtF2I, SBPort1, 3>; defm : SBWriteResPair<WriteCvtI2F, SBPort1, 4>; diff --git a/lib/Target/X86/X86Schedule.td b/lib/Target/X86/X86Schedule.td index 25c5a6b..a261356 100644 --- a/lib/Target/X86/X86Schedule.td +++ b/lib/Target/X86/X86Schedule.td @@ -63,12 +63,13 @@ def WriteZero : SchedWrite; defm WriteJump : X86SchedWritePair; // Floating point. This covers both scalar and vector operations. -defm WriteFAdd : X86SchedWritePair; // Floating point add/sub/compare. -defm WriteFMul : X86SchedWritePair; // Floating point multiplication. -defm WriteFDiv : X86SchedWritePair; // Floating point division. -defm WriteFSqrt : X86SchedWritePair; // Floating point square root. -defm WriteFRcp : X86SchedWritePair; // Floating point reciprocal. -defm WriteFMA : X86SchedWritePair; // Fused Multiply Add. +defm WriteFAdd : X86SchedWritePair; // Floating point add/sub/compare. +defm WriteFMul : X86SchedWritePair; // Floating point multiplication. +defm WriteFDiv : X86SchedWritePair; // Floating point division. +defm WriteFSqrt : X86SchedWritePair; // Floating point square root. +defm WriteFRcp : X86SchedWritePair; // Floating point reciprocal estimate. +defm WriteFRsqrt : X86SchedWritePair; // Floating point reciprocal square root estimate. +defm WriteFMA : X86SchedWritePair; // Fused Multiply Add. defm WriteFShuffle : X86SchedWritePair; // Floating point vector shuffles. defm WriteFBlend : X86SchedWritePair; // Floating point vector blends. defm WriteFVarBlend : X86SchedWritePair; // Fp vector variable blends. @@ -314,6 +315,11 @@ def IIC_SSE_SQRTPD_RM : InstrItinClass; def IIC_SSE_SQRTSD_RR : InstrItinClass; def IIC_SSE_SQRTSD_RM : InstrItinClass; +def IIC_SSE_RSQRTPS_RR : InstrItinClass; +def IIC_SSE_RSQRTPS_RM : InstrItinClass; +def IIC_SSE_RSQRTSS_RR : InstrItinClass; +def IIC_SSE_RSQRTSS_RM : InstrItinClass; + def IIC_SSE_RCPP_RR : InstrItinClass; def IIC_SSE_RCPP_RM : InstrItinClass; def IIC_SSE_RCPS_RR : InstrItinClass; @@ -633,9 +639,12 @@ def GenericModel : SchedMachineModel { let MicroOpBufferSize = 32; let LoadLatency = 4; let HighLatency = 10; + let PostRAScheduler = 0; } include "X86ScheduleAtom.td" include "X86SchedSandyBridge.td" include "X86SchedHaswell.td" include "X86ScheduleSLM.td" +include "X86ScheduleBtVer2.td" + diff --git a/lib/Target/X86/X86ScheduleAtom.td b/lib/Target/X86/X86ScheduleAtom.td index 3256ee7..4c559c9 100644 --- a/lib/Target/X86/X86ScheduleAtom.td +++ b/lib/Target/X86/X86ScheduleAtom.td @@ -224,6 +224,11 @@ def AtomItineraries : ProcessorItineraries< InstrItinData<IIC_SSE_SQRTSD_RR, [InstrStage<62, [Port0, Port1]>] >, InstrItinData<IIC_SSE_SQRTSD_RM, [InstrStage<62, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_RSQRTPS_RR, [InstrStage<9, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_RSQRTPS_RM, [InstrStage<10, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_RSQRTSS_RR, [InstrStage<4, [Port0]>] >, + InstrItinData<IIC_SSE_RSQRTSS_RM, [InstrStage<4, [Port0]>] >, + InstrItinData<IIC_SSE_RCPP_RR, [InstrStage<9, [Port0, Port1]>] >, InstrItinData<IIC_SSE_RCPP_RM, [InstrStage<10, [Port0, Port1]>] >, InstrItinData<IIC_SSE_RCPS_RR, [InstrStage<4, [Port0]>] >, @@ -538,6 +543,7 @@ def AtomModel : SchedMachineModel { // On the Atom, the throughput for taken branches is 2 cycles. For small // simple loops, expand by a small factor to hide the backedge cost. 
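The comment above explains why small loops are expanded on Atom; the LoopMicroOpBufferSize value that follows feeds the partial-unrolling budget. The sketch below only illustrates the kind of heuristic such a knob enables (pick an unroll factor so the unrolled body roughly fills the buffer); it is not LLVM's actual unroller logic.

#include <algorithm>
#include <cstdio>

// Rough illustration: unroll until the body is about as large as the budget
// derived from LoopMicroOpBufferSize, so the backedge cost is amortized.
unsigned unrollFactor(unsigned LoopMicroOpBufferSize, unsigned BodyMicroOps) {
  if (BodyMicroOps == 0 || BodyMicroOps >= LoopMicroOpBufferSize)
    return 1; // already large enough to hide the backedge cost
  return std::max(1u, LoopMicroOpBufferSize / BodyMicroOps);
}

int main() {
  // The Atom model sets LoopMicroOpBufferSize = 10.
  for (unsigned Body : {2u, 3u, 5u, 12u})
    std::printf("body of %2u uops -> unroll x%u\n", Body,
                unrollFactor(10, Body));
}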
let LoopMicroOpBufferSize = 10; + let PostRAScheduler = 1; let Itineraries = AtomItineraries; } diff --git a/lib/Target/X86/X86ScheduleBtVer2.td b/lib/Target/X86/X86ScheduleBtVer2.td new file mode 100644 index 0000000..ce1ece3 --- /dev/null +++ b/lib/Target/X86/X86ScheduleBtVer2.td @@ -0,0 +1,341 @@ +//=- X86ScheduleBtVer2.td - X86 BtVer2 (Jaguar) Scheduling ---*- tablegen -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the machine model for AMD btver2 (Jaguar) to support +// instruction scheduling and other instruction cost heuristics. Based off AMD Software +// Optimization Guide for AMD Family 16h Processors & Instruction Latency appendix. +// +//===----------------------------------------------------------------------===// + +def BtVer2Model : SchedMachineModel { + // All x86 instructions are modeled as a single micro-op, and btver2 can + // decode 2 instructions per cycle. + let IssueWidth = 2; + let MicroOpBufferSize = 64; // Retire Control Unit + let LoadLatency = 5; // FPU latency (worse case cf Integer 3 cycle latency) + let HighLatency = 25; + let MispredictPenalty = 14; // Minimum branch misdirection penalty + let PostRAScheduler = 1; + + // FIXME: SSE4/AVX is unimplemented. This flag is set to allow + // the scheduler to assign a default model to unrecognized opcodes. + let CompleteModel = 0; +} + +let SchedModel = BtVer2Model in { + +// Jaguar can issue up to 6 micro-ops in one cycle +def JALU0 : ProcResource<1>; // Integer Pipe0: integer ALU0 (also handle FP->INT jam) +def JALU1 : ProcResource<1>; // Integer Pipe1: integer ALU1/MUL/DIV +def JLAGU : ProcResource<1>; // Integer Pipe2: LAGU +def JSAGU : ProcResource<1>; // Integer Pipe3: SAGU (also handles 3-operand LEA) +def JFPU0 : ProcResource<1>; // Vector/FPU Pipe0: VALU0/VIMUL/FPA +def JFPU1 : ProcResource<1>; // Vector/FPU Pipe1: VALU1/STC/FPM + +// Any pipe - FIXME we need this until we can discriminate between int/fpu load/store/moves properly +def JAny : ProcResGroup<[JALU0, JALU1, JLAGU, JSAGU, JFPU0, JFPU1]>; + +// Integer Pipe Scheduler +def JALU01 : ProcResGroup<[JALU0, JALU1]> { + let BufferSize=20; +} + +// AGU Pipe Scheduler +def JLSAGU : ProcResGroup<[JLAGU, JSAGU]> { + let BufferSize=12; +} + +// Fpu Pipe Scheduler +def JFPU01 : ProcResGroup<[JFPU0, JFPU1]> { + let BufferSize=18; +} + +def JDiv : ProcResource<1>; // integer division +def JMul : ProcResource<1>; // integer multiplication +def JVALU0 : ProcResource<1>; // vector integer +def JVALU1 : ProcResource<1>; // vector integer +def JVIMUL : ProcResource<1>; // vector integer multiplication +def JSTC : ProcResource<1>; // vector store/convert +def JFPM : ProcResource<1>; // FP multiplication +def JFPA : ProcResource<1>; // FP addition + +// Integer loads are 3 cycles, so ReadAfterLd registers needn't be available until 3 +// cycles after the memory operand. +def : ReadAdvance<ReadAfterLd, 3>; + +// Many SchedWrites are defined in pairs with and without a folded load. +// Instructions with folded loads are usually micro-fused, so they only appear +// as two micro-ops when dispatched by the schedulers. +// This multiclass defines the resource usage for variants with and without +// folded loads. 
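The comment above introduces the paired register/memory write classes defined next: each defm instantiation produces one WriteRes for the register form and one for the load-folded form, which also occupies JLAGU and adds the load latency (3 cycles on the integer side, 5 on the FP side, per the multiclass comments and the ReadAdvance above). The sketch below mirrors that expansion for two classes instantiated further down (WriteALU and WriteFMul); it is a reading aid, not generated code.

#include <cstdio>
#include <string>

struct WriteRes { std::string Ports; unsigned Latency; };

// Mirrors JWriteResIntPair / JWriteResFpuPair: the folded-load variant adds
// a JLAGU slot and 3 (integer) or 5 (FP) cycles on top of the register form.
void expandPair(const char *Name, const char *Port, unsigned Lat,
                unsigned LoadLat) {
  WriteRes Reg = {Port, Lat};
  WriteRes Mem = {std::string("JLAGU, ") + Port, Lat + LoadLat};
  std::printf("%-10s reg: [%s] lat %u | folded: [%s] lat %u\n", Name,
              Reg.Ports.c_str(), Reg.Latency, Mem.Ports.c_str(), Mem.Latency);
}

int main() {
  expandPair("WriteALU",  "JALU01", 1, 3); // integer pair
  expandPair("WriteFMul", "JFPU1",  2, 5); // FP pair
}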
+multiclass JWriteResIntPair<X86FoldableSchedWrite SchedRW, + ProcResourceKind ExePort, + int Lat> { + // Register variant is using a single cycle on ExePort. + def : WriteRes<SchedRW, [ExePort]> { let Latency = Lat; } + + // Memory variant also uses a cycle on JLAGU and adds 3 cycles to the + // latency. + def : WriteRes<SchedRW.Folded, [JLAGU, ExePort]> { + let Latency = !add(Lat, 3); + } +} + +multiclass JWriteResFpuPair<X86FoldableSchedWrite SchedRW, + ProcResourceKind ExePort, + int Lat> { + // Register variant is using a single cycle on ExePort. + def : WriteRes<SchedRW, [ExePort]> { let Latency = Lat; } + + // Memory variant also uses a cycle on JLAGU and adds 5 cycles to the + // latency. + def : WriteRes<SchedRW.Folded, [JLAGU, ExePort]> { + let Latency = !add(Lat, 5); + } +} + +// A folded store needs a cycle on the SAGU for the store data. +def : WriteRes<WriteRMW, [JSAGU]>; + +//////////////////////////////////////////////////////////////////////////////// +// Arithmetic. +//////////////////////////////////////////////////////////////////////////////// + +defm : JWriteResIntPair<WriteALU, JALU01, 1>; +defm : JWriteResIntPair<WriteIMul, JALU1, 3>; + +def : WriteRes<WriteIMulH, [JALU1]> { + let Latency = 6; + let ResourceCycles = [4]; +} + +// FIXME 8/16 bit divisions +def : WriteRes<WriteIDiv, [JALU1, JDiv]> { + let Latency = 25; + let ResourceCycles = [1, 25]; +} +def : WriteRes<WriteIDivLd, [JALU1, JLAGU, JDiv]> { + let Latency = 41; + let ResourceCycles = [1, 1, 25]; +} + +// This is for simple LEAs with one or two input operands. +// FIXME: SAGU 3-operand LEA +def : WriteRes<WriteLEA, [JALU01]>; + +//////////////////////////////////////////////////////////////////////////////// +// Integer shifts and rotates. +//////////////////////////////////////////////////////////////////////////////// + +defm : JWriteResIntPair<WriteShift, JALU01, 1>; + +//////////////////////////////////////////////////////////////////////////////// +// Loads, stores, and moves, not folded with other operations. +// FIXME: Split x86 and SSE load/store/moves +//////////////////////////////////////////////////////////////////////////////// + +def : WriteRes<WriteLoad, [JLAGU]> { let Latency = 5; } +def : WriteRes<WriteStore, [JSAGU]>; +def : WriteRes<WriteMove, [JAny]>; + +//////////////////////////////////////////////////////////////////////////////// +// Idioms that clear a register, like xorps %xmm0, %xmm0. +// These can often bypass execution ports completely. +//////////////////////////////////////////////////////////////////////////////// + +def : WriteRes<WriteZero, []>; + +//////////////////////////////////////////////////////////////////////////////// +// Branches don't produce values, so they have no latency, but they still +// consume resources. Indirect branches can fold loads. +//////////////////////////////////////////////////////////////////////////////// + +defm : JWriteResIntPair<WriteJump, JALU01, 1>; + +//////////////////////////////////////////////////////////////////////////////// +// Floating point. This covers both scalar and vector operations. +// FIXME: should we bother splitting JFPU pipe + unit stages for fast instructions? 
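The floating-point entries that follow model the Jaguar divider and square-root unit as long-occupancy resources: WriteFDiv has a latency of 19 cycles and keeps JFPM busy for 19 cycles, so even fully independent divides start only once every 19 cycles, whereas a pipelined operation like WriteFMul (occupancy 1) overlaps freely. The sketch below computes what that occupancy implies; the latency and occupancy values are the ones from the entries below, the formula is a standard simplification added here for illustration.

#include <cstdio>

// With a unit occupied for 'Occupancy' cycles per operation, N operations
// cannot finish faster than this regardless of independence; latency only
// adds to the tail of the last one.
unsigned totalCycles(unsigned N, unsigned Latency, unsigned Occupancy) {
  if (N == 0)
    return 0;
  return (N - 1) * Occupancy + Latency;
}

int main() {
  // WriteFDiv: latency 19, divider (JFPM) busy 19 cycles.
  // WriteFMul: latency 2, pipelined (occupancy 1).
  std::printf("8 independent divides   : %u cycles\n", totalCycles(8, 19, 19));
  std::printf("8 independent multiplies: %u cycles\n", totalCycles(8, 2, 1));
}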
+// FIXME: Double precision latencies +// FIXME: SS vs PS latencies +// FIXME: ymm latencies +//////////////////////////////////////////////////////////////////////////////// + +defm : JWriteResFpuPair<WriteFAdd, JFPU0, 3>; +defm : JWriteResFpuPair<WriteFMul, JFPU1, 2>; +defm : JWriteResFpuPair<WriteFRcp, JFPU1, 2>; +defm : JWriteResFpuPair<WriteFRsqrt, JFPU1, 2>; +defm : JWriteResFpuPair<WriteFShuffle, JFPU01, 1>; +defm : JWriteResFpuPair<WriteFBlend, JFPU01, 1>; +defm : JWriteResFpuPair<WriteFShuffle256, JFPU01, 1>; + +def : WriteRes<WriteFSqrt, [JFPU1, JLAGU, JFPM]> { + let Latency = 21; + let ResourceCycles = [1, 1, 21]; +} +def : WriteRes<WriteFSqrtLd, [JFPU1, JLAGU, JFPM]> { + let Latency = 26; + let ResourceCycles = [1, 1, 21]; +} + +def : WriteRes<WriteFDiv, [JFPU1, JLAGU, JFPM]> { + let Latency = 19; + let ResourceCycles = [1, 1, 19]; +} +def : WriteRes<WriteFDivLd, [JFPU1, JLAGU, JFPM]> { + let Latency = 24; + let ResourceCycles = [1, 1, 19]; +} + +// FIXME: integer pipes +defm : JWriteResFpuPair<WriteCvtF2I, JFPU1, 3>; // Float -> Integer. +defm : JWriteResFpuPair<WriteCvtI2F, JFPU1, 3>; // Integer -> Float. +defm : JWriteResFpuPair<WriteCvtF2F, JFPU1, 3>; // Float -> Float size conversion. + +def : WriteRes<WriteFVarBlend, [JFPU01]> { + let Latency = 2; + let ResourceCycles = [2]; +} +def : WriteRes<WriteFVarBlendLd, [JLAGU, JFPU01]> { + let Latency = 7; + let ResourceCycles = [1, 2]; +} + +// Vector integer operations. +defm : JWriteResFpuPair<WriteVecALU, JFPU01, 1>; +defm : JWriteResFpuPair<WriteVecShift, JFPU01, 1>; +defm : JWriteResFpuPair<WriteVecIMul, JFPU0, 2>; +defm : JWriteResFpuPair<WriteShuffle, JFPU01, 1>; +defm : JWriteResFpuPair<WriteBlend, JFPU01, 1>; +defm : JWriteResFpuPair<WriteVecLogic, JFPU01, 1>; +defm : JWriteResFpuPair<WriteShuffle256, JFPU01, 1>; + +def : WriteRes<WriteVarBlend, [JFPU01]> { + let Latency = 2; + let ResourceCycles = [2]; +} +def : WriteRes<WriteVarBlendLd, [JLAGU, JFPU01]> { + let Latency = 7; + let ResourceCycles = [1, 2]; +} + +// FIXME: why do we need to define AVX2 resource on CPU that doesn't have AVX2? +def : WriteRes<WriteVarVecShift, [JFPU01]> { + let Latency = 1; + let ResourceCycles = [1]; +} +def : WriteRes<WriteVarVecShiftLd, [JLAGU, JFPU01]> { + let Latency = 6; + let ResourceCycles = [1, 1]; +} + +def : WriteRes<WriteMPSAD, [JFPU0]> { + let Latency = 3; + let ResourceCycles = [2]; +} +def : WriteRes<WriteMPSADLd, [JLAGU, JFPU0]> { + let Latency = 8; + let ResourceCycles = [1, 2]; +} + +//////////////////////////////////////////////////////////////////////////////// +// String instructions. 
+// Packed Compare Implicit Length Strings, Return Mask +// FIXME: approximate latencies + pipe dependencies +//////////////////////////////////////////////////////////////////////////////// + +def : WriteRes<WritePCmpIStrM, [JFPU01]> { + let Latency = 7; + let ResourceCycles = [2]; +} +def : WriteRes<WritePCmpIStrMLd, [JLAGU, JFPU01]> { + let Latency = 12; + let ResourceCycles = [1, 2]; +} + +// Packed Compare Explicit Length Strings, Return Mask +def : WriteRes<WritePCmpEStrM, [JFPU01]> { + let Latency = 13; + let ResourceCycles = [5]; +} +def : WriteRes<WritePCmpEStrMLd, [JLAGU, JFPU01]> { + let Latency = 18; + let ResourceCycles = [1, 5]; +} + +// Packed Compare Implicit Length Strings, Return Index +def : WriteRes<WritePCmpIStrI, [JFPU01]> { + let Latency = 6; + let ResourceCycles = [2]; +} +def : WriteRes<WritePCmpIStrILd, [JLAGU, JFPU01]> { + let Latency = 11; + let ResourceCycles = [1, 2]; +} + +// Packed Compare Explicit Length Strings, Return Index +def : WriteRes<WritePCmpEStrI, [JFPU01]> { + let Latency = 13; + let ResourceCycles = [5]; +} +def : WriteRes<WritePCmpEStrILd, [JLAGU, JFPU01]> { + let Latency = 18; + let ResourceCycles = [1, 5]; +} + +//////////////////////////////////////////////////////////////////////////////// +// AES Instructions. +//////////////////////////////////////////////////////////////////////////////// + +def : WriteRes<WriteAESDecEnc, [JFPU01, JVIMUL]> { + let Latency = 3; + let ResourceCycles = [1, 1]; +} +def : WriteRes<WriteAESDecEncLd, [JFPU01, JLAGU, JVIMUL]> { + let Latency = 8; + let ResourceCycles = [1, 1, 1]; +} + +def : WriteRes<WriteAESIMC, [JVIMUL]> { + let Latency = 2; + let ResourceCycles = [1]; +} +def : WriteRes<WriteAESIMCLd, [JLAGU, JVIMUL]> { + let Latency = 7; + let ResourceCycles = [1, 1]; +} + +def : WriteRes<WriteAESKeyGen, [JVIMUL]> { + let Latency = 2; + let ResourceCycles = [1]; +} +def : WriteRes<WriteAESKeyGenLd, [JLAGU, JVIMUL]> { + let Latency = 7; + let ResourceCycles = [1, 1]; +} + +//////////////////////////////////////////////////////////////////////////////// +// Carry-less multiplication instructions. +//////////////////////////////////////////////////////////////////////////////// + +def : WriteRes<WriteCLMul, [JVIMUL]> { + let Latency = 2; + let ResourceCycles = [1]; +} +def : WriteRes<WriteCLMulLd, [JLAGU, JVIMUL]> { + let Latency = 7; + let ResourceCycles = [1, 1]; +} + +// FIXME: pipe for system/microcode? +def : WriteRes<WriteSystem, [JAny]> { let Latency = 100; } +def : WriteRes<WriteMicrocoded, [JAny]> { let Latency = 100; } +def : WriteRes<WriteFence, [JSAGU]>; +def : WriteRes<WriteNop, []>; +} // SchedModel + diff --git a/lib/Target/X86/X86ScheduleSLM.td b/lib/Target/X86/X86ScheduleSLM.td index 823d101..f95d4fa 100644 --- a/lib/Target/X86/X86ScheduleSLM.td +++ b/lib/Target/X86/X86ScheduleSLM.td @@ -19,6 +19,7 @@ def SLMModel : SchedMachineModel { let MicroOpBufferSize = 32; // Based on the reorder buffer. let LoadLatency = 3; let MispredictPenalty = 10; + let PostRAScheduler = 1; // For small loops, expand by a small factor to hide the backedge cost. let LoopMicroOpBufferSize = 10; @@ -100,6 +101,7 @@ def : WriteRes<WriteIDivLd, [MEC_RSV, IEC_RSV01, SMDivider]> { // Scalar and vector floating point. 
defm : SMWriteResPair<WriteFAdd, FPC_RSV1, 3>; defm : SMWriteResPair<WriteFRcp, FPC_RSV0, 5>; +defm : SMWriteResPair<WriteFRsqrt, FPC_RSV0, 5>; defm : SMWriteResPair<WriteFSqrt, FPC_RSV0, 15>; defm : SMWriteResPair<WriteCvtF2I, FPC_RSV01, 4>; defm : SMWriteResPair<WriteCvtI2F, FPC_RSV01, 4>; diff --git a/lib/Target/X86/X86SelectionDAGInfo.cpp b/lib/Target/X86/X86SelectionDAGInfo.cpp index a83dd9b..821044f 100644 --- a/lib/Target/X86/X86SelectionDAGInfo.cpp +++ b/lib/Target/X86/X86SelectionDAGInfo.cpp @@ -29,6 +29,26 @@ X86SelectionDAGInfo::X86SelectionDAGInfo(const DataLayout &DL) X86SelectionDAGInfo::~X86SelectionDAGInfo() {} +bool X86SelectionDAGInfo::isBaseRegConflictPossible( + SelectionDAG &DAG, ArrayRef<unsigned> ClobberSet) const { + // We cannot use TRI->hasBasePointer() until *after* we select all basic + // blocks. Legalization may introduce new stack temporaries with large + // alignment requirements. Fall back to generic code if there are any + // dynamic stack adjustments (hopefully rare) and the base pointer would + // conflict if we had to use it. + MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); + if (!MFI->hasVarSizedObjects() && !MFI->hasInlineAsmWithSPAdjust()) + return false; + + const X86RegisterInfo *TRI = static_cast<const X86RegisterInfo *>( + DAG.getSubtarget().getRegisterInfo()); + unsigned BaseReg = TRI->getBaseRegister(); + for (unsigned R : ClobberSet) + if (BaseReg == R) + return true; + return false; +} + SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl, SDValue Chain, @@ -39,6 +59,13 @@ X86SelectionDAGInfo::EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl, ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size); const X86Subtarget &Subtarget = DAG.getTarget().getSubtarget<X86Subtarget>(); +#ifndef NDEBUG + // If the base register might conflict with our physical registers, bail out. + unsigned ClobberSet[] = {X86::RCX, X86::RAX, X86::RDI, + X86::ECX, X86::EAX, X86::EDI}; + assert(!isBaseRegConflictPossible(DAG, ClobberSet)); +#endif + // If to a segment-relative address space, use the default lowering. if (DstPtrInfo.getAddrSpace() >= 256) return SDValue(); @@ -201,12 +228,10 @@ X86SelectionDAGInfo::EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl, SrcPtrInfo.getAddrSpace() >= 256) return SDValue(); - // ESI might be used as a base pointer, in that case we can't simply overwrite - // the register. Fall back to generic code. - const X86RegisterInfo *TRI = - static_cast<const X86RegisterInfo *>(DAG.getTarget().getRegisterInfo()); - if (TRI->hasBasePointer(DAG.getMachineFunction()) && - TRI->getBaseRegister() == X86::ESI) + // If the base register might conflict with our physical registers, bail out. 
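The memset path above asserts that no base-register conflict is possible, while the memcpy path whose clobber set appears just below bails out to the generic lowering when one is. The stand-alone sketch below restates the two-part check from isBaseRegConflictPossible (dynamic stack adjustments present, and the base register appearing in the clobber set); register numbers and the demo cases are hypothetical stand-ins, not X86 register encodings.

#include <algorithm>
#include <cstdio>
#include <initializer_list>

bool baseRegConflictPossible(bool HasVarSizedObjects,
                             bool HasInlineAsmWithSPAdjust, unsigned BaseReg,
                             std::initializer_list<unsigned> ClobberSet) {
  // No dynamic stack adjustment means no base pointer is needed at all.
  if (!HasVarSizedObjects && !HasInlineAsmWithSPAdjust)
    return false;
  return std::find(ClobberSet.begin(), ClobberSet.end(), BaseReg) !=
         ClobberSet.end();
}

// Hypothetical register numbers standing in for X86::ECX, X86::ESI, ...
enum Reg : unsigned { ECX = 1, ESI = 2, EDI = 3, EBX = 4 };

int main() {
  // rep movs clobbers ECX/ESI/EDI (the 32-bit half of the set below), so a
  // frame that uses ESI as its base pointer must fall back to generic code.
  std::printf("ESI base: %s\n",
              baseRegConflictPossible(true, false, ESI, {ECX, ESI, EDI})
                  ? "conflict -> generic lowering" : "ok");
  std::printf("EBX base: %s\n",
              baseRegConflictPossible(true, false, EBX, {ECX, ESI, EDI})
                  ? "conflict" : "ok -> rep movs is safe");
}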
+ unsigned ClobberSet[] = {X86::RCX, X86::RSI, X86::RDI, + X86::ECX, X86::ESI, X86::EDI}; + if (isBaseRegConflictPossible(DAG, ClobberSet)) return SDValue(); MVT AVT; diff --git a/lib/Target/X86/X86SelectionDAGInfo.h b/lib/Target/X86/X86SelectionDAGInfo.h index c12555a..eb7e0ed 100644 --- a/lib/Target/X86/X86SelectionDAGInfo.h +++ b/lib/Target/X86/X86SelectionDAGInfo.h @@ -11,8 +11,8 @@ // //===----------------------------------------------------------------------===// -#ifndef X86SELECTIONDAGINFO_H -#define X86SELECTIONDAGINFO_H +#ifndef LLVM_LIB_TARGET_X86_X86SELECTIONDAGINFO_H +#define LLVM_LIB_TARGET_X86_X86SELECTIONDAGINFO_H #include "llvm/Target/TargetSelectionDAGInfo.h" @@ -23,6 +23,11 @@ class X86TargetMachine; class X86Subtarget; class X86SelectionDAGInfo : public TargetSelectionDAGInfo { + /// Returns true if it is possible for the base register to conflict with the + /// given set of clobbers for a memory intrinsic. + bool isBaseRegConflictPossible(SelectionDAG &DAG, + ArrayRef<unsigned> ClobberSet) const; + public: explicit X86SelectionDAGInfo(const DataLayout &DL); ~X86SelectionDAGInfo(); diff --git a/lib/Target/X86/X86Subtarget.cpp b/lib/Target/X86/X86Subtarget.cpp index 79b7e68..9d877c9 100644 --- a/lib/Target/X86/X86Subtarget.cpp +++ b/lib/Target/X86/X86Subtarget.cpp @@ -13,6 +13,7 @@ #include "X86Subtarget.h" #include "X86InstrInfo.h" +#include "X86TargetMachine.h" #include "llvm/IR/Attributes.h" #include "llvm/IR/Function.h" #include "llvm/IR/GlobalValue.h" @@ -67,12 +68,7 @@ ClassifyGlobalReference(const GlobalValue *GV, const TargetMachine &TM) const { if (GV->hasDLLImportStorageClass()) return X86II::MO_DLLIMPORT; - // Determine whether this is a reference to a definition or a declaration. - // Materializable GVs (in JIT lazy compilation mode) do not require an extra - // load from stub. - bool isDecl = GV->hasAvailableExternallyLinkage(); - if (GV->isDeclaration() && !GV->isMaterializable()) - isDecl = true; + bool isDecl = GV->isDeclarationForLinker(); // X86-64 in PIC mode. if (isPICStyleRIPRel()) { @@ -182,23 +178,7 @@ bool X86Subtarget::IsLegalToCallImmediateAddr(const TargetMachine &TM) const { return isTargetELF() || TM.getRelocationModel() == Reloc::Static; } -void X86Subtarget::resetSubtargetFeatures(const MachineFunction *MF) { - AttributeSet FnAttrs = MF->getFunction()->getAttributes(); - Attribute CPUAttr = - FnAttrs.getAttribute(AttributeSet::FunctionIndex, "target-cpu"); - Attribute FSAttr = - FnAttrs.getAttribute(AttributeSet::FunctionIndex, "target-features"); - std::string CPU = - !CPUAttr.hasAttribute(Attribute::None) ? CPUAttr.getValueAsString() : ""; - std::string FS = - !FSAttr.hasAttribute(Attribute::None) ? FSAttr.getValueAsString() : ""; - if (!FS.empty()) { - initializeEnvironment(); - resetSubtargetFeatures(CPU, FS); - } -} - -void X86Subtarget::resetSubtargetFeatures(StringRef CPU, StringRef FS) { +void X86Subtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) { std::string CPUName = CPU; if (CPUName.empty()) CPUName = "generic"; @@ -219,9 +199,6 @@ void X86Subtarget::resetSubtargetFeatures(StringRef CPU, StringRef FS) { // Make sure the right MCSchedModel is used. 
InitCPUSchedModel(CPUName); - if (X86ProcFamily == IntelAtom || X86ProcFamily == IntelSLM) - PostRAScheduler = true; - InstrItins = getInstrItineraryForCPU(CPUName); // It's important to keep the MCSubtargetInfo feature bits in sync with @@ -275,10 +252,15 @@ void X86Subtarget::initializeEnvironment() { HasERI = false; HasCDI = false; HasPFI = false; + HasDQI = false; + HasBWI = false; + HasVLX = false; HasADX = false; HasSHA = false; + HasSGX = false; HasPRFCHW = false; HasRDSEED = false; + HasSMAP = false; IsBTMemSlow = false; IsSHLDSlow = false; IsUAMemFast = false; @@ -286,48 +268,51 @@ void X86Subtarget::initializeEnvironment() { HasCmpxchg16b = false; UseLeaForSP = false; HasSlowDivide = false; - PostRAScheduler = false; PadShortFunctions = false; CallRegIndirect = false; LEAUsesAG = false; SlowLEA = false; SlowIncDec = false; + UseSqrtEst = false; + UseReciprocalEst = false; stackAlignment = 4; // FIXME: this is a known good value for Yonah. How about others? MaxInlineSizeThreshold = 128; } -static std::string computeDataLayout(const X86Subtarget &ST) { +static std::string computeDataLayout(const Triple &TT) { // X86 is little endian std::string Ret = "e"; - Ret += DataLayout::getManglingComponent(ST.getTargetTriple()); + Ret += DataLayout::getManglingComponent(TT); // X86 and x32 have 32 bit pointers. - if (ST.isTarget64BitILP32() || !ST.is64Bit()) + if ((TT.isArch64Bit() && + (TT.getEnvironment() == Triple::GNUX32 || TT.isOSNaCl())) || + !TT.isArch64Bit()) Ret += "-p:32:32"; // Some ABIs align 64 bit integers and doubles to 64 bits, others to 32. - if (ST.is64Bit() || ST.isOSWindows() || ST.isTargetNaCl()) + if (TT.isArch64Bit() || TT.isOSWindows() || TT.isOSNaCl()) Ret += "-i64:64"; else Ret += "-f64:32:64"; // Some ABIs align long double to 128 bits, others to 32. - if (ST.isTargetNaCl()) + if (TT.isOSNaCl()) ; // No f80 - else if (ST.is64Bit() || ST.isTargetDarwin()) + else if (TT.isArch64Bit() || TT.isOSDarwin()) Ret += "-f80:128"; else Ret += "-f80:32"; // The registers can hold 8, 16, 32 or, in x86-64, 64 bits. - if (ST.is64Bit()) + if (TT.isArch64Bit()) Ret += "-n8:16:32:64"; else Ret += "-n8:16:32"; // The stack is aligned to 32 bits on some ABIs and 128 bits on others. - if (!ST.is64Bit() && ST.isOSWindows()) + if (!TT.isArch64Bit() && TT.isOSWindows()) Ret += "-S32"; else Ret += "-S128"; @@ -338,37 +323,47 @@ static std::string computeDataLayout(const X86Subtarget &ST) { X86Subtarget &X86Subtarget::initializeSubtargetDependencies(StringRef CPU, StringRef FS) { initializeEnvironment(); - resetSubtargetFeatures(CPU, FS); + initSubtargetFeatures(CPU, FS); return *this; } X86Subtarget::X86Subtarget(const std::string &TT, const std::string &CPU, - const std::string &FS, X86TargetMachine &TM, + const std::string &FS, const X86TargetMachine &TM, unsigned StackAlignOverride) : X86GenSubtargetInfo(TT, CPU, FS), X86ProcFamily(Others), PICStyle(PICStyles::None), TargetTriple(TT), + DL(computeDataLayout(TargetTriple)), StackAlignOverride(StackAlignOverride), In64BitMode(TargetTriple.getArch() == Triple::x86_64), In32BitMode(TargetTriple.getArch() == Triple::x86 && TargetTriple.getEnvironment() != Triple::CODE16), In16BitMode(TargetTriple.getArch() == Triple::x86 && TargetTriple.getEnvironment() == Triple::CODE16), - DL(computeDataLayout(*this)), TSInfo(DL), - InstrInfo(initializeSubtargetDependencies(CPU, FS)), TLInfo(TM), - FrameLowering(TargetFrameLowering::StackGrowsDown, getStackAlignment(), - is64Bit() ? 
-8 : -4), - JITInfo(hasSSE1()) {} - -bool -X86Subtarget::enablePostRAScheduler(CodeGenOpt::Level OptLevel, - TargetSubtargetInfo::AntiDepBreakMode &Mode, - RegClassVector &CriticalPathRCs) const { - Mode = TargetSubtargetInfo::ANTIDEP_CRITICAL; - CriticalPathRCs.clear(); - return PostRAScheduler && OptLevel >= CodeGenOpt::Default; + TSInfo(DL), InstrInfo(initializeSubtargetDependencies(CPU, FS)), + TLInfo(TM), FrameLowering(TargetFrameLowering::StackGrowsDown, + getStackAlignment(), is64Bit() ? -8 : -4) { + // Determine the PICStyle based on the target selected. + if (TM.getRelocationModel() == Reloc::Static) { + // Unless we're in PIC or DynamicNoPIC mode, set the PIC style to None. + setPICStyle(PICStyles::None); + } else if (is64Bit()) { + // PIC in 64 bit mode is always rip-rel. + setPICStyle(PICStyles::RIPRel); + } else if (isTargetCOFF()) { + setPICStyle(PICStyles::None); + } else if (isTargetDarwin()) { + if (TM.getRelocationModel() == Reloc::PIC_) + setPICStyle(PICStyles::StubPIC); + else { + assert(TM.getRelocationModel() == Reloc::DynamicNoPIC); + setPICStyle(PICStyles::StubDynamicNoPIC); + } + } else if (isTargetELF()) { + setPICStyle(PICStyles::GOT); + } } -bool -X86Subtarget::enableEarlyIfConversion() const { +bool X86Subtarget::enableEarlyIfConversion() const { return hasCMov() && X86EarlyIfConv; } + diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h index 09db0eb..091b6c4 100644 --- a/lib/Target/X86/X86Subtarget.h +++ b/lib/Target/X86/X86Subtarget.h @@ -11,13 +11,12 @@ // //===----------------------------------------------------------------------===// -#ifndef X86SUBTARGET_H -#define X86SUBTARGET_H +#ifndef LLVM_LIB_TARGET_X86_X86SUBTARGET_H +#define LLVM_LIB_TARGET_X86_X86SUBTARGET_H #include "X86FrameLowering.h" #include "X86ISelLowering.h" #include "X86InstrInfo.h" -#include "X86JITInfo.h" #include "X86SelectionDAGInfo.h" #include "llvm/ADT/Triple.h" #include "llvm/IR/CallingConv.h" @@ -139,12 +138,18 @@ protected: /// HasSHA - Processor has SHA instructions. bool HasSHA; + /// HasSGX - Processor has SGX instructions. + bool HasSGX; + /// HasPRFCHW - Processor has PRFCHW instructions. bool HasPRFCHW; /// HasRDSEED - Processor has RDSEED instructions. bool HasRDSEED; + /// HasSMAP - Processor has SMAP instructions. + bool HasSMAP; + /// IsBTMemSlow - True if BT (bit test) of memory instructions are slow. bool IsBTMemSlow; @@ -170,9 +175,6 @@ protected: /// full divides and should be used when possible. bool HasSlowDivide; - /// PostRAScheduler - True if using post-register-allocation scheduler. - bool PostRAScheduler; - /// PadShortFunctions - True if the short functions should be padded to prevent /// a stall when returning too early. bool PadShortFunctions; @@ -190,15 +192,34 @@ protected: /// SlowIncDec - True if INC and DEC instructions are slow when writing to flags bool SlowIncDec; + /// Use the RSQRT* instructions to optimize square root calculations. + /// For this to be profitable, the cost of FSQRT and FDIV must be + /// substantially higher than normal FP ops like FADD and FMUL. + bool UseSqrtEst; + + /// Use the RCP* instructions to optimize FP division calculations. + /// For this to be profitable, the cost of FDIV must be + /// substantially higher than normal FP ops like FADD and FMUL. 
+ bool UseReciprocalEst; + /// Processor has AVX-512 PreFetch Instructions bool HasPFI; - + /// Processor has AVX-512 Exponential and Reciprocal Instructions bool HasERI; - + /// Processor has AVX-512 Conflict Detection Instructions bool HasCDI; - + + /// Processor has AVX-512 Doubleword and Quadword instructions + bool HasDQI; + + /// Processor has AVX-512 Byte and Word instructions + bool HasBWI; + + /// Processor has AVX-512 Vector Length eXtenstions + bool HasVLX; + /// stackAlignment - The minimum alignment known to hold of the stack frame on /// entry to the function and which must be maintained by every function. unsigned stackAlignment; @@ -214,6 +235,9 @@ protected: InstrItineraryData InstrItins; private: + // Calculates type size & alignment + const DataLayout DL; + /// StackAlignOverride - Override the stack alignment. unsigned StackAlignOverride; @@ -226,30 +250,35 @@ private: /// In16BitMode - True if compiling for 16-bit, false for 32-bit or 64-bit. bool In16BitMode; - // Calculates type size & alignment - const DataLayout DL; X86SelectionDAGInfo TSInfo; // Ordering here is important. X86InstrInfo initializes X86RegisterInfo which // X86TargetLowering needs. X86InstrInfo InstrInfo; X86TargetLowering TLInfo; X86FrameLowering FrameLowering; - X86JITInfo JITInfo; public: /// This constructor initializes the data members to match that /// of the specified triple. /// X86Subtarget(const std::string &TT, const std::string &CPU, - const std::string &FS, X86TargetMachine &TM, + const std::string &FS, const X86TargetMachine &TM, unsigned StackAlignOverride); - const X86TargetLowering *getTargetLowering() const { return &TLInfo; } - const X86InstrInfo *getInstrInfo() const { return &InstrInfo; } - const DataLayout *getDataLayout() const { return &DL; } - const X86FrameLowering *getFrameLowering() const { return &FrameLowering; } - const X86SelectionDAGInfo *getSelectionDAGInfo() const { return &TSInfo; } - X86JITInfo *getJITInfo() { return &JITInfo; } + const X86TargetLowering *getTargetLowering() const override { + return &TLInfo; + } + const X86InstrInfo *getInstrInfo() const override { return &InstrInfo; } + const DataLayout *getDataLayout() const override { return &DL; } + const X86FrameLowering *getFrameLowering() const override { + return &FrameLowering; + } + const X86SelectionDAGInfo *getSelectionDAGInfo() const override { + return &TSInfo; + } + const X86RegisterInfo *getRegisterInfo() const override { + return &getInstrInfo()->getRegisterInfo(); + } /// getStackAlignment - Returns the minimum alignment known to hold of the /// stack frame on entry to the function and which must be maintained by every @@ -264,14 +293,12 @@ public: /// subtarget options. Definition of function is auto generated by tblgen. void ParseSubtargetFeatures(StringRef CPU, StringRef FS); - /// \brief Reset the features for the X86 target. - void resetSubtargetFeatures(const MachineFunction *MF) override; private: /// \brief Initialize the full set of dependencies so we can use an initializer /// list for X86Subtarget. X86Subtarget &initializeSubtargetDependencies(StringRef CPU, StringRef FS); void initializeEnvironment(); - void resetSubtargetFeatures(StringRef CPU, StringRef FS); + void initSubtargetFeatures(StringRef CPU, StringRef FS); public: /// Is this x86_64? (disregarding specific ABI / programming model) bool is64Bit() const { @@ -294,7 +321,8 @@ public: /// Is this x86_64 with the LP64 programming model (standard AMD64, no x32)? 
bool isTarget64BitLP64() const { - return In64BitMode && (TargetTriple.getEnvironment() != Triple::GNUX32); + return In64BitMode && (TargetTriple.getEnvironment() != Triple::GNUX32 && + TargetTriple.getOS() != Triple::NaCl); } PICStyles::Style getPICStyle() const { return PICStyle; } @@ -335,8 +363,10 @@ public: bool hasHLE() const { return HasHLE; } bool hasADX() const { return HasADX; } bool hasSHA() const { return HasSHA; } + bool hasSGX() const { return HasSGX; } bool hasPRFCHW() const { return HasPRFCHW; } bool hasRDSEED() const { return HasRDSEED; } + bool hasSMAP() const { return HasSMAP; } bool isBTMemSlow() const { return IsBTMemSlow; } bool isSHLDSlow() const { return IsSHLDSlow; } bool isUnalignedMemAccessFast() const { return IsUAMemFast; } @@ -349,9 +379,14 @@ public: bool LEAusesAG() const { return LEAUsesAG; } bool slowLEA() const { return SlowLEA; } bool slowIncDec() const { return SlowIncDec; } + bool useSqrtEst() const { return UseSqrtEst; } + bool useReciprocalEst() const { return UseReciprocalEst; } bool hasCDI() const { return HasCDI; } bool hasPFI() const { return HasPFI; } bool hasERI() const { return HasERI; } + bool hasDQI() const { return HasDQI; } + bool hasBWI() const { return HasBWI; } + bool hasVLX() const { return HasVLX; } bool isAtom() const { return X86ProcFamily == IntelAtom; } bool isSLM() const { return X86ProcFamily == IntelSLM; } @@ -391,6 +426,10 @@ public: return TargetTriple.isWindowsGNUEnvironment(); } + bool isTargetWindowsItanium() const { + return TargetTriple.isWindowsItaniumEnvironment(); + } + bool isTargetCygMing() const { return TargetTriple.isOSCygMing(); } bool isOSWindows() const { return TargetTriple.isOSWindows(); } @@ -453,18 +492,17 @@ public: /// Enable the MachineScheduler pass for all X86 subtargets. bool enableMachineScheduler() const override { return true; } - /// enablePostRAScheduler - run for Atom optimization. - bool enablePostRAScheduler(CodeGenOpt::Level OptLevel, - TargetSubtargetInfo::AntiDepBreakMode& Mode, - RegClassVector& CriticalPathRCs) const override; - - bool postRAScheduler() const { return PostRAScheduler; } - bool enableEarlyIfConversion() const override; /// getInstrItins = Return the instruction itineraries based on the /// subtarget selection. 
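The UseSqrtEst and UseReciprocalEst flags and accessors above gate replacing full-precision FSQRT/FDIV sequences with the RSQRT*/RCP* estimate instructions plus a refinement step; the hardware estimates are only accurate to roughly 12 bits, so at least one Newton-Raphson iteration is needed to recover useful accuracy. The scalar sketch below shows the refinement formulas only; plain arithmetic stands in for the SSE estimate instructions, and the code is illustrative rather than lifted from the backend.

#include <cmath>
#include <cstdio>

// One Newton-Raphson step for y ~= 1/x from an estimate y0:
//   y1 = y0 * (2 - x * y0)
float refineRecip(float x, float y0) { return y0 * (2.0f - x * y0); }

// One Newton-Raphson step for y ~= 1/sqrt(x) from an estimate y0:
//   y1 = y0 * (1.5 - 0.5 * x * y0 * y0)
float refineRsqrt(float x, float y0) { return y0 * (1.5f - 0.5f * x * y0 * y0); }

int main() {
  float x = 7.0f;
  // Stand-ins for RCPSS/RSQRTSS: deliberately crude initial estimates.
  float recip0 = 0.14f, rsqrt0 = 0.38f;
  std::printf("1/x      : est %.6f, refined %.6f, exact %.6f\n", recip0,
              refineRecip(x, recip0), 1.0f / x);
  std::printf("1/sqrt(x): est %.6f, refined %.6f, exact %.6f\n", rsqrt0,
              refineRsqrt(x, rsqrt0), 1.0f / std::sqrt(x));
}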
-  const InstrItineraryData &getInstrItineraryData() const { return InstrItins; }
+  const InstrItineraryData *getInstrItineraryData() const override {
+    return &InstrItins;
+  }
+
+  AntiDepBreakMode getAntiDepBreakMode() const override {
+    return TargetSubtargetInfo::ANTIDEP_CRITICAL;
+  }
 };
 
 } // End llvm namespace
diff --git a/lib/Target/X86/X86TargetMachine.cpp b/lib/Target/X86/X86TargetMachine.cpp
index f12140f..8802feb 100644
--- a/lib/Target/X86/X86TargetMachine.cpp
+++ b/lib/Target/X86/X86TargetMachine.cpp
@@ -13,7 +13,9 @@
 #include "X86TargetMachine.h"
 #include "X86.h"
+#include "X86TargetObjectFile.h"
 #include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/Function.h"
 #include "llvm/PassManager.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/FormattedStream.h"
@@ -27,7 +29,23 @@ extern "C" void LLVMInitializeX86Target() {
   RegisterTargetMachine<X86TargetMachine> Y(TheX86_64Target);
 }
 
-void X86TargetMachine::anchor() { }
+static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
+  if (TT.isOSBinFormatMachO()) {
+    if (TT.getArch() == Triple::x86_64)
+      return make_unique<X86_64MachoTargetObjectFile>();
+    return make_unique<TargetLoweringObjectFileMachO>();
+  }
+
+  if (TT.isOSLinux())
+    return make_unique<X86LinuxTargetObjectFile>();
+  if (TT.isOSBinFormatELF())
+    return make_unique<TargetLoweringObjectFileELF>();
+  if (TT.isKnownWindowsMSVCEnvironment())
+    return make_unique<X86WindowsTargetObjectFile>();
+  if (TT.isOSBinFormatCOFF())
+    return make_unique<TargetLoweringObjectFileCOFF>();
+  llvm_unreachable("unknown subtarget type");
+}
 
 /// X86TargetMachine ctor - Create an X86 target.
 ///
@@ -36,27 +54,8 @@ X86TargetMachine::X86TargetMachine(const Target &T, StringRef TT, StringRef CPU,
                                    StringRef FS, const TargetOptions &Options,
                                    Reloc::Model RM, CodeModel::Model CM,
                                    CodeGenOpt::Level OL)
     : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL),
+      TLOF(createTLOF(Triple(getTargetTriple()))),
       Subtarget(TT, CPU, FS, *this, Options.StackAlignmentOverride) {
-  // Determine the PICStyle based on the target selected.
-  if (getRelocationModel() == Reloc::Static) {
-    // Unless we're in PIC or DynamicNoPIC mode, set the PIC style to None.
-    Subtarget.setPICStyle(PICStyles::None);
-  } else if (Subtarget.is64Bit()) {
-    // PIC in 64 bit mode is always rip-rel.
-    Subtarget.setPICStyle(PICStyles::RIPRel);
-  } else if (Subtarget.isTargetCOFF()) {
-    Subtarget.setPICStyle(PICStyles::None);
-  } else if (Subtarget.isTargetDarwin()) {
-    if (getRelocationModel() == Reloc::PIC_)
-      Subtarget.setPICStyle(PICStyles::StubPIC);
-    else {
-      assert(getRelocationModel() == Reloc::DynamicNoPIC);
-      Subtarget.setPICStyle(PICStyles::StubDynamicNoPIC);
-    }
-  } else if (Subtarget.isTargetELF()) {
-    Subtarget.setPICStyle(PICStyles::GOT);
-  }
-
   // default to hard float ABI
   if (Options.FloatABIType == FloatABI::Default)
     this->Options.FloatABIType = FloatABI::Hard;
@@ -71,6 +70,47 @@ X86TargetMachine::X86TargetMachine(const Target &T, StringRef TT, StringRef CPU,
   initAsmInfo();
 }
 
+X86TargetMachine::~X86TargetMachine() {}
+
+const X86Subtarget *
+X86TargetMachine::getSubtargetImpl(const Function &F) const {
+  AttributeSet FnAttrs = F.getAttributes();
+  Attribute CPUAttr =
+      FnAttrs.getAttribute(AttributeSet::FunctionIndex, "target-cpu");
+  Attribute FSAttr =
+      FnAttrs.getAttribute(AttributeSet::FunctionIndex, "target-features");
+
+  std::string CPU = !CPUAttr.hasAttribute(Attribute::None)
+                        ? CPUAttr.getValueAsString().str()
+                        : TargetCPU;
+  std::string FS = !FSAttr.hasAttribute(Attribute::None)
+                       ? FSAttr.getValueAsString().str()
+                       : TargetFS;
+
+  // FIXME: This is related to the code below to reset the target options,
+  // we need to know whether or not the soft float flag is set on the
+  // function before we can generate a subtarget. We also need to use
+  // it as a key for the subtarget since that can be the only difference
+  // between two functions.
+  Attribute SFAttr =
+      FnAttrs.getAttribute(AttributeSet::FunctionIndex, "use-soft-float");
+  bool SoftFloat = !SFAttr.hasAttribute(Attribute::None)
+                       ? SFAttr.getValueAsString() == "true"
+                       : Options.UseSoftFloat;
+
+  auto &I = SubtargetMap[CPU + FS + (SoftFloat ? "use-soft-float=true"
+                                               : "use-soft-float=false")];
+  if (!I) {
+    // This needs to be done before we create a new subtarget since any
+    // creation will depend on the TM and the code generation flags on the
+    // function that reside in TargetOptions.
+    resetTargetOptions(F);
+    I = llvm::make_unique<X86Subtarget>(TargetTriple, CPU, FS, *this,
+                                        Options.StackAlignmentOverride);
+  }
+  return I.get();
+}
+
 //===----------------------------------------------------------------------===//
 // Command line options for x86
 //===----------------------------------------------------------------------===//
@@ -125,7 +165,7 @@ TargetPassConfig *X86TargetMachine::createPassConfig(PassManagerBase &PM) {
 }
 
 void X86PassConfig::addIRPasses() {
-  addPass(createX86AtomicExpandPass(&getX86TargetMachine()));
+  addPass(createAtomicExpandPass(&getX86TargetMachine()));
 
   TargetPassConfig::addIRPasses();
 }
@@ -177,10 +217,3 @@ bool X86PassConfig::addPreEmitPass() {
 
   return ShouldPrint;
 }
-
-bool X86TargetMachine::addCodeEmitter(PassManagerBase &PM,
-                                      JITCodeEmitter &JCE) {
-  PM.add(createX86JITCodeEmitterPass(*this, JCE));
-
-  return false;
-}
diff --git a/lib/Target/X86/X86TargetMachine.h b/lib/Target/X86/X86TargetMachine.h
index 41d5157..916278c 100644
--- a/lib/Target/X86/X86TargetMachine.h
+++ b/lib/Target/X86/X86TargetMachine.h
@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef X86TARGETMACHINE_H
-#define X86TARGETMACHINE_H
+#ifndef LLVM_LIB_TARGET_X86_X86TARGETMACHINE_H
+#define LLVM_LIB_TARGET_X86_X86TARGETMACHINE_H
 
 #include "X86InstrInfo.h"
 #include "X86Subtarget.h"
 #include "llvm/IR/DataLayout.h"
@@ -23,46 +23,29 @@ namespace llvm {
 
 class StringRef;
 
 class X86TargetMachine final : public LLVMTargetMachine {
-  virtual void anchor();
+  std::unique_ptr<TargetLoweringObjectFile> TLOF;
   X86Subtarget Subtarget;
 
+  mutable StringMap<std::unique_ptr<X86Subtarget>> SubtargetMap;
+
 public:
   X86TargetMachine(const Target &T, StringRef TT,
                    StringRef CPU, StringRef FS, const TargetOptions &Options,
                    Reloc::Model RM, CodeModel::Model CM,
                    CodeGenOpt::Level OL);
+  ~X86TargetMachine() override;
 
-  const DataLayout *getDataLayout() const override {
-    return getSubtargetImpl()->getDataLayout();
-  }
-  const X86InstrInfo *getInstrInfo() const override {
-    return getSubtargetImpl()->getInstrInfo();
-  }
-  const TargetFrameLowering *getFrameLowering() const override {
-    return getSubtargetImpl()->getFrameLowering();
-  }
-  X86JITInfo *getJITInfo() override { return Subtarget.getJITInfo(); }
   const X86Subtarget *getSubtargetImpl() const override { return &Subtarget; }
-  const X86TargetLowering *getTargetLowering() const override {
-    return getSubtargetImpl()->getTargetLowering();
-  }
-  const X86SelectionDAGInfo *getSelectionDAGInfo() const override {
-    return getSubtargetImpl()->getSelectionDAGInfo();
-  }
-  const X86RegisterInfo *getRegisterInfo() const override {
-    return &getInstrInfo()->getRegisterInfo();
-  }
-  const InstrItineraryData *getInstrItineraryData() const override {
-    return &getSubtargetImpl()->getInstrItineraryData();
-  }
+  const X86Subtarget *getSubtargetImpl(const Function &F) const override;
 
   /// \brief Register X86 analysis passes with a pass manager.
   void addAnalysisPasses(PassManagerBase &PM) override;
 
   // Set up the pass pipeline.
   TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
-
-  bool addCodeEmitter(PassManagerBase &PM, JITCodeEmitter &JCE) override;
+  TargetLoweringObjectFile *getObjFileLowering() const override {
+    return TLOF.get();
+  }
 };
 
 } // End llvm namespace
diff --git a/lib/Target/X86/X86TargetObjectFile.cpp b/lib/Target/X86/X86TargetObjectFile.cpp
index 8157085..f8bcd61 100644
--- a/lib/Target/X86/X86TargetObjectFile.cpp
+++ b/lib/Target/X86/X86TargetObjectFile.cpp
@@ -8,10 +8,12 @@
 //===----------------------------------------------------------------------===//
 
 #include "X86TargetObjectFile.h"
+#include "llvm/ADT/StringExtras.h"
 #include "llvm/IR/Mangler.h"
 #include "llvm/IR/Operator.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCSectionCOFF.h"
 #include "llvm/MC/MCSectionELF.h"
 #include "llvm/Support/Dwarf.h"
 #include "llvm/Target/TargetLowering.h"
@@ -106,3 +108,64 @@ const MCExpr *X86WindowsTargetObjectFile::getExecutableRelativeSymbol(
                                  MCSymbolRefExpr::VK_COFF_IMGREL32, getContext());
 }
+
+static std::string APIntToHexString(const APInt &AI) {
+  unsigned Width = (AI.getBitWidth() / 8) * 2;
+  std::string HexString = utohexstr(AI.getLimitedValue(), /*LowerCase=*/true);
+  unsigned Size = HexString.size();
+  assert(Width >= Size && "hex string is too large!");
+  HexString.insert(HexString.begin(), Width - Size, '0');
+
+  return HexString;
+}
+
+
+static std::string scalarConstantToHexString(const Constant *C) {
+  Type *Ty = C->getType();
+  APInt AI;
+  if (isa<UndefValue>(C)) {
+    AI = APInt(Ty->getPrimitiveSizeInBits(), /*val=*/0);
+  } else if (Ty->isFloatTy() || Ty->isDoubleTy()) {
+    const auto *CFP = cast<ConstantFP>(C);
+    AI = CFP->getValueAPF().bitcastToAPInt();
+  } else if (Ty->isIntegerTy()) {
+    const auto *CI = cast<ConstantInt>(C);
+    AI = CI->getValue();
+  } else {
+    llvm_unreachable("unexpected constant pool element type!");
+  }
+  return APIntToHexString(AI);
+}
+
+const MCSection *
+X86WindowsTargetObjectFile::getSectionForConstant(SectionKind Kind,
+                                                  const Constant *C) const {
+  if (Kind.isReadOnly()) {
+    if (C) {
+      Type *Ty = C->getType();
+      SmallString<32> COMDATSymName;
+      if (Ty->isFloatTy() || Ty->isDoubleTy()) {
+        COMDATSymName = "__real@";
+        COMDATSymName += scalarConstantToHexString(C);
+      } else if (const auto *VTy = dyn_cast<VectorType>(Ty)) {
+        uint64_t NumBits = VTy->getBitWidth();
+        if (NumBits == 128 || NumBits == 256) {
+          COMDATSymName = NumBits == 128 ? "__xmm@" : "__ymm@";
+          for (int I = VTy->getNumElements() - 1, E = -1; I != E; --I)
+            COMDATSymName +=
+                scalarConstantToHexString(C->getAggregateElement(I));
+        }
+      }
+      if (!COMDATSymName.empty()) {
+        unsigned Characteristics = COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
+                                   COFF::IMAGE_SCN_MEM_READ |
+                                   COFF::IMAGE_SCN_LNK_COMDAT;
+        return getContext().getCOFFSection(".rdata", Characteristics, Kind,
+                                           COMDATSymName,
+                                           COFF::IMAGE_COMDAT_SELECT_ANY);
+      }
+    }
+  }
+
+  return TargetLoweringObjectFile::getSectionForConstant(Kind, C);
+}
diff --git a/lib/Target/X86/X86TargetObjectFile.h b/lib/Target/X86/X86TargetObjectFile.h
index a08ed09..6a6988a 100644
--- a/lib/Target/X86/X86TargetObjectFile.h
+++ b/lib/Target/X86/X86TargetObjectFile.h
@@ -7,8 +7,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_TARGET_X86_TARGETOBJECTFILE_H
-#define LLVM_TARGET_X86_TARGETOBJECTFILE_H
+#ifndef LLVM_LIB_TARGET_X86_X86TARGETOBJECTFILE_H
+#define LLVM_LIB_TARGET_X86_X86TARGETOBJECTFILE_H
 
 #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
 #include "llvm/Target/TargetLoweringObjectFile.h"
@@ -46,6 +46,11 @@ namespace llvm {
     const MCExpr *
     getExecutableRelativeSymbol(const ConstantExpr *CE, Mangler &Mang,
                                 const TargetMachine &TM) const override;
+
+    /// \brief Given a mergeable constant with the specified size and relocation
+    /// information, return a section that it should be placed in.
+    const MCSection *getSectionForConstant(SectionKind Kind,
+                                           const Constant *C) const override;
   };
 
 } // end namespace llvm
diff --git a/lib/Target/X86/X86TargetTransformInfo.cpp b/lib/Target/X86/X86TargetTransformInfo.cpp
index c961e2f..2b70fd0 100644
--- a/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -48,8 +48,8 @@ public:
   }
 
   X86TTI(const X86TargetMachine *TM)
-      : ImmutablePass(ID), ST(TM->getSubtargetImpl()),
-        TLI(TM->getTargetLowering()) {
+      : ImmutablePass(ID), ST(TM->getSubtargetImpl()),
+        TLI(TM->getSubtargetImpl()->getTargetLowering()) {
     initializeX86TTIPass(*PassRegistry::getPassRegistry());
   }
@@ -82,9 +82,10 @@ public:
   unsigned getNumberOfRegisters(bool Vector) const override;
   unsigned getRegisterBitWidth(bool Vector) const override;
-  unsigned getMaximumUnrollFactor() const override;
+  unsigned getMaxInterleaveFactor() const override;
   unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty, OperandValueKind,
-                                  OperandValueKind) const override;
+                                  OperandValueKind, OperandValueProperties,
+                                  OperandValueProperties) const override;
   unsigned getShuffleCost(ShuffleKind Kind, Type *Tp, int Index,
                           Type *SubTp) const override;
   unsigned getCastInstrCost(unsigned Opcode, Type *Dst,
@@ -166,7 +167,7 @@ unsigned X86TTI::getRegisterBitWidth(bool Vector) const {
 
 }
 
-unsigned X86TTI::getMaximumUnrollFactor() const {
+unsigned X86TTI::getMaxInterleaveFactor() const {
   if (ST->isAtom())
     return 1;
 
@@ -178,15 +179,37 @@ unsigned X86TTI::getMaximumUnrollFactor() const {
   return 2;
 }
 
-unsigned X86TTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
-                                        OperandValueKind Op1Info,
-                                        OperandValueKind Op2Info) const {
+unsigned X86TTI::getArithmeticInstrCost(
+    unsigned Opcode, Type *Ty, OperandValueKind Op1Info,
+    OperandValueKind Op2Info, OperandValueProperties Opd1PropInfo,
+    OperandValueProperties Opd2PropInfo) const {
   // Legalize the type.
   std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Ty);
 
   int ISD = TLI->InstructionOpcodeToISD(Opcode);
   assert(ISD && "Invalid opcode");
 
+  if (ISD == ISD::SDIV &&
+      Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
+      Opd2PropInfo == TargetTransformInfo::OP_PowerOf2) {
+    // On X86, vector signed division by constants power-of-two are
+    // normally expanded to the sequence SRA + SRL + ADD + SRA.
+    // The OperandValue properties many not be same as that of previous
+    // operation;conservatively assume OP_None.
+    unsigned Cost =
+        2 * getArithmeticInstrCost(Instruction::AShr, Ty, Op1Info, Op2Info,
+                                   TargetTransformInfo::OP_None,
+                                   TargetTransformInfo::OP_None);
+    Cost += getArithmeticInstrCost(Instruction::LShr, Ty, Op1Info, Op2Info,
+                                   TargetTransformInfo::OP_None,
+                                   TargetTransformInfo::OP_None);
+    Cost += getArithmeticInstrCost(Instruction::Add, Ty, Op1Info, Op2Info,
+                                   TargetTransformInfo::OP_None,
+                                   TargetTransformInfo::OP_None);
+
+    return Cost;
+  }
+
   static const CostTblEntry<MVT::SimpleValueType>
   AVX2UniformConstCostTable[] = {
     { ISD::SDIV, MVT::v16i16,  6 }, // vpmulhw sequence
@@ -202,6 +225,15 @@ unsigned X86TTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
       return LT.first * AVX2UniformConstCostTable[Idx].Cost;
   }
 
+  static const CostTblEntry<MVT::SimpleValueType> AVX512CostTable[] = {
+    { ISD::SHL,     MVT::v16i32,    1 },
+    { ISD::SRL,     MVT::v16i32,    1 },
+    { ISD::SRA,     MVT::v16i32,    1 },
+    { ISD::SHL,     MVT::v8i64,     1 },
+    { ISD::SRL,     MVT::v8i64,     1 },
+    { ISD::SRA,     MVT::v8i64,     1 },
+  };
+
   static const CostTblEntry<MVT::SimpleValueType> AVX2CostTable[] = {
     // Shifts on v4i64/v8i32 on AVX2 is legal even though we declare to
     // customize them to detect the cases where shift amount is a scalar one.
@@ -237,6 +269,11 @@ unsigned X86TTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
     { ISD::UDIV,  MVT::v4i64,  4*20 },
   };
 
+  if (ST->hasAVX512()) {
+    int Idx = CostTableLookup(AVX512CostTable, ISD, LT.second);
+    if (Idx != -1)
+      return LT.first * AVX512CostTable[Idx].Cost;
+  }
   // Look for AVX2 lowering tricks.
   if (ST->hasAVX2()) {
     if (ISD == ISD::SHL && LT.second == MVT::v16i16 &&
@@ -541,7 +578,7 @@ unsigned X86TTI::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) const {
     { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 16*10 },
     // There are faster sequences for float conversions.
     { ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, 15 },
-    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 15 },
+    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 8 },
     { ISD::UINT_TO_FP, MVT::v4f32, MVT::v8i16, 15 },
     { ISD::UINT_TO_FP, MVT::v4f32, MVT::v16i8, 8 },
     { ISD::SINT_TO_FP, MVT::v4f32, MVT::v2i64, 15 },
@@ -557,6 +594,45 @@ unsigned X86TTI::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) const {
       return LTSrc.first * SSE2ConvTbl[Idx].Cost;
   }
 
+  static const TypeConversionCostTblEntry<MVT::SimpleValueType>
+  AVX512ConversionTbl[] = {
+    { ISD::FP_EXTEND, MVT::v8f64,  MVT::v8f32,  1 },
+    { ISD::FP_EXTEND, MVT::v8f64,  MVT::v16f32, 3 },
+    { ISD::FP_ROUND,  MVT::v8f32,  MVT::v8f64,  1 },
+    { ISD::FP_ROUND,  MVT::v16f32, MVT::v8f64,  3 },
+
+    { ISD::TRUNCATE,  MVT::v16i8,  MVT::v16i32, 1 },
+    { ISD::TRUNCATE,  MVT::v16i16, MVT::v16i32, 1 },
+    { ISD::TRUNCATE,  MVT::v8i16,  MVT::v8i64,  1 },
+    { ISD::TRUNCATE,  MVT::v8i32,  MVT::v8i64,  1 },
+    { ISD::TRUNCATE,  MVT::v16i32, MVT::v8i64,  4 },
+
+    // v16i1 -> v16i32 - load + broadcast
+    { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1, 2 },
+    { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i1, 2 },
+
+    { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8,  1 },
+    { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8,  1 },
+    { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 1 },
+    { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 1 },
+    { ISD::SIGN_EXTEND, MVT::v8i64,  MVT::v16i32, 3 },
+    { ISD::ZERO_EXTEND, MVT::v8i64,  MVT::v16i32, 3 },
+
+    { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i1,  3 },
+    { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8,  2 },
+    { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i16, 2 },
+    { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i32, 1 },
+    { ISD::SINT_TO_FP, MVT::v8f64,  MVT::v8i1,   4 },
+    { ISD::SINT_TO_FP, MVT::v8f64,  MVT::v8i16,  2 },
+    { ISD::SINT_TO_FP, MVT::v8f64,  MVT::v8i32,  1 },
+  };
+
+  if (ST->hasAVX512()) {
+    int Idx = ConvertCostTableLookup(AVX512ConversionTbl, ISD, LTDest.second,
+                                     LTSrc.second);
+    if (Idx != -1)
+      return AVX512ConversionTbl[Idx].Cost;
+  }
   EVT SrcTy = TLI->getValueType(Src);
   EVT DstTy = TLI->getValueType(Dst);
@@ -589,6 +665,11 @@ unsigned X86TTI::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) const {
     { ISD::TRUNCATE,    MVT::v8i8,  MVT::v8i32, 2 },
     { ISD::TRUNCATE,    MVT::v8i16, MVT::v8i32, 2 },
     { ISD::TRUNCATE,    MVT::v8i32, MVT::v8i64, 4 },
+
+    { ISD::FP_EXTEND,   MVT::v8f64, MVT::v8f32, 3 },
+    { ISD::FP_ROUND,    MVT::v8f32, MVT::v8f64, 3 },
+
+    { ISD::UINT_TO_FP,  MVT::v8f32, MVT::v8i32, 8 },
   };
 
   static const TypeConversionCostTblEntry<MVT::SimpleValueType>
@@ -715,6 +796,19 @@ unsigned X86TTI::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
     { ISD::SETCC,   MVT::v32i8,   1 },
   };
 
+  static const CostTblEntry<MVT::SimpleValueType> AVX512CostTbl[] = {
+    { ISD::SETCC,   MVT::v8i64,   1 },
+    { ISD::SETCC,   MVT::v16i32,  1 },
+    { ISD::SETCC,   MVT::v8f64,   1 },
+    { ISD::SETCC,   MVT::v16f32,  1 },
+  };
+
+  if (ST->hasAVX512()) {
+    int Idx = CostTableLookup(AVX512CostTbl, ISD, MTy);
+    if (Idx != -1)
+      return LT.first * AVX512CostTbl[Idx].Cost;
+  }
+
   if (ST->hasAVX2()) {
     int Idx = CostTableLookup(AVX2CostTbl, ISD, MTy);
     if (Idx != -1)
diff --git a/lib/Target/X86/X86VZeroUpper.cpp b/lib/Target/X86/X86VZeroUpper.cpp
index 0bb5f99..d93baeb 100644
--- a/lib/Target/X86/X86VZeroUpper.cpp
+++ b/lib/Target/X86/X86VZeroUpper.cpp
@@ -250,7 +250,7 @@ bool VZeroUpperInserter::runOnMachineFunction(MachineFunction &MF) {
   const X86Subtarget &ST = MF.getTarget().getSubtarget<X86Subtarget>();
   if (!ST.hasAVX() || ST.hasAVX512())
     return false;
-  TII = MF.getTarget().getInstrInfo();
+  TII = MF.getSubtarget().getInstrInfo();
   MachineRegisterInfo &MRI = MF.getRegInfo();
   EverMadeChange = false;
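Editorial note (not part of the diff above): the new X86TargetMachine::getSubtargetImpl(const Function &F) hunk caches one subtarget per distinct combination of a function's "target-cpu", "target-features", and "use-soft-float" attributes, falling back to the module-level defaults when an attribute is absent. The standalone sketch below illustrates that memoization pattern in plain C++; the Subtarget, Function, and TargetMachineLike types and the attribute handling here are simplified, hypothetical stand-ins, not LLVM's actual API.

// Standalone illustration of per-function subtarget caching.
// All types are simplified stand-ins, not LLVM classes.
#include <iostream>
#include <map>
#include <memory>
#include <string>

struct Subtarget {
  std::string CPU, Features;
  bool SoftFloat;
  Subtarget(std::string C, std::string F, bool SF)
      : CPU(std::move(C)), Features(std::move(F)), SoftFloat(SF) {}
};

struct Function {
  // Per-function overrides; an empty string / unset flag means "use the
  // target machine default".
  std::string CPUAttr, FeaturesAttr;
  bool HasSoftFloatAttr = false, SoftFloatAttr = false;
};

class TargetMachineLike {
  std::string DefaultCPU, DefaultFS;
  bool DefaultSoftFloat;
  // One cached subtarget per distinct (CPU, features, soft-float) key.
  mutable std::map<std::string, std::unique_ptr<Subtarget>> SubtargetMap;

public:
  TargetMachineLike(std::string CPU, std::string FS, bool SoftFloat)
      : DefaultCPU(std::move(CPU)), DefaultFS(std::move(FS)),
        DefaultSoftFloat(SoftFloat) {}

  const Subtarget &getSubtarget(const Function &F) const {
    std::string CPU = F.CPUAttr.empty() ? DefaultCPU : F.CPUAttr;
    std::string FS = F.FeaturesAttr.empty() ? DefaultFS : F.FeaturesAttr;
    bool SoftFloat = F.HasSoftFloatAttr ? F.SoftFloatAttr : DefaultSoftFloat;

    // The key must include every property that can differ between two
    // functions; otherwise incompatible functions would share one subtarget.
    std::string Key =
        CPU + FS + (SoftFloat ? "use-soft-float=true" : "use-soft-float=false");
    auto &Entry = SubtargetMap[Key];
    if (!Entry)
      Entry = std::make_unique<Subtarget>(CPU, FS, SoftFloat);
    return *Entry;
  }

  std::size_t cachedSubtargets() const { return SubtargetMap.size(); }
};

int main() {
  TargetMachineLike TM("generic", "+sse2", /*SoftFloat=*/false);

  Function Plain;               // uses the target machine defaults
  Function Tuned;               // overrides the CPU attribute
  Tuned.CPUAttr = "core-avx2";

  const Subtarget &A = TM.getSubtarget(Plain);
  const Subtarget &B = TM.getSubtarget(Tuned);
  const Subtarget &C = TM.getSubtarget(Plain); // cache hit: same object as A

  std::cout << "distinct subtargets: " << TM.cachedSubtargets() << "\n"; // 2
  std::cout << "A and C identical:   " << (&A == &C) << "\n";            // 1
  std::cout << "B CPU:               " << B.CPU << "\n";
  return 0;
}

The design point mirrored here is that the cache key, not the Function object itself, determines sharing: two functions with identical attribute sets reuse one subtarget, while any attribute that affects code generation must be folded into the key.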