Diffstat (limited to 'lib/Target/X86')
44 files changed, 5684 insertions, 2296 deletions
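The X86AsmInstrumentation changes below hand-emit the standard AddressSanitizer shadow check as raw MCInst sequences, which makes them hard to read. As a reference while following those sequences, the small-access check behaves roughly like this C sketch (the usual ASan scheme with this patch's 32-bit kShadowOffset; the helper and its name are illustrative, but __asan_report_load4 matches what FuncName() in the patch produces):

    #include <stdint.h>

    /* The patch generates PLT calls like this via FuncName(AccessSize, IsWrite). */
    void __asan_report_load4(uintptr_t addr);

    /* Shadow check for a 4-byte load at 'addr', 32-bit x86, kShadowOffset = 0x20000000. */
    static void asan_check_load4(uintptr_t addr) {
      int8_t shadow = *(int8_t *)((addr >> 3) + 0x20000000);
      if (shadow != 0) {
        /* Nonzero shadow: only the first 'shadow' bytes of this 8-byte
           granule are addressable (signed compare, as in the MOVSX/JL pair). */
        int in_granule = (int)(addr & 7);
        if (in_granule + 4 - 1 >= shadow)
          __asan_report_load4(addr);
      }
    }

For 8- and 16-byte accesses, the large-access path instead requires the whole shadow byte or word to be zero, which is the CMP8mi/CMP16mi against immediate 0 in InstrumentMemOperandLargeImpl below.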
diff --git a/lib/Target/X86/Android.mk b/lib/Target/X86/Android.mk index 0d0a9ca..e2c4be7 100644 --- a/lib/Target/X86/Android.mk +++ b/lib/Target/X86/Android.mk @@ -12,6 +12,7 @@ x86_codegen_TBLGEN_TABLES := \ x86_codegen_SRC_FILES := \ X86AsmPrinter.cpp \ + X86AtomicExpandPass.cpp \ X86CodeEmitter.cpp \ X86FastISel.cpp \ X86FixupLEAs.cpp \ diff --git a/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp b/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp index f3e6b3f..a365f62 100644 --- a/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp +++ b/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp @@ -20,6 +20,7 @@ #include "llvm/MC/MCParser/MCParsedAsmOperand.h" #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/MC/MCTargetAsmParser.h" #include "llvm/MC/MCTargetOptions.h" #include "llvm/Support/CommandLine.h" @@ -36,8 +37,8 @@ bool IsStackReg(unsigned Reg) { } std::string FuncName(unsigned AccessSize, bool IsWrite) { - return std::string("__sanitizer_sanitize_") + (IsWrite ? "store" : "load") + - (utostr(AccessSize)); + return std::string("__asan_report_") + (IsWrite ? "store" : "load") + + utostr(AccessSize); } class X86AddressSanitizer : public X86AsmInstrumentation { @@ -47,47 +48,55 @@ public: // X86AsmInstrumentation implementation: virtual void InstrumentInstruction( - const MCInst &Inst, SmallVectorImpl<MCParsedAsmOperand *> &Operands, - MCContext &Ctx, const MCInstrInfo &MII, MCStreamer &Out) override { + const MCInst &Inst, OperandVector &Operands, MCContext &Ctx, + const MCInstrInfo &MII, MCStreamer &Out) override { InstrumentMOV(Inst, Operands, Ctx, MII, Out); } // Should be implemented differently in x86_32 and x86_64 subclasses. - virtual void InstrumentMemOperandImpl(X86Operand *Op, unsigned AccessSize, - bool IsWrite, MCContext &Ctx, - MCStreamer &Out) = 0; + virtual void InstrumentMemOperandSmallImpl( + X86Operand &Op, unsigned AccessSize, bool IsWrite, MCContext &Ctx, + MCStreamer &Out) = 0; + virtual void InstrumentMemOperandLargeImpl( + X86Operand &Op, unsigned AccessSize, bool IsWrite, MCContext &Ctx, + MCStreamer &Out) = 0; - void InstrumentMemOperand(MCParsedAsmOperand *Op, unsigned AccessSize, + void InstrumentMemOperand(MCParsedAsmOperand &Op, unsigned AccessSize, bool IsWrite, MCContext &Ctx, MCStreamer &Out); - void InstrumentMOV(const MCInst &Inst, - SmallVectorImpl<MCParsedAsmOperand *> &Operands, + void InstrumentMOV(const MCInst &Inst, OperandVector &Operands, MCContext &Ctx, const MCInstrInfo &MII, MCStreamer &Out); void EmitInstruction(MCStreamer &Out, const MCInst &Inst) { Out.EmitInstruction(Inst, STI); } + void EmitLabel(MCStreamer &Out, MCSymbol *Label) { Out.EmitLabel(Label); } + protected: const MCSubtargetInfo &STI; }; void X86AddressSanitizer::InstrumentMemOperand( - MCParsedAsmOperand *Op, unsigned AccessSize, bool IsWrite, MCContext &Ctx, + MCParsedAsmOperand &Op, unsigned AccessSize, bool IsWrite, MCContext &Ctx, MCStreamer &Out) { - assert(Op && Op->isMem() && "Op should be a memory operand."); + assert(Op.isMem() && "Op should be a memory operand."); assert((AccessSize & (AccessSize - 1)) == 0 && AccessSize <= 16 && "AccessSize should be a power of two, less or equal than 16."); - X86Operand *MemOp = static_cast<X86Operand *>(Op); + X86Operand &MemOp = static_cast<X86Operand &>(Op); // FIXME: get rid of this limitation. 
- if (IsStackReg(MemOp->getMemBaseReg()) || IsStackReg(MemOp->getMemIndexReg())) + if (IsStackReg(MemOp.getMemBaseReg()) || IsStackReg(MemOp.getMemIndexReg())) return; - InstrumentMemOperandImpl(MemOp, AccessSize, IsWrite, Ctx, Out); + // FIXME: take into account load/store alignment. + if (AccessSize < 8) + InstrumentMemOperandSmallImpl(MemOp, AccessSize, IsWrite, Ctx, Out); + else + InstrumentMemOperandLargeImpl(MemOp, AccessSize, IsWrite, Ctx, Out); } void X86AddressSanitizer::InstrumentMOV( - const MCInst &Inst, SmallVectorImpl<MCParsedAsmOperand *> &Operands, - MCContext &Ctx, const MCInstrInfo &MII, MCStreamer &Out) { + const MCInst &Inst, OperandVector &Operands, MCContext &Ctx, + const MCInstrInfo &MII, MCStreamer &Out) { // Access size in bytes. unsigned AccessSize = 0; @@ -124,107 +133,351 @@ void X86AddressSanitizer::InstrumentMOV( const bool IsWrite = MII.get(Inst.getOpcode()).mayStore(); for (unsigned Ix = 0; Ix < Operands.size(); ++Ix) { - MCParsedAsmOperand *Op = Operands[Ix]; - if (Op && Op->isMem()) + assert(Operands[Ix]); + MCParsedAsmOperand &Op = *Operands[Ix]; + if (Op.isMem()) InstrumentMemOperand(Op, AccessSize, IsWrite, Ctx, Out); } } class X86AddressSanitizer32 : public X86AddressSanitizer { public: + static const long kShadowOffset = 0x20000000; + X86AddressSanitizer32(const MCSubtargetInfo &STI) : X86AddressSanitizer(STI) {} virtual ~X86AddressSanitizer32() {} - virtual void InstrumentMemOperandImpl(X86Operand *Op, unsigned AccessSize, - bool IsWrite, MCContext &Ctx, - MCStreamer &Out) override; + virtual void InstrumentMemOperandSmallImpl( + X86Operand &Op, unsigned AccessSize, bool IsWrite, MCContext &Ctx, + MCStreamer &Out) override; + virtual void InstrumentMemOperandLargeImpl( + X86Operand &Op, unsigned AccessSize, bool IsWrite, MCContext &Ctx, + MCStreamer &Out) override; + + private: + void EmitCallAsanReport(MCContext &Ctx, MCStreamer &Out, unsigned AccessSize, + bool IsWrite, unsigned AddressReg) { + EmitInstruction(Out, MCInstBuilder(X86::CLD)); + EmitInstruction(Out, MCInstBuilder(X86::MMX_EMMS)); + + EmitInstruction(Out, MCInstBuilder(X86::AND64ri8).addReg(X86::ESP) + .addReg(X86::ESP).addImm(-16)); + EmitInstruction(Out, MCInstBuilder(X86::PUSH32r).addReg(AddressReg)); + + + const std::string& Fn = FuncName(AccessSize, IsWrite); + MCSymbol *FnSym = Ctx.GetOrCreateSymbol(StringRef(Fn)); + const MCSymbolRefExpr *FnExpr = + MCSymbolRefExpr::Create(FnSym, MCSymbolRefExpr::VK_PLT, Ctx); + EmitInstruction(Out, MCInstBuilder(X86::CALLpcrel32).addExpr(FnExpr)); + } }; -void X86AddressSanitizer32::InstrumentMemOperandImpl( - X86Operand *Op, unsigned AccessSize, bool IsWrite, MCContext &Ctx, +void X86AddressSanitizer32::InstrumentMemOperandSmallImpl( + X86Operand &Op, unsigned AccessSize, bool IsWrite, MCContext &Ctx, MCStreamer &Out) { - // FIXME: emit .cfi directives for correct stack unwinding. 
EmitInstruction(Out, MCInstBuilder(X86::PUSH32r).addReg(X86::EAX)); + EmitInstruction(Out, MCInstBuilder(X86::PUSH32r).addReg(X86::ECX)); + EmitInstruction(Out, MCInstBuilder(X86::PUSH32r).addReg(X86::EDX)); + EmitInstruction(Out, MCInstBuilder(X86::PUSHF32)); + { MCInst Inst; Inst.setOpcode(X86::LEA32r); Inst.addOperand(MCOperand::CreateReg(X86::EAX)); + Op.addMemOperands(Inst, 5); + EmitInstruction(Out, Inst); + } + + EmitInstruction( + Out, MCInstBuilder(X86::MOV32rr).addReg(X86::ECX).addReg(X86::EAX)); + EmitInstruction(Out, MCInstBuilder(X86::SHR32ri).addReg(X86::ECX) + .addReg(X86::ECX).addImm(3)); + + { + MCInst Inst; + Inst.setOpcode(X86::MOV8rm); + Inst.addOperand(MCOperand::CreateReg(X86::CL)); + const MCExpr *Disp = MCConstantExpr::Create(kShadowOffset, Ctx); + std::unique_ptr<X86Operand> Op( + X86Operand::CreateMem(0, Disp, X86::ECX, 0, 1, SMLoc(), SMLoc())); + Op->addMemOperands(Inst, 5); + EmitInstruction(Out, Inst); + } + + EmitInstruction(Out, + MCInstBuilder(X86::TEST8rr).addReg(X86::CL).addReg(X86::CL)); + MCSymbol *DoneSym = Ctx.CreateTempSymbol(); + const MCExpr *DoneExpr = MCSymbolRefExpr::Create(DoneSym, Ctx); + EmitInstruction(Out, MCInstBuilder(X86::JE_4).addExpr(DoneExpr)); + + EmitInstruction( + Out, MCInstBuilder(X86::MOV32rr).addReg(X86::EDX).addReg(X86::EAX)); + EmitInstruction(Out, MCInstBuilder(X86::AND32ri).addReg(X86::EDX) + .addReg(X86::EDX).addImm(7)); + + switch (AccessSize) { + case 1: + break; + case 2: { + MCInst Inst; + Inst.setOpcode(X86::LEA32r); + Inst.addOperand(MCOperand::CreateReg(X86::EDX)); + + const MCExpr *Disp = MCConstantExpr::Create(1, Ctx); + std::unique_ptr<X86Operand> Op( + X86Operand::CreateMem(0, Disp, X86::EDX, 0, 1, SMLoc(), SMLoc())); Op->addMemOperands(Inst, 5); EmitInstruction(Out, Inst); + break; } + case 4: + EmitInstruction(Out, MCInstBuilder(X86::ADD32ri8).addReg(X86::EDX) + .addReg(X86::EDX).addImm(3)); + break; + default: + assert(false && "Incorrect access size"); + break; + } + + EmitInstruction( + Out, MCInstBuilder(X86::MOVSX32rr8).addReg(X86::ECX).addReg(X86::CL)); + EmitInstruction( + Out, MCInstBuilder(X86::CMP32rr).addReg(X86::EDX).addReg(X86::ECX)); + EmitInstruction(Out, MCInstBuilder(X86::JL_4).addExpr(DoneExpr)); + + EmitCallAsanReport(Ctx, Out, AccessSize, IsWrite, X86::EAX); + EmitLabel(Out, DoneSym); + + EmitInstruction(Out, MCInstBuilder(X86::POPF32)); + EmitInstruction(Out, MCInstBuilder(X86::POP32r).addReg(X86::EDX)); + EmitInstruction(Out, MCInstBuilder(X86::POP32r).addReg(X86::ECX)); + EmitInstruction(Out, MCInstBuilder(X86::POP32r).addReg(X86::EAX)); +} + +void X86AddressSanitizer32::InstrumentMemOperandLargeImpl( + X86Operand &Op, unsigned AccessSize, bool IsWrite, MCContext &Ctx, + MCStreamer &Out) { EmitInstruction(Out, MCInstBuilder(X86::PUSH32r).addReg(X86::EAX)); + EmitInstruction(Out, MCInstBuilder(X86::PUSH32r).addReg(X86::ECX)); + EmitInstruction(Out, MCInstBuilder(X86::PUSHF32)); + { - const std::string Func = FuncName(AccessSize, IsWrite); - const MCSymbol *FuncSym = Ctx.GetOrCreateSymbol(StringRef(Func)); - const MCSymbolRefExpr *FuncExpr = - MCSymbolRefExpr::Create(FuncSym, MCSymbolRefExpr::VK_PLT, Ctx); - EmitInstruction(Out, MCInstBuilder(X86::CALLpcrel32).addExpr(FuncExpr)); + MCInst Inst; + Inst.setOpcode(X86::LEA32r); + Inst.addOperand(MCOperand::CreateReg(X86::EAX)); + Op.addMemOperands(Inst, 5); + EmitInstruction(Out, Inst); } - EmitInstruction(Out, MCInstBuilder(X86::POP32r).addReg(X86::EAX)); + EmitInstruction( + Out, 
MCInstBuilder(X86::MOV32rr).addReg(X86::ECX).addReg(X86::EAX)); + EmitInstruction(Out, MCInstBuilder(X86::SHR32ri).addReg(X86::ECX) + .addReg(X86::ECX).addImm(3)); + { + MCInst Inst; + switch (AccessSize) { + case 8: + Inst.setOpcode(X86::CMP8mi); + break; + case 16: + Inst.setOpcode(X86::CMP16mi); + break; + default: + assert(false && "Incorrect access size"); + break; + } + const MCExpr *Disp = MCConstantExpr::Create(kShadowOffset, Ctx); + std::unique_ptr<X86Operand> Op( + X86Operand::CreateMem(0, Disp, X86::ECX, 0, 1, SMLoc(), SMLoc())); + Op->addMemOperands(Inst, 5); + Inst.addOperand(MCOperand::CreateImm(0)); + EmitInstruction(Out, Inst); + } + MCSymbol *DoneSym = Ctx.CreateTempSymbol(); + const MCExpr *DoneExpr = MCSymbolRefExpr::Create(DoneSym, Ctx); + EmitInstruction(Out, MCInstBuilder(X86::JE_4).addExpr(DoneExpr)); + + EmitCallAsanReport(Ctx, Out, AccessSize, IsWrite, X86::EAX); + EmitLabel(Out, DoneSym); + + EmitInstruction(Out, MCInstBuilder(X86::POPF32)); + EmitInstruction(Out, MCInstBuilder(X86::POP32r).addReg(X86::ECX)); EmitInstruction(Out, MCInstBuilder(X86::POP32r).addReg(X86::EAX)); } class X86AddressSanitizer64 : public X86AddressSanitizer { public: + static const long kShadowOffset = 0x7fff8000; + X86AddressSanitizer64(const MCSubtargetInfo &STI) : X86AddressSanitizer(STI) {} virtual ~X86AddressSanitizer64() {} - virtual void InstrumentMemOperandImpl(X86Operand *Op, unsigned AccessSize, - bool IsWrite, MCContext &Ctx, - MCStreamer &Out) override; -}; + virtual void InstrumentMemOperandSmallImpl( + X86Operand &Op, unsigned AccessSize, bool IsWrite, MCContext &Ctx, + MCStreamer &Out) override; + virtual void InstrumentMemOperandLargeImpl( + X86Operand &Op, unsigned AccessSize, bool IsWrite, MCContext &Ctx, + MCStreamer &Out) override; -void X86AddressSanitizer64::InstrumentMemOperandImpl(X86Operand *Op, - unsigned AccessSize, - bool IsWrite, - MCContext &Ctx, - MCStreamer &Out) { - // FIXME: emit .cfi directives for correct stack unwinding. - - // Set %rsp below current red zone (128 bytes wide) using LEA instruction to - // preserve flags. 
- { +private: + void EmitAdjustRSP(MCContext &Ctx, MCStreamer &Out, long Offset) { MCInst Inst; Inst.setOpcode(X86::LEA64r); Inst.addOperand(MCOperand::CreateReg(X86::RSP)); - const MCExpr *Disp = MCConstantExpr::Create(-128, Ctx); + const MCExpr *Disp = MCConstantExpr::Create(Offset, Ctx); std::unique_ptr<X86Operand> Op( X86Operand::CreateMem(0, Disp, X86::RSP, 0, 1, SMLoc(), SMLoc())); Op->addMemOperands(Inst, 5); EmitInstruction(Out, Inst); } + + void EmitCallAsanReport(MCContext &Ctx, MCStreamer &Out, unsigned AccessSize, + bool IsWrite) { + EmitInstruction(Out, MCInstBuilder(X86::CLD)); + EmitInstruction(Out, MCInstBuilder(X86::MMX_EMMS)); + + EmitInstruction(Out, MCInstBuilder(X86::AND64ri8).addReg(X86::RSP) + .addReg(X86::RSP).addImm(-16)); + + const std::string& Fn = FuncName(AccessSize, IsWrite); + MCSymbol *FnSym = Ctx.GetOrCreateSymbol(StringRef(Fn)); + const MCSymbolRefExpr *FnExpr = + MCSymbolRefExpr::Create(FnSym, MCSymbolRefExpr::VK_PLT, Ctx); + EmitInstruction(Out, MCInstBuilder(X86::CALL64pcrel32).addExpr(FnExpr)); + } +}; + +void X86AddressSanitizer64::InstrumentMemOperandSmallImpl( + X86Operand &Op, unsigned AccessSize, bool IsWrite, MCContext &Ctx, + MCStreamer &Out) { + EmitAdjustRSP(Ctx, Out, -128); + EmitInstruction(Out, MCInstBuilder(X86::PUSH64r).addReg(X86::RAX)); + EmitInstruction(Out, MCInstBuilder(X86::PUSH64r).addReg(X86::RCX)); EmitInstruction(Out, MCInstBuilder(X86::PUSH64r).addReg(X86::RDI)); + EmitInstruction(Out, MCInstBuilder(X86::PUSHF64)); { MCInst Inst; Inst.setOpcode(X86::LEA64r); Inst.addOperand(MCOperand::CreateReg(X86::RDI)); - Op->addMemOperands(Inst, 5); + Op.addMemOperands(Inst, 5); EmitInstruction(Out, Inst); } + EmitInstruction( + Out, MCInstBuilder(X86::MOV64rr).addReg(X86::RAX).addReg(X86::RDI)); + EmitInstruction(Out, MCInstBuilder(X86::SHR64ri).addReg(X86::RAX) + .addReg(X86::RAX).addImm(3)); { - const std::string Func = FuncName(AccessSize, IsWrite); - const MCSymbol *FuncSym = Ctx.GetOrCreateSymbol(StringRef(Func)); - const MCSymbolRefExpr *FuncExpr = - MCSymbolRefExpr::Create(FuncSym, MCSymbolRefExpr::VK_PLT, Ctx); - EmitInstruction(Out, MCInstBuilder(X86::CALL64pcrel32).addExpr(FuncExpr)); + MCInst Inst; + Inst.setOpcode(X86::MOV8rm); + Inst.addOperand(MCOperand::CreateReg(X86::AL)); + const MCExpr *Disp = MCConstantExpr::Create(kShadowOffset, Ctx); + std::unique_ptr<X86Operand> Op( + X86Operand::CreateMem(0, Disp, X86::RAX, 0, 1, SMLoc(), SMLoc())); + Op->addMemOperands(Inst, 5); + EmitInstruction(Out, Inst); + } + + EmitInstruction(Out, + MCInstBuilder(X86::TEST8rr).addReg(X86::AL).addReg(X86::AL)); + MCSymbol *DoneSym = Ctx.CreateTempSymbol(); + const MCExpr *DoneExpr = MCSymbolRefExpr::Create(DoneSym, Ctx); + EmitInstruction(Out, MCInstBuilder(X86::JE_4).addExpr(DoneExpr)); + + EmitInstruction( + Out, MCInstBuilder(X86::MOV32rr).addReg(X86::ECX).addReg(X86::EDI)); + EmitInstruction(Out, MCInstBuilder(X86::AND32ri).addReg(X86::ECX) + .addReg(X86::ECX).addImm(7)); + + switch (AccessSize) { + case 1: + break; + case 2: { + MCInst Inst; + Inst.setOpcode(X86::LEA32r); + Inst.addOperand(MCOperand::CreateReg(X86::ECX)); + + const MCExpr *Disp = MCConstantExpr::Create(1, Ctx); + std::unique_ptr<X86Operand> Op( + X86Operand::CreateMem(0, Disp, X86::ECX, 0, 1, SMLoc(), SMLoc())); + Op->addMemOperands(Inst, 5); + EmitInstruction(Out, Inst); + break; } + case 4: + EmitInstruction(Out, MCInstBuilder(X86::ADD32ri8).addReg(X86::ECX) + .addReg(X86::ECX).addImm(3)); + break; + default: + assert(false && "Incorrect access size"); + break; + } + + 
EmitInstruction( + Out, MCInstBuilder(X86::MOVSX32rr8).addReg(X86::EAX).addReg(X86::AL)); + EmitInstruction( + Out, MCInstBuilder(X86::CMP32rr).addReg(X86::ECX).addReg(X86::EAX)); + EmitInstruction(Out, MCInstBuilder(X86::JL_4).addExpr(DoneExpr)); + + EmitCallAsanReport(Ctx, Out, AccessSize, IsWrite); + EmitLabel(Out, DoneSym); + + EmitInstruction(Out, MCInstBuilder(X86::POPF64)); EmitInstruction(Out, MCInstBuilder(X86::POP64r).addReg(X86::RDI)); + EmitInstruction(Out, MCInstBuilder(X86::POP64r).addReg(X86::RCX)); + EmitInstruction(Out, MCInstBuilder(X86::POP64r).addReg(X86::RAX)); + EmitAdjustRSP(Ctx, Out, 128); +} + +void X86AddressSanitizer64::InstrumentMemOperandLargeImpl( + X86Operand &Op, unsigned AccessSize, bool IsWrite, MCContext &Ctx, + MCStreamer &Out) { + EmitAdjustRSP(Ctx, Out, -128); + EmitInstruction(Out, MCInstBuilder(X86::PUSH64r).addReg(X86::RAX)); + EmitInstruction(Out, MCInstBuilder(X86::PUSHF64)); - // Restore old %rsp value. { MCInst Inst; Inst.setOpcode(X86::LEA64r); - Inst.addOperand(MCOperand::CreateReg(X86::RSP)); - - const MCExpr *Disp = MCConstantExpr::Create(128, Ctx); + Inst.addOperand(MCOperand::CreateReg(X86::RAX)); + Op.addMemOperands(Inst, 5); + EmitInstruction(Out, Inst); + } + EmitInstruction(Out, MCInstBuilder(X86::SHR64ri).addReg(X86::RAX) + .addReg(X86::RAX).addImm(3)); + { + MCInst Inst; + switch (AccessSize) { + case 8: + Inst.setOpcode(X86::CMP8mi); + break; + case 16: + Inst.setOpcode(X86::CMP16mi); + break; + default: + assert(false && "Incorrect access size"); + break; + } + const MCExpr *Disp = MCConstantExpr::Create(kShadowOffset, Ctx); std::unique_ptr<X86Operand> Op( - X86Operand::CreateMem(0, Disp, X86::RSP, 0, 1, SMLoc(), SMLoc())); + X86Operand::CreateMem(0, Disp, X86::RAX, 0, 1, SMLoc(), SMLoc())); Op->addMemOperands(Inst, 5); + Inst.addOperand(MCOperand::CreateImm(0)); EmitInstruction(Out, Inst); } + + MCSymbol *DoneSym = Ctx.CreateTempSymbol(); + const MCExpr *DoneExpr = MCSymbolRefExpr::Create(DoneSym, Ctx); + EmitInstruction(Out, MCInstBuilder(X86::JE_4).addExpr(DoneExpr)); + + EmitCallAsanReport(Ctx, Out, AccessSize, IsWrite); + EmitLabel(Out, DoneSym); + + EmitInstruction(Out, MCInstBuilder(X86::POPF64)); + EmitInstruction(Out, MCInstBuilder(X86::POP64r).addReg(X86::RAX)); + EmitAdjustRSP(Ctx, Out, 128); } } // End anonymous namespace @@ -233,8 +486,8 @@ X86AsmInstrumentation::X86AsmInstrumentation() {} X86AsmInstrumentation::~X86AsmInstrumentation() {} void X86AsmInstrumentation::InstrumentInstruction( - const MCInst &Inst, SmallVectorImpl<MCParsedAsmOperand *> &Operands, - MCContext &Ctx, const MCInstrInfo &MII, MCStreamer &Out) {} + const MCInst &Inst, OperandVector &Operands, MCContext &Ctx, + const MCInstrInfo &MII, MCStreamer &Out) {} X86AsmInstrumentation * CreateX86AsmInstrumentation(const MCTargetOptions &MCOptions, diff --git a/lib/Target/X86/AsmParser/X86AsmInstrumentation.h b/lib/Target/X86/AsmParser/X86AsmInstrumentation.h index 0369b14..1bc3c09 100644 --- a/lib/Target/X86/AsmParser/X86AsmInstrumentation.h +++ b/lib/Target/X86/AsmParser/X86AsmInstrumentation.h @@ -12,6 +12,8 @@ #include "llvm/ADT/SmallVector.h" +#include <memory> + namespace llvm { class MCContext; @@ -35,10 +37,9 @@ public: // Instruments Inst. Should be called just before the original // instruction is sent to Out. 
virtual void InstrumentInstruction( - const MCInst &Inst, SmallVectorImpl<MCParsedAsmOperand *> &Operands, - MCContext &Ctx, - const MCInstrInfo &MII, - MCStreamer &Out); + const MCInst &Inst, + SmallVectorImpl<std::unique_ptr<MCParsedAsmOperand>> &Operands, + MCContext &Ctx, const MCInstrInfo &MII, MCStreamer &Out); protected: friend X86AsmInstrumentation * diff --git a/lib/Target/X86/AsmParser/X86AsmParser.cpp b/lib/Target/X86/AsmParser/X86AsmParser.cpp index d3e695e..f0765ed 100644 --- a/lib/Target/X86/AsmParser/X86AsmParser.cpp +++ b/lib/Target/X86/AsmParser/X86AsmParser.cpp @@ -235,6 +235,7 @@ private: IES_RSHIFT, IES_PLUS, IES_MINUS, + IES_NOT, IES_MULTIPLY, IES_DIVIDE, IES_LBRAC, @@ -372,6 +373,7 @@ private: State = IES_ERROR; break; case IES_PLUS: + case IES_NOT: case IES_MULTIPLY: case IES_DIVIDE: case IES_LPAREN: @@ -401,6 +403,19 @@ private: } PrevState = CurrState; } + void onNot() { + IntelExprState CurrState = State; + switch (State) { + default: + State = IES_ERROR; + break; + case IES_PLUS: + case IES_NOT: + State = IES_NOT; + break; + } + PrevState = CurrState; + } void onRegister(unsigned Reg) { IntelExprState CurrState = State; switch (State) { @@ -438,6 +453,7 @@ private: break; case IES_PLUS: case IES_MINUS: + case IES_NOT: State = IES_INTEGER; Sym = SymRef; SymName = SymRefName; @@ -453,6 +469,7 @@ private: break; case IES_PLUS: case IES_MINUS: + case IES_NOT: case IES_OR: case IES_AND: case IES_LSHIFT: @@ -476,11 +493,22 @@ private: PrevState == IES_OR || PrevState == IES_AND || PrevState == IES_LSHIFT || PrevState == IES_RSHIFT || PrevState == IES_MULTIPLY || PrevState == IES_DIVIDE || - PrevState == IES_LPAREN || PrevState == IES_LBRAC) && + PrevState == IES_LPAREN || PrevState == IES_LBRAC || + PrevState == IES_NOT) && CurrState == IES_MINUS) { // Unary minus. No need to pop the minus operand because it was never // pushed. IC.pushOperand(IC_IMM, -TmpInt); // Push -Imm. + } else if ((PrevState == IES_PLUS || PrevState == IES_MINUS || + PrevState == IES_OR || PrevState == IES_AND || + PrevState == IES_LSHIFT || PrevState == IES_RSHIFT || + PrevState == IES_MULTIPLY || PrevState == IES_DIVIDE || + PrevState == IES_LPAREN || PrevState == IES_LBRAC || + PrevState == IES_NOT) && + CurrState == IES_NOT) { + // Unary not. No need to pop the not operand because it was never + // pushed. + IC.pushOperand(IC_IMM, ~TmpInt); // Push ~Imm. } else { IC.pushOperand(IC_IMM, TmpInt); } @@ -561,6 +589,7 @@ private: break; case IES_PLUS: case IES_MINUS: + case IES_NOT: case IES_OR: case IES_AND: case IES_LSHIFT: @@ -568,13 +597,14 @@ private: case IES_MULTIPLY: case IES_DIVIDE: case IES_LPAREN: - // FIXME: We don't handle this type of unary minus, yet. + // FIXME: We don't handle this type of unary minus or not, yet. 
if ((PrevState == IES_PLUS || PrevState == IES_MINUS || PrevState == IES_OR || PrevState == IES_AND || PrevState == IES_LSHIFT || PrevState == IES_RSHIFT || PrevState == IES_MULTIPLY || PrevState == IES_DIVIDE || - PrevState == IES_LPAREN || PrevState == IES_LBRAC) && - CurrState == IES_MINUS) { + PrevState == IES_LPAREN || PrevState == IES_LBRAC || + PrevState == IES_NOT) && + (CurrState == IES_MINUS || CurrState == IES_NOT)) { State = IES_ERROR; break; } @@ -618,52 +648,52 @@ private: return Error(L, Msg, Ranges, MatchingInlineAsm); } - X86Operand *ErrorOperand(SMLoc Loc, StringRef Msg) { + std::nullptr_t ErrorOperand(SMLoc Loc, StringRef Msg) { Error(Loc, Msg); return nullptr; } - X86Operand *DefaultMemSIOperand(SMLoc Loc); - X86Operand *DefaultMemDIOperand(SMLoc Loc); - X86Operand *ParseOperand(); - X86Operand *ParseATTOperand(); - X86Operand *ParseIntelOperand(); - X86Operand *ParseIntelOffsetOfOperator(); + std::unique_ptr<X86Operand> DefaultMemSIOperand(SMLoc Loc); + std::unique_ptr<X86Operand> DefaultMemDIOperand(SMLoc Loc); + std::unique_ptr<X86Operand> ParseOperand(); + std::unique_ptr<X86Operand> ParseATTOperand(); + std::unique_ptr<X86Operand> ParseIntelOperand(); + std::unique_ptr<X86Operand> ParseIntelOffsetOfOperator(); bool ParseIntelDotOperator(const MCExpr *Disp, const MCExpr *&NewDisp); - X86Operand *ParseIntelOperator(unsigned OpKind); - X86Operand *ParseIntelSegmentOverride(unsigned SegReg, SMLoc Start, unsigned Size); - X86Operand *ParseIntelMemOperand(int64_t ImmDisp, SMLoc StartLoc, - unsigned Size); + std::unique_ptr<X86Operand> ParseIntelOperator(unsigned OpKind); + std::unique_ptr<X86Operand> + ParseIntelSegmentOverride(unsigned SegReg, SMLoc Start, unsigned Size); + std::unique_ptr<X86Operand> + ParseIntelMemOperand(int64_t ImmDisp, SMLoc StartLoc, unsigned Size); bool ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End); - X86Operand *ParseIntelBracExpression(unsigned SegReg, SMLoc Start, - int64_t ImmDisp, unsigned Size); + std::unique_ptr<X86Operand> ParseIntelBracExpression(unsigned SegReg, + SMLoc Start, + int64_t ImmDisp, + unsigned Size); bool ParseIntelIdentifier(const MCExpr *&Val, StringRef &Identifier, InlineAsmIdentifierInfo &Info, bool IsUnevaluatedOperand, SMLoc &End); - X86Operand *ParseMemOperand(unsigned SegReg, SMLoc StartLoc); + std::unique_ptr<X86Operand> ParseMemOperand(unsigned SegReg, SMLoc StartLoc); - X86Operand *CreateMemForInlineAsm(unsigned SegReg, const MCExpr *Disp, - unsigned BaseReg, unsigned IndexReg, - unsigned Scale, SMLoc Start, SMLoc End, - unsigned Size, StringRef Identifier, - InlineAsmIdentifierInfo &Info); + std::unique_ptr<X86Operand> + CreateMemForInlineAsm(unsigned SegReg, const MCExpr *Disp, unsigned BaseReg, + unsigned IndexReg, unsigned Scale, SMLoc Start, + SMLoc End, unsigned Size, StringRef Identifier, + InlineAsmIdentifierInfo &Info); bool ParseDirectiveWord(unsigned Size, SMLoc L); bool ParseDirectiveCode(StringRef IDVal, SMLoc L); - bool processInstruction(MCInst &Inst, - const SmallVectorImpl<MCParsedAsmOperand*> &Ops); + bool processInstruction(MCInst &Inst, const OperandVector &Ops); /// Wrapper around MCStreamer::EmitInstruction(). Possibly adds /// instrumentation around Inst. 
- void EmitInstruction(MCInst &Inst, - SmallVectorImpl<MCParsedAsmOperand *> &Operands, - MCStreamer &Out); + void EmitInstruction(MCInst &Inst, OperandVector &Operands, MCStreamer &Out); bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, - SmallVectorImpl<MCParsedAsmOperand*> &Operands, - MCStreamer &Out, unsigned &ErrorInfo, + OperandVector &Operands, MCStreamer &Out, + unsigned &ErrorInfo, bool MatchingInlineAsm) override; /// doSrcDstMatch - Returns true if operands are matching in their @@ -674,8 +704,8 @@ private: /// Parses AVX512 specific operand primitives: masked registers ({%k<NUM>}, {z}) /// and memory broadcasting ({1to<NUM>}) primitives, updating Operands vector if required. /// \return \c true if no parsing errors occurred, \c false otherwise. - bool HandleAVX512Operand(SmallVectorImpl<MCParsedAsmOperand*> &Operands, - const MCParsedAsmOperand &Op); + bool HandleAVX512Operand(OperandVector &Operands, + const MCParsedAsmOperand &Op); bool is64BitMode() const { // FIXME: Can tablegen auto-generate this? @@ -725,9 +755,8 @@ public: bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) override; - bool - ParseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc, - SmallVectorImpl<MCParsedAsmOperand*> &Operands) override; + bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name, + SMLoc NameLoc, OperandVector &Operands) override; bool ParseDirective(AsmToken DirectiveID) override; }; @@ -908,7 +937,7 @@ bool X86AsmParser::ParseRegister(unsigned &RegNo, return false; } -X86Operand *X86AsmParser::DefaultMemSIOperand(SMLoc Loc) { +std::unique_ptr<X86Operand> X86AsmParser::DefaultMemSIOperand(SMLoc Loc) { unsigned basereg = is64BitMode() ? X86::RSI : (is32BitMode() ? X86::ESI : X86::SI); const MCExpr *Disp = MCConstantExpr::Create(0, getContext()); @@ -916,7 +945,7 @@ X86Operand *X86AsmParser::DefaultMemSIOperand(SMLoc Loc) { /*IndexReg=*/0, /*Scale=*/1, Loc, Loc, 0); } -X86Operand *X86AsmParser::DefaultMemDIOperand(SMLoc Loc) { +std::unique_ptr<X86Operand> X86AsmParser::DefaultMemDIOperand(SMLoc Loc) { unsigned basereg = is64BitMode() ? X86::RDI : (is32BitMode() ? X86::EDI : X86::DI); const MCExpr *Disp = MCConstantExpr::Create(0, getContext()); @@ -924,7 +953,7 @@ X86Operand *X86AsmParser::DefaultMemDIOperand(SMLoc Loc) { /*IndexReg=*/0, /*Scale=*/1, Loc, Loc, 0); } -X86Operand *X86AsmParser::ParseOperand() { +std::unique_ptr<X86Operand> X86AsmParser::ParseOperand() { if (isParsingIntelSyntax()) return ParseIntelOperand(); return ParseATTOperand(); @@ -946,12 +975,10 @@ static unsigned getIntelMemOperandSize(StringRef OpStr) { return Size; } -X86Operand * -X86AsmParser::CreateMemForInlineAsm(unsigned SegReg, const MCExpr *Disp, - unsigned BaseReg, unsigned IndexReg, - unsigned Scale, SMLoc Start, SMLoc End, - unsigned Size, StringRef Identifier, - InlineAsmIdentifierInfo &Info){ +std::unique_ptr<X86Operand> X86AsmParser::CreateMemForInlineAsm( + unsigned SegReg, const MCExpr *Disp, unsigned BaseReg, unsigned IndexReg, + unsigned Scale, SMLoc Start, SMLoc End, unsigned Size, StringRef Identifier, + InlineAsmIdentifierInfo &Info) { // If this is not a VarDecl then assume it is a FuncDecl or some other label // reference. We need an 'r' constraint here, so we need to create register // operand to ensure proper matching. 
Just pick a GPR based on the size of @@ -1064,7 +1091,8 @@ bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) { if (SM.getStopOnLBrac() && getLexer().getKind() == AsmToken::LBrac) break; - switch (getLexer().getKind()) { + AsmToken::TokenKind TK = getLexer().getKind(); + switch (TK) { default: { if (SM.isValidEndState()) { Done = true; @@ -1076,13 +1104,14 @@ bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) { Done = true; break; } + case AsmToken::String: case AsmToken::Identifier: { // This could be a register or a symbolic displacement. unsigned TmpReg; const MCExpr *Val; SMLoc IdentLoc = Tok.getLoc(); StringRef Identifier = Tok.getString(); - if(!ParseRegister(TmpReg, IdentLoc, End)) { + if (TK != AsmToken::String && !ParseRegister(TmpReg, IdentLoc, End)) { SM.onRegister(TmpReg); UpdateLocLex = false; break; @@ -1142,6 +1171,7 @@ bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) { } case AsmToken::Plus: SM.onPlus(); break; case AsmToken::Minus: SM.onMinus(); break; + case AsmToken::Tilde: SM.onNot(); break; case AsmToken::Star: SM.onStar(); break; case AsmToken::Slash: SM.onDivide(); break; case AsmToken::Pipe: SM.onOr(); break; @@ -1164,9 +1194,9 @@ bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) { return false; } -X86Operand *X86AsmParser::ParseIntelBracExpression(unsigned SegReg, SMLoc Start, - int64_t ImmDisp, - unsigned Size) { +std::unique_ptr<X86Operand> +X86AsmParser::ParseIntelBracExpression(unsigned SegReg, SMLoc Start, + int64_t ImmDisp, unsigned Size) { const AsmToken &Tok = Parser.getTok(); SMLoc BracLoc = Tok.getLoc(), End = Tok.getEndLoc(); if (getLexer().isNot(AsmToken::LBrac)) @@ -1270,9 +1300,9 @@ bool X86AsmParser::ParseIntelIdentifier(const MCExpr *&Val, } /// \brief Parse intel style segment override. -X86Operand *X86AsmParser::ParseIntelSegmentOverride(unsigned SegReg, - SMLoc Start, - unsigned Size) { +std::unique_ptr<X86Operand> +X86AsmParser::ParseIntelSegmentOverride(unsigned SegReg, SMLoc Start, + unsigned Size) { assert(SegReg != 0 && "Tried to parse a segment override without a segment!"); const AsmToken &Tok = Parser.getTok(); // Eat colon. if (Tok.isNot(AsmToken::Colon)) @@ -1321,8 +1351,9 @@ X86Operand *X86AsmParser::ParseIntelSegmentOverride(unsigned SegReg, } /// ParseIntelMemOperand - Parse intel style memory operand. -X86Operand *X86AsmParser::ParseIntelMemOperand(int64_t ImmDisp, SMLoc Start, - unsigned Size) { +std::unique_ptr<X86Operand> X86AsmParser::ParseIntelMemOperand(int64_t ImmDisp, + SMLoc Start, + unsigned Size) { const AsmToken &Tok = Parser.getTok(); SMLoc End; @@ -1425,7 +1456,7 @@ bool X86AsmParser::ParseIntelDotOperator(const MCExpr *Disp, /// Parse the 'offset' operator. This operator is used to specify the /// location rather then the content of a variable. -X86Operand *X86AsmParser::ParseIntelOffsetOfOperator() { +std::unique_ptr<X86Operand> X86AsmParser::ParseIntelOffsetOfOperator() { const AsmToken &Tok = Parser.getTok(); SMLoc OffsetOfLoc = Tok.getLoc(); Parser.Lex(); // Eat offset. @@ -1462,7 +1493,7 @@ enum IntelOperatorKind { /// variable. A variable's size is the product of its LENGTH and TYPE. The /// TYPE operator returns the size of a C or C++ type or variable. If the /// variable is an array, TYPE returns the size of a single element. 
-X86Operand *X86AsmParser::ParseIntelOperator(unsigned OpKind) { +std::unique_ptr<X86Operand> X86AsmParser::ParseIntelOperator(unsigned OpKind) { const AsmToken &Tok = Parser.getTok(); SMLoc TypeLoc = Tok.getLoc(); Parser.Lex(); // Eat operator. @@ -1495,7 +1526,7 @@ X86Operand *X86AsmParser::ParseIntelOperator(unsigned OpKind) { return X86Operand::CreateImm(Imm, Start, End); } -X86Operand *X86AsmParser::ParseIntelOperand() { +std::unique_ptr<X86Operand> X86AsmParser::ParseIntelOperand() { const AsmToken &Tok = Parser.getTok(); SMLoc Start, End; @@ -1523,7 +1554,7 @@ X86Operand *X86AsmParser::ParseIntelOperand() { // Immediate. if (getLexer().is(AsmToken::Integer) || getLexer().is(AsmToken::Minus) || - getLexer().is(AsmToken::LParen)) { + getLexer().is(AsmToken::Tilde) || getLexer().is(AsmToken::LParen)) { AsmToken StartTok = Tok; IntelExprStateMachine SM(/*Imm=*/0, /*StopOnLBrac=*/true, /*AddImmPrefix=*/false); @@ -1577,7 +1608,7 @@ X86Operand *X86AsmParser::ParseIntelOperand() { return ParseIntelMemOperand(/*Disp=*/0, Start, Size); } -X86Operand *X86AsmParser::ParseATTOperand() { +std::unique_ptr<X86Operand> X86AsmParser::ParseATTOperand() { switch (getLexer().getKind()) { default: // Parse a memory operand with no segment register. @@ -1613,9 +1644,8 @@ X86Operand *X86AsmParser::ParseATTOperand() { } } -bool -X86AsmParser::HandleAVX512Operand(SmallVectorImpl<MCParsedAsmOperand*> &Operands, - const MCParsedAsmOperand &Op) { +bool X86AsmParser::HandleAVX512Operand(OperandVector &Operands, + const MCParsedAsmOperand &Op) { if(STI.getFeatureBits() & X86::FeatureAVX512) { if (getLexer().is(AsmToken::LCurly)) { // Eat "{" and mark the current place. @@ -1653,8 +1683,8 @@ X86AsmParser::HandleAVX512Operand(SmallVectorImpl<MCParsedAsmOperand*> &Operands } else { // Parse mask register {%k1} Operands.push_back(X86Operand::CreateToken("{", consumedToken)); - if (X86Operand *Op = ParseOperand()) { - Operands.push_back(Op); + if (std::unique_ptr<X86Operand> Op = ParseOperand()) { + Operands.push_back(std::move(Op)); if (!getLexer().is(AsmToken::RCurly)) return !ErrorAndEatStatement(getLexer().getLoc(), "Expected } at this point"); @@ -1682,7 +1712,8 @@ X86AsmParser::HandleAVX512Operand(SmallVectorImpl<MCParsedAsmOperand*> &Operands /// ParseMemOperand: segment: disp(basereg, indexreg, scale). The '%ds:' prefix /// has already been parsed if present. -X86Operand *X86AsmParser::ParseMemOperand(unsigned SegReg, SMLoc MemStart) { +std::unique_ptr<X86Operand> X86AsmParser::ParseMemOperand(unsigned SegReg, + SMLoc MemStart) { // We have to disambiguate a parenthesized expression "(4+5)" from the start // of a memory operand with a missing displacement "(%ebx)" or "(,%eax)". The @@ -1845,9 +1876,8 @@ X86Operand *X86AsmParser::ParseMemOperand(unsigned SegReg, SMLoc MemStart) { MemStart, MemEnd); } -bool X86AsmParser:: -ParseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc, - SmallVectorImpl<MCParsedAsmOperand*> &Operands) { +bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, + SMLoc NameLoc, OperandVector &Operands) { InstInfo = &Info; StringRef PatchedName = Name; @@ -1940,9 +1970,9 @@ ParseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc, // Read the operands. 
while(1) { - if (X86Operand *Op = ParseOperand()) { - Operands.push_back(Op); - if (!HandleAVX512Operand(Operands, *Op)) + if (std::unique_ptr<X86Operand> Op = ParseOperand()) { + Operands.push_back(std::move(Op)); + if (!HandleAVX512Operand(Operands, *Operands.back())) return true; } else { Parser.eatToEndOfStatement(); @@ -1973,27 +2003,25 @@ ParseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc, // documented form in various unofficial manuals, so a lot of code uses it. if ((Name == "outb" || Name == "outw" || Name == "outl" || Name == "out") && Operands.size() == 3) { - X86Operand &Op = *(X86Operand*)Operands.back(); + X86Operand &Op = (X86Operand &)*Operands.back(); if (Op.isMem() && Op.Mem.SegReg == 0 && isa<MCConstantExpr>(Op.Mem.Disp) && cast<MCConstantExpr>(Op.Mem.Disp)->getValue() == 0 && Op.Mem.BaseReg == MatchRegisterName("dx") && Op.Mem.IndexReg == 0) { SMLoc Loc = Op.getEndLoc(); Operands.back() = X86Operand::CreateReg(Op.Mem.BaseReg, Loc, Loc); - delete &Op; } } // Same hack for "in[bwl]? (%dx), %al" -> "inb %dx, %al". if ((Name == "inb" || Name == "inw" || Name == "inl" || Name == "in") && Operands.size() == 3) { - X86Operand &Op = *(X86Operand*)Operands.begin()[1]; + X86Operand &Op = (X86Operand &)*Operands[1]; if (Op.isMem() && Op.Mem.SegReg == 0 && isa<MCConstantExpr>(Op.Mem.Disp) && cast<MCConstantExpr>(Op.Mem.Disp)->getValue() == 0 && Op.Mem.BaseReg == MatchRegisterName("dx") && Op.Mem.IndexReg == 0) { SMLoc Loc = Op.getEndLoc(); - Operands.begin()[1] = X86Operand::CreateReg(Op.Mem.BaseReg, Loc, Loc); - delete &Op; + Operands[1] = X86Operand::CreateReg(Op.Mem.BaseReg, Loc, Loc); } } @@ -2060,8 +2088,8 @@ ParseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc, Operands.push_back(DefaultMemSIOperand(NameLoc)); } } else if (Operands.size() == 3) { - X86Operand &Op = *(X86Operand*)Operands.begin()[1]; - X86Operand &Op2 = *(X86Operand*)Operands.begin()[2]; + X86Operand &Op = (X86Operand &)*Operands[1]; + X86Operand &Op2 = (X86Operand &)*Operands[2]; if (!doSrcDstMatch(Op, Op2)) return Error(Op.getStartLoc(), "mismatching source and destination index registers"); @@ -2076,10 +2104,8 @@ ParseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc, (Name == "smov" || Name == "smovb" || Name == "smovw" || Name == "smovl" || Name == "smovd" || Name == "smovq"))) { if (Operands.size() == 1) { - if (Name == "movsd") { - delete Operands.back(); + if (Name == "movsd") Operands.back() = X86Operand::CreateToken("movsl", NameLoc); - } if (isParsingIntelSyntax()) { Operands.push_back(DefaultMemDIOperand(NameLoc)); Operands.push_back(DefaultMemSIOperand(NameLoc)); @@ -2088,8 +2114,8 @@ ParseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc, Operands.push_back(DefaultMemDIOperand(NameLoc)); } } else if (Operands.size() == 3) { - X86Operand &Op = *(X86Operand*)Operands.begin()[1]; - X86Operand &Op2 = *(X86Operand*)Operands.begin()[2]; + X86Operand &Op = (X86Operand &)*Operands[1]; + X86Operand &Op2 = (X86Operand &)*Operands[2]; if (!doSrcDstMatch(Op, Op2)) return Error(Op.getStartLoc(), "mismatching source and destination index registers"); @@ -2105,31 +2131,26 @@ ParseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc, Operands.size() == 3) { if (isParsingIntelSyntax()) { // Intel syntax - X86Operand *Op1 = static_cast<X86Operand*>(Operands[2]); - if (Op1->isImm() && isa<MCConstantExpr>(Op1->getImm()) && - cast<MCConstantExpr>(Op1->getImm())->getValue() == 1) { - delete Operands[2]; + X86Operand 
&Op1 = static_cast<X86Operand &>(*Operands[2]); + if (Op1.isImm() && isa<MCConstantExpr>(Op1.getImm()) && + cast<MCConstantExpr>(Op1.getImm())->getValue() == 1) Operands.pop_back(); - } } else { - X86Operand *Op1 = static_cast<X86Operand*>(Operands[1]); - if (Op1->isImm() && isa<MCConstantExpr>(Op1->getImm()) && - cast<MCConstantExpr>(Op1->getImm())->getValue() == 1) { - delete Operands[1]; + X86Operand &Op1 = static_cast<X86Operand &>(*Operands[1]); + if (Op1.isImm() && isa<MCConstantExpr>(Op1.getImm()) && + cast<MCConstantExpr>(Op1.getImm())->getValue() == 1) Operands.erase(Operands.begin() + 1); - } } } // Transforms "int $3" into "int3" as a size optimization. We can't write an // instalias with an immediate operand yet. if (Name == "int" && Operands.size() == 2) { - X86Operand *Op1 = static_cast<X86Operand*>(Operands[1]); - if (Op1->isImm() && isa<MCConstantExpr>(Op1->getImm()) && - cast<MCConstantExpr>(Op1->getImm())->getValue() == 3) { - delete Operands[1]; + X86Operand &Op1 = static_cast<X86Operand &>(*Operands[1]); + if (Op1.isImm() && isa<MCConstantExpr>(Op1.getImm()) && + cast<MCConstantExpr>(Op1.getImm())->getValue() == 3) { Operands.erase(Operands.begin() + 1); - static_cast<X86Operand*>(Operands[0])->setTokenValue("int3"); + static_cast<X86Operand &>(*Operands[0]).setTokenValue("int3"); } } @@ -2175,9 +2196,7 @@ static bool convert64i32to64ri8(MCInst &Inst, unsigned Opcode, return convertToSExti8(Inst, Opcode, X86::RAX, isCmp); } -bool X86AsmParser:: -processInstruction(MCInst &Inst, - const SmallVectorImpl<MCParsedAsmOperand*> &Ops) { +bool X86AsmParser::processInstruction(MCInst &Inst, const OperandVector &Ops) { switch (Inst.getOpcode()) { default: return false; case X86::AND16i16: return convert16i16to16ri8(Inst, X86::AND16ri8); @@ -2258,51 +2277,47 @@ processInstruction(MCInst &Inst, static const char *getSubtargetFeatureName(unsigned Val); -void X86AsmParser::EmitInstruction( - MCInst &Inst, SmallVectorImpl<MCParsedAsmOperand *> &Operands, - MCStreamer &Out) { +void X86AsmParser::EmitInstruction(MCInst &Inst, OperandVector &Operands, + MCStreamer &Out) { Instrumentation->InstrumentInstruction(Inst, Operands, getContext(), MII, Out); Out.EmitInstruction(Inst, STI); } -bool X86AsmParser:: -MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, - SmallVectorImpl<MCParsedAsmOperand*> &Operands, - MCStreamer &Out, unsigned &ErrorInfo, - bool MatchingInlineAsm) { +bool X86AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, + OperandVector &Operands, + MCStreamer &Out, unsigned &ErrorInfo, + bool MatchingInlineAsm) { assert(!Operands.empty() && "Unexpect empty operand list!"); - X86Operand *Op = static_cast<X86Operand*>(Operands[0]); - assert(Op->isToken() && "Leading operand should always be a mnemonic!"); + X86Operand &Op = static_cast<X86Operand &>(*Operands[0]); + assert(Op.isToken() && "Leading operand should always be a mnemonic!"); ArrayRef<SMRange> EmptyRanges = None; // First, handle aliases that expand to multiple instructions. // FIXME: This should be replaced with a real .td file alias mechanism. // Also, MatchInstructionImpl should actually *do* the EmitInstruction // call. 
- if (Op->getToken() == "fstsw" || Op->getToken() == "fstcw" || - Op->getToken() == "fstsww" || Op->getToken() == "fstcww" || - Op->getToken() == "finit" || Op->getToken() == "fsave" || - Op->getToken() == "fstenv" || Op->getToken() == "fclex") { + if (Op.getToken() == "fstsw" || Op.getToken() == "fstcw" || + Op.getToken() == "fstsww" || Op.getToken() == "fstcww" || + Op.getToken() == "finit" || Op.getToken() == "fsave" || + Op.getToken() == "fstenv" || Op.getToken() == "fclex") { MCInst Inst; Inst.setOpcode(X86::WAIT); Inst.setLoc(IDLoc); if (!MatchingInlineAsm) EmitInstruction(Inst, Operands, Out); - const char *Repl = - StringSwitch<const char*>(Op->getToken()) - .Case("finit", "fninit") - .Case("fsave", "fnsave") - .Case("fstcw", "fnstcw") - .Case("fstcww", "fnstcw") - .Case("fstenv", "fnstenv") - .Case("fstsw", "fnstsw") - .Case("fstsww", "fnstsw") - .Case("fclex", "fnclex") - .Default(nullptr); + const char *Repl = StringSwitch<const char *>(Op.getToken()) + .Case("finit", "fninit") + .Case("fsave", "fnsave") + .Case("fstcw", "fnstcw") + .Case("fstcww", "fnstcw") + .Case("fstenv", "fnstenv") + .Case("fstsw", "fnstsw") + .Case("fstsww", "fnstsw") + .Case("fclex", "fnclex") + .Default(nullptr); assert(Repl && "Unknown wait-prefixed instruction"); - delete Operands[0]; Operands[0] = X86Operand::CreateToken(Repl, IDLoc); } @@ -2355,11 +2370,11 @@ MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, // following hack. // Change the operand to point to a temporary token. - StringRef Base = Op->getToken(); + StringRef Base = Op.getToken(); SmallString<16> Tmp; Tmp += Base; Tmp += ' '; - Op->setTokenValue(Tmp.str()); + Op.setTokenValue(Tmp.str()); // If this instruction starts with an 'f', then it is a floating point stack // instruction. These come in up to three forms for 32-bit, 64-bit, and @@ -2400,7 +2415,7 @@ MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, ErrorInfoMissingFeature = ErrorInfoIgnore; // Restore the old token. - Op->setTokenValue(Base); + Op.setTokenValue(Base); // If exactly one matched, then we treat that as a successful match (and the // instruction will already have been filled in correctly, since the failing @@ -2450,8 +2465,8 @@ MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, if ((Match1 == Match_MnemonicFail) && (Match2 == Match_MnemonicFail) && (Match3 == Match_MnemonicFail) && (Match4 == Match_MnemonicFail)) { if (!WasOriginallyInvalidOperand) { - ArrayRef<SMRange> Ranges = MatchingInlineAsm ? EmptyRanges : - Op->getLocRange(); + ArrayRef<SMRange> Ranges = + MatchingInlineAsm ? 
EmptyRanges : Op.getLocRange(); return Error(IDLoc, "invalid instruction mnemonic '" + Base + "'", Ranges, MatchingInlineAsm); } @@ -2462,10 +2477,10 @@ MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, return Error(IDLoc, "too few operands for instruction", EmptyRanges, MatchingInlineAsm); - X86Operand *Operand = (X86Operand*)Operands[ErrorInfo]; - if (Operand->getStartLoc().isValid()) { - SMRange OperandRange = Operand->getLocRange(); - return Error(Operand->getStartLoc(), "invalid operand for instruction", + X86Operand &Operand = (X86Operand &)*Operands[ErrorInfo]; + if (Operand.getStartLoc().isValid()) { + SMRange OperandRange = Operand.getLocRange(); + return Error(Operand.getStartLoc(), "invalid operand for instruction", OperandRange, MatchingInlineAsm); } } diff --git a/lib/Target/X86/AsmParser/X86Operand.h b/lib/Target/X86/AsmParser/X86Operand.h index de3be38..1bbfc11 100644 --- a/lib/Target/X86/AsmParser/X86Operand.h +++ b/lib/Target/X86/AsmParser/X86Operand.h @@ -13,6 +13,7 @@ #include "X86AsmParserCommon.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCParser/MCParsedAsmOperand.h" +#include "llvm/ADT/STLExtras.h" namespace llvm { @@ -410,20 +411,19 @@ struct X86Operand : public MCParsedAsmOperand { Inst.addOperand(MCOperand::CreateReg(getMemSegReg())); } - static X86Operand *CreateToken(StringRef Str, SMLoc Loc) { + static std::unique_ptr<X86Operand> CreateToken(StringRef Str, SMLoc Loc) { SMLoc EndLoc = SMLoc::getFromPointer(Loc.getPointer() + Str.size()); - X86Operand *Res = new X86Operand(Token, Loc, EndLoc); + auto Res = llvm::make_unique<X86Operand>(Token, Loc, EndLoc); Res->Tok.Data = Str.data(); Res->Tok.Length = Str.size(); return Res; } - static X86Operand *CreateReg(unsigned RegNo, SMLoc StartLoc, SMLoc EndLoc, - bool AddressOf = false, - SMLoc OffsetOfLoc = SMLoc(), - StringRef SymName = StringRef(), - void *OpDecl = nullptr) { - X86Operand *Res = new X86Operand(Register, StartLoc, EndLoc); + static std::unique_ptr<X86Operand> + CreateReg(unsigned RegNo, SMLoc StartLoc, SMLoc EndLoc, + bool AddressOf = false, SMLoc OffsetOfLoc = SMLoc(), + StringRef SymName = StringRef(), void *OpDecl = nullptr) { + auto Res = llvm::make_unique<X86Operand>(Register, StartLoc, EndLoc); Res->Reg.RegNo = RegNo; Res->AddressOf = AddressOf; Res->OffsetOfLoc = OffsetOfLoc; @@ -432,17 +432,18 @@ struct X86Operand : public MCParsedAsmOperand { return Res; } - static X86Operand *CreateImm(const MCExpr *Val, SMLoc StartLoc, SMLoc EndLoc){ - X86Operand *Res = new X86Operand(Immediate, StartLoc, EndLoc); + static std::unique_ptr<X86Operand> CreateImm(const MCExpr *Val, + SMLoc StartLoc, SMLoc EndLoc) { + auto Res = llvm::make_unique<X86Operand>(Immediate, StartLoc, EndLoc); Res->Imm.Val = Val; return Res; } /// Create an absolute memory operand. - static X86Operand *CreateMem(const MCExpr *Disp, SMLoc StartLoc, SMLoc EndLoc, - unsigned Size = 0, StringRef SymName = StringRef(), - void *OpDecl = nullptr) { - X86Operand *Res = new X86Operand(Memory, StartLoc, EndLoc); + static std::unique_ptr<X86Operand> + CreateMem(const MCExpr *Disp, SMLoc StartLoc, SMLoc EndLoc, unsigned Size = 0, + StringRef SymName = StringRef(), void *OpDecl = nullptr) { + auto Res = llvm::make_unique<X86Operand>(Memory, StartLoc, EndLoc); Res->Mem.SegReg = 0; Res->Mem.Disp = Disp; Res->Mem.BaseReg = 0; @@ -456,12 +457,11 @@ struct X86Operand : public MCParsedAsmOperand { } /// Create a generalized memory operand. 
- static X86Operand *CreateMem(unsigned SegReg, const MCExpr *Disp, - unsigned BaseReg, unsigned IndexReg, - unsigned Scale, SMLoc StartLoc, SMLoc EndLoc, - unsigned Size = 0, - StringRef SymName = StringRef(), - void *OpDecl = nullptr) { + static std::unique_ptr<X86Operand> + CreateMem(unsigned SegReg, const MCExpr *Disp, unsigned BaseReg, + unsigned IndexReg, unsigned Scale, SMLoc StartLoc, SMLoc EndLoc, + unsigned Size = 0, StringRef SymName = StringRef(), + void *OpDecl = nullptr) { // We should never just have a displacement, that should be parsed as an // absolute memory operand. assert((SegReg || BaseReg || IndexReg) && "Invalid memory operand!"); @@ -469,7 +469,7 @@ struct X86Operand : public MCParsedAsmOperand { // The scale should always be one of {1,2,4,8}. assert(((Scale == 1 || Scale == 2 || Scale == 4 || Scale == 8)) && "Invalid scale!"); - X86Operand *Res = new X86Operand(Memory, StartLoc, EndLoc); + auto Res = llvm::make_unique<X86Operand>(Memory, StartLoc, EndLoc); Res->Mem.SegReg = SegReg; Res->Mem.Disp = Disp; Res->Mem.BaseReg = BaseReg; diff --git a/lib/Target/X86/CMakeLists.txt b/lib/Target/X86/CMakeLists.txt index c54fbc1..a09767e 100644 --- a/lib/Target/X86/CMakeLists.txt +++ b/lib/Target/X86/CMakeLists.txt @@ -14,6 +14,7 @@ add_public_tablegen_target(X86CommonTableGen) set(sources X86AsmPrinter.cpp + X86AtomicExpandPass.cpp X86CodeEmitter.cpp X86FastISel.cpp X86FloatingPoint.cpp diff --git a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp index 804606d..55587d4 100644 --- a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp +++ b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp @@ -1620,7 +1620,8 @@ static int readVVVV(struct InternalInstruction* insn) { int vvvv; if (insn->vectorExtensionType == TYPE_EVEX) - vvvv = vvvvFromEVEX3of4(insn->vectorExtensionPrefix[2]); + vvvv = (v2FromEVEX4of4(insn->vectorExtensionPrefix[3]) << 4 | + vvvvFromEVEX3of4(insn->vectorExtensionPrefix[2])); else if (insn->vectorExtensionType == TYPE_VEX_3B) vvvv = vvvvFromVEX3of3(insn->vectorExtensionPrefix[2]); else if (insn->vectorExtensionType == TYPE_VEX_2B) diff --git a/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp index bf30a8e..23bca0d 100644 --- a/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp +++ b/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp @@ -73,11 +73,12 @@ public: }; class X86AsmBackend : public MCAsmBackend { - StringRef CPU; + const StringRef CPU; bool HasNopl; + const uint64_t MaxNopLength; public: X86AsmBackend(const Target &T, StringRef _CPU) - : MCAsmBackend(), CPU(_CPU) { + : MCAsmBackend(), CPU(_CPU), MaxNopLength(_CPU == "slm" ? 7 : 15) { HasNopl = CPU != "generic" && CPU != "i386" && CPU != "i486" && CPU != "i586" && CPU != "pentium" && CPU != "pentium-mmx" && CPU != "i686" && CPU != "k6" && CPU != "k6-2" && CPU != "k6-3" && @@ -331,7 +332,7 @@ bool X86AsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const { // 15 is the longest single nop instruction. Emit as many 15-byte nops as // needed, then emit a nop of the remaining length. do { - const uint8_t ThisNopLength = (uint8_t) std::min(Count, (uint64_t) 15); + const uint8_t ThisNopLength = (uint8_t) std::min(Count, MaxNopLength); const uint8_t Prefixes = ThisNopLength <= 10 ? 
0 : ThisNopLength - 10; for (uint8_t i = 0; i < Prefixes; i++) OW->Write8(0x66); @@ -365,6 +366,17 @@ public: } }; +class ELFX86_X32AsmBackend : public ELFX86AsmBackend { +public: + ELFX86_X32AsmBackend(const Target &T, uint8_t OSABI, StringRef CPU) + : ELFX86AsmBackend(T, OSABI, CPU) {} + + MCObjectWriter *createObjectWriter(raw_ostream &OS) const override { + return createX86ELFObjectWriter(OS, /*IsELF64*/ false, OSABI, + ELF::EM_X86_64); + } +}; + class ELFX86_64AsmBackend : public ELFX86AsmBackend { public: ELFX86_64AsmBackend(const Target &T, uint8_t OSABI, StringRef CPU) @@ -717,11 +729,10 @@ public: }; class DarwinX86_32AsmBackend : public DarwinX86AsmBackend { - bool SupportsCU; public: DarwinX86_32AsmBackend(const Target &T, const MCRegisterInfo &MRI, - StringRef CPU, bool SupportsCU) - : DarwinX86AsmBackend(T, MRI, CPU, false), SupportsCU(SupportsCU) {} + StringRef CPU) + : DarwinX86AsmBackend(T, MRI, CPU, false) {} MCObjectWriter *createObjectWriter(raw_ostream &OS) const override { return createX86MachObjectWriter(OS, /*Is64Bit=*/false, @@ -732,20 +743,16 @@ public: /// \brief Generate the compact unwind encoding for the CFI instructions. uint32_t generateCompactUnwindEncoding( ArrayRef<MCCFIInstruction> Instrs) const override { - return SupportsCU ? generateCompactUnwindEncodingImpl(Instrs) : 0; + return generateCompactUnwindEncodingImpl(Instrs); } }; class DarwinX86_64AsmBackend : public DarwinX86AsmBackend { - bool SupportsCU; const MachO::CPUSubTypeX86 Subtype; public: DarwinX86_64AsmBackend(const Target &T, const MCRegisterInfo &MRI, - StringRef CPU, bool SupportsCU, - MachO::CPUSubTypeX86 st) - : DarwinX86AsmBackend(T, MRI, CPU, true), SupportsCU(SupportsCU), - Subtype(st) { - } + StringRef CPU, MachO::CPUSubTypeX86 st) + : DarwinX86AsmBackend(T, MRI, CPU, true), Subtype(st) {} MCObjectWriter *createObjectWriter(raw_ostream &OS) const override { return createX86MachObjectWriter(OS, /*Is64Bit=*/true, @@ -788,7 +795,7 @@ public: /// \brief Generate the compact unwind encoding for the CFI instructions. uint32_t generateCompactUnwindEncoding( ArrayRef<MCCFIInstruction> Instrs) const override { - return SupportsCU ? 
generateCompactUnwindEncodingImpl(Instrs) : 0; + return generateCompactUnwindEncodingImpl(Instrs); } }; @@ -801,9 +808,7 @@ MCAsmBackend *llvm::createX86_32AsmBackend(const Target &T, Triple TheTriple(TT); if (TheTriple.isOSBinFormatMachO()) - return new DarwinX86_32AsmBackend(T, MRI, CPU, - TheTriple.isMacOSX() && - !TheTriple.isMacOSXVersionLT(10, 7)); + return new DarwinX86_32AsmBackend(T, MRI, CPU); if (TheTriple.isOSWindows() && !TheTriple.isOSBinFormatELF()) return new WindowsX86AsmBackend(T, false, CPU); @@ -823,14 +828,15 @@ MCAsmBackend *llvm::createX86_64AsmBackend(const Target &T, StringSwitch<MachO::CPUSubTypeX86>(TheTriple.getArchName()) .Case("x86_64h", MachO::CPU_SUBTYPE_X86_64_H) .Default(MachO::CPU_SUBTYPE_X86_64_ALL); - return new DarwinX86_64AsmBackend(T, MRI, CPU, - TheTriple.isMacOSX() && - !TheTriple.isMacOSXVersionLT(10, 7), CS); + return new DarwinX86_64AsmBackend(T, MRI, CPU, CS); } if (TheTriple.isOSWindows() && !TheTriple.isOSBinFormatELF()) return new WindowsX86AsmBackend(T, true, CPU); uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(TheTriple.getOS()); + + if (TheTriple.getEnvironment() == Triple::GNUX32) + return new ELFX86_X32AsmBackend(T, OSABI, CPU); return new ELFX86_64AsmBackend(T, OSABI, CPU); } diff --git a/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp b/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp index 39480ea..83b2777 100644 --- a/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp +++ b/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp @@ -74,8 +74,9 @@ X86MCAsmInfoDarwin::X86MCAsmInfoDarwin(const Triple &T) { // FIXME: this should not depend on the target OS version, but on the ld64 // version in use. From at least >= ld64-97.17 (Xcode 3.2.6) the abs-ified - // FDE relocs may be used. - DwarfFDESymbolsUseAbsDiff = T.isMacOSX() && !T.isMacOSXVersionLT(10, 6); + // FDE relocs may be used. We also use them for the ios simulator. 
+ DwarfFDESymbolsUseAbsDiff = (T.isMacOSX() && !T.isMacOSXVersionLT(10, 6)) + || T.isiOS(); UseIntegratedAssembler = true; } @@ -142,8 +143,11 @@ getNonexecutableStackSection(MCContext &Ctx) const { void X86MCAsmInfoMicrosoft::anchor() { } X86MCAsmInfoMicrosoft::X86MCAsmInfoMicrosoft(const Triple &Triple) { - if (Triple.getArch() == Triple::x86_64) + if (Triple.getArch() == Triple::x86_64) { PrivateGlobalPrefix = ".L"; + PointerSize = 8; + ExceptionsType = ExceptionHandling::WinEH; + } AssemblerDialect = AsmWriterFlavor; @@ -157,17 +161,18 @@ X86MCAsmInfoMicrosoft::X86MCAsmInfoMicrosoft(const Triple &Triple) { void X86MCAsmInfoGNUCOFF::anchor() { } X86MCAsmInfoGNUCOFF::X86MCAsmInfoGNUCOFF(const Triple &Triple) { + assert(Triple.isOSWindows() && "Windows is the only supported COFF target"); if (Triple.getArch() == Triple::x86_64) { PrivateGlobalPrefix = ".L"; PointerSize = 8; + ExceptionsType = ExceptionHandling::WinEH; + } else { + ExceptionsType = ExceptionHandling::DwarfCFI; } AssemblerDialect = AsmWriterFlavor; TextAlignFillValue = 0x90; - // Exceptions handling - ExceptionsType = ExceptionHandling::DwarfCFI; - UseIntegratedAssembler = true; } diff --git a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp index e63036c..5e29e5c 100644 --- a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp +++ b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp @@ -197,14 +197,13 @@ void X86_MC::DetectFamilyModel(unsigned EAX, unsigned &Family, } } -unsigned X86_MC::getDwarfRegFlavour(StringRef TT, bool isEH) { - Triple TheTriple(TT); - if (TheTriple.getArch() == Triple::x86_64) +unsigned X86_MC::getDwarfRegFlavour(Triple TT, bool isEH) { + if (TT.getArch() == Triple::x86_64) return DWARFFlavour::X86_64; - if (TheTriple.isOSDarwin()) + if (TT.isOSDarwin()) return isEH ? 
DWARFFlavour::X86_32_DarwinEH : DWARFFlavour::X86_32_Generic; - if (TheTriple.isOSCygMing()) + if (TT.isOSCygMing()) // Unsupported by now, just quick fallback return DWARFFlavour::X86_32_Generic; return DWARFFlavour::X86_32_Generic; @@ -251,8 +250,8 @@ static MCRegisterInfo *createX86MCRegisterInfo(StringRef TT) { MCRegisterInfo *X = new MCRegisterInfo(); InitX86MCRegisterInfo(X, RA, - X86_MC::getDwarfRegFlavour(TT, false), - X86_MC::getDwarfRegFlavour(TT, true), + X86_MC::getDwarfRegFlavour(TheTriple, false), + X86_MC::getDwarfRegFlavour(TheTriple, true), RA); X86_MC::InitLLVM2SEHRegisterMapping(X); return X; diff --git a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h index 8fe40fd..ebe74cf 100644 --- a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h +++ b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h @@ -28,6 +28,7 @@ class MCSubtargetInfo; class MCRelocationInfo; class MCStreamer; class Target; +class Triple; class StringRef; class raw_ostream; @@ -64,7 +65,7 @@ namespace X86_MC { void DetectFamilyModel(unsigned EAX, unsigned &Family, unsigned &Model); - unsigned getDwarfRegFlavour(StringRef TT, bool isEH); + unsigned getDwarfRegFlavour(Triple TT, bool isEH); void InitLLVM2SEHRegisterMapping(MCRegisterInfo *MRI); diff --git a/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp b/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp index c62fd0a..7fa4180 100644 --- a/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp +++ b/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp @@ -19,12 +19,12 @@ public: raw_ostream &OS) : MCWinCOFFStreamer(C, AB, *CE, OS) { } - void EmitWin64EHHandlerData() override; + void EmitWinEHHandlerData() override; void FinishImpl() override; }; -void X86WinCOFFStreamer::EmitWin64EHHandlerData() { - MCStreamer::EmitWin64EHHandlerData(); +void X86WinCOFFStreamer::EmitWinEHHandlerData() { + MCStreamer::EmitWinEHHandlerData(); // We have to emit the unwind info now, because this directive // actually switches to the .xdata section! diff --git a/lib/Target/X86/X86.h b/lib/Target/X86/X86.h index 64e8ea8..d5522ed 100644 --- a/lib/Target/X86/X86.h +++ b/lib/Target/X86/X86.h @@ -24,6 +24,10 @@ class ImmutablePass; class JITCodeEmitter; class X86TargetMachine; +/// createX86AtomicExpandPass - This pass expands atomic operations that cannot +/// be handled natively in terms of a loop using cmpxchg. +FunctionPass *createX86AtomicExpandPass(const X86TargetMachine *TM); + /// createX86ISelDag - This pass converts a legalized DAG into a /// X86-specific DAG, ready for instruction scheduling. /// diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td index 6912b57..93f516a 100644 --- a/lib/Target/X86/X86.td +++ b/lib/Target/X86/X86.td @@ -168,6 +168,8 @@ def FeatureLEAUsesAG : SubtargetFeature<"lea-uses-ag", "LEAUsesAG", "true", "LEA instruction needs inputs at AG stage">; def FeatureSlowLEA : SubtargetFeature<"slow-lea", "SlowLEA", "true", "LEA instruction with certain arguments is slow">; +def FeatureSlowIncDec : SubtargetFeature<"slow-incdec", "SlowIncDec", "true", + "INC and DEC instructions are slower than ADD and SUB">; //===----------------------------------------------------------------------===// // X86 processors supported. 
@@ -228,7 +230,7 @@ def : ProcessorModel<"slm", SLMModel, [ProcIntelSLM, FeaturePCLMUL, FeatureAES, FeatureCallRegIndirect, FeaturePRFCHW, - FeatureSlowLEA, + FeatureSlowLEA, FeatureSlowIncDec, FeatureSlowBTMem, FeatureFastUAMem]>; // "Arrandale" along with corei3 and corei5 def : ProcessorModel<"corei7", SandyBridgeModel, @@ -271,7 +273,8 @@ def : ProcessorModel<"knl", HaswellModel, FeatureCMPXCHG16B, FeatureFastUAMem, FeaturePOPCNT, FeatureAES, FeaturePCLMUL, FeatureRDRAND, FeatureF16C, FeatureFSGSBase, FeatureMOVBE, FeatureLZCNT, FeatureBMI, - FeatureBMI2, FeatureFMA, FeatureRTM, FeatureHLE]>; + FeatureBMI2, FeatureFMA, FeatureRTM, FeatureHLE, + FeatureSlowIncDec]>; def : Proc<"k6", [FeatureMMX]>; def : Proc<"k6-2", [Feature3DNow]>; diff --git a/lib/Target/X86/X86AtomicExpandPass.cpp b/lib/Target/X86/X86AtomicExpandPass.cpp new file mode 100644 index 0000000..61eefbb --- /dev/null +++ b/lib/Target/X86/X86AtomicExpandPass.cpp @@ -0,0 +1,287 @@ +//===-- X86AtomicExpandPass.cpp - Expand illegal atomic instructions --0---===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains a pass (at IR level) to replace atomic instructions which +// cannot be implemented as a single instruction with cmpxchg-based loops. +// +//===----------------------------------------------------------------------===// + +#include "X86.h" +#include "X86TargetMachine.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/Module.h" +#include "llvm/Support/Debug.h" +#include "llvm/Target/TargetLowering.h" +#include "llvm/Target/TargetMachine.h" +using namespace llvm; + +#define DEBUG_TYPE "x86-atomic-expand" + +namespace { + class X86AtomicExpandPass : public FunctionPass { + const X86TargetMachine *TM; + public: + static char ID; // Pass identification, replacement for typeid + explicit X86AtomicExpandPass(const X86TargetMachine *TM) + : FunctionPass(ID), TM(TM) {} + + bool runOnFunction(Function &F) override; + bool expandAtomicInsts(Function &F); + + bool needsCmpXchgNb(Type *MemType); + + /// There are four kinds of atomic operations. Two never need expanding: + /// cmpxchg is what we expand the others *to*, and loads are easily handled + /// by ISelLowering. Atomicrmw and store can need expanding in some + /// circumstances. + bool shouldExpand(Instruction *Inst); + + /// 128-bit atomic stores (64-bit on i686) need to be implemented in terms + /// of trivial cmpxchg16b loops. A simple store isn't necessarily atomic. + bool shouldExpandStore(StoreInst *SI); + + /// Only some atomicrmw instructions need expanding -- some operations + /// (e.g. max) have absolutely no architectural support; some (e.g. or) have + /// limited support but can't return the previous value; some (e.g. add) + /// have complete support in the instruction set. + /// + /// Also, naturally, 128-bit operations always need to be expanded. 
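  /// To make the three outcomes concrete (illustrative IR, not from this
  /// patch; native-width i32 operations on x86-64 assumed):
  ///
  ///   %a = atomicrmw add i32* %p, i32 1 seq_cst  ; never expanded: lock xadd
  ///   %o = atomicrmw or  i32* %p, i32 %v seq_cst ; expanded only if %o is
  ///                                              ; used, else a plain lock or
  ///   %m = atomicrmw max i32* %p, i32 %v seq_cst ; no x86 instruction exists,
  ///                                              ; always a cmpxchg loop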
+ bool shouldExpandAtomicRMW(AtomicRMWInst *AI); + + bool expandAtomicRMW(AtomicRMWInst *AI); + bool expandAtomicStore(StoreInst *SI); + }; +} + +char X86AtomicExpandPass::ID = 0; + +FunctionPass *llvm::createX86AtomicExpandPass(const X86TargetMachine *TM) { + return new X86AtomicExpandPass(TM); +} + +bool X86AtomicExpandPass::runOnFunction(Function &F) { + SmallVector<Instruction *, 1> AtomicInsts; + + // Changing control-flow while iterating through it is a bad idea, so gather a + // list of all atomic instructions before we start. + for (BasicBlock &BB : F) + for (Instruction &Inst : BB) { + if (isa<AtomicRMWInst>(&Inst) || + (isa<StoreInst>(&Inst) && cast<StoreInst>(&Inst)->isAtomic())) + AtomicInsts.push_back(&Inst); + } + + bool MadeChange = false; + for (Instruction *Inst : AtomicInsts) { + if (!shouldExpand(Inst)) + continue; + + if (AtomicRMWInst *AI = dyn_cast<AtomicRMWInst>(Inst)) + MadeChange |= expandAtomicRMW(AI); + if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) + MadeChange |= expandAtomicStore(SI); + + assert(MadeChange && "Atomic inst not expanded when it should be?"); + Inst->eraseFromParent(); + } + + return MadeChange; +} + +/// Returns true if operations on the given type will need to use either +/// cmpxchg8b or cmpxchg16b. This occurs if the type is 1 step up from the +/// native width, and the instructions are available (otherwise we leave them +/// alone to become __sync_fetch_and_... calls). +bool X86AtomicExpandPass::needsCmpXchgNb(llvm::Type *MemType) { + const X86Subtarget &Subtarget = TM->getSubtarget<X86Subtarget>(); + if (!Subtarget.hasCmpxchg16b()) + return false; + + unsigned CmpXchgNbWidth = Subtarget.is64Bit() ? 128 : 64; + + unsigned OpWidth = MemType->getPrimitiveSizeInBits(); + if (OpWidth == CmpXchgNbWidth) + return true; + + return false; +} + + +bool X86AtomicExpandPass::shouldExpandAtomicRMW(AtomicRMWInst *AI) { + const X86Subtarget &Subtarget = TM->getSubtarget<X86Subtarget>(); + unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32; + + if (needsCmpXchgNb(AI->getType())) + return true; + + if (AI->getType()->getPrimitiveSizeInBits() > NativeWidth) + return false; + + AtomicRMWInst::BinOp Op = AI->getOperation(); + switch (Op) { + default: + llvm_unreachable("Unknown atomic operation"); + case AtomicRMWInst::Xchg: + case AtomicRMWInst::Add: + case AtomicRMWInst::Sub: + // It's better to use xadd, xsub or xchg for these in all cases. + return false; + case AtomicRMWInst::Or: + case AtomicRMWInst::And: + case AtomicRMWInst::Xor: + // If the atomicrmw's result isn't actually used, we can just add a "lock" + // prefix to a normal instruction for these operations. + return !AI->use_empty(); + case AtomicRMWInst::Nand: + case AtomicRMWInst::Max: + case AtomicRMWInst::Min: + case AtomicRMWInst::UMax: + case AtomicRMWInst::UMin: + // These always require a non-trivial set of data operations on x86. We must + // use a cmpxchg loop. + return true; + } +} + +bool X86AtomicExpandPass::shouldExpandStore(StoreInst *SI) { + if (needsCmpXchgNb(SI->getValueOperand()->getType())) + return true; + + return false; +} + +bool X86AtomicExpandPass::shouldExpand(Instruction *Inst) { + if (AtomicRMWInst *AI = dyn_cast<AtomicRMWInst>(Inst)) + return shouldExpandAtomicRMW(AI); + if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) + return shouldExpandStore(SI); + return false; +} + +/// Emit IR to implement the given atomicrmw operation on values in registers, +/// returning the new value. 
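// For reference, a hedged summary of needsCmpXchgNb above: it fires for types
// one power of two wider than the native width, e.g.
//   needsCmpXchgNb(i128) on x86-64 -> true (handled via a cmpxchg16b loop)
//   needsCmpXchgNb(i64)  on i686   -> true when the cmpxchg feature bit is
//                                     set (handled via a cmpxchg8b loop)
// Anything wider is left alone and later becomes a __sync_fetch_and_* call.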
+static Value *performAtomicOp(AtomicRMWInst::BinOp Op, IRBuilder<> &Builder, + Value *Loaded, Value *Inc) { + Value *NewVal; + switch (Op) { + case AtomicRMWInst::Xchg: + return Inc; + case AtomicRMWInst::Add: + return Builder.CreateAdd(Loaded, Inc, "new"); + case AtomicRMWInst::Sub: + return Builder.CreateSub(Loaded, Inc, "new"); + case AtomicRMWInst::And: + return Builder.CreateAnd(Loaded, Inc, "new"); + case AtomicRMWInst::Nand: + return Builder.CreateNot(Builder.CreateAnd(Loaded, Inc), "new"); + case AtomicRMWInst::Or: + return Builder.CreateOr(Loaded, Inc, "new"); + case AtomicRMWInst::Xor: + return Builder.CreateXor(Loaded, Inc, "new"); + case AtomicRMWInst::Max: + NewVal = Builder.CreateICmpSGT(Loaded, Inc); + return Builder.CreateSelect(NewVal, Loaded, Inc, "new"); + case AtomicRMWInst::Min: + NewVal = Builder.CreateICmpSLE(Loaded, Inc); + return Builder.CreateSelect(NewVal, Loaded, Inc, "new"); + case AtomicRMWInst::UMax: + NewVal = Builder.CreateICmpUGT(Loaded, Inc); + return Builder.CreateSelect(NewVal, Loaded, Inc, "new"); + case AtomicRMWInst::UMin: + NewVal = Builder.CreateICmpULE(Loaded, Inc); + return Builder.CreateSelect(NewVal, Loaded, Inc, "new"); + default: + break; + } + llvm_unreachable("Unknown atomic op"); +} + +bool X86AtomicExpandPass::expandAtomicRMW(AtomicRMWInst *AI) { + AtomicOrdering Order = + AI->getOrdering() == Unordered ? Monotonic : AI->getOrdering(); + Value *Addr = AI->getPointerOperand(); + BasicBlock *BB = AI->getParent(); + Function *F = BB->getParent(); + LLVMContext &Ctx = F->getContext(); + + // Given: atomicrmw some_op iN* %addr, iN %incr ordering + // + // The standard expansion we produce is: + // [...] + // %init_loaded = load atomic iN* %addr + // br label %loop + // loop: + // %loaded = phi iN [ %init_loaded, %entry ], [ %new_loaded, %loop ] + // %new = some_op iN %loaded, %incr + // %pair = cmpxchg iN* %addr, iN %loaded, iN %new + // %new_loaded = extractvalue { iN, i1 } %pair, 0 + // %success = extractvalue { iN, i1 } %pair, 1 + // br i1 %success, label %atomicrmw.end, label %loop + // atomicrmw.end: + // [...] + BasicBlock *ExitBB = BB->splitBasicBlock(AI, "atomicrmw.end"); + BasicBlock *LoopBB = BasicBlock::Create(Ctx, "atomicrmw.start", F, ExitBB); + + // This grabs the DebugLoc from AI. + IRBuilder<> Builder(AI); + + // The split call above "helpfully" added a branch at the end of BB (to the + // wrong place), but we want a load. It's easiest to just remove + // the branch entirely. + std::prev(BB->end())->eraseFromParent(); + Builder.SetInsertPoint(BB); + LoadInst *InitLoaded = Builder.CreateLoad(Addr); + InitLoaded->setAlignment(AI->getType()->getPrimitiveSizeInBits()); + Builder.CreateBr(LoopBB); + + // Start the main loop block now that we've taken care of the preliminaries. 
+ Builder.SetInsertPoint(LoopBB); + PHINode *Loaded = Builder.CreatePHI(AI->getType(), 2, "loaded"); + Loaded->addIncoming(InitLoaded, BB); + + Value *NewVal = + performAtomicOp(AI->getOperation(), Builder, Loaded, AI->getValOperand()); + + Value *Pair = Builder.CreateAtomicCmpXchg( + Addr, Loaded, NewVal, Order, + AtomicCmpXchgInst::getStrongestFailureOrdering(Order)); + Value *NewLoaded = Builder.CreateExtractValue(Pair, 0, "newloaded"); + Loaded->addIncoming(NewLoaded, LoopBB); + + Value *Success = Builder.CreateExtractValue(Pair, 1, "success"); + Builder.CreateCondBr(Success, ExitBB, LoopBB); + + AI->replaceAllUsesWith(NewLoaded); + + return true; +} + +bool X86AtomicExpandPass::expandAtomicStore(StoreInst *SI) { + // An atomic store might need cmpxchg16b (or 8b on x86) to execute. Express + // this in terms of the usual expansion to "atomicrmw xchg". + IRBuilder<> Builder(SI); + AtomicOrdering Order = + SI->getOrdering() == Unordered ? Monotonic : SI->getOrdering(); + AtomicRMWInst *AI = + Builder.CreateAtomicRMW(AtomicRMWInst::Xchg, SI->getPointerOperand(), + SI->getValueOperand(), Order); + + // Now we have an appropriate swap instruction, lower it as usual. + if (shouldExpandAtomicRMW(AI)) { + expandAtomicRMW(AI); + AI->eraseFromParent(); + return true; + } + + return AI; +} diff --git a/lib/Target/X86/X86CodeEmitter.cpp b/lib/Target/X86/X86CodeEmitter.cpp index 76718d0..a3ae7ee 100644 --- a/lib/Target/X86/X86CodeEmitter.cpp +++ b/lib/Target/X86/X86CodeEmitter.cpp @@ -1113,9 +1113,14 @@ void Emitter<CodeEmitter>::emitInstruction(MachineInstr &MI, case TargetOpcode::INLINEASM: // We allow inline assembler nodes with empty bodies - they can // implicitly define registers, which is ok for JIT. - if (MI.getOperand(0).getSymbolName()[0]) + if (MI.getOperand(0).getSymbolName()[0]) { + DebugLoc DL = MI.getDebugLoc(); + DL.print(MI.getParent()->getParent()->getFunction()->getContext(), + llvm::errs()); report_fatal_error("JIT does not support inline asm!"); + } break; + case TargetOpcode::DBG_VALUE: case TargetOpcode::CFI_INSTRUCTION: break; case TargetOpcode::GC_LABEL: @@ -1126,6 +1131,16 @@ void Emitter<CodeEmitter>::emitInstruction(MachineInstr &MI, case TargetOpcode::IMPLICIT_DEF: case TargetOpcode::KILL: break; + + case X86::SEH_PushReg: + case X86::SEH_SaveReg: + case X86::SEH_SaveXMM: + case X86::SEH_StackAlloc: + case X86::SEH_SetFrame: + case X86::SEH_PushFrame: + case X86::SEH_EndPrologue: + break; + case X86::MOVPC32r: { // This emits the "call" portion of this pseudo instruction. 
MCE.emitByte(BaseOpcode); diff --git a/lib/Target/X86/X86FastISel.cpp b/lib/Target/X86/X86FastISel.cpp index 56bcfa3..ce554ba 100644 --- a/lib/Target/X86/X86FastISel.cpp +++ b/lib/Target/X86/X86FastISel.cpp @@ -16,10 +16,12 @@ #include "X86.h" #include "X86CallingConv.h" #include "X86InstrBuilder.h" +#include "X86InstrInfo.h" #include "X86MachineFunctionInfo.h" #include "X86RegisterInfo.h" #include "X86Subtarget.h" #include "X86TargetMachine.h" +#include "llvm/Analysis/BranchProbabilityInfo.h" #include "llvm/CodeGen/Analysis.h" #include "llvm/CodeGen/FastISel.h" #include "llvm/CodeGen/FunctionLoweringInfo.h" @@ -78,12 +80,14 @@ public: private: bool X86FastEmitCompare(const Value *LHS, const Value *RHS, EVT VT); - bool X86FastEmitLoad(EVT VT, const X86AddressMode &AM, unsigned &RR); + bool X86FastEmitLoad(EVT VT, const X86AddressMode &AM, MachineMemOperand *MMO, + unsigned &ResultReg); bool X86FastEmitStore(EVT VT, const Value *Val, const X86AddressMode &AM, - bool Aligned = false); - bool X86FastEmitStore(EVT VT, unsigned ValReg, const X86AddressMode &AM, - bool Aligned = false); + MachineMemOperand *MMO = nullptr, bool Aligned = false); + bool X86FastEmitStore(EVT VT, unsigned ValReg, bool ValIsKill, + const X86AddressMode &AM, + MachineMemOperand *MMO = nullptr, bool Aligned = false); bool X86FastEmitExtend(ISD::NodeType Opc, EVT DstVT, unsigned Src, EVT SrcVT, unsigned &ResultReg); @@ -107,6 +111,12 @@ private: bool X86SelectDivRem(const Instruction *I); + bool X86FastEmitCMoveSelect(MVT RetVT, const Instruction *I); + + bool X86FastEmitSSESelect(MVT RetVT, const Instruction *I); + + bool X86FastEmitPseudoSelect(MVT RetVT, const Instruction *I); + bool X86SelectSelect(const Instruction *I); bool X86SelectTrunc(const Instruction *I); @@ -147,10 +157,182 @@ private: bool TryEmitSmallMemcpy(X86AddressMode DestAM, X86AddressMode SrcAM, uint64_t Len); + + bool foldX86XALUIntrinsic(X86::CondCode &CC, const Instruction *I, + const Value *Cond); }; } // end anonymous namespace. +static CmpInst::Predicate optimizeCmpPredicate(const CmpInst *CI) { + // If both operands are the same, then try to optimize or fold the cmp. 
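  // A couple of worked cases for the folds below (illustrative; identical
  // operands, IEEE semantics):
  //   fcmp oeq %x, %x  -> true unless %x is NaN    -> FCMP_ORD
  //   fcmp ult %x, %x  -> true only when %x is NaN -> FCMP_UNO
  //   icmp eq  %x, %x  -> always true              -> FCMP_TRUE (used as a
  //                                                   sentinel for "fold away")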
+ CmpInst::Predicate Predicate = CI->getPredicate(); + if (CI->getOperand(0) != CI->getOperand(1)) + return Predicate; + + switch (Predicate) { + default: llvm_unreachable("Invalid predicate!"); + case CmpInst::FCMP_FALSE: Predicate = CmpInst::FCMP_FALSE; break; + case CmpInst::FCMP_OEQ: Predicate = CmpInst::FCMP_ORD; break; + case CmpInst::FCMP_OGT: Predicate = CmpInst::FCMP_FALSE; break; + case CmpInst::FCMP_OGE: Predicate = CmpInst::FCMP_ORD; break; + case CmpInst::FCMP_OLT: Predicate = CmpInst::FCMP_FALSE; break; + case CmpInst::FCMP_OLE: Predicate = CmpInst::FCMP_ORD; break; + case CmpInst::FCMP_ONE: Predicate = CmpInst::FCMP_FALSE; break; + case CmpInst::FCMP_ORD: Predicate = CmpInst::FCMP_ORD; break; + case CmpInst::FCMP_UNO: Predicate = CmpInst::FCMP_UNO; break; + case CmpInst::FCMP_UEQ: Predicate = CmpInst::FCMP_TRUE; break; + case CmpInst::FCMP_UGT: Predicate = CmpInst::FCMP_UNO; break; + case CmpInst::FCMP_UGE: Predicate = CmpInst::FCMP_TRUE; break; + case CmpInst::FCMP_ULT: Predicate = CmpInst::FCMP_UNO; break; + case CmpInst::FCMP_ULE: Predicate = CmpInst::FCMP_TRUE; break; + case CmpInst::FCMP_UNE: Predicate = CmpInst::FCMP_UNO; break; + case CmpInst::FCMP_TRUE: Predicate = CmpInst::FCMP_TRUE; break; + + case CmpInst::ICMP_EQ: Predicate = CmpInst::FCMP_TRUE; break; + case CmpInst::ICMP_NE: Predicate = CmpInst::FCMP_FALSE; break; + case CmpInst::ICMP_UGT: Predicate = CmpInst::FCMP_FALSE; break; + case CmpInst::ICMP_UGE: Predicate = CmpInst::FCMP_TRUE; break; + case CmpInst::ICMP_ULT: Predicate = CmpInst::FCMP_FALSE; break; + case CmpInst::ICMP_ULE: Predicate = CmpInst::FCMP_TRUE; break; + case CmpInst::ICMP_SGT: Predicate = CmpInst::FCMP_FALSE; break; + case CmpInst::ICMP_SGE: Predicate = CmpInst::FCMP_TRUE; break; + case CmpInst::ICMP_SLT: Predicate = CmpInst::FCMP_FALSE; break; + case CmpInst::ICMP_SLE: Predicate = CmpInst::FCMP_TRUE; break; + } + + return Predicate; +} + +static std::pair<X86::CondCode, bool> +getX86ConditionCode(CmpInst::Predicate Predicate) { + X86::CondCode CC = X86::COND_INVALID; + bool NeedSwap = false; + switch (Predicate) { + default: break; + // Floating-point Predicates + case CmpInst::FCMP_UEQ: CC = X86::COND_E; break; + case CmpInst::FCMP_OLT: NeedSwap = true; // fall-through + case CmpInst::FCMP_OGT: CC = X86::COND_A; break; + case CmpInst::FCMP_OLE: NeedSwap = true; // fall-through + case CmpInst::FCMP_OGE: CC = X86::COND_AE; break; + case CmpInst::FCMP_UGT: NeedSwap = true; // fall-through + case CmpInst::FCMP_ULT: CC = X86::COND_B; break; + case CmpInst::FCMP_UGE: NeedSwap = true; // fall-through + case CmpInst::FCMP_ULE: CC = X86::COND_BE; break; + case CmpInst::FCMP_ONE: CC = X86::COND_NE; break; + case CmpInst::FCMP_UNO: CC = X86::COND_P; break; + case CmpInst::FCMP_ORD: CC = X86::COND_NP; break; + case CmpInst::FCMP_OEQ: // fall-through + case CmpInst::FCMP_UNE: CC = X86::COND_INVALID; break; + + // Integer Predicates + case CmpInst::ICMP_EQ: CC = X86::COND_E; break; + case CmpInst::ICMP_NE: CC = X86::COND_NE; break; + case CmpInst::ICMP_UGT: CC = X86::COND_A; break; + case CmpInst::ICMP_UGE: CC = X86::COND_AE; break; + case CmpInst::ICMP_ULT: CC = X86::COND_B; break; + case CmpInst::ICMP_ULE: CC = X86::COND_BE; break; + case CmpInst::ICMP_SGT: CC = X86::COND_G; break; + case CmpInst::ICMP_SGE: CC = X86::COND_GE; break; + case CmpInst::ICMP_SLT: CC = X86::COND_L; break; + case CmpInst::ICMP_SLE: CC = X86::COND_LE; break; + } + + return std::make_pair(CC, NeedSwap); +} + +static std::pair<unsigned, bool> 
+getX86SSEConditionCode(CmpInst::Predicate Predicate) { + unsigned CC; + bool NeedSwap = false; + + // SSE Condition code mapping: + // 0 - EQ + // 1 - LT + // 2 - LE + // 3 - UNORD + // 4 - NEQ + // 5 - NLT + // 6 - NLE + // 7 - ORD + switch (Predicate) { + default: llvm_unreachable("Unexpected predicate"); + case CmpInst::FCMP_OEQ: CC = 0; break; + case CmpInst::FCMP_OGT: NeedSwap = true; // fall-through + case CmpInst::FCMP_OLT: CC = 1; break; + case CmpInst::FCMP_OGE: NeedSwap = true; // fall-through + case CmpInst::FCMP_OLE: CC = 2; break; + case CmpInst::FCMP_UNO: CC = 3; break; + case CmpInst::FCMP_UNE: CC = 4; break; + case CmpInst::FCMP_ULE: NeedSwap = true; // fall-through + case CmpInst::FCMP_UGE: CC = 5; break; + case CmpInst::FCMP_ULT: NeedSwap = true; // fall-through + case CmpInst::FCMP_UGT: CC = 6; break; + case CmpInst::FCMP_ORD: CC = 7; break; + case CmpInst::FCMP_UEQ: + case CmpInst::FCMP_ONE: CC = 8; break; + } + + return std::make_pair(CC, NeedSwap); +} + +/// \brief Check if it is possible to fold the condition from the XALU intrinsic +/// into the user. The condition code will only be updated on success. +bool X86FastISel::foldX86XALUIntrinsic(X86::CondCode &CC, const Instruction *I, + const Value *Cond) { + if (!isa<ExtractValueInst>(Cond)) + return false; + + const auto *EV = cast<ExtractValueInst>(Cond); + if (!isa<IntrinsicInst>(EV->getAggregateOperand())) + return false; + + const auto *II = cast<IntrinsicInst>(EV->getAggregateOperand()); + MVT RetVT; + const Function *Callee = II->getCalledFunction(); + Type *RetTy = + cast<StructType>(Callee->getReturnType())->getTypeAtIndex(0U); + if (!isTypeLegal(RetTy, RetVT)) + return false; + + if (RetVT != MVT::i32 && RetVT != MVT::i64) + return false; + + X86::CondCode TmpCC; + switch (II->getIntrinsicID()) { + default: return false; + case Intrinsic::sadd_with_overflow: + case Intrinsic::ssub_with_overflow: + case Intrinsic::smul_with_overflow: + case Intrinsic::umul_with_overflow: TmpCC = X86::COND_O; break; + case Intrinsic::uadd_with_overflow: + case Intrinsic::usub_with_overflow: TmpCC = X86::COND_B; break; + } + + // Check if both instructions are in the same basic block. + if (II->getParent() != I->getParent()) + return false; + + // Make sure nothing is in the way + BasicBlock::const_iterator Start = I; + BasicBlock::const_iterator End = II; + for (auto Itr = std::prev(Start); Itr != End; --Itr) { + // We only expect extractvalue instructions between the intrinsic and the + // instruction to be selected. + if (!isa<ExtractValueInst>(Itr)) + return false; + + // Check that the extractvalue operand comes from the intrinsic. + const auto *EVI = cast<ExtractValueInst>(Itr); + if (EVI->getAggregateOperand() != II) + return false; + } + + CC = TmpCC; + return true; +} + bool X86FastISel::isTypeLegal(Type *Ty, MVT &VT, bool AllowI1) { EVT evt = TLI.getValueType(Ty, /*HandleUnknown=*/true); if (evt == MVT::Other || !evt.isSimple()) @@ -180,7 +362,7 @@ bool X86FastISel::isTypeLegal(Type *Ty, MVT &VT, bool AllowI1) { /// The address is either pre-computed, i.e. Ptr, or a GlobalAddress, i.e. GV. /// Return true and the result register by reference if it is possible. bool X86FastISel::X86FastEmitLoad(EVT VT, const X86AddressMode &AM, - unsigned &ResultReg) { + MachineMemOperand *MMO, unsigned &ResultReg) { // Get opcode and regclass of the output for the given load instruction. 
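  // Worked example for foldX86XALUIntrinsic above (illustrative IR, not from
  // this patch):
  //   %t = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %a, i32 %b)
  //   %v = extractvalue { i32, i1 } %t, 0
  //   %c = extractvalue { i32, i1 } %t, 1
  // A branch or select on %c can then be selected directly on the OF flag of
  // the add (roughly "addl %b_reg, %a_reg; jo ..."), with no seto/test pair.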
unsigned Opc = 0; const TargetRegisterClass *RC = nullptr; @@ -228,8 +410,11 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, const X86AddressMode &AM, } ResultReg = createResultReg(RC); - addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, - DbgLoc, TII.get(Opc), ResultReg), AM); + MachineInstrBuilder MIB = + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg); + addFullAddress(MIB, AM); + if (MMO) + MIB->addMemOperand(*FuncInfo.MF, MMO); return true; } @@ -237,9 +422,9 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, const X86AddressMode &AM, /// type VT. The address is either pre-computed, consisted of a base ptr, Ptr /// and a displacement offset, or a GlobalAddress, /// i.e. V. Return true if it is possible. -bool -X86FastISel::X86FastEmitStore(EVT VT, unsigned ValReg, - const X86AddressMode &AM, bool Aligned) { +bool X86FastISel::X86FastEmitStore(EVT VT, unsigned ValReg, bool ValIsKill, + const X86AddressMode &AM, + MachineMemOperand *MMO, bool Aligned) { // Get opcode and regclass of the output for the given store instruction. unsigned Opc = 0; switch (VT.getSimpleVT().SimpleTy) { @@ -249,7 +434,8 @@ X86FastISel::X86FastEmitStore(EVT VT, unsigned ValReg, // Mask out all but lowest bit. unsigned AndResult = createResultReg(&X86::GR8RegClass); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, - TII.get(X86::AND8ri), AndResult).addReg(ValReg).addImm(1); + TII.get(X86::AND8ri), AndResult) + .addReg(ValReg, getKillRegState(ValIsKill)).addImm(1); ValReg = AndResult; } // FALLTHROUGH, handling i1 as i8. @@ -288,13 +474,18 @@ X86FastISel::X86FastEmitStore(EVT VT, unsigned ValReg, break; } - addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, - DbgLoc, TII.get(Opc)), AM).addReg(ValReg); + MachineInstrBuilder MIB = + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc)); + addFullAddress(MIB, AM).addReg(ValReg, getKillRegState(ValIsKill)); + if (MMO) + MIB->addMemOperand(*FuncInfo.MF, MMO); + return true; } bool X86FastISel::X86FastEmitStore(EVT VT, const Value *Val, - const X86AddressMode &AM, bool Aligned) { + const X86AddressMode &AM, + MachineMemOperand *MMO, bool Aligned) { // Handle 'null' like i32/i64 0. if (isa<ConstantPointerNull>(Val)) Val = Constant::getNullValue(DL.getIntPtrType(Val->getContext())); @@ -317,10 +508,12 @@ bool X86FastISel::X86FastEmitStore(EVT VT, const Value *Val, } if (Opc) { - addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, - DbgLoc, TII.get(Opc)), AM) - .addImm(Signed ? (uint64_t) CI->getSExtValue() : - CI->getZExtValue()); + MachineInstrBuilder MIB = + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc)); + addFullAddress(MIB, AM).addImm(Signed ? (uint64_t) CI->getSExtValue() + : CI->getZExtValue()); + if (MMO) + MIB->addMemOperand(*FuncInfo.MF, MMO); return true; } } @@ -329,7 +522,8 @@ bool X86FastISel::X86FastEmitStore(EVT VT, const Value *Val, if (ValReg == 0) return false; - return X86FastEmitStore(VT, ValReg, AM, Aligned); + bool ValKill = hasTrivialKill(Val); + return X86FastEmitStore(VT, ValReg, ValKill, AM, MMO, Aligned); } /// X86FastEmitExtend - Emit a machine instruction to extend a value Src of @@ -355,17 +549,8 @@ bool X86FastISel::handleConstantAddresses(const Value *V, X86AddressMode &AM) { return false; // Can't handle TLS yet. - if (const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV)) - if (GVar->isThreadLocal()) - return false; - - // Can't handle TLS yet, part 2 (this is slightly crazy, but this is how - // it works...). 
- if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV)) - if (const GlobalVariable *GVar = - dyn_cast_or_null<GlobalVariable>(GA->getAliasee())) - if (GVar->isThreadLocal()) - return false; + if (GV->isThreadLocal()) + return false; // RIP-relative addresses can't have additional register operands, so if // we've already folded stuff into the addressing mode, just force the @@ -696,7 +881,7 @@ bool X86FastISel::X86SelectCallAddress(const Value *V, X86AddressMode &AM) { (AM.Base.Reg != 0 || AM.IndexReg != 0)) return false; - // Can't handle DbgLocLImport. + // Can't handle DLL Import. if (GV->hasDLLImportStorageClass()) return false; @@ -749,19 +934,24 @@ bool X86FastISel::X86SelectStore(const Instruction *I) { if (S->isAtomic()) return false; - unsigned SABIAlignment = - DL.getABITypeAlignment(S->getValueOperand()->getType()); - bool Aligned = S->getAlignment() == 0 || S->getAlignment() >= SABIAlignment; + const Value *Val = S->getValueOperand(); + const Value *Ptr = S->getPointerOperand(); MVT VT; - if (!isTypeLegal(I->getOperand(0)->getType(), VT, /*AllowI1=*/true)) + if (!isTypeLegal(Val->getType(), VT, /*AllowI1=*/true)) return false; + unsigned Alignment = S->getAlignment(); + unsigned ABIAlignment = DL.getABITypeAlignment(Val->getType()); + if (Alignment == 0) // Ensure that codegen never sees alignment 0 + Alignment = ABIAlignment; + bool Aligned = Alignment >= ABIAlignment; + X86AddressMode AM; - if (!X86SelectAddress(I->getOperand(1), AM)) + if (!X86SelectAddress(Ptr, AM)) return false; - return X86FastEmitStore(VT, I->getOperand(0), AM, Aligned); + return X86FastEmitStore(VT, Val, AM, createMachineMemOperandFor(I), Aligned); } /// X86SelectRet - Select and emit code to implement ret instructions. @@ -896,25 +1086,29 @@ bool X86FastISel::X86SelectRet(const Instruction *I) { /// X86SelectLoad - Select and emit code to implement load instructions. /// -bool X86FastISel::X86SelectLoad(const Instruction *I) { +bool X86FastISel::X86SelectLoad(const Instruction *I) { + const LoadInst *LI = cast<LoadInst>(I); + // Atomic loads need special handling. - if (cast<LoadInst>(I)->isAtomic()) + if (LI->isAtomic()) return false; MVT VT; - if (!isTypeLegal(I->getType(), VT, /*AllowI1=*/true)) + if (!isTypeLegal(LI->getType(), VT, /*AllowI1=*/true)) return false; + const Value *Ptr = LI->getPointerOperand(); + X86AddressMode AM; - if (!X86SelectAddress(I->getOperand(0), AM)) + if (!X86SelectAddress(Ptr, AM)) return false; unsigned ResultReg = 0; - if (X86FastEmitLoad(VT, AM, ResultReg)) { - UpdateValueMap(I, ResultReg); - return true; - } - return false; + if (!X86FastEmitLoad(VT, AM, createMachineMemOperandFor(LI), ResultReg)) + return false; + + UpdateValueMap(I, ResultReg); + return true; } static unsigned X86ChooseCmpOpcode(EVT VT, const X86Subtarget *Subtarget) { @@ -994,73 +1188,89 @@ bool X86FastISel::X86SelectCmp(const Instruction *I) { if (!isTypeLegal(I->getOperand(0)->getType(), VT)) return false; - unsigned ResultReg = createResultReg(&X86::GR8RegClass); - unsigned SetCCOpc; - bool SwapArgs; // false -> compare Op0, Op1. true -> compare Op1, Op0. - switch (CI->getPredicate()) { - case CmpInst::FCMP_OEQ: { - if (!X86FastEmitCompare(CI->getOperand(0), CI->getOperand(1), VT)) + // Try to optimize or fold the cmp. 
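  // e.g. (illustrative) "fcmp ueq float %x, %x" arrives here as the FCMP_TRUE
  // sentinel and is materialized below as a bare "movb $1, %reg", with no
  // ucomiss emitted at all.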
+ CmpInst::Predicate Predicate = optimizeCmpPredicate(CI); + unsigned ResultReg = 0; + switch (Predicate) { + default: break; + case CmpInst::FCMP_FALSE: { + ResultReg = createResultReg(&X86::GR32RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOV32r0), + ResultReg); + ResultReg = FastEmitInst_extractsubreg(MVT::i8, ResultReg, /*Kill=*/true, + X86::sub_8bit); + if (!ResultReg) return false; + break; + } + case CmpInst::FCMP_TRUE: { + ResultReg = createResultReg(&X86::GR8RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOV8ri), + ResultReg).addImm(1); + break; + } + } - unsigned EReg = createResultReg(&X86::GR8RegClass); - unsigned NPReg = createResultReg(&X86::GR8RegClass); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::SETEr), EReg); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, - TII.get(X86::SETNPr), NPReg); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, - TII.get(X86::AND8rr), ResultReg).addReg(NPReg).addReg(EReg); + if (ResultReg) { UpdateValueMap(I, ResultReg); return true; } - case CmpInst::FCMP_UNE: { - if (!X86FastEmitCompare(CI->getOperand(0), CI->getOperand(1), VT)) + + const Value *LHS = CI->getOperand(0); + const Value *RHS = CI->getOperand(1); + + // The optimizer might have replaced fcmp oeq %x, %x with fcmp ord %x, 0.0. + // We don't have to materialize a zero constant for this case and can just use + // %x again on the RHS. + if (Predicate == CmpInst::FCMP_ORD || Predicate == CmpInst::FCMP_UNO) { + const auto *RHSC = dyn_cast<ConstantFP>(RHS); + if (RHSC && RHSC->isNullValue()) + RHS = LHS; + } + + // FCMP_OEQ and FCMP_UNE cannot be checked with a single instruction. + static unsigned SETFOpcTable[2][3] = { + { X86::SETEr, X86::SETNPr, X86::AND8rr }, + { X86::SETNEr, X86::SETPr, X86::OR8rr } + }; + unsigned *SETFOpc = nullptr; + switch (Predicate) { + default: break; + case CmpInst::FCMP_OEQ: SETFOpc = &SETFOpcTable[0][0]; break; + case CmpInst::FCMP_UNE: SETFOpc = &SETFOpcTable[1][0]; break; + } + + ResultReg = createResultReg(&X86::GR8RegClass); + if (SETFOpc) { + if (!X86FastEmitCompare(LHS, RHS, VT)) return false; - unsigned NEReg = createResultReg(&X86::GR8RegClass); - unsigned PReg = createResultReg(&X86::GR8RegClass); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::SETNEr), NEReg); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::SETPr), PReg); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::OR8rr),ResultReg) - .addReg(PReg).addReg(NEReg); + unsigned FlagReg1 = createResultReg(&X86::GR8RegClass); + unsigned FlagReg2 = createResultReg(&X86::GR8RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SETFOpc[0]), + FlagReg1); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SETFOpc[1]), + FlagReg2); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SETFOpc[2]), + ResultReg).addReg(FlagReg1).addReg(FlagReg2); UpdateValueMap(I, ResultReg); return true; } - case CmpInst::FCMP_OGT: SwapArgs = false; SetCCOpc = X86::SETAr; break; - case CmpInst::FCMP_OGE: SwapArgs = false; SetCCOpc = X86::SETAEr; break; - case CmpInst::FCMP_OLT: SwapArgs = true; SetCCOpc = X86::SETAr; break; - case CmpInst::FCMP_OLE: SwapArgs = true; SetCCOpc = X86::SETAEr; break; - case CmpInst::FCMP_ONE: SwapArgs = false; SetCCOpc = X86::SETNEr; break; - case CmpInst::FCMP_ORD: SwapArgs = false; SetCCOpc = X86::SETNPr; break; - case CmpInst::FCMP_UNO: SwapArgs = false; SetCCOpc = X86::SETPr; break; - case CmpInst::FCMP_UEQ: SwapArgs = 
false; SetCCOpc = X86::SETEr; break; - case CmpInst::FCMP_UGT: SwapArgs = true; SetCCOpc = X86::SETBr; break; - case CmpInst::FCMP_UGE: SwapArgs = true; SetCCOpc = X86::SETBEr; break; - case CmpInst::FCMP_ULT: SwapArgs = false; SetCCOpc = X86::SETBr; break; - case CmpInst::FCMP_ULE: SwapArgs = false; SetCCOpc = X86::SETBEr; break; - - case CmpInst::ICMP_EQ: SwapArgs = false; SetCCOpc = X86::SETEr; break; - case CmpInst::ICMP_NE: SwapArgs = false; SetCCOpc = X86::SETNEr; break; - case CmpInst::ICMP_UGT: SwapArgs = false; SetCCOpc = X86::SETAr; break; - case CmpInst::ICMP_UGE: SwapArgs = false; SetCCOpc = X86::SETAEr; break; - case CmpInst::ICMP_ULT: SwapArgs = false; SetCCOpc = X86::SETBr; break; - case CmpInst::ICMP_ULE: SwapArgs = false; SetCCOpc = X86::SETBEr; break; - case CmpInst::ICMP_SGT: SwapArgs = false; SetCCOpc = X86::SETGr; break; - case CmpInst::ICMP_SGE: SwapArgs = false; SetCCOpc = X86::SETGEr; break; - case CmpInst::ICMP_SLT: SwapArgs = false; SetCCOpc = X86::SETLr; break; - case CmpInst::ICMP_SLE: SwapArgs = false; SetCCOpc = X86::SETLEr; break; - default: - return false; - } - const Value *Op0 = CI->getOperand(0), *Op1 = CI->getOperand(1); + X86::CondCode CC; + bool SwapArgs; + std::tie(CC, SwapArgs) = getX86ConditionCode(Predicate); + assert(CC <= X86::LAST_VALID_COND && "Unexpected condition code."); + unsigned Opc = X86::getSETFromCond(CC); + if (SwapArgs) - std::swap(Op0, Op1); + std::swap(LHS, RHS); - // Emit a compare of Op0/Op1. - if (!X86FastEmitCompare(Op0, Op1, VT)) + // Emit a compare of LHS/RHS. + if (!X86FastEmitCompare(LHS, RHS, VT)) return false; - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SetCCOpc), ResultReg); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg); UpdateValueMap(I, ResultReg); return true; } @@ -1126,73 +1336,88 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) { // Fold the common case of a conditional branch with a comparison // in the same block (values defined on other blocks may not have // initialized registers). + X86::CondCode CC; if (const CmpInst *CI = dyn_cast<CmpInst>(BI->getCondition())) { if (CI->hasOneUse() && CI->getParent() == I->getParent()) { EVT VT = TLI.getValueType(CI->getOperand(0)->getType()); + // Try to optimize or fold the cmp. + CmpInst::Predicate Predicate = optimizeCmpPredicate(CI); + switch (Predicate) { + default: break; + case CmpInst::FCMP_FALSE: FastEmitBranch(FalseMBB, DbgLoc); return true; + case CmpInst::FCMP_TRUE: FastEmitBranch(TrueMBB, DbgLoc); return true; + } + + const Value *CmpLHS = CI->getOperand(0); + const Value *CmpRHS = CI->getOperand(1); + + // The optimizer might have replaced fcmp oeq %x, %x with fcmp ord %x, + // 0.0. + // We don't have to materialize a zero constant for this case and can just + // use %x again on the RHS. + if (Predicate == CmpInst::FCMP_ORD || Predicate == CmpInst::FCMP_UNO) { + const auto *CmpRHSC = dyn_cast<ConstantFP>(CmpRHS); + if (CmpRHSC && CmpRHSC->isNullValue()) + CmpRHS = CmpLHS; + } + // Try to take advantage of fallthrough opportunities. - CmpInst::Predicate Predicate = CI->getPredicate(); if (FuncInfo.MBB->isLayoutSuccessor(TrueMBB)) { std::swap(TrueMBB, FalseMBB); Predicate = CmpInst::getInversePredicate(Predicate); } - bool SwapArgs; // false -> compare Op0, Op1. true -> compare Op1, Op0. - unsigned BranchOpc; // Opcode to jump on, e.g. "X86::JA" - + // FCMP_OEQ and FCMP_UNE cannot be expressed with a single flag/condition + // code check. 
Instead two branch instructions are required to check all + // the flags. First we change the predicate to a supported condition code, + // which will be the first branch. Later one we will emit the second + // branch. + bool NeedExtraBranch = false; switch (Predicate) { + default: break; case CmpInst::FCMP_OEQ: - std::swap(TrueMBB, FalseMBB); - Predicate = CmpInst::FCMP_UNE; - // FALL THROUGH - case CmpInst::FCMP_UNE: SwapArgs = false; BranchOpc = X86::JNE_4; break; - case CmpInst::FCMP_OGT: SwapArgs = false; BranchOpc = X86::JA_4; break; - case CmpInst::FCMP_OGE: SwapArgs = false; BranchOpc = X86::JAE_4; break; - case CmpInst::FCMP_OLT: SwapArgs = true; BranchOpc = X86::JA_4; break; - case CmpInst::FCMP_OLE: SwapArgs = true; BranchOpc = X86::JAE_4; break; - case CmpInst::FCMP_ONE: SwapArgs = false; BranchOpc = X86::JNE_4; break; - case CmpInst::FCMP_ORD: SwapArgs = false; BranchOpc = X86::JNP_4; break; - case CmpInst::FCMP_UNO: SwapArgs = false; BranchOpc = X86::JP_4; break; - case CmpInst::FCMP_UEQ: SwapArgs = false; BranchOpc = X86::JE_4; break; - case CmpInst::FCMP_UGT: SwapArgs = true; BranchOpc = X86::JB_4; break; - case CmpInst::FCMP_UGE: SwapArgs = true; BranchOpc = X86::JBE_4; break; - case CmpInst::FCMP_ULT: SwapArgs = false; BranchOpc = X86::JB_4; break; - case CmpInst::FCMP_ULE: SwapArgs = false; BranchOpc = X86::JBE_4; break; - - case CmpInst::ICMP_EQ: SwapArgs = false; BranchOpc = X86::JE_4; break; - case CmpInst::ICMP_NE: SwapArgs = false; BranchOpc = X86::JNE_4; break; - case CmpInst::ICMP_UGT: SwapArgs = false; BranchOpc = X86::JA_4; break; - case CmpInst::ICMP_UGE: SwapArgs = false; BranchOpc = X86::JAE_4; break; - case CmpInst::ICMP_ULT: SwapArgs = false; BranchOpc = X86::JB_4; break; - case CmpInst::ICMP_ULE: SwapArgs = false; BranchOpc = X86::JBE_4; break; - case CmpInst::ICMP_SGT: SwapArgs = false; BranchOpc = X86::JG_4; break; - case CmpInst::ICMP_SGE: SwapArgs = false; BranchOpc = X86::JGE_4; break; - case CmpInst::ICMP_SLT: SwapArgs = false; BranchOpc = X86::JL_4; break; - case CmpInst::ICMP_SLE: SwapArgs = false; BranchOpc = X86::JLE_4; break; - default: - return false; + std::swap(TrueMBB, FalseMBB); // fall-through + case CmpInst::FCMP_UNE: + NeedExtraBranch = true; + Predicate = CmpInst::FCMP_ONE; + break; } - const Value *Op0 = CI->getOperand(0), *Op1 = CI->getOperand(1); + bool SwapArgs; + unsigned BranchOpc; + std::tie(CC, SwapArgs) = getX86ConditionCode(Predicate); + assert(CC <= X86::LAST_VALID_COND && "Unexpected condition code."); + + BranchOpc = X86::GetCondBranchFromCond(CC); if (SwapArgs) - std::swap(Op0, Op1); + std::swap(CmpLHS, CmpRHS); // Emit a compare of the LHS and RHS, setting the flags. - if (!X86FastEmitCompare(Op0, Op1, VT)) + if (!X86FastEmitCompare(CmpLHS, CmpRHS, VT)) return false; BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(BranchOpc)) .addMBB(TrueMBB); - if (Predicate == CmpInst::FCMP_UNE) { - // X86 requires a second branch to handle UNE (and OEQ, - // which is mapped to UNE above). + // X86 requires a second branch to handle UNE (and OEQ, which is mapped + // to UNE above). + if (NeedExtraBranch) { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::JP_4)) .addMBB(TrueMBB); } + // Obtain the branch weight and add the TrueBB to the successor list. 
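      // Illustrative net effect of the lowering above for
      // "br (fcmp une float %a, %b)", hedged:
      //    ucomiss  <operands>
      //    jne  TrueMBB   ; the ONE half of UNE
      //    jp   TrueMBB   ; extra branch: unordered also counts as not-equal
      //    jmp  FalseMBB
      // FCMP_OEQ takes the same path with TrueMBB/FalseMBB swapped first.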
+ uint32_t BranchWeight = 0; + if (FuncInfo.BPI) + BranchWeight = FuncInfo.BPI->getEdgeWeight(BI->getParent(), + TrueMBB->getBasicBlock()); + FuncInfo.MBB->addSuccessor(TrueMBB, BranchWeight); + + // Emits an unconditional branch to the FalseBB, obtains the branch + // weight, and adds it to the successor list. FastEmitBranch(FalseMBB, DbgLoc); - FuncInfo.MBB->addSuccessor(TrueMBB); + return true; } } else if (TruncInst *TI = dyn_cast<TruncInst>(BI->getCondition())) { @@ -1224,10 +1449,32 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(JmpOpc)) .addMBB(TrueMBB); FastEmitBranch(FalseMBB, DbgLoc); - FuncInfo.MBB->addSuccessor(TrueMBB); + uint32_t BranchWeight = 0; + if (FuncInfo.BPI) + BranchWeight = FuncInfo.BPI->getEdgeWeight(BI->getParent(), + TrueMBB->getBasicBlock()); + FuncInfo.MBB->addSuccessor(TrueMBB, BranchWeight); return true; } } + } else if (foldX86XALUIntrinsic(CC, BI, BI->getCondition())) { + // Fake request the condition, otherwise the intrinsic might be completely + // optimized away. + unsigned TmpReg = getRegForValue(BI->getCondition()); + if (TmpReg == 0) + return false; + + unsigned BranchOpc = X86::GetCondBranchFromCond(CC); + + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(BranchOpc)) + .addMBB(TrueMBB); + FastEmitBranch(FalseMBB, DbgLoc); + uint32_t BranchWeight = 0; + if (FuncInfo.BPI) + BranchWeight = FuncInfo.BPI->getEdgeWeight(BI->getParent(), + TrueMBB->getBasicBlock()); + FuncInfo.MBB->addSuccessor(TrueMBB, BranchWeight); + return true; } // Otherwise do a clumsy setcc and re-test it. @@ -1241,7 +1488,11 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::JNE_4)) .addMBB(TrueMBB); FastEmitBranch(FalseMBB, DbgLoc); - FuncInfo.MBB->addSuccessor(TrueMBB); + uint32_t BranchWeight = 0; + if (FuncInfo.BPI) + BranchWeight = FuncInfo.BPI->getEdgeWeight(BI->getParent(), + TrueMBB->getBasicBlock()); + FuncInfo.MBB->addSuccessor(TrueMBB, BranchWeight); return true; } @@ -1478,50 +1729,319 @@ bool X86FastISel::X86SelectDivRem(const Instruction *I) { return true; } -bool X86FastISel::X86SelectSelect(const Instruction *I) { - MVT VT; - if (!isTypeLegal(I->getType(), VT)) +/// \brief Emit a conditional move instruction (if the are supported) to lower +/// the select. +bool X86FastISel::X86FastEmitCMoveSelect(MVT RetVT, const Instruction *I) { + // Check if the subtarget supports these instructions. + if (!Subtarget->hasCMov()) return false; - // We only use cmov here, if we don't have a cmov instruction bail. - if (!Subtarget->hasCMov()) return false; + // FIXME: Add support for i8. + if (RetVT < MVT::i16 || RetVT > MVT::i64) + return false; - unsigned Opc = 0; - const TargetRegisterClass *RC = nullptr; - if (VT == MVT::i16) { - Opc = X86::CMOVE16rr; - RC = &X86::GR16RegClass; - } else if (VT == MVT::i32) { - Opc = X86::CMOVE32rr; - RC = &X86::GR32RegClass; - } else if (VT == MVT::i64) { - Opc = X86::CMOVE64rr; - RC = &X86::GR64RegClass; - } else { + const Value *Cond = I->getOperand(0); + const TargetRegisterClass *RC = TLI.getRegClassFor(RetVT); + bool NeedTest = true; + X86::CondCode CC = X86::COND_NE; + + // Optimize conditions coming from a compare if both instructions are in the + // same basic block (values defined in other basic blocks may not have + // initialized registers). 
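  // For orientation, the eventual output of this function for
  // "%r = select i1 %c, i64 %a, i64 %b" with a non-foldable %c is roughly
  // (illustrative only):
  //   testb   $1, %c_reg        ; refresh EFLAGS from the i1
  //   cmovneq %a_reg, %res_reg  ; %res_reg initialized from %b_reg
  // i.e. one TEST plus one CMOV.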
+ const auto *CI = dyn_cast<CmpInst>(Cond); + if (CI && (CI->getParent() == I->getParent())) { + CmpInst::Predicate Predicate = optimizeCmpPredicate(CI); + + // FCMP_OEQ and FCMP_UNE cannot be checked with a single instruction. + static unsigned SETFOpcTable[2][3] = { + { X86::SETNPr, X86::SETEr , X86::TEST8rr }, + { X86::SETPr, X86::SETNEr, X86::OR8rr } + }; + unsigned *SETFOpc = nullptr; + switch (Predicate) { + default: break; + case CmpInst::FCMP_OEQ: + SETFOpc = &SETFOpcTable[0][0]; + Predicate = CmpInst::ICMP_NE; + break; + case CmpInst::FCMP_UNE: + SETFOpc = &SETFOpcTable[1][0]; + Predicate = CmpInst::ICMP_NE; + break; + } + + bool NeedSwap; + std::tie(CC, NeedSwap) = getX86ConditionCode(Predicate); + assert(CC <= X86::LAST_VALID_COND && "Unexpected condition code."); + + const Value *CmpLHS = CI->getOperand(0); + const Value *CmpRHS = CI->getOperand(1); + if (NeedSwap) + std::swap(CmpLHS, CmpRHS); + + EVT CmpVT = TLI.getValueType(CmpLHS->getType()); + // Emit a compare of the LHS and RHS, setting the flags. + if (!X86FastEmitCompare(CmpLHS, CmpRHS, CmpVT)) + return false; + + if (SETFOpc) { + unsigned FlagReg1 = createResultReg(&X86::GR8RegClass); + unsigned FlagReg2 = createResultReg(&X86::GR8RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SETFOpc[0]), + FlagReg1); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SETFOpc[1]), + FlagReg2); + auto const &II = TII.get(SETFOpc[2]); + if (II.getNumDefs()) { + unsigned TmpReg = createResultReg(&X86::GR8RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, TmpReg) + .addReg(FlagReg2).addReg(FlagReg1); + } else { + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II) + .addReg(FlagReg2).addReg(FlagReg1); + } + } + NeedTest = false; + } else if (foldX86XALUIntrinsic(CC, I, Cond)) { + // Fake request the condition, otherwise the intrinsic might be completely + // optimized away. + unsigned TmpReg = getRegForValue(Cond); + if (TmpReg == 0) + return false; + + NeedTest = false; + } + + if (NeedTest) { + // Selects operate on i1, however, CondReg is 8 bits width and may contain + // garbage. Indeed, only the less significant bit is supposed to be + // accurate. If we read more than the lsb, we may see non-zero values + // whereas lsb is zero. Therefore, we have to truncate Op0Reg to i1 for + // the select. This is achieved by performing TEST against 1. + unsigned CondReg = getRegForValue(Cond); + if (CondReg == 0) + return false; + bool CondIsKill = hasTrivialKill(Cond); + + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::TEST8ri)) + .addReg(CondReg, getKillRegState(CondIsKill)).addImm(1); + } + + const Value *LHS = I->getOperand(1); + const Value *RHS = I->getOperand(2); + + unsigned RHSReg = getRegForValue(RHS); + bool RHSIsKill = hasTrivialKill(RHS); + + unsigned LHSReg = getRegForValue(LHS); + bool LHSIsKill = hasTrivialKill(LHS); + + if (!LHSReg || !RHSReg) + return false; + + unsigned Opc = X86::getCMovFromCond(CC, RC->getSize()); + unsigned ResultReg = FastEmitInst_rr(Opc, RC, RHSReg, RHSIsKill, + LHSReg, LHSIsKill); + UpdateValueMap(I, ResultReg); + return true; +} + +/// \brief Emit SSE instructions to lower the select. +/// +/// Try to use SSE1/SSE2 instructions to simulate a select without branches. +/// This lowers fp selects into a CMP/AND/ANDN/OR sequence when the necessary +/// SSE instructions are available. 
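/// Illustrative shape of the emitted sequence for
/// "select (fcmp olt float %a, %b), float %x, float %y" (SSE1, hedged):
///   cmpltss  ...        ; mask = (a < b) ? 0xffffffff : 0
///   andps    mask, x    ; mask ? x : 0
///   andnps   y, mask    ; mask ? 0 : y
///   orps     ...        ; combine: mask ? x : y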
+bool X86FastISel::X86FastEmitSSESelect(MVT RetVT, const Instruction *I) { + // Optimize conditions coming from a compare if both instructions are in the + // same basic block (values defined in other basic blocks may not have + // initialized registers). + const auto *CI = dyn_cast<FCmpInst>(I->getOperand(0)); + if (!CI || (CI->getParent() != I->getParent())) return false; + + if (I->getType() != CI->getOperand(0)->getType() || + !((Subtarget->hasSSE1() && RetVT == MVT::f32) || + (Subtarget->hasSSE2() && RetVT == MVT::f64) )) + return false; + + const Value *CmpLHS = CI->getOperand(0); + const Value *CmpRHS = CI->getOperand(1); + CmpInst::Predicate Predicate = optimizeCmpPredicate(CI); + + // The optimizer might have replaced fcmp oeq %x, %x with fcmp ord %x, 0.0. + // We don't have to materialize a zero constant for this case and can just use + // %x again on the RHS. + if (Predicate == CmpInst::FCMP_ORD || Predicate == CmpInst::FCMP_UNO) { + const auto *CmpRHSC = dyn_cast<ConstantFP>(CmpRHS); + if (CmpRHSC && CmpRHSC->isNullValue()) + CmpRHS = CmpLHS; } - unsigned Op0Reg = getRegForValue(I->getOperand(0)); - if (Op0Reg == 0) return false; - unsigned Op1Reg = getRegForValue(I->getOperand(1)); - if (Op1Reg == 0) return false; - unsigned Op2Reg = getRegForValue(I->getOperand(2)); - if (Op2Reg == 0) return false; - - // Selects operate on i1, however, Op0Reg is 8 bits width and may contain - // garbage. Indeed, only the less significant bit is supposed to be accurate. - // If we read more than the lsb, we may see non-zero values whereas lsb - // is zero. Therefore, we have to truncate Op0Reg to i1 for the select. - // This is achieved by performing TEST against 1. - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::TEST8ri)) - .addReg(Op0Reg).addImm(1); - unsigned ResultReg = createResultReg(RC); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg) - .addReg(Op1Reg).addReg(Op2Reg); + unsigned CC; + bool NeedSwap; + std::tie(CC, NeedSwap) = getX86SSEConditionCode(Predicate); + if (CC > 7) + return false; + + if (NeedSwap) + std::swap(CmpLHS, CmpRHS); + + static unsigned OpcTable[2][2][4] = { + { { X86::CMPSSrr, X86::FsANDPSrr, X86::FsANDNPSrr, X86::FsORPSrr }, + { X86::VCMPSSrr, X86::VFsANDPSrr, X86::VFsANDNPSrr, X86::VFsORPSrr } }, + { { X86::CMPSDrr, X86::FsANDPDrr, X86::FsANDNPDrr, X86::FsORPDrr }, + { X86::VCMPSDrr, X86::VFsANDPDrr, X86::VFsANDNPDrr, X86::VFsORPDrr } } + }; + + bool HasAVX = Subtarget->hasAVX(); + unsigned *Opc = nullptr; + switch (RetVT.SimpleTy) { + default: return false; + case MVT::f32: Opc = &OpcTable[0][HasAVX][0]; break; + case MVT::f64: Opc = &OpcTable[1][HasAVX][0]; break; + } + + const Value *LHS = I->getOperand(1); + const Value *RHS = I->getOperand(2); + + unsigned LHSReg = getRegForValue(LHS); + bool LHSIsKill = hasTrivialKill(LHS); + + unsigned RHSReg = getRegForValue(RHS); + bool RHSIsKill = hasTrivialKill(RHS); + + unsigned CmpLHSReg = getRegForValue(CmpLHS); + bool CmpLHSIsKill = hasTrivialKill(CmpLHS); + + unsigned CmpRHSReg = getRegForValue(CmpRHS); + bool CmpRHSIsKill = hasTrivialKill(CmpRHS); + + if (!LHSReg || !RHSReg || !CmpLHS || !CmpRHS) + return false; + + const TargetRegisterClass *RC = TLI.getRegClassFor(RetVT); + unsigned CmpReg = FastEmitInst_rri(Opc[0], RC, CmpLHSReg, CmpLHSIsKill, + CmpRHSReg, CmpRHSIsKill, CC); + unsigned AndReg = FastEmitInst_rr(Opc[1], RC, CmpReg, /*IsKill=*/false, + LHSReg, LHSIsKill); + unsigned AndNReg = FastEmitInst_rr(Opc[2], RC, CmpReg, /*IsKill=*/true, + RHSReg, RHSIsKill); + 
unsigned ResultReg = FastEmitInst_rr(Opc[3], RC, AndNReg, /*IsKill=*/true, + AndReg, /*IsKill=*/true); + UpdateValueMap(I, ResultReg); + return true; +} + +bool X86FastISel::X86FastEmitPseudoSelect(MVT RetVT, const Instruction *I) { + // These are pseudo CMOV instructions and will be later expanded into control- + // flow. + unsigned Opc; + switch (RetVT.SimpleTy) { + default: return false; + case MVT::i8: Opc = X86::CMOV_GR8; break; + case MVT::i16: Opc = X86::CMOV_GR16; break; + case MVT::i32: Opc = X86::CMOV_GR32; break; + case MVT::f32: Opc = X86::CMOV_FR32; break; + case MVT::f64: Opc = X86::CMOV_FR64; break; + } + + const Value *Cond = I->getOperand(0); + X86::CondCode CC = X86::COND_NE; + + // Optimize conditions coming from a compare if both instructions are in the + // same basic block (values defined in other basic blocks may not have + // initialized registers). + const auto *CI = dyn_cast<CmpInst>(Cond); + if (CI && (CI->getParent() == I->getParent())) { + bool NeedSwap; + std::tie(CC, NeedSwap) = getX86ConditionCode(CI->getPredicate()); + if (CC > X86::LAST_VALID_COND) + return false; + + const Value *CmpLHS = CI->getOperand(0); + const Value *CmpRHS = CI->getOperand(1); + + if (NeedSwap) + std::swap(CmpLHS, CmpRHS); + + EVT CmpVT = TLI.getValueType(CmpLHS->getType()); + if (!X86FastEmitCompare(CmpLHS, CmpRHS, CmpVT)) + return false; + } else { + unsigned CondReg = getRegForValue(Cond); + if (CondReg == 0) + return false; + bool CondIsKill = hasTrivialKill(Cond); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::TEST8ri)) + .addReg(CondReg, getKillRegState(CondIsKill)).addImm(1); + } + + const Value *LHS = I->getOperand(1); + const Value *RHS = I->getOperand(2); + + unsigned LHSReg = getRegForValue(LHS); + bool LHSIsKill = hasTrivialKill(LHS); + + unsigned RHSReg = getRegForValue(RHS); + bool RHSIsKill = hasTrivialKill(RHS); + + if (!LHSReg || !RHSReg) + return false; + + const TargetRegisterClass *RC = TLI.getRegClassFor(RetVT); + + unsigned ResultReg = + FastEmitInst_rri(Opc, RC, RHSReg, RHSIsKill, LHSReg, LHSIsKill, CC); UpdateValueMap(I, ResultReg); return true; } +bool X86FastISel::X86SelectSelect(const Instruction *I) { + MVT RetVT; + if (!isTypeLegal(I->getType(), RetVT)) + return false; + + // Check if we can fold the select. + if (const auto *CI = dyn_cast<CmpInst>(I->getOperand(0))) { + CmpInst::Predicate Predicate = optimizeCmpPredicate(CI); + const Value *Opnd = nullptr; + switch (Predicate) { + default: break; + case CmpInst::FCMP_FALSE: Opnd = I->getOperand(2); break; + case CmpInst::FCMP_TRUE: Opnd = I->getOperand(1); break; + } + // No need for a select anymore - this is an unconditional move. + if (Opnd) { + unsigned OpReg = getRegForValue(Opnd); + if (OpReg == 0) + return false; + bool OpIsKill = hasTrivialKill(Opnd); + const TargetRegisterClass *RC = TLI.getRegClassFor(RetVT); + unsigned ResultReg = createResultReg(RC); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::COPY), ResultReg) + .addReg(OpReg, getKillRegState(OpIsKill)); + UpdateValueMap(I, ResultReg); + return true; + } + } + + // First try to use real conditional move instructions. + if (X86FastEmitCMoveSelect(RetVT, I)) + return true; + + // Try to use a sequence of SSE instructions to simulate a conditional move. + if (X86FastEmitSSESelect(RetVT, I)) + return true; + + // Fall-back to pseudo conditional move instructions, which will be later + // converted to control-flow. 
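  // (Illustrative: a pseudo such as CMOV_FR32 keeps the condition code as an
  // operand and is expanded after FastISel into an explicit branch diamond,
  // conceptually "jcc .Ltrue; move RHS; jmp .Ldone; .Ltrue: move LHS", which
  // is why it is only the fallback of last resort below.)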
+ if (X86FastEmitPseudoSelect(RetVT, I)) + return true; + + return false; +} + bool X86FastISel::X86SelectFPExt(const Instruction *I) { // fpext from float to double. if (X86ScalarSSEf64 && @@ -1633,8 +2153,8 @@ bool X86FastISel::TryEmitSmallMemcpy(X86AddressMode DestAM, } unsigned Reg; - bool RV = X86FastEmitLoad(VT, SrcAM, Reg); - RV &= X86FastEmitStore(VT, Reg, DestAM); + bool RV = X86FastEmitLoad(VT, SrcAM, nullptr, Reg); + RV &= X86FastEmitStore(VT, Reg, /*Kill=*/true, DestAM); assert(RV && "Failed to emit load or store??"); unsigned Size = VT.getSizeInBits()/8; @@ -1646,10 +2166,74 @@ bool X86FastISel::TryEmitSmallMemcpy(X86AddressMode DestAM, return true; } +static bool isCommutativeIntrinsic(IntrinsicInst const &I) { + switch (I.getIntrinsicID()) { + case Intrinsic::sadd_with_overflow: + case Intrinsic::uadd_with_overflow: + case Intrinsic::smul_with_overflow: + case Intrinsic::umul_with_overflow: + return true; + default: + return false; + } +} + bool X86FastISel::X86VisitIntrinsicCall(const IntrinsicInst &I) { // FIXME: Handle more intrinsics. switch (I.getIntrinsicID()) { default: return false; + case Intrinsic::frameaddress: { + Type *RetTy = I.getCalledFunction()->getReturnType(); + + MVT VT; + if (!isTypeLegal(RetTy, VT)) + return false; + + unsigned Opc; + const TargetRegisterClass *RC = nullptr; + + switch (VT.SimpleTy) { + default: llvm_unreachable("Invalid result type for frameaddress."); + case MVT::i32: Opc = X86::MOV32rm; RC = &X86::GR32RegClass; break; + case MVT::i64: Opc = X86::MOV64rm; RC = &X86::GR64RegClass; break; + } + + // This needs to be set before we call getFrameRegister, otherwise we get + // the wrong frame register. + MachineFrameInfo *MFI = FuncInfo.MF->getFrameInfo(); + MFI->setFrameAddressIsTaken(true); + + const X86RegisterInfo *RegInfo = + static_cast<const X86RegisterInfo*>(TM.getRegisterInfo()); + unsigned FrameReg = RegInfo->getFrameRegister(*(FuncInfo.MF)); + assert(((FrameReg == X86::RBP && VT == MVT::i64) || + (FrameReg == X86::EBP && VT == MVT::i32)) && + "Invalid Frame Register!"); + + // Always make a copy of the frame register to to a vreg first, so that we + // never directly reference the frame register (the TwoAddressInstruction- + // Pass doesn't like that). + unsigned SrcReg = createResultReg(RC); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::COPY), SrcReg).addReg(FrameReg); + + // Now recursively load from the frame address. + // movq (%rbp), %rax + // movq (%rax), %rax + // movq (%rax), %rax + // ... + unsigned DestReg; + unsigned Depth = cast<ConstantInt>(I.getOperand(0))->getZExtValue(); + while (Depth--) { + DestReg = createResultReg(RC); + addDirectMem(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(Opc), DestReg), SrcReg); + SrcReg = DestReg; + } + + UpdateValueMap(&I, SrcReg); + return true; + } case Intrinsic::memcpy: { const MemCpyInst &MCI = cast<MemCpyInst>(I); // Don't handle volatile or variable length memcpys. @@ -1726,52 +2310,233 @@ bool X86FastISel::X86VisitIntrinsicCall(const IntrinsicInst &I) { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::TRAP)); return true; } - case Intrinsic::sadd_with_overflow: - case Intrinsic::uadd_with_overflow: { - // FIXME: Should fold immediates. + case Intrinsic::sqrt: { + if (!Subtarget->hasSSE1()) + return false; - // Replace "add with overflow" intrinsics with an "add" instruction followed - // by a seto/setc instruction. 
- const Function *Callee = I.getCalledFunction(); - Type *RetTy = - cast<StructType>(Callee->getReturnType())->getTypeAtIndex(unsigned(0)); + Type *RetTy = I.getCalledFunction()->getReturnType(); MVT VT; if (!isTypeLegal(RetTy, VT)) return false; - const Value *Op1 = I.getArgOperand(0); - const Value *Op2 = I.getArgOperand(1); - unsigned Reg1 = getRegForValue(Op1); - unsigned Reg2 = getRegForValue(Op2); + // Unfortunately we can't use FastEmit_r, because the AVX version of FSQRT + // is not generated by FastISel yet. + // FIXME: Update this code once tablegen can handle it. + static const unsigned SqrtOpc[2][2] = { + {X86::SQRTSSr, X86::VSQRTSSr}, + {X86::SQRTSDr, X86::VSQRTSDr} + }; + bool HasAVX = Subtarget->hasAVX(); + unsigned Opc; + const TargetRegisterClass *RC; + switch (VT.SimpleTy) { + default: return false; + case MVT::f32: Opc = SqrtOpc[0][HasAVX]; RC = &X86::FR32RegClass; break; + case MVT::f64: Opc = SqrtOpc[1][HasAVX]; RC = &X86::FR64RegClass; break; + } + + const Value *SrcVal = I.getArgOperand(0); + unsigned SrcReg = getRegForValue(SrcVal); - if (Reg1 == 0 || Reg2 == 0) - // FIXME: Handle values *not* in registers. + if (SrcReg == 0) return false; - unsigned OpC = 0; - if (VT == MVT::i32) - OpC = X86::ADD32rr; - else if (VT == MVT::i64) - OpC = X86::ADD64rr; - else + unsigned ImplicitDefReg = 0; + if (HasAVX) { + ImplicitDefReg = createResultReg(RC); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::IMPLICIT_DEF), ImplicitDefReg); + } + + unsigned ResultReg = createResultReg(RC); + MachineInstrBuilder MIB; + MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), + ResultReg); + + if (ImplicitDefReg) + MIB.addReg(ImplicitDefReg); + + MIB.addReg(SrcReg); + + UpdateValueMap(&I, ResultReg); + return true; + } + case Intrinsic::sadd_with_overflow: + case Intrinsic::uadd_with_overflow: + case Intrinsic::ssub_with_overflow: + case Intrinsic::usub_with_overflow: + case Intrinsic::smul_with_overflow: + case Intrinsic::umul_with_overflow: { + // This implements the basic lowering of the xalu with overflow intrinsics + // into add/sub/mul followed by either seto or setb. + const Function *Callee = I.getCalledFunction(); + auto *Ty = cast<StructType>(Callee->getReturnType()); + Type *RetTy = Ty->getTypeAtIndex(0U); + Type *CondTy = Ty->getTypeAtIndex(1); + + MVT VT; + if (!isTypeLegal(RetTy, VT)) + return false; + + if (VT < MVT::i8 || VT > MVT::i64) + return false; + + const Value *LHS = I.getArgOperand(0); + const Value *RHS = I.getArgOperand(1); + + // Canonicalize immediate to the RHS. 
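    // e.g. (illustrative) @llvm.sadd.with.overflow.i32(i32 1, i32 %x) is
    // treated as (%x, 1) so the immediate path below can emit "addl $1, %reg";
    // ssub/usub are not commutative and keep their operand order.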
+ if (isa<ConstantInt>(LHS) && !isa<ConstantInt>(RHS) && + isCommutativeIntrinsic(I)) + std::swap(LHS, RHS); + + unsigned BaseOpc, CondOpc; + switch (I.getIntrinsicID()) { + default: llvm_unreachable("Unexpected intrinsic!"); + case Intrinsic::sadd_with_overflow: + BaseOpc = ISD::ADD; CondOpc = X86::SETOr; break; + case Intrinsic::uadd_with_overflow: + BaseOpc = ISD::ADD; CondOpc = X86::SETBr; break; + case Intrinsic::ssub_with_overflow: + BaseOpc = ISD::SUB; CondOpc = X86::SETOr; break; + case Intrinsic::usub_with_overflow: + BaseOpc = ISD::SUB; CondOpc = X86::SETBr; break; + case Intrinsic::smul_with_overflow: + BaseOpc = X86ISD::SMUL; CondOpc = X86::SETOr; break; + case Intrinsic::umul_with_overflow: + BaseOpc = X86ISD::UMUL; CondOpc = X86::SETOr; break; + } + + unsigned LHSReg = getRegForValue(LHS); + if (LHSReg == 0) return false; + bool LHSIsKill = hasTrivialKill(LHS); - // The call to CreateRegs builds two sequential registers, to store the - // both the returned values. - unsigned ResultReg = FuncInfo.CreateRegs(I.getType()); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(OpC), ResultReg) - .addReg(Reg1).addReg(Reg2); + unsigned ResultReg = 0; + // Check if we have an immediate version. + if (auto const *C = dyn_cast<ConstantInt>(RHS)) { + ResultReg = FastEmit_ri(VT, VT, BaseOpc, LHSReg, LHSIsKill, + C->getZExtValue()); + } - unsigned Opc = X86::SETBr; - if (I.getIntrinsicID() == Intrinsic::sadd_with_overflow) - Opc = X86::SETOr; - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), - ResultReg + 1); + unsigned RHSReg; + bool RHSIsKill; + if (!ResultReg) { + RHSReg = getRegForValue(RHS); + if (RHSReg == 0) + return false; + RHSIsKill = hasTrivialKill(RHS); + ResultReg = FastEmit_rr(VT, VT, BaseOpc, LHSReg, LHSIsKill, RHSReg, + RHSIsKill); + } + + // FastISel doesn't have a pattern for all X86::MUL*r and X86::IMUL*r. Emit + // it manually. + if (BaseOpc == X86ISD::UMUL && !ResultReg) { + static const unsigned MULOpc[] = + { X86::MUL8r, X86::MUL16r, X86::MUL32r, X86::MUL64r }; + static const unsigned Reg[] = { X86::AL, X86::AX, X86::EAX, X86::RAX }; + // First copy the first operand into RAX, which is an implicit input to + // the X86::MUL*r instruction. + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::COPY), Reg[VT.SimpleTy-MVT::i8]) + .addReg(LHSReg, getKillRegState(LHSIsKill)); + ResultReg = FastEmitInst_r(MULOpc[VT.SimpleTy-MVT::i8], + TLI.getRegClassFor(VT), RHSReg, RHSIsKill); + } else if (BaseOpc == X86ISD::SMUL && !ResultReg) { + static const unsigned MULOpc[] = + { X86::IMUL8r, X86::IMUL16rr, X86::IMUL32rr, X86::IMUL64rr }; + if (VT == MVT::i8) { + // Copy the first operand into AL, which is an implicit input to the + // X86::IMUL8r instruction. 
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::COPY), X86::AL) + .addReg(LHSReg, getKillRegState(LHSIsKill)); + ResultReg = FastEmitInst_r(MULOpc[0], TLI.getRegClassFor(VT), RHSReg, + RHSIsKill); + } else + ResultReg = FastEmitInst_rr(MULOpc[VT.SimpleTy-MVT::i8], + TLI.getRegClassFor(VT), LHSReg, LHSIsKill, + RHSReg, RHSIsKill); + } + + if (!ResultReg) + return false; + + unsigned ResultReg2 = FuncInfo.CreateRegs(CondTy); + assert((ResultReg+1) == ResultReg2 && "Nonconsecutive result registers."); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CondOpc), + ResultReg2); UpdateValueMap(&I, ResultReg, 2); return true; } + case Intrinsic::x86_sse_cvttss2si: + case Intrinsic::x86_sse_cvttss2si64: + case Intrinsic::x86_sse2_cvttsd2si: + case Intrinsic::x86_sse2_cvttsd2si64: { + bool IsInputDouble; + switch (I.getIntrinsicID()) { + default: llvm_unreachable("Unexpected intrinsic."); + case Intrinsic::x86_sse_cvttss2si: + case Intrinsic::x86_sse_cvttss2si64: + if (!Subtarget->hasSSE1()) + return false; + IsInputDouble = false; + break; + case Intrinsic::x86_sse2_cvttsd2si: + case Intrinsic::x86_sse2_cvttsd2si64: + if (!Subtarget->hasSSE2()) + return false; + IsInputDouble = true; + break; + } + + Type *RetTy = I.getCalledFunction()->getReturnType(); + MVT VT; + if (!isTypeLegal(RetTy, VT)) + return false; + + static const unsigned CvtOpc[2][2][2] = { + { { X86::CVTTSS2SIrr, X86::VCVTTSS2SIrr }, + { X86::CVTTSS2SI64rr, X86::VCVTTSS2SI64rr } }, + { { X86::CVTTSD2SIrr, X86::VCVTTSD2SIrr }, + { X86::CVTTSD2SI64rr, X86::VCVTTSD2SI64rr } } + }; + bool HasAVX = Subtarget->hasAVX(); + unsigned Opc; + switch (VT.SimpleTy) { + default: llvm_unreachable("Unexpected result type."); + case MVT::i32: Opc = CvtOpc[IsInputDouble][0][HasAVX]; break; + case MVT::i64: Opc = CvtOpc[IsInputDouble][1][HasAVX]; break; + } + + // Check if we can fold insertelement instructions into the convert. + const Value *Op = I.getArgOperand(0); + while (auto *IE = dyn_cast<InsertElementInst>(Op)) { + const Value *Index = IE->getOperand(2); + if (!isa<ConstantInt>(Index)) + break; + unsigned Idx = cast<ConstantInt>(Index)->getZExtValue(); + + if (Idx == 0) { + Op = IE->getOperand(1); + break; + } + Op = IE->getOperand(0); + } + + unsigned Reg = getRegForValue(Op); + if (Reg == 0) + return false; + + unsigned ResultReg = createResultReg(TLI.getRegClassFor(VT)); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg) + .addReg(Reg); + + UpdateValueMap(&I, ResultReg); + return true; + } } } @@ -1794,31 +2559,43 @@ bool X86FastISel::FastLowerArguments() { return false; // Only handle simple cases. i.e. Up to 6 i32/i64 scalar arguments. - unsigned Idx = 1; - for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end(); - I != E; ++I, ++Idx) { - if (Idx > 6) - return false; - + unsigned GPRCnt = 0; + unsigned FPRCnt = 0; + unsigned Idx = 0; + for (auto const &Arg : F->args()) { + // The first argument is at index 1. 
+ ++Idx; if (F->getAttributes().hasAttribute(Idx, Attribute::ByVal) || F->getAttributes().hasAttribute(Idx, Attribute::InReg) || F->getAttributes().hasAttribute(Idx, Attribute::StructRet) || F->getAttributes().hasAttribute(Idx, Attribute::Nest)) return false; - Type *ArgTy = I->getType(); + Type *ArgTy = Arg.getType(); if (ArgTy->isStructTy() || ArgTy->isArrayTy() || ArgTy->isVectorTy()) return false; EVT ArgVT = TLI.getValueType(ArgTy); if (!ArgVT.isSimple()) return false; switch (ArgVT.getSimpleVT().SimpleTy) { + default: return false; case MVT::i32: case MVT::i64: + ++GPRCnt; + break; + case MVT::f32: + case MVT::f64: + if (!Subtarget->hasSSE1()) + return false; + ++FPRCnt; break; - default: - return false; } + + if (GPRCnt > 6) + return false; + + if (FPRCnt > 8) + return false; } static const MCPhysReg GPR32ArgRegs[] = { @@ -1827,24 +2604,33 @@ bool X86FastISel::FastLowerArguments() { static const MCPhysReg GPR64ArgRegs[] = { X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8 , X86::R9 }; + static const MCPhysReg XMMArgRegs[] = { + X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3, + X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7 + }; - Idx = 0; - const TargetRegisterClass *RC32 = TLI.getRegClassFor(MVT::i32); - const TargetRegisterClass *RC64 = TLI.getRegClassFor(MVT::i64); - for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end(); - I != E; ++I, ++Idx) { - bool is32Bit = TLI.getValueType(I->getType()) == MVT::i32; - const TargetRegisterClass *RC = is32Bit ? RC32 : RC64; - unsigned SrcReg = is32Bit ? GPR32ArgRegs[Idx] : GPR64ArgRegs[Idx]; + unsigned GPRIdx = 0; + unsigned FPRIdx = 0; + for (auto const &Arg : F->args()) { + MVT VT = TLI.getSimpleValueType(Arg.getType()); + const TargetRegisterClass *RC = TLI.getRegClassFor(VT); + unsigned SrcReg; + switch (VT.SimpleTy) { + default: llvm_unreachable("Unexpected value type."); + case MVT::i32: SrcReg = GPR32ArgRegs[GPRIdx++]; break; + case MVT::i64: SrcReg = GPR64ArgRegs[GPRIdx++]; break; + case MVT::f32: // fall-through + case MVT::f64: SrcReg = XMMArgRegs[FPRIdx++]; break; + } unsigned DstReg = FuncInfo.MF->addLiveIn(SrcReg, RC); // FIXME: Unfortunately it's necessary to emit a copy from the livein copy. // Without this, EmitLiveInCopies may eliminate the livein if its only // use is a bitcast (which isn't turned into an instruction). unsigned ResultReg = createResultReg(RC); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, - TII.get(TargetOpcode::COPY), - ResultReg).addReg(DstReg, getKillRegState(true)); - UpdateValueMap(I, ResultReg); + TII.get(TargetOpcode::COPY), ResultReg) + .addReg(DstReg, getKillRegState(true)); + UpdateValueMap(&Arg, ResultReg); } return true; } @@ -2147,7 +2933,7 @@ bool X86FastISel::DoSelectCall(const Instruction *I, const char *MemIntName) { if (!X86FastEmitStore(ArgVT, ArgVal, AM)) return false; } else { - if (!X86FastEmitStore(ArgVT, Arg, AM)) + if (!X86FastEmitStore(ArgVT, Arg, /*ValIsKill=*/false, AM)) return false; } } @@ -2430,7 +3216,7 @@ unsigned X86FastISel::TargetMaterializeConstant(const Constant *C) { return 0; } - // Materialize addresses with LEA instructions. + // Materialize addresses with LEA/MOV instructions. if (isa<GlobalValue>(C)) { X86AddressMode AM; if (X86SelectAddress(C, AM)) { @@ -2440,10 +3226,19 @@ unsigned X86FastISel::TargetMaterializeConstant(const Constant *C) { AM.IndexReg == 0 && AM.Disp == 0 && AM.GV == nullptr) return AM.Base.Reg; - Opc = TLI.getPointerTy() == MVT::i32 ? 
X86::LEA32r : X86::LEA64r; unsigned ResultReg = createResultReg(RC); - addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + if (TM.getRelocationModel() == Reloc::Static && + TLI.getPointerTy() == MVT::i64) { + // The displacement could be more than 32 bits away so we need to use + // an instruction with a 64 bit immediate + Opc = X86::MOV64ri; + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(Opc), ResultReg).addGlobalAddress(cast<GlobalValue>(C)); + } else { + Opc = TLI.getPointerTy() == MVT::i32 ? X86::LEA32r : X86::LEA64r; + addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg), AM); + } return ResultReg; } return 0; @@ -2544,8 +3339,9 @@ unsigned X86FastISel::TargetMaterializeFloatZero(const ConstantFP *CF) { bool X86FastISel::tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo, const LoadInst *LI) { + const Value *Ptr = LI->getPointerOperand(); X86AddressMode AM; - if (!X86SelectAddress(LI->getOperand(0), AM)) + if (!X86SelectAddress(Ptr, AM)) return false; const X86InstrInfo &XII = (const X86InstrInfo&)TII; @@ -2553,13 +3349,18 @@ bool X86FastISel::tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo, unsigned Size = DL.getTypeAllocSize(LI->getType()); unsigned Alignment = LI->getAlignment(); + if (Alignment == 0) // Ensure that codegen never sees alignment 0 + Alignment = DL.getABITypeAlignment(LI->getType()); + SmallVector<MachineOperand, 8> AddrOps; AM.getFullAddress(AddrOps); MachineInstr *Result = XII.foldMemoryOperandImpl(*FuncInfo.MF, MI, OpNo, AddrOps, Size, Alignment); - if (!Result) return false; + if (!Result) + return false; + Result->addMemOperand(*FuncInfo.MF, createMachineMemOperandFor(LI)); FuncInfo.MBB->insert(FuncInfo.InsertPt, Result); MI->eraseFromParent(); return true; diff --git a/lib/Target/X86/X86FixupLEAs.cpp b/lib/Target/X86/X86FixupLEAs.cpp index 6c5b86f..4be766a 100644 --- a/lib/Target/X86/X86FixupLEAs.cpp +++ b/lib/Target/X86/X86FixupLEAs.cpp @@ -32,86 +32,89 @@ using namespace llvm; STATISTIC(NumLEAs, "Number of LEA instructions created"); namespace { - class FixupLEAPass : public MachineFunctionPass { - enum RegUsageState { RU_NotUsed, RU_Write, RU_Read }; - static char ID; - /// \brief Loop over all of the instructions in the basic block - /// replacing applicable instructions with LEA instructions, - /// where appropriate. - bool processBasicBlock(MachineFunction &MF, MachineFunction::iterator MFI); +class FixupLEAPass : public MachineFunctionPass { + enum RegUsageState { RU_NotUsed, RU_Write, RU_Read }; + static char ID; + /// \brief Loop over all of the instructions in the basic block + /// replacing applicable instructions with LEA instructions, + /// where appropriate. + bool processBasicBlock(MachineFunction &MF, MachineFunction::iterator MFI); - const char *getPassName() const override { return "X86 Atom LEA Fixup";} + const char *getPassName() const override { return "X86 Atom LEA Fixup"; } - /// \brief Given a machine register, look for the instruction - /// which writes it in the current basic block. If found, - /// try to replace it with an equivalent LEA instruction. - /// If replacement succeeds, then also process the the newly created - /// instruction. - void seekLEAFixup(MachineOperand& p, MachineBasicBlock::iterator& I, - MachineFunction::iterator MFI); + /// \brief Given a machine register, look for the instruction + /// which writes it in the current basic block. If found, + /// try to replace it with an equivalent LEA instruction.
+ /// If replacement succeeds, then also process the newly created + /// instruction. + void seekLEAFixup(MachineOperand &p, MachineBasicBlock::iterator &I, + MachineFunction::iterator MFI); - /// \brief Given a memory access or LEA instruction - /// whose address mode uses a base and/or index register, look for - /// an opportunity to replace the instruction which sets the base or index - /// register with an equivalent LEA instruction. - void processInstruction(MachineBasicBlock::iterator& I, - MachineFunction::iterator MFI); + /// \brief Given a memory access or LEA instruction + /// whose address mode uses a base and/or index register, look for + /// an opportunity to replace the instruction which sets the base or index + /// register with an equivalent LEA instruction. + void processInstruction(MachineBasicBlock::iterator &I, + MachineFunction::iterator MFI); - /// \brief Given a LEA instruction which is unprofitable - /// on Silvermont try to replace it with an equivalent ADD instruction - void processInstructionForSLM(MachineBasicBlock::iterator& I, - MachineFunction::iterator MFI); + /// \brief Given a LEA instruction which is unprofitable + /// on Silvermont try to replace it with an equivalent ADD instruction + void processInstructionForSLM(MachineBasicBlock::iterator &I, + MachineFunction::iterator MFI); - /// \brief Determine if an instruction references a machine register - /// and, if so, whether it reads or writes the register. - RegUsageState usesRegister(MachineOperand& p, - MachineBasicBlock::iterator I); + /// \brief Determine if an instruction references a machine register + /// and, if so, whether it reads or writes the register. + RegUsageState usesRegister(MachineOperand &p, MachineBasicBlock::iterator I); - /// \brief Step backwards through a basic block, looking - /// for an instruction which writes a register within - /// a maximum of INSTR_DISTANCE_THRESHOLD instruction latency cycles. - MachineBasicBlock::iterator searchBackwards(MachineOperand& p, - MachineBasicBlock::iterator& I, - MachineFunction::iterator MFI); + /// \brief Step backwards through a basic block, looking + /// for an instruction which writes a register within + /// a maximum of INSTR_DISTANCE_THRESHOLD instruction latency cycles. + MachineBasicBlock::iterator searchBackwards(MachineOperand &p, + MachineBasicBlock::iterator &I, + MachineFunction::iterator MFI); - /// \brief if an instruction can be converted to an - /// equivalent LEA, insert the new instruction into the basic block - /// and return a pointer to it. Otherwise, return zero. - MachineInstr* postRAConvertToLEA(MachineFunction::iterator &MFI, - MachineBasicBlock::iterator &MBBI) const; + /// \brief If an instruction can be converted to an + /// equivalent LEA, insert the new instruction into the basic block + /// and return a pointer to it. Otherwise, return zero. + MachineInstr *postRAConvertToLEA(MachineFunction::iterator &MFI, + MachineBasicBlock::iterator &MBBI) const; - public: - FixupLEAPass() : MachineFunctionPass(ID) {} +public: + FixupLEAPass() : MachineFunctionPass(ID) {} - /// \brief Loop over all of the basic blocks, - /// replacing instructions by equivalent LEA instructions - /// if needed and when possible. - bool runOnMachineFunction(MachineFunction &MF) override; + /// \brief Loop over all of the basic blocks, + /// replacing instructions by equivalent LEA instructions + /// if needed and when possible.
+ bool runOnMachineFunction(MachineFunction &MF) override; - private: - MachineFunction *MF; - const TargetMachine *TM; - const X86InstrInfo *TII; // Machine instruction info. - - }; - char FixupLEAPass::ID = 0; +private: + MachineFunction *MF; + const TargetMachine *TM; + const X86InstrInfo *TII; // Machine instruction info. +}; +char FixupLEAPass::ID = 0; } MachineInstr * FixupLEAPass::postRAConvertToLEA(MachineFunction::iterator &MFI, MachineBasicBlock::iterator &MBBI) const { - MachineInstr* MI = MBBI; - MachineInstr* NewMI; + MachineInstr *MI = MBBI; + MachineInstr *NewMI; switch (MI->getOpcode()) { case X86::MOV32rr: case X86::MOV64rr: { - const MachineOperand& Src = MI->getOperand(1); - const MachineOperand& Dest = MI->getOperand(0); + const MachineOperand &Src = MI->getOperand(1); + const MachineOperand &Dest = MI->getOperand(0); NewMI = BuildMI(*MF, MI->getDebugLoc(), - TII->get( MI->getOpcode() == X86::MOV32rr ? X86::LEA32r : X86::LEA64r)) - .addOperand(Dest) - .addOperand(Src).addImm(1).addReg(0).addImm(0).addReg(0); - MFI->insert(MBBI, NewMI); // Insert the new inst + TII->get(MI->getOpcode() == X86::MOV32rr ? X86::LEA32r + : X86::LEA64r)) + .addOperand(Dest) + .addOperand(Src) + .addImm(1) + .addReg(0) + .addImm(0) + .addReg(0); + MFI->insert(MBBI, NewMI); // Insert the new inst return NewMI; } case X86::ADD64ri32: @@ -144,17 +147,16 @@ FixupLEAPass::postRAConvertToLEA(MachineFunction::iterator &MFI, return TII->convertToThreeAddress(MFI, MBBI, nullptr); } -FunctionPass *llvm::createX86FixupLEAs() { - return new FixupLEAPass(); -} +FunctionPass *llvm::createX86FixupLEAs() { return new FixupLEAPass(); } bool FixupLEAPass::runOnMachineFunction(MachineFunction &Func) { + MF = &Func; TM = &Func.getTarget(); const X86Subtarget &ST = TM->getSubtarget<X86Subtarget>(); if (!ST.LEAusesAG() && !ST.slowLEA()) return false; - TII = static_cast<const X86InstrInfo*>(TM->getInstrInfo()); + TII = static_cast<const X86InstrInfo *>(TM->getInstrInfo()); DEBUG(dbgs() << "Start X86FixupLEAs\n";); // Process all basic blocks. @@ -165,14 +167,14 @@ bool FixupLEAPass::runOnMachineFunction(MachineFunction &Func) { return true; } -FixupLEAPass::RegUsageState FixupLEAPass::usesRegister(MachineOperand& p, - MachineBasicBlock::iterator I) { +FixupLEAPass::RegUsageState +FixupLEAPass::usesRegister(MachineOperand &p, MachineBasicBlock::iterator I) { RegUsageState RegUsage = RU_NotUsed; - MachineInstr* MI = I; + MachineInstr *MI = I; for (unsigned int i = 0; i < MI->getNumOperands(); ++i) { - MachineOperand& opnd = MI->getOperand(i); - if (opnd.isReg() && opnd.getReg() == p.getReg()){ + MachineOperand &opnd = MI->getOperand(i); + if (opnd.isReg() && opnd.getReg() == p.getReg()) { if (opnd.isDef()) return RU_Write; RegUsage = RU_Read; @@ -185,23 +187,22 @@ FixupLEAPass::RegUsageState FixupLEAPass::usesRegister(MachineOperand& p, /// block, return a reference to the previous instruction in the block, /// wrapping around to the last instruction of the block if the block /// branches to itself. 
-static inline bool getPreviousInstr(MachineBasicBlock::iterator& I, +static inline bool getPreviousInstr(MachineBasicBlock::iterator &I, MachineFunction::iterator MFI) { if (I == MFI->begin()) { if (MFI->isPredecessor(MFI)) { I = --MFI->end(); return true; - } - else + } else return false; } --I; return true; } -MachineBasicBlock::iterator FixupLEAPass::searchBackwards(MachineOperand& p, - MachineBasicBlock::iterator& I, - MachineFunction::iterator MFI) { +MachineBasicBlock::iterator +FixupLEAPass::searchBackwards(MachineOperand &p, MachineBasicBlock::iterator &I, + MachineFunction::iterator MFI) { int InstrDistance = 1; MachineBasicBlock::iterator CurInst; static const int INSTR_DISTANCE_THRESHOLD = 5; @@ -209,12 +210,12 @@ MachineBasicBlock::iterator FixupLEAPass::searchBackwards(MachineOperand& p, CurInst = I; bool Found; Found = getPreviousInstr(CurInst, MFI); - while( Found && I != CurInst) { + while (Found && I != CurInst) { if (CurInst->isCall() || CurInst->isInlineAsm()) break; if (InstrDistance > INSTR_DISTANCE_THRESHOLD) break; // too far back to make a difference - if (usesRegister(p, CurInst) == RU_Write){ + if (usesRegister(p, CurInst) == RU_Write) { return CurInst; } InstrDistance += TII->getInstrLatency(TM->getInstrItineraryData(), CurInst); @@ -223,32 +224,32 @@ MachineBasicBlock::iterator FixupLEAPass::searchBackwards(MachineOperand& p, return nullptr; } -void FixupLEAPass::processInstruction(MachineBasicBlock::iterator& I, +void FixupLEAPass::processInstruction(MachineBasicBlock::iterator &I, MachineFunction::iterator MFI) { // Process a load, store, or LEA instruction. MachineInstr *MI = I; int opcode = MI->getOpcode(); - const MCInstrDesc& Desc = MI->getDesc(); + const MCInstrDesc &Desc = MI->getDesc(); int AddrOffset = X86II::getMemoryOperandNo(Desc.TSFlags, opcode); if (AddrOffset >= 0) { AddrOffset += X86II::getOperandBias(Desc); - MachineOperand& p = MI->getOperand(AddrOffset + X86::AddrBaseReg); + MachineOperand &p = MI->getOperand(AddrOffset + X86::AddrBaseReg); if (p.isReg() && p.getReg() != X86::ESP) { seekLEAFixup(p, I, MFI); } - MachineOperand& q = MI->getOperand(AddrOffset + X86::AddrIndexReg); + MachineOperand &q = MI->getOperand(AddrOffset + X86::AddrIndexReg); if (q.isReg() && q.getReg() != X86::ESP) { seekLEAFixup(q, I, MFI); } } } -void FixupLEAPass::seekLEAFixup(MachineOperand& p, - MachineBasicBlock::iterator& I, +void FixupLEAPass::seekLEAFixup(MachineOperand &p, + MachineBasicBlock::iterator &I, MachineFunction::iterator MFI) { MachineBasicBlock::iterator MBI = searchBackwards(p, I, MFI); if (MBI) { - MachineInstr* NewMI = postRAConvertToLEA(MFI, MBI); + MachineInstr *NewMI = postRAConvertToLEA(MFI, MBI); if (NewMI) { ++NumLEAs; DEBUG(dbgs() << "FixLEA: Candidate to replace:"; MBI->dump();); @@ -256,7 +257,7 @@ void FixupLEAPass::seekLEAFixup(MachineOperand& p, DEBUG(dbgs() << "FixLEA: Replaced by: "; NewMI->dump();); MFI->erase(MBI); MachineBasicBlock::iterator J = - static_cast<MachineBasicBlock::iterator> (NewMI); + static_cast<MachineBasicBlock::iterator>(NewMI); processInstruction(J, MFI); } } @@ -299,7 +300,7 @@ void FixupLEAPass::processInstructionForSLM(MachineBasicBlock::iterator &I, } DEBUG(dbgs() << "FixLEA: Candidate to replace:"; I->dump();); DEBUG(dbgs() << "FixLEA: Replaced by: ";); - MachineInstr *NewMI = 0; + MachineInstr *NewMI = nullptr; const MachineOperand &Dst = MI->getOperand(0); // Make ADD instruction for two registers writing to LEA's destination if (SrcR1 != 0 && SrcR2 != 0) { diff --git 
a/lib/Target/X86/X86FrameLowering.cpp b/lib/Target/X86/X86FrameLowering.cpp index 4c1374f..8c029a8 100644 --- a/lib/Target/X86/X86FrameLowering.cpp +++ b/lib/Target/X86/X86FrameLowering.cpp @@ -29,6 +29,7 @@ #include "llvm/MC/MCSymbol.h" #include "llvm/Support/CommandLine.h" #include "llvm/Target/TargetOptions.h" +#include "llvm/Support/Debug.h" using namespace llvm; @@ -45,7 +46,7 @@ bool X86FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const { bool X86FrameLowering::hasFP(const MachineFunction &MF) const { const MachineFrameInfo *MFI = MF.getFrameInfo(); const MachineModuleInfo &MMI = MF.getMMI(); - const TargetRegisterInfo *RegInfo = TM.getRegisterInfo(); + const TargetRegisterInfo *RegInfo = MF.getTarget().getRegisterInfo(); return (MF.getTarget().Options.DisableFramePointerElim(MF) || RegInfo->needsStackRealignment(MF) || @@ -305,65 +306,25 @@ static bool isEAXLiveIn(MachineFunction &MF) { return false; } -void X86FrameLowering::emitCalleeSavedFrameMoves( - MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, DebugLoc DL, - unsigned FramePtr) const { +void +X86FrameLowering::emitCalleeSavedFrameMoves(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + DebugLoc DL) const { MachineFunction &MF = *MBB.getParent(); MachineFrameInfo *MFI = MF.getFrameInfo(); MachineModuleInfo &MMI = MF.getMMI(); const MCRegisterInfo *MRI = MMI.getContext().getRegisterInfo(); - const X86InstrInfo &TII = *TM.getInstrInfo(); + const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo(); // Add callee saved registers to move list. const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo(); if (CSI.empty()) return; - const X86RegisterInfo *RegInfo = TM.getRegisterInfo(); - bool HasFP = hasFP(MF); - - // Calculate amount of bytes used for return address storing. - int stackGrowth = -RegInfo->getSlotSize(); - - // FIXME: This is dirty hack. The code itself is pretty mess right now. - // It should be rewritten from scratch and generalized sometimes. - - // Determine maximum offset (minimum due to stack growth). - int64_t MaxOffset = 0; - for (std::vector<CalleeSavedInfo>::const_iterator - I = CSI.begin(), E = CSI.end(); I != E; ++I) - MaxOffset = std::min(MaxOffset, - MFI->getObjectOffset(I->getFrameIdx())); - // Calculate offsets. - int64_t saveAreaOffset = (HasFP ? 3 : 2) * stackGrowth; for (std::vector<CalleeSavedInfo>::const_iterator I = CSI.begin(), E = CSI.end(); I != E; ++I) { int64_t Offset = MFI->getObjectOffset(I->getFrameIdx()); unsigned Reg = I->getReg(); - Offset = MaxOffset - Offset + saveAreaOffset; - - // Don't output a new machine move if we're re-saving the frame - // pointer. This happens when the PrologEpilogInserter has inserted an extra - // "PUSH" of the frame pointer -- the "emitPrologue" method automatically - // generates one when frame pointers are used. If we generate a "machine - // move" for this extra "PUSH", the linker will lose track of the fact that - // the frame pointer should have the value of the first "PUSH" when it's - // trying to unwind. - // - // FIXME: This looks inelegant. It's possibly correct, but it's covering up - // another bug. I.e., one where we generate a prolog like this: - // - // pushl %ebp - // movl %esp, %ebp - // pushl %ebp - // pushl %esi - // ... - // - // The immediate re-push of EBP is unnecessary. At the least, it's an - // optimization bug. EBP can be used as a scratch register in certain - // cases, but probably not when we have a frame pointer. 
- if (HasFP && FramePtr == Reg) - continue; unsigned DwarfReg = MRI->getDwarfRegNum(Reg, true); unsigned CFIIndex = @@ -395,23 +356,107 @@ static bool usesTheStack(const MachineFunction &MF) { /// automatically adjust the stack pointer. Adjust the stack pointer to allocate /// space for local variables. Also emit labels used by the exception handler to /// generate the exception handling frames. + +/* + Here's a gist of what gets emitted: + + ; Establish frame pointer, if needed + [if needs FP] + push %rbp + .cfi_def_cfa_offset 16 + .cfi_offset %rbp, -16 + .seh_pushreg %rbp + mov %rsp, %rbp + .cfi_def_cfa_register %rbp + + ; Spill general-purpose registers + [for all callee-saved GPRs] + pushq %<reg> + [if not needs FP] + .cfi_def_cfa_offset (offset from RETADDR) + .seh_pushreg %<reg> + + ; If the required stack alignment > default stack alignment + ; rsp needs to be re-aligned. This creates a "re-alignment gap" + ; of unknown size in the stack frame. + [if stack needs re-alignment] + and $MASK, %rsp + + ; Allocate space for locals + [if target is Windows and allocated space > 4096 bytes] + ; Windows needs special care for allocations larger + ; than one page. + mov $NNN, %rax + call ___chkstk_ms/___chkstk + sub %rax, %rsp + [else] + sub $NNN, %rsp + + [if needs FP] + .seh_stackalloc (size of XMM spill slots) + .seh_setframe %rbp, SEHFrameOffset ; = size of all spill slots + [else] + .seh_stackalloc NNN + + ; Spill XMMs + ; Note that while only the Windows 64 ABI specifies XMMs as callee-preserved, + ; they may get spilled on any platform, if the current function + ; calls @llvm.eh.unwind.init + [if needs FP] + [for all callee-saved XMM registers] + movaps %<xmm reg>, -MMM(%rbp) + [for all callee-saved XMM registers] + .seh_savexmm %<xmm reg>, (-MMM + SEHFrameOffset) + ; i.e. the offset relative to (%rbp - SEHFrameOffset) + [else] + [for all callee-saved XMM registers] + movaps %<xmm reg>, KKK(%rsp) + [for all callee-saved XMM registers] + .seh_savexmm %<xmm reg>, KKK + + .seh_endprologue + + [if needs base pointer] + mov %rsp, %rbx + + ; Emit CFI info + [if needs FP] + [for all callee-saved registers] + .cfi_offset %<reg>, (offset from %rbp) + [else] + .cfi_def_cfa_offset (offset from RETADDR) + [for all callee-saved registers] + .cfi_offset %<reg>, (offset from %rsp) + + Notes: + - .seh directives are emitted only for Windows 64 ABI + - .cfi directives are emitted for all other ABIs + - for 32-bit code, substitute %e?? registers for %r?? +*/ + void X86FrameLowering::emitPrologue(MachineFunction &MF) const { MachineBasicBlock &MBB = MF.front(); // Prologue goes in entry BB. MachineBasicBlock::iterator MBBI = MBB.begin(); MachineFrameInfo *MFI = MF.getFrameInfo(); const Function *Fn = MF.getFunction(); - const X86RegisterInfo *RegInfo = TM.getRegisterInfo(); - const X86InstrInfo &TII = *TM.getInstrInfo(); + const X86RegisterInfo *RegInfo = + static_cast<const X86RegisterInfo *>(MF.getTarget().getRegisterInfo()); + const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo(); MachineModuleInfo &MMI = MF.getMMI(); X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); - bool needsFrameMoves = MMI.hasDebugInfo() || - Fn->needsUnwindTableEntry(); uint64_t MaxAlign = MFI->getMaxAlignment(); // Desired stack alignment. uint64_t StackSize = MFI->getStackSize(); // Number of bytes to allocate.
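// A hedged, concrete instance of the gist above (registers and sizes are
// illustrative only, not taken from this patch): a Win64 function with a
// frame pointer, one callee-saved GPR (%rsi), and one callee-saved XMM
// (%xmm6) would come out roughly as:
//   push %rbp
//   .seh_pushreg %rbp
//   mov %rsp, %rbp
//   push %rsi
//   .seh_pushreg %rsi
//   sub $NNN, %rsp                  ; locals plus the XMM spill area
//   .seh_stackalloc (size of XMM spill slots)
//   .seh_setframe %rbp, SEHFrameOffset
//   movaps %xmm6, -MMM(%rbp)
//   .seh_savexmm %xmm6, (-MMM + SEHFrameOffset)
//   .seh_endprologue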
bool HasFP = hasFP(MF); + const X86Subtarget &STI = MF.getTarget().getSubtarget<X86Subtarget>(); bool Is64Bit = STI.is64Bit(); bool IsLP64 = STI.isTarget64BitLP64(); bool IsWin64 = STI.isTargetWin64(); + bool IsWinEH = + MF.getTarget().getMCAsmInfo()->getExceptionHandlingType() == + ExceptionHandling::WinEH; // Not necessarily synonymous with IsWin64. + bool NeedsWinEH = IsWinEH && Fn->needsUnwindTableEntry(); + bool NeedsDwarfCFI = + !IsWinEH && (MMI.hasDebugInfo() || Fn->needsUnwindTableEntry()); bool UseLEA = STI.useLeaForSP(); unsigned StackAlign = getStackAlignment(); unsigned SlotSize = RegInfo->getSlotSize(); @@ -509,7 +554,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const { .addReg(FramePtr, RegState::Kill) .setMIFlag(MachineInstr::FrameSetup); - if (needsFrameMoves) { + if (NeedsDwarfCFI) { // Mark the place where EBP/RBP was saved. // Define the current CFA rule to use the provided offset. assert(StackSize); @@ -527,13 +572,19 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const { .addCFIIndex(CFIIndex); } + if (NeedsWinEH) { + BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_PushReg)) + .addImm(FramePtr) + .setMIFlag(MachineInstr::FrameSetup); + } + // Update EBP with the new base value. BuildMI(MBB, MBBI, DL, TII.get(Is64Bit ? X86::MOV64rr : X86::MOV32rr), FramePtr) .addReg(StackPtr) .setMIFlag(MachineInstr::FrameSetup); - if (needsFrameMoves) { + if (NeedsDwarfCFI) { // Mark effective beginning of when frame pointer becomes valid. // Define the current CFA to use the EBP/RBP register. unsigned DwarfFramePtr = RegInfo->getDwarfRegNum(FramePtr, true); @@ -543,9 +594,8 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const { .addCFIIndex(CFIIndex); } - // Mark the FramePtr as live-in in every block except the entry. - for (MachineFunction::iterator I = std::next(MF.begin()), E = MF.end(); - I != E; ++I) + // Mark the FramePtr as live-in in every block. + for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I) I->addLiveIn(FramePtr); } else { NumBytes = StackSize - X86FI->getCalleeSavedFrameSize(); @@ -559,10 +609,10 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const { (MBBI->getOpcode() == X86::PUSH32r || MBBI->getOpcode() == X86::PUSH64r)) { PushedRegs = true; - MBBI->setFlag(MachineInstr::FrameSetup); + unsigned Reg = MBBI->getOperand(0).getReg(); ++MBBI; - if (!HasFP && needsFrameMoves) { + if (!HasFP && NeedsDwarfCFI) { // Mark callee-saved push instruction. // Define the current CFA rule to use the provided offset. assert(StackSize); @@ -572,16 +622,15 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const { .addCFIIndex(CFIIndex); StackOffset += stackGrowth; } + + if (NeedsWinEH) { + BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_PushReg)).addImm(Reg).setMIFlag( + MachineInstr::FrameSetup); + } } // Realign stack after we pushed callee-saved registers (so that we'll be // able to calculate their offsets from the frame pointer). - - // NOTE: We push the registers before realigning the stack, so - // vector callee-saved (xmm) registers may be saved w/o proper - // alignment in this way. However, currently these regs are saved in - // stack slots (see X86FrameLowering::spillCalleeSavedRegisters()), so - // this shouldn't be a problem. 
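The stack realignment performed just below amounts to clearing the low bits of the stack pointer. A minimal standalone sketch of the same arithmetic, assuming MaxAlign is a power of two (the helper name is invented here for illustration):

#include <cstdint>

// Round SP down to a MaxAlign boundary, mirroring the
// "and $MASK, %rsp" instruction the prologue emits when realigning.
static uint64_t realignDown(uint64_t SP, uint64_t MaxAlign) {
  return SP & ~(MaxAlign - 1);
}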
if (RegInfo->needsStackRealignment(MF)) { assert(HasFP && "There should be a frame pointer if stack is realigned."); MachineInstr *MI = @@ -680,23 +729,88 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const { MI->setFlag(MachineInstr::FrameSetup); MBB.insert(MBBI, MI); } - } else if (NumBytes) + } else if (NumBytes) { emitSPUpdate(MBB, MBBI, StackPtr, -(int64_t)NumBytes, Is64Bit, IsLP64, UseLEA, TII, *RegInfo); + } + + int SEHFrameOffset = 0; + if (NeedsWinEH) { + if (HasFP) { + // We need to set frame base offset low enough such that all saved + // register offsets would be positive relative to it, but we can't + // just use NumBytes, because .seh_setframe offset must be <=240. + // So we pretend to have only allocated enough space to spill the + // non-volatile registers. + // We don't care about the rest of the stack allocation, because the + // unwinder will restore SP to (BP - SEHFrameOffset). + for (const CalleeSavedInfo &Info : MFI->getCalleeSavedInfo()) { + int offset = MFI->getObjectOffset(Info.getFrameIdx()); + SEHFrameOffset = std::max(SEHFrameOffset, abs(offset)); + } + SEHFrameOffset += SEHFrameOffset % 16; // ensure alignment + + // This only needs to account for XMM spill slots; GPR slots + // are covered by the .seh_pushreg's emitted above. + unsigned Size = SEHFrameOffset - X86FI->getCalleeSavedFrameSize(); + if (Size) { + BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_StackAlloc)) + .addImm(Size) + .setMIFlag(MachineInstr::FrameSetup); + } + + BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_SetFrame)) + .addImm(FramePtr) + .addImm(SEHFrameOffset) + .setMIFlag(MachineInstr::FrameSetup); + } else { + // SP will be the base register for restoring XMMs + if (NumBytes) { + BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_StackAlloc)) + .addImm(NumBytes) + .setMIFlag(MachineInstr::FrameSetup); + } + } + } + + // Skip the rest of the register spilling code + while (MBBI != MBB.end() && MBBI->getFlag(MachineInstr::FrameSetup)) + ++MBBI; + + // Emit SEH info for non-GPRs + if (NeedsWinEH) { + for (const CalleeSavedInfo &Info : MFI->getCalleeSavedInfo()) { + unsigned Reg = Info.getReg(); + if (X86::GR64RegClass.contains(Reg) || X86::GR32RegClass.contains(Reg)) + continue; + assert(X86::FR64RegClass.contains(Reg) && "Unexpected register class"); + + int Offset = getFrameIndexOffset(MF, Info.getFrameIdx()); + Offset += SEHFrameOffset; + + BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_SaveXMM)) + .addImm(Reg) + .addImm(Offset) + .setMIFlag(MachineInstr::FrameSetup); + } + + BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_EndPrologue)) + .setMIFlag(MachineInstr::FrameSetup); + } // If we need a base pointer, set it up here. It's whatever the value // of the stack pointer is at this point. Any variable size objects // will be allocated after this, so we can still use the base pointer // to reference locals. if (RegInfo->hasBasePointer(MF)) { - // Update the frame pointer with the current stack pointer. + // Update the base pointer with the current stack pointer. unsigned Opc = Is64Bit ? X86::MOV64rr : X86::MOV32rr; BuildMI(MBB, MBBI, DL, TII.get(Opc), BasePtr) .addReg(StackPtr) .setMIFlag(MachineInstr::FrameSetup); } - if (( (!HasFP && NumBytes) || PushedRegs) && needsFrameMoves) { + if (((!HasFP && NumBytes) || PushedRegs) && NeedsDwarfCFI) { // Mark end of stack pointer adjustment. if (!HasFP && NumBytes) { // Define the current CFA rule to use the provided offset. @@ -711,7 +825,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const { // Emit DWARF info specifying the offsets of the callee-saved registers.
if (PushedRegs) - emitCalleeSavedFrameMoves(MBB, MBBI, DL, HasFP ? FramePtr : StackPtr); + emitCalleeSavedFrameMoves(MBB, MBBI, DL); } } @@ -719,12 +833,14 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const { const MachineFrameInfo *MFI = MF.getFrameInfo(); X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); - const X86RegisterInfo *RegInfo = TM.getRegisterInfo(); - const X86InstrInfo &TII = *TM.getInstrInfo(); + const X86RegisterInfo *RegInfo = + static_cast<const X86RegisterInfo *>(MF.getTarget().getRegisterInfo()); + const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo(); MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr(); assert(MBBI != MBB.end() && "Returning block has no instructions"); unsigned RetOpcode = MBBI->getOpcode(); DebugLoc DL = MBBI->getDebugLoc(); + const X86Subtarget &STI = MF.getTarget().getSubtarget<X86Subtarget>(); bool Is64Bit = STI.is64Bit(); bool IsLP64 = STI.isTarget64BitLP64(); bool UseLEA = STI.useLeaForSP(); @@ -969,46 +1085,97 @@ int X86FrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI, return getFrameIndexOffset(MF, FI); } -bool X86FrameLowering::spillCalleeSavedRegisters(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI, - const std::vector<CalleeSavedInfo> &CSI, - const TargetRegisterInfo *TRI) const { - if (CSI.empty()) - return false; +bool X86FrameLowering::assignCalleeSavedSpillSlots( + MachineFunction &MF, const TargetRegisterInfo *TRI, + std::vector<CalleeSavedInfo> &CSI) const { + MachineFrameInfo *MFI = MF.getFrameInfo(); + const X86RegisterInfo *RegInfo = + static_cast<const X86RegisterInfo *>(MF.getTarget().getRegisterInfo()); + unsigned SlotSize = RegInfo->getSlotSize(); + X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); - DebugLoc DL = MBB.findDebugLoc(MI); + unsigned CalleeSavedFrameSize = 0; + int SpillSlotOffset = getOffsetOfLocalArea() + X86FI->getTCReturnAddrDelta(); - MachineFunction &MF = *MBB.getParent(); + if (hasFP(MF)) { + // emitPrologue always spills the frame register first. + SpillSlotOffset -= SlotSize; + MFI->CreateFixedSpillStackObject(SlotSize, SpillSlotOffset); + + // Since emitPrologue and emitEpilogue will handle spilling and restoring of + // the frame register, we can delete it from the CSI list and not have to + // worry about avoiding it later. + unsigned FPReg = RegInfo->getFrameRegister(MF); + for (unsigned i = 0; i < CSI.size(); ++i) { + if (CSI[i].getReg() == FPReg) { + CSI.erase(CSI.begin() + i); + break; + } + } + } + + // Assign slots for GPRs. It increases frame size. + for (unsigned i = CSI.size(); i != 0; --i) { + unsigned Reg = CSI[i - 1].getReg(); + + if (!X86::GR64RegClass.contains(Reg) && !X86::GR32RegClass.contains(Reg)) + continue; - unsigned SlotSize = STI.is64Bit() ? 8 : 4; - unsigned FPReg = TRI->getFrameRegister(MF); - unsigned CalleeFrameSize = 0; + SpillSlotOffset -= SlotSize; + CalleeSavedFrameSize += SlotSize; + + int SlotIndex = MFI->CreateFixedSpillStackObject(SlotSize, SpillSlotOffset); + CSI[i - 1].setFrameIdx(SlotIndex); + } + + X86FI->setCalleeSavedFrameSize(CalleeSavedFrameSize); + + // Assign slots for XMMs.
+ for (unsigned i = CSI.size(); i != 0; --i) { + unsigned Reg = CSI[i - 1].getReg(); + if (X86::GR64RegClass.contains(Reg) || X86::GR32RegClass.contains(Reg)) + continue; + + const TargetRegisterClass *RC = RegInfo->getMinimalPhysRegClass(Reg); + // ensure alignment + SpillSlotOffset -= abs(SpillSlotOffset) % RC->getAlignment(); + // spill into slot + SpillSlotOffset -= RC->getSize(); + int SlotIndex = + MFI->CreateFixedSpillStackObject(RC->getSize(), SpillSlotOffset); + CSI[i - 1].setFrameIdx(SlotIndex); + MFI->ensureMaxAlignment(RC->getAlignment()); + } + + return true; +} +bool X86FrameLowering::spillCalleeSavedRegisters( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, + const std::vector<CalleeSavedInfo> &CSI, + const TargetRegisterInfo *TRI) const { + DebugLoc DL = MBB.findDebugLoc(MI); + + MachineFunction &MF = *MBB.getParent(); const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo(); - X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); + const X86Subtarget &STI = MF.getTarget().getSubtarget<X86Subtarget>(); // Push GPRs. It increases frame size. unsigned Opc = STI.is64Bit() ? X86::PUSH64r : X86::PUSH32r; for (unsigned i = CSI.size(); i != 0; --i) { - unsigned Reg = CSI[i-1].getReg(); - if (!X86::GR64RegClass.contains(Reg) && - !X86::GR32RegClass.contains(Reg)) + unsigned Reg = CSI[i - 1].getReg(); + + if (!X86::GR64RegClass.contains(Reg) && !X86::GR32RegClass.contains(Reg)) continue; // Add the callee-saved register as live-in. It's killed at the spill. MBB.addLiveIn(Reg); - if (Reg == FPReg) - // X86RegisterInfo::emitPrologue will handle spilling of frame register. - continue; - CalleeFrameSize += SlotSize; + BuildMI(MBB, MI, DL, TII.get(Opc)).addReg(Reg, RegState::Kill) .setMIFlag(MachineInstr::FrameSetup); } - X86FI->setCalleeSavedFrameSize(CalleeFrameSize); - // Make XMM regs spilled. X86 does not have ability of push/pop XMM. // It can be done by spilling XMMs to stack frame. - // Note that only Win64 ABI might spill XMMs. for (unsigned i = CSI.size(); i != 0; --i) { unsigned Reg = CSI[i-1].getReg(); if (X86::GR64RegClass.contains(Reg) || @@ -1017,8 +1184,12 @@ bool X86FrameLowering::spillCalleeSavedRegisters(MachineBasicBlock &MBB, // Add the callee-saved register as live-in. It's killed at the spill. MBB.addLiveIn(Reg); const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); - TII.storeRegToStackSlot(MBB, MI, Reg, true, CSI[i-1].getFrameIdx(), - RC, TRI); + + TII.storeRegToStackSlot(MBB, MI, Reg, true, CSI[i - 1].getFrameIdx(), RC, + TRI); + --MI; + MI->setFlag(MachineInstr::FrameSetup); + ++MI; } return true; @@ -1035,6 +1206,7 @@ bool X86FrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB, MachineFunction &MF = *MBB.getParent(); const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo(); + const X86Subtarget &STI = MF.getTarget().getSubtarget<X86Subtarget>(); // Reload XMMs from stack frame. for (unsigned i = 0, e = CSI.size(); i != e; ++i) { @@ -1042,22 +1214,19 @@ bool X86FrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB, if (X86::GR64RegClass.contains(Reg) || X86::GR32RegClass.contains(Reg)) continue; + const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); - TII.loadRegFromStackSlot(MBB, MI, Reg, CSI[i].getFrameIdx(), - RC, TRI); + TII.loadRegFromStackSlot(MBB, MI, Reg, CSI[i].getFrameIdx(), RC, TRI); } // POP GPRs. - unsigned FPReg = TRI->getFrameRegister(MF); unsigned Opc = STI.is64Bit() ? 
X86::POP64r : X86::POP32r; for (unsigned i = 0, e = CSI.size(); i != e; ++i) { unsigned Reg = CSI[i].getReg(); if (!X86::GR64RegClass.contains(Reg) && !X86::GR32RegClass.contains(Reg)) continue; - if (Reg == FPReg) - // X86RegisterInfo::emitEpilogue will handle restoring of frame register. - continue; + BuildMI(MBB, MI, DL, TII.get(Opc), Reg); } return true; @@ -1065,9 +1234,10 @@ bool X86FrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB, void X86FrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF, - RegScavenger *RS) const { + RegScavenger *RS) const { MachineFrameInfo *MFI = MF.getFrameInfo(); - const X86RegisterInfo *RegInfo = TM.getRegisterInfo(); + const X86RegisterInfo *RegInfo = + static_cast<const X86RegisterInfo *>(MF.getTarget().getRegisterInfo()); unsigned SlotSize = RegInfo->getSlotSize(); X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); @@ -1087,22 +1257,6 @@ X86FrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF, TailCallReturnAddrDelta - SlotSize, true); } - if (hasFP(MF)) { - assert((TailCallReturnAddrDelta <= 0) && - "The Delta should always be zero or negative"); - const TargetFrameLowering &TFI = *MF.getTarget().getFrameLowering(); - - // Create a frame entry for the EBP register that must be saved. - int FrameIdx = MFI->CreateFixedObject(SlotSize, - -(int)SlotSize + - TFI.getOffsetOfLocalArea() + - TailCallReturnAddrDelta, - true); - assert(FrameIdx == MFI->getObjectIndexBegin() && - "Slot for EBP register must be last in order to be found!"); - (void)FrameIdx; - } - // Spill the BasePtr if it's used. if (RegInfo->hasBasePointer(MF)) MF.getRegInfo().setPhysRegUsed(RegInfo->getBaseRegister()); @@ -1160,8 +1314,9 @@ void X86FrameLowering::adjustForSegmentedStacks(MachineFunction &MF) const { MachineBasicBlock &prologueMBB = MF.front(); MachineFrameInfo *MFI = MF.getFrameInfo(); - const X86InstrInfo &TII = *TM.getInstrInfo(); + const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo(); uint64_t StackSize; + const X86Subtarget &STI = MF.getTarget().getSubtarget<X86Subtarget>(); bool Is64Bit = STI.is64Bit(); unsigned TlsReg, TlsOffset; DebugLoc DL; @@ -1368,9 +1523,12 @@ X86FrameLowering::adjustForSegmentedStacks(MachineFunction &MF) const { /// temp0 = sp - MaxStack /// if( temp0 < SP_LIMIT(P) ) goto IncStack else goto OldStart void X86FrameLowering::adjustForHiPEPrologue(MachineFunction &MF) const { - const X86InstrInfo &TII = *TM.getInstrInfo(); + const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo(); MachineFrameInfo *MFI = MF.getFrameInfo(); - const unsigned SlotSize = TM.getRegisterInfo()->getSlotSize(); + const unsigned SlotSize = + static_cast<const X86RegisterInfo *>(MF.getTarget().getRegisterInfo()) + ->getSlotSize(); + const X86Subtarget &STI = MF.getTarget().getSubtarget<X86Subtarget>(); const bool Is64Bit = STI.is64Bit(); DebugLoc DL; // HiPE-specific values @@ -1499,12 +1657,14 @@ void X86FrameLowering::adjustForHiPEPrologue(MachineFunction &MF) const { void X86FrameLowering:: eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const { - const X86InstrInfo &TII = *TM.getInstrInfo(); - const X86RegisterInfo &RegInfo = *TM.getRegisterInfo(); + const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo(); + const X86RegisterInfo &RegInfo = + *static_cast<const X86RegisterInfo *>(MF.getTarget().getRegisterInfo()); unsigned StackPtr = RegInfo.getStackRegister(); bool reseveCallFrame = hasReservedCallFrame(MF); int Opcode = 
I->getOpcode(); bool isDestroy = Opcode == TII.getCallFrameDestroyOpcode(); + const X86Subtarget &STI = MF.getTarget().getSubtarget<X86Subtarget>(); bool IsLP64 = STI.isTarget64BitLP64(); DebugLoc DL = I->getDebugLoc(); uint64_t Amount = !reseveCallFrame ? I->getOperand(0).getImm() : 0; @@ -1522,7 +1682,8 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, // We need to keep the stack aligned properly. To do this, we round the // amount of space needed for the outgoing arguments up to the next // alignment boundary. - unsigned StackAlign = TM.getFrameLowering()->getStackAlignment(); + unsigned StackAlign = + MF.getTarget().getFrameLowering()->getStackAlignment(); Amount = (Amount + StackAlign - 1) / StackAlign * StackAlign; MachineInstr *New = nullptr; diff --git a/lib/Target/X86/X86FrameLowering.h b/lib/Target/X86/X86FrameLowering.h index 208bb8b..5ad3d4d 100644 --- a/lib/Target/X86/X86FrameLowering.h +++ b/lib/Target/X86/X86FrameLowering.h @@ -14,7 +14,6 @@ #ifndef X86_FRAMELOWERING_H #define X86_FRAMELOWERING_H -#include "X86Subtarget.h" #include "llvm/Target/TargetFrameLowering.h" namespace llvm { @@ -23,19 +22,13 @@ class MCSymbol; class X86TargetMachine; class X86FrameLowering : public TargetFrameLowering { - const X86TargetMachine &TM; - const X86Subtarget &STI; public: - explicit X86FrameLowering(const X86TargetMachine &tm, const X86Subtarget &sti) - : TargetFrameLowering(StackGrowsDown, - sti.getStackAlignment(), - (sti.is64Bit() ? -8 : -4)), - TM(tm), STI(sti) { - } + explicit X86FrameLowering(StackDirection D, unsigned StackAl, int LAO) + : TargetFrameLowering(StackGrowsDown, StackAl, LAO) {} void emitCalleeSavedFrameMoves(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, DebugLoc DL, - unsigned FramePtr) const; + MachineBasicBlock::iterator MBBI, + DebugLoc DL) const; /// emitProlog/emitEpilog - These methods insert prolog and epilog code into /// the function. 
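For reference, the outgoing-argument rounding in eliminateCallFramePseudoInstr above, "(Amount + StackAlign - 1) / StackAlign * StackAlign", is the usual round-up-to-a-multiple idiom. A self-contained sketch of that computation, assuming StackAlign is nonzero (the helper name is invented here):

#include <cstdint>

// Round Amount up to the next multiple of StackAlign, as done for the
// outgoing-argument area before the stack adjustment is emitted.
static uint64_t roundUpToAlignment(uint64_t Amount, uint64_t StackAlign) {
  return (Amount + StackAlign - 1) / StackAlign * StackAlign;
}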
@@ -49,6 +42,11 @@ public: void processFunctionBeforeCalleeSavedScan(MachineFunction &MF, RegScavenger *RS = nullptr) const override; + bool + assignCalleeSavedSpillSlots(MachineFunction &MF, + const TargetRegisterInfo *TRI, + std::vector<CalleeSavedInfo> &CSI) const override; + bool spillCalleeSavedRegisters(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const std::vector<CalleeSavedInfo> &CSI, diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp index 74386d3..ba2f5f6 100644 --- a/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -2126,38 +2126,6 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { return getGlobalBaseReg(); - case X86ISD::ATOMOR64_DAG: - case X86ISD::ATOMXOR64_DAG: - case X86ISD::ATOMADD64_DAG: - case X86ISD::ATOMSUB64_DAG: - case X86ISD::ATOMNAND64_DAG: - case X86ISD::ATOMAND64_DAG: - case X86ISD::ATOMMAX64_DAG: - case X86ISD::ATOMMIN64_DAG: - case X86ISD::ATOMUMAX64_DAG: - case X86ISD::ATOMUMIN64_DAG: - case X86ISD::ATOMSWAP64_DAG: { - unsigned Opc; - switch (Opcode) { - default: llvm_unreachable("Impossible opcode"); - case X86ISD::ATOMOR64_DAG: Opc = X86::ATOMOR6432; break; - case X86ISD::ATOMXOR64_DAG: Opc = X86::ATOMXOR6432; break; - case X86ISD::ATOMADD64_DAG: Opc = X86::ATOMADD6432; break; - case X86ISD::ATOMSUB64_DAG: Opc = X86::ATOMSUB6432; break; - case X86ISD::ATOMNAND64_DAG: Opc = X86::ATOMNAND6432; break; - case X86ISD::ATOMAND64_DAG: Opc = X86::ATOMAND6432; break; - case X86ISD::ATOMMAX64_DAG: Opc = X86::ATOMMAX6432; break; - case X86ISD::ATOMMIN64_DAG: Opc = X86::ATOMMIN6432; break; - case X86ISD::ATOMUMAX64_DAG: Opc = X86::ATOMUMAX6432; break; - case X86ISD::ATOMUMIN64_DAG: Opc = X86::ATOMUMIN6432; break; - case X86ISD::ATOMSWAP64_DAG: Opc = X86::ATOMSWAP6432; break; - } - SDNode *RetVal = SelectAtomic64(Node, Opc); - if (RetVal) - return RetVal; - break; - } - case ISD::ATOMIC_LOAD_XOR: case ISD::ATOMIC_LOAD_AND: case ISD::ATOMIC_LOAD_OR: diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index cbaf44e..5ccff20 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -44,11 +44,13 @@ #include "llvm/MC/MCContext.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCSymbol.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" #include "llvm/Target/TargetOptions.h" #include <bitset> +#include <numeric> #include <cctype> using namespace llvm; @@ -56,6 +58,17 @@ using namespace llvm; STATISTIC(NumTailCalls, "Number of tail calls"); +static cl::opt<bool> ExperimentalVectorWideningLegalization( + "x86-experimental-vector-widening-legalization", cl::init(false), + cl::desc("Enable an experimental vector type legalization through widening " + "rather than promotion."), + cl::Hidden); + +static cl::opt<bool> ExperimentalVectorShuffleLowering( + "x86-experimental-vector-shuffle-lowering", cl::init(false), + cl::desc("Enable an experimental vector shuffle lowering code path."), + cl::Hidden); + // Forward declarations. 
static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1, SDValue V2); @@ -178,29 +191,28 @@ static SDValue Concat256BitVectors(SDValue V1, SDValue V2, EVT VT, return Insert256BitVector(V, V2, NumElems/2, DAG, dl); } -static TargetLoweringObjectFile *createTLOF(X86TargetMachine &TM) { - const X86Subtarget *Subtarget = &TM.getSubtarget<X86Subtarget>(); - bool is64Bit = Subtarget->is64Bit(); - - if (Subtarget->isTargetMacho()) { - if (is64Bit) +static TargetLoweringObjectFile *createTLOF(const Triple &TT) { + if (TT.isOSBinFormatMachO()) { + if (TT.getArch() == Triple::x86_64) return new X86_64MachoTargetObjectFile(); return new TargetLoweringObjectFileMachO(); } - if (Subtarget->isTargetLinux()) + if (TT.isOSLinux()) return new X86LinuxTargetObjectFile(); - if (Subtarget->isTargetELF()) + if (TT.isOSBinFormatELF()) return new TargetLoweringObjectFileELF(); - if (Subtarget->isTargetKnownWindowsMSVC()) + if (TT.isKnownWindowsMSVCEnvironment()) return new X86WindowsTargetObjectFile(); - if (Subtarget->isTargetCOFF()) + if (TT.isOSBinFormatCOFF()) return new TargetLoweringObjectFileCOFF(); llvm_unreachable("unknown subtarget type"); } +// FIXME: This should stop caching the target machine as soon as +// we can remove resetOperationActions et al. X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) - : TargetLowering(TM, createTLOF(TM)) { + : TargetLowering(TM, createTLOF(Triple(TM.getTargetTriple()))) { Subtarget = &TM.getSubtarget<X86Subtarget>(); X86ScalarSSEf64 = Subtarget->hasSSE2(); X86ScalarSSEf32 = Subtarget->hasSSE1(); @@ -443,7 +455,13 @@ void X86TargetLowering::resetOperationActions() { setOperationAction(ISD::BR_CC , MVT::i16, Expand); setOperationAction(ISD::BR_CC , MVT::i32, Expand); setOperationAction(ISD::BR_CC , MVT::i64, Expand); - setOperationAction(ISD::SELECT_CC , MVT::Other, Expand); + setOperationAction(ISD::SELECT_CC , MVT::f32, Expand); + setOperationAction(ISD::SELECT_CC , MVT::f64, Expand); + setOperationAction(ISD::SELECT_CC , MVT::f80, Expand); + setOperationAction(ISD::SELECT_CC , MVT::i8, Expand); + setOperationAction(ISD::SELECT_CC , MVT::i16, Expand); + setOperationAction(ISD::SELECT_CC , MVT::i32, Expand); + setOperationAction(ISD::SELECT_CC , MVT::i64, Expand); if (Subtarget->is64Bit()) setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16 , Legal); @@ -497,6 +515,14 @@ void X86TargetLowering::resetOperationActions() { } } + // Special handling for half-precision floating point conversions. + // If we don't have F16C support, then lower half float conversions + // into library calls. 
+ if (TM.Options.UseSoftFloat || !Subtarget->hasF16C()) { + setOperationAction(ISD::FP16_TO_FP32, MVT::f32, Expand); + setOperationAction(ISD::FP32_TO_FP16, MVT::i16, Expand); + } + if (Subtarget->hasPOPCNT()) { setOperationAction(ISD::CTPOP , MVT::i8 , Promote); } else { @@ -575,34 +601,18 @@ void X86TargetLowering::resetOperationActions() { // Expand certain atomics for (unsigned i = 0; i != array_lengthof(IntVTs); ++i) { MVT VT = IntVTs[i]; - setOperationAction(ISD::ATOMIC_CMP_SWAP, VT, Custom); + setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom); setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom); setOperationAction(ISD::ATOMIC_STORE, VT, Custom); } - if (!Subtarget->is64Bit()) { - setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Custom); - setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i64, Custom); - setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom); - setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i64, Custom); - setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i64, Custom); - setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i64, Custom); - setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i64, Custom); - setOperationAction(ISD::ATOMIC_SWAP, MVT::i64, Custom); - setOperationAction(ISD::ATOMIC_LOAD_MAX, MVT::i64, Custom); - setOperationAction(ISD::ATOMIC_LOAD_MIN, MVT::i64, Custom); - setOperationAction(ISD::ATOMIC_LOAD_UMAX, MVT::i64, Custom); - setOperationAction(ISD::ATOMIC_LOAD_UMIN, MVT::i64, Custom); - } - if (Subtarget->hasCmpxchg16b()) { - setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i128, Custom); + setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom); } // FIXME - use subtarget debug flags - if (!Subtarget->isTargetDarwin() && - !Subtarget->isTargetELF() && - !Subtarget->isTargetCygMing()) { + if (!Subtarget->isTargetDarwin() && !Subtarget->isTargetELF() && + !Subtarget->isTargetCygMing() && !Subtarget->isTargetWin64()) { setOperationAction(ISD::EH_LABEL, MVT::Other, Expand); } @@ -861,6 +871,7 @@ void X86TargetLowering::resetOperationActions() { setOperationAction(ISD::ZERO_EXTEND, VT, Expand); setOperationAction(ISD::ANY_EXTEND, VT, Expand); setOperationAction(ISD::VSELECT, VT, Expand); + setOperationAction(ISD::SELECT_CC, VT, Expand); for (int InnerVT = MVT::FIRST_VECTOR_VALUETYPE; InnerVT <= MVT::LAST_VECTOR_VALUETYPE; ++InnerVT) setTruncStoreAction(VT, @@ -1433,6 +1444,11 @@ void X86TargetLowering::resetOperationActions() { setOperationAction(ISD::OR, MVT::v16i32, Legal); setOperationAction(ISD::XOR, MVT::v16i32, Legal); + if (Subtarget->hasCDI()) { + setOperationAction(ISD::CTLZ, MVT::v8i64, Legal); + setOperationAction(ISD::CTLZ, MVT::v16i32, Legal); + } + // Custom lower several nodes. for (int i = MVT::FIRST_VECTOR_VALUETYPE; i <= MVT::LAST_VECTOR_VALUETYPE; ++i) { @@ -1563,6 +1579,7 @@ void X86TargetLowering::resetOperationActions() { setTargetDAGCombine(ISD::SINT_TO_FP); setTargetDAGCombine(ISD::SETCC); setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN); + setTargetDAGCombine(ISD::BUILD_VECTOR); if (Subtarget->is64Bit()) setTargetDAGCombine(ISD::MUL); setTargetDAGCombine(ISD::XOR); @@ -1585,6 +1602,16 @@ void X86TargetLowering::resetOperationActions() { setPrefFunctionAlignment(4); // 2^4 bytes. 
} +TargetLoweringBase::LegalizeTypeAction +X86TargetLowering::getPreferredVectorAction(EVT VT) const { + if (ExperimentalVectorWideningLegalization && + VT.getVectorNumElements() != 1 && + VT.getVectorElementType().getSimpleVT() != MVT::i1) + return TypeWidenVector; + + return TargetLoweringBase::getPreferredVectorAction(VT); +} + EVT X86TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const { if (!VT.isVector()) return Subtarget->hasAVX512() ? MVT::i1: MVT::i8; @@ -1725,7 +1752,7 @@ const MCExpr * X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI, const MachineBasicBlock *MBB, unsigned uid,MCContext &Ctx) const{ - assert(getTargetMachine().getRelocationModel() == Reloc::PIC_ && + assert(MBB->getParent()->getTarget().getRelocationModel() == Reloc::PIC_ && Subtarget->isPICStyleGOT()); // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF // entries. @@ -1824,7 +1851,7 @@ X86TargetLowering::CanLowerReturn(CallingConv::ID CallConv, const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const { SmallVector<CCValAssign, 16> RVLocs; - CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(), + CCState CCInfo(CallConv, isVarArg, MF, MF.getTarget(), RVLocs, Context); return CCInfo.CheckReturn(Outs, RetCC_X86); } @@ -1844,7 +1871,7 @@ X86TargetLowering::LowerReturn(SDValue Chain, X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); SmallVector<CCValAssign, 16> RVLocs; - CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(), + CCState CCInfo(CallConv, isVarArg, MF, DAG.getTarget(), RVLocs, *DAG.getContext()); CCInfo.AnalyzeReturn(Outs, RetCC_X86); @@ -2016,7 +2043,7 @@ X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag, SmallVector<CCValAssign, 16> RVLocs; bool Is64Bit = Subtarget->is64Bit(); CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), - getTargetMachine(), RVLocs, *DAG.getContext()); + DAG.getTarget(), RVLocs, *DAG.getContext()); CCInfo.AnalyzeCallResult(Ins, RetCC_X86); // Copy all of the result registers out of their specified physreg. @@ -2166,8 +2193,8 @@ X86TargetLowering::LowerMemArgument(SDValue Chain, unsigned i) const { // Create the nodes corresponding to a load from this parameter slot. ISD::ArgFlagsTy Flags = Ins[i].Flags; - bool AlwaysUseMutable = FuncIsMadeTailCallSafe(CallConv, - getTargetMachine().Options.GuaranteedTailCallOpt); + bool AlwaysUseMutable = FuncIsMadeTailCallSafe( + CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt); bool isImmutable = !AlwaysUseMutable && !Flags.isByVal(); EVT ValVT; @@ -2224,7 +2251,7 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, // Assign locations to all of the incoming arguments. SmallVector<CCValAssign, 16> ArgLocs; - CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(), + CCState CCInfo(CallConv, isVarArg, MF, DAG.getTarget(), ArgLocs, *DAG.getContext()); // Allocate shadow area for Win64 @@ -2388,7 +2415,7 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, TotalNumXMMRegs = 0; if (IsWin64) { - const TargetFrameLowering &TFI = *getTargetMachine().getFrameLowering(); + const TargetFrameLowering &TFI = *MF.getTarget().getFrameLowering(); // Get to the caller-allocated home save location. Add 8 to account // for the return address. int HomeOffset = TFI.getOffsetOfLocalArea() + 8; @@ -2587,7 +2614,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // Analyze operands of the call, assigning locations to each operand. 
SmallVector<CCValAssign, 16> ArgLocs; - CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(), + CCState CCInfo(CallConv, isVarArg, MF, MF.getTarget(), ArgLocs, *DAG.getContext()); // Allocate shadow area for Win64 @@ -2602,7 +2629,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // This is a sibcall. The memory operands are available in caller's // own caller's stack. NumBytes = 0; - else if (getTargetMachine().Options.GuaranteedTailCallOpt && + else if (MF.getTarget().Options.GuaranteedTailCallOpt && IsTailCallConvention(CallConv)) NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG); @@ -2649,7 +2676,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // Walk the register/memloc assignments, inserting copies/loads. In the case // of tail call optimization arguments are handle later. const X86RegisterInfo *RegInfo = - static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo()); + static_cast<const X86RegisterInfo*>(DAG.getTarget().getRegisterInfo()); for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { // Skip inalloca arguments, they have already been written. ISD::ArgFlagsTy Flags = Outs[i].Flags; @@ -2840,7 +2867,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, InFlag = Chain.getValue(1); } - if (getTargetMachine().getCodeModel() == CodeModel::Large) { + if (DAG.getTarget().getCodeModel() == CodeModel::Large) { assert(Is64Bit && "Large code model is only legal in 64-bit mode."); // In the 64-bit large code model, we have to make all calls // through a register, since the call instruction's 32-bit @@ -2864,7 +2891,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // has hidden or protected visibility, or if it is static or local, then // we don't need to use the PLT - we can directly call it. if (Subtarget->isTargetELF() && - getTargetMachine().getRelocationModel() == Reloc::PIC_ && + DAG.getTarget().getRelocationModel() == Reloc::PIC_ && GV->hasDefaultVisibility() && !GV->hasLocalLinkage()) { OpFlags = X86II::MO_PLT; } else if (Subtarget->isPICStyleStubAny() && @@ -2906,7 +2933,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // On ELF targets, in either X86-64 or X86-32 mode, direct calls to // external symbols should go through the PLT. if (Subtarget->isTargetELF() && - getTargetMachine().getRelocationModel() == Reloc::PIC_) { + DAG.getTarget().getRelocationModel() == Reloc::PIC_) { OpFlags = X86II::MO_PLT; } else if (Subtarget->isPICStyleStubAny() && (!Subtarget->getTargetTriple().isMacOSX() || @@ -2945,7 +2972,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, RegsToPass[i].second.getValueType())); // Add a register mask operand representing the call-preserved registers. - const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo(); + const TargetRegisterInfo *TRI = DAG.getTarget().getRegisterInfo(); const uint32_t *Mask = TRI->getCallPreservedMask(CallConv); assert(Mask && "Missing call preserved mask for calling convention"); Ops.push_back(DAG.getRegisterMask(Mask)); @@ -2969,7 +2996,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // Create the CALLSEQ_END node. 
unsigned NumBytesForCalleeToPop; if (X86::isCalleePop(CallConv, Is64Bit, isVarArg, - getTargetMachine().Options.GuaranteedTailCallOpt)) + DAG.getTarget().Options.GuaranteedTailCallOpt)) NumBytesForCalleeToPop = NumBytes; // Callee pops everything else if (!Is64Bit && !IsTailCallConvention(CallConv) && !Subtarget->getTargetTriple().isOSMSVCRT() && @@ -3140,7 +3167,7 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, bool IsCalleeWin64 = Subtarget->isCallingConvWin64(CalleeCC); bool IsCallerWin64 = Subtarget->isCallingConvWin64(CallerCC); - if (getTargetMachine().Options.GuaranteedTailCallOpt) { + if (DAG.getTarget().Options.GuaranteedTailCallOpt) { if (IsTailCallConvention(CalleeCC) && CCMatch) return true; return false; @@ -3152,7 +3179,7 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to // emit a special epilogue. const X86RegisterInfo *RegInfo = - static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo()); + static_cast<const X86RegisterInfo*>(DAG.getTarget().getRegisterInfo()); if (RegInfo->needsStackRealignment(MF)) return false; @@ -3181,7 +3208,7 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, SmallVector<CCValAssign, 16> ArgLocs; CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), - getTargetMachine(), ArgLocs, *DAG.getContext()); + DAG.getTarget(), ArgLocs, *DAG.getContext()); CCInfo.AnalyzeCallOperands(Outs, CC_X86); for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) @@ -3202,7 +3229,7 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, if (Unused) { SmallVector<CCValAssign, 16> RVLocs; CCState CCInfo(CalleeCC, false, DAG.getMachineFunction(), - getTargetMachine(), RVLocs, *DAG.getContext()); + DAG.getTarget(), RVLocs, *DAG.getContext()); CCInfo.AnalyzeCallResult(Ins, RetCC_X86); for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) { CCValAssign &VA = RVLocs[i]; @@ -3216,12 +3243,12 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, if (!CCMatch) { SmallVector<CCValAssign, 16> RVLocs1; CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(), - getTargetMachine(), RVLocs1, *DAG.getContext()); + DAG.getTarget(), RVLocs1, *DAG.getContext()); CCInfo1.AnalyzeCallResult(Ins, RetCC_X86); SmallVector<CCValAssign, 16> RVLocs2; CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(), - getTargetMachine(), RVLocs2, *DAG.getContext()); + DAG.getTarget(), RVLocs2, *DAG.getContext()); CCInfo2.AnalyzeCallResult(Ins, RetCC_X86); if (RVLocs1.size() != RVLocs2.size()) @@ -3248,7 +3275,7 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, // argument is passed on the stack. 
SmallVector<CCValAssign, 16> ArgLocs; CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), - getTargetMachine(), ArgLocs, *DAG.getContext()); + DAG.getTarget(), ArgLocs, *DAG.getContext()); // Allocate shadow area for Win64 if (IsCalleeWin64) @@ -3265,7 +3292,7 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, MachineFrameInfo *MFI = MF.getFrameInfo(); const MachineRegisterInfo *MRI = &MF.getRegInfo(); const X86InstrInfo *TII = - ((const X86TargetMachine&)getTargetMachine()).getInstrInfo(); + static_cast<const X86InstrInfo *>(DAG.getTarget().getInstrInfo()); for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { CCValAssign &VA = ArgLocs[i]; SDValue Arg = OutVals[i]; @@ -3288,12 +3315,12 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, if (!Subtarget->is64Bit() && ((!isa<GlobalAddressSDNode>(Callee) && !isa<ExternalSymbolSDNode>(Callee)) || - getTargetMachine().getRelocationModel() == Reloc::PIC_)) { + DAG.getTarget().getRelocationModel() == Reloc::PIC_)) { unsigned NumInRegs = 0; // In PIC we need an extra register to formulate the address computation // for the callee. unsigned MaxInRegs = - (getTargetMachine().getRelocationModel() == Reloc::PIC_) ? 2 : 3; + (DAG.getTarget().getRelocationModel() == Reloc::PIC_) ? 2 : 3; for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { CCValAssign &VA = ArgLocs[i]; @@ -3417,7 +3444,7 @@ static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT, SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); const X86RegisterInfo *RegInfo = - static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo()); + static_cast<const X86RegisterInfo*>(DAG.getTarget().getRegisterInfo()); X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); int ReturnAddrIndex = FuncInfo->getRAIndex(); @@ -3967,14 +3994,22 @@ static bool isINSERTPSMask(ArrayRef<int> Mask, MVT VT) { unsigned CorrectPosV1 = 0; unsigned CorrectPosV2 = 0; - for (int i = 0, e = (int)VT.getVectorNumElements(); i != e; ++i) + for (int i = 0, e = (int)VT.getVectorNumElements(); i != e; ++i) { + if (Mask[i] == -1) { + ++CorrectPosV1; + ++CorrectPosV2; + continue; + } + if (Mask[i] == i) ++CorrectPosV1; else if (Mask[i] == i + 4) ++CorrectPosV2; + } if (CorrectPosV1 == 3 || CorrectPosV2 == 3) - // We have 3 elements from one vector, and one from another. + // We have 3 elements (undefs count as elements from any vector) from one + // vector, and one from another. return true; return false; @@ -4823,19 +4858,6 @@ static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2, return true; } -/// isSplatVector - Returns true if N is a BUILD_VECTOR node whose elements are -/// all the same. -static bool isSplatVector(SDNode *N) { - if (N->getOpcode() != ISD::BUILD_VECTOR) - return false; - - SDValue SplatValue = N->getOperand(0); - for (unsigned i = 1, e = N->getNumOperands(); i != e; ++i) - if (N->getOperand(i) != SplatValue) - return false; - return true; -} - /// isZeroShuffle - Returns true if N is a VECTOR_SHUFFLE that can be resolved /// to an zero vector. /// FIXME: move to dag combiner / method on ShuffleVectorSDNode @@ -5744,18 +5766,22 @@ static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget* Subtarget, return SDValue(); case ISD::BUILD_VECTOR: { - // The BUILD_VECTOR node must be a splat. 
-    if (!isSplatVector(Op.getNode()))
+    auto *BVOp = cast<BuildVectorSDNode>(Op.getNode());
+    BitVector UndefElements;
+    SDValue Splat = BVOp->getSplatValue(&UndefElements);
+
+    // We need a splat of a single value to use broadcast, and it doesn't
+    // make any sense if the value is only in one element of the vector.
+    if (!Splat || (VT.getVectorNumElements() - UndefElements.count()) <= 1)
       return SDValue();
 
-    Ld = Op.getOperand(0);
+    Ld = Splat;
     ConstSplatVal = (Ld.getOpcode() == ISD::Constant ||
-                    Ld.getOpcode() == ISD::ConstantFP);
+                     Ld.getOpcode() == ISD::ConstantFP);
 
-    // The suspected load node has several users. Make sure that all
-    // of its users are from the BUILD_VECTOR node.
-    // Constants may have multiple users.
-    if (!ConstSplatVal && !Ld->hasNUsesOfValue(VT.getVectorNumElements(), 0))
+    // Make sure that all of the users of a non-constant load are from the
+    // BUILD_VECTOR node.
+    if (!ConstSplatVal && !BVOp->isOnlyUserOf(Ld.getNode()))
       return SDValue();
     break;
   }
@@ -6042,6 +6068,433 @@ X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const {
   return DAG.getNode(ISD::BITCAST, dl, VT, Select);
 }
 
+/// \brief Return true if \p N implements a horizontal binop and return the
+/// operands for the horizontal binop into V0 and V1.
+///
+/// This is a helper function of PerformBUILD_VECTORCombine.
+/// This function checks whether the build_vector \p N implements a
+/// horizontal operation. Parameter \p Opcode defines the kind of horizontal
+/// operation to match.
+/// For example, if \p Opcode is equal to ISD::ADD, then this function
+/// checks whether \p N implements a horizontal arithmetic add; if instead
+/// \p Opcode is equal to ISD::SUB, then this function checks whether \p N
+/// implements a horizontal arithmetic sub.
+///
+/// This function only analyzes elements of \p N whose indices are
+/// in range [BaseIdx, LastIdx).
+static bool isHorizontalBinOp(const BuildVectorSDNode *N, unsigned Opcode,
+                              SelectionDAG &DAG,
+                              unsigned BaseIdx, unsigned LastIdx,
+                              SDValue &V0, SDValue &V1) {
+  EVT VT = N->getValueType(0);
+
+  assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
+  assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
+         "Invalid Vector in input!");
+
+  bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);
+  bool CanFold = true;
+  unsigned ExpectedVExtractIdx = BaseIdx;
+  unsigned NumElts = LastIdx - BaseIdx;
+  V0 = DAG.getUNDEF(VT);
+  V1 = DAG.getUNDEF(VT);
+
+  // Check if N implements a horizontal binop.
+  for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
+    SDValue Op = N->getOperand(i + BaseIdx);
+
+    // Skip UNDEFs.
+    if (Op->getOpcode() == ISD::UNDEF) {
+      // Update the expected vector extract index.
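+      // (Each build_vector element consumes two extracted lanes, hence the
+      // increment by 2 below; once i * 2 reaches NumElts the matcher crosses
+      // from the first source vector to the second, so the expected extract
+      // index restarts at BaseIdx.)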
+      if (i * 2 == NumElts)
+        ExpectedVExtractIdx = BaseIdx;
+      ExpectedVExtractIdx += 2;
+      continue;
+    }
+
+    CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();
+
+    if (!CanFold)
+      break;
+
+    SDValue Op0 = Op.getOperand(0);
+    SDValue Op1 = Op.getOperand(1);
+
+    // Try to match the following pattern:
+    // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))
+    CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+               Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+               Op0.getOperand(0) == Op1.getOperand(0) &&
+               isa<ConstantSDNode>(Op0.getOperand(1)) &&
+               isa<ConstantSDNode>(Op1.getOperand(1)));
+    if (!CanFold)
+      break;
+
+    unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
+    unsigned I1 = cast<ConstantSDNode>(Op1.getOperand(1))->getZExtValue();
+
+    if (i * 2 < NumElts) {
+      if (V0.getOpcode() == ISD::UNDEF)
+        V0 = Op0.getOperand(0);
+    } else {
+      if (V1.getOpcode() == ISD::UNDEF)
+        V1 = Op0.getOperand(0);
+      if (i * 2 == NumElts)
+        ExpectedVExtractIdx = BaseIdx;
+    }
+
+    SDValue Expected = (i * 2 < NumElts) ? V0 : V1;
+    if (I0 == ExpectedVExtractIdx)
+      CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;
+    else if (IsCommutable && I1 == ExpectedVExtractIdx) {
+      // Try to match the following dag sequence:
+      // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))
+      CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;
+    } else
+      CanFold = false;
+
+    ExpectedVExtractIdx += 2;
+  }
+
+  return CanFold;
+}
+
+/// \brief Emit a sequence of two 128-bit horizontal add/sub followed by
+/// a concat_vector.
+///
+/// This is a helper function of PerformBUILD_VECTORCombine.
+/// This function expects two 256-bit vectors called V0 and V1.
+/// At first, each vector is split into two separate 128-bit vectors.
+/// Then, the resulting 128-bit vectors are used to implement two
+/// horizontal binary operations.
+///
+/// The kind of horizontal binary operation is defined by \p X86Opcode.
+///
+/// \p Mode specifies how the 128-bit parts of V0 and V1 are passed to the
+/// two new horizontal binops.
+/// When Mode is set, the first horizontal binop dag node takes as input the
+/// lower 128-bit of V0 and the upper 128-bit of V0. The second
+/// horizontal binop dag node takes as input the lower 128-bit of V1
+/// and the upper 128-bit of V1.
+/// Example:
+///   HADD V0_LO, V0_HI
+///   HADD V1_LO, V1_HI
+///
+/// Otherwise, the first horizontal binop dag node takes as input the lower
+/// 128-bit of V0 and the lower 128-bit of V1, and the second horizontal binop
+/// dag node takes the upper 128-bit of V0 and the upper 128-bit of V1.
+/// Example:
+///   HADD V0_LO, V1_LO
+///   HADD V0_HI, V1_HI
+///
+/// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower
+/// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to
+/// the upper 128-bits of the result.
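+///
+/// For example (illustrative), with v8f32 inputs, \p X86Opcode equal to
+/// X86ISD::FHADD, \p Mode unset and \p isUndefLO set, the result is:
+///   concat_vectors(UNDEF, FHADD(V0_HI, V1_HI))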
+static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1, + SDLoc DL, SelectionDAG &DAG, + unsigned X86Opcode, bool Mode, + bool isUndefLO, bool isUndefHI) { + EVT VT = V0.getValueType(); + assert(VT.is256BitVector() && VT == V1.getValueType() && + "Invalid nodes in input!"); + + unsigned NumElts = VT.getVectorNumElements(); + SDValue V0_LO = Extract128BitVector(V0, 0, DAG, DL); + SDValue V0_HI = Extract128BitVector(V0, NumElts/2, DAG, DL); + SDValue V1_LO = Extract128BitVector(V1, 0, DAG, DL); + SDValue V1_HI = Extract128BitVector(V1, NumElts/2, DAG, DL); + EVT NewVT = V0_LO.getValueType(); + + SDValue LO = DAG.getUNDEF(NewVT); + SDValue HI = DAG.getUNDEF(NewVT); + + if (Mode) { + // Don't emit a horizontal binop if the result is expected to be UNDEF. + if (!isUndefLO && V0->getOpcode() != ISD::UNDEF) + LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI); + if (!isUndefHI && V1->getOpcode() != ISD::UNDEF) + HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI); + } else { + // Don't emit a horizontal binop if the result is expected to be UNDEF. + if (!isUndefLO && (V0_LO->getOpcode() != ISD::UNDEF || + V1_LO->getOpcode() != ISD::UNDEF)) + LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO); + + if (!isUndefHI && (V0_HI->getOpcode() != ISD::UNDEF || + V1_HI->getOpcode() != ISD::UNDEF)) + HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI); + } + + return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI); +} + +/// \brief Try to fold a build_vector that performs an 'addsub' into the +/// sequence of 'vadd + vsub + blendi'. +static SDValue matchAddSub(const BuildVectorSDNode *BV, SelectionDAG &DAG, + const X86Subtarget *Subtarget) { + SDLoc DL(BV); + EVT VT = BV->getValueType(0); + unsigned NumElts = VT.getVectorNumElements(); + SDValue InVec0 = DAG.getUNDEF(VT); + SDValue InVec1 = DAG.getUNDEF(VT); + + assert((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v4f32 || + VT == MVT::v2f64) && "build_vector with an invalid type found!"); + + // Don't try to emit a VSELECT that cannot be lowered into a blend. + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + if (!TLI.isOperationLegalOrCustom(ISD::VSELECT, VT)) + return SDValue(); + + // Odd-numbered elements in the input build vector are obtained from + // adding two integer/float elements. + // Even-numbered elements in the input build vector are obtained from + // subtracting two integer/float elements. + unsigned ExpectedOpcode = ISD::FSUB; + unsigned NextExpectedOpcode = ISD::FADD; + bool AddFound = false; + bool SubFound = false; + + for (unsigned i = 0, e = NumElts; i != e; i++) { + SDValue Op = BV->getOperand(i); + + // Skip 'undef' values. + unsigned Opcode = Op.getOpcode(); + if (Opcode == ISD::UNDEF) { + std::swap(ExpectedOpcode, NextExpectedOpcode); + continue; + } + + // Early exit if we found an unexpected opcode. + if (Opcode != ExpectedOpcode) + return SDValue(); + + SDValue Op0 = Op.getOperand(0); + SDValue Op1 = Op.getOperand(1); + + // Try to match the following pattern: + // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i)) + // Early exit if we cannot match that sequence. + if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT || + Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT || + !isa<ConstantSDNode>(Op0.getOperand(1)) || + !isa<ConstantSDNode>(Op1.getOperand(1)) || + Op0.getOperand(1) != Op1.getOperand(1)) + return SDValue(); + + unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue(); + if (I0 != i) + return SDValue(); + + // We found a valid add/sub node. 
Update the information accordingly.
+    if (i & 1)
+      AddFound = true;
+    else
+      SubFound = true;
+
+    // Update InVec0 and InVec1.
+    if (InVec0.getOpcode() == ISD::UNDEF)
+      InVec0 = Op0.getOperand(0);
+    if (InVec1.getOpcode() == ISD::UNDEF)
+      InVec1 = Op1.getOperand(0);
+
+    // Make sure that the operands of each add/sub node always
+    // come from the same pair of vectors.
+    if (InVec0 != Op0.getOperand(0)) {
+      if (ExpectedOpcode == ISD::FSUB)
+        return SDValue();
+
+      // FADD is commutable. Try to commute the operands
+      // and then test again.
+      std::swap(Op0, Op1);
+      if (InVec0 != Op0.getOperand(0))
+        return SDValue();
+    }
+
+    if (InVec1 != Op1.getOperand(0))
+      return SDValue();
+
+    // Update the pair of expected opcodes.
+    std::swap(ExpectedOpcode, NextExpectedOpcode);
+  }
+
+  // Don't try to fold this build_vector into a VSELECT if it has
+  // too many UNDEF operands.
+  if (AddFound && SubFound && InVec0.getOpcode() != ISD::UNDEF &&
+      InVec1.getOpcode() != ISD::UNDEF) {
+    // Emit a sequence of vector add and sub followed by a VSELECT.
+    // The new VSELECT will be lowered into a BLENDI.
+    // At ISel stage, we pattern-match the sequence 'add + sub + BLENDI'
+    // and emit a single ADDSUB instruction.
+    SDValue Sub = DAG.getNode(ExpectedOpcode, DL, VT, InVec0, InVec1);
+    SDValue Add = DAG.getNode(NextExpectedOpcode, DL, VT, InVec0, InVec1);
+
+    // Construct the VSELECT mask.
+    EVT MaskVT = VT.changeVectorElementTypeToInteger();
+    EVT SVT = MaskVT.getVectorElementType();
+    unsigned SVTBits = SVT.getSizeInBits();
+    SmallVector<SDValue, 8> Ops;
+
+    for (unsigned i = 0, e = NumElts; i != e; ++i) {
+      APInt Value = i & 1 ? APInt::getNullValue(SVTBits) :
+                            APInt::getAllOnesValue(SVTBits);
+      SDValue Constant = DAG.getConstant(Value, SVT);
+      Ops.push_back(Constant);
+    }
+
+    SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, DL, MaskVT, Ops);
+    return DAG.getSelect(DL, VT, Mask, Sub, Add);
+  }
+
+  return SDValue();
+}
+
+static SDValue PerformBUILD_VECTORCombine(SDNode *N, SelectionDAG &DAG,
+                                          const X86Subtarget *Subtarget) {
+  SDLoc DL(N);
+  EVT VT = N->getValueType(0);
+  unsigned NumElts = VT.getVectorNumElements();
+  BuildVectorSDNode *BV = cast<BuildVectorSDNode>(N);
+  SDValue InVec0, InVec1;
+
+  // Try to match an ADDSUB.
+  if ((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
+      (Subtarget->hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) {
+    SDValue Value = matchAddSub(BV, DAG, Subtarget);
+    if (Value.getNode())
+      return Value;
+  }
+
+  // Try to match horizontal ADD/SUB.
+  unsigned NumUndefsLO = 0;
+  unsigned NumUndefsHI = 0;
+  unsigned Half = NumElts/2;
+
+  // Count the number of UNDEF operands in the input build_vector.
+  for (unsigned i = 0, e = Half; i != e; ++i)
+    if (BV->getOperand(i)->getOpcode() == ISD::UNDEF)
+      NumUndefsLO++;
+
+  for (unsigned i = Half, e = NumElts; i != e; ++i)
+    if (BV->getOperand(i)->getOpcode() == ISD::UNDEF)
+      NumUndefsHI++;
+
+  // Early exit if this is either a build_vector of all UNDEFs or all the
+  // operands but one are UNDEF.
+  if (NumUndefsLO + NumUndefsHI + 1 >= NumElts)
+    return SDValue();
+
+  if ((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget->hasSSE3()) {
+    // Try to match an SSE3 float HADD/HSUB.
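+    // For example, the v4f32 build_vector
+    //   ((fadd a0, a1), (fadd a2, a3), (fadd b0, b1), (fadd b2, b3))
+    // with the elements extracted in order from two vectors A and B matches
+    // (FHADD A, B), i.e. a single HADDPS.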
+ if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1)) + return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1); + + if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1)) + return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1); + } else if ((VT == MVT::v4i32 || VT == MVT::v8i16) && Subtarget->hasSSSE3()) { + // Try to match an SSSE3 integer HADD/HSUB. + if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1)) + return DAG.getNode(X86ISD::HADD, DL, VT, InVec0, InVec1); + + if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1)) + return DAG.getNode(X86ISD::HSUB, DL, VT, InVec0, InVec1); + } + + if (!Subtarget->hasAVX()) + return SDValue(); + + if ((VT == MVT::v8f32 || VT == MVT::v4f64)) { + // Try to match an AVX horizontal add/sub of packed single/double + // precision floating point values from 256-bit vectors. + SDValue InVec2, InVec3; + if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, Half, InVec0, InVec1) && + isHorizontalBinOp(BV, ISD::FADD, DAG, Half, NumElts, InVec2, InVec3) && + ((InVec0.getOpcode() == ISD::UNDEF || + InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) && + ((InVec1.getOpcode() == ISD::UNDEF || + InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3)) + return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1); + + if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, Half, InVec0, InVec1) && + isHorizontalBinOp(BV, ISD::FSUB, DAG, Half, NumElts, InVec2, InVec3) && + ((InVec0.getOpcode() == ISD::UNDEF || + InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) && + ((InVec1.getOpcode() == ISD::UNDEF || + InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3)) + return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1); + } else if (VT == MVT::v8i32 || VT == MVT::v16i16) { + // Try to match an AVX2 horizontal add/sub of signed integers. + SDValue InVec2, InVec3; + unsigned X86Opcode; + bool CanFold = true; + + if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, Half, InVec0, InVec1) && + isHorizontalBinOp(BV, ISD::ADD, DAG, Half, NumElts, InVec2, InVec3) && + ((InVec0.getOpcode() == ISD::UNDEF || + InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) && + ((InVec1.getOpcode() == ISD::UNDEF || + InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3)) + X86Opcode = X86ISD::HADD; + else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, Half, InVec0, InVec1) && + isHorizontalBinOp(BV, ISD::SUB, DAG, Half, NumElts, InVec2, InVec3) && + ((InVec0.getOpcode() == ISD::UNDEF || + InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) && + ((InVec1.getOpcode() == ISD::UNDEF || + InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3)) + X86Opcode = X86ISD::HSUB; + else + CanFold = false; + + if (CanFold) { + // Fold this build_vector into a single horizontal add/sub. + // Do this only if the target has AVX2. + if (Subtarget->hasAVX2()) + return DAG.getNode(X86Opcode, DL, VT, InVec0, InVec1); + + // Do not try to expand this build_vector into a pair of horizontal + // add/sub if we can emit a pair of scalar add/sub. + if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half) + return SDValue(); + + // Convert this build_vector into a pair of horizontal binop followed by + // a concat vector. 
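+      // (For example, a v8i32 horizontal add without AVX2 becomes two
+      // 128-bit PHADDs on the low and high halves of the inputs, followed
+      // by a concat_vectors of the two results.)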
+ bool isUndefLO = NumUndefsLO == Half; + bool isUndefHI = NumUndefsHI == Half; + return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, false, + isUndefLO, isUndefHI); + } + } + + if ((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 || + VT == MVT::v16i16) && Subtarget->hasAVX()) { + unsigned X86Opcode; + if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1)) + X86Opcode = X86ISD::HADD; + else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1)) + X86Opcode = X86ISD::HSUB; + else if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1)) + X86Opcode = X86ISD::FHADD; + else if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1)) + X86Opcode = X86ISD::FHSUB; + else + return SDValue(); + + // Don't try to expand this build_vector into a pair of horizontal add/sub + // if we can simply emit a pair of scalar add/sub. + if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half) + return SDValue(); + + // Convert this build_vector into two horizontal add/sub followed by + // a concat vector. + bool isUndefLO = NumUndefsLO == Half; + bool isUndefHI = NumUndefsHI == Half; + return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true, + isUndefLO, isUndefHI); + } + + return SDValue(); +} + SDValue X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); @@ -6429,38 +6882,1160 @@ static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) { return LowerAVXCONCAT_VECTORS(Op, DAG); } -// Try to lower a shuffle node into a simple blend instruction. -static SDValue -LowerVECTOR_SHUFFLEtoBlend(ShuffleVectorSDNode *SVOp, - const X86Subtarget *Subtarget, SelectionDAG &DAG) { - SDValue V1 = SVOp->getOperand(0); - SDValue V2 = SVOp->getOperand(1); - SDLoc dl(SVOp); - MVT VT = SVOp->getSimpleValueType(0); + +//===----------------------------------------------------------------------===// +// Vector shuffle lowering +// +// This is an experimental code path for lowering vector shuffles on x86. It is +// designed to handle arbitrary vector shuffles and blends, gracefully +// degrading performance as necessary. It works hard to recognize idiomatic +// shuffles and lower them to optimal instruction patterns without leaving +// a framework that allows reasonably efficient handling of all vector shuffle +// patterns. +//===----------------------------------------------------------------------===// + +/// \brief Tiny helper function to identify a no-op mask. +/// +/// This is a somewhat boring predicate function. It checks whether the mask +/// array input, which is assumed to be a single-input shuffle mask of the kind +/// used by the X86 shuffle instructions (not a fully general +/// ShuffleVectorSDNode mask) requires any shuffles to occur. Both undef and an +/// in-place shuffle are 'no-op's. +static bool isNoopShuffleMask(ArrayRef<int> Mask) { + for (int i = 0, Size = Mask.size(); i < Size; ++i) + if (Mask[i] != -1 && Mask[i] != i) + return false; + return true; +} + +/// \brief Helper function to classify a mask as a single-input mask. +/// +/// This isn't a generic single-input test because in the vector shuffle +/// lowering we canonicalize single inputs to be the first input operand. This +/// means we can more quickly test for a single input by only checking whether +/// an input from the second operand exists. We also assume that the size of +/// mask corresponds to the size of the input vectors which isn't true in the +/// fully general case. 
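+///
+/// For example, the 4-element mask <0, 3, undef, 2> is single-input, while
+/// <0, 5, undef, 2> is not, because element 5 selects from the second
+/// operand.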
+static bool isSingleInputShuffleMask(ArrayRef<int> Mask) {
+  for (int M : Mask)
+    if (M >= (int)Mask.size())
+      return false;
+  return true;
+}
+
+/// \brief Get a 4-lane 8-bit shuffle immediate for a mask.
+///
+/// This helper function produces an 8-bit shuffle immediate corresponding to
+/// the ubiquitous shuffle encoding scheme used in x86 instructions for
+/// shuffling 4 lanes. It can be used with most of the PSHUF instructions for
+/// example.
+///
+/// NB: We rely heavily on "undef" masks preserving the input lane.
+static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask,
+                                          SelectionDAG &DAG) {
+  assert(Mask.size() == 4 && "Only 4-lane shuffle masks");
+  assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!");
+  assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!");
+  assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!");
+  assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!");
+
+  unsigned Imm = 0;
+  Imm |= (Mask[0] == -1 ? 0 : Mask[0]) << 0;
+  Imm |= (Mask[1] == -1 ? 1 : Mask[1]) << 2;
+  Imm |= (Mask[2] == -1 ? 2 : Mask[2]) << 4;
+  Imm |= (Mask[3] == -1 ? 3 : Mask[3]) << 6;
+  return DAG.getConstant(Imm, MVT::i8);
+}
+
+/// \brief Handle lowering of 2-lane 64-bit floating point shuffles.
+///
+/// This is the basis function for the 2-lane 64-bit shuffles as we have full
+/// support for floating point shuffles but not integer shuffles. These
+/// instructions will incur a domain crossing penalty on some chips though so
+/// it is better to avoid lowering through this for integer vectors where
+/// possible.
+static SDValue lowerV2F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
+                                       const X86Subtarget *Subtarget,
+                                       SelectionDAG &DAG) {
+  SDLoc DL(Op);
+  assert(Op.getSimpleValueType() == MVT::v2f64 && "Bad shuffle type!");
+  assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
+  assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
+  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
+  ArrayRef<int> Mask = SVOp->getMask();
+  assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
+
+  if (isSingleInputShuffleMask(Mask)) {
+    // Straight shuffle of a single input vector. Simulate this by using the
+    // single input as both of the "inputs" to this instruction.
+    unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1);
+    return DAG.getNode(X86ISD::SHUFP, SDLoc(Op), MVT::v2f64, V1, V1,
+                       DAG.getConstant(SHUFPDMask, MVT::i8));
+  }
+  assert(Mask[0] >= 0 && Mask[0] < 2 && "Non-canonicalized blend!");
+  assert(Mask[1] >= 2 && "Non-canonicalized blend!");
+
+  unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
+  return DAG.getNode(X86ISD::SHUFP, SDLoc(Op), MVT::v2f64, V1, V2,
+                     DAG.getConstant(SHUFPDMask, MVT::i8));
+}
+
+/// \brief Handle lowering of 2-lane 64-bit integer shuffles.
+///
+/// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by
+/// the integer unit to minimize domain crossing penalties. However, for blends
+/// it falls back to the floating point shuffle operation with appropriate bit
+/// casting.
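+///
+/// For example, the single-input v2i64 mask <1, 0> is lowered by widening it
+/// to the v4i32 mask <2, 3, 0, 1> and emitting a single PSHUFD.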
+static SDValue lowerV2I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDLoc DL(Op); + assert(Op.getSimpleValueType() == MVT::v2i64 && "Bad shuffle type!"); + assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!"); + assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!"); + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); + ArrayRef<int> Mask = SVOp->getMask(); + assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!"); + + if (isSingleInputShuffleMask(Mask)) { + // Straight shuffle of a single input vector. For everything from SSE2 + // onward this has a single fast instruction with no scary immediates. + // We have to map the mask as it is actually a v4i32 shuffle instruction. + V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V1); + int WidenedMask[4] = { + std::max(Mask[0], 0) * 2, std::max(Mask[0], 0) * 2 + 1, + std::max(Mask[1], 0) * 2, std::max(Mask[1], 0) * 2 + 1}; + return DAG.getNode( + ISD::BITCAST, DL, MVT::v2i64, + DAG.getNode(X86ISD::PSHUFD, SDLoc(Op), MVT::v4i32, V1, + getV4X86ShuffleImm8ForMask(WidenedMask, DAG))); + } + + // We implement this with SHUFPD which is pretty lame because it will likely + // incur 2 cycles of stall for integer vectors on Nehalem and older chips. + // However, all the alternatives are still more cycles and newer chips don't + // have this problem. It would be really nice if x86 had better shuffles here. + V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, V1); + V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, V2); + return DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, + DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask)); +} + +/// \brief Lower 4-lane 32-bit floating point shuffles. +/// +/// Uses instructions exclusively from the floating point unit to minimize +/// domain crossing penalties, as these are sufficient to implement all v4f32 +/// shuffles. +static SDValue lowerV4F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDLoc DL(Op); + assert(Op.getSimpleValueType() == MVT::v4f32 && "Bad shuffle type!"); + assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!"); + assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!"); + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); + ArrayRef<int> Mask = SVOp->getMask(); + assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!"); + + SDValue LowV = V1, HighV = V2; + int NewMask[4] = {Mask[0], Mask[1], Mask[2], Mask[3]}; + + int NumV2Elements = + std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; }); + + if (NumV2Elements == 0) + // Straight shuffle of a single input vector. We pass the input vector to + // both operands to simulate this with a SHUFPS. + return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1, + getV4X86ShuffleImm8ForMask(Mask, DAG)); + + if (NumV2Elements == 1) { + int V2Index = + std::find_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; }) - + Mask.begin(); + // Compute the index adjacent to V2Index and in the same half by toggling + // the low bit. + int V2AdjIndex = V2Index ^ 1; + + if (Mask[V2AdjIndex] == -1) { + // Handles all the cases where we have a single V2 element and an undef. + // This will only ever happen in the high lanes because we commute the + // vector otherwise. + if (V2Index < 2) + std::swap(LowV, HighV); + NewMask[V2Index] -= 4; + } else { + // Handle the case where the V2 element ends up adjacent to a V1 element. 
+      // To make this work, blend them together as the first step.
+      int V1Index = V2AdjIndex;
+      int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};
+      V2 = DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V2, V1,
+                       getV4X86ShuffleImm8ForMask(BlendMask, DAG));
+
+      // Now proceed to reconstruct the final blend as we have the necessary
+      // high or low half formed.
+      if (V2Index < 2) {
+        LowV = V2;
+        HighV = V1;
+      } else {
+        HighV = V2;
+      }
+      NewMask[V1Index] = 2; // We put the V1 element in V2[2].
+      NewMask[V2Index] = 0; // We shifted the V2 element into V2[0].
+    }
+  } else if (NumV2Elements == 2) {
+    if (Mask[0] < 4 && Mask[1] < 4) {
+      // Handle the easy case where we have V1 in the low lanes and V2 in the
+      // high lanes. We never see this reversed because we sort the shuffle.
+      NewMask[2] -= 4;
+      NewMask[3] -= 4;
+    } else {
+      // We have a mixture of V1 and V2 in both low and high lanes. Rather than
+      // trying to place elements directly, just blend them and set up the final
+      // shuffle to place them.
+
+      // The first two blend mask elements are for V1, the second two are for
+      // V2.
+      int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1],
+                          Mask[2] < 4 ? Mask[2] : Mask[3],
+                          (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,
+                          (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};
+      V1 = DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V2,
+                       getV4X86ShuffleImm8ForMask(BlendMask, DAG));
+
+      // Now we do a normal shuffle of V1 by giving V1 as both operands to
+      // a blend.
+      LowV = HighV = V1;
+      NewMask[0] = Mask[0] < 4 ? 0 : 2;
+      NewMask[1] = Mask[0] < 4 ? 2 : 0;
+      NewMask[2] = Mask[2] < 4 ? 1 : 3;
+      NewMask[3] = Mask[2] < 4 ? 3 : 1;
+    }
+  }
+  return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, LowV, HighV,
+                     getV4X86ShuffleImm8ForMask(NewMask, DAG));
+}
+
+/// \brief Lower 4-lane i32 vector shuffles.
+///
+/// We try to handle these with integer-domain shuffles where we can, but for
+/// blends we use the floating point domain blend instructions.
+static SDValue lowerV4I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
+                                       const X86Subtarget *Subtarget,
+                                       SelectionDAG &DAG) {
+  SDLoc DL(Op);
+  assert(Op.getSimpleValueType() == MVT::v4i32 && "Bad shuffle type!");
+  assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
+  assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
+  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
+  ArrayRef<int> Mask = SVOp->getMask();
+  assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
+
+  if (isSingleInputShuffleMask(Mask))
+    // Straight shuffle of a single input vector. For everything from SSE2
+    // onward this has a single fast instruction with no scary immediates.
+    return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
+                       getV4X86ShuffleImm8ForMask(Mask, DAG));
+
+  // We implement this with SHUFPS because it can blend from two vectors.
+  // Because we're going to eventually use SHUFPS, we use SHUFPS even to build
+  // up the inputs, bypassing domain shift penalties that we would incur if we
+  // directly used PSHUFD on Nehalem and older. For newer chips, this isn't
+  // relevant.
+  return DAG.getNode(ISD::BITCAST, DL, MVT::v4i32,
+                     DAG.getVectorShuffle(
+                         MVT::v4f32, DL,
+                         DAG.getNode(ISD::BITCAST, DL, MVT::v4f32, V1),
+                         DAG.getNode(ISD::BITCAST, DL, MVT::v4f32, V2), Mask));
+}
+
+/// \brief Lowering of single-input v8i16 shuffles is the cornerstone of SSE2
+/// shuffle lowering, and the most complex part.
+/// +/// The lowering strategy is to try to form pairs of input lanes which are +/// targeted at the same half of the final vector, and then use a dword shuffle +/// to place them onto the right half, and finally unpack the paired lanes into +/// their final position. +/// +/// The exact breakdown of how to form these dword pairs and align them on the +/// correct sides is really tricky. See the comments within the function for +/// more of the details. +static SDValue lowerV8I16SingleInputVectorShuffle( + SDLoc DL, SDValue V, MutableArrayRef<int> Mask, + const X86Subtarget *Subtarget, SelectionDAG &DAG) { + assert(V.getSimpleValueType() == MVT::v8i16 && "Bad input type!"); + MutableArrayRef<int> LoMask = Mask.slice(0, 4); + MutableArrayRef<int> HiMask = Mask.slice(4, 4); + + SmallVector<int, 4> LoInputs; + std::copy_if(LoMask.begin(), LoMask.end(), std::back_inserter(LoInputs), + [](int M) { return M >= 0; }); + std::sort(LoInputs.begin(), LoInputs.end()); + LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end()); + SmallVector<int, 4> HiInputs; + std::copy_if(HiMask.begin(), HiMask.end(), std::back_inserter(HiInputs), + [](int M) { return M >= 0; }); + std::sort(HiInputs.begin(), HiInputs.end()); + HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end()); + int NumLToL = + std::lower_bound(LoInputs.begin(), LoInputs.end(), 4) - LoInputs.begin(); + int NumHToL = LoInputs.size() - NumLToL; + int NumLToH = + std::lower_bound(HiInputs.begin(), HiInputs.end(), 4) - HiInputs.begin(); + int NumHToH = HiInputs.size() - NumLToH; + MutableArrayRef<int> LToLInputs(LoInputs.data(), NumLToL); + MutableArrayRef<int> LToHInputs(HiInputs.data(), NumLToH); + MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL); + MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH); + + // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all + // such inputs we can swap two of the dwords across the half mark and end up + // with <=2 inputs to each half in each half. Once there, we can fall through + // to the generic code below. For example: + // + // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h] + // Mask: [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5] + // + // Before we had 3-1 in the low half and 3-1 in the high half. Afterward, 2-2 + // and 2-2. + auto balanceSides = [&](ArrayRef<int> ThreeInputs, int OneInput, + int ThreeInputHalfSum, int OneInputHalfOffset) { + // Compute the index of dword with only one word among the three inputs in + // a half by taking the sum of the half with three inputs and subtracting + // the sum of the actual three inputs. The difference is the remaining + // slot. + int DWordA = (ThreeInputHalfSum - + std::accumulate(ThreeInputs.begin(), ThreeInputs.end(), 0)) / + 2; + int DWordB = OneInputHalfOffset / 2 + (OneInput / 2 + 1) % 2; + + int PSHUFDMask[] = {0, 1, 2, 3}; + PSHUFDMask[DWordA] = DWordB; + PSHUFDMask[DWordB] = DWordA; + V = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, + DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, + DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V), + getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG))); + + // Adjust the mask to match the new locations of A and B. + for (int &M : Mask) + if (M != -1 && M/2 == DWordA) + M = 2 * DWordB + M % 2; + else if (M != -1 && M/2 == DWordB) + M = 2 * DWordA + M % 2; + + // Recurse back into this routine to re-compute state now that this isn't + // a 3 and 1 problem. 
+ return DAG.getVectorShuffle(MVT::v8i16, DL, V, DAG.getUNDEF(MVT::v8i16), + Mask); + }; + if (NumLToL == 3 && NumHToL == 1) + return balanceSides(LToLInputs, HToLInputs[0], 0 + 1 + 2 + 3, 4); + else if (NumLToL == 1 && NumHToL == 3) + return balanceSides(HToLInputs, LToLInputs[0], 4 + 5 + 6 + 7, 0); + else if (NumLToH == 1 && NumHToH == 3) + return balanceSides(HToHInputs, LToHInputs[0], 4 + 5 + 6 + 7, 0); + else if (NumLToH == 3 && NumHToH == 1) + return balanceSides(LToHInputs, HToHInputs[0], 0 + 1 + 2 + 3, 4); + + // At this point there are at most two inputs to the low and high halves from + // each half. That means the inputs can always be grouped into dwords and + // those dwords can then be moved to the correct half with a dword shuffle. + // We use at most one low and one high word shuffle to collect these paired + // inputs into dwords, and finally a dword shuffle to place them. + int PSHUFLMask[4] = {-1, -1, -1, -1}; + int PSHUFHMask[4] = {-1, -1, -1, -1}; + int PSHUFDMask[4] = {-1, -1, -1, -1}; + + // First fix the masks for all the inputs that are staying in their + // original halves. This will then dictate the targets of the cross-half + // shuffles. + auto fixInPlaceInputs = [&PSHUFDMask]( + ArrayRef<int> InPlaceInputs, MutableArrayRef<int> SourceHalfMask, + MutableArrayRef<int> HalfMask, int HalfOffset) { + if (InPlaceInputs.empty()) + return; + if (InPlaceInputs.size() == 1) { + SourceHalfMask[InPlaceInputs[0] - HalfOffset] = + InPlaceInputs[0] - HalfOffset; + PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2; + return; + } + + assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!"); + SourceHalfMask[InPlaceInputs[0] - HalfOffset] = + InPlaceInputs[0] - HalfOffset; + // Put the second input next to the first so that they are packed into + // a dword. We find the adjacent index by toggling the low bit. + int AdjIndex = InPlaceInputs[0] ^ 1; + SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset; + std::replace(HalfMask.begin(), HalfMask.end(), InPlaceInputs[1], AdjIndex); + PSHUFDMask[AdjIndex / 2] = AdjIndex / 2; + }; + if (!HToLInputs.empty()) + fixInPlaceInputs(LToLInputs, PSHUFLMask, LoMask, 0); + if (!LToHInputs.empty()) + fixInPlaceInputs(HToHInputs, PSHUFHMask, HiMask, 4); + + // Now gather the cross-half inputs and place them into a free dword of + // their target half. + // FIXME: This operation could almost certainly be simplified dramatically to + // look more like the 3-1 fixing operation. + auto moveInputsToRightHalf = [&PSHUFDMask]( + MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs, + MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask, + int SourceOffset, int DestOffset) { + auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) { + return SourceHalfMask[Word] != -1 && SourceHalfMask[Word] != Word; + }; + auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask, + int Word) { + int LowWord = Word & ~1; + int HighWord = Word | 1; + return isWordClobbered(SourceHalfMask, LowWord) || + isWordClobbered(SourceHalfMask, HighWord); + }; + + if (IncomingInputs.empty()) + return; + + if (ExistingInputs.empty()) { + // Map any dwords with inputs from them into the right half. + for (int Input : IncomingInputs) { + // If the source half mask maps over the inputs, turn those into + // swaps and use the swapped lane. 
+ if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) { + if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] == -1) { + SourceHalfMask[SourceHalfMask[Input - SourceOffset]] = + Input - SourceOffset; + // We have to swap the uses in our half mask in one sweep. + for (int &M : HalfMask) + if (M == SourceHalfMask[Input - SourceOffset]) + M = Input; + else if (M == Input) + M = SourceHalfMask[Input - SourceOffset] + SourceOffset; + } else { + assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] == + Input - SourceOffset && + "Previous placement doesn't match!"); + } + // Note that this correctly re-maps both when we do a swap and when + // we observe the other side of the swap above. We rely on that to + // avoid swapping the members of the input list directly. + Input = SourceHalfMask[Input - SourceOffset] + SourceOffset; + } + + // Map the input's dword into the correct half. + if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] == -1) + PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2; + else + assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] == + Input / 2 && + "Previous placement doesn't match!"); + } + + // And just directly shift any other-half mask elements to be same-half + // as we will have mirrored the dword containing the element into the + // same position within that half. + for (int &M : HalfMask) + if (M >= SourceOffset && M < SourceOffset + 4) { + M = M - SourceOffset + DestOffset; + assert(M >= 0 && "This should never wrap below zero!"); + } + return; + } + + // Ensure we have the input in a viable dword of its current half. This + // is particularly tricky because the original position may be clobbered + // by inputs being moved and *staying* in that half. + if (IncomingInputs.size() == 1) { + if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) { + int InputFixed = std::find(std::begin(SourceHalfMask), + std::end(SourceHalfMask), -1) - + std::begin(SourceHalfMask) + SourceOffset; + SourceHalfMask[InputFixed - SourceOffset] = + IncomingInputs[0] - SourceOffset; + std::replace(HalfMask.begin(), HalfMask.end(), IncomingInputs[0], + InputFixed); + IncomingInputs[0] = InputFixed; + } + } else if (IncomingInputs.size() == 2) { + if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 || + isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) { + int SourceDWordBase = !isDWordClobbered(SourceHalfMask, 0) ? 0 : 2; + assert(!isDWordClobbered(SourceHalfMask, SourceDWordBase) && + "Not all dwords can be clobbered!"); + SourceHalfMask[SourceDWordBase] = IncomingInputs[0] - SourceOffset; + SourceHalfMask[SourceDWordBase + 1] = IncomingInputs[1] - SourceOffset; + for (int &M : HalfMask) + if (M == IncomingInputs[0]) + M = SourceDWordBase + SourceOffset; + else if (M == IncomingInputs[1]) + M = SourceDWordBase + 1 + SourceOffset; + IncomingInputs[0] = SourceDWordBase + SourceOffset; + IncomingInputs[1] = SourceDWordBase + 1 + SourceOffset; + } + } else { + llvm_unreachable("Unhandled input size!"); + } + + // Now hoist the DWord down to the right half. + int FreeDWord = (PSHUFDMask[DestOffset / 2] == -1 ? 
0 : 1) + DestOffset / 2; + assert(PSHUFDMask[FreeDWord] == -1 && "DWord not free"); + PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2; + for (int Input : IncomingInputs) + std::replace(HalfMask.begin(), HalfMask.end(), Input, + FreeDWord * 2 + Input % 2); + }; + moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, + /*SourceOffset*/ 4, /*DestOffset*/ 0); + moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, + /*SourceOffset*/ 0, /*DestOffset*/ 4); + + // Now enact all the shuffles we've computed to move the inputs into their + // target half. + if (!isNoopShuffleMask(PSHUFLMask)) + V = DAG.getNode(X86ISD::PSHUFLW, DL, MVT::v8i16, V, + getV4X86ShuffleImm8ForMask(PSHUFLMask, DAG)); + if (!isNoopShuffleMask(PSHUFHMask)) + V = DAG.getNode(X86ISD::PSHUFHW, DL, MVT::v8i16, V, + getV4X86ShuffleImm8ForMask(PSHUFHMask, DAG)); + if (!isNoopShuffleMask(PSHUFDMask)) + V = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, + DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, + DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V), + getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG))); + + // At this point, each half should contain all its inputs, and we can then + // just shuffle them into their final position. + assert(std::count_if(LoMask.begin(), LoMask.end(), + [](int M) { return M >= 4; }) == 0 && + "Failed to lift all the high half inputs to the low mask!"); + assert(std::count_if(HiMask.begin(), HiMask.end(), + [](int M) { return M >= 0 && M < 4; }) == 0 && + "Failed to lift all the low half inputs to the high mask!"); + + // Do a half shuffle for the low mask. + if (!isNoopShuffleMask(LoMask)) + V = DAG.getNode(X86ISD::PSHUFLW, DL, MVT::v8i16, V, + getV4X86ShuffleImm8ForMask(LoMask, DAG)); + + // Do a half shuffle with the high mask after shifting its values down. + for (int &M : HiMask) + if (M >= 0) + M -= 4; + if (!isNoopShuffleMask(HiMask)) + V = DAG.getNode(X86ISD::PSHUFHW, DL, MVT::v8i16, V, + getV4X86ShuffleImm8ForMask(HiMask, DAG)); + + return V; +} + +/// \brief Detect whether the mask pattern should be lowered through +/// interleaving. +/// +/// This essentially tests whether viewing the mask as an interleaving of two +/// sub-sequences reduces the cross-input traffic of a blend operation. If so, +/// lowering it through interleaving is a significantly better strategy. +static bool shouldLowerAsInterleaving(ArrayRef<int> Mask) { + int NumEvenInputs[2] = {0, 0}; + int NumOddInputs[2] = {0, 0}; + int NumLoInputs[2] = {0, 0}; + int NumHiInputs[2] = {0, 0}; + for (int i = 0, Size = Mask.size(); i < Size; ++i) { + if (Mask[i] < 0) + continue; + + int InputIdx = Mask[i] >= Size; + + if (i < Size / 2) + ++NumLoInputs[InputIdx]; + else + ++NumHiInputs[InputIdx]; + + if ((i % 2) == 0) + ++NumEvenInputs[InputIdx]; + else + ++NumOddInputs[InputIdx]; + } + + // The minimum number of cross-input results for both the interleaved and + // split cases. If interleaving results in fewer cross-input results, return + // true. + int InterleavedCrosses = std::min(NumEvenInputs[1] + NumOddInputs[0], + NumEvenInputs[0] + NumOddInputs[1]); + int SplitCrosses = std::min(NumLoInputs[1] + NumHiInputs[0], + NumLoInputs[0] + NumHiInputs[1]); + return InterleavedCrosses < SplitCrosses; +} + +/// \brief Blend two v8i16 vectors using a naive unpack strategy. +/// +/// This strategy only works when the inputs from each vector fit into a single +/// half of that vector, and generally there are not so many inputs as to leave +/// the in-place shuffles required highly constrained (and thus expensive). 
It
+/// shifts all the inputs into a single side of both input vectors and then
+/// uses an unpack to interleave these inputs in a single vector. At that
+/// point, we will fall back on the generic single input shuffle lowering.
+static SDValue lowerV8I16BasicBlendVectorShuffle(SDLoc DL, SDValue V1,
+                                                 SDValue V2,
+                                                 MutableArrayRef<int> Mask,
+                                                 const X86Subtarget *Subtarget,
+                                                 SelectionDAG &DAG) {
+  assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad input type!");
+  assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad input type!");
+  SmallVector<int, 3> LoV1Inputs, HiV1Inputs, LoV2Inputs, HiV2Inputs;
+  for (int i = 0; i < 8; ++i)
+    if (Mask[i] >= 0 && Mask[i] < 4)
+      LoV1Inputs.push_back(i);
+    else if (Mask[i] >= 4 && Mask[i] < 8)
+      HiV1Inputs.push_back(i);
+    else if (Mask[i] >= 8 && Mask[i] < 12)
+      LoV2Inputs.push_back(i);
+    else if (Mask[i] >= 12)
+      HiV2Inputs.push_back(i);
+
+  int NumV1Inputs = LoV1Inputs.size() + HiV1Inputs.size();
+  int NumV2Inputs = LoV2Inputs.size() + HiV2Inputs.size();
+  (void)NumV1Inputs;
+  (void)NumV2Inputs;
+  assert(NumV1Inputs > 0 && NumV1Inputs <= 3 && "At most 3 inputs supported");
+  assert(NumV2Inputs > 0 && NumV2Inputs <= 3 && "At most 3 inputs supported");
+  assert(NumV1Inputs + NumV2Inputs <= 4 && "At most 4 combined inputs");
+
+  bool MergeFromLo = LoV1Inputs.size() + LoV2Inputs.size() >=
+                     HiV1Inputs.size() + HiV2Inputs.size();
+
+  auto moveInputsToHalf = [&](SDValue V, ArrayRef<int> LoInputs,
+                              ArrayRef<int> HiInputs, bool MoveToLo,
+                              int MaskOffset) {
+    ArrayRef<int> GoodInputs = MoveToLo ? LoInputs : HiInputs;
+    ArrayRef<int> BadInputs = MoveToLo ? HiInputs : LoInputs;
+    if (BadInputs.empty())
+      return V;
+
+    int MoveMask[] = {-1, -1, -1, -1, -1, -1, -1, -1};
+    int MoveOffset = MoveToLo ? 0 : 4;
+
+    if (GoodInputs.empty()) {
+      for (int BadInput : BadInputs) {
+        MoveMask[Mask[BadInput] % 4 + MoveOffset] = Mask[BadInput] - MaskOffset;
+        Mask[BadInput] = Mask[BadInput] % 4 + MoveOffset + MaskOffset;
+      }
+    } else {
+      if (GoodInputs.size() == 2) {
+        // If the low inputs are spread across two dwords, pack them into
+        // a single dword.
+        MoveMask[Mask[GoodInputs[0]] % 2 + MoveOffset] =
+            Mask[GoodInputs[0]] - MaskOffset;
+        MoveMask[Mask[GoodInputs[1]] % 2 + MoveOffset] =
+            Mask[GoodInputs[1]] - MaskOffset;
+        Mask[GoodInputs[0]] = Mask[GoodInputs[0]] % 2 + MoveOffset + MaskOffset;
+        Mask[GoodInputs[1]] = Mask[GoodInputs[1]] % 2 + MoveOffset + MaskOffset;
+      } else {
+        // Otherwise pin the low inputs.
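+        // (That is, each good input keeps the word slot it already occupies
+        // within its half.)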
+ for (int GoodInput : GoodInputs) + MoveMask[Mask[GoodInput] - MaskOffset] = Mask[GoodInput] - MaskOffset; + } + + int MoveMaskIdx = + std::find(std::begin(MoveMask) + MoveOffset, std::end(MoveMask), -1) - + std::begin(MoveMask); + assert(MoveMaskIdx >= MoveOffset && "Established above"); + + if (BadInputs.size() == 2) { + assert(MoveMask[MoveMaskIdx] == -1 && "Expected empty slot"); + assert(MoveMask[MoveMaskIdx + 1] == -1 && "Expected empty slot"); + MoveMask[MoveMaskIdx + Mask[BadInputs[0]] % 2] = + Mask[BadInputs[0]] - MaskOffset; + MoveMask[MoveMaskIdx + Mask[BadInputs[1]] % 2] = + Mask[BadInputs[1]] - MaskOffset; + Mask[BadInputs[0]] = MoveMaskIdx + Mask[BadInputs[0]] % 2 + MaskOffset; + Mask[BadInputs[1]] = MoveMaskIdx + Mask[BadInputs[1]] % 2 + MaskOffset; + } else { + assert(BadInputs.size() == 1 && "All sizes handled"); + MoveMask[MoveMaskIdx] = Mask[BadInputs[0]] - MaskOffset; + Mask[BadInputs[0]] = MoveMaskIdx + MaskOffset; + } + } + + return DAG.getVectorShuffle(MVT::v8i16, DL, V, DAG.getUNDEF(MVT::v8i16), + MoveMask); + }; + V1 = moveInputsToHalf(V1, LoV1Inputs, HiV1Inputs, MergeFromLo, + /*MaskOffset*/ 0); + V2 = moveInputsToHalf(V2, LoV2Inputs, HiV2Inputs, MergeFromLo, + /*MaskOffset*/ 8); + + // FIXME: Select an interleaving of the merge of V1 and V2 that minimizes + // cross-half traffic in the final shuffle. + + // Munge the mask to be a single-input mask after the unpack merges the + // results. + for (int &M : Mask) + if (M != -1) + M = 2 * (M % 4) + (M / 8); + + return DAG.getVectorShuffle( + MVT::v8i16, DL, DAG.getNode(MergeFromLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, + DL, MVT::v8i16, V1, V2), + DAG.getUNDEF(MVT::v8i16), Mask); +} + +/// \brief Generic lowering of 8-lane i16 shuffles. +/// +/// This handles both single-input shuffles and combined shuffle/blends with +/// two inputs. The single input shuffles are immediately delegated to +/// a dedicated lowering routine. +/// +/// The blends are lowered in one of three fundamental ways. If there are few +/// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle +/// of the input is significantly cheaper when lowered as an interleaving of +/// the two inputs, try to interleave them. Otherwise, blend the low and high +/// halves of the inputs separately (making them have relatively few inputs) +/// and then concatenate them. 
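+/// As a rough illustration (masks assumed for exposition, not drawn from
+/// the test suite): <0,1,8,9,-1,-1,-1,-1> takes two elements from each
+/// input, few enough for the UNPCK-based strategy, whereas
+/// <0,8,1,9,4,12,5,13> alternates between the two inputs with no
+/// cross-input traffic at all and is therefore lowered by interleaving.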
+static SDValue lowerV8I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDLoc DL(Op); + assert(Op.getSimpleValueType() == MVT::v8i16 && "Bad shuffle type!"); + assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!"); + assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!"); + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); + ArrayRef<int> OrigMask = SVOp->getMask(); + int MaskStorage[8] = {OrigMask[0], OrigMask[1], OrigMask[2], OrigMask[3], + OrigMask[4], OrigMask[5], OrigMask[6], OrigMask[7]}; + MutableArrayRef<int> Mask(MaskStorage); + + assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!"); + + auto isV1 = [](int M) { return M >= 0 && M < 8; }; + auto isV2 = [](int M) { return M >= 8; }; + + int NumV1Inputs = std::count_if(Mask.begin(), Mask.end(), isV1); + int NumV2Inputs = std::count_if(Mask.begin(), Mask.end(), isV2); + + if (NumV2Inputs == 0) + return lowerV8I16SingleInputVectorShuffle(DL, V1, Mask, Subtarget, DAG); + + assert(NumV1Inputs > 0 && "All single-input shuffles should be canonicalized " + "to be V1-input shuffles."); + + if (NumV1Inputs + NumV2Inputs <= 4) + return lowerV8I16BasicBlendVectorShuffle(DL, V1, V2, Mask, Subtarget, DAG); + + // Check whether an interleaving lowering is likely to be more efficient. + // This isn't perfect but it is a strong heuristic that tends to work well on + // the kinds of shuffles that show up in practice. + // + // FIXME: Handle 1x, 2x, and 4x interleaving. + if (shouldLowerAsInterleaving(Mask)) { + // FIXME: Figure out whether we should pack these into the low or high + // halves. + + int EMask[8], OMask[8]; + for (int i = 0; i < 4; ++i) { + EMask[i] = Mask[2*i]; + OMask[i] = Mask[2*i + 1]; + EMask[i + 4] = -1; + OMask[i + 4] = -1; + } + + SDValue Evens = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, EMask); + SDValue Odds = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, OMask); + + return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, Evens, Odds); + } + + int LoBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1}; + int HiBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1}; + + for (int i = 0; i < 4; ++i) { + LoBlendMask[i] = Mask[i]; + HiBlendMask[i] = Mask[i + 4]; + } + + SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, LoBlendMask); + SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, HiBlendMask); + LoV = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, LoV); + HiV = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, HiV); + + return DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, + DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, LoV, HiV)); +} + +/// \brief Generic lowering of v16i8 shuffles. +/// +/// This is a hybrid strategy to lower v16i8 vectors. It first attempts to +/// detect any complexity reducing interleaving. If that doesn't help, it uses +/// UNPCK to spread the i8 elements across two i16-element vectors, and uses +/// the existing lowering for v8i16 blends on each half, finally PACK-ing them +/// back together. 
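+/// For intuition, an assumed example: the single-input splat mask
+/// <5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5> repeats every byte in adjacent pairs,
+/// so it can be widened via byte duplication into a v8i16 lane splat before
+/// any blending work is needed.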
+static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDLoc DL(Op); + assert(Op.getSimpleValueType() == MVT::v16i8 && "Bad shuffle type!"); + assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!"); + assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!"); + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); + ArrayRef<int> OrigMask = SVOp->getMask(); + assert(OrigMask.size() == 16 && "Unexpected mask size for v16 shuffle!"); + int MaskStorage[16] = { + OrigMask[0], OrigMask[1], OrigMask[2], OrigMask[3], + OrigMask[4], OrigMask[5], OrigMask[6], OrigMask[7], + OrigMask[8], OrigMask[9], OrigMask[10], OrigMask[11], + OrigMask[12], OrigMask[13], OrigMask[14], OrigMask[15]}; + MutableArrayRef<int> Mask(MaskStorage); + MutableArrayRef<int> LoMask = Mask.slice(0, 8); + MutableArrayRef<int> HiMask = Mask.slice(8, 8); + + // For single-input shuffles, there are some nicer lowering tricks we can use. + if (isSingleInputShuffleMask(Mask)) { + // Check whether we can widen this to an i16 shuffle by duplicating bytes. + // Notably, this handles splat and partial-splat shuffles more efficiently. + // However, it only makes sense if the pre-duplication shuffle simplifies + // things significantly. Currently, this means we need to be able to + // express the pre-duplication shuffle as an i16 shuffle. + // + // FIXME: We should check for other patterns which can be widened into an + // i16 shuffle as well. + auto canWidenViaDuplication = [](ArrayRef<int> Mask) { + for (int i = 0; i < 16; i += 2) { + if (Mask[i] != Mask[i + 1]) + return false; + } + return true; + }; + auto tryToWidenViaDuplication = [&]() -> SDValue { + if (!canWidenViaDuplication(Mask)) + return SDValue(); + SmallVector<int, 4> LoInputs; + std::copy_if(Mask.begin(), Mask.end(), std::back_inserter(LoInputs), + [](int M) { return M >= 0 && M < 8; }); + std::sort(LoInputs.begin(), LoInputs.end()); + LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), + LoInputs.end()); + SmallVector<int, 4> HiInputs; + std::copy_if(Mask.begin(), Mask.end(), std::back_inserter(HiInputs), + [](int M) { return M >= 8; }); + std::sort(HiInputs.begin(), HiInputs.end()); + HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), + HiInputs.end()); + + bool TargetLo = LoInputs.size() >= HiInputs.size(); + ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs; + ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs; + + int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1}; + SmallDenseMap<int, int, 8> LaneMap; + for (int I : InPlaceInputs) { + PreDupI16Shuffle[I/2] = I/2; + LaneMap[I] = I; + } + int j = TargetLo ? 0 : 4, je = j + 4; + for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) { + // Check if j is already a shuffle of this input. This happens when + // there are two adjacent bytes after we move the low one. + if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) { + // If we haven't yet mapped the input, search for a slot into which + // we can map it. + while (j < je && PreDupI16Shuffle[j] != -1) + ++j; + + if (j == je) + // We can't place the inputs into a single half with a simple i16 shuffle, so bail. + return SDValue(); + + // Map this input with the i16 shuffle. + PreDupI16Shuffle[j] = MovingInputs[i] / 2; + } + + // Update the lane map based on the mapping we ended up with. 
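+ // (Worked numbers, assumed: if byte 9 of the source was assigned the
+ // free i16 slot j == 1 above, it now lives at byte 2 * 1 + 9 % 2 == 3 of
+ // the shuffled vector, which is what this update records.)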
+ LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2; + } + V1 = DAG.getNode( + ISD::BITCAST, DL, MVT::v16i8, + DAG.getVectorShuffle(MVT::v8i16, DL, + DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1), + DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle)); + + // Unpack the bytes to form the i16s that will be shuffled into place. + V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL, + MVT::v16i8, V1, V1); + + int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1}; + for (int i = 0; i < 16; i += 2) { + if (Mask[i] != -1) + PostDupI16Shuffle[i / 2] = LaneMap[Mask[i]] - (TargetLo ? 0 : 8); + assert(PostDupI16Shuffle[i / 2] < 8 && "Invalid v8 shuffle mask!"); + } + return DAG.getNode( + ISD::BITCAST, DL, MVT::v16i8, + DAG.getVectorShuffle(MVT::v8i16, DL, + DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1), + DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle)); + }; + if (SDValue V = tryToWidenViaDuplication()) + return V; + } + + // Check whether an interleaving lowering is likely to be more efficient. + // This isn't perfect but it is a strong heuristic that tends to work well on + // the kinds of shuffles that show up in practice. + // + // FIXME: We need to handle other interleaving widths (i16, i32, ...). + if (shouldLowerAsInterleaving(Mask)) { + // FIXME: Figure out whether we should pack these into the low or high + // halves. + + int EMask[16], OMask[16]; + for (int i = 0; i < 8; ++i) { + EMask[i] = Mask[2*i]; + OMask[i] = Mask[2*i + 1]; + EMask[i + 8] = -1; + OMask[i + 8] = -1; + } + + SDValue Evens = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2, EMask); + SDValue Odds = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2, OMask); + + return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, Evens, Odds); + } + + int V1LoBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1}; + int V1HiBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1}; + int V2LoBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1}; + int V2HiBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1}; + + auto buildBlendMasks = [](MutableArrayRef<int> HalfMask, + MutableArrayRef<int> V1HalfBlendMask, + MutableArrayRef<int> V2HalfBlendMask) { + for (int i = 0; i < 8; ++i) + if (HalfMask[i] >= 0 && HalfMask[i] < 16) { + V1HalfBlendMask[i] = HalfMask[i]; + HalfMask[i] = i; + } else if (HalfMask[i] >= 16) { + V2HalfBlendMask[i] = HalfMask[i] - 16; + HalfMask[i] = i + 8; + } + }; + buildBlendMasks(LoMask, V1LoBlendMask, V2LoBlendMask); + buildBlendMasks(HiMask, V1HiBlendMask, V2HiBlendMask); + + SDValue Zero = getZeroVector(MVT::v8i16, Subtarget, DAG, DL); + + auto buildLoAndHiV8s = [&](SDValue V, MutableArrayRef<int> LoBlendMask, + MutableArrayRef<int> HiBlendMask) { + SDValue V1, V2; + // Check if any of the odd lanes in the v16i8 are used. If not, we can mask + // them out and avoid using UNPCK{L,H} to extract the elements of V as + // i16s. + if (std::none_of(LoBlendMask.begin(), LoBlendMask.end(), + [](int M) { return M >= 0 && M % 2 == 1; }) && + std::none_of(HiBlendMask.begin(), HiBlendMask.end(), + [](int M) { return M >= 0 && M % 2 == 1; })) { + // Use a mask to drop the high bytes. + V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V); + V1 = DAG.getNode(ISD::AND, DL, MVT::v8i16, V1, + DAG.getConstant(0x00FF, MVT::v8i16)); + + // This will be a single vector shuffle instead of a blend so nuke V2. + V2 = DAG.getUNDEF(MVT::v8i16); + + // Squash the masks to point directly into V1. 
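+ // (Illustrative: once the odd bytes are masked to zero, an even byte
+ // index such as 6 addresses i16 lane 6 / 2 == 3, which is exactly what
+ // the division below computes.)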
+ for (int &M : LoBlendMask) + if (M >= 0) + M /= 2; + for (int &M : HiBlendMask) + if (M >= 0) + M /= 2; + } else { + // Otherwise just unpack the low half of V into V1 and the high half into + // V2 so that we can blend them as i16s. + V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, + DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero)); + V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, + DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero)); + } + + SDValue BlendedLo = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, LoBlendMask); + SDValue BlendedHi = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, HiBlendMask); + return std::make_pair(BlendedLo, BlendedHi); + }; + SDValue V1Lo, V1Hi, V2Lo, V2Hi; + std::tie(V1Lo, V1Hi) = buildLoAndHiV8s(V1, V1LoBlendMask, V1HiBlendMask); + std::tie(V2Lo, V2Hi) = buildLoAndHiV8s(V2, V2LoBlendMask, V2HiBlendMask); + + SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, V1Lo, V2Lo, LoMask); + SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, V1Hi, V2Hi, HiMask); + + return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV); +} + +/// \brief Dispatching routine to lower various 128-bit x86 vector shuffles. +/// +/// This routine breaks down the specific type of 128-bit shuffle and +/// dispatches to the lowering routines accordingly. +static SDValue lower128BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2, + MVT VT, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + switch (VT.SimpleTy) { + case MVT::v2i64: + return lowerV2I64VectorShuffle(Op, V1, V2, Subtarget, DAG); + case MVT::v2f64: + return lowerV2F64VectorShuffle(Op, V1, V2, Subtarget, DAG); + case MVT::v4i32: + return lowerV4I32VectorShuffle(Op, V1, V2, Subtarget, DAG); + case MVT::v4f32: + return lowerV4F32VectorShuffle(Op, V1, V2, Subtarget, DAG); + case MVT::v8i16: + return lowerV8I16VectorShuffle(Op, V1, V2, Subtarget, DAG); + case MVT::v16i8: + return lowerV16I8VectorShuffle(Op, V1, V2, Subtarget, DAG); + + default: + llvm_unreachable("Unimplemented!"); + } +} + +/// \brief Tiny helper function to test whether each pair of adjacent mask +/// elements is sequential. +static bool areAdjacentMasksSequential(ArrayRef<int> Mask) { + for (int i = 0, Size = Mask.size(); i < Size; i += 2) + if (Mask[i] + 1 != Mask[i+1]) + return false; + + return true; +} + +/// \brief Top-level lowering for x86 vector shuffles. +/// +/// This handles decomposition, canonicalization, and lowering of all x86 +/// vector shuffles. Most of the specific lowering strategies are encapsulated +/// above in helper routines. The canonicalization attempts to widen shuffles +/// to involve fewer lanes of wider elements, and to consolidate symmetric +/// patterns such that only one of the two inputs needs to be tested, etc. +static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); + ArrayRef<int> Mask = SVOp->getMask(); + SDValue V1 = Op.getOperand(0); + SDValue V2 = Op.getOperand(1); + MVT VT = Op.getSimpleValueType(); + int NumElements = VT.getVectorNumElements(); + SDLoc dl(Op); + + assert(VT.getSizeInBits() != 64 && "Can't lower MMX shuffles"); + + bool V1IsUndef = V1.getOpcode() == ISD::UNDEF; + bool V2IsUndef = V2.getOpcode() == ISD::UNDEF; + if (V1IsUndef && V2IsUndef) + return DAG.getUNDEF(VT); + + // When we create a shuffle node we put the UNDEF node as the second + // operand, but in some cases the first operand may also be transformed to + // UNDEF. In that case we should just commute the node.
+ if (V1IsUndef) + return CommuteVectorShuffle(SVOp, DAG); + + // Check for non-undef masks pointing at an undef vector and make the masks + // undef as well. This makes it easier to match the shuffle based solely on + // the mask. + if (V2IsUndef) + for (int M : Mask) + if (M >= NumElements) { + SmallVector<int, 8> NewMask(Mask.begin(), Mask.end()); + for (int &M : NewMask) + if (M >= NumElements) + M = -1; + return DAG.getVectorShuffle(VT, dl, V1, V2, NewMask); + } + + // For integer vector shuffles, try to collapse them into a shuffle of fewer + // lanes but wider integers. We cap this to not form integers larger than i64 + // but it might be interesting to form i128 integers to handle flipping the + // low and high halves of AVX 256-bit vectors. + if (VT.isInteger() && VT.getScalarSizeInBits() < 64 && + areAdjacentMasksSequential(Mask)) { + SmallVector<int, 8> NewMask; + for (int i = 0, Size = Mask.size(); i < Size; i += 2) + NewMask.push_back(Mask[i] / 2); + MVT NewVT = + MVT::getVectorVT(MVT::getIntegerVT(VT.getScalarSizeInBits() * 2), + VT.getVectorNumElements() / 2); + V1 = DAG.getNode(ISD::BITCAST, dl, NewVT, V1); + V2 = DAG.getNode(ISD::BITCAST, dl, NewVT, V2); + return DAG.getNode(ISD::BITCAST, dl, VT, + DAG.getVectorShuffle(NewVT, dl, V1, V2, NewMask)); + } + + int NumV1Elements = 0, NumUndefElements = 0, NumV2Elements = 0; + for (int M : SVOp->getMask()) + if (M < 0) + ++NumUndefElements; + else if (M < NumElements) + ++NumV1Elements; + else + ++NumV2Elements; + + // Commute the shuffle as needed such that more elements come from V1 than + // V2. This allows us to match the shuffle pattern strictly on how many + // elements come from V1 without handling the symmetric cases. + if (NumV2Elements > NumV1Elements) + return CommuteVectorShuffle(SVOp, DAG); + + // When the number of V1 and V2 elements is the same, try to minimize the + // number of uses of V2 in the low half of the vector. + if (NumV1Elements == NumV2Elements) { + int LowV1Elements = 0, LowV2Elements = 0; + for (int M : SVOp->getMask().slice(0, NumElements / 2)) + if (M >= NumElements) + ++LowV2Elements; + else if (M >= 0) + ++LowV1Elements; + if (LowV2Elements > LowV1Elements) + return CommuteVectorShuffle(SVOp, DAG); + } + + // For each vector width, delegate to a specialized lowering routine. + if (VT.getSizeInBits() == 128) + return lower128BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG); + + llvm_unreachable("Unimplemented!"); +} + + +//===----------------------------------------------------------------------===// +// Legacy vector shuffle lowering +// +// This is the legacy code path for handling vector shuffles; it remains in +// place until the lowering above matches it in both functionality and +// performance. +//===----------------------------------------------------------------------===// + +static bool isBlendMask(ArrayRef<int> MaskVals, MVT VT, bool hasSSE41, + bool hasInt256, unsigned *MaskOut = nullptr) { MVT EltVT = VT.getVectorElementType(); - unsigned NumElems = VT.getVectorNumElements(); // There is no blend with immediate in AVX-512. if (VT.is512BitVector()) - return SDValue(); + return false; - if (!Subtarget->hasSSE41() || EltVT == MVT::i8) - return SDValue(); - if (!Subtarget->hasInt256() && VT == MVT::v16i16) - return SDValue(); + if (!hasSSE41 || EltVT == MVT::i8) + return false; + if (!hasInt256 && VT == MVT::v16i16) + return false; - // Check the mask for BLEND and build the value. unsigned MaskValue = 0; + unsigned NumElems = VT.getVectorNumElements(); // There are 2 lanes if (NumElems > 8), and 1 lane otherwise.
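// (For example: v16i16 gives (16 - 1) / 8 + 1 == 2 lanes, while v8i16 gives
// (8 - 1) / 8 + 1 == 1.)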
- unsigned NumLanes = (NumElems-1)/8 + 1; + unsigned NumLanes = (NumElems - 1) / 8 + 1; unsigned NumElemsInLane = NumElems / NumLanes; // Blend for v16i16 should be symmetric for both lanes. for (unsigned i = 0; i < NumElemsInLane; ++i) { - int SndLaneEltIdx = (NumLanes == 2) ? - SVOp->getMaskElt(i + NumElemsInLane) : -1; - int EltIdx = SVOp->getMaskElt(i); + int SndLaneEltIdx = (NumLanes == 2) ? MaskVals[i + NumElemsInLane] : -1; + int EltIdx = MaskVals[i]; if ((EltIdx < 0 || EltIdx == (int)i) && (SndLaneEltIdx < 0 || SndLaneEltIdx == (int)(i + NumElemsInLane))) @@ -6469,11 +8044,34 @@ LowerVECTOR_SHUFFLEtoBlend(ShuffleVectorSDNode *SVOp, if (((unsigned)EltIdx == (i + NumElems)) && (SndLaneEltIdx < 0 || (unsigned)SndLaneEltIdx == i + NumElems + NumElemsInLane)) - MaskValue |= (1<<i); + MaskValue |= (1 << i); else - return SDValue(); + return false; } + if (MaskOut) + *MaskOut = MaskValue; + return true; +} + +// Try to lower a shuffle node into a simple blend instruction. +// This function assumes isBlendMask returns true for this +// ShuffleVectorSDNode. +static SDValue LowerVECTOR_SHUFFLEtoBlend(ShuffleVectorSDNode *SVOp, + unsigned MaskValue, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + MVT VT = SVOp->getSimpleValueType(0); + MVT EltVT = VT.getVectorElementType(); + assert(isBlendMask(SVOp->getMask(), VT, Subtarget->hasSSE41(), + Subtarget->hasInt256()) && + "Trying to lower a VECTOR_SHUFFLE to a Blend but with the wrong mask"); + SDValue V1 = SVOp->getOperand(0); + SDValue V2 = SVOp->getOperand(1); + SDLoc dl(SVOp); + unsigned NumElems = VT.getVectorNumElements(); + // Convert i32 vectors to floating point if AVX2 is not available. // AVX2 introduced VPBLENDD instruction for 128 and 256-bit vectors. MVT BlendVT = VT; @@ -7450,8 +9048,9 @@ static SDValue getINSERTPS(ShuffleVectorSDNode *SVOp, SDLoc &dl, assert((VT == MVT::v4f32 || VT == MVT::v4i32) && "unsupported vector type for insertps/pinsrd"); - int FromV1 = std::count_if(Mask.begin(), Mask.end(), - [](const int &i) { return i < 4; }); + auto FromV1Predicate = [](const int &i) { return i < 4 && i > -1; }; + auto FromV2Predicate = [](const int &i) { return i >= 4; }; + int FromV1 = std::count_if(Mask.begin(), Mask.end(), FromV1Predicate); SDValue From; SDValue To; @@ -7459,23 +9058,26 @@ static SDValue getINSERTPS(ShuffleVectorSDNode *SVOp, SDLoc &dl, if (FromV1 == 1) { From = V1; To = V2; - DestIndex = std::find_if(Mask.begin(), Mask.end(), - [](const int &i) { return i < 4; }) - + DestIndex = std::find_if(Mask.begin(), Mask.end(), FromV1Predicate) - Mask.begin(); } else { + assert(std::count_if(Mask.begin(), Mask.end(), FromV2Predicate) == 1 && + "More than one element from V1 and from V2, or no elements from one " + "of the vectors. This case should not have returned true from " + "isINSERTPSMask"); From = V2; To = V1; - DestIndex = std::find_if(Mask.begin(), Mask.end(), - [](const int &i) { return i >= 4; }) - - Mask.begin(); + DestIndex = + std::find_if(Mask.begin(), Mask.end(), FromV2Predicate) - Mask.begin(); } + unsigned SrcIndex = Mask[DestIndex] % 4; if (MayFoldLoad(From)) { // Trivial case, when From comes from a load and is only used by the // shuffle. Make it use insertps from the vector that we need from that // load.
SDValue NewLoad = - NarrowVectorLoadToElement(cast<LoadSDNode>(From), DestIndex, DAG); + NarrowVectorLoadToElement(cast<LoadSDNode>(From), SrcIndex, DAG); if (!NewLoad.getNode()) return SDValue(); @@ -7496,7 +9098,6 @@ static SDValue getINSERTPS(ShuffleVectorSDNode *SVOp, SDLoc &dl, } // Vector-element-to-vector - unsigned SrcIndex = Mask[DestIndex] % 4; SDValue InsertpsMask = DAG.getIntPtrConstant(DestIndex << 4 | SrcIndex << 6); return DAG.getNode(X86ISD::INSERTPS, dl, VT, To, From, InsertpsMask); } @@ -7663,6 +9264,11 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { bool OptForSize = MF.getFunction()->getAttributes(). hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize); + // Check if we should use the experimental vector shuffle lowering. If so, + // delegate completely to that code path. + if (ExperimentalVectorShuffleLowering) + return lowerVectorShuffle(Op, Subtarget, DAG); + assert(VT.getSizeInBits() != 64 && "Can't lower MMX shuffles"); if (V1IsUndef && V2IsUndef) @@ -7796,8 +9402,13 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { bool Commuted = false; // FIXME: This should also accept a bitcast of a splat? Be careful, not // 1,1,1,1 -> v8i16 though. - V1IsSplat = isSplatVector(V1.getNode()); - V2IsSplat = isSplatVector(V2.getNode()); + BitVector UndefElements; + if (auto *BVOp = dyn_cast<BuildVectorSDNode>(V1.getNode())) + if (BVOp->getConstantSplatNode(&UndefElements) && UndefElements.none()) + V1IsSplat = true; + if (auto *BVOp = dyn_cast<BuildVectorSDNode>(V2.getNode())) + if (BVOp->getConstantSplatNode(&UndefElements) && UndefElements.none()) + V2IsSplat = true; // Canonicalize the splat or undef, if present, to be on the RHS. if (!V2IsUndef && V1IsSplat && !V2IsSplat) { @@ -7873,6 +9484,11 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { getShufflePSHUFLWImmediate(SVOp), DAG); + unsigned MaskValue; + if (isBlendMask(M, VT, Subtarget->hasSSE41(), Subtarget->hasInt256(), + &MaskValue)) + return LowerVECTOR_SHUFFLEtoBlend(SVOp, MaskValue, Subtarget, DAG); + if (isSHUFPMask(M, VT)) return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V1, V2, getShuffleSHUFImmediate(SVOp), DAG); @@ -7910,10 +9526,6 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { return getTargetShuffleNode(X86ISD::VPERM2X128, dl, VT, V1, V2, getShuffleVPERM2X128Immediate(SVOp), DAG); - SDValue BlendOp = LowerVECTOR_SHUFFLEtoBlend(SVOp, Subtarget, DAG); - if (BlendOp.getNode()) - return BlendOp; - if (Subtarget->hasSSE41() && isINSERTPSMask(M, VT)) return getINSERTPS(SVOp, dl, DAG); @@ -8530,7 +10142,7 @@ X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const { // global base reg. unsigned char OpFlag = 0; unsigned WrapperKind = X86ISD::Wrapper; - CodeModel::Model M = getTargetMachine().getCodeModel(); + CodeModel::Model M = DAG.getTarget().getCodeModel(); if (Subtarget->isPICStyleRIPRel() && (M == CodeModel::Small || M == CodeModel::Kernel)) @@ -8563,7 +10175,7 @@ SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const { // global base reg. unsigned char OpFlag = 0; unsigned WrapperKind = X86ISD::Wrapper; - CodeModel::Model M = getTargetMachine().getCodeModel(); + CodeModel::Model M = DAG.getTarget().getCodeModel(); if (Subtarget->isPICStyleRIPRel() && (M == CodeModel::Small || M == CodeModel::Kernel)) @@ -8596,7 +10208,7 @@ X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const { // global base reg. 
unsigned char OpFlag = 0; unsigned WrapperKind = X86ISD::Wrapper; - CodeModel::Model M = getTargetMachine().getCodeModel(); + CodeModel::Model M = DAG.getTarget().getCodeModel(); if (Subtarget->isPICStyleRIPRel() && (M == CodeModel::Small || M == CodeModel::Kernel)) { @@ -8617,7 +10229,7 @@ X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const { Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result); // With PIC, the address is actually $g + Offset. - if (getTargetMachine().getRelocationModel() == Reloc::PIC_ && + if (DAG.getTarget().getRelocationModel() == Reloc::PIC_ && !Subtarget->is64Bit()) { Result = DAG.getNode(ISD::ADD, DL, getPointerTy(), DAG.getNode(X86ISD::GlobalBaseReg, @@ -8639,7 +10251,7 @@ X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const { // Create the TargetBlockAddressAddress node. unsigned char OpFlags = Subtarget->ClassifyBlockAddressReference(); - CodeModel::Model M = getTargetMachine().getCodeModel(); + CodeModel::Model M = DAG.getTarget().getCodeModel(); const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress(); int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset(); SDLoc dl(Op); @@ -8668,8 +10280,8 @@ X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, SDLoc dl, // Create the TargetGlobalAddress node, folding in the constant // offset if it is legal. unsigned char OpFlags = - Subtarget->ClassifyGlobalReference(GV, getTargetMachine()); - CodeModel::Model M = getTargetMachine().getCodeModel(); + Subtarget->ClassifyGlobalReference(GV, DAG.getTarget()); + CodeModel::Model M = DAG.getTarget().getCodeModel(); SDValue Result; if (OpFlags == X86II::MO_NO_FLAG && X86::isOffsetSuitableForCodeModel(Offset, M)) { @@ -8868,7 +10480,7 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { const GlobalValue *GV = GA->getGlobal(); if (Subtarget->isTargetELF()) { - TLSModel::Model model = getTargetMachine().getTLSModel(GV); + TLSModel::Model model = DAG.getTarget().getTLSModel(GV); switch (model) { case TLSModel::GeneralDynamic: @@ -8880,9 +10492,9 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { Subtarget->is64Bit()); case TLSModel::InitialExec: case TLSModel::LocalExec: - return LowerToTLSExecModel(GA, DAG, getPointerTy(), model, - Subtarget->is64Bit(), - getTargetMachine().getRelocationModel() == Reloc::PIC_); + return LowerToTLSExecModel( + GA, DAG, getPointerTy(), model, Subtarget->is64Bit(), + DAG.getTarget().getRelocationModel() == Reloc::PIC_); } llvm_unreachable("Unknown TLS model."); } @@ -8895,8 +10507,8 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the // global base reg. - bool PIC32 = (getTargetMachine().getRelocationModel() == Reloc::PIC_) && - !Subtarget->is64Bit(); + bool PIC32 = (DAG.getTarget().getRelocationModel() == Reloc::PIC_) && + !Subtarget->is64Bit(); if (PIC32) OpFlag = X86II::MO_TLVP_PIC_BASE; else @@ -10050,10 +11662,27 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, SDLoc dl, break; case X86::COND_G: case X86::COND_GE: case X86::COND_L: case X86::COND_LE: - case X86::COND_O: case X86::COND_NO: - NeedOF = true; + case X86::COND_O: case X86::COND_NO: { + // Check whether we really need to set the overflow flag. If the node + // carries the NoSignedWrap flag, signed overflow cannot occur, so the + // overflow flag is not actually needed.
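+ // For instance (hypothetical IR, not part of this patch): an 'add nsw'
+ // feeding a signed comparison is known not to overflow, so its existing
+ // EFLAGS result can be reused without a separate TEST to compute OF.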
+ switch (Op->getOpcode()) { + case ISD::ADD: + case ISD::SUB: + case ISD::MUL: + case ISD::SHL: { + const BinaryWithFlagsSDNode *BinNode = + cast<BinaryWithFlagsSDNode>(Op.getNode()); + if (BinNode->hasNoSignedWrap()) + break; + // Otherwise fall through to the default case and request the overflow flag. + } + default: + NeedOF = true; + break; + } break; } + } // See if we can use the EFLAGS value from the operand instead of // doing a separate TEST. TEST always sets OF and CF to 0, so unless // we prove that the arithmetic won't overflow, we can't use OF or CF. @@ -10115,14 +11744,14 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, SDLoc dl, if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(ArithOp.getNode()->getOperand(1))) { // An add of one will be selected as an INC. - if (C->getAPIntValue() == 1) { + if (C->getAPIntValue() == 1 && !Subtarget->slowIncDec()) { Opcode = X86ISD::INC; NumOperands = 1; break; } // An add of negative one (subtract of one) will be selected as a DEC. - if (C->getAPIntValue().isAllOnesValue()) { + if (C->getAPIntValue().isAllOnesValue() && !Subtarget->slowIncDec()) { Opcode = X86ISD::DEC; NumOperands = 1; break; @@ -10138,7 +11767,7 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, SDLoc dl, // If we have a constant logical shift that's only used in a comparison // against zero, turn it into an equivalent AND. This allows turning it into // a TEST instruction later. - if ((X86CC == X86::COND_E || X86CC == X86::COND_NE) && + if ((X86CC == X86::COND_E || X86CC == X86::COND_NE) && Op->hasOneUse() && isa<ConstantSDNode>(Op->getOperand(1)) && !hasNonFlagsUse(Op)) { EVT VT = Op.getValueType(); unsigned BitWidth = VT.getSizeInBits(); @@ -11469,8 +13098,9 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { } if (addTest) { - CC = DAG.getConstant(X86::COND_NE, MVT::i8); - Cond = EmitTest(Cond, X86::COND_NE, dl, DAG); + X86::CondCode X86Cond = Inverted ? X86::COND_E : X86::COND_NE; + CC = DAG.getConstant(X86Cond, MVT::i8); + Cond = EmitTest(Cond, X86Cond, dl, DAG); } Cond = ConvertCmpIfNecessary(Cond, DAG); return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), @@ -11513,7 +13143,7 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT); Chain = SP.getValue(1); unsigned Align = cast<ConstantSDNode>(Tmp3)->getZExtValue(); - const TargetFrameLowering &TFI = *getTargetMachine().getFrameLowering(); + const TargetFrameLowering &TFI = *DAG.getTarget().getFrameLowering(); unsigned StackAlign = TFI.getStackAlignment(); Tmp1 = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value if (Align > StackAlign) @@ -11572,7 +13202,7 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Flag); const X86RegisterInfo *RegInfo = - static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo()); + static_cast<const X86RegisterInfo*>(DAG.getTarget().getRegisterInfo()); unsigned SPReg = RegInfo->getStackRegister(); SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy); Chain = SP.getValue(1); @@ -11681,7 +13311,7 @@ SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { if (ArgMode == 2) { // Sanity Check: Make sure using fp_offset makes sense.
- assert(!getTargetMachine().Options.UseSoftFloat && + assert(!DAG.getTarget().Options.UseSoftFloat && !(DAG.getMachineFunction() .getFunction()->getAttributes() .hasAttribute(AttributeSet::FunctionIndex, @@ -12158,11 +13788,37 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) { Op.getOperand(1), Op.getOperand(2)); } + case Intrinsic::x86_sse2_packssdw_128: + case Intrinsic::x86_sse2_packsswb_128: + case Intrinsic::x86_avx2_packssdw: + case Intrinsic::x86_avx2_packsswb: + return DAG.getNode(X86ISD::PACKSS, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); + + case Intrinsic::x86_sse2_packuswb_128: + case Intrinsic::x86_sse41_packusdw: + case Intrinsic::x86_avx2_packuswb: + case Intrinsic::x86_avx2_packusdw: + return DAG.getNode(X86ISD::PACKUS, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); + case Intrinsic::x86_ssse3_pshuf_b_128: case Intrinsic::x86_avx2_pshuf_b: return DAG.getNode(X86ISD::PSHUFB, dl, Op.getValueType(), Op.getOperand(1), Op.getOperand(2)); + case Intrinsic::x86_sse2_pshuf_d: + return DAG.getNode(X86ISD::PSHUFD, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); + + case Intrinsic::x86_sse2_pshufl_w: + return DAG.getNode(X86ISD::PSHUFLW, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); + + case Intrinsic::x86_sse2_pshufh_w: + return DAG.getNode(X86ISD::PSHUFHW, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); + case Intrinsic::x86_ssse3_psign_b_128: case Intrinsic::x86_ssse3_psign_w_128: case Intrinsic::x86_ssse3_psign_d_128: @@ -12610,6 +14266,51 @@ static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, return SDValue(Res, 0); } +// getReadPerformanceCounter - Handles the lowering of builtin intrinsics that +// read performance monitor counters (x86_rdpmc). +static void getReadPerformanceCounter(SDNode *N, SDLoc DL, + SelectionDAG &DAG, const X86Subtarget *Subtarget, + SmallVectorImpl<SDValue> &Results) { + assert(N->getNumOperands() == 3 && "Unexpected number of operands!"); + SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); + SDValue LO, HI; + + // The ECX register is used to select the index of the performance counter + // to read. + SDValue Chain = DAG.getCopyToReg(N->getOperand(0), DL, X86::ECX, + N->getOperand(2)); + SDValue rd = DAG.getNode(X86ISD::RDPMC_DAG, DL, Tys, Chain); + + // Reads the content of a 64-bit performance counter and returns it in the + // registers EDX:EAX. + if (Subtarget->is64Bit()) { + LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1)); + HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64, + LO.getValue(2)); + } else { + LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1)); + HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32, + LO.getValue(2)); + } + Chain = HI.getValue(1); + + if (Subtarget->is64Bit()) { + // The EAX register is loaded with the low-order 32 bits. The EDX register + // is loaded with the supported high-order bits of the counter. + SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI, + DAG.getConstant(32, MVT::i8)); + Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp)); + Results.push_back(Chain); + return; + } + + // Use a buildpair to merge the two 32-bit values into a 64-bit one. 
+ SDValue Ops[] = { LO, HI }; + SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops); + Results.push_back(Pair); + Results.push_back(Chain); +} + // getReadTimeStampCounter - Handles the lowering of builtin intrinsics that // read the time stamp counter (x86_rdtsc and x86_rdtscp). This function is // also used to custom lower READCYCLECOUNTER nodes. @@ -12674,7 +14375,7 @@ static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget *Subtarget, } enum IntrinsicType { - GATHER, SCATTER, PREFETCH, RDSEED, RDRAND, RDTSC, XTEST + GATHER, SCATTER, PREFETCH, RDSEED, RDRAND, RDPMC, RDTSC, XTEST }; struct IntrinsicData { @@ -12768,6 +14469,8 @@ static void InitIntinsicsMap() { IntrinsicData(RDTSC, X86ISD::RDTSC_DAG, 0))); IntrMap.insert(std::make_pair(Intrinsic::x86_rdtscp, IntrinsicData(RDTSC, X86ISD::RDTSCP_DAG, 0))); + IntrMap.insert(std::make_pair(Intrinsic::x86_rdpmc, + IntrinsicData(RDPMC, X86ISD::RDPMC_DAG, 0))); Initialized = true; } @@ -12826,7 +14529,7 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget, case PREFETCH: { SDValue Hint = Op.getOperand(6); unsigned HintVal; - if (dyn_cast<ConstantSDNode> (Hint) == 0 || + if (dyn_cast<ConstantSDNode> (Hint) == nullptr || (HintVal = dyn_cast<ConstantSDNode> (Hint)->getZExtValue()) > 1) llvm_unreachable("Wrong prefetch hint in intrinsic: should be 0 or 1"); unsigned Opcode = (HintVal ? Intr.Opc1 : Intr.Opc0); @@ -12843,6 +14546,12 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget, getReadTimeStampCounter(Op.getNode(), dl, Intr.Opc0, DAG, Subtarget, Results); return DAG.getMergeValues(Results, dl); } + // Read Performance Monitoring Counters. + case RDPMC: { + SmallVector<SDValue, 2> Results; + getReadPerformanceCounter(Op.getNode(), dl, DAG, Subtarget, Results); + return DAG.getMergeValues(Results, dl); + } // XTEST intrinsics. 
case XTEST: { SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other); @@ -12873,7 +14582,7 @@ SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op, if (Depth > 0) { SDValue FrameAddr = LowerFRAMEADDR(Op, DAG); const X86RegisterInfo *RegInfo = - static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo()); + static_cast<const X86RegisterInfo*>(DAG.getTarget().getRegisterInfo()); SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), PtrVT); return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getNode(ISD::ADD, dl, PtrVT, @@ -12895,7 +14604,7 @@ SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); // FIXME probably not meaningful unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); const X86RegisterInfo *RegInfo = - static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo()); + static_cast<const X86RegisterInfo*>(DAG.getTarget().getRegisterInfo()); unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction()); assert(((FrameReg == X86::RBP && VT == MVT::i64) || (FrameReg == X86::EBP && VT == MVT::i32)) && @@ -12924,7 +14633,7 @@ unsigned X86TargetLowering::getRegisterByName(const char* RegName, SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op, SelectionDAG &DAG) const { const X86RegisterInfo *RegInfo = - static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo()); + static_cast<const X86RegisterInfo*>(DAG.getTarget().getRegisterInfo()); return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize()); } @@ -12936,7 +14645,7 @@ SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const { EVT PtrVT = getPointerTy(); const X86RegisterInfo *RegInfo = - static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo()); + static_cast<const X86RegisterInfo*>(DAG.getTarget().getRegisterInfo()); unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction()); assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) || (FrameReg == X86::EBP && PtrVT == MVT::i32)) && @@ -12983,7 +14692,7 @@ SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op, SDLoc dl (Op); const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue(); - const TargetRegisterInfo* TRI = getTargetMachine().getRegisterInfo(); + const TargetRegisterInfo* TRI = DAG.getTarget().getRegisterInfo(); if (Subtarget->is64Bit()) { SDValue OutChains[6]; @@ -13431,7 +15140,7 @@ SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) cons CLI.setDebugLoc(dl).setChain(InChain) .setCallee(getLibcallCallingConv(LC), static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()), - Callee, &Args, 0) + Callee, std::move(Args), 0) .setInRegister().setSExtResult(isSigned).setZExtResult(!isSigned); std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI); @@ -13448,7 +15157,7 @@ static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget *Subtarget, (VT == MVT::v8i32 && Subtarget->hasInt256())); // Get the high parts. - const int Mask[] = {1, 2, 3, 4, 5, 6, 7, 8}; + const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1}; SDValue Hi0 = DAG.getVectorShuffle(VT, dl, Op0, Op0, Mask); SDValue Hi1 = DAG.getVectorShuffle(VT, dl, Op1, Op1, Mask); @@ -13464,10 +15173,18 @@ static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget *Subtarget, DAG.getNode(Opcode, dl, MulVT, Hi0, Hi1)); // Shuffle it back into the right order. 
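+ // (Illustrative walk-through for the v4i32 case: Mul1 holds the 64-bit
+ // products of lanes 0 and 2 and Mul2 those of lanes 1 and 3; viewed as
+ // v4i32 each is a sequence of <lo,hi> pairs, so the masks below select
+ // <1,5,3,7> for the high halves and <0,4,2,6> for the low halves.)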
- const int HighMask[] = {1, 5, 3, 7, 9, 13, 11, 15}; - SDValue Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask); - const int LowMask[] = {0, 4, 2, 6, 8, 12, 10, 14}; - SDValue Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask); + SDValue Highs, Lows; + if (VT == MVT::v8i32) { + const int HighMask[] = {1, 9, 3, 11, 5, 13, 7, 15}; + Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask); + const int LowMask[] = {0, 8, 2, 10, 4, 12, 6, 14}; + Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask); + } else { + const int HighMask[] = {1, 5, 3, 7}; + Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask); + const int LowMask[] = {0, 4, 2, 6}; + Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask); + } // If we have a signed multiply but no PMULDQ, fix up the high parts of an // unsigned multiply. @@ -13494,10 +15211,9 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG, SDValue Amt = Op.getOperand(1); // Optimize shl/srl/sra with constant shift amount. - if (isSplatVector(Amt.getNode())) { - SDValue SclrAmt = Amt->getOperand(0); - if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(SclrAmt)) { - uint64_t ShiftAmt = C->getZExtValue(); + if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) { + if (auto *ShiftConst = BVAmt->getConstantSplatNode()) { + uint64_t ShiftAmt = ShiftConst->getZExtValue(); if (VT == MVT::v2i64 || VT == MVT::v4i32 || VT == MVT::v8i16 || (Subtarget->hasInt256() && @@ -13804,15 +15520,14 @@ static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG, static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget, SelectionDAG &DAG) { MVT VT = Op.getSimpleValueType(); SDLoc dl(Op); SDValue R = Op.getOperand(0); SDValue Amt = Op.getOperand(1); SDValue V; - if (!Subtarget->hasSSE2()) - return SDValue(); + assert(VT.isVector() && "Custom lowering only for vector shifts!"); + assert(Subtarget->hasSSE2() && "Only custom lower when we have SSE2!"); V = LowerScalarImmediateShift(Op, DAG, Subtarget); if (V.getNode()) @@ -14254,7 +15969,7 @@ static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget *Subtarget, break; } SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg, - Op.getOperand(2), SDValue()); + Op.getOperand(2), SDValue()); SDValue Ops[] = { cpIn.getValue(0), Op.getOperand(1), Op.getOperand(3), @@ -14264,9 +15979,18 @@ static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget *Subtarget, MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand(); SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys, Ops, T, MMO); + SDValue cpOut = DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1)); - return cpOut; + SDValue EFLAGS = DAG.getCopyFromReg(cpOut.getValue(1), DL, X86::EFLAGS, + MVT::i32, cpOut.getValue(2)); + SDValue Success = DAG.getNode(X86ISD::SETCC, DL, Op->getValueType(1), + DAG.getConstant(X86::COND_E, MVT::i8), EFLAGS); + + DAG.ReplaceAllUsesOfValueWith(Op.getValue(0), cpOut); + DAG.ReplaceAllUsesOfValueWith(Op.getValue(1), Success); + DAG.ReplaceAllUsesOfValueWith(Op.getValue(2), EFLAGS.getValue(1)); + return SDValue(); } static SDValue LowerBITCAST(SDValue Op, const X86Subtarget *Subtarget, @@ -14422,7 +16146,7 @@ static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget *Subtarget, TargetLowering::CallLoweringInfo CLI(DAG); CLI.setDebugLoc(dl).setChain(DAG.getEntryNode()) - .setCallee(CallingConv::C, RetTy, Callee, &Args, 0); + .setCallee(CallingConv::C, RetTy, Callee, std::move(Args), 0); std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI); @@
-14446,7 +16170,8 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { default: llvm_unreachable("Should not custom lower this!"); case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op,DAG); case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, Subtarget, DAG); - case ISD::ATOMIC_CMP_SWAP: return LowerCMP_SWAP(Op, Subtarget, DAG); + case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: + return LowerCMP_SWAP(Op, Subtarget, DAG); case ISD::ATOMIC_LOAD_SUB: return LowerLOAD_SUB(Op,DAG); case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op,DAG); case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG); @@ -14528,8 +16253,8 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { } static void ReplaceATOMIC_LOAD(SDNode *Node, - SmallVectorImpl<SDValue> &Results, - SelectionDAG &DAG) { + SmallVectorImpl<SDValue> &Results, + SelectionDAG &DAG) { SDLoc dl(Node); EVT VT = cast<AtomicSDNode>(Node)->getMemoryVT(); @@ -14538,38 +16263,16 @@ static void ReplaceATOMIC_LOAD(SDNode *Node, // (The only way to get a 16-byte load is cmpxchg16b) // FIXME: 16-byte ATOMIC_CMP_SWAP isn't actually hooked up at the moment. SDValue Zero = DAG.getConstant(0, VT); - SDValue Swap = DAG.getAtomic(ISD::ATOMIC_CMP_SWAP, dl, VT, - Node->getOperand(0), - Node->getOperand(1), Zero, Zero, - cast<AtomicSDNode>(Node)->getMemOperand(), - cast<AtomicSDNode>(Node)->getOrdering(), - cast<AtomicSDNode>(Node)->getOrdering(), - cast<AtomicSDNode>(Node)->getSynchScope()); + SDVTList VTs = DAG.getVTList(VT, MVT::i1, MVT::Other); + SDValue Swap = + DAG.getAtomicCmpSwap(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, dl, VT, VTs, + Node->getOperand(0), Node->getOperand(1), Zero, Zero, + cast<AtomicSDNode>(Node)->getMemOperand(), + cast<AtomicSDNode>(Node)->getOrdering(), + cast<AtomicSDNode>(Node)->getOrdering(), + cast<AtomicSDNode>(Node)->getSynchScope()); Results.push_back(Swap.getValue(0)); - Results.push_back(Swap.getValue(1)); -} - -static void -ReplaceATOMIC_BINARY_64(SDNode *Node, SmallVectorImpl<SDValue>&Results, - SelectionDAG &DAG, unsigned NewOp) { - SDLoc dl(Node); - assert (Node->getValueType(0) == MVT::i64 && - "Only know how to expand i64 atomics"); - - SDValue Chain = Node->getOperand(0); - SDValue In1 = Node->getOperand(1); - SDValue In2L = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, - Node->getOperand(2), DAG.getIntPtrConstant(0)); - SDValue In2H = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, - Node->getOperand(2), DAG.getIntPtrConstant(1)); - SDValue Ops[] = { Chain, In1, In2L, In2H }; - SDVTList Tys = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other); - SDValue Result = - DAG.getMemIntrinsicNode(NewOp, dl, Tys, Ops, MVT::i64, - cast<MemSDNode>(Node)->getMemOperand()); - SDValue OpsF[] = { Result.getValue(0), Result.getValue(1)}; - Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF)); - Results.push_back(Result.getValue(2)); + Results.push_back(Swap.getValue(2)); } /// ReplaceNodeResults - Replace a node with an illegal result type @@ -14656,13 +16359,15 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, case Intrinsic::x86_rdtscp: return getReadTimeStampCounter(N, dl, X86ISD::RDTSCP_DAG, DAG, Subtarget, Results); + case Intrinsic::x86_rdpmc: + return getReadPerformanceCounter(N, dl, DAG, Subtarget, Results); } } case ISD::READCYCLECOUNTER: { return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget, Results); } - case ISD::ATOMIC_CMP_SWAP: { + case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: { EVT T = N->getValueType(0); assert((T == MVT::i64 || T == 
MVT::i128) && "can only expand cmpxchg pair"); bool Regs64bit = T == MVT::i128; @@ -14704,61 +16409,33 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, Regs64bit ? X86::RDX : X86::EDX, HalfT, cpOutL.getValue(2)); SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)}; + + SDValue EFLAGS = DAG.getCopyFromReg(cpOutH.getValue(1), dl, X86::EFLAGS, + MVT::i32, cpOutH.getValue(2)); + SDValue Success = + DAG.getNode(X86ISD::SETCC, dl, MVT::i8, + DAG.getConstant(X86::COND_E, MVT::i8), EFLAGS); + Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1)); + Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF)); - Results.push_back(cpOutH.getValue(1)); + Results.push_back(Success); + Results.push_back(EFLAGS.getValue(1)); return; } + case ISD::ATOMIC_SWAP: case ISD::ATOMIC_LOAD_ADD: + case ISD::ATOMIC_LOAD_SUB: case ISD::ATOMIC_LOAD_AND: - case ISD::ATOMIC_LOAD_NAND: case ISD::ATOMIC_LOAD_OR: - case ISD::ATOMIC_LOAD_SUB: case ISD::ATOMIC_LOAD_XOR: - case ISD::ATOMIC_LOAD_MAX: + case ISD::ATOMIC_LOAD_NAND: case ISD::ATOMIC_LOAD_MIN: - case ISD::ATOMIC_LOAD_UMAX: + case ISD::ATOMIC_LOAD_MAX: case ISD::ATOMIC_LOAD_UMIN: - case ISD::ATOMIC_SWAP: { - unsigned Opc; - switch (N->getOpcode()) { - default: llvm_unreachable("Unexpected opcode"); - case ISD::ATOMIC_LOAD_ADD: - Opc = X86ISD::ATOMADD64_DAG; - break; - case ISD::ATOMIC_LOAD_AND: - Opc = X86ISD::ATOMAND64_DAG; - break; - case ISD::ATOMIC_LOAD_NAND: - Opc = X86ISD::ATOMNAND64_DAG; - break; - case ISD::ATOMIC_LOAD_OR: - Opc = X86ISD::ATOMOR64_DAG; - break; - case ISD::ATOMIC_LOAD_SUB: - Opc = X86ISD::ATOMSUB64_DAG; - break; - case ISD::ATOMIC_LOAD_XOR: - Opc = X86ISD::ATOMXOR64_DAG; - break; - case ISD::ATOMIC_LOAD_MAX: - Opc = X86ISD::ATOMMAX64_DAG; - break; - case ISD::ATOMIC_LOAD_MIN: - Opc = X86ISD::ATOMMIN64_DAG; - break; - case ISD::ATOMIC_LOAD_UMAX: - Opc = X86ISD::ATOMUMAX64_DAG; - break; - case ISD::ATOMIC_LOAD_UMIN: - Opc = X86ISD::ATOMUMIN64_DAG; - break; - case ISD::ATOMIC_SWAP: - Opc = X86ISD::ATOMSWAP64_DAG; - break; - } - ReplaceATOMIC_BINARY_64(N, Results, DAG, Opc); - return; - } + case ISD::ATOMIC_LOAD_UMAX: + // Delegate to generic TypeLegalization. Situations we can really handle + // should have already been dealt with by X86AtomicExpandPass.cpp. + break; case ISD::ATOMIC_LOAD: { ReplaceATOMIC_LOAD(N, Results, DAG); return; @@ -14779,6 +16456,13 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, MVT::v2f64, N->getOperand(0)); SDValue ToVecInt = DAG.getNode(ISD::BITCAST, dl, WiderVT, Expanded); + if (ExperimentalVectorWideningLegalization) { + // If we are legalizing vectors by widening, we already have the desired + // legal vector type, just return it.
+ Results.push_back(ToVecInt); + return; + } + SmallVector<SDValue, 8> Elts; for (unsigned i = 0, e = NumElts; i != e; ++i) Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT, @@ -14810,6 +16494,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::FST: return "X86ISD::FST"; case X86ISD::CALL: return "X86ISD::CALL"; case X86ISD::RDTSC_DAG: return "X86ISD::RDTSC_DAG"; + case X86ISD::RDTSCP_DAG: return "X86ISD::RDTSCP_DAG"; + case X86ISD::RDPMC_DAG: return "X86ISD::RDPMC_DAG"; case X86ISD::BT: return "X86ISD::BT"; case X86ISD::CMP: return "X86ISD::CMP"; case X86ISD::COMI: return "X86ISD::COMI"; @@ -14863,12 +16549,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::FNSTSW16r: return "X86ISD::FNSTSW16r"; case X86ISD::LCMPXCHG_DAG: return "X86ISD::LCMPXCHG_DAG"; case X86ISD::LCMPXCHG8_DAG: return "X86ISD::LCMPXCHG8_DAG"; - case X86ISD::ATOMADD64_DAG: return "X86ISD::ATOMADD64_DAG"; - case X86ISD::ATOMSUB64_DAG: return "X86ISD::ATOMSUB64_DAG"; - case X86ISD::ATOMOR64_DAG: return "X86ISD::ATOMOR64_DAG"; - case X86ISD::ATOMXOR64_DAG: return "X86ISD::ATOMXOR64_DAG"; - case X86ISD::ATOMAND64_DAG: return "X86ISD::ATOMAND64_DAG"; - case X86ISD::ATOMNAND64_DAG: return "X86ISD::ATOMNAND64_DAG"; + case X86ISD::LCMPXCHG16_DAG: return "X86ISD::LCMPXCHG16_DAG"; case X86ISD::VZEXT_MOVL: return "X86ISD::VZEXT_MOVL"; case X86ISD::VZEXT_LOAD: return "X86ISD::VZEXT_LOAD"; case X86ISD::VZEXT: return "X86ISD::VZEXT"; @@ -14909,6 +16590,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::TESTM: return "X86ISD::TESTM"; case X86ISD::TESTNM: return "X86ISD::TESTNM"; case X86ISD::KORTEST: return "X86ISD::KORTEST"; + case X86ISD::PACKSS: return "X86ISD::PACKSS"; + case X86ISD::PACKUS: return "X86ISD::PACKUS"; case X86ISD::PALIGNR: return "X86ISD::PALIGNR"; case X86ISD::PSHUFD: return "X86ISD::PSHUFD"; case X86ISD::PSHUFHW: return "X86ISD::PSHUFHW"; @@ -15173,7 +16856,8 @@ X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M, isUNPCKLMask(M, SVT, Subtarget->hasInt256()) || isUNPCKHMask(M, SVT, Subtarget->hasInt256()) || isUNPCKL_v_undef_Mask(M, SVT, Subtarget->hasInt256()) || - isUNPCKH_v_undef_Mask(M, SVT, Subtarget->hasInt256())); + isUNPCKH_v_undef_Mask(M, SVT, Subtarget->hasInt256()) || + isBlendMask(M, SVT, Subtarget->hasSSE41(), Subtarget->hasInt256())); } bool @@ -15256,685 +16940,6 @@ static MachineBasicBlock *EmitXBegin(MachineInstr *MI, MachineBasicBlock *MBB, return sinkMBB; } -// Get CMPXCHG opcode for the specified data type. -static unsigned getCmpXChgOpcode(EVT VT) { - switch (VT.getSimpleVT().SimpleTy) { - case MVT::i8: return X86::LCMPXCHG8; - case MVT::i16: return X86::LCMPXCHG16; - case MVT::i32: return X86::LCMPXCHG32; - case MVT::i64: return X86::LCMPXCHG64; - default: - break; - } - llvm_unreachable("Invalid operand size!"); -} - -// Get LOAD opcode for the specified data type. -static unsigned getLoadOpcode(EVT VT) { - switch (VT.getSimpleVT().SimpleTy) { - case MVT::i8: return X86::MOV8rm; - case MVT::i16: return X86::MOV16rm; - case MVT::i32: return X86::MOV32rm; - case MVT::i64: return X86::MOV64rm; - default: - break; - } - llvm_unreachable("Invalid operand size!"); -} - -// Get opcode of the non-atomic one from the specified atomic instruction. 
-static unsigned getNonAtomicOpcode(unsigned Opc) { - switch (Opc) { - case X86::ATOMAND8: return X86::AND8rr; - case X86::ATOMAND16: return X86::AND16rr; - case X86::ATOMAND32: return X86::AND32rr; - case X86::ATOMAND64: return X86::AND64rr; - case X86::ATOMOR8: return X86::OR8rr; - case X86::ATOMOR16: return X86::OR16rr; - case X86::ATOMOR32: return X86::OR32rr; - case X86::ATOMOR64: return X86::OR64rr; - case X86::ATOMXOR8: return X86::XOR8rr; - case X86::ATOMXOR16: return X86::XOR16rr; - case X86::ATOMXOR32: return X86::XOR32rr; - case X86::ATOMXOR64: return X86::XOR64rr; - } - llvm_unreachable("Unhandled atomic-load-op opcode!"); -} - -// Get opcode of the non-atomic one from the specified atomic instruction with -// extra opcode. -static unsigned getNonAtomicOpcodeWithExtraOpc(unsigned Opc, - unsigned &ExtraOpc) { - switch (Opc) { - case X86::ATOMNAND8: ExtraOpc = X86::NOT8r; return X86::AND8rr; - case X86::ATOMNAND16: ExtraOpc = X86::NOT16r; return X86::AND16rr; - case X86::ATOMNAND32: ExtraOpc = X86::NOT32r; return X86::AND32rr; - case X86::ATOMNAND64: ExtraOpc = X86::NOT64r; return X86::AND64rr; - case X86::ATOMMAX8: ExtraOpc = X86::CMP8rr; return X86::CMOVL32rr; - case X86::ATOMMAX16: ExtraOpc = X86::CMP16rr; return X86::CMOVL16rr; - case X86::ATOMMAX32: ExtraOpc = X86::CMP32rr; return X86::CMOVL32rr; - case X86::ATOMMAX64: ExtraOpc = X86::CMP64rr; return X86::CMOVL64rr; - case X86::ATOMMIN8: ExtraOpc = X86::CMP8rr; return X86::CMOVG32rr; - case X86::ATOMMIN16: ExtraOpc = X86::CMP16rr; return X86::CMOVG16rr; - case X86::ATOMMIN32: ExtraOpc = X86::CMP32rr; return X86::CMOVG32rr; - case X86::ATOMMIN64: ExtraOpc = X86::CMP64rr; return X86::CMOVG64rr; - case X86::ATOMUMAX8: ExtraOpc = X86::CMP8rr; return X86::CMOVB32rr; - case X86::ATOMUMAX16: ExtraOpc = X86::CMP16rr; return X86::CMOVB16rr; - case X86::ATOMUMAX32: ExtraOpc = X86::CMP32rr; return X86::CMOVB32rr; - case X86::ATOMUMAX64: ExtraOpc = X86::CMP64rr; return X86::CMOVB64rr; - case X86::ATOMUMIN8: ExtraOpc = X86::CMP8rr; return X86::CMOVA32rr; - case X86::ATOMUMIN16: ExtraOpc = X86::CMP16rr; return X86::CMOVA16rr; - case X86::ATOMUMIN32: ExtraOpc = X86::CMP32rr; return X86::CMOVA32rr; - case X86::ATOMUMIN64: ExtraOpc = X86::CMP64rr; return X86::CMOVA64rr; - } - llvm_unreachable("Unhandled atomic-load-op opcode!"); -} - -// Get opcode of the non-atomic one from the specified atomic instruction for -// 64-bit data type on 32-bit target. -static unsigned getNonAtomic6432Opcode(unsigned Opc, unsigned &HiOpc) { - switch (Opc) { - case X86::ATOMAND6432: HiOpc = X86::AND32rr; return X86::AND32rr; - case X86::ATOMOR6432: HiOpc = X86::OR32rr; return X86::OR32rr; - case X86::ATOMXOR6432: HiOpc = X86::XOR32rr; return X86::XOR32rr; - case X86::ATOMADD6432: HiOpc = X86::ADC32rr; return X86::ADD32rr; - case X86::ATOMSUB6432: HiOpc = X86::SBB32rr; return X86::SUB32rr; - case X86::ATOMSWAP6432: HiOpc = X86::MOV32rr; return X86::MOV32rr; - case X86::ATOMMAX6432: HiOpc = X86::SETLr; return X86::SETLr; - case X86::ATOMMIN6432: HiOpc = X86::SETGr; return X86::SETGr; - case X86::ATOMUMAX6432: HiOpc = X86::SETBr; return X86::SETBr; - case X86::ATOMUMIN6432: HiOpc = X86::SETAr; return X86::SETAr; - } - llvm_unreachable("Unhandled atomic-load-op opcode!"); -} - -// Get opcode of the non-atomic one from the specified atomic instruction for -// 64-bit data type on 32-bit target with extra opcode. 
-static unsigned getNonAtomic6432OpcodeWithExtraOpc(unsigned Opc, - unsigned &HiOpc, - unsigned &ExtraOpc) { - switch (Opc) { - case X86::ATOMNAND6432: - ExtraOpc = X86::NOT32r; - HiOpc = X86::AND32rr; - return X86::AND32rr; - } - llvm_unreachable("Unhandled atomic-load-op opcode!"); -} - -// Get pseudo CMOV opcode from the specified data type. -static unsigned getPseudoCMOVOpc(EVT VT) { - switch (VT.getSimpleVT().SimpleTy) { - case MVT::i8: return X86::CMOV_GR8; - case MVT::i16: return X86::CMOV_GR16; - case MVT::i32: return X86::CMOV_GR32; - default: - break; - } - llvm_unreachable("Unknown CMOV opcode!"); -} - -// EmitAtomicLoadArith - emit the code sequence for pseudo atomic instructions. -// They will be translated into a spin-loop or compare-exchange loop from -// -// ... -// dst = atomic-fetch-op MI.addr, MI.val -// ... -// -// to -// -// ... -// t1 = LOAD MI.addr -// loop: -// t4 = phi(t1, t3 / loop) -// t2 = OP MI.val, t4 -// EAX = t4 -// LCMPXCHG [MI.addr], t2, [EAX is implicitly used & defined] -// t3 = EAX -// JNE loop -// sink: -// dst = t3 -// ... -MachineBasicBlock * -X86TargetLowering::EmitAtomicLoadArith(MachineInstr *MI, - MachineBasicBlock *MBB) const { - const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); - DebugLoc DL = MI->getDebugLoc(); - - MachineFunction *MF = MBB->getParent(); - MachineRegisterInfo &MRI = MF->getRegInfo(); - - const BasicBlock *BB = MBB->getBasicBlock(); - MachineFunction::iterator I = MBB; - ++I; - - assert(MI->getNumOperands() <= X86::AddrNumOperands + 4 && - "Unexpected number of operands"); - - assert(MI->hasOneMemOperand() && - "Expected atomic-load-op to have one memoperand"); - - // Memory Reference - MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin(); - MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end(); - - unsigned DstReg, SrcReg; - unsigned MemOpndSlot; - - unsigned CurOp = 0; - - DstReg = MI->getOperand(CurOp++).getReg(); - MemOpndSlot = CurOp; - CurOp += X86::AddrNumOperands; - SrcReg = MI->getOperand(CurOp++).getReg(); - - const TargetRegisterClass *RC = MRI.getRegClass(DstReg); - MVT::SimpleValueType VT = *RC->vt_begin(); - unsigned t1 = MRI.createVirtualRegister(RC); - unsigned t2 = MRI.createVirtualRegister(RC); - unsigned t3 = MRI.createVirtualRegister(RC); - unsigned t4 = MRI.createVirtualRegister(RC); - unsigned PhyReg = getX86SubSuperRegister(X86::EAX, VT); - - unsigned LCMPXCHGOpc = getCmpXChgOpcode(VT); - unsigned LOADOpc = getLoadOpcode(VT); - - // For the atomic load-arith operator, we generate - // - // thisMBB: - // t1 = LOAD [MI.addr] - // mainMBB: - // t4 = phi(t1 / thisMBB, t3 / mainMBB) - // t1 = OP MI.val, EAX - // EAX = t4 - // LCMPXCHG [MI.addr], t1, [EAX is implicitly used & defined] - // t3 = EAX - // JNE mainMBB - // sinkMBB: - // dst = t3 - - MachineBasicBlock *thisMBB = MBB; - MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB); - MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB); - MF->insert(I, mainMBB); - MF->insert(I, sinkMBB); - - MachineInstrBuilder MIB; - - // Transfer the remainder of BB and its successor edges to sinkMBB. 
- sinkMBB->splice(sinkMBB->begin(), MBB, - std::next(MachineBasicBlock::iterator(MI)), MBB->end()); - sinkMBB->transferSuccessorsAndUpdatePHIs(MBB); - - // thisMBB: - MIB = BuildMI(thisMBB, DL, TII->get(LOADOpc), t1); - for (unsigned i = 0; i < X86::AddrNumOperands; ++i) { - MachineOperand NewMO = MI->getOperand(MemOpndSlot + i); - if (NewMO.isReg()) - NewMO.setIsKill(false); - MIB.addOperand(NewMO); - } - for (MachineInstr::mmo_iterator MMOI = MMOBegin; MMOI != MMOEnd; ++MMOI) { - unsigned flags = (*MMOI)->getFlags(); - flags = (flags & ~MachineMemOperand::MOStore) | MachineMemOperand::MOLoad; - MachineMemOperand *MMO = - MF->getMachineMemOperand((*MMOI)->getPointerInfo(), flags, - (*MMOI)->getSize(), - (*MMOI)->getBaseAlignment(), - (*MMOI)->getTBAAInfo(), - (*MMOI)->getRanges()); - MIB.addMemOperand(MMO); - } - - thisMBB->addSuccessor(mainMBB); - - // mainMBB: - MachineBasicBlock *origMainMBB = mainMBB; - - // Add a PHI. - MachineInstr *Phi = BuildMI(mainMBB, DL, TII->get(X86::PHI), t4) - .addReg(t1).addMBB(thisMBB).addReg(t3).addMBB(mainMBB); - - unsigned Opc = MI->getOpcode(); - switch (Opc) { - default: - llvm_unreachable("Unhandled atomic-load-op opcode!"); - case X86::ATOMAND8: - case X86::ATOMAND16: - case X86::ATOMAND32: - case X86::ATOMAND64: - case X86::ATOMOR8: - case X86::ATOMOR16: - case X86::ATOMOR32: - case X86::ATOMOR64: - case X86::ATOMXOR8: - case X86::ATOMXOR16: - case X86::ATOMXOR32: - case X86::ATOMXOR64: { - unsigned ARITHOpc = getNonAtomicOpcode(Opc); - BuildMI(mainMBB, DL, TII->get(ARITHOpc), t2).addReg(SrcReg) - .addReg(t4); - break; - } - case X86::ATOMNAND8: - case X86::ATOMNAND16: - case X86::ATOMNAND32: - case X86::ATOMNAND64: { - unsigned Tmp = MRI.createVirtualRegister(RC); - unsigned NOTOpc; - unsigned ANDOpc = getNonAtomicOpcodeWithExtraOpc(Opc, NOTOpc); - BuildMI(mainMBB, DL, TII->get(ANDOpc), Tmp).addReg(SrcReg) - .addReg(t4); - BuildMI(mainMBB, DL, TII->get(NOTOpc), t2).addReg(Tmp); - break; - } - case X86::ATOMMAX8: - case X86::ATOMMAX16: - case X86::ATOMMAX32: - case X86::ATOMMAX64: - case X86::ATOMMIN8: - case X86::ATOMMIN16: - case X86::ATOMMIN32: - case X86::ATOMMIN64: - case X86::ATOMUMAX8: - case X86::ATOMUMAX16: - case X86::ATOMUMAX32: - case X86::ATOMUMAX64: - case X86::ATOMUMIN8: - case X86::ATOMUMIN16: - case X86::ATOMUMIN32: - case X86::ATOMUMIN64: { - unsigned CMPOpc; - unsigned CMOVOpc = getNonAtomicOpcodeWithExtraOpc(Opc, CMPOpc); - - BuildMI(mainMBB, DL, TII->get(CMPOpc)) - .addReg(SrcReg) - .addReg(t4); - - if (Subtarget->hasCMov()) { - if (VT != MVT::i8) { - // Native support - BuildMI(mainMBB, DL, TII->get(CMOVOpc), t2) - .addReg(SrcReg) - .addReg(t4); - } else { - // Promote i8 to i32 to use CMOV32 - const TargetRegisterInfo* TRI = getTargetMachine().getRegisterInfo(); - const TargetRegisterClass *RC32 = - TRI->getSubClassWithSubReg(getRegClassFor(MVT::i32), X86::sub_8bit); - unsigned SrcReg32 = MRI.createVirtualRegister(RC32); - unsigned AccReg32 = MRI.createVirtualRegister(RC32); - unsigned Tmp = MRI.createVirtualRegister(RC32); - - unsigned Undef = MRI.createVirtualRegister(RC32); - BuildMI(mainMBB, DL, TII->get(TargetOpcode::IMPLICIT_DEF), Undef); - - BuildMI(mainMBB, DL, TII->get(TargetOpcode::INSERT_SUBREG), SrcReg32) - .addReg(Undef) - .addReg(SrcReg) - .addImm(X86::sub_8bit); - BuildMI(mainMBB, DL, TII->get(TargetOpcode::INSERT_SUBREG), AccReg32) - .addReg(Undef) - .addReg(t4) - .addImm(X86::sub_8bit); - - BuildMI(mainMBB, DL, TII->get(CMOVOpc), Tmp) - .addReg(SrcReg32) - .addReg(AccReg32); - - BuildMI(mainMBB, DL, 
TII->get(TargetOpcode::COPY), t2) - .addReg(Tmp, 0, X86::sub_8bit); - } - } else { - // Use pseudo select and lower them. - assert((VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32) && - "Invalid atomic-load-op transformation!"); - unsigned SelOpc = getPseudoCMOVOpc(VT); - X86::CondCode CC = X86::getCondFromCMovOpc(CMOVOpc); - assert(CC != X86::COND_INVALID && "Invalid atomic-load-op transformation!"); - MIB = BuildMI(mainMBB, DL, TII->get(SelOpc), t2) - .addReg(SrcReg).addReg(t4) - .addImm(CC); - mainMBB = EmitLoweredSelect(MIB, mainMBB); - // Replace the original PHI node as mainMBB is changed after CMOV - // lowering. - BuildMI(*origMainMBB, Phi, DL, TII->get(X86::PHI), t4) - .addReg(t1).addMBB(thisMBB).addReg(t3).addMBB(mainMBB); - Phi->eraseFromParent(); - } - break; - } - } - - // Copy PhyReg back from virtual register. - BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), PhyReg) - .addReg(t4); - - MIB = BuildMI(mainMBB, DL, TII->get(LCMPXCHGOpc)); - for (unsigned i = 0; i < X86::AddrNumOperands; ++i) { - MachineOperand NewMO = MI->getOperand(MemOpndSlot + i); - if (NewMO.isReg()) - NewMO.setIsKill(false); - MIB.addOperand(NewMO); - } - MIB.addReg(t2); - MIB.setMemRefs(MMOBegin, MMOEnd); - - // Copy PhyReg back to virtual register. - BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), t3) - .addReg(PhyReg); - - BuildMI(mainMBB, DL, TII->get(X86::JNE_4)).addMBB(origMainMBB); - - mainMBB->addSuccessor(origMainMBB); - mainMBB->addSuccessor(sinkMBB); - - // sinkMBB: - BuildMI(*sinkMBB, sinkMBB->begin(), DL, - TII->get(TargetOpcode::COPY), DstReg) - .addReg(t3); - - MI->eraseFromParent(); - return sinkMBB; -} - -// EmitAtomicLoadArith6432 - emit the code sequence for pseudo atomic -// instructions. They will be translated into a spin-loop or compare-exchange -// loop from -// -// ... -// dst = atomic-fetch-op MI.addr, MI.val -// ... -// -// to -// -// ... -// t1L = LOAD [MI.addr + 0] -// t1H = LOAD [MI.addr + 4] -// loop: -// t4L = phi(t1L, t3L / loop) -// t4H = phi(t1H, t3H / loop) -// t2L = OP MI.val.lo, t4L -// t2H = OP MI.val.hi, t4H -// EAX = t4L -// EDX = t4H -// EBX = t2L -// ECX = t2H -// LCMPXCHG8B [MI.addr], [ECX:EBX & EDX:EAX are implicitly used and EDX:EAX is implicitly defined] -// t3L = EAX -// t3H = EDX -// JNE loop -// sink: -// dstL = t3L -// dstH = t3H -// ... 
-MachineBasicBlock * -X86TargetLowering::EmitAtomicLoadArith6432(MachineInstr *MI, - MachineBasicBlock *MBB) const { - const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); - DebugLoc DL = MI->getDebugLoc(); - - MachineFunction *MF = MBB->getParent(); - MachineRegisterInfo &MRI = MF->getRegInfo(); - - const BasicBlock *BB = MBB->getBasicBlock(); - MachineFunction::iterator I = MBB; - ++I; - - assert(MI->getNumOperands() <= X86::AddrNumOperands + 7 && - "Unexpected number of operands"); - - assert(MI->hasOneMemOperand() && - "Expected atomic-load-op32 to have one memoperand"); - - // Memory Reference - MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin(); - MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end(); - - unsigned DstLoReg, DstHiReg; - unsigned SrcLoReg, SrcHiReg; - unsigned MemOpndSlot; - - unsigned CurOp = 0; - - DstLoReg = MI->getOperand(CurOp++).getReg(); - DstHiReg = MI->getOperand(CurOp++).getReg(); - MemOpndSlot = CurOp; - CurOp += X86::AddrNumOperands; - SrcLoReg = MI->getOperand(CurOp++).getReg(); - SrcHiReg = MI->getOperand(CurOp++).getReg(); - - const TargetRegisterClass *RC = &X86::GR32RegClass; - const TargetRegisterClass *RC8 = &X86::GR8RegClass; - - unsigned t1L = MRI.createVirtualRegister(RC); - unsigned t1H = MRI.createVirtualRegister(RC); - unsigned t2L = MRI.createVirtualRegister(RC); - unsigned t2H = MRI.createVirtualRegister(RC); - unsigned t3L = MRI.createVirtualRegister(RC); - unsigned t3H = MRI.createVirtualRegister(RC); - unsigned t4L = MRI.createVirtualRegister(RC); - unsigned t4H = MRI.createVirtualRegister(RC); - - unsigned LCMPXCHGOpc = X86::LCMPXCHG8B; - unsigned LOADOpc = X86::MOV32rm; - - // For the atomic load-arith operator, we generate - // - // thisMBB: - // t1L = LOAD [MI.addr + 0] - // t1H = LOAD [MI.addr + 4] - // mainMBB: - // t4L = phi(t1L / thisMBB, t3L / mainMBB) - // t4H = phi(t1H / thisMBB, t3H / mainMBB) - // t2L = OP MI.val.lo, t4L - // t2H = OP MI.val.hi, t4H - // EBX = t2L - // ECX = t2H - // LCMPXCHG8B [MI.addr], [ECX:EBX & EDX:EAX are implicitly used and EDX:EAX is implicitly defined] - // t3L = EAX - // t3H = EDX - // JNE loop - // sinkMBB: - // dstL = t3L - // dstH = t3H - - MachineBasicBlock *thisMBB = MBB; - MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB); - MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB); - MF->insert(I, mainMBB); - MF->insert(I, sinkMBB); - - MachineInstrBuilder MIB; - - // Transfer the remainder of BB and its successor edges to sinkMBB. 
- sinkMBB->splice(sinkMBB->begin(), MBB, - std::next(MachineBasicBlock::iterator(MI)), MBB->end()); - sinkMBB->transferSuccessorsAndUpdatePHIs(MBB); - - // thisMBB: - // Lo - MIB = BuildMI(thisMBB, DL, TII->get(LOADOpc), t1L); - for (unsigned i = 0; i < X86::AddrNumOperands; ++i) { - MachineOperand NewMO = MI->getOperand(MemOpndSlot + i); - if (NewMO.isReg()) - NewMO.setIsKill(false); - MIB.addOperand(NewMO); - } - for (MachineInstr::mmo_iterator MMOI = MMOBegin; MMOI != MMOEnd; ++MMOI) { - unsigned flags = (*MMOI)->getFlags(); - flags = (flags & ~MachineMemOperand::MOStore) | MachineMemOperand::MOLoad; - MachineMemOperand *MMO = - MF->getMachineMemOperand((*MMOI)->getPointerInfo(), flags, - (*MMOI)->getSize(), - (*MMOI)->getBaseAlignment(), - (*MMOI)->getTBAAInfo(), - (*MMOI)->getRanges()); - MIB.addMemOperand(MMO); - }; - MachineInstr *LowMI = MIB; - - // Hi - MIB = BuildMI(thisMBB, DL, TII->get(LOADOpc), t1H); - for (unsigned i = 0; i < X86::AddrNumOperands; ++i) { - if (i == X86::AddrDisp) { - MIB.addDisp(MI->getOperand(MemOpndSlot + i), 4); // 4 == sizeof(i32) - } else { - MachineOperand NewMO = MI->getOperand(MemOpndSlot + i); - if (NewMO.isReg()) - NewMO.setIsKill(false); - MIB.addOperand(NewMO); - } - } - MIB.setMemRefs(LowMI->memoperands_begin(), LowMI->memoperands_end()); - - thisMBB->addSuccessor(mainMBB); - - // mainMBB: - MachineBasicBlock *origMainMBB = mainMBB; - - // Add PHIs. - MachineInstr *PhiL = BuildMI(mainMBB, DL, TII->get(X86::PHI), t4L) - .addReg(t1L).addMBB(thisMBB).addReg(t3L).addMBB(mainMBB); - MachineInstr *PhiH = BuildMI(mainMBB, DL, TII->get(X86::PHI), t4H) - .addReg(t1H).addMBB(thisMBB).addReg(t3H).addMBB(mainMBB); - - unsigned Opc = MI->getOpcode(); - switch (Opc) { - default: - llvm_unreachable("Unhandled atomic-load-op6432 opcode!"); - case X86::ATOMAND6432: - case X86::ATOMOR6432: - case X86::ATOMXOR6432: - case X86::ATOMADD6432: - case X86::ATOMSUB6432: { - unsigned HiOpc; - unsigned LoOpc = getNonAtomic6432Opcode(Opc, HiOpc); - BuildMI(mainMBB, DL, TII->get(LoOpc), t2L).addReg(t4L) - .addReg(SrcLoReg); - BuildMI(mainMBB, DL, TII->get(HiOpc), t2H).addReg(t4H) - .addReg(SrcHiReg); - break; - } - case X86::ATOMNAND6432: { - unsigned HiOpc, NOTOpc; - unsigned LoOpc = getNonAtomic6432OpcodeWithExtraOpc(Opc, HiOpc, NOTOpc); - unsigned TmpL = MRI.createVirtualRegister(RC); - unsigned TmpH = MRI.createVirtualRegister(RC); - BuildMI(mainMBB, DL, TII->get(LoOpc), TmpL).addReg(SrcLoReg) - .addReg(t4L); - BuildMI(mainMBB, DL, TII->get(HiOpc), TmpH).addReg(SrcHiReg) - .addReg(t4H); - BuildMI(mainMBB, DL, TII->get(NOTOpc), t2L).addReg(TmpL); - BuildMI(mainMBB, DL, TII->get(NOTOpc), t2H).addReg(TmpH); - break; - } - case X86::ATOMMAX6432: - case X86::ATOMMIN6432: - case X86::ATOMUMAX6432: - case X86::ATOMUMIN6432: { - unsigned HiOpc; - unsigned LoOpc = getNonAtomic6432Opcode(Opc, HiOpc); - unsigned cL = MRI.createVirtualRegister(RC8); - unsigned cH = MRI.createVirtualRegister(RC8); - unsigned cL32 = MRI.createVirtualRegister(RC); - unsigned cH32 = MRI.createVirtualRegister(RC); - unsigned cc = MRI.createVirtualRegister(RC); - // cl := cmp src_lo, lo - BuildMI(mainMBB, DL, TII->get(X86::CMP32rr)) - .addReg(SrcLoReg).addReg(t4L); - BuildMI(mainMBB, DL, TII->get(LoOpc), cL); - BuildMI(mainMBB, DL, TII->get(X86::MOVZX32rr8), cL32).addReg(cL); - // ch := cmp src_hi, hi - BuildMI(mainMBB, DL, TII->get(X86::CMP32rr)) - .addReg(SrcHiReg).addReg(t4H); - BuildMI(mainMBB, DL, TII->get(HiOpc), cH); - BuildMI(mainMBB, DL, TII->get(X86::MOVZX32rr8), cH32).addReg(cH); - // cc := if 
(src_hi == hi) ? cl : ch; - if (Subtarget->hasCMov()) { - BuildMI(mainMBB, DL, TII->get(X86::CMOVE32rr), cc) - .addReg(cH32).addReg(cL32); - } else { - MIB = BuildMI(mainMBB, DL, TII->get(X86::CMOV_GR32), cc) - .addReg(cH32).addReg(cL32) - .addImm(X86::COND_E); - mainMBB = EmitLoweredSelect(MIB, mainMBB); - } - BuildMI(mainMBB, DL, TII->get(X86::TEST32rr)).addReg(cc).addReg(cc); - if (Subtarget->hasCMov()) { - BuildMI(mainMBB, DL, TII->get(X86::CMOVNE32rr), t2L) - .addReg(SrcLoReg).addReg(t4L); - BuildMI(mainMBB, DL, TII->get(X86::CMOVNE32rr), t2H) - .addReg(SrcHiReg).addReg(t4H); - } else { - MIB = BuildMI(mainMBB, DL, TII->get(X86::CMOV_GR32), t2L) - .addReg(SrcLoReg).addReg(t4L) - .addImm(X86::COND_NE); - mainMBB = EmitLoweredSelect(MIB, mainMBB); - // As the lowered CMOV won't clobber EFLAGS, we could reuse it for the - // 2nd CMOV lowering. - mainMBB->addLiveIn(X86::EFLAGS); - MIB = BuildMI(mainMBB, DL, TII->get(X86::CMOV_GR32), t2H) - .addReg(SrcHiReg).addReg(t4H) - .addImm(X86::COND_NE); - mainMBB = EmitLoweredSelect(MIB, mainMBB); - // Replace the original PHI node as mainMBB is changed after CMOV - // lowering. - BuildMI(*origMainMBB, PhiL, DL, TII->get(X86::PHI), t4L) - .addReg(t1L).addMBB(thisMBB).addReg(t3L).addMBB(mainMBB); - BuildMI(*origMainMBB, PhiH, DL, TII->get(X86::PHI), t4H) - .addReg(t1H).addMBB(thisMBB).addReg(t3H).addMBB(mainMBB); - PhiL->eraseFromParent(); - PhiH->eraseFromParent(); - } - break; - } - case X86::ATOMSWAP6432: { - unsigned HiOpc; - unsigned LoOpc = getNonAtomic6432Opcode(Opc, HiOpc); - BuildMI(mainMBB, DL, TII->get(LoOpc), t2L).addReg(SrcLoReg); - BuildMI(mainMBB, DL, TII->get(HiOpc), t2H).addReg(SrcHiReg); - break; - } - } - - // Copy EDX:EAX back from HiReg:LoReg - BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), X86::EAX).addReg(t4L); - BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), X86::EDX).addReg(t4H); - // Copy ECX:EBX from t1H:t1L - BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), X86::EBX).addReg(t2L); - BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), X86::ECX).addReg(t2H); - - MIB = BuildMI(mainMBB, DL, TII->get(LCMPXCHGOpc)); - for (unsigned i = 0; i < X86::AddrNumOperands; ++i) { - MachineOperand NewMO = MI->getOperand(MemOpndSlot + i); - if (NewMO.isReg()) - NewMO.setIsKill(false); - MIB.addOperand(NewMO); - } - MIB.setMemRefs(MMOBegin, MMOEnd); - - // Copy EDX:EAX back to t3H:t3L - BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), t3L).addReg(X86::EAX); - BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), t3H).addReg(X86::EDX); - - BuildMI(mainMBB, DL, TII->get(X86::JNE_4)).addMBB(origMainMBB); - - mainMBB->addSuccessor(origMainMBB); - mainMBB->addSuccessor(sinkMBB); - - // sinkMBB: - BuildMI(*sinkMBB, sinkMBB->begin(), DL, - TII->get(TargetOpcode::COPY), DstLoReg) - .addReg(t3L); - BuildMI(*sinkMBB, sinkMBB->begin(), DL, - TII->get(TargetOpcode::COPY), DstHiReg) - .addReg(t3H); - - MI->eraseFromParent(); - return sinkMBB; -} - // FIXME: When we get size specific XMM0 registers, i.e. XMM0_V16I8 // or XMM0_V32I8 in AVX all of this code can be replaced with that // in the .td file. 
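The EmitAtomicLoadArith and EmitAtomicLoadArith6432 routines deleted above expanded each pseudo atomic-load-op into the compare-exchange retry loop their block comments describe. As a minimal sketch of that loop's semantics, in plain C++ over std::atomic rather than LLVM APIs, using NAND since x86 has no native fetch-nand instruction:

#include <atomic>
#include <cstdint>

// Mirrors the t1/t2/t3 virtual registers of the deleted pseudo-code: load
// once, compute the operation on the loaded value, then retry the CMPXCHG
// until no other thread has written the location in between.
static uint32_t atomic_fetch_nand(std::atomic<uint32_t> &Addr, uint32_t Val) {
  uint32_t Old = Addr.load();                      // t1 = LOAD [MI.addr]
  uint32_t New;
  do {
    New = ~(Old & Val);                            // t2 = NOT (AND MI.val, t4)
  } while (!Addr.compare_exchange_weak(Old, New)); // LCMPXCHG + JNE loop
  return Old;                                      // dst = value before the op
}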
@@ -16068,7 +17073,7 @@ X86TargetLowering::EmitVAARG64WithCustomInserter( MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end(); // Machine Information - const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); + const TargetInstrInfo *TII = MBB->getParent()->getTarget().getInstrInfo(); MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); const TargetRegisterClass *AddrRegClass = getRegClassFor(MVT::i64); const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32); @@ -16324,7 +17329,7 @@ X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter( XMMSaveMBB->addSuccessor(EndMBB); // Now add the instructions. - const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); + const TargetInstrInfo *TII = MBB->getParent()->getTarget().getInstrInfo(); DebugLoc DL = MI->getDebugLoc(); unsigned CountReg = MI->getOperand(0).getReg(); @@ -16407,7 +17412,7 @@ static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr, MachineBasicBlock * X86TargetLowering::EmitLoweredSelect(MachineInstr *MI, MachineBasicBlock *BB) const { - const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); + const TargetInstrInfo *TII = BB->getParent()->getTarget().getInstrInfo(); DebugLoc DL = MI->getDebugLoc(); // To "insert" a SELECT_CC instruction, we actually have to insert the @@ -16433,7 +17438,7 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI, // If the EFLAGS register isn't dead in the terminator, then claim that it's // live into the sink and copy blocks. - const TargetRegisterInfo* TRI = getTargetMachine().getRegisterInfo(); + const TargetRegisterInfo* TRI = BB->getParent()->getTarget().getRegisterInfo(); if (!MI->killsRegister(X86::EFLAGS) && !checkAndUpdateEFLAGSKill(MI, BB, TRI)) { copy0MBB->addLiveIn(X86::EFLAGS); @@ -16474,9 +17479,9 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI, MachineBasicBlock * X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI, MachineBasicBlock *BB, bool Is64Bit) const { - const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); - DebugLoc DL = MI->getDebugLoc(); MachineFunction *MF = BB->getParent(); + const TargetInstrInfo *TII = MF->getTarget().getInstrInfo(); + DebugLoc DL = MI->getDebugLoc(); const BasicBlock *LLVM_BB = BB->getBasicBlock(); assert(MF->shouldSplitStack()); @@ -16546,7 +17551,7 @@ X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI, MachineBasicBlock *BB, // Calls into a routine in libgcc to allocate more space from the heap. const uint32_t *RegMask = - getTargetMachine().getRegisterInfo()->getCallPreservedMask(CallingConv::C); + MF->getTarget().getRegisterInfo()->getCallPreservedMask(CallingConv::C); if (Is64Bit) { BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI) .addReg(sizeVReg); @@ -16594,8 +17599,8 @@ X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI, MachineBasicBlock *BB, MachineBasicBlock * X86TargetLowering::EmitLoweredWinAlloca(MachineInstr *MI, - MachineBasicBlock *BB) const { - const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); + MachineBasicBlock *BB) const { + const TargetInstrInfo *TII = BB->getParent()->getTarget().getInstrInfo(); DebugLoc DL = MI->getDebugLoc(); assert(!Subtarget->isTargetMacho()); @@ -16651,10 +17656,10 @@ X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI, // our load from the relocation, sticking it in either RDI (x86-64) // or EAX and doing an indirect call. The return value will then // be in the normal return register. 
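+  // On x86-64 Darwin the emitted sequence is, schematically:
+  //   movq _var@TLVP(%rip), %rdi
+  //   callq *(%rdi)
+  // with the result pointer returned in RAX (EAX for the 32-bit variants
+  // below).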
+ MachineFunction *F = BB->getParent(); const X86InstrInfo *TII - = static_cast<const X86InstrInfo*>(getTargetMachine().getInstrInfo()); + = static_cast<const X86InstrInfo*>(F->getTarget().getInstrInfo()); DebugLoc DL = MI->getDebugLoc(); - MachineFunction *F = BB->getParent(); assert(Subtarget->isTargetDarwin() && "Darwin only instr emitted?"); assert(MI->getOperand(3).isGlobal() && "This should be a global"); @@ -16663,7 +17668,7 @@ X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI, // FIXME: The 32-bit calls have non-standard calling conventions. Use a // proper register mask. const uint32_t *RegMask = - getTargetMachine().getRegisterInfo()->getCallPreservedMask(CallingConv::C); + F->getTarget().getRegisterInfo()->getCallPreservedMask(CallingConv::C); if (Subtarget->is64Bit()) { MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(X86::MOV64rm), X86::RDI) @@ -16675,7 +17680,7 @@ X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI, MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m)); addDirectMem(MIB, X86::RDI); MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask); - } else if (getTargetMachine().getRelocationModel() != Reloc::PIC_) { + } else if (F->getTarget().getRelocationModel() != Reloc::PIC_) { MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX) .addReg(0) @@ -16707,9 +17712,8 @@ MachineBasicBlock * X86TargetLowering::emitEHSjLjSetJmp(MachineInstr *MI, MachineBasicBlock *MBB) const { DebugLoc DL = MI->getDebugLoc(); - const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); - MachineFunction *MF = MBB->getParent(); + const TargetInstrInfo *TII = MF->getTarget().getInstrInfo(); MachineRegisterInfo &MRI = MF->getRegInfo(); const BasicBlock *BB = MBB->getBasicBlock(); @@ -16771,8 +17775,8 @@ X86TargetLowering::emitEHSjLjSetJmp(MachineInstr *MI, unsigned PtrStoreOpc = 0; unsigned LabelReg = 0; const int64_t LabelOffset = 1 * PVT.getStoreSize(); - Reloc::Model RM = getTargetMachine().getRelocationModel(); - bool UseImmLabel = (getTargetMachine().getCodeModel() == CodeModel::Small) && + Reloc::Model RM = MF->getTarget().getRelocationModel(); + bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) && (RM == Reloc::Static || RM == Reloc::DynamicNoPIC); // Prepare IP either in reg or imm. @@ -16816,7 +17820,7 @@ X86TargetLowering::emitEHSjLjSetJmp(MachineInstr *MI, .addMBB(restoreMBB); const X86RegisterInfo *RegInfo = - static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo()); + static_cast<const X86RegisterInfo*>(MF->getTarget().getRegisterInfo()); MIB.addRegMask(RegInfo->getNoPreservedMask()); thisMBB->addSuccessor(mainMBB); thisMBB->addSuccessor(restoreMBB); @@ -16845,9 +17849,8 @@ MachineBasicBlock * X86TargetLowering::emitEHSjLjLongJmp(MachineInstr *MI, MachineBasicBlock *MBB) const { DebugLoc DL = MI->getDebugLoc(); - const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); - MachineFunction *MF = MBB->getParent(); + const TargetInstrInfo *TII = MF->getTarget().getInstrInfo(); MachineRegisterInfo &MRI = MF->getRegInfo(); // Memory Reference @@ -16863,7 +17866,7 @@ X86TargetLowering::emitEHSjLjLongJmp(MachineInstr *MI, unsigned Tmp = MRI.createVirtualRegister(RC); // Since FP is only updated here but NOT referenced, it's treated as GPR. const X86RegisterInfo *RegInfo = - static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo()); + static_cast<const X86RegisterInfo*>(MF->getTarget().getRegisterInfo()); unsigned FP = (PVT == MVT::i64) ? 
X86::RBP : X86::EBP; unsigned SP = RegInfo->getStackRegister(); @@ -17038,12 +18041,12 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, case X86::FP80_TO_INT16_IN_MEM: case X86::FP80_TO_INT32_IN_MEM: case X86::FP80_TO_INT64_IN_MEM: { - const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); + MachineFunction *F = BB->getParent(); + const TargetInstrInfo *TII = F->getTarget().getInstrInfo(); DebugLoc DL = MI->getDebugLoc(); // Change the floating point control register to use "round towards zero" // mode when truncating to an integer value. - MachineFunction *F = BB->getParent(); int CWFrameIdx = F->getFrameInfo()->CreateStackObject(2, 2, false); addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::FNSTCW16m)), CWFrameIdx); @@ -17123,7 +18126,7 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, case X86::VPCMPESTRM128MEM: assert(Subtarget->hasSSE42() && "Target must have SSE4.2 or AVX features enabled"); - return EmitPCMPSTRM(MI, BB, getTargetMachine().getInstrInfo()); + return EmitPCMPSTRM(MI, BB, BB->getParent()->getTarget().getInstrInfo()); // String/text processing lowering. case X86::PCMPISTRIREG: @@ -17136,71 +18139,15 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, case X86::VPCMPESTRIMEM: assert(Subtarget->hasSSE42() && "Target must have SSE4.2 or AVX features enabled"); - return EmitPCMPSTRI(MI, BB, getTargetMachine().getInstrInfo()); + return EmitPCMPSTRI(MI, BB, BB->getParent()->getTarget().getInstrInfo()); // Thread synchronization. case X86::MONITOR: - return EmitMonitor(MI, BB, getTargetMachine().getInstrInfo(), Subtarget); + return EmitMonitor(MI, BB, BB->getParent()->getTarget().getInstrInfo(), Subtarget); // xbegin case X86::XBEGIN: - return EmitXBegin(MI, BB, getTargetMachine().getInstrInfo()); - - // Atomic Lowering. - case X86::ATOMAND8: - case X86::ATOMAND16: - case X86::ATOMAND32: - case X86::ATOMAND64: - // Fall through - case X86::ATOMOR8: - case X86::ATOMOR16: - case X86::ATOMOR32: - case X86::ATOMOR64: - // Fall through - case X86::ATOMXOR16: - case X86::ATOMXOR8: - case X86::ATOMXOR32: - case X86::ATOMXOR64: - // Fall through - case X86::ATOMNAND8: - case X86::ATOMNAND16: - case X86::ATOMNAND32: - case X86::ATOMNAND64: - // Fall through - case X86::ATOMMAX8: - case X86::ATOMMAX16: - case X86::ATOMMAX32: - case X86::ATOMMAX64: - // Fall through - case X86::ATOMMIN8: - case X86::ATOMMIN16: - case X86::ATOMMIN32: - case X86::ATOMMIN64: - // Fall through - case X86::ATOMUMAX8: - case X86::ATOMUMAX16: - case X86::ATOMUMAX32: - case X86::ATOMUMAX64: - // Fall through - case X86::ATOMUMIN8: - case X86::ATOMUMIN16: - case X86::ATOMUMIN32: - case X86::ATOMUMIN64: - return EmitAtomicLoadArith(MI, BB); - - // This group does 64-bit operations on a 32-bit host. - case X86::ATOMAND6432: - case X86::ATOMOR6432: - case X86::ATOMXOR6432: - case X86::ATOMNAND6432: - case X86::ATOMADD6432: - case X86::ATOMSUB6432: - case X86::ATOMMAX6432: - case X86::ATOMMIN6432: - case X86::ATOMUMAX6432: - case X86::ATOMUMIN6432: - case X86::ATOMSWAP6432: - return EmitAtomicLoadArith6432(MI, BB); + return EmitXBegin(MI, BB, BB->getParent()->getTarget().getInstrInfo()); case X86::VASTART_SAVE_XMM_REGS: return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB); @@ -17473,13 +18420,385 @@ static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG, return SDValue(); } +/// \brief Get the PSHUF-style mask from PSHUF node. 
+///
+/// This is a very minor wrapper around getTargetShuffleMask to ease forming v4
+/// PSHUF-style masks that can be reused with such instructions.
+static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {
+  SmallVector<int, 4> Mask;
+  bool IsUnary;
+  bool HaveMask =
+      getTargetShuffleMask(N.getNode(), N.getSimpleValueType(), Mask, IsUnary);
+  (void)HaveMask;
+  assert(HaveMask);
+
+  switch (N.getOpcode()) {
+  case X86ISD::PSHUFD:
+    return Mask;
+  case X86ISD::PSHUFLW:
+    Mask.resize(4);
+    return Mask;
+  case X86ISD::PSHUFHW:
+    Mask.erase(Mask.begin(), Mask.begin() + 4);
+    for (int &M : Mask)
+      M -= 4;
+    return Mask;
+  default:
+    llvm_unreachable("No valid shuffle instruction found!");
+  }
+}
+
+/// \brief Search for a combinable shuffle across a chain ending in pshufd.
+///
+/// We walk up the chain and look for a combinable shuffle, skipping over
+/// shuffles that we could hoist this shuffle's transformation past without
+/// altering anything.
+static bool combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
+                                         SelectionDAG &DAG,
+                                         TargetLowering::DAGCombinerInfo &DCI) {
+  assert(N.getOpcode() == X86ISD::PSHUFD &&
+         "Called with something other than an x86 128-bit half shuffle!");
+  SDLoc DL(N);
+
+  // Walk up a single-use chain looking for a combinable shuffle.
+  SDValue V = N.getOperand(0);
+  for (; V.hasOneUse(); V = V.getOperand(0)) {
+    switch (V.getOpcode()) {
+    default:
+      return false; // Nothing combined!
+
+    case ISD::BITCAST:
+      // Skip bitcasts as we always know the type for the target specific
+      // instructions.
+      continue;
+
+    case X86ISD::PSHUFD:
+      // Found another dword shuffle.
+      break;
+
+    case X86ISD::PSHUFLW:
+      // Check that the low words (being shuffled) are the identity in the
+      // dword shuffle, and the high words are self-contained.
+      if (Mask[0] != 0 || Mask[1] != 1 ||
+          !(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4))
+        return false;
+
+      continue;
+
+    case X86ISD::PSHUFHW:
+      // Check that the high words (being shuffled) are the identity in the
+      // dword shuffle, and the low words are self-contained.
+      if (Mask[2] != 2 || Mask[3] != 3 ||
+          !(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2))
+        return false;
+
+      continue;
+
+    case X86ISD::UNPCKL:
+    case X86ISD::UNPCKH:
+      // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword
+      // shuffle into a preceding word shuffle.
+      if (V.getValueType() != MVT::v16i8 && V.getValueType() != MVT::v8i16)
+        return false;
+
+      // Search for a half-shuffle which we can combine with.
+      unsigned CombineOp =
+          V.getOpcode() == X86ISD::UNPCKL ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
+      if (V.getOperand(0) != V.getOperand(1) ||
+          !V->isOnlyUserOf(V.getOperand(0).getNode()))
+        return false;
+      V = V.getOperand(0);
+      do {
+        switch (V.getOpcode()) {
+        default:
+          return false; // Nothing to combine.
+
+        case X86ISD::PSHUFLW:
+        case X86ISD::PSHUFHW:
+          if (V.getOpcode() == CombineOp)
+            break;
+
+          // Fallthrough!
+        case ISD::BITCAST:
+          V = V.getOperand(0);
+          continue;
+        }
+        break;
+      } while (V.hasOneUse());
+      break;
+    }
+    // Break out of the loop if we break out of the switch.
+    break;
+  }
+
+  if (!V.hasOneUse())
+    // We fell out of the loop without finding a viable combining instruction.
+    return false;
+
+  // Record the old value to use in RAUW-ing.
+  SDValue Old = V;
+
+  // Merge this node's mask and our incoming mask.
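+  // For example, if V is a PSHUFD with VMask = <2,3,0,1> and this node's
+  // Mask is <1,0,3,2>, then element i of the combined shuffle reads element
+  // VMask[Mask[i]] of the original input, giving <3,2,1,0>, so the two
+  // shuffles collapse into a single PSHUFD.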
+ SmallVector<int, 4> VMask = getPSHUFShuffleMask(V); + for (int &M : Mask) + M = VMask[M]; + V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0), + getV4X86ShuffleImm8ForMask(Mask, DAG)); + + // It is possible that one of the combinable shuffles was completely absorbed + // by the other, just replace it and revisit all users in that case. + if (Old.getNode() == V.getNode()) { + DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo=*/true); + return true; + } + + // Replace N with its operand as we're going to combine that shuffle away. + DAG.ReplaceAllUsesWith(N, N.getOperand(0)); + + // Replace the combinable shuffle with the combined one, updating all users + // so that we re-evaluate the chain here. + DCI.CombineTo(Old.getNode(), V, /*AddTo*/ true); + return true; +} + +/// \brief Search for a combinable shuffle across a chain ending in pshuflw or pshufhw. +/// +/// We walk up the chain, skipping shuffles of the other half and looking +/// through shuffles which switch halves trying to find a shuffle of the same +/// pair of dwords. +static bool combineRedundantHalfShuffle(SDValue N, MutableArrayRef<int> Mask, + SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI) { + assert( + (N.getOpcode() == X86ISD::PSHUFLW || N.getOpcode() == X86ISD::PSHUFHW) && + "Called with something other than an x86 128-bit half shuffle!"); + SDLoc DL(N); + unsigned CombineOpcode = N.getOpcode(); + + // Walk up a single-use chain looking for a combinable shuffle. + SDValue V = N.getOperand(0); + for (; V.hasOneUse(); V = V.getOperand(0)) { + switch (V.getOpcode()) { + default: + return false; // Nothing combined! + + case ISD::BITCAST: + // Skip bitcasts as we always know the type for the target specific + // instructions. + continue; + + case X86ISD::PSHUFLW: + case X86ISD::PSHUFHW: + if (V.getOpcode() == CombineOpcode) + break; + + // Other-half shuffles are no-ops. + continue; + + case X86ISD::PSHUFD: { + // We can only handle pshufd if the half we are combining either stays in + // its half, or switches to the other half. Bail if one of these isn't + // true. + SmallVector<int, 4> VMask = getPSHUFShuffleMask(V); + int DOffset = CombineOpcode == X86ISD::PSHUFLW ? 0 : 2; + if (!((VMask[DOffset + 0] < 2 && VMask[DOffset + 1] < 2) || + (VMask[DOffset + 0] >= 2 && VMask[DOffset + 1] >= 2))) + return false; + + // Map the mask through the pshufd and keep walking up the chain. + for (int i = 0; i < 4; ++i) + Mask[i] = 2 * (VMask[DOffset + Mask[i] / 2] % 2) + Mask[i] % 2; + + // Switch halves if the pshufd does. + CombineOpcode = + VMask[DOffset + Mask[0] / 2] < 2 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW; + continue; + } + } + // Break out of the loop if we break out of the switch. + break; + } + + if (!V.hasOneUse()) + // We fell out of the loop without finding a viable combining instruction. + return false; + + // Record the old value to use in RAUW-ing. + SDValue Old = V; + + // Merge this node's mask and our incoming mask (adjusted to account for all + // the pshufd instructions encountered). + SmallVector<int, 4> VMask = getPSHUFShuffleMask(V); + for (int &M : Mask) + M = VMask[M]; + V = DAG.getNode(V.getOpcode(), DL, MVT::v8i16, V.getOperand(0), + getV4X86ShuffleImm8ForMask(Mask, DAG)); + + // Replace N with its operand as we're going to combine that shuffle away. + DAG.ReplaceAllUsesWith(N, N.getOperand(0)); + + // Replace the combinable shuffle with the combined one, updating all users + // so that we re-evaluate the chain here. 
+ DCI.CombineTo(Old.getNode(), V, /*AddTo*/ true); + return true; +} + +/// \brief Try to combine x86 target specific shuffles. +static SDValue PerformTargetShuffleCombine(SDValue N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget *Subtarget) { + SDLoc DL(N); + MVT VT = N.getSimpleValueType(); + SmallVector<int, 4> Mask; + + switch (N.getOpcode()) { + case X86ISD::PSHUFD: + case X86ISD::PSHUFLW: + case X86ISD::PSHUFHW: + Mask = getPSHUFShuffleMask(N); + assert(Mask.size() == 4); + break; + default: + return SDValue(); + } + + // Nuke no-op shuffles that show up after combining. + if (isNoopShuffleMask(Mask)) + return DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true); + + // Look for simplifications involving one or two shuffle instructions. + SDValue V = N.getOperand(0); + switch (N.getOpcode()) { + default: + break; + case X86ISD::PSHUFLW: + case X86ISD::PSHUFHW: + assert(VT == MVT::v8i16); + (void)VT; + + if (combineRedundantHalfShuffle(N, Mask, DAG, DCI)) + return SDValue(); // We combined away this shuffle, so we're done. + + // See if this reduces to a PSHUFD which is no more expensive and can + // combine with more operations. + if (Mask[0] % 2 == 0 && Mask[2] % 2 == 0 && + areAdjacentMasksSequential(Mask)) { + int DMask[] = {-1, -1, -1, -1}; + int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2; + DMask[DOffset + 0] = DOffset + Mask[0] / 2; + DMask[DOffset + 1] = DOffset + Mask[2] / 2; + V = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V); + DCI.AddToWorklist(V.getNode()); + V = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V, + getV4X86ShuffleImm8ForMask(DMask, DAG)); + DCI.AddToWorklist(V.getNode()); + return DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V); + } + + // Look for shuffle patterns which can be implemented as a single unpack. + // FIXME: This doesn't handle the location of the PSHUFD generically, and + // only works when we have a PSHUFD followed by two half-shuffles. + if (Mask[0] == Mask[1] && Mask[2] == Mask[3] && + (V.getOpcode() == X86ISD::PSHUFLW || + V.getOpcode() == X86ISD::PSHUFHW) && + V.getOpcode() != N.getOpcode() && + V.hasOneUse()) { + SDValue D = V.getOperand(0); + while (D.getOpcode() == ISD::BITCAST && D.hasOneUse()) + D = D.getOperand(0); + if (D.getOpcode() == X86ISD::PSHUFD && D.hasOneUse()) { + SmallVector<int, 4> VMask = getPSHUFShuffleMask(V); + SmallVector<int, 4> DMask = getPSHUFShuffleMask(D); + int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 4; + int VOffset = V.getOpcode() == X86ISD::PSHUFLW ? 0 : 4; + int WordMask[8]; + for (int i = 0; i < 4; ++i) { + WordMask[i + NOffset] = Mask[i] + NOffset; + WordMask[i + VOffset] = VMask[i] + VOffset; + } + // Map the word mask through the DWord mask. + int MappedMask[8]; + for (int i = 0; i < 8; ++i) + MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2; + const int UnpackLoMask[] = {0, 0, 1, 1, 2, 2, 3, 3}; + const int UnpackHiMask[] = {4, 4, 5, 5, 6, 6, 7, 7}; + if (std::equal(std::begin(MappedMask), std::end(MappedMask), + std::begin(UnpackLoMask)) || + std::equal(std::begin(MappedMask), std::end(MappedMask), + std::begin(UnpackHiMask))) { + // We can replace all three shuffles with an unpack. + V = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, D.getOperand(0)); + DCI.AddToWorklist(V.getNode()); + return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL + : X86ISD::UNPCKH, + DL, MVT::v8i16, V, V); + } + } + } + + break; + + case X86ISD::PSHUFD: + if (combineRedundantDWordShuffle(N, Mask, DAG, DCI)) + return SDValue(); // We combined away this shuffle. 
+ + break; + } + + return SDValue(); +} + /// PerformShuffleCombine - Performs several different shuffle combines. static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget *Subtarget) { SDLoc dl(N); + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); EVT VT = N->getValueType(0); + // Canonicalize shuffles that perform 'addsub' on packed float vectors + // according to the rule: + // (shuffle (FADD A, B), (FSUB A, B), Mask) -> + // (shuffle (FSUB A, -B), (FADD A, -B), Mask) + // + // Where 'Mask' is: + // <0,5,2,7> -- for v4f32 and v4f64 shuffles; + // <0,3> -- for v2f64 shuffles; + // <0,9,2,11,4,13,6,15> -- for v8f32 shuffles. + // + // This helps pattern-matching more SSE3/AVX ADDSUB instructions + // during ISel stage. + if (N->getOpcode() == ISD::VECTOR_SHUFFLE && + ((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) || + (Subtarget->hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) && + N0->getOpcode() == ISD::FADD && N1->getOpcode() == ISD::FSUB && + // Operands to the FADD and FSUB must be the same. + ((N0->getOperand(0) == N1->getOperand(0) && + N0->getOperand(1) == N1->getOperand(1)) || + // FADD is commutable. See if by commuting the operands of the FADD + // we would still be able to match the operands of the FSUB dag node. + (N0->getOperand(1) == N1->getOperand(0) && + N0->getOperand(0) == N1->getOperand(1))) && + N0->getOperand(0)->getOpcode() != ISD::UNDEF && + N0->getOperand(1)->getOpcode() != ISD::UNDEF) { + + ShuffleVectorSDNode *SV = cast<ShuffleVectorSDNode>(N); + unsigned NumElts = VT.getVectorNumElements(); + ArrayRef<int> Mask = SV->getMask(); + bool CanFold = true; + + for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) + CanFold = Mask[i] == (int)((i & 1) ? i + NumElts : i); + + if (CanFold) { + SDValue Op0 = N1->getOperand(0); + SDValue Op1 = DAG.getNode(ISD::FNEG, dl, VT, N1->getOperand(1)); + SDValue Sub = DAG.getNode(ISD::FSUB, dl, VT, Op0, Op1); + SDValue Add = DAG.getNode(ISD::FADD, dl, VT, Op0, Op1); + return DAG.getVectorShuffle(VT, dl, Sub, Add, Mask); + } + } + // Don't create instructions with illegal types after legalize types has run. const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (!DCI.isBeforeLegalize() && !TLI.isTypeLegal(VT.getVectorElementType())) @@ -17490,6 +18809,57 @@ static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG, N->getOpcode() == ISD::VECTOR_SHUFFLE) return PerformShuffleCombine256(N, DAG, DCI, Subtarget); + // During Type Legalization, when promoting illegal vector types, + // the backend might introduce new shuffle dag nodes and bitcasts. + // + // This code performs the following transformation: + // fold: (shuffle (bitcast (BINOP A, B)), Undef, <Mask>) -> + // (shuffle (BINOP (bitcast A), (bitcast B)), Undef, <Mask>) + // + // We do this only if both the bitcast and the BINOP dag nodes have + // one use. Also, perform this transformation only if the new binary + // operation is legal. This is to avoid introducing dag nodes that + // potentially need to be further expanded (or custom lowered) into a + // less optimal sequence of dag nodes. 
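+  //
+  // For example, with SVT = v2i64 and VT = v4i32:
+  //   (v4i32 (shuffle (bitcast (v2i64 (add A, B))), undef, <0,2,u,u>))
+  //     -> (v4i32 (shuffle (add (bitcast A), (bitcast B)), undef, <0,2,u,u>))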
+ if (!DCI.isBeforeLegalize() && DCI.isBeforeLegalizeOps() && + N1.getOpcode() == ISD::UNDEF && N0.hasOneUse() && + N0.getOpcode() == ISD::BITCAST) { + SDValue BC0 = N0.getOperand(0); + EVT SVT = BC0.getValueType(); + unsigned Opcode = BC0.getOpcode(); + unsigned NumElts = VT.getVectorNumElements(); + + if (BC0.hasOneUse() && SVT.isVector() && + SVT.getVectorNumElements() * 2 == NumElts && + TLI.isOperationLegal(Opcode, VT)) { + bool CanFold = false; + switch (Opcode) { + default : break; + case ISD::ADD : + case ISD::FADD : + case ISD::SUB : + case ISD::FSUB : + case ISD::MUL : + case ISD::FMUL : + CanFold = true; + } + + unsigned SVTNumElts = SVT.getVectorNumElements(); + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); + for (unsigned i = 0, e = SVTNumElts; i != e && CanFold; ++i) + CanFold = SVOp->getMaskElt(i) == (int)(i * 2); + for (unsigned i = SVTNumElts, e = NumElts; i != e && CanFold; ++i) + CanFold = SVOp->getMaskElt(i) < 0; + + if (CanFold) { + SDValue BC00 = DAG.getNode(ISD::BITCAST, dl, VT, BC0.getOperand(0)); + SDValue BC01 = DAG.getNode(ISD::BITCAST, dl, VT, BC0.getOperand(1)); + SDValue NewBinOp = DAG.getNode(BC0.getOpcode(), dl, VT, BC00, BC01); + return DAG.getVectorShuffle(VT, dl, NewBinOp, N1, &SVOp->getMask()[0]); + } + } + } + // Only handle 128 wide vector from here on. if (!VT.is128BitVector()) return SDValue(); @@ -17501,7 +18871,18 @@ static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG, for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) Elts.push_back(getShuffleScalarElt(N, i, DAG, 0)); - return EltsFromConsecutiveLoads(VT, Elts, dl, DAG, true); + SDValue LD = EltsFromConsecutiveLoads(VT, Elts, dl, DAG, true); + if (LD.getNode()) + return LD; + + if (isTargetShuffle(N->getOpcode())) { + SDValue Shuffle = + PerformTargetShuffleCombine(SDValue(N, 0), DAG, DCI, Subtarget); + if (Shuffle.getNode()) + return Shuffle; + } + + return SDValue(); } /// PerformTruncateCombine - Converts truncate operation to @@ -18155,28 +19536,34 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, Other->getOpcode() == ISD::SUB && DAG.isEqualTo(OpRHS, CondRHS)) return DAG.getNode(X86ISD::SUBUS, DL, VT, OpLHS, OpRHS); - // If the RHS is a constant we have to reverse the const canonicalization. - // x > C-1 ? x+-C : 0 --> subus x, C - if (CC == ISD::SETUGT && Other->getOpcode() == ISD::ADD && - isSplatVector(CondRHS.getNode()) && isSplatVector(OpRHS.getNode())) { - APInt A = cast<ConstantSDNode>(OpRHS.getOperand(0))->getAPIntValue(); - if (CondRHS.getConstantOperandVal(0) == -A-1) - return DAG.getNode(X86ISD::SUBUS, DL, VT, OpLHS, - DAG.getConstant(-A, VT)); - } - - // Another special case: If C was a sign bit, the sub has been - // canonicalized into a xor. - // FIXME: Would it be better to use computeKnownBits to determine whether - // it's safe to decanonicalize the xor? - // x s< 0 ? x^C : 0 --> subus x, C - if (CC == ISD::SETLT && Other->getOpcode() == ISD::XOR && - ISD::isBuildVectorAllZeros(CondRHS.getNode()) && - isSplatVector(OpRHS.getNode())) { - APInt A = cast<ConstantSDNode>(OpRHS.getOperand(0))->getAPIntValue(); - if (A.isSignBit()) - return DAG.getNode(X86ISD::SUBUS, DL, VT, OpLHS, OpRHS); - } + if (auto *OpRHSBV = dyn_cast<BuildVectorSDNode>(OpRHS)) + if (auto *OpRHSConst = OpRHSBV->getConstantSplatNode()) { + if (auto *CondRHSBV = dyn_cast<BuildVectorSDNode>(CondRHS)) + if (auto *CondRHSConst = CondRHSBV->getConstantSplatNode()) + // If the RHS is a constant we have to reverse the const + // canonicalization. 
+ // x > C-1 ? x+-C : 0 --> subus x, C + if (CC == ISD::SETUGT && Other->getOpcode() == ISD::ADD && + CondRHSConst->getAPIntValue() == + (-OpRHSConst->getAPIntValue() - 1)) + return DAG.getNode( + X86ISD::SUBUS, DL, VT, OpLHS, + DAG.getConstant(-OpRHSConst->getAPIntValue(), VT)); + + // Another special case: If C was a sign bit, the sub has been + // canonicalized into a xor. + // FIXME: Would it be better to use computeKnownBits to determine + // whether it's safe to decanonicalize the xor? + // x s< 0 ? x^C : 0 --> subus x, C + if (CC == ISD::SETLT && Other->getOpcode() == ISD::XOR && + ISD::isBuildVectorAllZeros(CondRHS.getNode()) && + OpRHSConst->getAPIntValue().isSignBit()) + // Note that we have to rebuild the RHS constant here to ensure we + // don't rely on particular values of undef lanes. + return DAG.getNode( + X86ISD::SUBUS, DL, VT, OpLHS, + DAG.getConstant(OpRHSConst->getAPIntValue(), VT)); + } } } @@ -18743,6 +20130,8 @@ static SDValue PerformINTRINSIC_WO_CHAINCombine(SDNode *N, SelectionDAG &DAG, if (C->isAllOnesValue()) return Op1; } + + return SDValue(); } // Packed SSE2/AVX2 arithmetic shift immediate intrinsics. @@ -18882,16 +20271,15 @@ static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) { // vector operations in many cases. Also, on sandybridge ADD is faster than // shl. // (shl V, 1) -> add V,V - if (isSplatVector(N1.getNode())) { - assert(N0.getValueType().isVector() && "Invalid vector shift type"); - ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1->getOperand(0)); - // We shift all of the values by one. In many cases we do not have - // hardware support for this operation. This is better expressed as an ADD - // of two values. - if (N1C && (1 == N1C->getZExtValue())) { - return DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N0); + if (auto *N1BV = dyn_cast<BuildVectorSDNode>(N1)) + if (auto *N1SplatC = N1BV->getConstantSplatNode()) { + assert(N0.getValueType().isVector() && "Invalid vector shift type"); + // We shift all of the values by one. In many cases we do not have + // hardware support for this operation. This is better expressed as an ADD + // of two values. + if (N1SplatC->getZExtValue() == 1) + return DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N0); } - } return SDValue(); } @@ -18910,10 +20298,9 @@ static SDValue performShiftToAllZeros(SDNode *N, SelectionDAG &DAG, SDValue Amt = N->getOperand(1); SDLoc DL(N); - if (isSplatVector(Amt.getNode())) { - SDValue SclrAmt = Amt->getOperand(0); - if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(SclrAmt)) { - APInt ShiftAmt = C->getAPIntValue(); + if (auto *AmtBV = dyn_cast<BuildVectorSDNode>(Amt)) + if (auto *AmtSplat = AmtBV->getConstantSplatNode()) { + APInt ShiftAmt = AmtSplat->getAPIntValue(); unsigned MaxAmount = VT.getVectorElementType().getSizeInBits(); // SSE2/AVX2 logical shifts always return a vector of 0s @@ -18923,7 +20310,6 @@ static SDValue performShiftToAllZeros(SDNode *N, SelectionDAG &DAG, if (ShiftAmt.trunc(8).uge(MaxAmount)) return getZeroVector(VT, Subtarget, DAG, DL); } - } return SDValue(); } @@ -19117,9 +20503,10 @@ static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG, // The right side has to be a 'trunc' or a constant vector. 
bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE; - bool RHSConst = (isSplatVector(N1.getNode()) && - isa<ConstantSDNode>(N1->getOperand(0))); - if (!RHSTrunc && !RHSConst) + ConstantSDNode *RHSConstSplat = nullptr; + if (auto *RHSBV = dyn_cast<BuildVectorSDNode>(N1)) + RHSConstSplat = RHSBV->getConstantSplatNode(); + if (!RHSTrunc && !RHSConstSplat) return SDValue(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); @@ -19129,9 +20516,9 @@ static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG, // Set N0 and N1 to hold the inputs to the new wide operation. N0 = N0->getOperand(0); - if (RHSConst) { + if (RHSConstSplat) { N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT.getScalarType(), - N1->getOperand(0)); + SDValue(RHSConstSplat, 0)); SmallVector<SDValue, 8> C(WideVT.getVectorNumElements(), N1); N1 = DAG.getNode(ISD::BUILD_VECTOR, DL, WideVT, C); } else if (RHSTrunc) { @@ -19277,12 +20664,9 @@ static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG, unsigned EltBits = MaskVT.getVectorElementType().getSizeInBits(); unsigned SraAmt = ~0; if (Mask.getOpcode() == ISD::SRA) { - SDValue Amt = Mask.getOperand(1); - if (isSplatVector(Amt.getNode())) { - SDValue SclrAmt = Amt->getOperand(0); - if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(SclrAmt)) - SraAmt = C->getZExtValue(); - } + if (auto *AmtBV = dyn_cast<BuildVectorSDNode>(Mask.getOperand(1))) + if (auto *AmtConst = AmtBV->getConstantSplatNode()) + SraAmt = AmtConst->getZExtValue(); } else if (Mask.getOpcode() == X86ISD::VSRAI) { SDValue SraC = Mask.getOperand(1); SraAmt = cast<ConstantSDNode>(SraC)->getZExtValue(); @@ -20642,6 +22026,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, return PerformINTRINSIC_WO_CHAINCombine(N, DAG, Subtarget); case X86ISD::INSERTPS: return PerformINSERTPSCombine(N, DAG, Subtarget); + case ISD::BUILD_VECTOR: return PerformBUILD_VECTORCombine(N, DAG, Subtarget); } return SDValue(); @@ -21146,8 +22531,8 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op, const GlobalValue *GV = GA->getGlobal(); // If we require an extra load to get this address, as in PIC mode, we // can't accept it. - if (isGlobalStubReference(Subtarget->ClassifyGlobalReference(GV, - getTargetMachine()))) + if (isGlobalStubReference( + Subtarget->ClassifyGlobalReference(GV, DAG.getTarget()))) return; Result = DAG.getTargetGlobalAddress(GV, SDLoc(Op), @@ -21425,3 +22810,7 @@ int X86TargetLowering::getScalingFactorCost(const AddrMode &AM, return AM.Scale != 0; return -1; } + +bool X86TargetLowering::isTargetFTOL() const { + return Subtarget->isTargetKnownWindowsMSVC() && !Subtarget->is64Bit(); +} diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h index 9f51b53..c8cdce7 100644 --- a/lib/Target/X86/X86ISelLowering.h +++ b/lib/Target/X86/X86ISelLowering.h @@ -15,13 +15,13 @@ #ifndef X86ISELLOWERING_H #define X86ISELLOWERING_H -#include "X86Subtarget.h" #include "llvm/CodeGen/CallingConvLower.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/Target/TargetLowering.h" #include "llvm/Target/TargetOptions.h" namespace llvm { + class X86Subtarget; class X86TargetMachine; namespace X86ISD { @@ -86,6 +86,9 @@ namespace llvm { /// X86 Read Time-Stamp Counter and Processor ID. RDTSCP_DAG, + /// X86 Read Performance Monitoring Counters. + RDPMC_DAG, + /// X86 compare and logical compare instructions. CMP, COMI, UCOMI, @@ -315,6 +318,8 @@ namespace llvm { KORTEST, // Several flavors of instructions with vector shuffle behaviors. 
+ PACKSS, + PACKUS, PALIGNR, PSHUFD, PSHUFHW, @@ -400,23 +405,8 @@ namespace llvm { // XTEST - Test if in transactional execution. XTEST, - // ATOMADD64_DAG, ATOMSUB64_DAG, ATOMOR64_DAG, ATOMAND64_DAG, - // ATOMXOR64_DAG, ATOMNAND64_DAG, ATOMSWAP64_DAG - - // Atomic 64-bit binary operations. - ATOMADD64_DAG = ISD::FIRST_TARGET_MEMORY_OPCODE, - ATOMSUB64_DAG, - ATOMOR64_DAG, - ATOMXOR64_DAG, - ATOMAND64_DAG, - ATOMNAND64_DAG, - ATOMMAX64_DAG, - ATOMMIN64_DAG, - ATOMUMAX64_DAG, - ATOMUMIN64_DAG, - ATOMSWAP64_DAG, - // LCMPXCHG_DAG, LCMPXCHG8_DAG, LCMPXCHG16_DAG - Compare and swap. - LCMPXCHG_DAG, + LCMPXCHG_DAG = ISD::FIRST_TARGET_MEMORY_OPCODE, LCMPXCHG8_DAG, LCMPXCHG16_DAG, @@ -766,9 +756,7 @@ namespace llvm { /// isTargetFTOL - Return true if the target uses the MSVC _ftol2 routine /// for fptoui. - bool isTargetFTOL() const { - return Subtarget->isTargetKnownWindowsMSVC() && !Subtarget->is64Bit(); - } + bool isTargetFTOL() const; /// isIntegerTypeFTOL - Return true if the MSVC _ftol2 routine should be /// used for fptoui to the given type. @@ -808,6 +796,9 @@ namespace llvm { /// \brief Reset the operation actions based on target options. void resetOperationActions() override; + /// \brief Customize the preferred legalization strategy for certain types. + LegalizeTypeAction getPreferredVectorAction(EVT VT) const override; + protected: std::pair<const TargetRegisterClass*, uint8_t> findRepresentativeClass(MVT VT) const override; diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td index 37bcc52..41e900e 100644 --- a/lib/Target/X86/X86InstrAVX512.td +++ b/lib/Target/X86/X86InstrAVX512.td @@ -476,6 +476,28 @@ defm VPBROADCASTQZ : avx512_int_broadcast_rm<0x59, "vpbroadcastq", i64mem, loadi64, VR512, v8i64, v2i64, VK8WM>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>; +multiclass avx512_int_subvec_broadcast_rm<bits<8> opc, string OpcodeStr, + X86MemOperand x86memop, PatFrag ld_frag, + RegisterClass KRC> { + let mayLoad = 1 in { + def rm : AVX5128I<opc, MRMSrcMem, (outs VR512:$dst), (ins x86memop:$src), + !strconcat(OpcodeStr, " \t{$src, $dst|$dst, $src}"), + []>, EVEX; + def krm : AVX5128I<opc, MRMSrcMem, (outs VR512:$dst), (ins KRC:$mask, + x86memop:$src), + !strconcat(OpcodeStr, + " \t{$src, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src}"), + []>, EVEX, EVEX_KZ; + } +} + +defm VBROADCASTI32X4 : avx512_int_subvec_broadcast_rm<0x5a, "vbroadcasti32x4", + i128mem, loadv2i64, VK16WM>, + EVEX_V512, EVEX_CD8<32, CD8VT4>; +defm VBROADCASTI64X4 : avx512_int_subvec_broadcast_rm<0x5b, "vbroadcasti64x4", + i256mem, loadv4i64, VK16WM>, VEX_W, + EVEX_V512, EVEX_CD8<64, CD8VT4>; + def : Pat<(v16i32 (int_x86_avx512_pbroadcastd_512 (v4i32 VR128X:$src))), (VPBROADCASTDZrr VR128X:$src)>; def : Pat<(v8i64 (int_x86_avx512_pbroadcastq_512 (v2i64 VR128X:$src))), @@ -517,10 +539,12 @@ def rr : AVX512XS8I<opc, MRMDestReg, (outs DstRC:$dst), (ins KRC:$src), []>, EVEX; } +let Predicates = [HasCDI] in { defm VPBROADCASTMW2D : avx512_mask_broadcast<0x3A, "vpbroadcastmw2d", VR512, VK16, v16i32, v16i1>, EVEX_V512; defm VPBROADCASTMB2Q : avx512_mask_broadcast<0x2A, "vpbroadcastmb2q", VR512, VK8, v8i64, v8i1>, EVEX_V512, VEX_W; +} //===----------------------------------------------------------------------===// // AVX-512 - VPERM @@ -585,7 +609,7 @@ defm VPERMPDZ : avx512_perm<0x16, "vpermpd", VR512, memopv8f64, f512mem, // -- VPERM2I - 3 source operands form -- multiclass avx512_perm_3src<bits<8> opc, string OpcodeStr, RegisterClass RC, PatFrag mem_frag, X86MemOperand x86memop, - SDNode OpNode, 
ValueType OpVT> { + SDNode OpNode, ValueType OpVT, RegisterClass KRC> { let Constraints = "$src1 = $dst" in { def rr : AVX5128I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2, RC:$src3), @@ -595,48 +619,107 @@ let Constraints = "$src1 = $dst" in { (OpVT (OpNode RC:$src1, RC:$src2, RC:$src3)))]>, EVEX_4V; + def rrk : AVX5128I<opc, MRMSrcReg, (outs RC:$dst), + (ins RC:$src1, KRC:$mask, RC:$src2, RC:$src3), + !strconcat(OpcodeStr, + " \t{$src3, $src2, $dst {${mask}}|" + "$dst {${mask}}, $src2, $src3}"), + [(set RC:$dst, (OpVT (vselect KRC:$mask, + (OpNode RC:$src1, RC:$src2, + RC:$src3), + RC:$src1)))]>, + EVEX_4V, EVEX_K; + + let AddedComplexity = 30 in // Prefer over VMOV*rrkz Pat<> + def rrkz : AVX5128I<opc, MRMSrcReg, (outs RC:$dst), + (ins RC:$src1, KRC:$mask, RC:$src2, RC:$src3), + !strconcat(OpcodeStr, + " \t{$src3, $src2, $dst {${mask}} {z} |", + "$dst {${mask}} {z}, $src2, $src3}"), + [(set RC:$dst, (OpVT (vselect KRC:$mask, + (OpNode RC:$src1, RC:$src2, + RC:$src3), + (OpVT (bitconvert + (v16i32 immAllZerosV))))))]>, + EVEX_4V, EVEX_KZ; + def rm : AVX5128I<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, RC:$src2, x86memop:$src3), !strconcat(OpcodeStr, " \t{$src3, $src2, $dst|$dst, $src2, $src3}"), [(set RC:$dst, - (OpVT (OpNode RC:$src1, RC:$src2, + (OpVT (OpNode RC:$src1, RC:$src2, (mem_frag addr:$src3))))]>, EVEX_4V; + + def rmk : AVX5128I<opc, MRMSrcMem, (outs RC:$dst), + (ins RC:$src1, KRC:$mask, RC:$src2, x86memop:$src3), + !strconcat(OpcodeStr, + " \t{$src3, $src2, $dst {${mask}}|" + "$dst {${mask}}, $src2, $src3}"), + [(set RC:$dst, + (OpVT (vselect KRC:$mask, + (OpNode RC:$src1, RC:$src2, + (mem_frag addr:$src3)), + RC:$src1)))]>, + EVEX_4V, EVEX_K; + + let AddedComplexity = 10 in // Prefer over the rrkz variant + def rmkz : AVX5128I<opc, MRMSrcMem, (outs RC:$dst), + (ins RC:$src1, KRC:$mask, RC:$src2, x86memop:$src3), + !strconcat(OpcodeStr, + " \t{$src3, $src2, $dst {${mask}} {z}|" + "$dst {${mask}} {z}, $src2, $src3}"), + [(set RC:$dst, + (OpVT (vselect KRC:$mask, + (OpNode RC:$src1, RC:$src2, + (mem_frag addr:$src3)), + (OpVT (bitconvert + (v16i32 immAllZerosV))))))]>, + EVEX_4V, EVEX_KZ; } } -defm VPERMI2D : avx512_perm_3src<0x76, "vpermi2d", VR512, memopv16i32, i512mem, - X86VPermiv3, v16i32>, EVEX_V512, EVEX_CD8<32, CD8VF>; -defm VPERMI2Q : avx512_perm_3src<0x76, "vpermi2q", VR512, memopv8i64, i512mem, - X86VPermiv3, v8i64>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; -defm VPERMI2PS : avx512_perm_3src<0x77, "vpermi2ps", VR512, memopv16f32, i512mem, - X86VPermiv3, v16f32>, EVEX_V512, EVEX_CD8<32, CD8VF>; -defm VPERMI2PD : avx512_perm_3src<0x77, "vpermi2pd", VR512, memopv8f64, i512mem, - X86VPermiv3, v8f64>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; - -defm VPERMT2D : avx512_perm_3src<0x7E, "vpermt2d", VR512, memopv16i32, i512mem, - X86VPermv3, v16i32>, EVEX_V512, EVEX_CD8<32, CD8VF>; -defm VPERMT2Q : avx512_perm_3src<0x7E, "vpermt2q", VR512, memopv8i64, i512mem, - X86VPermv3, v8i64>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; -defm VPERMT2PS : avx512_perm_3src<0x7F, "vpermt2ps", VR512, memopv16f32, i512mem, - X86VPermv3, v16f32>, EVEX_V512, EVEX_CD8<32, CD8VF>; -defm VPERMT2PD : avx512_perm_3src<0x7F, "vpermt2pd", VR512, memopv8f64, i512mem, - X86VPermv3, v8f64>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; - -def : Pat<(v16f32 (int_x86_avx512_mask_vpermt_ps_512 (v16i32 VR512:$idx), - (v16f32 VR512:$src1), (v16f32 VR512:$src2), (i16 -1))), - (VPERMT2PSrr VR512:$src1, VR512:$idx, VR512:$src2)>; - -def : Pat<(v16i32 (int_x86_avx512_mask_vpermt_d_512 (v16i32 VR512:$idx), - (v16i32 
VR512:$src1), (v16i32 VR512:$src2), (i16 -1))), - (VPERMT2Drr VR512:$src1, VR512:$idx, VR512:$src2)>; - -def : Pat<(v8f64 (int_x86_avx512_mask_vpermt_pd_512 (v8i64 VR512:$idx), - (v8f64 VR512:$src1), (v8f64 VR512:$src2), (i8 -1))), - (VPERMT2PDrr VR512:$src1, VR512:$idx, VR512:$src2)>; - -def : Pat<(v8i64 (int_x86_avx512_mask_vpermt_q_512 (v8i64 VR512:$idx), - (v8i64 VR512:$src1), (v8i64 VR512:$src2), (i8 -1))), - (VPERMT2Qrr VR512:$src1, VR512:$idx, VR512:$src2)>; +defm VPERMI2D : avx512_perm_3src<0x76, "vpermi2d", VR512, memopv16i32, + i512mem, X86VPermiv3, v16i32, VK16WM>, + EVEX_V512, EVEX_CD8<32, CD8VF>; +defm VPERMI2Q : avx512_perm_3src<0x76, "vpermi2q", VR512, memopv8i64, + i512mem, X86VPermiv3, v8i64, VK8WM>, + EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; +defm VPERMI2PS : avx512_perm_3src<0x77, "vpermi2ps", VR512, memopv16f32, + i512mem, X86VPermiv3, v16f32, VK16WM>, + EVEX_V512, EVEX_CD8<32, CD8VF>; +defm VPERMI2PD : avx512_perm_3src<0x77, "vpermi2pd", VR512, memopv8f64, + i512mem, X86VPermiv3, v8f64, VK8WM>, + EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; + +multiclass avx512_perm_table_3src<bits<8> opc, string Suffix, RegisterClass RC, + PatFrag mem_frag, X86MemOperand x86memop, + SDNode OpNode, ValueType OpVT, RegisterClass KRC, + ValueType MaskVT, RegisterClass MRC> : + avx512_perm_3src<opc, "vpermt2"##Suffix, RC, mem_frag, x86memop, OpNode, + OpVT, KRC> { + def : Pat<(OpVT (!cast<Intrinsic>("int_x86_avx512_mask_vpermt_"##Suffix##"_512") + VR512:$idx, VR512:$src1, VR512:$src2, -1)), + (!cast<Instruction>(NAME#rr) VR512:$src1, VR512:$idx, VR512:$src2)>; + + def : Pat<(OpVT (!cast<Intrinsic>("int_x86_avx512_mask_vpermt_"##Suffix##"_512") + VR512:$idx, VR512:$src1, VR512:$src2, MRC:$mask)), + (!cast<Instruction>(NAME#rrk) VR512:$src1, + (MaskVT (COPY_TO_REGCLASS MRC:$mask, KRC)), VR512:$idx, VR512:$src2)>; +} + +defm VPERMT2D : avx512_perm_table_3src<0x7E, "d", VR512, memopv16i32, i512mem, + X86VPermv3, v16i32, VK16WM, v16i1, GR16>, + EVEX_V512, EVEX_CD8<32, CD8VF>; +defm VPERMT2Q : avx512_perm_table_3src<0x7E, "q", VR512, memopv8i64, i512mem, + X86VPermv3, v8i64, VK8WM, v8i1, GR8>, + EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; +defm VPERMT2PS : avx512_perm_table_3src<0x7F, "ps", VR512, memopv16f32, i512mem, + X86VPermv3, v16f32, VK16WM, v16i1, GR16>, + EVEX_V512, EVEX_CD8<32, CD8VF>; +defm VPERMT2PD : avx512_perm_table_3src<0x7F, "pd", VR512, memopv8f64, i512mem, + X86VPermv3, v8f64, VK8WM, v8i1, GR8>, + EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; + //===----------------------------------------------------------------------===// // AVX-512 - BLEND using mask // @@ -790,52 +873,61 @@ def : Pat<(v8i1 (X86pcmpeqm (v8i32 VR256X:$src1), (v8i32 VR256X:$src2))), (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src1, sub_ymm)), (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src2, sub_ymm))), VK8)>; -multiclass avx512_icmp_cc<bits<8> opc, RegisterClass KRC, +multiclass avx512_icmp_cc<bits<8> opc, RegisterClass WMRC, RegisterClass KRC, RegisterClass RC, X86MemOperand x86memop, PatFrag memop_frag, - SDNode OpNode, ValueType vt, Operand CC, string asm, - string asm_alt> { + SDNode OpNode, ValueType vt, Operand CC, string Suffix> { def rri : AVX512AIi8<opc, MRMSrcReg, - (outs KRC:$dst), (ins RC:$src1, RC:$src2, CC:$cc), asm, + (outs KRC:$dst), (ins RC:$src1, RC:$src2, CC:$cc), + !strconcat("vpcmp${cc}", Suffix, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set KRC:$dst, (OpNode (vt RC:$src1), (vt RC:$src2), imm:$cc))], IIC_SSE_ALU_F32P_RR>, EVEX_4V; def rmi : AVX512AIi8<opc, MRMSrcMem, - (outs KRC:$dst), (ins RC:$src1, 
x86memop:$src2, CC:$cc), asm, + (outs KRC:$dst), (ins RC:$src1, x86memop:$src2, CC:$cc), + !strconcat("vpcmp${cc}", Suffix, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set KRC:$dst, (OpNode (vt RC:$src1), (memop_frag addr:$src2), imm:$cc))], IIC_SSE_ALU_F32P_RM>, EVEX_4V; // Accept explicit immediate argument form instead of comparison code. let isAsmParserOnly = 1, hasSideEffects = 0 in { def rri_alt : AVX512AIi8<opc, MRMSrcReg, (outs KRC:$dst), (ins RC:$src1, RC:$src2, i8imm:$cc), - asm_alt, [], IIC_SSE_ALU_F32P_RR>, EVEX_4V; + !strconcat("vpcmp", Suffix, + "\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}"), + [], IIC_SSE_ALU_F32P_RR>, EVEX_4V; + def rrik_alt : AVX512AIi8<opc, MRMSrcReg, + (outs KRC:$dst), (ins WMRC:$mask, RC:$src1, RC:$src2, i8imm:$cc), + !strconcat("vpcmp", Suffix, + "\t{$cc, $src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2, $cc}"), + [], IIC_SSE_ALU_F32P_RR>, EVEX_4V, EVEX_K; def rmi_alt : AVX512AIi8<opc, MRMSrcMem, (outs KRC:$dst), (ins RC:$src1, x86memop:$src2, i8imm:$cc), - asm_alt, [], IIC_SSE_ALU_F32P_RM>, EVEX_4V; + !strconcat("vpcmp", Suffix, + "\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}"), + [], IIC_SSE_ALU_F32P_RM>, EVEX_4V; + def rmik_alt : AVX512AIi8<opc, MRMSrcMem, + (outs KRC:$dst), (ins WMRC:$mask, RC:$src1, x86memop:$src2, i8imm:$cc), + !strconcat("vpcmp", Suffix, + "\t{$cc, $src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2, $cc}"), + [], IIC_SSE_ALU_F32P_RM>, EVEX_4V, EVEX_K; } } -defm VPCMPDZ : avx512_icmp_cc<0x1F, VK16, VR512, i512mem, memopv16i32, - X86cmpm, v16i32, AVXCC, - "vpcmp${cc}d\t{$src2, $src1, $dst|$dst, $src1, $src2}", - "vpcmpd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}">, - EVEX_V512, EVEX_CD8<32, CD8VF>; -defm VPCMPUDZ : avx512_icmp_cc<0x1E, VK16, VR512, i512mem, memopv16i32, - X86cmpmu, v16i32, AVXCC, - "vpcmp${cc}ud\t{$src2, $src1, $dst|$dst, $src1, $src2}", - "vpcmpud\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}">, - EVEX_V512, EVEX_CD8<32, CD8VF>; - -defm VPCMPQZ : avx512_icmp_cc<0x1F, VK8, VR512, i512mem, memopv8i64, - X86cmpm, v8i64, AVXCC, - "vpcmp${cc}q\t{$src2, $src1, $dst|$dst, $src1, $src2}", - "vpcmpq\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}">, - VEX_W, EVEX_V512, EVEX_CD8<64, CD8VF>; -defm VPCMPUQZ : avx512_icmp_cc<0x1E, VK8, VR512, i512mem, memopv8i64, - X86cmpmu, v8i64, AVXCC, - "vpcmp${cc}uq\t{$src2, $src1, $dst|$dst, $src1, $src2}", - "vpcmpuq\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}">, - VEX_W, EVEX_V512, EVEX_CD8<64, CD8VF>; - -// avx512_cmp_packed - sse 1 & 2 compare packed instructions +defm VPCMPDZ : avx512_icmp_cc<0x1F, VK16WM, VK16, VR512, i512mem, memopv16i32, + X86cmpm, v16i32, AVXCC, "d">, + EVEX_V512, EVEX_CD8<32, CD8VF>; +defm VPCMPUDZ : avx512_icmp_cc<0x1E, VK16WM, VK16, VR512, i512mem, memopv16i32, + X86cmpmu, v16i32, AVXCC, "ud">, + EVEX_V512, EVEX_CD8<32, CD8VF>; + +defm VPCMPQZ : avx512_icmp_cc<0x1F, VK8WM, VK8, VR512, i512mem, memopv8i64, + X86cmpm, v8i64, AVXCC, "q">, + VEX_W, EVEX_V512, EVEX_CD8<64, CD8VF>; +defm VPCMPUQZ : avx512_icmp_cc<0x1E, VK8WM, VK8, VR512, i512mem, memopv8i64, + X86cmpmu, v8i64, AVXCC, "uq">, + VEX_W, EVEX_V512, EVEX_CD8<64, CD8VF>; + +// avx512_cmp_packed - compare packed instructions multiclass avx512_cmp_packed<RegisterClass KRC, RegisterClass RC, X86MemOperand x86memop, ValueType vt, string suffix, Domain d> { @@ -859,11 +951,11 @@ multiclass avx512_cmp_packed<RegisterClass KRC, RegisterClass RC, // Accept explicit immediate argument form instead of comparison code. 
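A note on the avx512_icmp_cc rewrite above: instead of threading two fully spelled-out asm strings through every instantiation, the multiclass now synthesizes the "vpcmp${cc}" mnemonic from a Suffix, and it gains write-masked (EVEX_K) parser-only forms. For readers without the instruction semantics at hand, VPCMP compares each element pair under a predicate immediate and produces one bit per element in a k-register. Below is a minimal scalar sketch of the 16 x i32 case; the function name is ours, and the enum mirrors the hardware predicate encoding (3 and 7 are the always-false/always-true codes):

    #include <cstdint>

    // Scalar model of VPCMPD zmm, zmm, imm8: one mask bit per dword element.
    enum IntPred { EQ = 0, LT = 1, LE = 2, FALSE_ = 3, NE = 4, NLT = 5, NLE = 6, TRUE_ = 7 };

    static uint16_t vpcmpdModel(const int32_t a[16], const int32_t b[16], unsigned pred) {
      uint16_t k = 0;
      for (unsigned i = 0; i < 16; ++i) {
        bool bit;
        switch (pred) {
        case EQ:     bit = a[i] == b[i]; break;
        case LT:     bit = a[i] <  b[i]; break;
        case LE:     bit = a[i] <= b[i]; break;
        case FALSE_: bit = false;        break;
        case NE:     bit = a[i] != b[i]; break;
        case NLT:    bit = a[i] >= b[i]; break;
        case NLE:    bit = a[i] >  b[i]; break;
        default:     bit = true;         break; // TRUE_
        }
        if (bit) k |= (uint16_t)(1u << i);
      }
      return k;
    }

The *_alt definitions take exactly this immediate as an explicit operand (optionally with a {k} write-mask) rather than folding it into the mnemonic, which is what the trailing comment above means by "explicit immediate argument form"; the hunk resuming below applies the same treatment to the packed FP vcmp forms, additionally fixing their destination class from RC to KRC.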
let isAsmParserOnly = 1, hasSideEffects = 0 in { def rri_alt : AVX512PIi8<0xC2, MRMSrcReg, - (outs RC:$dst), (ins RC:$src1, RC:$src2, i8imm:$cc), + (outs KRC:$dst), (ins RC:$src1, RC:$src2, i8imm:$cc), !strconcat("vcmp", suffix, " \t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}"), [], d>; def rmi_alt : AVX512PIi8<0xC2, MRMSrcMem, - (outs RC:$dst), (ins RC:$src1, x86memop:$src2, i8imm:$cc), + (outs KRC:$dst), (ins RC:$src1, x86memop:$src2, i8imm:$cc), !strconcat("vcmp", suffix, " \t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}"), [], d>; } @@ -1788,6 +1880,46 @@ def : Pat<(v8i64 (X86Vinsert undef, GR64:$src2, (iPTR 0))), (SUBREG_TO_REG (i32 0), (VMOV64toPQIZrr GR64:$src2), sub_xmm)>; //===----------------------------------------------------------------------===// +// AVX-512 - Non-temporals +//===----------------------------------------------------------------------===// + +def VMOVNTDQAZrm : AVX5128I<0x2A, MRMSrcMem, (outs VR512:$dst), + (ins i512mem:$src), + "vmovntdqa\t{$src, $dst|$dst, $src}", + [(set VR512:$dst, + (int_x86_avx512_movntdqa addr:$src))]>, + EVEX, EVEX_V512, EVEX_CD8<64, CD8VF>; + +// Prefer non-temporal over temporal versions +let AddedComplexity = 400, SchedRW = [WriteStore] in { + +def VMOVNTPSZmr : AVX512PSI<0x2B, MRMDestMem, (outs), + (ins f512mem:$dst, VR512:$src), + "vmovntps\t{$src, $dst|$dst, $src}", + [(alignednontemporalstore (v16f32 VR512:$src), + addr:$dst)], + IIC_SSE_MOVNT>, + EVEX, EVEX_V512, EVEX_CD8<32, CD8VF>; + +def VMOVNTPDZmr : AVX512PDI<0x2B, MRMDestMem, (outs), + (ins f512mem:$dst, VR512:$src), + "vmovntpd\t{$src, $dst|$dst, $src}", + [(alignednontemporalstore (v8f64 VR512:$src), + addr:$dst)], + IIC_SSE_MOVNT>, + EVEX, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; + + +def VMOVNTDQZmr : AVX512BI<0xE7, MRMDestMem, (outs), + (ins i512mem:$dst, VR512:$src), + "vmovntdq\t{$src, $dst|$dst, $src}", + [(alignednontemporalstore (v8i64 VR512:$src), + addr:$dst)], + IIC_SSE_MOVNT>, + EVEX, EVEX_V512, EVEX_CD8<64, CD8VF>; +} + +//===----------------------------------------------------------------------===// // AVX-512 - Integer arithmetic // multiclass avx512_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, @@ -3161,6 +3293,10 @@ def : Pat<(v4f32 (uint_to_fp (v4i32 VR128X:$src1))), (EXTRACT_SUBREG (v16f32 (VCVTUDQ2PSZrr (v16i32 (SUBREG_TO_REG (i32 0), VR128X:$src1, sub_xmm)))), sub_xmm)>; +def : Pat<(v4f64 (uint_to_fp (v4i32 VR128X:$src1))), + (EXTRACT_SUBREG (v8f64 (VCVTUDQ2PDZrr + (v8i32 (SUBREG_TO_REG (i32 0), VR128X:$src1, sub_xmm)))), sub_ymm)>; + def : Pat<(v16f32 (int_x86_avx512_mask_cvtdq2ps_512 (v16i32 VR512:$src), (bc_v16f32 (v16i32 immAllZerosV)), (i16 -1), imm:$rc)), (VCVTDQ2PSZrrb VR512:$src, imm:$rc)>; @@ -4343,6 +4479,37 @@ def : Pat<(int_x86_avx512_mask_conflict_q_512 VR512:$src2, VR512:$src1, (VPCONFLICTQrrk VR512:$src1, (v8i1 (COPY_TO_REGCLASS GR8:$mask, VK8WM)), VR512:$src2)>; +let Predicates = [HasCDI] in { +defm VPLZCNTD : avx512_conflict<0x44, "vplzcntd", VR512, VK16WM, + i512mem, i32mem, "{1to16}">, + EVEX_V512, EVEX_CD8<32, CD8VF>; + + +defm VPLZCNTQ : avx512_conflict<0x44, "vplzcntq", VR512, VK8WM, + i512mem, i64mem, "{1to8}">, + EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; + +} + +def : Pat<(int_x86_avx512_mask_lzcnt_d_512 VR512:$src2, VR512:$src1, + GR16:$mask), + (VPLZCNTDrrk VR512:$src1, + (v16i1 (COPY_TO_REGCLASS GR16:$mask, VK16WM)), VR512:$src2)>; + +def : Pat<(int_x86_avx512_mask_lzcnt_q_512 VR512:$src2, VR512:$src1, + GR8:$mask), + (VPLZCNTQrrk VR512:$src1, + (v8i1 (COPY_TO_REGCLASS GR8:$mask, VK8WM)), VR512:$src2)>; + +def 
: Pat<(v16i32 (ctlz (memopv16i32 addr:$src))), + (VPLZCNTDrm addr:$src)>; +def : Pat<(v16i32 (ctlz (v16i32 VR512:$src))), + (VPLZCNTDrr VR512:$src)>; +def : Pat<(v8i64 (ctlz (memopv8i64 addr:$src))), + (VPLZCNTQrm addr:$src)>; +def : Pat<(v8i64 (ctlz (v8i64 VR512:$src))), + (VPLZCNTQrr VR512:$src)>; + def : Pat<(store (i1 -1), addr:$dst), (MOV8mi addr:$dst, (i8 1))>; def : Pat<(store (i1 1), addr:$dst), (MOV8mi addr:$dst, (i8 1))>; def : Pat<(store (i1 0), addr:$dst), (MOV8mi addr:$dst, (i8 0))>; diff --git a/lib/Target/X86/X86InstrArithmetic.td b/lib/Target/X86/X86InstrArithmetic.td index 368e14b..f2574cc 100644 --- a/lib/Target/X86/X86InstrArithmetic.td +++ b/lib/Target/X86/X86InstrArithmetic.td @@ -1278,8 +1278,10 @@ let isCompare = 1 in { def TEST64mi32 : BinOpMI_F<"test", Xi64, X86testpat, MRM0m, 0xF6>; // When testing the result of EXTRACT_SUBREG sub_8bit_hi, make sure the - // register class is constrained to GR8_NOREX. - let isPseudo = 1 in + // register class is constrained to GR8_NOREX. This pseudo is explicitly + // marked side-effect free, since it doesn't have an isel pattern like + // other test instructions. + let isPseudo = 1, hasSideEffects = 0 in def TEST8ri_NOREX : I<0, Pseudo, (outs), (ins GR8_NOREX:$src, i8imm:$mask), "", [], IIC_BIN_NONMEM>, Sched<[WriteALU]>; } // Defs = [EFLAGS] diff --git a/lib/Target/X86/X86InstrCompiler.td b/lib/Target/X86/X86InstrCompiler.td index 34d8fb9..ca4f608 100644 --- a/lib/Target/X86/X86InstrCompiler.td +++ b/lib/Target/X86/X86InstrCompiler.td @@ -110,7 +110,7 @@ let Defs = [EAX, ESP, EFLAGS], Uses = [ESP] in // When using segmented stacks these are lowered into instructions which first // check if the current stacklet has enough free memory. If it does, memory is -// allocated by bumping the stack pointer. Otherwise memory is allocated from +// allocated by bumping the stack pointer. Otherwise memory is allocated from // the heap. let Defs = [EAX, ESP, EFLAGS], Uses = [ESP] in @@ -197,6 +197,26 @@ let isBranch = 1, isTerminator = 1, isCodeGenOnly = 1 in { } //===----------------------------------------------------------------------===// +// Pseudo instructions used by unwind info. +// +let isPseudo = 1 in { + def SEH_PushReg : I<0, Pseudo, (outs), (ins i32imm:$reg), + "#SEH_PushReg $reg", []>; + def SEH_SaveReg : I<0, Pseudo, (outs), (ins i32imm:$reg, i32imm:$dst), + "#SEH_SaveReg $reg, $dst", []>; + def SEH_SaveXMM : I<0, Pseudo, (outs), (ins i32imm:$reg, i32imm:$dst), + "#SEH_SaveXMM $reg, $dst", []>; + def SEH_StackAlloc : I<0, Pseudo, (outs), (ins i32imm:$size), + "#SEH_StackAlloc $size", []>; + def SEH_SetFrame : I<0, Pseudo, (outs), (ins i32imm:$reg, i32imm:$offset), + "#SEH_SetFrame $reg, $offset", []>; + def SEH_PushFrame : I<0, Pseudo, (outs), (ins i1imm:$mode), + "#SEH_PushFrame $mode", []>; + def SEH_EndPrologue : I<0, Pseudo, (outs), (ins), + "#SEH_EndPrologue", []>; +} + +//===----------------------------------------------------------------------===// // Pseudo instructions used by segmented stacks. 
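One remark before the next hunk: the SEH_* pseudos defined just above carry Win64 prologue facts (register pushes, stack allocation, frame pointer establishment) from frame lowering to the AsmPrinter, which lowers each one to the matching .seh_* assembler directive; the X86MCInstLower.cpp hunk later in this diff is the consumer side. As a rough, self-contained illustration of that mapping (a toy emitter, not LLVM's API):

    #include <cstdio>

    // Toy consumer: each event mirrors one SEH_* pseudo above and prints the
    // directive the real AsmPrinter emits for it through the MCStreamer.
    struct SehEvent {
      enum Kind { PushReg, StackAlloc, SetFrame, EndPrologue } kind;
      const char *reg; // register name, when meaningful
      int imm;         // size or offset, when meaningful
    };

    static void emitSehDirectives(const SehEvent *ev, int n) {
      for (int i = 0; i < n; ++i) {
        switch (ev[i].kind) {
        case SehEvent::PushReg:     std::printf(".seh_pushreg %s\n", ev[i].reg); break;
        case SehEvent::StackAlloc:  std::printf(".seh_stackalloc %d\n", ev[i].imm); break;
        case SehEvent::SetFrame:    std::printf(".seh_setframe %s, %d\n", ev[i].reg, ev[i].imm); break;
        case SehEvent::EndPrologue: std::printf(".seh_endprologue\n"); break;
        }
      }
    }

    int main() {
      const SehEvent prologue[] = { { SehEvent::PushReg, "rbp", 0 },
                                    { SehEvent::StackAlloc, nullptr, 32 },
                                    { SehEvent::SetFrame, "rbp", 0 },
                                    { SehEvent::EndPrologue, nullptr, 0 } };
      emitSehDirectives(prologue, 4);
    }

The segmented-stack comment that closes the excerpt above heads an unrelated, pre-existing section; the diff only brushes past it.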
// @@ -371,7 +391,7 @@ let Defs = [RCX,RDI], isCodeGenOnly = 1 in { def REP_STOSD_64 : I<0xAB, RawFrm, (outs), (ins), "{rep;stosl|rep stosd}", [(X86rep_stos i32)], IIC_REP_STOS>, REP, OpSize32, Requires<[In64BitMode]>; - + let Uses = [RAX,RCX,RDI] in def REP_STOSQ_64 : RI<0xAB, RawFrm, (outs), (ins), "{rep;stosq|rep stosq}", [(X86rep_stos i64)], IIC_REP_STOS>, REP, @@ -502,83 +522,6 @@ def CMOV_RFP80 : I<0, Pseudo, //===----------------------------------------------------------------------===// -// Atomic Instruction Pseudo Instructions -//===----------------------------------------------------------------------===// - -// Pseudo atomic instructions - -multiclass PSEUDO_ATOMIC_LOAD_BINOP<string mnemonic> { - let usesCustomInserter = 1, mayLoad = 1, mayStore = 1 in { - let Defs = [EFLAGS, AL] in - def NAME#8 : I<0, Pseudo, (outs GR8:$dst), - (ins i8mem:$ptr, GR8:$val), - !strconcat(mnemonic, "8 PSEUDO!"), []>; - let Defs = [EFLAGS, AX] in - def NAME#16 : I<0, Pseudo,(outs GR16:$dst), - (ins i16mem:$ptr, GR16:$val), - !strconcat(mnemonic, "16 PSEUDO!"), []>; - let Defs = [EFLAGS, EAX] in - def NAME#32 : I<0, Pseudo, (outs GR32:$dst), - (ins i32mem:$ptr, GR32:$val), - !strconcat(mnemonic, "32 PSEUDO!"), []>; - let Defs = [EFLAGS, RAX] in - def NAME#64 : I<0, Pseudo, (outs GR64:$dst), - (ins i64mem:$ptr, GR64:$val), - !strconcat(mnemonic, "64 PSEUDO!"), []>; - } -} - -multiclass PSEUDO_ATOMIC_LOAD_BINOP_PATS<string name, string frag> { - def : Pat<(!cast<PatFrag>(frag # "_8") addr:$ptr, GR8:$val), - (!cast<Instruction>(name # "8") addr:$ptr, GR8:$val)>; - def : Pat<(!cast<PatFrag>(frag # "_16") addr:$ptr, GR16:$val), - (!cast<Instruction>(name # "16") addr:$ptr, GR16:$val)>; - def : Pat<(!cast<PatFrag>(frag # "_32") addr:$ptr, GR32:$val), - (!cast<Instruction>(name # "32") addr:$ptr, GR32:$val)>; - def : Pat<(!cast<PatFrag>(frag # "_64") addr:$ptr, GR64:$val), - (!cast<Instruction>(name # "64") addr:$ptr, GR64:$val)>; -} - -// Atomic exchange, and, or, xor -defm ATOMAND : PSEUDO_ATOMIC_LOAD_BINOP<"#ATOMAND">; -defm ATOMOR : PSEUDO_ATOMIC_LOAD_BINOP<"#ATOMOR">; -defm ATOMXOR : PSEUDO_ATOMIC_LOAD_BINOP<"#ATOMXOR">; -defm ATOMNAND : PSEUDO_ATOMIC_LOAD_BINOP<"#ATOMNAND">; -defm ATOMMAX : PSEUDO_ATOMIC_LOAD_BINOP<"#ATOMMAX">; -defm ATOMMIN : PSEUDO_ATOMIC_LOAD_BINOP<"#ATOMMIN">; -defm ATOMUMAX : PSEUDO_ATOMIC_LOAD_BINOP<"#ATOMUMAX">; -defm ATOMUMIN : PSEUDO_ATOMIC_LOAD_BINOP<"#ATOMUMIN">; - -defm : PSEUDO_ATOMIC_LOAD_BINOP_PATS<"ATOMAND", "atomic_load_and">; -defm : PSEUDO_ATOMIC_LOAD_BINOP_PATS<"ATOMOR", "atomic_load_or">; -defm : PSEUDO_ATOMIC_LOAD_BINOP_PATS<"ATOMXOR", "atomic_load_xor">; -defm : PSEUDO_ATOMIC_LOAD_BINOP_PATS<"ATOMNAND", "atomic_load_nand">; -defm : PSEUDO_ATOMIC_LOAD_BINOP_PATS<"ATOMMAX", "atomic_load_max">; -defm : PSEUDO_ATOMIC_LOAD_BINOP_PATS<"ATOMMIN", "atomic_load_min">; -defm : PSEUDO_ATOMIC_LOAD_BINOP_PATS<"ATOMUMAX", "atomic_load_umax">; -defm : PSEUDO_ATOMIC_LOAD_BINOP_PATS<"ATOMUMIN", "atomic_load_umin">; - -multiclass PSEUDO_ATOMIC_LOAD_BINOP6432<string mnemonic> { - let usesCustomInserter = 1, Defs = [EFLAGS, EAX, EDX], - mayLoad = 1, mayStore = 1, hasSideEffects = 0 in - def NAME#6432 : I<0, Pseudo, (outs GR32:$dst1, GR32:$dst2), - (ins i64mem:$ptr, GR32:$val1, GR32:$val2), - !strconcat(mnemonic, "6432 PSEUDO!"), []>; -} - -defm ATOMAND : PSEUDO_ATOMIC_LOAD_BINOP6432<"#ATOMAND">; -defm ATOMOR : PSEUDO_ATOMIC_LOAD_BINOP6432<"#ATOMOR">; -defm ATOMXOR : PSEUDO_ATOMIC_LOAD_BINOP6432<"#ATOMXOR">; -defm ATOMNAND : PSEUDO_ATOMIC_LOAD_BINOP6432<"#ATOMNAND">; -defm ATOMADD 
: PSEUDO_ATOMIC_LOAD_BINOP6432<"#ATOMADD">; -defm ATOMSUB : PSEUDO_ATOMIC_LOAD_BINOP6432<"#ATOMSUB">; -defm ATOMMAX : PSEUDO_ATOMIC_LOAD_BINOP6432<"#ATOMMAX">; -defm ATOMMIN : PSEUDO_ATOMIC_LOAD_BINOP6432<"#ATOMMIN">; -defm ATOMUMAX : PSEUDO_ATOMIC_LOAD_BINOP6432<"#ATOMUMAX">; -defm ATOMUMIN : PSEUDO_ATOMIC_LOAD_BINOP6432<"#ATOMUMIN">; -defm ATOMSWAP : PSEUDO_ATOMIC_LOAD_BINOP6432<"#ATOMSWAP">; - -//===----------------------------------------------------------------------===// // Normal-Instructions-With-Lock-Prefix Pseudo Instructions //===----------------------------------------------------------------------===// @@ -1696,20 +1639,34 @@ def : Pat<(mul (loadi64 addr:$src1), i64immSExt32:$src2), (IMUL64rmi32 addr:$src1, i64immSExt32:$src2)>; // Increment reg. -def : Pat<(add GR8 :$src, 1), (INC8r GR8 :$src)>; -def : Pat<(add GR16:$src, 1), (INC16r GR16:$src)>, Requires<[Not64BitMode]>; -def : Pat<(add GR16:$src, 1), (INC64_16r GR16:$src)>, Requires<[In64BitMode]>; -def : Pat<(add GR32:$src, 1), (INC32r GR32:$src)>, Requires<[Not64BitMode]>; -def : Pat<(add GR32:$src, 1), (INC64_32r GR32:$src)>, Requires<[In64BitMode]>; -def : Pat<(add GR64:$src, 1), (INC64r GR64:$src)>; +// Do not make INC if it is slow +def : Pat<(add GR8:$src, 1), + (INC8r GR8:$src)>, Requires<[NotSlowIncDec]>; +def : Pat<(add GR16:$src, 1), + (INC16r GR16:$src)>, Requires<[NotSlowIncDec, Not64BitMode]>; +def : Pat<(add GR16:$src, 1), + (INC64_16r GR16:$src)>, Requires<[NotSlowIncDec, In64BitMode]>; +def : Pat<(add GR32:$src, 1), + (INC32r GR32:$src)>, Requires<[NotSlowIncDec, Not64BitMode]>; +def : Pat<(add GR32:$src, 1), + (INC64_32r GR32:$src)>, Requires<[NotSlowIncDec, In64BitMode]>; +def : Pat<(add GR64:$src, 1), + (INC64r GR64:$src)>, Requires<[NotSlowIncDec]>; // Decrement reg. -def : Pat<(add GR8 :$src, -1), (DEC8r GR8 :$src)>; -def : Pat<(add GR16:$src, -1), (DEC16r GR16:$src)>, Requires<[Not64BitMode]>; -def : Pat<(add GR16:$src, -1), (DEC64_16r GR16:$src)>, Requires<[In64BitMode]>; -def : Pat<(add GR32:$src, -1), (DEC32r GR32:$src)>, Requires<[Not64BitMode]>; -def : Pat<(add GR32:$src, -1), (DEC64_32r GR32:$src)>, Requires<[In64BitMode]>; -def : Pat<(add GR64:$src, -1), (DEC64r GR64:$src)>; +// Do not make DEC if it is slow +def : Pat<(add GR8:$src, -1), + (DEC8r GR8:$src)>, Requires<[NotSlowIncDec]>; +def : Pat<(add GR16:$src, -1), + (DEC16r GR16:$src)>, Requires<[NotSlowIncDec, Not64BitMode]>; +def : Pat<(add GR16:$src, -1), + (DEC64_16r GR16:$src)>, Requires<[NotSlowIncDec, In64BitMode]>; +def : Pat<(add GR32:$src, -1), + (DEC32r GR32:$src)>, Requires<[NotSlowIncDec, Not64BitMode]>; +def : Pat<(add GR32:$src, -1), + (DEC64_32r GR32:$src)>, Requires<[NotSlowIncDec, In64BitMode]>; +def : Pat<(add GR64:$src, -1), + (DEC64r GR64:$src)>, Requires<[NotSlowIncDec]>; // or reg/reg. 
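Two things happen in the hunks above. First, the whole family of custom-inserter atomic pseudos (ATOMAND, ATOMOR, ... and their 6432 variants) is deleted; their lowering presumably moves to the IR-level atomic expansion introduced elsewhere in this patch, so the pseudo machinery is no longer needed. Second, the INC/DEC selection patterns grow a NotSlowIncDec guard: INC and DEC update all arithmetic flags except CF, so on microarchitectures that rename EFLAGS as a single register the partial update forces a merge with the previous flags value, and such targets are better served by ADD/SUB with immediate 1, which defines every flag. A scalar model of that difference, with invented helper names:

    #include <cstdint>

    struct Flags { bool cf, zf, sf, of; };

    // INC-style increment: CF keeps whatever the previous instruction left
    // there, i.e. a partial flag update that must be merged.
    static uint32_t inc32(uint32_t x, Flags &f) {
      uint32_t r = x + 1;
      f.zf = (r == 0);
      f.sf = (int32_t)r < 0;
      f.of = (x == 0x7fffffffu);
      return r; // f.cf deliberately untouched
    }

    // ADD r,1: defines all flags, including CF, so no merge is needed.
    static uint32_t add32One(uint32_t x, Flags &f) {
      uint32_t r = x + 1;
      f.cf = (r == 0); // carry out of +1 happens only when wrapping from ~0
      f.zf = (r == 0);
      f.sf = (int32_t)r < 0;
      f.of = (x == 0x7fffffffu);
      return r;
    }

    int main() {
      Flags f{true, false, false, false};
      uint32_t a = inc32(0xffffffffu, f);    // wraps to 0; f.cf is stale
      uint32_t b = add32One(0xffffffffu, f); // wraps to 0; f.cf now defined
      return (a == 0 && b == 0 && f.cf) ? 0 : 1;
    }

Targets without the slowIncDec trait keep the shorter INC/DEC encodings. (The "or reg/reg." comment resuming below starts an unrelated block of OR selection patterns.)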
def : Pat<(or GR8 :$src1, GR8 :$src2), (OR8rr GR8 :$src1, GR8 :$src2)>; diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td index 1582f43..6f0fa94 100644 --- a/lib/Target/X86/X86InstrFragmentsSIMD.td +++ b/lib/Target/X86/X86InstrFragmentsSIMD.td @@ -224,6 +224,10 @@ def X86Movhlps : SDNode<"X86ISD::MOVHLPS", SDTShuff2Op>; def X86Movlps : SDNode<"X86ISD::MOVLPS", SDTShuff2Op>; def X86Movlpd : SDNode<"X86ISD::MOVLPD", SDTShuff2Op>; +def SDTPack : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, SDTCisSameAs<2, 1>]>; +def X86Packss : SDNode<"X86ISD::PACKSS", SDTPack>; +def X86Packus : SDNode<"X86ISD::PACKUS", SDTPack>; + def X86Unpckl : SDNode<"X86ISD::UNPCKL", SDTShuff2Op>; def X86Unpckh : SDNode<"X86ISD::UNPCKH", SDTShuff2Op>; diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp index 6993577..0d3afc4 100644 --- a/lib/Target/X86/X86InstrInfo.cpp +++ b/lib/Target/X86/X86InstrInfo.cpp @@ -28,6 +28,7 @@ #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/LLVMContext.h" #include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInst.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" @@ -97,14 +98,11 @@ struct X86OpTblEntry { // Pin the vtable to this file. void X86InstrInfo::anchor() {} -X86InstrInfo::X86InstrInfo(X86TargetMachine &tm) - : X86GenInstrInfo((tm.getSubtarget<X86Subtarget>().is64Bit() - ? X86::ADJCALLSTACKDOWN64 - : X86::ADJCALLSTACKDOWN32), - (tm.getSubtarget<X86Subtarget>().is64Bit() - ? X86::ADJCALLSTACKUP64 - : X86::ADJCALLSTACKUP32)), - TM(tm), RI(tm) { +X86InstrInfo::X86InstrInfo(X86Subtarget &STI) + : X86GenInstrInfo( + (STI.is64Bit() ? X86::ADJCALLSTACKDOWN64 : X86::ADJCALLSTACKDOWN32), + (STI.is64Bit() ? X86::ADJCALLSTACKUP64 : X86::ADJCALLSTACKUP32)), + Subtarget(STI), RI(STI) { static const X86OpTblEntry OpTbl2Addr[] = { { X86::ADC32ri, X86::ADC32mi, 0 }, @@ -1472,7 +1470,7 @@ X86InstrInfo::isCoalescableExtInstr(const MachineInstr &MI, case X86::MOVSX32rr8: case X86::MOVZX32rr8: case X86::MOVSX64rr8: - if (!TM.getSubtarget<X86Subtarget>().is64Bit()) + if (!Subtarget.is64Bit()) // It's not always legal to reference the low 8-bit of the larger // register in 32-bit mode. return false; @@ -1950,7 +1948,7 @@ X86InstrInfo::convertToThreeAddressWithLEA(unsigned MIOpc, MachineRegisterInfo &RegInfo = MFI->getParent()->getRegInfo(); unsigned leaOutReg = RegInfo.createVirtualRegister(&X86::GR32RegClass); unsigned Opc, leaInReg; - if (TM.getSubtarget<X86Subtarget>().is64Bit()) { + if (Subtarget.is64Bit()) { Opc = X86::LEA64_32r; leaInReg = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass); } else { @@ -2006,7 +2004,7 @@ X86InstrInfo::convertToThreeAddressWithLEA(unsigned MIOpc, // just a single insert_subreg. addRegReg(MIB, leaInReg, true, leaInReg, false); } else { - if (TM.getSubtarget<X86Subtarget>().is64Bit()) + if (Subtarget.is64Bit()) leaInReg2 = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass); else leaInReg2 = RegInfo.createVirtualRegister(&X86::GR32_NOSPRegClass); @@ -2076,13 +2074,13 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, // we have better subtarget support, enable the 16-bit LEA generation here. // 16-bit LEA is also slow on Core2. 
bool DisableLEA16 = true; - bool is64Bit = TM.getSubtarget<X86Subtarget>().is64Bit(); + bool is64Bit = Subtarget.is64Bit(); unsigned MIOpc = MI->getOpcode(); switch (MIOpc) { case X86::SHUFPSrri: { assert(MI->getNumOperands() == 4 && "Unknown shufps instruction!"); - if (!TM.getSubtarget<X86Subtarget>().hasSSE2()) return nullptr; + if (!Subtarget.hasSSE2()) return nullptr; unsigned B = MI->getOperand(1).getReg(); unsigned C = MI->getOperand(2).getReg(); @@ -2094,7 +2092,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, } case X86::SHUFPDrri: { assert(MI->getNumOperands() == 4 && "Unknown shufpd instruction!"); - if (!TM.getSubtarget<X86Subtarget>().hasSSE2()) return nullptr; + if (!Subtarget.hasSSE2()) return nullptr; unsigned B = MI->getOperand(1).getReg(); unsigned C = MI->getOperand(2).getReg(); @@ -2672,8 +2670,7 @@ static X86::CondCode getSwappedCondition(X86::CondCode CC) { /// getSETFromCond - Return a set opcode for the given condition and /// whether it has memory operand. -static unsigned getSETFromCond(X86::CondCode CC, - bool HasMemoryOperand) { +unsigned X86::getSETFromCond(CondCode CC, bool HasMemoryOperand) { static const uint16_t Opc[16][2] = { { X86::SETAr, X86::SETAm }, { X86::SETAEr, X86::SETAEm }, @@ -2693,14 +2690,14 @@ static unsigned getSETFromCond(X86::CondCode CC, { X86::SETSr, X86::SETSm } }; - assert(CC < 16 && "Can only handle standard cond codes"); + assert(CC <= LAST_VALID_COND && "Can only handle standard cond codes"); return Opc[CC][HasMemoryOperand ? 1 : 0]; } /// getCMovFromCond - Return a cmov opcode for the given condition, /// register size in bytes, and operand type. -static unsigned getCMovFromCond(X86::CondCode CC, unsigned RegBytes, - bool HasMemoryOperand) { +unsigned X86::getCMovFromCond(CondCode CC, unsigned RegBytes, + bool HasMemoryOperand) { static const uint16_t Opc[32][3] = { { X86::CMOVA16rr, X86::CMOVA32rr, X86::CMOVA64rr }, { X86::CMOVAE16rr, X86::CMOVAE32rr, X86::CMOVAE64rr }, @@ -2976,7 +2973,7 @@ canInsertSelect(const MachineBasicBlock &MBB, unsigned TrueReg, unsigned FalseReg, int &CondCycles, int &TrueCycles, int &FalseCycles) const { // Not all subtargets have cmov instructions. - if (!TM.getSubtarget<X86Subtarget>().hasCMov()) + if (!Subtarget.hasCMov()) return false; if (Cond.size() != 1) return false; @@ -3027,8 +3024,7 @@ static bool isHReg(unsigned Reg) { // Try and copy between VR128/VR64 and GR64 registers. static unsigned CopyToFromAsymmetricReg(unsigned DestReg, unsigned SrcReg, - const X86Subtarget& Subtarget) { - + const X86Subtarget &Subtarget) { // SrcReg(VR128) -> DestReg(GR64) // SrcReg(VR64) -> DestReg(GR64) @@ -3107,8 +3103,8 @@ void X86InstrInfo::copyPhysReg(MachineBasicBlock &MBB, unsigned DestReg, unsigned SrcReg, bool KillSrc) const { // First deal with the normal symmetric copies. - bool HasAVX = TM.getSubtarget<X86Subtarget>().hasAVX(); - bool HasAVX512 = TM.getSubtarget<X86Subtarget>().hasAVX512(); + bool HasAVX = Subtarget.hasAVX(); + bool HasAVX512 = Subtarget.hasAVX512(); unsigned Opc = 0; if (X86::GR64RegClass.contains(DestReg, SrcReg)) Opc = X86::MOV64rr; @@ -3120,7 +3116,7 @@ void X86InstrInfo::copyPhysReg(MachineBasicBlock &MBB, // Copying to or from a physical H register on x86-64 requires a NOREX // move. Otherwise use a normal move. if ((isHReg(DestReg) || isHReg(SrcReg)) && - TM.getSubtarget<X86Subtarget>().is64Bit()) { + Subtarget.is64Bit()) { Opc = X86::MOV8rr_NOREX; // Both operands must be encodable without an REX prefix. 
assert(X86::GR8_NOREXRegClass.contains(SrcReg, DestReg) && @@ -3137,7 +3133,7 @@ void X86InstrInfo::copyPhysReg(MachineBasicBlock &MBB, else if (X86::VR256RegClass.contains(DestReg, SrcReg)) Opc = X86::VMOVAPSYrr; if (!Opc) - Opc = CopyToFromAsymmetricReg(DestReg, SrcReg, TM.getSubtarget<X86Subtarget>()); + Opc = CopyToFromAsymmetricReg(DestReg, SrcReg, Subtarget); if (Opc) { BuildMI(MBB, MI, DL, get(Opc), DestReg) @@ -3183,9 +3179,9 @@ void X86InstrInfo::copyPhysReg(MachineBasicBlock &MBB, static unsigned getLoadStoreRegOpcode(unsigned Reg, const TargetRegisterClass *RC, bool isStackAligned, - const TargetMachine &TM, + const X86Subtarget &STI, bool load) { - if (TM.getSubtarget<X86Subtarget>().hasAVX512()) { + if (STI.hasAVX512()) { if (X86::VK8RegClass.hasSubClassEq(RC) || X86::VK16RegClass.hasSubClassEq(RC)) return load ? X86::KMOVWkm : X86::KMOVWmk; @@ -3197,13 +3193,13 @@ static unsigned getLoadStoreRegOpcode(unsigned Reg, return load ? X86::VMOVUPSZrm : X86::VMOVUPSZmr; } - bool HasAVX = TM.getSubtarget<X86Subtarget>().hasAVX(); + bool HasAVX = STI.hasAVX(); switch (RC->getSize()) { default: llvm_unreachable("Unknown spill size"); case 1: assert(X86::GR8RegClass.hasSubClassEq(RC) && "Unknown 1-byte regclass"); - if (TM.getSubtarget<X86Subtarget>().is64Bit()) + if (STI.is64Bit()) // Copying to or from a physical H register on x86-64 requires a NOREX // move. Otherwise use a normal move. if (isHReg(Reg) || X86::GR8_ABCD_HRegClass.hasSubClassEq(RC)) @@ -3270,16 +3266,16 @@ static unsigned getLoadStoreRegOpcode(unsigned Reg, static unsigned getStoreRegOpcode(unsigned SrcReg, const TargetRegisterClass *RC, bool isStackAligned, - TargetMachine &TM) { - return getLoadStoreRegOpcode(SrcReg, RC, isStackAligned, TM, false); + const X86Subtarget &STI) { + return getLoadStoreRegOpcode(SrcReg, RC, isStackAligned, STI, false); } static unsigned getLoadRegOpcode(unsigned DestReg, const TargetRegisterClass *RC, bool isStackAligned, - const TargetMachine &TM) { - return getLoadStoreRegOpcode(DestReg, RC, isStackAligned, TM, true); + const X86Subtarget &STI) { + return getLoadStoreRegOpcode(DestReg, RC, isStackAligned, STI, true); } void X86InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, @@ -3291,9 +3287,10 @@ void X86InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, assert(MF.getFrameInfo()->getObjectSize(FrameIdx) >= RC->getSize() && "Stack slot too small for store"); unsigned Alignment = std::max<uint32_t>(RC->getSize(), 16); - bool isAligned = (TM.getFrameLowering()->getStackAlignment() >= Alignment) || - RI.canRealignStack(MF); - unsigned Opc = getStoreRegOpcode(SrcReg, RC, isAligned, TM); + bool isAligned = + (MF.getTarget().getFrameLowering()->getStackAlignment() >= Alignment) || + RI.canRealignStack(MF); + unsigned Opc = getStoreRegOpcode(SrcReg, RC, isAligned, Subtarget); DebugLoc DL = MBB.findDebugLoc(MI); addFrameReference(BuildMI(MBB, MI, DL, get(Opc)), FrameIdx) .addReg(SrcReg, getKillRegState(isKill)); @@ -3309,7 +3306,7 @@ void X86InstrInfo::storeRegToAddr(MachineFunction &MF, unsigned SrcReg, unsigned Alignment = std::max<uint32_t>(RC->getSize(), 16); bool isAligned = MMOBegin != MMOEnd && (*MMOBegin)->getAlignment() >= Alignment; - unsigned Opc = getStoreRegOpcode(SrcReg, RC, isAligned, TM); + unsigned Opc = getStoreRegOpcode(SrcReg, RC, isAligned, Subtarget); DebugLoc DL; MachineInstrBuilder MIB = BuildMI(MF, DL, get(Opc)); for (unsigned i = 0, e = Addr.size(); i != e; ++i) @@ -3327,9 +3324,10 @@ void X86InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, const 
TargetRegisterInfo *TRI) const { const MachineFunction &MF = *MBB.getParent(); unsigned Alignment = std::max<uint32_t>(RC->getSize(), 16); - bool isAligned = (TM.getFrameLowering()->getStackAlignment() >= Alignment) || - RI.canRealignStack(MF); - unsigned Opc = getLoadRegOpcode(DestReg, RC, isAligned, TM); + bool isAligned = + (MF.getTarget().getFrameLowering()->getStackAlignment() >= Alignment) || + RI.canRealignStack(MF); + unsigned Opc = getLoadRegOpcode(DestReg, RC, isAligned, Subtarget); DebugLoc DL = MBB.findDebugLoc(MI); addFrameReference(BuildMI(MBB, MI, DL, get(Opc), DestReg), FrameIdx); } @@ -3343,7 +3341,7 @@ void X86InstrInfo::loadRegFromAddr(MachineFunction &MF, unsigned DestReg, unsigned Alignment = std::max<uint32_t>(RC->getSize(), 16); bool isAligned = MMOBegin != MMOEnd && (*MMOBegin)->getAlignment() >= Alignment; - unsigned Opc = getLoadRegOpcode(DestReg, RC, isAligned, TM); + unsigned Opc = getLoadRegOpcode(DestReg, RC, isAligned, Subtarget); DebugLoc DL; MachineInstrBuilder MIB = BuildMI(MF, DL, get(Opc), DestReg); for (unsigned i = 0, e = Addr.size(); i != e; ++i) @@ -3741,7 +3739,7 @@ optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2, continue; // EFLAGS is used by this instruction. - X86::CondCode OldCC; + X86::CondCode OldCC = X86::COND_INVALID; bool OpcIsSET = false; if (IsCmpZero || IsSwapped) { // We decode the condition code from opcode. @@ -3964,7 +3962,7 @@ static bool Expand2AddrUndef(MachineInstrBuilder &MIB, } bool X86InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { - bool HasAVX = TM.getSubtarget<X86Subtarget>().hasAVX(); + bool HasAVX = Subtarget.hasAVX(); MachineInstrBuilder MIB(*MI->getParent()->getParent(), MI); switch (MI->getOpcode()) { case X86::MOV32r0: @@ -4075,7 +4073,7 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, unsigned Size, unsigned Align) const { const DenseMap<unsigned, std::pair<unsigned,unsigned> > *OpcodeTablePtr = nullptr; - bool isCallRegIndirect = TM.getSubtarget<X86Subtarget>().callRegIndirect(); + bool isCallRegIndirect = Subtarget.callRegIndirect(); bool isTwoAddrFold = false; // Atom favors register form of call. So, we do not fold loads into calls @@ -4316,7 +4314,7 @@ breakPartialRegDependency(MachineBasicBlock::iterator MI, unsigned OpNum, if (X86::VR128RegClass.contains(Reg)) { // These instructions are all floating point domain, so xorps is the best // choice. - bool HasAVX = TM.getSubtarget<X86Subtarget>().hasAVX(); + bool HasAVX = Subtarget.hasAVX(); unsigned Opc = HasAVX ? X86::VXORPSrr : X86::XORPSrr; BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), get(Opc), Reg) .addReg(Reg, RegState::Undef).addReg(Reg, RegState::Undef); @@ -4352,7 +4350,8 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI, // If the function stack isn't realigned we don't want to fold instructions // that need increased alignment. if (!RI.needsStackRealignment(MF)) - Alignment = std::min(Alignment, TM.getFrameLowering()->getStackAlignment()); + Alignment = std::min( + Alignment, MF.getTarget().getFrameLowering()->getStackAlignment()); if (Ops.size() == 2 && Ops[0] == 0 && Ops[1] == 1) { unsigned NewOpc = 0; unsigned RCSize = 0; @@ -4453,14 +4452,14 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, // Create a constant-pool entry and operands to load from it. // Medium and large mode can't fold loads this way. 
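Context for the spill and reload hunks above: getLoadStoreRegOpcode and its callers now take the X86Subtarget directly instead of digging it out of the TargetMachine, but the alignment policy is unchanged. A vector stack slot counts as aligned when the target's stack alignment already covers the spill size or the frame can be realigned, and only then is an aligned vector move safe. A condensed sketch of that decision for a 128-bit spill (simplified; the real code also dispatches on register class, AVX and AVX-512):

    #include <algorithm>
    #include <cstdint>

    enum class Opcode { MOVAPSmr, MOVUPSmr }; // aligned vs. unaligned 128-bit store

    // Hypothetical condensation of getStoreRegOpcode for a VR128 spill slot.
    static Opcode pickSpillStore(uint32_t spillSize, uint32_t stackAlign,
                                 bool canRealignStack) {
      uint32_t required = std::max<uint32_t>(spillSize, 16);
      bool isAligned = stackAlign >= required || canRealignStack;
      return isAligned ? Opcode::MOVAPSmr : Opcode::MOVUPSmr;
    }

    int main() {
      return pickSpillStore(16, 16, false) == Opcode::MOVAPSmr ? 0 : 1;
    }

The code-model comment the excerpt stops on belongs to the next hunk, which guards constant-pool load folding.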
- if (TM.getCodeModel() != CodeModel::Small && - TM.getCodeModel() != CodeModel::Kernel) + if (MF.getTarget().getCodeModel() != CodeModel::Small && + MF.getTarget().getCodeModel() != CodeModel::Kernel) return nullptr; // x86-32 PIC requires a PIC base register for constant pools. unsigned PICBase = 0; - if (TM.getRelocationModel() == Reloc::PIC_) { - if (TM.getSubtarget<X86Subtarget>().is64Bit()) + if (MF.getTarget().getRelocationModel() == Reloc::PIC_) { + if (Subtarget.is64Bit()) PICBase = X86::RIP; else // FIXME: PICBase = getGlobalBaseReg(&MF); @@ -4600,7 +4599,7 @@ bool X86InstrInfo::unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI, const TargetRegisterClass *RC = getRegClass(MCID, Index, &RI, MF); if (!MI->hasOneMemOperand() && RC == &X86::VR128RegClass && - !TM.getSubtarget<X86Subtarget>().isUnalignedMemAccessFast()) + !Subtarget.isUnalignedMemAccessFast()) // Without memoperands, loadRegFromAddr and storeRegToStackSlot will // conservatively assume the address is unaligned. That's bad for // performance. @@ -4748,13 +4747,13 @@ X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N, cast<MachineSDNode>(N)->memoperands_end()); if (!(*MMOs.first) && RC == &X86::VR128RegClass && - !TM.getSubtarget<X86Subtarget>().isUnalignedMemAccessFast()) + !Subtarget.isUnalignedMemAccessFast()) // Do not introduce a slow unaligned load. return false; unsigned Alignment = RC->getSize() == 32 ? 32 : 16; bool isAligned = (*MMOs.first) && (*MMOs.first)->getAlignment() >= Alignment; - Load = DAG.getMachineNode(getLoadRegOpcode(0, RC, isAligned, TM), dl, + Load = DAG.getMachineNode(getLoadRegOpcode(0, RC, isAligned, Subtarget), dl, VT, MVT::Other, AddrOps); NewNodes.push_back(Load); @@ -4791,15 +4790,15 @@ X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N, cast<MachineSDNode>(N)->memoperands_end()); if (!(*MMOs.first) && RC == &X86::VR128RegClass && - !TM.getSubtarget<X86Subtarget>().isUnalignedMemAccessFast()) + !Subtarget.isUnalignedMemAccessFast()) // Do not introduce a slow unaligned store. return false; unsigned Alignment = RC->getSize() == 32 ? 32 : 16; bool isAligned = (*MMOs.first) && (*MMOs.first)->getAlignment() >= Alignment; - SDNode *Store = DAG.getMachineNode(getStoreRegOpcode(0, DstRC, - isAligned, TM), - dl, MVT::Other, AddrOps); + SDNode *Store = + DAG.getMachineNode(getStoreRegOpcode(0, DstRC, isAligned, Subtarget), + dl, MVT::Other, AddrOps); NewNodes.push_back(Store); // Preserve memory reference information. @@ -4960,7 +4959,7 @@ bool X86InstrInfo::shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2, default: // XMM registers. In 64-bit mode we can be a bit more aggressive since we // have 16 of them to play with. - if (TM.getSubtargetImpl()->is64Bit()) { + if (Subtarget.is64Bit()) { if (NumLoads >= 3) return false; } else if (NumLoads) { @@ -4986,7 +4985,7 @@ bool X86InstrInfo::shouldScheduleAdjacent(MachineInstr* First, // Check if this processor supports macro-fusion. Since this is a minor // heuristic, we haven't specifically reserved a feature. hasAVX is a decent // proxy for SandyBridge+. 
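A word on shouldScheduleAdjacent, touched above: on Sandy Bridge and later, a flag-producing TEST/CMP immediately followed by the conditional branch that consumes its flags is macro-fused into a single micro-op, so the scheduler tries to keep such pairs adjacent; the diff adds the TEST8ri_NOREX pseudo to the candidate list so it is not excluded merely for being a pseudo. A toy distillation of the check, with stand-in opcodes:

    enum Opc { TEST32rr, TEST8ri_NOREX, ADD32rr, JE_1, MOV32rr };

    static bool isFusableTest(Opc o) { return o == TEST32rr || o == TEST8ri_NOREX; }
    static bool isCondBranch(Opc o)  { return o == JE_1; }

    // Keep a TEST adjacent to the Jcc that reads its flags so the decoder can
    // fuse the pair; hasAVX stands in for "Sandy Bridge or newer", exactly as
    // the comment above admits.
    static bool shouldScheduleAdjacentModel(Opc first, Opc second, bool hasAVX) {
      return hasAVX && isFusableTest(first) && isCondBranch(second);
    }

    int main() {
      return shouldScheduleAdjacentModel(TEST8ri_NOREX, JE_1, true) ? 0 : 1;
    }

The "proxy for SandyBridge+" comment the diff resumes with below is precisely this gate, and the opcode switch that follows it is the candidate list being extended.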
- if (!TM.getSubtarget<X86Subtarget>().hasAVX()) + if (!Subtarget.hasAVX()) return false; enum { @@ -5038,6 +5037,7 @@ bool X86InstrInfo::shouldScheduleAdjacent(MachineInstr* First, case X86::TEST16rm: case X86::TEST32rm: case X86::TEST64rm: + case X86::TEST8ri_NOREX: case X86::AND16i16: case X86::AND16ri: case X86::AND16ri8: @@ -5168,7 +5168,7 @@ isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const { /// TODO: Eliminate this and move the code to X86MachineFunctionInfo. /// unsigned X86InstrInfo::getGlobalBaseReg(MachineFunction *MF) const { - assert(!TM.getSubtarget<X86Subtarget>().is64Bit() && + assert(!Subtarget.is64Bit() && "X86-64 PIC uses RIP relative addressing"); X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>(); @@ -5271,7 +5271,7 @@ static const uint16_t *lookupAVX2(unsigned opcode, unsigned domain) { std::pair<uint16_t, uint16_t> X86InstrInfo::getExecutionDomain(const MachineInstr *MI) const { uint16_t domain = (MI->getDesc().TSFlags >> X86II::SSEDomainShift) & 3; - bool hasAVX2 = TM.getSubtarget<X86Subtarget>().hasAVX2(); + bool hasAVX2 = Subtarget.hasAVX2(); uint16_t validDomains = 0; if (domain && lookup(MI->getOpcode(), domain)) validDomains = 0xe; @@ -5286,7 +5286,7 @@ void X86InstrInfo::setExecutionDomain(MachineInstr *MI, unsigned Domain) const { assert(dom && "Not an SSE instruction"); const uint16_t *table = lookup(MI->getOpcode(), dom); if (!table) { // try the other table - assert((TM.getSubtarget<X86Subtarget>().hasAVX2() || Domain < 3) && + assert((Subtarget.hasAVX2() || Domain < 3) && "256-bit vector operations only available in AVX2"); table = lookupAVX2(MI->getOpcode(), dom); } @@ -5299,6 +5299,16 @@ void X86InstrInfo::getNoopForMachoTarget(MCInst &NopInst) const { NopInst.setOpcode(X86::NOOP); } +void X86InstrInfo::getUnconditionalBranch( + MCInst &Branch, const MCSymbolRefExpr *BranchTarget) const { + Branch.setOpcode(X86::JMP_4); + Branch.addOperand(MCOperand::CreateExpr(BranchTarget)); +} + +void X86InstrInfo::getTrap(MCInst &MI) const { + MI.setOpcode(X86::TRAP); +} + bool X86InstrInfo::isHighLatencyDef(int opc) const { switch (opc) { default: return false; diff --git a/lib/Target/X86/X86InstrInfo.h b/lib/Target/X86/X86InstrInfo.h index 5f34915..c177e3a 100644 --- a/lib/Target/X86/X86InstrInfo.h +++ b/lib/Target/X86/X86InstrInfo.h @@ -24,7 +24,7 @@ namespace llvm { class X86RegisterInfo; - class X86TargetMachine; + class X86Subtarget; namespace X86 { // X86 specific condition code. These correspond to X86_*_COND in @@ -46,6 +46,7 @@ namespace X86 { COND_O = 13, COND_P = 14, COND_S = 15, + LAST_VALID_COND = COND_S, // Artificial condition codes. These are used by AnalyzeBranch // to indicate a block terminated with two conditional branches to @@ -61,12 +62,21 @@ namespace X86 { // Turn condition code into conditional branch opcode. unsigned GetCondBranchFromCond(CondCode CC); + /// \brief Return a set opcode for the given condition and whether it has + /// a memory operand. + unsigned getSETFromCond(CondCode CC, bool HasMemoryOperand = false); + + /// \brief Return a cmov opcode for the given condition, register size in + /// bytes, and operand type. + unsigned getCMovFromCond(CondCode CC, unsigned RegBytes, + bool HasMemoryOperand = false); + // Turn CMov opcode into condition code. CondCode getCondFromCMovOpc(unsigned Opc); /// GetOppositeBranchCondition - Return the inverse of the specified cond, /// e.g. turning COND_E to COND_NE. 
- CondCode GetOppositeBranchCondition(X86::CondCode CC); + CondCode GetOppositeBranchCondition(CondCode CC); } // end namespace X86; @@ -129,7 +139,7 @@ inline static bool isMem(const MachineInstr *MI, unsigned Op) { } class X86InstrInfo final : public X86GenInstrInfo { - X86TargetMachine &TM; + X86Subtarget &Subtarget; const X86RegisterInfo RI; /// RegOp2MemOpTable3Addr, RegOp2MemOpTable0, RegOp2MemOpTable1, @@ -156,7 +166,7 @@ class X86InstrInfo final : public X86GenInstrInfo { virtual void anchor(); public: - explicit X86InstrInfo(X86TargetMachine &tm); + explicit X86InstrInfo(X86Subtarget &STI); /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info. As /// such, whenever a client has an instance of instruction info, it should @@ -396,6 +406,12 @@ public: const SmallVectorImpl<MachineOperand> &MOs, unsigned Size, unsigned Alignment) const; + void + getUnconditionalBranch(MCInst &Branch, + const MCSymbolRefExpr *BranchTarget) const override; + + void getTrap(MCInst &MI) const override; + bool isHighLatencyDef(int opc) const override; bool hasHighOperandLatency(const InstrItineraryData *ItinData, diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td index 0d97669..e7b532c 100644 --- a/lib/Target/X86/X86InstrInfo.td +++ b/lib/Target/X86/X86InstrInfo.td @@ -155,27 +155,6 @@ def X86cas16 : SDNode<"X86ISD::LCMPXCHG16_DAG", SDTX86caspair, [SDNPHasChain, SDNPInGlue, SDNPOutGlue, SDNPMayStore, SDNPMayLoad, SDNPMemOperand]>; -def X86AtomAdd64 : SDNode<"X86ISD::ATOMADD64_DAG", SDTX86atomicBinary, - [SDNPHasChain, SDNPMayStore, - SDNPMayLoad, SDNPMemOperand]>; -def X86AtomSub64 : SDNode<"X86ISD::ATOMSUB64_DAG", SDTX86atomicBinary, - [SDNPHasChain, SDNPMayStore, - SDNPMayLoad, SDNPMemOperand]>; -def X86AtomOr64 : SDNode<"X86ISD::ATOMOR64_DAG", SDTX86atomicBinary, - [SDNPHasChain, SDNPMayStore, - SDNPMayLoad, SDNPMemOperand]>; -def X86AtomXor64 : SDNode<"X86ISD::ATOMXOR64_DAG", SDTX86atomicBinary, - [SDNPHasChain, SDNPMayStore, - SDNPMayLoad, SDNPMemOperand]>; -def X86AtomAnd64 : SDNode<"X86ISD::ATOMAND64_DAG", SDTX86atomicBinary, - [SDNPHasChain, SDNPMayStore, - SDNPMayLoad, SDNPMemOperand]>; -def X86AtomNand64 : SDNode<"X86ISD::ATOMNAND64_DAG", SDTX86atomicBinary, - [SDNPHasChain, SDNPMayStore, - SDNPMayLoad, SDNPMemOperand]>; -def X86AtomSwap64 : SDNode<"X86ISD::ATOMSWAP64_DAG", SDTX86atomicBinary, - [SDNPHasChain, SDNPMayStore, - SDNPMayLoad, SDNPMemOperand]>; def X86retflag : SDNode<"X86ISD::RET_FLAG", SDTX86Ret, [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>; @@ -208,6 +187,8 @@ def X86rdtsc : SDNode<"X86ISD::RDTSC_DAG", SDTX86Void, [SDNPHasChain, SDNPOutGlue, SDNPSideEffect]>; def X86rdtscp : SDNode<"X86ISD::RDTSCP_DAG", SDTX86Void, [SDNPHasChain, SDNPOutGlue, SDNPSideEffect]>; +def X86rdpmc : SDNode<"X86ISD::RDPMC_DAG", SDTX86Void, + [SDNPHasChain, SDNPOutGlue, SDNPSideEffect]>; def X86Wrapper : SDNode<"X86ISD::Wrapper", SDTX86Wrapper>; def X86WrapperRIP : SDNode<"X86ISD::WrapperRIP", SDTX86Wrapper>; @@ -795,6 +776,7 @@ def OptForSpeed : Predicate<"!OptForSize">; def FastBTMem : Predicate<"!Subtarget->isBTMemSlow()">; def CallImmAddr : Predicate<"Subtarget->IsLegalToCallImmediateAddr(TM)">; def FavorMemIndirectCall : Predicate<"!Subtarget->callRegIndirect()">; +def NotSlowIncDec : Predicate<"!Subtarget->slowIncDec()">; //===----------------------------------------------------------------------===// // X86 Instruction Format Definitions. 
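Before the diff moves on to X86InstrSSE.td, note the header-side pattern above: getSETFromCond and getCMovFromCond were promoted from file-static helpers into the X86 namespace, and the new LAST_VALID_COND enumerator replaces the magic bound 16 in their asserts, so the range check keeps tracking the enum if condition codes are ever added. The helpers are plain condition-indexed opcode tables; here is a self-contained miniature of the idiom (three rows and placeholder opcode numbers, not the real sixteen-row tables):

    #include <cassert>

    enum CondCode { COND_E = 0, COND_NE = 1, COND_L = 2,
                    LAST_VALID_COND = COND_L, COND_INVALID };

    // Miniature analogue of X86::getSETFromCond: row = condition code,
    // column = register vs. memory destination form.
    static unsigned getSetccOpcode(CondCode cc, bool hasMemOperand = false) {
      static const unsigned Opc[3][2] = {
        { 100, 101 }, // SETEr,  SETEm
        { 102, 103 }, // SETNEr, SETNEm
        { 104, 105 }, // SETLr,  SETLm
      };
      assert(cc <= LAST_VALID_COND && "Can only handle standard cond codes");
      return Opc[cc][hasMemOperand ? 1 : 0];
    }

    int main() {
      return getSetccOpcode(COND_NE, true) == 103 ? 0 : 1;
    }

The initialization of OldCC to COND_INVALID in optimizeCompareInstr, also above, looks like a defensive fix for a may-be-used-uninitialized warning rather than a behaviour change.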
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td index 1eb0485..f9a5ae1 100644 --- a/lib/Target/X86/X86InstrSSE.td +++ b/lib/Target/X86/X86InstrSSE.td @@ -4337,20 +4337,6 @@ defm PCMPGTD : PDI_binop_all<0x66, "pcmpgtd", X86pcmpgt, v4i32, v8i32, SSE_INTALU_ITINS_P, 0>; //===---------------------------------------------------------------------===// -// SSE2 - Packed Integer Pack Instructions -//===---------------------------------------------------------------------===// - -defm PACKSSWB : PDI_binop_all_int<0x63, "packsswb", int_x86_sse2_packsswb_128, - int_x86_avx2_packsswb, - SSE_INTALU_ITINS_SHUFF_P, 0>; -defm PACKSSDW : PDI_binop_all_int<0x6B, "packssdw", int_x86_sse2_packssdw_128, - int_x86_avx2_packssdw, - SSE_INTALU_ITINS_SHUFF_P, 0>; -defm PACKUSWB : PDI_binop_all_int<0x67, "packuswb", int_x86_sse2_packuswb_128, - int_x86_avx2_packuswb, - SSE_INTALU_ITINS_SHUFF_P, 0>; - -//===---------------------------------------------------------------------===// // SSE2 - Packed Integer Shuffle Instructions //===---------------------------------------------------------------------===// @@ -4432,6 +4418,136 @@ let Predicates = [UseSSE2] in { } //===---------------------------------------------------------------------===// +// Packed Integer Pack Instructions (SSE & AVX) +//===---------------------------------------------------------------------===// + +let ExeDomain = SSEPackedInt in { +multiclass sse2_pack<bits<8> opc, string OpcodeStr, ValueType OutVT, + ValueType ArgVT, SDNode OpNode, PatFrag bc_frag, + bit Is2Addr = 1> { + def rr : PDI<opc, MRMSrcReg, + (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set VR128:$dst, + (OutVT (OpNode (ArgVT VR128:$src1), VR128:$src2)))]>, + Sched<[WriteShuffle]>; + def rm : PDI<opc, MRMSrcMem, + (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set VR128:$dst, + (OutVT (OpNode VR128:$src1, + (bc_frag (memopv2i64 addr:$src2)))))]>, + Sched<[WriteShuffleLd, ReadAfterLd]>; +} + +multiclass sse2_pack_y<bits<8> opc, string OpcodeStr, ValueType OutVT, + ValueType ArgVT, SDNode OpNode, PatFrag bc_frag> { + def Yrr : PDI<opc, MRMSrcReg, + (outs VR256:$dst), (ins VR256:$src1, VR256:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR256:$dst, + (OutVT (OpNode (ArgVT VR256:$src1), VR256:$src2)))]>, + Sched<[WriteShuffle]>; + def Yrm : PDI<opc, MRMSrcMem, + (outs VR256:$dst), (ins VR256:$src1, i256mem:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR256:$dst, + (OutVT (OpNode VR256:$src1, + (bc_frag (memopv4i64 addr:$src2)))))]>, + Sched<[WriteShuffleLd, ReadAfterLd]>; +} + +multiclass sse4_pack<bits<8> opc, string OpcodeStr, ValueType OutVT, + ValueType ArgVT, SDNode OpNode, PatFrag bc_frag, + bit Is2Addr = 1> { + def rr : SS48I<opc, MRMSrcReg, + (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set VR128:$dst, + (OutVT (OpNode (ArgVT VR128:$src1), VR128:$src2)))]>, + Sched<[WriteShuffle]>; + def rm : SS48I<opc, MRMSrcMem, + (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, 
$dst|$dst, $src2}"), + !strconcat(OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set VR128:$dst, + (OutVT (OpNode VR128:$src1, + (bc_frag (memopv2i64 addr:$src2)))))]>, + Sched<[WriteShuffleLd, ReadAfterLd]>; +} + +multiclass sse4_pack_y<bits<8> opc, string OpcodeStr, ValueType OutVT, + ValueType ArgVT, SDNode OpNode, PatFrag bc_frag> { + def Yrr : SS48I<opc, MRMSrcReg, + (outs VR256:$dst), (ins VR256:$src1, VR256:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR256:$dst, + (OutVT (OpNode (ArgVT VR256:$src1), VR256:$src2)))]>, + Sched<[WriteShuffle]>; + def Yrm : SS48I<opc, MRMSrcMem, + (outs VR256:$dst), (ins VR256:$src1, i256mem:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR256:$dst, + (OutVT (OpNode VR256:$src1, + (bc_frag (memopv4i64 addr:$src2)))))]>, + Sched<[WriteShuffleLd, ReadAfterLd]>; +} + +let Predicates = [HasAVX] in { + defm VPACKSSWB : sse2_pack<0x63, "vpacksswb", v16i8, v8i16, X86Packss, + bc_v8i16, 0>, VEX_4V; + defm VPACKSSDW : sse2_pack<0x6B, "vpackssdw", v8i16, v4i32, X86Packss, + bc_v4i32, 0>, VEX_4V; + + defm VPACKUSWB : sse2_pack<0x67, "vpackuswb", v16i8, v8i16, X86Packus, + bc_v8i16, 0>, VEX_4V; + defm VPACKUSDW : sse4_pack<0x2B, "vpackusdw", v8i16, v4i32, X86Packus, + bc_v4i32, 0>, VEX_4V; +} + +let Predicates = [HasAVX2] in { + defm VPACKSSWB : sse2_pack_y<0x63, "vpacksswb", v32i8, v16i16, X86Packss, + bc_v16i16>, VEX_4V, VEX_L; + defm VPACKSSDW : sse2_pack_y<0x6B, "vpackssdw", v16i16, v8i32, X86Packss, + bc_v8i32>, VEX_4V, VEX_L; + + defm VPACKUSWB : sse2_pack_y<0x67, "vpackuswb", v32i8, v16i16, X86Packus, + bc_v16i16>, VEX_4V, VEX_L; + defm VPACKUSDW : sse4_pack_y<0x2B, "vpackusdw", v16i16, v8i32, X86Packus, + bc_v8i32>, VEX_4V, VEX_L; +} + +let Constraints = "$src1 = $dst" in { + defm PACKSSWB : sse2_pack<0x63, "packsswb", v16i8, v8i16, X86Packss, + bc_v8i16>; + defm PACKSSDW : sse2_pack<0x6B, "packssdw", v8i16, v4i32, X86Packss, + bc_v4i32>; + + defm PACKUSWB : sse2_pack<0x67, "packuswb", v16i8, v8i16, X86Packus, + bc_v8i16>; + + let Predicates = [HasSSE41] in + defm PACKUSDW : sse4_pack<0x2B, "packusdw", v8i16, v4i32, X86Packus, + bc_v4i32>; +} +} // ExeDomain = SSEPackedInt + +//===---------------------------------------------------------------------===// // SSE2 - Packed Integer Unpack Instructions //===---------------------------------------------------------------------===// @@ -5239,6 +5355,60 @@ let Constraints = "$src1 = $dst", Predicates = [UseSSE3] in { f128mem, SSE_ALU_F64P>, PD; } +// Patterns used to select 'addsub' instructions. +let Predicates = [HasAVX] in { + // Constant 170 corresponds to the binary mask '10101010'. + // When used as a blend mask, it allows selecting eight elements from two + // input vectors as follow: + // - Even-numbered values in the destination are copied from + // the corresponding elements in the first input vector; + // - Odd-numbered values in the destination are copied from + // the corresponding elements in the second input vector. + + def : Pat<(v8f32 (X86Blendi (v8f32 (fsub VR256:$lhs, VR256:$rhs)), + (v8f32 (fadd VR256:$lhs, VR256:$rhs)), (i32 170))), + (VADDSUBPSYrr VR256:$lhs, VR256:$rhs)>; + + // Constant 10 corresponds to the binary mask '1010'. + // In the two pattens below, constant 10 is used as a blend mask to select + // - the 1st and 3rd element from the first input vector (the 'fsub' node); + // - the 2nd and 4th element from the second input vector (the 'fadd' node). 
+ + def : Pat<(v4f64 (X86Blendi (v4f64 (fsub VR256:$lhs, VR256:$rhs)), + (v4f64 (fadd VR256:$lhs, VR256:$rhs)), (i32 10))), + (VADDSUBPDYrr VR256:$lhs, VR256:$rhs)>; + def : Pat<(v4f64 (X86Blendi (v4f64 (fsub VR256:$lhs, VR256:$rhs)), + (v4f64 (fadd VR256:$lhs, VR256:$rhs)), (i32 10))), + (VADDSUBPDYrr VR256:$lhs, VR256:$rhs)>; + def : Pat<(v4f32 (X86Blendi (v4f32 (fsub VR128:$lhs, VR128:$rhs)), + (v4f32 (fadd VR128:$lhs, VR128:$rhs)), (i32 10))), + (VADDSUBPSrr VR128:$lhs, VR128:$rhs)>; + def : Pat<(v2f64 (X86Blendi (v2f64 (fsub VR128:$lhs, VR128:$rhs)), + (v2f64 (fadd VR128:$lhs, VR128:$rhs)), (i32 2))), + (VADDSUBPDrr VR128:$lhs, VR128:$rhs)>; + def : Pat<(v2f64 (X86Movsd (v2f64 (fadd VR128:$lhs, VR128:$rhs)), + (v2f64 (fsub VR128:$lhs, VR128:$rhs)))), + (VADDSUBPDrr VR128:$lhs, VR128:$rhs)>; +} + +let Predicates = [UseSSE3] in { + // Constant 10 corresponds to the binary mask '1010'. + // In the pattern below, it is used as a blend mask to select: + // - the 1st and 3rd element from the first input vector (the fsub node); + // - the 2nd and 4th element from the second input vector (the fadd node). + + def : Pat<(v4f32 (X86Blendi (v4f32 (fsub VR128:$lhs, VR128:$rhs)), + (v4f32 (fadd VR128:$lhs, VR128:$rhs)), (i32 10))), + (ADDSUBPSrr VR128:$lhs, VR128:$rhs)>; + + def : Pat<(v2f64 (X86Blendi (v2f64 (fsub VR128:$lhs, VR128:$rhs)), + (v2f64 (fadd VR128:$lhs, VR128:$rhs)), (i32 2))), + (ADDSUBPDrr VR128:$lhs, VR128:$rhs)>; + def : Pat<(v2f64 (X86Movsd (v2f64 (fadd VR128:$lhs, VR128:$rhs)), + (v2f64 (fsub VR128:$lhs, VR128:$rhs)))), + (ADDSUBPDrr VR128:$lhs, VR128:$rhs)>; +} + //===---------------------------------------------------------------------===// // SSE3 Instructions //===---------------------------------------------------------------------===// @@ -7053,8 +7223,6 @@ multiclass SS48I_binop_rm2<bits<8> opc, string OpcodeStr, SDNode OpNode, let Predicates = [HasAVX] in { let isCommutable = 0 in - defm VPACKUSDW : SS41I_binop_rm_int<0x2B, "vpackusdw", int_x86_sse41_packusdw, - 0, DEFAULT_ITINS_SHUFFLESCHED>, VEX_4V; defm VPMINSB : SS48I_binop_rm<0x38, "vpminsb", X86smin, v16i8, VR128, loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>, VEX_4V; @@ -7086,9 +7254,6 @@ let Predicates = [HasAVX] in { let Predicates = [HasAVX2] in { let isCommutable = 0 in - defm VPACKUSDW : SS41I_binop_rm_int_y<0x2B, "vpackusdw", - int_x86_avx2_packusdw, WriteShuffle>, - VEX_4V, VEX_L; defm VPMINSBY : SS48I_binop_rm<0x38, "vpminsb", X86smin, v32i8, VR256, loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>, VEX_4V, VEX_L; @@ -7120,8 +7285,6 @@ let Predicates = [HasAVX2] in { let Constraints = "$src1 = $dst" in { let isCommutable = 0 in - defm PACKUSDW : SS41I_binop_rm_int<0x2B, "packusdw", int_x86_sse41_packusdw, - 1, DEFAULT_ITINS_SHUFFLESCHED>; defm PMINSB : SS48I_binop_rm<0x38, "pminsb", X86smin, v16i8, VR128, memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>; defm PMINSD : SS48I_binop_rm<0x39, "pminsd", X86smin, v4i32, VR128, @@ -7969,6 +8132,16 @@ class avx_broadcast<bits<8> opc, string OpcodeStr, RegisterClass RC, !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [(set RC:$dst, (Int addr:$src))]>, Sched<[Sched]>, VEX; +class avx_broadcast_no_int<bits<8> opc, string OpcodeStr, RegisterClass RC, + X86MemOperand x86memop, ValueType VT, + PatFrag ld_frag, SchedWrite Sched> : + AVX8I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set RC:$dst, (VT (X86VBroadcast (ld_frag addr:$src))))]>, + Sched<[Sched]>, VEX { + let mayLoad = 1; +} + // AVX2 adds register forms 
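Stepping back to the 'addsub' selection patterns above: blending (fsub a,b) against (fadd a,b) with mask 0b1010 (0b10101010 for the 256-bit v8f32 case) yields subtraction in the even lanes and addition in the odd lanes, which is exactly what ADDSUBPS/ADDSUBPD compute, so the whole blend collapses to one instruction. A scalar check of the equivalence, assuming X86Blendi takes lane i from its second operand when mask bit i is set, as the TableGen comments above describe:

    #include <cassert>

    // blend(fsub, fadd, 0b1010): a set mask bit selects the fadd result,
    // reproducing ADDSUBPS lane behaviour on four floats.
    static void addsubPsModel(const float a[4], const float b[4], float out[4]) {
      const unsigned mask = 0xA; // 0b1010
      for (int i = 0; i < 4; ++i)
        out[i] = ((mask >> i) & 1) ? a[i] + b[i] : a[i] - b[i];
    }

    int main() {
      float a[4] = {1, 2, 3, 4}, b[4] = {10, 20, 30, 40}, r[4];
      addsubPsModel(a, b, r);
      assert(r[0] == -9.f && r[1] == 22.f && r[2] == -27.f && r[3] == 44.f);
    }

(The "AVX2 adds register forms" comment the excerpt ends on introduces the following broadcast hunk and is unrelated to addsub.)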
class avx2_broadcast_reg<bits<8> opc, string OpcodeStr, RegisterClass RC, Intrinsic Int, SchedWrite Sched> : @@ -7977,16 +8150,15 @@ class avx2_broadcast_reg<bits<8> opc, string OpcodeStr, RegisterClass RC, [(set RC:$dst, (Int VR128:$src))]>, Sched<[Sched]>, VEX; let ExeDomain = SSEPackedSingle in { - def VBROADCASTSSrm : avx_broadcast<0x18, "vbroadcastss", VR128, f32mem, - int_x86_avx_vbroadcast_ss, WriteLoad>; - def VBROADCASTSSYrm : avx_broadcast<0x18, "vbroadcastss", VR256, f32mem, - int_x86_avx_vbroadcast_ss_256, - WriteFShuffleLd>, VEX_L; + def VBROADCASTSSrm : avx_broadcast_no_int<0x18, "vbroadcastss", VR128, + f32mem, v4f32, loadf32, WriteLoad>; + def VBROADCASTSSYrm : avx_broadcast_no_int<0x18, "vbroadcastss", VR256, + f32mem, v8f32, loadf32, + WriteFShuffleLd>, VEX_L; } let ExeDomain = SSEPackedDouble in -def VBROADCASTSDYrm : avx_broadcast<0x19, "vbroadcastsd", VR256, f64mem, - int_x86_avx_vbroadcast_sd_256, - WriteFShuffleLd>, VEX_L; +def VBROADCASTSDYrm : avx_broadcast_no_int<0x19, "vbroadcastsd", VR256, f64mem, + v4f64, loadf64, WriteFShuffleLd>, VEX_L; def VBROADCASTF128 : avx_broadcast<0x1A, "vbroadcastf128", VR256, f128mem, int_x86_avx_vbroadcastf128_pd_256, WriteFShuffleLd>, VEX_L; @@ -8366,6 +8538,21 @@ let Predicates = [HasF16C] in { (VCVTPH2PSrm addr:$src)>; } +// Patterns for matching conversions from float to half-float and vice versa. +let Predicates = [HasF16C] in { + def : Pat<(f32_to_f16 FR32:$src), + (i16 (EXTRACT_SUBREG (VMOVPDI2DIrr (VCVTPS2PHrr + (COPY_TO_REGCLASS FR32:$src, VR128), 0)), sub_16bit))>; + + def : Pat<(f16_to_f32 GR16:$src), + (f32 (COPY_TO_REGCLASS (VCVTPH2PSrr + (COPY_TO_REGCLASS (MOVSX32rr16 GR16:$src), VR128)), FR32)) >; + + def : Pat<(f16_to_f32 (i16 (f32_to_f16 FR32:$src))), + (f32 (COPY_TO_REGCLASS (VCVTPH2PSrr + (VCVTPS2PHrr (COPY_TO_REGCLASS FR32:$src, VR128), 0)), FR32)) >; +} + //===----------------------------------------------------------------------===// // AVX2 Instructions //===----------------------------------------------------------------------===// @@ -8543,13 +8730,6 @@ def : Pat<(v4i32 (X86VBroadcast (loadi32 addr:$src))), } let Predicates = [HasAVX] in { -def : Pat<(v8f32 (X86VBroadcast (loadf32 addr:$src))), - (VBROADCASTSSYrm addr:$src)>; -def : Pat<(v4f64 (X86VBroadcast (loadf64 addr:$src))), - (VBROADCASTSDYrm addr:$src)>; -def : Pat<(v4f32 (X86VBroadcast (loadf32 addr:$src))), - (VBROADCASTSSrm addr:$src)>; - // Provide fallback in case the load node that is used in the patterns above // is used by additional users, which prevents the pattern selection. 
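On the float/half conversion patterns added above: they lower the scalar f32_to_f16 and f16_to_f32 nodes through the packed VCVTPS2PH/VCVTPH2PS instructions by moving the scalar into an XMM register and extracting element 0 afterwards; the immediate 0 on VCVTPS2PHrr selects round-to-nearest-even. User code reaches the same instructions through the F16C intrinsics; a small sketch, assuming a host compiled with F16C support (e.g. -mf16c):

    #include <immintrin.h>
    #include <cstdint>
    #include <cstdio>

    // Round-trip a float through half precision; _cvtss_sh/_cvtsh_ss compile
    // to VCVTPS2PH / VCVTPH2PS, the instructions the new patterns select.
    int main() {
      float x = 3.14159f;
      uint16_t h = _cvtss_sh(x, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
      float back = _cvtsh_ss(h);
      std::printf("%f -> 0x%04x -> %f\n", x, h, back); // some precision loss expected
      return 0;
    }

The third pattern above folds a same-value round trip, f16_to_f32(f32_to_f16 x), into a single convert pair without going through memory. (The broadcast patterns the diff resumes with below, under "let AddedComplexity = 20", are the fallback the preceding comment describes for loads with additional users.)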
let AddedComplexity = 20 in { diff --git a/lib/Target/X86/X86InstrSystem.td b/lib/Target/X86/X86InstrSystem.td index b5595cb..5402780 100644 --- a/lib/Target/X86/X86InstrSystem.td +++ b/lib/Target/X86/X86InstrSystem.td @@ -439,7 +439,10 @@ def LLDT16m : I<0x00, MRM2m, (outs), (ins i16mem:$src), let SchedRW = [WriteSystem] in { def WRMSR : I<0x30, RawFrm, (outs), (ins), "wrmsr", [], IIC_WRMSR>, TB; def RDMSR : I<0x32, RawFrm, (outs), (ins), "rdmsr", [], IIC_RDMSR>, TB; -def RDPMC : I<0x33, RawFrm, (outs), (ins), "rdpmc", [], IIC_RDPMC>, TB; + +let Defs = [RAX, RDX], Uses = [ECX] in + def RDPMC : I<0x33, RawFrm, (outs), (ins), "rdpmc", [(X86rdpmc)], IIC_RDPMC>, + TB; def SMSW16r : I<0x01, MRM4r, (outs GR16:$dst), (ins), "smsw{w}\t$dst", [], IIC_SMSW>, OpSize16, TB; diff --git a/lib/Target/X86/X86JITInfo.cpp b/lib/Target/X86/X86JITInfo.cpp index e969ef2..a082c4f 100644 --- a/lib/Target/X86/X86JITInfo.cpp +++ b/lib/Target/X86/X86JITInfo.cpp @@ -432,7 +432,7 @@ X86JITInfo::getLazyResolverFunction(JITCompilerFn F) { // SSE Callback should be called for SSE-enabled LLVM. return X86CompilationCallback_SSE; #else - if (Subtarget->hasSSE1()) + if (useSSE) return X86CompilationCallback_SSE; #endif #endif @@ -440,8 +440,8 @@ X86JITInfo::getLazyResolverFunction(JITCompilerFn F) { return X86CompilationCallback; } -X86JITInfo::X86JITInfo(X86TargetMachine &tm) : TM(tm) { - Subtarget = &TM.getSubtarget<X86Subtarget>(); +X86JITInfo::X86JITInfo(bool UseSSE) { + useSSE = UseSSE; useGOT = 0; TLSOffset = nullptr; } diff --git a/lib/Target/X86/X86JITInfo.h b/lib/Target/X86/X86JITInfo.h index 4d279de..564343f 100644 --- a/lib/Target/X86/X86JITInfo.h +++ b/lib/Target/X86/X86JITInfo.h @@ -19,16 +19,14 @@ #include "llvm/Target/TargetJITInfo.h" namespace llvm { - class X86TargetMachine; class X86Subtarget; class X86JITInfo : public TargetJITInfo { - X86TargetMachine &TM; - const X86Subtarget *Subtarget; uintptr_t PICBase; - char* TLSOffset; + char *TLSOffset; + bool useSSE; public: - explicit X86JITInfo(X86TargetMachine &tm); + explicit X86JITInfo(bool UseSSE); /// replaceMachineCodeForFunction - Make it so that calling the function /// whose machine code is at OLD turns into a call to NEW, perhaps by diff --git a/lib/Target/X86/X86MCInstLower.cpp b/lib/Target/X86/X86MCInstLower.cpp index 0190080..2bd70a9 100644 --- a/lib/Target/X86/X86MCInstLower.cpp +++ b/lib/Target/X86/X86MCInstLower.cpp @@ -13,6 +13,7 @@ //===----------------------------------------------------------------------===// #include "X86AsmPrinter.h" +#include "X86RegisterInfo.h" #include "InstPrinter/X86ATTInstPrinter.h" #include "MCTargetDesc/X86BaseInfo.h" #include "llvm/ADT/SmallString.h" @@ -779,6 +780,9 @@ static void LowerPATCHPOINT(MCStreamer &OS, StackMaps &SM, void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { X86MCInstLower MCInstLowering(*MF, *this); + const X86RegisterInfo *RI = + static_cast<const X86RegisterInfo *>(TM.getRegisterInfo()); + switch (MI->getOpcode()) { case TargetOpcode::DBG_VALUE: llvm_unreachable("Should be handled target independently"); @@ -883,6 +887,37 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { .addReg(X86::R10) .addReg(X86::RAX)); return; + + case X86::SEH_PushReg: + OutStreamer.EmitWinCFIPushReg(RI->getSEHRegNum(MI->getOperand(0).getImm())); + return; + + case X86::SEH_SaveReg: + OutStreamer.EmitWinCFISaveReg(RI->getSEHRegNum(MI->getOperand(0).getImm()), + MI->getOperand(1).getImm()); + return; + + case X86::SEH_SaveXMM: + 
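// (Annotation: the SEH_* opcodes handled in this group are pseudo-instructions
// that carry Win64 unwind information through codegen; each case simply
// forwards its operands to the corresponding .seh_* directive via the
// streamer's EmitWinCFI* hooks instead of emitting machine code.)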
OutStreamer.EmitWinCFISaveXMM(RI->getSEHRegNum(MI->getOperand(0).getImm()), + MI->getOperand(1).getImm()); + return; + + case X86::SEH_StackAlloc: + OutStreamer.EmitWinCFIAllocStack(MI->getOperand(0).getImm()); + return; + + case X86::SEH_SetFrame: + OutStreamer.EmitWinCFISetFrame(RI->getSEHRegNum(MI->getOperand(0).getImm()), + MI->getOperand(1).getImm()); + return; + + case X86::SEH_PushFrame: + OutStreamer.EmitWinCFIPushFrame(MI->getOperand(0).getImm()); + return; + + case X86::SEH_EndPrologue: + OutStreamer.EmitWinCFIEndProlog(); + return; } MCInst TmpInst; diff --git a/lib/Target/X86/X86RegisterInfo.cpp b/lib/Target/X86/X86RegisterInfo.cpp index a83e1e4..e8a7e84 100644 --- a/lib/Target/X86/X86RegisterInfo.cpp +++ b/lib/Target/X86/X86RegisterInfo.cpp @@ -53,20 +53,18 @@ static cl::opt<bool> EnableBasePointer("x86-use-base-pointer", cl::Hidden, cl::init(true), cl::desc("Enable use of a base pointer for complex stack frames")); -X86RegisterInfo::X86RegisterInfo(X86TargetMachine &tm) - : X86GenRegisterInfo((tm.getSubtarget<X86Subtarget>().is64Bit() - ? X86::RIP : X86::EIP), - X86_MC::getDwarfRegFlavour(tm.getTargetTriple(), false), - X86_MC::getDwarfRegFlavour(tm.getTargetTriple(), true), - (tm.getSubtarget<X86Subtarget>().is64Bit() - ? X86::RIP : X86::EIP)), - TM(tm) { +X86RegisterInfo::X86RegisterInfo(const X86Subtarget &STI) + : X86GenRegisterInfo( + (STI.is64Bit() ? X86::RIP : X86::EIP), + X86_MC::getDwarfRegFlavour(STI.getTargetTriple(), false), + X86_MC::getDwarfRegFlavour(STI.getTargetTriple(), true), + (STI.is64Bit() ? X86::RIP : X86::EIP)), + Subtarget(STI) { X86_MC::InitLLVM2SEHRegisterMapping(this); // Cache some information. - const X86Subtarget *Subtarget = &TM.getSubtarget<X86Subtarget>(); - Is64Bit = Subtarget->is64Bit(); - IsWin64 = Subtarget->isTargetWin64(); + Is64Bit = Subtarget.is64Bit(); + IsWin64 = Subtarget.isTargetWin64(); if (Is64Bit) { SlotSize = 8; @@ -83,21 +81,6 @@ X86RegisterInfo::X86RegisterInfo(X86TargetMachine &tm) BasePtr = Is64Bit ? X86::RBX : X86::ESI; } -/// getCompactUnwindRegNum - This function maps the register to the number for -/// compact unwind encoding. Return -1 if the register isn't valid. -int X86RegisterInfo::getCompactUnwindRegNum(unsigned RegNum, bool isEH) const { - switch (getLLVMRegNum(RegNum, isEH)) { - case X86::EBX: case X86::RBX: return 1; - case X86::ECX: case X86::R12: return 2; - case X86::EDX: case X86::R13: return 3; - case X86::EDI: case X86::R14: return 4; - case X86::ESI: case X86::R15: return 5; - case X86::EBP: case X86::RBP: return 6; - } - - return -1; -} - bool X86RegisterInfo::trackLivenessAfterRegAlloc(const MachineFunction &MF) const { // ExeDepsFixer and PostRAScheduler require liveness. @@ -173,9 +156,8 @@ X86RegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC) const{ } const TargetRegisterClass * -X86RegisterInfo::getPointerRegClass(const MachineFunction &MF, unsigned Kind) - const { - const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>(); +X86RegisterInfo::getPointerRegClass(const MachineFunction &MF, + unsigned Kind) const { switch (Kind) { default: llvm_unreachable("Unexpected Kind in getPointerRegClass!"); case 0: // Normal GPRs. @@ -225,7 +207,7 @@ X86RegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, case X86::GR64RegClassID: return 12 - FPDiff; case X86::VR128RegClassID: - return TM.getSubtarget<X86Subtarget>().is64Bit() ? 10 : 4; + return Subtarget.is64Bit() ? 
10 : 4; case X86::VR64RegClassID: return 4; } @@ -233,8 +215,8 @@ X86RegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, const MCPhysReg * X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { - bool HasAVX = TM.getSubtarget<X86Subtarget>().hasAVX(); - bool HasAVX512 = TM.getSubtarget<X86Subtarget>().hasAVX512(); + bool HasAVX = Subtarget.hasAVX(); + bool HasAVX512 = Subtarget.hasAVX512(); assert(MF && "MachineFunction required"); switch (MF->getFunction()->getCallingConv()) { @@ -287,8 +269,8 @@ X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { const uint32_t* X86RegisterInfo::getCallPreservedMask(CallingConv::ID CC) const { - bool HasAVX = TM.getSubtarget<X86Subtarget>().hasAVX(); - bool HasAVX512 = TM.getSubtarget<X86Subtarget>().hasAVX512(); + bool HasAVX = Subtarget.hasAVX(); + bool HasAVX512 = Subtarget.hasAVX512(); switch (CC) { case CallingConv::GHC: @@ -406,7 +388,7 @@ BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const { Reserved.set(*AI); } } - if (!Is64Bit || !TM.getSubtarget<X86Subtarget>().hasAVX512()) { + if (!Is64Bit || !Subtarget.hasAVX512()) { for (unsigned n = 16; n != 32; ++n) { for (MCRegAliasIterator AI(X86::XMM0 + n, this, true); AI.isValid(); ++AI) Reserved.set(*AI); @@ -459,7 +441,7 @@ bool X86RegisterInfo::canRealignStack(const MachineFunction &MF) const { bool X86RegisterInfo::needsStackRealignment(const MachineFunction &MF) const { const MachineFrameInfo *MFI = MF.getFrameInfo(); const Function *F = MF.getFunction(); - unsigned StackAlign = TM.getFrameLowering()->getStackAlignment(); + unsigned StackAlign = MF.getTarget().getFrameLowering()->getStackAlignment(); bool requiresRealignment = ((MFI->getMaxAlignment() > StackAlign) || F->getAttributes().hasAttribute(AttributeSet::FunctionIndex, diff --git a/lib/Target/X86/X86RegisterInfo.h b/lib/Target/X86/X86RegisterInfo.h index 2289d91..74efd1f 100644 --- a/lib/Target/X86/X86RegisterInfo.h +++ b/lib/Target/X86/X86RegisterInfo.h @@ -22,11 +22,11 @@ namespace llvm { class Type; class TargetInstrInfo; - class X86TargetMachine; + class X86Subtarget; class X86RegisterInfo final : public X86GenRegisterInfo { public: - X86TargetMachine &TM; + const X86Subtarget &Subtarget; private: /// Is64Bit - Is the target 64-bits. @@ -55,15 +55,11 @@ private: unsigned BasePtr; public: - X86RegisterInfo(X86TargetMachine &tm); + X86RegisterInfo(const X86Subtarget &STI); // FIXME: This should be tablegen'd like getDwarfRegNum is int getSEHRegNum(unsigned i) const; - /// getCompactUnwindRegNum - This function maps the register to the number for - /// compact unwind encoding. Return -1 if the register isn't valid. - int getCompactUnwindRegNum(unsigned RegNum, bool isEH) const override; - /// Code Generation virtual methods... 
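/// (Annotation: the register-info object is now constructed directly from
/// the X86Subtarget, so predicates such as is64Bit() and hasAVX512() no
/// longer have to round-trip through the TargetMachine on every query.)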
/// bool trackLivenessAfterRegAlloc(const MachineFunction &MF) const override; diff --git a/lib/Target/X86/X86SelectionDAGInfo.cpp b/lib/Target/X86/X86SelectionDAGInfo.cpp index 744890d..a83dd9b 100644 --- a/lib/Target/X86/X86SelectionDAGInfo.cpp +++ b/lib/Target/X86/X86SelectionDAGInfo.cpp @@ -11,21 +11,23 @@ // //===----------------------------------------------------------------------===// -#include "X86TargetMachine.h" +#include "X86InstrInfo.h" +#include "X86ISelLowering.h" +#include "X86RegisterInfo.h" +#include "X86Subtarget.h" +#include "X86SelectionDAGInfo.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/IR/DerivedTypes.h" +#include "llvm/Target/TargetLowering.h" + using namespace llvm; #define DEBUG_TYPE "x86-selectiondag-info" -X86SelectionDAGInfo::X86SelectionDAGInfo(const X86TargetMachine &TM) : - TargetSelectionDAGInfo(TM), - Subtarget(&TM.getSubtarget<X86Subtarget>()), - TLI(*TM.getTargetLowering()) { -} +X86SelectionDAGInfo::X86SelectionDAGInfo(const DataLayout &DL) + : TargetSelectionDAGInfo(&DL) {} -X86SelectionDAGInfo::~X86SelectionDAGInfo() { -} +X86SelectionDAGInfo::~X86SelectionDAGInfo() {} SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl, @@ -35,6 +37,7 @@ X86SelectionDAGInfo::EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl, bool isVolatile, MachinePointerInfo DstPtrInfo) const { ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size); + const X86Subtarget &Subtarget = DAG.getTarget().getSubtarget<X86Subtarget>(); // If to a segment-relative address space, use the default lowering. if (DstPtrInfo.getAddrSpace() >= 256) @@ -43,16 +46,14 @@ X86SelectionDAGInfo::EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl, // If not DWORD aligned or size is more than the threshold, call the library. // The libc version is likely to be faster for these cases. It can use the // address value and run time information about the CPU. - if ((Align & 3) != 0 || - !ConstantSize || - ConstantSize->getZExtValue() > - Subtarget->getMaxInlineSizeThreshold()) { + if ((Align & 3) != 0 || !ConstantSize || + ConstantSize->getZExtValue() > Subtarget.getMaxInlineSizeThreshold()) { // Check to see if there is a specialized entry-point for memory zeroing. ConstantSDNode *V = dyn_cast<ConstantSDNode>(Src); if (const char *bzeroEntry = V && - V->isNullValue() ? Subtarget->getBZeroEntry() : nullptr) { - EVT IntPtr = TLI.getPointerTy(); + V->isNullValue() ? 
Subtarget.getBZeroEntry() : nullptr) { + EVT IntPtr = DAG.getTargetLoweringInfo().getPointerTy(); Type *IntPtrTy = getDataLayout()->getIntPtrType(*DAG.getContext()); TargetLowering::ArgListTy Args; TargetLowering::ArgListEntry Entry; @@ -65,10 +66,11 @@ X86SelectionDAGInfo::EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl, TargetLowering::CallLoweringInfo CLI(DAG); CLI.setDebugLoc(dl).setChain(Chain) .setCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()), - DAG.getExternalSymbol(bzeroEntry, IntPtr), &Args, 0) + DAG.getExternalSymbol(bzeroEntry, IntPtr), std::move(Args), + 0) .setDiscardResult(); - std::pair<SDValue,SDValue> CallResult = TLI.LowerCallTo(CLI); + std::pair<SDValue,SDValue> CallResult = DAG.getTargetLoweringInfo().LowerCallTo(CLI); return CallResult.second; } @@ -99,7 +101,7 @@ X86SelectionDAGInfo::EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl, ValReg = X86::EAX; Val = (Val << 8) | Val; Val = (Val << 16) | Val; - if (Subtarget->is64Bit() && ((Align & 0x7) == 0)) { // QWORD aligned + if (Subtarget.is64Bit() && ((Align & 0x7) == 0)) { // QWORD aligned AVT = MVT::i64; ValReg = X86::RAX; Val = (Val << 32) | Val; @@ -128,13 +130,11 @@ X86SelectionDAGInfo::EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl, InFlag = Chain.getValue(1); } - Chain = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RCX : - X86::ECX, - Count, InFlag); + Chain = DAG.getCopyToReg(Chain, dl, Subtarget.is64Bit() ? X86::RCX : X86::ECX, + Count, InFlag); InFlag = Chain.getValue(1); - Chain = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RDI : - X86::EDI, - Dst, InFlag); + Chain = DAG.getCopyToReg(Chain, dl, Subtarget.is64Bit() ? X86::RDI : X86::EDI, + Dst, InFlag); InFlag = Chain.getValue(1); SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); @@ -182,10 +182,11 @@ X86SelectionDAGInfo::EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl, // This requires the copy size to be a constant, preferably // within a subtarget-specific limit. ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size); + const X86Subtarget &Subtarget = DAG.getTarget().getSubtarget<X86Subtarget>(); if (!ConstantSize) return SDValue(); uint64_t SizeVal = ConstantSize->getZExtValue(); - if (!AlwaysInline && SizeVal > Subtarget->getMaxInlineSizeThreshold()) + if (!AlwaysInline && SizeVal > Subtarget.getMaxInlineSizeThreshold()) return SDValue(); /// If not DWORD aligned, it is more efficient to call the library. However @@ -218,7 +219,7 @@ X86SelectionDAGInfo::EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl, AVT = MVT::i32; else // QWORD aligned - AVT = Subtarget->is64Bit() ? MVT::i64 : MVT::i32; + AVT = Subtarget.is64Bit() ? MVT::i64 : MVT::i32; unsigned UBytes = AVT.getSizeInBits() / 8; unsigned CountVal = SizeVal / UBytes; @@ -226,15 +227,15 @@ X86SelectionDAGInfo::EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl, unsigned BytesLeft = SizeVal % UBytes; SDValue InFlag; - Chain = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RCX : + Chain = DAG.getCopyToReg(Chain, dl, Subtarget.is64Bit() ? X86::RCX : X86::ECX, Count, InFlag); InFlag = Chain.getValue(1); - Chain = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RDI : + Chain = DAG.getCopyToReg(Chain, dl, Subtarget.is64Bit() ? X86::RDI : X86::EDI, Dst, InFlag); InFlag = Chain.getValue(1); - Chain = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RSI : + Chain = DAG.getCopyToReg(Chain, dl, Subtarget.is64Bit() ? 
X86::RSI : X86::ESI, Src, InFlag); InFlag = Chain.getValue(1); diff --git a/lib/Target/X86/X86SelectionDAGInfo.h b/lib/Target/X86/X86SelectionDAGInfo.h index 0d5dc38..c12555a 100644 --- a/lib/Target/X86/X86SelectionDAGInfo.h +++ b/lib/Target/X86/X86SelectionDAGInfo.h @@ -23,14 +23,8 @@ class X86TargetMachine; class X86Subtarget; class X86SelectionDAGInfo : public TargetSelectionDAGInfo { - /// Subtarget - Keep a pointer to the X86Subtarget around so that we can - /// make the right decision when generating code for different targets. - const X86Subtarget *Subtarget; - - const X86TargetLowering &TLI; - public: - explicit X86SelectionDAGInfo(const X86TargetMachine &TM); + explicit X86SelectionDAGInfo(const DataLayout &DL); ~X86SelectionDAGInfo(); SDValue EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl, diff --git a/lib/Target/X86/X86Subtarget.cpp b/lib/Target/X86/X86Subtarget.cpp index 989e0d6..79b7e68 100644 --- a/lib/Target/X86/X86Subtarget.cpp +++ b/lib/Target/X86/X86Subtarget.cpp @@ -291,13 +291,60 @@ void X86Subtarget::initializeEnvironment() { CallRegIndirect = false; LEAUsesAG = false; SlowLEA = false; + SlowIncDec = false; stackAlignment = 4; // FIXME: this is a known good value for Yonah. How about others? MaxInlineSizeThreshold = 128; } +static std::string computeDataLayout(const X86Subtarget &ST) { + // X86 is little endian + std::string Ret = "e"; + + Ret += DataLayout::getManglingComponent(ST.getTargetTriple()); + // X86 and x32 have 32 bit pointers. + if (ST.isTarget64BitILP32() || !ST.is64Bit()) + Ret += "-p:32:32"; + + // Some ABIs align 64 bit integers and doubles to 64 bits, others to 32. + if (ST.is64Bit() || ST.isOSWindows() || ST.isTargetNaCl()) + Ret += "-i64:64"; + else + Ret += "-f64:32:64"; + + // Some ABIs align long double to 128 bits, others to 32. + if (ST.isTargetNaCl()) + ; // No f80 + else if (ST.is64Bit() || ST.isTargetDarwin()) + Ret += "-f80:128"; + else + Ret += "-f80:32"; + + // The registers can hold 8, 16, 32 or, in x86-64, 64 bits. + if (ST.is64Bit()) + Ret += "-n8:16:32:64"; + else + Ret += "-n8:16:32"; + + // The stack is aligned to 32 bits on some ABIs and 128 bits on others. + if (!ST.is64Bit() && ST.isOSWindows()) + Ret += "-S32"; + else + Ret += "-S128"; + + return Ret; +} + +X86Subtarget &X86Subtarget::initializeSubtargetDependencies(StringRef CPU, + StringRef FS) { + initializeEnvironment(); + resetSubtargetFeatures(CPU, FS); + return *this; +} + X86Subtarget::X86Subtarget(const std::string &TT, const std::string &CPU, - const std::string &FS, unsigned StackAlignOverride) + const std::string &FS, X86TargetMachine &TM, + unsigned StackAlignOverride) : X86GenSubtargetInfo(TT, CPU, FS), X86ProcFamily(Others), PICStyle(PICStyles::None), TargetTriple(TT), StackAlignOverride(StackAlignOverride), @@ -305,10 +352,12 @@ X86Subtarget::X86Subtarget(const std::string &TT, const std::string &CPU, In32BitMode(TargetTriple.getArch() == Triple::x86 && TargetTriple.getEnvironment() != Triple::CODE16), In16BitMode(TargetTriple.getArch() == Triple::x86 && - TargetTriple.getEnvironment() == Triple::CODE16) { - initializeEnvironment(); - resetSubtargetFeatures(CPU, FS); -} + TargetTriple.getEnvironment() == Triple::CODE16), + DL(computeDataLayout(*this)), TSInfo(DL), + InstrInfo(initializeSubtargetDependencies(CPU, FS)), TLInfo(TM), + FrameLowering(TargetFrameLowering::StackGrowsDown, getStackAlignment(), + is64Bit() ? 
-8 : -4), + JITInfo(hasSSE1()) {} bool X86Subtarget::enablePostRAScheduler(CodeGenOpt::Level OptLevel, diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h index 703559a..09db0eb 100644 --- a/lib/Target/X86/X86Subtarget.h +++ b/lib/Target/X86/X86Subtarget.h @@ -14,6 +14,11 @@ #ifndef X86SUBTARGET_H #define X86SUBTARGET_H +#include "X86FrameLowering.h" +#include "X86ISelLowering.h" +#include "X86InstrInfo.h" +#include "X86JITInfo.h" +#include "X86SelectionDAGInfo.h" #include "llvm/ADT/Triple.h" #include "llvm/IR/CallingConv.h" #include "llvm/Target/TargetSubtargetInfo.h" @@ -40,6 +45,7 @@ enum Style { } class X86Subtarget final : public X86GenSubtargetInfo { + protected: enum X86SSEEnum { NoMMXSSE, MMX, SSE1, SSE2, SSE3, SSSE3, SSE41, SSE42, AVX, AVX2, AVX512F @@ -181,6 +187,9 @@ protected: /// SlowLEA - True if the LEA instruction with certain arguments is slow bool SlowLEA; + /// SlowIncDec - True if INC and DEC instructions are slow when writing to flags + bool SlowIncDec; + /// Processor has AVX-512 PreFetch Instructions bool HasPFI; @@ -217,14 +226,31 @@ private: /// In16BitMode - True if compiling for 16-bit, false for 32-bit or 64-bit. bool In16BitMode; + // Calculates type size & alignment + const DataLayout DL; + X86SelectionDAGInfo TSInfo; + // Ordering here is important. X86InstrInfo initializes X86RegisterInfo which + // X86TargetLowering needs. + X86InstrInfo InstrInfo; + X86TargetLowering TLInfo; + X86FrameLowering FrameLowering; + X86JITInfo JITInfo; + public: /// This constructor initializes the data members to match that /// of the specified triple. /// X86Subtarget(const std::string &TT, const std::string &CPU, - const std::string &FS, + const std::string &FS, X86TargetMachine &TM, unsigned StackAlignOverride); + const X86TargetLowering *getTargetLowering() const { return &TLInfo; } + const X86InstrInfo *getInstrInfo() const { return &InstrInfo; } + const DataLayout *getDataLayout() const { return &DL; } + const X86FrameLowering *getFrameLowering() const { return &FrameLowering; } + const X86SelectionDAGInfo *getSelectionDAGInfo() const { return &TSInfo; } + X86JITInfo *getJITInfo() { return &JITInfo; } + /// getStackAlignment - Returns the minimum alignment known to hold of the /// stack frame on entry to the function and which must be maintained by every /// function for this subtarget. @@ -241,6 +267,9 @@ public: /// \brief Reset the features for the X86 target. void resetSubtargetFeatures(const MachineFunction *MF) override; private: + /// \brief Initialize the full set of dependencies so we can use an initializer + /// list for X86Subtarget. 
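/// (Annotation: returning *this lets the constructor order its member-init
/// list as InstrInfo(initializeSubtargetDependencies(CPU, FS)), so feature
/// parsing is guaranteed to have finished before any member that depends on
/// the parsed features is constructed.)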
+ X86Subtarget &initializeSubtargetDependencies(StringRef CPU, StringRef FS); void initializeEnvironment(); void resetSubtargetFeatures(StringRef CPU, StringRef FS); public: @@ -319,6 +348,7 @@ public: bool callRegIndirect() const { return CallRegIndirect; } bool LEAusesAG() const { return LEAUsesAG; } bool slowLEA() const { return SlowLEA; } + bool slowIncDec() const { return SlowIncDec; } bool hasCDI() const { return HasCDI; } bool hasPFI() const { return HasPFI; } bool hasERI() const { return HasERI; } diff --git a/lib/Target/X86/X86TargetMachine.cpp b/lib/Target/X86/X86TargetMachine.cpp index 93760ef..f12140f 100644 --- a/lib/Target/X86/X86TargetMachine.cpp +++ b/lib/Target/X86/X86TargetMachine.cpp @@ -29,61 +29,14 @@ extern "C" void LLVMInitializeX86Target() { void X86TargetMachine::anchor() { } -static std::string computeDataLayout(const X86Subtarget &ST) { - // X86 is little endian - std::string Ret = "e"; - - Ret += DataLayout::getManglingComponent(ST.getTargetTriple()); - // X86 and x32 have 32 bit pointers. - if (ST.isTarget64BitILP32() || !ST.is64Bit()) - Ret += "-p:32:32"; - - // Some ABIs align 64 bit integers and doubles to 64 bits, others to 32. - if (ST.is64Bit() || ST.isTargetCygMing() || ST.isTargetKnownWindowsMSVC() || - ST.isTargetNaCl()) - Ret += "-i64:64"; - else - Ret += "-f64:32:64"; - - // Some ABIs align long double to 128 bits, others to 32. - if (ST.isTargetNaCl()) - ; // No f80 - else if (ST.is64Bit() || ST.isTargetDarwin()) - Ret += "-f80:128"; - else - Ret += "-f80:32"; - - // The registers can hold 8, 16, 32 or, in x86-64, 64 bits. - if (ST.is64Bit()) - Ret += "-n8:16:32:64"; - else - Ret += "-n8:16:32"; - - // The stack is aligned to 32 bits on some ABIs and 128 bits on others. - if (!ST.is64Bit() && (ST.isTargetCygMing() || ST.isTargetKnownWindowsMSVC())) - Ret += "-S32"; - else - Ret += "-S128"; - - return Ret; -} - /// X86TargetMachine ctor - Create an X86 target. /// -X86TargetMachine::X86TargetMachine(const Target &T, StringRef TT, - StringRef CPU, StringRef FS, - const TargetOptions &Options, +X86TargetMachine::X86TargetMachine(const Target &T, StringRef TT, StringRef CPU, + StringRef FS, const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL) - : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL), - Subtarget(TT, CPU, FS, Options.StackAlignmentOverride), - FrameLowering(*this, Subtarget), - InstrItins(Subtarget.getInstrItineraryData()), - DL(computeDataLayout(*getSubtargetImpl())), - InstrInfo(*this), - TLInfo(*this), - TSInfo(*this), - JITInfo(*this) { + : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL), + Subtarget(TT, CPU, FS, *this, Options.StackAlignmentOverride) { // Determine the PICStyle based on the target selected. if (getRelocationModel() == Reloc::Static) { // Unless we're in PIC or DynamicNoPIC mode, set the PIC style to None. @@ -158,6 +111,7 @@ public: return *getX86TargetMachine().getSubtargetImpl(); } + void addIRPasses() override; bool addInstSelector() override; bool addILPOpts() override; bool addPreRegAlloc() override; @@ -170,6 +124,12 @@ TargetPassConfig *X86TargetMachine::createPassConfig(PassManagerBase &PM) { return new X86PassConfig(this, PM); } +void X86PassConfig::addIRPasses() { + addPass(createX86AtomicExpandPass(&getX86TargetMachine())); + + TargetPassConfig::addIRPasses(); +} + bool X86PassConfig::addInstSelector() { // Install an instruction selector. 
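  // (Annotation: the IR-level passes registered in addIRPasses above,
  // including the new X86AtomicExpandPass, run before instruction selection,
  // so the SelectionDAG built here only sees atomics that survived that
  // expansion.)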
addPass(createX86ISelDag(getX86TargetMachine(), getOptLevel())); diff --git a/lib/Target/X86/X86TargetMachine.h b/lib/Target/X86/X86TargetMachine.h index 57e6eda..41d5157 100644 --- a/lib/Target/X86/X86TargetMachine.h +++ b/lib/Target/X86/X86TargetMachine.h @@ -13,12 +13,7 @@ #ifndef X86TARGETMACHINE_H #define X86TARGETMACHINE_H - -#include "X86FrameLowering.h" -#include "X86ISelLowering.h" #include "X86InstrInfo.h" -#include "X86JITInfo.h" -#include "X86SelectionDAGInfo.h" #include "X86Subtarget.h" #include "llvm/IR/DataLayout.h" #include "llvm/Target/TargetMachine.h" @@ -30,13 +25,6 @@ class StringRef; class X86TargetMachine final : public LLVMTargetMachine { virtual void anchor(); X86Subtarget Subtarget; - X86FrameLowering FrameLowering; - InstrItineraryData InstrItins; - const DataLayout DL; // Calculates type size & alignment - X86InstrInfo InstrInfo; - X86TargetLowering TLInfo; - X86SelectionDAGInfo TSInfo; - X86JITInfo JITInfo; public: X86TargetMachine(const Target &T, StringRef TT, @@ -44,28 +32,28 @@ public: Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL); - const DataLayout *getDataLayout() const override { return &DL; } + const DataLayout *getDataLayout() const override { + return getSubtargetImpl()->getDataLayout(); + } const X86InstrInfo *getInstrInfo() const override { - return &InstrInfo; + return getSubtargetImpl()->getInstrInfo(); } const TargetFrameLowering *getFrameLowering() const override { - return &FrameLowering; - } - X86JITInfo *getJITInfo() override { - return &JITInfo; + return getSubtargetImpl()->getFrameLowering(); } + X86JITInfo *getJITInfo() override { return Subtarget.getJITInfo(); } const X86Subtarget *getSubtargetImpl() const override { return &Subtarget; } const X86TargetLowering *getTargetLowering() const override { - return &TLInfo; + return getSubtargetImpl()->getTargetLowering(); } const X86SelectionDAGInfo *getSelectionDAGInfo() const override { - return &TSInfo; + return getSubtargetImpl()->getSelectionDAGInfo(); } const X86RegisterInfo *getRegisterInfo() const override { return &getInstrInfo()->getRegisterInfo(); } const InstrItineraryData *getInstrItineraryData() const override { - return &InstrItins; + return &getSubtargetImpl()->getInstrItineraryData(); } /// \brief Register X86 analysis passes with a pass manager. diff --git a/lib/Target/X86/X86TargetTransformInfo.cpp b/lib/Target/X86/X86TargetTransformInfo.cpp index 91b9d40..c961e2f 100644 --- a/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/lib/Target/X86/X86TargetTransformInfo.cpp @@ -102,6 +102,8 @@ public: unsigned getReductionCost(unsigned Opcode, Type *Ty, bool IsPairwiseForm) const override; + unsigned getIntImmCost(int64_t) const; + unsigned getIntImmCost(const APInt &Imm, Type *Ty) const override; unsigned getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm, @@ -142,13 +144,17 @@ unsigned X86TTI::getNumberOfRegisters(bool Vector) const { if (Vector && !ST->hasSSE1()) return 0; - if (ST->is64Bit()) + if (ST->is64Bit()) { + if (Vector && ST->hasAVX512()) + return 32; return 16; + } return 8; } unsigned X86TTI::getRegisterBitWidth(bool Vector) const { if (Vector) { + if (ST->hasAVX512()) return 512; if (ST->hasAVX()) return 256; if (ST->hasSSE1()) return 128; return 0; @@ -400,17 +406,117 @@ unsigned X86TTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty, unsigned X86TTI::getShuffleCost(ShuffleKind Kind, Type *Tp, int Index, Type *SubTp) const { - // We only estimate the cost of reverse shuffles. 
-  if (Kind != SK_Reverse)
+  // We only estimate the cost of reverse and alternate shuffles.
+  if (Kind != SK_Reverse && Kind != SK_Alternate)
     return TargetTransformInfo::getShuffleCost(Kind, Tp, Index, SubTp);

-  std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Tp);
-  unsigned Cost = 1;
-  if (LT.second.getSizeInBits() > 128)
-    Cost = 3; // Extract + insert + copy.
+  if (Kind == SK_Reverse) {
+    std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Tp);
+    unsigned Cost = 1;
+    if (LT.second.getSizeInBits() > 128)
+      Cost = 3; // Extract + insert + copy.
+
+    // Multiply by the number of parts.
+    return Cost * LT.first;
+  }
+
+  if (Kind == SK_Alternate) {
+    // 64-bit packed float vectors (v2f32) are widened to type v4f32.
+    // 64-bit packed integer vectors (v2i32) are promoted to type v2i64.
+    std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Tp);
+
+    // The backend knows how to generate a single VEX.256 version of
+    // instruction VPBLENDW if the target supports AVX2.
+    if (ST->hasAVX2() && LT.second == MVT::v16i16)
+      return LT.first;
+
+    static const CostTblEntry<MVT::SimpleValueType> AVXAltShuffleTbl[] = {
+      {ISD::VECTOR_SHUFFLE, MVT::v4i64, 1},  // vblendpd
+      {ISD::VECTOR_SHUFFLE, MVT::v4f64, 1},  // vblendpd
+
+      {ISD::VECTOR_SHUFFLE, MVT::v8i32, 1},  // vblendps
+      {ISD::VECTOR_SHUFFLE, MVT::v8f32, 1},  // vblendps
+
+      // This shuffle is custom lowered into a sequence of:
+      // 2x vextractf128 , 2x vpblendw , 1x vinsertf128
+      {ISD::VECTOR_SHUFFLE, MVT::v16i16, 5},
+
+      // This shuffle is custom lowered into a long sequence of:
+      // 2x vextractf128 , 4x vpshufb , 2x vpor , 1x vinsertf128
+      {ISD::VECTOR_SHUFFLE, MVT::v32i8, 9}
+    };
+
+    if (ST->hasAVX()) {
+      int Idx = CostTableLookup(AVXAltShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second);
+      if (Idx != -1)
+        return LT.first * AVXAltShuffleTbl[Idx].Cost;
+    }
+
+    static const CostTblEntry<MVT::SimpleValueType> SSE41AltShuffleTbl[] = {
+      // These are lowered into movsd.
+      {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
+      {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
+
+      // packed float vectors with four elements are lowered into BLENDI dag
+      // nodes. A v4i32/v4f32 BLENDI generates a single 'blendps'/'blendpd'.
+      {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1},
+      {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1},
+
+      // This shuffle generates a single pshufw.
+      {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1},
+
+      // There is no instruction that matches a v16i8 alternate shuffle.
+      // The backend will expand it into the sequence 'pshufb + pshufb + or'.
+      {ISD::VECTOR_SHUFFLE, MVT::v16i8, 3}
+    };
+
+    if (ST->hasSSE41()) {
+      int Idx = CostTableLookup(SSE41AltShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second);
+      if (Idx != -1)
+        return LT.first * SSE41AltShuffleTbl[Idx].Cost;
+    }
+
+    static const CostTblEntry<MVT::SimpleValueType> SSSE3AltShuffleTbl[] = {
+      {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},  // movsd
+      {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},  // movsd
+
+      // SSE3 doesn't have 'blendps'. The following shuffles are expanded into
+      // the sequence 'shufps + pshufd'
+      {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2},
+      {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2},

-  // Multiple by the number of parts.
-  return Cost * LT.first;
+      {ISD::VECTOR_SHUFFLE, MVT::v8i16, 3},  // pshufb + pshufb + or
+      {ISD::VECTOR_SHUFFLE, MVT::v16i8, 3}   // pshufb + pshufb + or
+    };
+
+    if (ST->hasSSSE3()) {
+      int Idx = CostTableLookup(SSSE3AltShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second);
+      if (Idx != -1)
+        return LT.first * SSSE3AltShuffleTbl[Idx].Cost;
+    }
+
+    static const CostTblEntry<MVT::SimpleValueType> SSEAltShuffleTbl[] = {
+      {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},  // movsd
+      {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},  // movsd
+
+      {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2},  // shufps + pshufd
+      {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2},  // shufps + pshufd
+
+      // This is expanded into a long sequence of four extract + four insert.
+      {ISD::VECTOR_SHUFFLE, MVT::v8i16, 8},  // 4 x pextrw + 4 pinsrw.
+
+      // 8 x (pinsrw + pextrw + and + movb + movzb + or)
+      {ISD::VECTOR_SHUFFLE, MVT::v16i8, 48}
+    };
+
+    // Fall-back (SSE3 and SSE2).
+    int Idx = CostTableLookup(SSEAltShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second);
+    if (Idx != -1)
+      return LT.first * SSEAltShuffleTbl[Idx].Cost;
+    return TargetTransformInfo::getShuffleCost(Kind, Tp, Index, SubTp);
+  }
+
+  return TargetTransformInfo::getShuffleCost(Kind, Tp, Index, SubTp);
 }

 unsigned X86TTI::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) const {
@@ -808,6 +914,19 @@ unsigned X86TTI::getReductionCost(unsigned Opcode, Type *ValTy,
   return TargetTransformInfo::getReductionCost(Opcode, ValTy, IsPairwise);
 }

+/// \brief Calculate the cost of materializing a 64-bit value. This helper
+/// method might only calculate a fraction of a larger immediate. Therefore it
+/// is valid to return a cost of ZERO.
+unsigned X86TTI::getIntImmCost(int64_t Val) const {
+  if (Val == 0)
+    return TCC_Free;
+
+  if (isInt<32>(Val))
+    return TCC_Basic;
+
+  return 2 * TCC_Basic;
+}
+
 unsigned X86TTI::getIntImmCost(const APInt &Imm, Type *Ty) const {
   assert(Ty->isIntegerTy());

@@ -825,11 +944,21 @@ unsigned X86TTI::getIntImmCost(const APInt &Imm, Type *Ty) const {
   if (Imm == 0)
     return TCC_Free;

-  if (Imm.getBitWidth() <= 64 &&
-      (isInt<32>(Imm.getSExtValue()) || isUInt<32>(Imm.getZExtValue())))
-    return TCC_Basic;
-  else
-    return 2 * TCC_Basic;
+  // Sign-extend all constants to a multiple of 64-bit.
+  APInt ImmVal = Imm;
+  if (BitSize & 0x3f)
+    ImmVal = Imm.sext((BitSize + 63) & ~0x3fU);
+
+  // Split the constant into 64-bit chunks and calculate the cost for each
+  // chunk.
+  unsigned Cost = 0;
+  for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
+    APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
+    int64_t Val = Tmp.getSExtValue();
+    Cost += getIntImmCost(Val);
+  }
+  // We need at least one instruction to materialize the constant.
+  return std::max(1U, Cost);
 }

 unsigned X86TTI::getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
@@ -889,9 +1018,13 @@ unsigned X86TTI::getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
     break;
   }

-  if ((Idx == ImmIdx) &&
-      Imm.getBitWidth() <= 64 && isInt<32>(Imm.getSExtValue()))
-    return TCC_Free;
+  if (Idx == ImmIdx) {
+    unsigned NumConstants = (BitSize + 63) / 64;
+    unsigned Cost = X86TTI::getIntImmCost(Imm, Ty);
+    return (Cost <= NumConstants * TCC_Basic)
+               ? static_cast<unsigned>(TCC_Free)
+               : Cost;
+  }

   return X86TTI::getIntImmCost(Imm, Ty);
 }
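For reference, a minimal standalone sketch of the chunked immediate-cost model
introduced above. TCC_Free and TCC_Basic mirror the TargetTransformInfo cost
constants, and chunkCost/immCost are illustrative names rather than LLVM API:

#include <algorithm>
#include <cstdint>
#include <vector>

enum : unsigned { TCC_Free = 0, TCC_Basic = 1 };

// Cost of materializing one 64-bit chunk: free for zero, one instruction if
// the value fits in a sign-extended 32-bit immediate, two otherwise (a
// movabsq-class materialization).
unsigned chunkCost(int64_t Val) {
  if (Val == 0)
    return TCC_Free;
  if (Val >= INT32_MIN && Val <= INT32_MAX)
    return TCC_Basic;
  return 2 * TCC_Basic;
}

// Cost of a wide immediate already split into 64-bit chunks (low to high):
// sum the per-chunk costs, but always charge at least one instruction.
unsigned immCost(const std::vector<int64_t> &Chunks) {
  unsigned Cost = 0;
  for (int64_t C : Chunks)
    Cost += chunkCost(C);
  return std::max(1u, Cost);
}

Under this model an all-ones i128 immediate splits into two -1 chunks and
costs 2 * TCC_Basic, while an i128 whose upper 64 bits are zero costs no more
than its low chunk alone; that is why the per-chunk helper may legitimately
return a cost of zero.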