aboutsummaryrefslogtreecommitdiffstats
path: root/lib/Target/X86
diff options
context:
space:
mode:
Diffstat (limited to 'lib/Target/X86')
-rw-r--r--lib/Target/X86/Android.mk1
-rw-r--r--lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp371
-rw-r--r--lib/Target/X86/AsmParser/X86AsmInstrumentation.h9
-rw-r--r--lib/Target/X86/AsmParser/X86AsmParser.cpp287
-rw-r--r--lib/Target/X86/AsmParser/X86Operand.h42
-rw-r--r--lib/Target/X86/CMakeLists.txt1
-rw-r--r--lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp3
-rw-r--r--lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp46
-rw-r--r--lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp17
-rw-r--r--lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp13
-rw-r--r--lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h3
-rw-r--r--lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp6
-rw-r--r--lib/Target/X86/X86.h4
-rw-r--r--lib/Target/X86/X86.td7
-rw-r--r--lib/Target/X86/X86AtomicExpandPass.cpp287
-rw-r--r--lib/Target/X86/X86CodeEmitter.cpp17
-rw-r--r--lib/Target/X86/X86FastISel.cpp1265
-rw-r--r--lib/Target/X86/X86FixupLEAs.cpp175
-rw-r--r--lib/Target/X86/X86FrameLowering.cpp407
-rw-r--r--lib/Target/X86/X86FrameLowering.h20
-rw-r--r--lib/Target/X86/X86ISelDAGToDAG.cpp32
-rw-r--r--lib/Target/X86/X86ISelLowering.cpp3517
-rw-r--r--lib/Target/X86/X86ISelLowering.h31
-rw-r--r--lib/Target/X86/X86InstrAVX512.td301
-rw-r--r--lib/Target/X86/X86InstrArithmetic.td6
-rw-r--r--lib/Target/X86/X86InstrCompiler.td139
-rw-r--r--lib/Target/X86/X86InstrFragmentsSIMD.td4
-rw-r--r--lib/Target/X86/X86InstrInfo.cpp136
-rw-r--r--lib/Target/X86/X86InstrInfo.h24
-rw-r--r--lib/Target/X86/X86InstrInfo.td24
-rw-r--r--lib/Target/X86/X86InstrSSE.td252
-rw-r--r--lib/Target/X86/X86InstrSystem.td5
-rw-r--r--lib/Target/X86/X86JITInfo.cpp6
-rw-r--r--lib/Target/X86/X86JITInfo.h8
-rw-r--r--lib/Target/X86/X86MCInstLower.cpp35
-rw-r--r--lib/Target/X86/X86RegisterInfo.cpp54
-rw-r--r--lib/Target/X86/X86RegisterInfo.h10
-rw-r--r--lib/Target/X86/X86SelectionDAGInfo.cpp57
-rw-r--r--lib/Target/X86/X86SelectionDAGInfo.h8
-rw-r--r--lib/Target/X86/X86Subtarget.cpp59
-rw-r--r--lib/Target/X86/X86Subtarget.h32
-rw-r--r--lib/Target/X86/X86TargetMachine.cpp62
-rw-r--r--lib/Target/X86/X86TargetMachine.h30
-rw-r--r--lib/Target/X86/X86TargetTransformInfo.cpp167
44 files changed, 5684 insertions, 2296 deletions
diff --git a/lib/Target/X86/Android.mk b/lib/Target/X86/Android.mk
index 0d0a9ca..e2c4be7 100644
--- a/lib/Target/X86/Android.mk
+++ b/lib/Target/X86/Android.mk
@@ -12,6 +12,7 @@ x86_codegen_TBLGEN_TABLES := \
x86_codegen_SRC_FILES := \
X86AsmPrinter.cpp \
+ X86AtomicExpandPass.cpp \
X86CodeEmitter.cpp \
X86FastISel.cpp \
X86FixupLEAs.cpp \
diff --git a/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp b/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp
index f3e6b3f..a365f62 100644
--- a/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp
+++ b/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp
@@ -20,6 +20,7 @@
#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCTargetAsmParser.h"
#include "llvm/MC/MCTargetOptions.h"
#include "llvm/Support/CommandLine.h"
@@ -36,8 +37,8 @@ bool IsStackReg(unsigned Reg) {
}
std::string FuncName(unsigned AccessSize, bool IsWrite) {
- return std::string("__sanitizer_sanitize_") + (IsWrite ? "store" : "load") +
- (utostr(AccessSize));
+ return std::string("__asan_report_") + (IsWrite ? "store" : "load") +
+ utostr(AccessSize);
}
class X86AddressSanitizer : public X86AsmInstrumentation {
@@ -47,47 +48,55 @@ public:
// X86AsmInstrumentation implementation:
virtual void InstrumentInstruction(
- const MCInst &Inst, SmallVectorImpl<MCParsedAsmOperand *> &Operands,
- MCContext &Ctx, const MCInstrInfo &MII, MCStreamer &Out) override {
+ const MCInst &Inst, OperandVector &Operands, MCContext &Ctx,
+ const MCInstrInfo &MII, MCStreamer &Out) override {
InstrumentMOV(Inst, Operands, Ctx, MII, Out);
}
// Should be implemented differently in x86_32 and x86_64 subclasses.
- virtual void InstrumentMemOperandImpl(X86Operand *Op, unsigned AccessSize,
- bool IsWrite, MCContext &Ctx,
- MCStreamer &Out) = 0;
+ virtual void InstrumentMemOperandSmallImpl(
+ X86Operand &Op, unsigned AccessSize, bool IsWrite, MCContext &Ctx,
+ MCStreamer &Out) = 0;
+ virtual void InstrumentMemOperandLargeImpl(
+ X86Operand &Op, unsigned AccessSize, bool IsWrite, MCContext &Ctx,
+ MCStreamer &Out) = 0;
- void InstrumentMemOperand(MCParsedAsmOperand *Op, unsigned AccessSize,
+ void InstrumentMemOperand(MCParsedAsmOperand &Op, unsigned AccessSize,
bool IsWrite, MCContext &Ctx, MCStreamer &Out);
- void InstrumentMOV(const MCInst &Inst,
- SmallVectorImpl<MCParsedAsmOperand *> &Operands,
+ void InstrumentMOV(const MCInst &Inst, OperandVector &Operands,
MCContext &Ctx, const MCInstrInfo &MII, MCStreamer &Out);
void EmitInstruction(MCStreamer &Out, const MCInst &Inst) {
Out.EmitInstruction(Inst, STI);
}
+ void EmitLabel(MCStreamer &Out, MCSymbol *Label) { Out.EmitLabel(Label); }
+
protected:
const MCSubtargetInfo &STI;
};
void X86AddressSanitizer::InstrumentMemOperand(
- MCParsedAsmOperand *Op, unsigned AccessSize, bool IsWrite, MCContext &Ctx,
+ MCParsedAsmOperand &Op, unsigned AccessSize, bool IsWrite, MCContext &Ctx,
MCStreamer &Out) {
- assert(Op && Op->isMem() && "Op should be a memory operand.");
+ assert(Op.isMem() && "Op should be a memory operand.");
assert((AccessSize & (AccessSize - 1)) == 0 && AccessSize <= 16 &&
"AccessSize should be a power of two, less or equal than 16.");
- X86Operand *MemOp = static_cast<X86Operand *>(Op);
+ X86Operand &MemOp = static_cast<X86Operand &>(Op);
// FIXME: get rid of this limitation.
- if (IsStackReg(MemOp->getMemBaseReg()) || IsStackReg(MemOp->getMemIndexReg()))
+ if (IsStackReg(MemOp.getMemBaseReg()) || IsStackReg(MemOp.getMemIndexReg()))
return;
- InstrumentMemOperandImpl(MemOp, AccessSize, IsWrite, Ctx, Out);
+ // FIXME: take into account load/store alignment.
+ if (AccessSize < 8)
+ InstrumentMemOperandSmallImpl(MemOp, AccessSize, IsWrite, Ctx, Out);
+ else
+ InstrumentMemOperandLargeImpl(MemOp, AccessSize, IsWrite, Ctx, Out);
}
void X86AddressSanitizer::InstrumentMOV(
- const MCInst &Inst, SmallVectorImpl<MCParsedAsmOperand *> &Operands,
- MCContext &Ctx, const MCInstrInfo &MII, MCStreamer &Out) {
+ const MCInst &Inst, OperandVector &Operands, MCContext &Ctx,
+ const MCInstrInfo &MII, MCStreamer &Out) {
// Access size in bytes.
unsigned AccessSize = 0;
@@ -124,107 +133,351 @@ void X86AddressSanitizer::InstrumentMOV(
const bool IsWrite = MII.get(Inst.getOpcode()).mayStore();
for (unsigned Ix = 0; Ix < Operands.size(); ++Ix) {
- MCParsedAsmOperand *Op = Operands[Ix];
- if (Op && Op->isMem())
+ assert(Operands[Ix]);
+ MCParsedAsmOperand &Op = *Operands[Ix];
+ if (Op.isMem())
InstrumentMemOperand(Op, AccessSize, IsWrite, Ctx, Out);
}
}
class X86AddressSanitizer32 : public X86AddressSanitizer {
public:
+ static const long kShadowOffset = 0x20000000;
+
X86AddressSanitizer32(const MCSubtargetInfo &STI)
: X86AddressSanitizer(STI) {}
virtual ~X86AddressSanitizer32() {}
- virtual void InstrumentMemOperandImpl(X86Operand *Op, unsigned AccessSize,
- bool IsWrite, MCContext &Ctx,
- MCStreamer &Out) override;
+ virtual void InstrumentMemOperandSmallImpl(
+ X86Operand &Op, unsigned AccessSize, bool IsWrite, MCContext &Ctx,
+ MCStreamer &Out) override;
+ virtual void InstrumentMemOperandLargeImpl(
+ X86Operand &Op, unsigned AccessSize, bool IsWrite, MCContext &Ctx,
+ MCStreamer &Out) override;
+
+ private:
+ void EmitCallAsanReport(MCContext &Ctx, MCStreamer &Out, unsigned AccessSize,
+ bool IsWrite, unsigned AddressReg) {
+ EmitInstruction(Out, MCInstBuilder(X86::CLD));
+ EmitInstruction(Out, MCInstBuilder(X86::MMX_EMMS));
+
+ EmitInstruction(Out, MCInstBuilder(X86::AND64ri8).addReg(X86::ESP)
+ .addReg(X86::ESP).addImm(-16));
+ EmitInstruction(Out, MCInstBuilder(X86::PUSH32r).addReg(AddressReg));
+
+
+ const std::string& Fn = FuncName(AccessSize, IsWrite);
+ MCSymbol *FnSym = Ctx.GetOrCreateSymbol(StringRef(Fn));
+ const MCSymbolRefExpr *FnExpr =
+ MCSymbolRefExpr::Create(FnSym, MCSymbolRefExpr::VK_PLT, Ctx);
+ EmitInstruction(Out, MCInstBuilder(X86::CALLpcrel32).addExpr(FnExpr));
+ }
};
-void X86AddressSanitizer32::InstrumentMemOperandImpl(
- X86Operand *Op, unsigned AccessSize, bool IsWrite, MCContext &Ctx,
+void X86AddressSanitizer32::InstrumentMemOperandSmallImpl(
+ X86Operand &Op, unsigned AccessSize, bool IsWrite, MCContext &Ctx,
MCStreamer &Out) {
- // FIXME: emit .cfi directives for correct stack unwinding.
EmitInstruction(Out, MCInstBuilder(X86::PUSH32r).addReg(X86::EAX));
+ EmitInstruction(Out, MCInstBuilder(X86::PUSH32r).addReg(X86::ECX));
+ EmitInstruction(Out, MCInstBuilder(X86::PUSH32r).addReg(X86::EDX));
+ EmitInstruction(Out, MCInstBuilder(X86::PUSHF32));
+
{
MCInst Inst;
Inst.setOpcode(X86::LEA32r);
Inst.addOperand(MCOperand::CreateReg(X86::EAX));
+ Op.addMemOperands(Inst, 5);
+ EmitInstruction(Out, Inst);
+ }
+
+ EmitInstruction(
+ Out, MCInstBuilder(X86::MOV32rr).addReg(X86::ECX).addReg(X86::EAX));
+ EmitInstruction(Out, MCInstBuilder(X86::SHR32ri).addReg(X86::ECX)
+ .addReg(X86::ECX).addImm(3));
+
+ {
+ MCInst Inst;
+ Inst.setOpcode(X86::MOV8rm);
+ Inst.addOperand(MCOperand::CreateReg(X86::CL));
+ const MCExpr *Disp = MCConstantExpr::Create(kShadowOffset, Ctx);
+ std::unique_ptr<X86Operand> Op(
+ X86Operand::CreateMem(0, Disp, X86::ECX, 0, 1, SMLoc(), SMLoc()));
+ Op->addMemOperands(Inst, 5);
+ EmitInstruction(Out, Inst);
+ }
+
+ EmitInstruction(Out,
+ MCInstBuilder(X86::TEST8rr).addReg(X86::CL).addReg(X86::CL));
+ MCSymbol *DoneSym = Ctx.CreateTempSymbol();
+ const MCExpr *DoneExpr = MCSymbolRefExpr::Create(DoneSym, Ctx);
+ EmitInstruction(Out, MCInstBuilder(X86::JE_4).addExpr(DoneExpr));
+
+ EmitInstruction(
+ Out, MCInstBuilder(X86::MOV32rr).addReg(X86::EDX).addReg(X86::EAX));
+ EmitInstruction(Out, MCInstBuilder(X86::AND32ri).addReg(X86::EDX)
+ .addReg(X86::EDX).addImm(7));
+
+ switch (AccessSize) {
+ case 1:
+ break;
+ case 2: {
+ MCInst Inst;
+ Inst.setOpcode(X86::LEA32r);
+ Inst.addOperand(MCOperand::CreateReg(X86::EDX));
+
+ const MCExpr *Disp = MCConstantExpr::Create(1, Ctx);
+ std::unique_ptr<X86Operand> Op(
+ X86Operand::CreateMem(0, Disp, X86::EDX, 0, 1, SMLoc(), SMLoc()));
Op->addMemOperands(Inst, 5);
EmitInstruction(Out, Inst);
+ break;
}
+ case 4:
+ EmitInstruction(Out, MCInstBuilder(X86::ADD32ri8).addReg(X86::EDX)
+ .addReg(X86::EDX).addImm(3));
+ break;
+ default:
+ assert(false && "Incorrect access size");
+ break;
+ }
+
+ EmitInstruction(
+ Out, MCInstBuilder(X86::MOVSX32rr8).addReg(X86::ECX).addReg(X86::CL));
+ EmitInstruction(
+ Out, MCInstBuilder(X86::CMP32rr).addReg(X86::EDX).addReg(X86::ECX));
+ EmitInstruction(Out, MCInstBuilder(X86::JL_4).addExpr(DoneExpr));
+
+ EmitCallAsanReport(Ctx, Out, AccessSize, IsWrite, X86::EAX);
+ EmitLabel(Out, DoneSym);
+
+ EmitInstruction(Out, MCInstBuilder(X86::POPF32));
+ EmitInstruction(Out, MCInstBuilder(X86::POP32r).addReg(X86::EDX));
+ EmitInstruction(Out, MCInstBuilder(X86::POP32r).addReg(X86::ECX));
+ EmitInstruction(Out, MCInstBuilder(X86::POP32r).addReg(X86::EAX));
+}
+
+void X86AddressSanitizer32::InstrumentMemOperandLargeImpl(
+ X86Operand &Op, unsigned AccessSize, bool IsWrite, MCContext &Ctx,
+ MCStreamer &Out) {
EmitInstruction(Out, MCInstBuilder(X86::PUSH32r).addReg(X86::EAX));
+ EmitInstruction(Out, MCInstBuilder(X86::PUSH32r).addReg(X86::ECX));
+ EmitInstruction(Out, MCInstBuilder(X86::PUSHF32));
+
{
- const std::string Func = FuncName(AccessSize, IsWrite);
- const MCSymbol *FuncSym = Ctx.GetOrCreateSymbol(StringRef(Func));
- const MCSymbolRefExpr *FuncExpr =
- MCSymbolRefExpr::Create(FuncSym, MCSymbolRefExpr::VK_PLT, Ctx);
- EmitInstruction(Out, MCInstBuilder(X86::CALLpcrel32).addExpr(FuncExpr));
+ MCInst Inst;
+ Inst.setOpcode(X86::LEA32r);
+ Inst.addOperand(MCOperand::CreateReg(X86::EAX));
+ Op.addMemOperands(Inst, 5);
+ EmitInstruction(Out, Inst);
}
- EmitInstruction(Out, MCInstBuilder(X86::POP32r).addReg(X86::EAX));
+ EmitInstruction(
+ Out, MCInstBuilder(X86::MOV32rr).addReg(X86::ECX).addReg(X86::EAX));
+ EmitInstruction(Out, MCInstBuilder(X86::SHR32ri).addReg(X86::ECX)
+ .addReg(X86::ECX).addImm(3));
+ {
+ MCInst Inst;
+ switch (AccessSize) {
+ case 8:
+ Inst.setOpcode(X86::CMP8mi);
+ break;
+ case 16:
+ Inst.setOpcode(X86::CMP16mi);
+ break;
+ default:
+ assert(false && "Incorrect access size");
+ break;
+ }
+ const MCExpr *Disp = MCConstantExpr::Create(kShadowOffset, Ctx);
+ std::unique_ptr<X86Operand> Op(
+ X86Operand::CreateMem(0, Disp, X86::ECX, 0, 1, SMLoc(), SMLoc()));
+ Op->addMemOperands(Inst, 5);
+ Inst.addOperand(MCOperand::CreateImm(0));
+ EmitInstruction(Out, Inst);
+ }
+ MCSymbol *DoneSym = Ctx.CreateTempSymbol();
+ const MCExpr *DoneExpr = MCSymbolRefExpr::Create(DoneSym, Ctx);
+ EmitInstruction(Out, MCInstBuilder(X86::JE_4).addExpr(DoneExpr));
+
+ EmitCallAsanReport(Ctx, Out, AccessSize, IsWrite, X86::EAX);
+ EmitLabel(Out, DoneSym);
+
+ EmitInstruction(Out, MCInstBuilder(X86::POPF32));
+ EmitInstruction(Out, MCInstBuilder(X86::POP32r).addReg(X86::ECX));
EmitInstruction(Out, MCInstBuilder(X86::POP32r).addReg(X86::EAX));
}
class X86AddressSanitizer64 : public X86AddressSanitizer {
public:
+ static const long kShadowOffset = 0x7fff8000;
+
X86AddressSanitizer64(const MCSubtargetInfo &STI)
: X86AddressSanitizer(STI) {}
virtual ~X86AddressSanitizer64() {}
- virtual void InstrumentMemOperandImpl(X86Operand *Op, unsigned AccessSize,
- bool IsWrite, MCContext &Ctx,
- MCStreamer &Out) override;
-};
+ virtual void InstrumentMemOperandSmallImpl(
+ X86Operand &Op, unsigned AccessSize, bool IsWrite, MCContext &Ctx,
+ MCStreamer &Out) override;
+ virtual void InstrumentMemOperandLargeImpl(
+ X86Operand &Op, unsigned AccessSize, bool IsWrite, MCContext &Ctx,
+ MCStreamer &Out) override;
-void X86AddressSanitizer64::InstrumentMemOperandImpl(X86Operand *Op,
- unsigned AccessSize,
- bool IsWrite,
- MCContext &Ctx,
- MCStreamer &Out) {
- // FIXME: emit .cfi directives for correct stack unwinding.
-
- // Set %rsp below current red zone (128 bytes wide) using LEA instruction to
- // preserve flags.
- {
+private:
+ void EmitAdjustRSP(MCContext &Ctx, MCStreamer &Out, long Offset) {
MCInst Inst;
Inst.setOpcode(X86::LEA64r);
Inst.addOperand(MCOperand::CreateReg(X86::RSP));
- const MCExpr *Disp = MCConstantExpr::Create(-128, Ctx);
+ const MCExpr *Disp = MCConstantExpr::Create(Offset, Ctx);
std::unique_ptr<X86Operand> Op(
X86Operand::CreateMem(0, Disp, X86::RSP, 0, 1, SMLoc(), SMLoc()));
Op->addMemOperands(Inst, 5);
EmitInstruction(Out, Inst);
}
+
+ void EmitCallAsanReport(MCContext &Ctx, MCStreamer &Out, unsigned AccessSize,
+ bool IsWrite) {
+ EmitInstruction(Out, MCInstBuilder(X86::CLD));
+ EmitInstruction(Out, MCInstBuilder(X86::MMX_EMMS));
+
+ EmitInstruction(Out, MCInstBuilder(X86::AND64ri8).addReg(X86::RSP)
+ .addReg(X86::RSP).addImm(-16));
+
+ const std::string& Fn = FuncName(AccessSize, IsWrite);
+ MCSymbol *FnSym = Ctx.GetOrCreateSymbol(StringRef(Fn));
+ const MCSymbolRefExpr *FnExpr =
+ MCSymbolRefExpr::Create(FnSym, MCSymbolRefExpr::VK_PLT, Ctx);
+ EmitInstruction(Out, MCInstBuilder(X86::CALL64pcrel32).addExpr(FnExpr));
+ }
+};
+
+void X86AddressSanitizer64::InstrumentMemOperandSmallImpl(
+ X86Operand &Op, unsigned AccessSize, bool IsWrite, MCContext &Ctx,
+ MCStreamer &Out) {
+ EmitAdjustRSP(Ctx, Out, -128);
+ EmitInstruction(Out, MCInstBuilder(X86::PUSH64r).addReg(X86::RAX));
+ EmitInstruction(Out, MCInstBuilder(X86::PUSH64r).addReg(X86::RCX));
EmitInstruction(Out, MCInstBuilder(X86::PUSH64r).addReg(X86::RDI));
+ EmitInstruction(Out, MCInstBuilder(X86::PUSHF64));
{
MCInst Inst;
Inst.setOpcode(X86::LEA64r);
Inst.addOperand(MCOperand::CreateReg(X86::RDI));
- Op->addMemOperands(Inst, 5);
+ Op.addMemOperands(Inst, 5);
EmitInstruction(Out, Inst);
}
+ EmitInstruction(
+ Out, MCInstBuilder(X86::MOV64rr).addReg(X86::RAX).addReg(X86::RDI));
+ EmitInstruction(Out, MCInstBuilder(X86::SHR64ri).addReg(X86::RAX)
+ .addReg(X86::RAX).addImm(3));
{
- const std::string Func = FuncName(AccessSize, IsWrite);
- const MCSymbol *FuncSym = Ctx.GetOrCreateSymbol(StringRef(Func));
- const MCSymbolRefExpr *FuncExpr =
- MCSymbolRefExpr::Create(FuncSym, MCSymbolRefExpr::VK_PLT, Ctx);
- EmitInstruction(Out, MCInstBuilder(X86::CALL64pcrel32).addExpr(FuncExpr));
+ MCInst Inst;
+ Inst.setOpcode(X86::MOV8rm);
+ Inst.addOperand(MCOperand::CreateReg(X86::AL));
+ const MCExpr *Disp = MCConstantExpr::Create(kShadowOffset, Ctx);
+ std::unique_ptr<X86Operand> Op(
+ X86Operand::CreateMem(0, Disp, X86::RAX, 0, 1, SMLoc(), SMLoc()));
+ Op->addMemOperands(Inst, 5);
+ EmitInstruction(Out, Inst);
+ }
+
+ EmitInstruction(Out,
+ MCInstBuilder(X86::TEST8rr).addReg(X86::AL).addReg(X86::AL));
+ MCSymbol *DoneSym = Ctx.CreateTempSymbol();
+ const MCExpr *DoneExpr = MCSymbolRefExpr::Create(DoneSym, Ctx);
+ EmitInstruction(Out, MCInstBuilder(X86::JE_4).addExpr(DoneExpr));
+
+ EmitInstruction(
+ Out, MCInstBuilder(X86::MOV32rr).addReg(X86::ECX).addReg(X86::EDI));
+ EmitInstruction(Out, MCInstBuilder(X86::AND32ri).addReg(X86::ECX)
+ .addReg(X86::ECX).addImm(7));
+
+ switch (AccessSize) {
+ case 1:
+ break;
+ case 2: {
+ MCInst Inst;
+ Inst.setOpcode(X86::LEA32r);
+ Inst.addOperand(MCOperand::CreateReg(X86::ECX));
+
+ const MCExpr *Disp = MCConstantExpr::Create(1, Ctx);
+ std::unique_ptr<X86Operand> Op(
+ X86Operand::CreateMem(0, Disp, X86::ECX, 0, 1, SMLoc(), SMLoc()));
+ Op->addMemOperands(Inst, 5);
+ EmitInstruction(Out, Inst);
+ break;
}
+ case 4:
+ EmitInstruction(Out, MCInstBuilder(X86::ADD32ri8).addReg(X86::ECX)
+ .addReg(X86::ECX).addImm(3));
+ break;
+ default:
+ assert(false && "Incorrect access size");
+ break;
+ }
+
+ EmitInstruction(
+ Out, MCInstBuilder(X86::MOVSX32rr8).addReg(X86::EAX).addReg(X86::AL));
+ EmitInstruction(
+ Out, MCInstBuilder(X86::CMP32rr).addReg(X86::ECX).addReg(X86::EAX));
+ EmitInstruction(Out, MCInstBuilder(X86::JL_4).addExpr(DoneExpr));
+
+ EmitCallAsanReport(Ctx, Out, AccessSize, IsWrite);
+ EmitLabel(Out, DoneSym);
+
+ EmitInstruction(Out, MCInstBuilder(X86::POPF64));
EmitInstruction(Out, MCInstBuilder(X86::POP64r).addReg(X86::RDI));
+ EmitInstruction(Out, MCInstBuilder(X86::POP64r).addReg(X86::RCX));
+ EmitInstruction(Out, MCInstBuilder(X86::POP64r).addReg(X86::RAX));
+ EmitAdjustRSP(Ctx, Out, 128);
+}
+
+void X86AddressSanitizer64::InstrumentMemOperandLargeImpl(
+ X86Operand &Op, unsigned AccessSize, bool IsWrite, MCContext &Ctx,
+ MCStreamer &Out) {
+ EmitAdjustRSP(Ctx, Out, -128);
+ EmitInstruction(Out, MCInstBuilder(X86::PUSH64r).addReg(X86::RAX));
+ EmitInstruction(Out, MCInstBuilder(X86::PUSHF64));
- // Restore old %rsp value.
{
MCInst Inst;
Inst.setOpcode(X86::LEA64r);
- Inst.addOperand(MCOperand::CreateReg(X86::RSP));
-
- const MCExpr *Disp = MCConstantExpr::Create(128, Ctx);
+ Inst.addOperand(MCOperand::CreateReg(X86::RAX));
+ Op.addMemOperands(Inst, 5);
+ EmitInstruction(Out, Inst);
+ }
+ EmitInstruction(Out, MCInstBuilder(X86::SHR64ri).addReg(X86::RAX)
+ .addReg(X86::RAX).addImm(3));
+ {
+ MCInst Inst;
+ switch (AccessSize) {
+ case 8:
+ Inst.setOpcode(X86::CMP8mi);
+ break;
+ case 16:
+ Inst.setOpcode(X86::CMP16mi);
+ break;
+ default:
+ assert(false && "Incorrect access size");
+ break;
+ }
+ const MCExpr *Disp = MCConstantExpr::Create(kShadowOffset, Ctx);
std::unique_ptr<X86Operand> Op(
- X86Operand::CreateMem(0, Disp, X86::RSP, 0, 1, SMLoc(), SMLoc()));
+ X86Operand::CreateMem(0, Disp, X86::RAX, 0, 1, SMLoc(), SMLoc()));
Op->addMemOperands(Inst, 5);
+ Inst.addOperand(MCOperand::CreateImm(0));
EmitInstruction(Out, Inst);
}
+
+ MCSymbol *DoneSym = Ctx.CreateTempSymbol();
+ const MCExpr *DoneExpr = MCSymbolRefExpr::Create(DoneSym, Ctx);
+ EmitInstruction(Out, MCInstBuilder(X86::JE_4).addExpr(DoneExpr));
+
+ EmitCallAsanReport(Ctx, Out, AccessSize, IsWrite);
+ EmitLabel(Out, DoneSym);
+
+ EmitInstruction(Out, MCInstBuilder(X86::POPF64));
+ EmitInstruction(Out, MCInstBuilder(X86::POP64r).addReg(X86::RAX));
+ EmitAdjustRSP(Ctx, Out, 128);
}
} // End anonymous namespace
@@ -233,8 +486,8 @@ X86AsmInstrumentation::X86AsmInstrumentation() {}
X86AsmInstrumentation::~X86AsmInstrumentation() {}
void X86AsmInstrumentation::InstrumentInstruction(
- const MCInst &Inst, SmallVectorImpl<MCParsedAsmOperand *> &Operands,
- MCContext &Ctx, const MCInstrInfo &MII, MCStreamer &Out) {}
+ const MCInst &Inst, OperandVector &Operands, MCContext &Ctx,
+ const MCInstrInfo &MII, MCStreamer &Out) {}
X86AsmInstrumentation *
CreateX86AsmInstrumentation(const MCTargetOptions &MCOptions,
diff --git a/lib/Target/X86/AsmParser/X86AsmInstrumentation.h b/lib/Target/X86/AsmParser/X86AsmInstrumentation.h
index 0369b14..1bc3c09 100644
--- a/lib/Target/X86/AsmParser/X86AsmInstrumentation.h
+++ b/lib/Target/X86/AsmParser/X86AsmInstrumentation.h
@@ -12,6 +12,8 @@
#include "llvm/ADT/SmallVector.h"
+#include <memory>
+
namespace llvm {
class MCContext;
@@ -35,10 +37,9 @@ public:
// Instruments Inst. Should be called just before the original
// instruction is sent to Out.
virtual void InstrumentInstruction(
- const MCInst &Inst, SmallVectorImpl<MCParsedAsmOperand *> &Operands,
- MCContext &Ctx,
- const MCInstrInfo &MII,
- MCStreamer &Out);
+ const MCInst &Inst,
+ SmallVectorImpl<std::unique_ptr<MCParsedAsmOperand>> &Operands,
+ MCContext &Ctx, const MCInstrInfo &MII, MCStreamer &Out);
protected:
friend X86AsmInstrumentation *
diff --git a/lib/Target/X86/AsmParser/X86AsmParser.cpp b/lib/Target/X86/AsmParser/X86AsmParser.cpp
index d3e695e..f0765ed 100644
--- a/lib/Target/X86/AsmParser/X86AsmParser.cpp
+++ b/lib/Target/X86/AsmParser/X86AsmParser.cpp
@@ -235,6 +235,7 @@ private:
IES_RSHIFT,
IES_PLUS,
IES_MINUS,
+ IES_NOT,
IES_MULTIPLY,
IES_DIVIDE,
IES_LBRAC,
@@ -372,6 +373,7 @@ private:
State = IES_ERROR;
break;
case IES_PLUS:
+ case IES_NOT:
case IES_MULTIPLY:
case IES_DIVIDE:
case IES_LPAREN:
@@ -401,6 +403,19 @@ private:
}
PrevState = CurrState;
}
+ void onNot() {
+ IntelExprState CurrState = State;
+ switch (State) {
+ default:
+ State = IES_ERROR;
+ break;
+ case IES_PLUS:
+ case IES_NOT:
+ State = IES_NOT;
+ break;
+ }
+ PrevState = CurrState;
+ }
void onRegister(unsigned Reg) {
IntelExprState CurrState = State;
switch (State) {
@@ -438,6 +453,7 @@ private:
break;
case IES_PLUS:
case IES_MINUS:
+ case IES_NOT:
State = IES_INTEGER;
Sym = SymRef;
SymName = SymRefName;
@@ -453,6 +469,7 @@ private:
break;
case IES_PLUS:
case IES_MINUS:
+ case IES_NOT:
case IES_OR:
case IES_AND:
case IES_LSHIFT:
@@ -476,11 +493,22 @@ private:
PrevState == IES_OR || PrevState == IES_AND ||
PrevState == IES_LSHIFT || PrevState == IES_RSHIFT ||
PrevState == IES_MULTIPLY || PrevState == IES_DIVIDE ||
- PrevState == IES_LPAREN || PrevState == IES_LBRAC) &&
+ PrevState == IES_LPAREN || PrevState == IES_LBRAC ||
+ PrevState == IES_NOT) &&
CurrState == IES_MINUS) {
// Unary minus. No need to pop the minus operand because it was never
// pushed.
IC.pushOperand(IC_IMM, -TmpInt); // Push -Imm.
+ } else if ((PrevState == IES_PLUS || PrevState == IES_MINUS ||
+ PrevState == IES_OR || PrevState == IES_AND ||
+ PrevState == IES_LSHIFT || PrevState == IES_RSHIFT ||
+ PrevState == IES_MULTIPLY || PrevState == IES_DIVIDE ||
+ PrevState == IES_LPAREN || PrevState == IES_LBRAC ||
+ PrevState == IES_NOT) &&
+ CurrState == IES_NOT) {
+ // Unary not. No need to pop the not operand because it was never
+ // pushed.
+ IC.pushOperand(IC_IMM, ~TmpInt); // Push ~Imm.
} else {
IC.pushOperand(IC_IMM, TmpInt);
}
@@ -561,6 +589,7 @@ private:
break;
case IES_PLUS:
case IES_MINUS:
+ case IES_NOT:
case IES_OR:
case IES_AND:
case IES_LSHIFT:
@@ -568,13 +597,14 @@ private:
case IES_MULTIPLY:
case IES_DIVIDE:
case IES_LPAREN:
- // FIXME: We don't handle this type of unary minus, yet.
+ // FIXME: We don't handle this type of unary minus or not, yet.
if ((PrevState == IES_PLUS || PrevState == IES_MINUS ||
PrevState == IES_OR || PrevState == IES_AND ||
PrevState == IES_LSHIFT || PrevState == IES_RSHIFT ||
PrevState == IES_MULTIPLY || PrevState == IES_DIVIDE ||
- PrevState == IES_LPAREN || PrevState == IES_LBRAC) &&
- CurrState == IES_MINUS) {
+ PrevState == IES_LPAREN || PrevState == IES_LBRAC ||
+ PrevState == IES_NOT) &&
+ (CurrState == IES_MINUS || CurrState == IES_NOT)) {
State = IES_ERROR;
break;
}
@@ -618,52 +648,52 @@ private:
return Error(L, Msg, Ranges, MatchingInlineAsm);
}
- X86Operand *ErrorOperand(SMLoc Loc, StringRef Msg) {
+ std::nullptr_t ErrorOperand(SMLoc Loc, StringRef Msg) {
Error(Loc, Msg);
return nullptr;
}
- X86Operand *DefaultMemSIOperand(SMLoc Loc);
- X86Operand *DefaultMemDIOperand(SMLoc Loc);
- X86Operand *ParseOperand();
- X86Operand *ParseATTOperand();
- X86Operand *ParseIntelOperand();
- X86Operand *ParseIntelOffsetOfOperator();
+ std::unique_ptr<X86Operand> DefaultMemSIOperand(SMLoc Loc);
+ std::unique_ptr<X86Operand> DefaultMemDIOperand(SMLoc Loc);
+ std::unique_ptr<X86Operand> ParseOperand();
+ std::unique_ptr<X86Operand> ParseATTOperand();
+ std::unique_ptr<X86Operand> ParseIntelOperand();
+ std::unique_ptr<X86Operand> ParseIntelOffsetOfOperator();
bool ParseIntelDotOperator(const MCExpr *Disp, const MCExpr *&NewDisp);
- X86Operand *ParseIntelOperator(unsigned OpKind);
- X86Operand *ParseIntelSegmentOverride(unsigned SegReg, SMLoc Start, unsigned Size);
- X86Operand *ParseIntelMemOperand(int64_t ImmDisp, SMLoc StartLoc,
- unsigned Size);
+ std::unique_ptr<X86Operand> ParseIntelOperator(unsigned OpKind);
+ std::unique_ptr<X86Operand>
+ ParseIntelSegmentOverride(unsigned SegReg, SMLoc Start, unsigned Size);
+ std::unique_ptr<X86Operand>
+ ParseIntelMemOperand(int64_t ImmDisp, SMLoc StartLoc, unsigned Size);
bool ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End);
- X86Operand *ParseIntelBracExpression(unsigned SegReg, SMLoc Start,
- int64_t ImmDisp, unsigned Size);
+ std::unique_ptr<X86Operand> ParseIntelBracExpression(unsigned SegReg,
+ SMLoc Start,
+ int64_t ImmDisp,
+ unsigned Size);
bool ParseIntelIdentifier(const MCExpr *&Val, StringRef &Identifier,
InlineAsmIdentifierInfo &Info,
bool IsUnevaluatedOperand, SMLoc &End);
- X86Operand *ParseMemOperand(unsigned SegReg, SMLoc StartLoc);
+ std::unique_ptr<X86Operand> ParseMemOperand(unsigned SegReg, SMLoc StartLoc);
- X86Operand *CreateMemForInlineAsm(unsigned SegReg, const MCExpr *Disp,
- unsigned BaseReg, unsigned IndexReg,
- unsigned Scale, SMLoc Start, SMLoc End,
- unsigned Size, StringRef Identifier,
- InlineAsmIdentifierInfo &Info);
+ std::unique_ptr<X86Operand>
+ CreateMemForInlineAsm(unsigned SegReg, const MCExpr *Disp, unsigned BaseReg,
+ unsigned IndexReg, unsigned Scale, SMLoc Start,
+ SMLoc End, unsigned Size, StringRef Identifier,
+ InlineAsmIdentifierInfo &Info);
bool ParseDirectiveWord(unsigned Size, SMLoc L);
bool ParseDirectiveCode(StringRef IDVal, SMLoc L);
- bool processInstruction(MCInst &Inst,
- const SmallVectorImpl<MCParsedAsmOperand*> &Ops);
+ bool processInstruction(MCInst &Inst, const OperandVector &Ops);
/// Wrapper around MCStreamer::EmitInstruction(). Possibly adds
/// instrumentation around Inst.
- void EmitInstruction(MCInst &Inst,
- SmallVectorImpl<MCParsedAsmOperand *> &Operands,
- MCStreamer &Out);
+ void EmitInstruction(MCInst &Inst, OperandVector &Operands, MCStreamer &Out);
bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
- SmallVectorImpl<MCParsedAsmOperand*> &Operands,
- MCStreamer &Out, unsigned &ErrorInfo,
+ OperandVector &Operands, MCStreamer &Out,
+ unsigned &ErrorInfo,
bool MatchingInlineAsm) override;
/// doSrcDstMatch - Returns true if operands are matching in their
@@ -674,8 +704,8 @@ private:
/// Parses AVX512 specific operand primitives: masked registers ({%k<NUM>}, {z})
/// and memory broadcasting ({1to<NUM>}) primitives, updating Operands vector if required.
/// \return \c true if no parsing errors occurred, \c false otherwise.
- bool HandleAVX512Operand(SmallVectorImpl<MCParsedAsmOperand*> &Operands,
- const MCParsedAsmOperand &Op);
+ bool HandleAVX512Operand(OperandVector &Operands,
+ const MCParsedAsmOperand &Op);
bool is64BitMode() const {
// FIXME: Can tablegen auto-generate this?
@@ -725,9 +755,8 @@ public:
bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) override;
- bool
- ParseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc,
- SmallVectorImpl<MCParsedAsmOperand*> &Operands) override;
+ bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
+ SMLoc NameLoc, OperandVector &Operands) override;
bool ParseDirective(AsmToken DirectiveID) override;
};
@@ -908,7 +937,7 @@ bool X86AsmParser::ParseRegister(unsigned &RegNo,
return false;
}
-X86Operand *X86AsmParser::DefaultMemSIOperand(SMLoc Loc) {
+std::unique_ptr<X86Operand> X86AsmParser::DefaultMemSIOperand(SMLoc Loc) {
unsigned basereg =
is64BitMode() ? X86::RSI : (is32BitMode() ? X86::ESI : X86::SI);
const MCExpr *Disp = MCConstantExpr::Create(0, getContext());
@@ -916,7 +945,7 @@ X86Operand *X86AsmParser::DefaultMemSIOperand(SMLoc Loc) {
/*IndexReg=*/0, /*Scale=*/1, Loc, Loc, 0);
}
-X86Operand *X86AsmParser::DefaultMemDIOperand(SMLoc Loc) {
+std::unique_ptr<X86Operand> X86AsmParser::DefaultMemDIOperand(SMLoc Loc) {
unsigned basereg =
is64BitMode() ? X86::RDI : (is32BitMode() ? X86::EDI : X86::DI);
const MCExpr *Disp = MCConstantExpr::Create(0, getContext());
@@ -924,7 +953,7 @@ X86Operand *X86AsmParser::DefaultMemDIOperand(SMLoc Loc) {
/*IndexReg=*/0, /*Scale=*/1, Loc, Loc, 0);
}
-X86Operand *X86AsmParser::ParseOperand() {
+std::unique_ptr<X86Operand> X86AsmParser::ParseOperand() {
if (isParsingIntelSyntax())
return ParseIntelOperand();
return ParseATTOperand();
@@ -946,12 +975,10 @@ static unsigned getIntelMemOperandSize(StringRef OpStr) {
return Size;
}
-X86Operand *
-X86AsmParser::CreateMemForInlineAsm(unsigned SegReg, const MCExpr *Disp,
- unsigned BaseReg, unsigned IndexReg,
- unsigned Scale, SMLoc Start, SMLoc End,
- unsigned Size, StringRef Identifier,
- InlineAsmIdentifierInfo &Info){
+std::unique_ptr<X86Operand> X86AsmParser::CreateMemForInlineAsm(
+ unsigned SegReg, const MCExpr *Disp, unsigned BaseReg, unsigned IndexReg,
+ unsigned Scale, SMLoc Start, SMLoc End, unsigned Size, StringRef Identifier,
+ InlineAsmIdentifierInfo &Info) {
// If this is not a VarDecl then assume it is a FuncDecl or some other label
// reference. We need an 'r' constraint here, so we need to create register
// operand to ensure proper matching. Just pick a GPR based on the size of
@@ -1064,7 +1091,8 @@ bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) {
if (SM.getStopOnLBrac() && getLexer().getKind() == AsmToken::LBrac)
break;
- switch (getLexer().getKind()) {
+ AsmToken::TokenKind TK = getLexer().getKind();
+ switch (TK) {
default: {
if (SM.isValidEndState()) {
Done = true;
@@ -1076,13 +1104,14 @@ bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) {
Done = true;
break;
}
+ case AsmToken::String:
case AsmToken::Identifier: {
// This could be a register or a symbolic displacement.
unsigned TmpReg;
const MCExpr *Val;
SMLoc IdentLoc = Tok.getLoc();
StringRef Identifier = Tok.getString();
- if(!ParseRegister(TmpReg, IdentLoc, End)) {
+ if (TK != AsmToken::String && !ParseRegister(TmpReg, IdentLoc, End)) {
SM.onRegister(TmpReg);
UpdateLocLex = false;
break;
@@ -1142,6 +1171,7 @@ bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) {
}
case AsmToken::Plus: SM.onPlus(); break;
case AsmToken::Minus: SM.onMinus(); break;
+ case AsmToken::Tilde: SM.onNot(); break;
case AsmToken::Star: SM.onStar(); break;
case AsmToken::Slash: SM.onDivide(); break;
case AsmToken::Pipe: SM.onOr(); break;
@@ -1164,9 +1194,9 @@ bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) {
return false;
}
-X86Operand *X86AsmParser::ParseIntelBracExpression(unsigned SegReg, SMLoc Start,
- int64_t ImmDisp,
- unsigned Size) {
+std::unique_ptr<X86Operand>
+X86AsmParser::ParseIntelBracExpression(unsigned SegReg, SMLoc Start,
+ int64_t ImmDisp, unsigned Size) {
const AsmToken &Tok = Parser.getTok();
SMLoc BracLoc = Tok.getLoc(), End = Tok.getEndLoc();
if (getLexer().isNot(AsmToken::LBrac))
@@ -1270,9 +1300,9 @@ bool X86AsmParser::ParseIntelIdentifier(const MCExpr *&Val,
}
/// \brief Parse intel style segment override.
-X86Operand *X86AsmParser::ParseIntelSegmentOverride(unsigned SegReg,
- SMLoc Start,
- unsigned Size) {
+std::unique_ptr<X86Operand>
+X86AsmParser::ParseIntelSegmentOverride(unsigned SegReg, SMLoc Start,
+ unsigned Size) {
assert(SegReg != 0 && "Tried to parse a segment override without a segment!");
const AsmToken &Tok = Parser.getTok(); // Eat colon.
if (Tok.isNot(AsmToken::Colon))
@@ -1321,8 +1351,9 @@ X86Operand *X86AsmParser::ParseIntelSegmentOverride(unsigned SegReg,
}
/// ParseIntelMemOperand - Parse intel style memory operand.
-X86Operand *X86AsmParser::ParseIntelMemOperand(int64_t ImmDisp, SMLoc Start,
- unsigned Size) {
+std::unique_ptr<X86Operand> X86AsmParser::ParseIntelMemOperand(int64_t ImmDisp,
+ SMLoc Start,
+ unsigned Size) {
const AsmToken &Tok = Parser.getTok();
SMLoc End;
@@ -1425,7 +1456,7 @@ bool X86AsmParser::ParseIntelDotOperator(const MCExpr *Disp,
/// Parse the 'offset' operator. This operator is used to specify the
/// location rather then the content of a variable.
-X86Operand *X86AsmParser::ParseIntelOffsetOfOperator() {
+std::unique_ptr<X86Operand> X86AsmParser::ParseIntelOffsetOfOperator() {
const AsmToken &Tok = Parser.getTok();
SMLoc OffsetOfLoc = Tok.getLoc();
Parser.Lex(); // Eat offset.
@@ -1462,7 +1493,7 @@ enum IntelOperatorKind {
/// variable. A variable's size is the product of its LENGTH and TYPE. The
/// TYPE operator returns the size of a C or C++ type or variable. If the
/// variable is an array, TYPE returns the size of a single element.
-X86Operand *X86AsmParser::ParseIntelOperator(unsigned OpKind) {
+std::unique_ptr<X86Operand> X86AsmParser::ParseIntelOperator(unsigned OpKind) {
const AsmToken &Tok = Parser.getTok();
SMLoc TypeLoc = Tok.getLoc();
Parser.Lex(); // Eat operator.
@@ -1495,7 +1526,7 @@ X86Operand *X86AsmParser::ParseIntelOperator(unsigned OpKind) {
return X86Operand::CreateImm(Imm, Start, End);
}
-X86Operand *X86AsmParser::ParseIntelOperand() {
+std::unique_ptr<X86Operand> X86AsmParser::ParseIntelOperand() {
const AsmToken &Tok = Parser.getTok();
SMLoc Start, End;
@@ -1523,7 +1554,7 @@ X86Operand *X86AsmParser::ParseIntelOperand() {
// Immediate.
if (getLexer().is(AsmToken::Integer) || getLexer().is(AsmToken::Minus) ||
- getLexer().is(AsmToken::LParen)) {
+ getLexer().is(AsmToken::Tilde) || getLexer().is(AsmToken::LParen)) {
AsmToken StartTok = Tok;
IntelExprStateMachine SM(/*Imm=*/0, /*StopOnLBrac=*/true,
/*AddImmPrefix=*/false);
@@ -1577,7 +1608,7 @@ X86Operand *X86AsmParser::ParseIntelOperand() {
return ParseIntelMemOperand(/*Disp=*/0, Start, Size);
}
-X86Operand *X86AsmParser::ParseATTOperand() {
+std::unique_ptr<X86Operand> X86AsmParser::ParseATTOperand() {
switch (getLexer().getKind()) {
default:
// Parse a memory operand with no segment register.
@@ -1613,9 +1644,8 @@ X86Operand *X86AsmParser::ParseATTOperand() {
}
}
-bool
-X86AsmParser::HandleAVX512Operand(SmallVectorImpl<MCParsedAsmOperand*> &Operands,
- const MCParsedAsmOperand &Op) {
+bool X86AsmParser::HandleAVX512Operand(OperandVector &Operands,
+ const MCParsedAsmOperand &Op) {
if(STI.getFeatureBits() & X86::FeatureAVX512) {
if (getLexer().is(AsmToken::LCurly)) {
// Eat "{" and mark the current place.
@@ -1653,8 +1683,8 @@ X86AsmParser::HandleAVX512Operand(SmallVectorImpl<MCParsedAsmOperand*> &Operands
} else {
// Parse mask register {%k1}
Operands.push_back(X86Operand::CreateToken("{", consumedToken));
- if (X86Operand *Op = ParseOperand()) {
- Operands.push_back(Op);
+ if (std::unique_ptr<X86Operand> Op = ParseOperand()) {
+ Operands.push_back(std::move(Op));
if (!getLexer().is(AsmToken::RCurly))
return !ErrorAndEatStatement(getLexer().getLoc(),
"Expected } at this point");
@@ -1682,7 +1712,8 @@ X86AsmParser::HandleAVX512Operand(SmallVectorImpl<MCParsedAsmOperand*> &Operands
/// ParseMemOperand: segment: disp(basereg, indexreg, scale). The '%ds:' prefix
/// has already been parsed if present.
-X86Operand *X86AsmParser::ParseMemOperand(unsigned SegReg, SMLoc MemStart) {
+std::unique_ptr<X86Operand> X86AsmParser::ParseMemOperand(unsigned SegReg,
+ SMLoc MemStart) {
// We have to disambiguate a parenthesized expression "(4+5)" from the start
// of a memory operand with a missing displacement "(%ebx)" or "(,%eax)". The
@@ -1845,9 +1876,8 @@ X86Operand *X86AsmParser::ParseMemOperand(unsigned SegReg, SMLoc MemStart) {
MemStart, MemEnd);
}
-bool X86AsmParser::
-ParseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc,
- SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
+ SMLoc NameLoc, OperandVector &Operands) {
InstInfo = &Info;
StringRef PatchedName = Name;
@@ -1940,9 +1970,9 @@ ParseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc,
// Read the operands.
while(1) {
- if (X86Operand *Op = ParseOperand()) {
- Operands.push_back(Op);
- if (!HandleAVX512Operand(Operands, *Op))
+ if (std::unique_ptr<X86Operand> Op = ParseOperand()) {
+ Operands.push_back(std::move(Op));
+ if (!HandleAVX512Operand(Operands, *Operands.back()))
return true;
} else {
Parser.eatToEndOfStatement();
@@ -1973,27 +2003,25 @@ ParseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc,
// documented form in various unofficial manuals, so a lot of code uses it.
if ((Name == "outb" || Name == "outw" || Name == "outl" || Name == "out") &&
Operands.size() == 3) {
- X86Operand &Op = *(X86Operand*)Operands.back();
+ X86Operand &Op = (X86Operand &)*Operands.back();
if (Op.isMem() && Op.Mem.SegReg == 0 &&
isa<MCConstantExpr>(Op.Mem.Disp) &&
cast<MCConstantExpr>(Op.Mem.Disp)->getValue() == 0 &&
Op.Mem.BaseReg == MatchRegisterName("dx") && Op.Mem.IndexReg == 0) {
SMLoc Loc = Op.getEndLoc();
Operands.back() = X86Operand::CreateReg(Op.Mem.BaseReg, Loc, Loc);
- delete &Op;
}
}
// Same hack for "in[bwl]? (%dx), %al" -> "inb %dx, %al".
if ((Name == "inb" || Name == "inw" || Name == "inl" || Name == "in") &&
Operands.size() == 3) {
- X86Operand &Op = *(X86Operand*)Operands.begin()[1];
+ X86Operand &Op = (X86Operand &)*Operands[1];
if (Op.isMem() && Op.Mem.SegReg == 0 &&
isa<MCConstantExpr>(Op.Mem.Disp) &&
cast<MCConstantExpr>(Op.Mem.Disp)->getValue() == 0 &&
Op.Mem.BaseReg == MatchRegisterName("dx") && Op.Mem.IndexReg == 0) {
SMLoc Loc = Op.getEndLoc();
- Operands.begin()[1] = X86Operand::CreateReg(Op.Mem.BaseReg, Loc, Loc);
- delete &Op;
+ Operands[1] = X86Operand::CreateReg(Op.Mem.BaseReg, Loc, Loc);
}
}
@@ -2060,8 +2088,8 @@ ParseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc,
Operands.push_back(DefaultMemSIOperand(NameLoc));
}
} else if (Operands.size() == 3) {
- X86Operand &Op = *(X86Operand*)Operands.begin()[1];
- X86Operand &Op2 = *(X86Operand*)Operands.begin()[2];
+ X86Operand &Op = (X86Operand &)*Operands[1];
+ X86Operand &Op2 = (X86Operand &)*Operands[2];
if (!doSrcDstMatch(Op, Op2))
return Error(Op.getStartLoc(),
"mismatching source and destination index registers");
@@ -2076,10 +2104,8 @@ ParseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc,
(Name == "smov" || Name == "smovb" || Name == "smovw" ||
Name == "smovl" || Name == "smovd" || Name == "smovq"))) {
if (Operands.size() == 1) {
- if (Name == "movsd") {
- delete Operands.back();
+ if (Name == "movsd")
Operands.back() = X86Operand::CreateToken("movsl", NameLoc);
- }
if (isParsingIntelSyntax()) {
Operands.push_back(DefaultMemDIOperand(NameLoc));
Operands.push_back(DefaultMemSIOperand(NameLoc));
@@ -2088,8 +2114,8 @@ ParseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc,
Operands.push_back(DefaultMemDIOperand(NameLoc));
}
} else if (Operands.size() == 3) {
- X86Operand &Op = *(X86Operand*)Operands.begin()[1];
- X86Operand &Op2 = *(X86Operand*)Operands.begin()[2];
+ X86Operand &Op = (X86Operand &)*Operands[1];
+ X86Operand &Op2 = (X86Operand &)*Operands[2];
if (!doSrcDstMatch(Op, Op2))
return Error(Op.getStartLoc(),
"mismatching source and destination index registers");
@@ -2105,31 +2131,26 @@ ParseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc,
Operands.size() == 3) {
if (isParsingIntelSyntax()) {
// Intel syntax
- X86Operand *Op1 = static_cast<X86Operand*>(Operands[2]);
- if (Op1->isImm() && isa<MCConstantExpr>(Op1->getImm()) &&
- cast<MCConstantExpr>(Op1->getImm())->getValue() == 1) {
- delete Operands[2];
+ X86Operand &Op1 = static_cast<X86Operand &>(*Operands[2]);
+ if (Op1.isImm() && isa<MCConstantExpr>(Op1.getImm()) &&
+ cast<MCConstantExpr>(Op1.getImm())->getValue() == 1)
Operands.pop_back();
- }
} else {
- X86Operand *Op1 = static_cast<X86Operand*>(Operands[1]);
- if (Op1->isImm() && isa<MCConstantExpr>(Op1->getImm()) &&
- cast<MCConstantExpr>(Op1->getImm())->getValue() == 1) {
- delete Operands[1];
+ X86Operand &Op1 = static_cast<X86Operand &>(*Operands[1]);
+ if (Op1.isImm() && isa<MCConstantExpr>(Op1.getImm()) &&
+ cast<MCConstantExpr>(Op1.getImm())->getValue() == 1)
Operands.erase(Operands.begin() + 1);
- }
}
}
// Transforms "int $3" into "int3" as a size optimization. We can't write an
// instalias with an immediate operand yet.
if (Name == "int" && Operands.size() == 2) {
- X86Operand *Op1 = static_cast<X86Operand*>(Operands[1]);
- if (Op1->isImm() && isa<MCConstantExpr>(Op1->getImm()) &&
- cast<MCConstantExpr>(Op1->getImm())->getValue() == 3) {
- delete Operands[1];
+ X86Operand &Op1 = static_cast<X86Operand &>(*Operands[1]);
+ if (Op1.isImm() && isa<MCConstantExpr>(Op1.getImm()) &&
+ cast<MCConstantExpr>(Op1.getImm())->getValue() == 3) {
Operands.erase(Operands.begin() + 1);
- static_cast<X86Operand*>(Operands[0])->setTokenValue("int3");
+ static_cast<X86Operand &>(*Operands[0]).setTokenValue("int3");
}
}
@@ -2175,9 +2196,7 @@ static bool convert64i32to64ri8(MCInst &Inst, unsigned Opcode,
return convertToSExti8(Inst, Opcode, X86::RAX, isCmp);
}
-bool X86AsmParser::
-processInstruction(MCInst &Inst,
- const SmallVectorImpl<MCParsedAsmOperand*> &Ops) {
+bool X86AsmParser::processInstruction(MCInst &Inst, const OperandVector &Ops) {
switch (Inst.getOpcode()) {
default: return false;
case X86::AND16i16: return convert16i16to16ri8(Inst, X86::AND16ri8);
@@ -2258,51 +2277,47 @@ processInstruction(MCInst &Inst,
static const char *getSubtargetFeatureName(unsigned Val);
-void X86AsmParser::EmitInstruction(
- MCInst &Inst, SmallVectorImpl<MCParsedAsmOperand *> &Operands,
- MCStreamer &Out) {
+void X86AsmParser::EmitInstruction(MCInst &Inst, OperandVector &Operands,
+ MCStreamer &Out) {
Instrumentation->InstrumentInstruction(Inst, Operands, getContext(), MII,
Out);
Out.EmitInstruction(Inst, STI);
}
-bool X86AsmParser::
-MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
- SmallVectorImpl<MCParsedAsmOperand*> &Operands,
- MCStreamer &Out, unsigned &ErrorInfo,
- bool MatchingInlineAsm) {
+bool X86AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
+ OperandVector &Operands,
+ MCStreamer &Out, unsigned &ErrorInfo,
+ bool MatchingInlineAsm) {
assert(!Operands.empty() && "Unexpect empty operand list!");
- X86Operand *Op = static_cast<X86Operand*>(Operands[0]);
- assert(Op->isToken() && "Leading operand should always be a mnemonic!");
+ X86Operand &Op = static_cast<X86Operand &>(*Operands[0]);
+ assert(Op.isToken() && "Leading operand should always be a mnemonic!");
ArrayRef<SMRange> EmptyRanges = None;
// First, handle aliases that expand to multiple instructions.
// FIXME: This should be replaced with a real .td file alias mechanism.
// Also, MatchInstructionImpl should actually *do* the EmitInstruction
// call.
- if (Op->getToken() == "fstsw" || Op->getToken() == "fstcw" ||
- Op->getToken() == "fstsww" || Op->getToken() == "fstcww" ||
- Op->getToken() == "finit" || Op->getToken() == "fsave" ||
- Op->getToken() == "fstenv" || Op->getToken() == "fclex") {
+ if (Op.getToken() == "fstsw" || Op.getToken() == "fstcw" ||
+ Op.getToken() == "fstsww" || Op.getToken() == "fstcww" ||
+ Op.getToken() == "finit" || Op.getToken() == "fsave" ||
+ Op.getToken() == "fstenv" || Op.getToken() == "fclex") {
MCInst Inst;
Inst.setOpcode(X86::WAIT);
Inst.setLoc(IDLoc);
if (!MatchingInlineAsm)
EmitInstruction(Inst, Operands, Out);
- const char *Repl =
- StringSwitch<const char*>(Op->getToken())
- .Case("finit", "fninit")
- .Case("fsave", "fnsave")
- .Case("fstcw", "fnstcw")
- .Case("fstcww", "fnstcw")
- .Case("fstenv", "fnstenv")
- .Case("fstsw", "fnstsw")
- .Case("fstsww", "fnstsw")
- .Case("fclex", "fnclex")
- .Default(nullptr);
+ const char *Repl = StringSwitch<const char *>(Op.getToken())
+ .Case("finit", "fninit")
+ .Case("fsave", "fnsave")
+ .Case("fstcw", "fnstcw")
+ .Case("fstcww", "fnstcw")
+ .Case("fstenv", "fnstenv")
+ .Case("fstsw", "fnstsw")
+ .Case("fstsww", "fnstsw")
+ .Case("fclex", "fnclex")
+ .Default(nullptr);
assert(Repl && "Unknown wait-prefixed instruction");
- delete Operands[0];
Operands[0] = X86Operand::CreateToken(Repl, IDLoc);
}
@@ -2355,11 +2370,11 @@ MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
// following hack.
// Change the operand to point to a temporary token.
- StringRef Base = Op->getToken();
+ StringRef Base = Op.getToken();
SmallString<16> Tmp;
Tmp += Base;
Tmp += ' ';
- Op->setTokenValue(Tmp.str());
+ Op.setTokenValue(Tmp.str());
// If this instruction starts with an 'f', then it is a floating point stack
// instruction. These come in up to three forms for 32-bit, 64-bit, and
@@ -2400,7 +2415,7 @@ MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
ErrorInfoMissingFeature = ErrorInfoIgnore;
// Restore the old token.
- Op->setTokenValue(Base);
+ Op.setTokenValue(Base);
// If exactly one matched, then we treat that as a successful match (and the
// instruction will already have been filled in correctly, since the failing
@@ -2450,8 +2465,8 @@ MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
if ((Match1 == Match_MnemonicFail) && (Match2 == Match_MnemonicFail) &&
(Match3 == Match_MnemonicFail) && (Match4 == Match_MnemonicFail)) {
if (!WasOriginallyInvalidOperand) {
- ArrayRef<SMRange> Ranges = MatchingInlineAsm ? EmptyRanges :
- Op->getLocRange();
+ ArrayRef<SMRange> Ranges =
+ MatchingInlineAsm ? EmptyRanges : Op.getLocRange();
return Error(IDLoc, "invalid instruction mnemonic '" + Base + "'",
Ranges, MatchingInlineAsm);
}
@@ -2462,10 +2477,10 @@ MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
return Error(IDLoc, "too few operands for instruction",
EmptyRanges, MatchingInlineAsm);
- X86Operand *Operand = (X86Operand*)Operands[ErrorInfo];
- if (Operand->getStartLoc().isValid()) {
- SMRange OperandRange = Operand->getLocRange();
- return Error(Operand->getStartLoc(), "invalid operand for instruction",
+ X86Operand &Operand = (X86Operand &)*Operands[ErrorInfo];
+ if (Operand.getStartLoc().isValid()) {
+ SMRange OperandRange = Operand.getLocRange();
+ return Error(Operand.getStartLoc(), "invalid operand for instruction",
OperandRange, MatchingInlineAsm);
}
}
diff --git a/lib/Target/X86/AsmParser/X86Operand.h b/lib/Target/X86/AsmParser/X86Operand.h
index de3be38..1bbfc11 100644
--- a/lib/Target/X86/AsmParser/X86Operand.h
+++ b/lib/Target/X86/AsmParser/X86Operand.h
@@ -13,6 +13,7 @@
#include "X86AsmParserCommon.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
+#include "llvm/ADT/STLExtras.h"
namespace llvm {
@@ -410,20 +411,19 @@ struct X86Operand : public MCParsedAsmOperand {
Inst.addOperand(MCOperand::CreateReg(getMemSegReg()));
}
- static X86Operand *CreateToken(StringRef Str, SMLoc Loc) {
+ static std::unique_ptr<X86Operand> CreateToken(StringRef Str, SMLoc Loc) {
SMLoc EndLoc = SMLoc::getFromPointer(Loc.getPointer() + Str.size());
- X86Operand *Res = new X86Operand(Token, Loc, EndLoc);
+ auto Res = llvm::make_unique<X86Operand>(Token, Loc, EndLoc);
Res->Tok.Data = Str.data();
Res->Tok.Length = Str.size();
return Res;
}
- static X86Operand *CreateReg(unsigned RegNo, SMLoc StartLoc, SMLoc EndLoc,
- bool AddressOf = false,
- SMLoc OffsetOfLoc = SMLoc(),
- StringRef SymName = StringRef(),
- void *OpDecl = nullptr) {
- X86Operand *Res = new X86Operand(Register, StartLoc, EndLoc);
+ static std::unique_ptr<X86Operand>
+ CreateReg(unsigned RegNo, SMLoc StartLoc, SMLoc EndLoc,
+ bool AddressOf = false, SMLoc OffsetOfLoc = SMLoc(),
+ StringRef SymName = StringRef(), void *OpDecl = nullptr) {
+ auto Res = llvm::make_unique<X86Operand>(Register, StartLoc, EndLoc);
Res->Reg.RegNo = RegNo;
Res->AddressOf = AddressOf;
Res->OffsetOfLoc = OffsetOfLoc;
@@ -432,17 +432,18 @@ struct X86Operand : public MCParsedAsmOperand {
return Res;
}
- static X86Operand *CreateImm(const MCExpr *Val, SMLoc StartLoc, SMLoc EndLoc){
- X86Operand *Res = new X86Operand(Immediate, StartLoc, EndLoc);
+ static std::unique_ptr<X86Operand> CreateImm(const MCExpr *Val,
+ SMLoc StartLoc, SMLoc EndLoc) {
+ auto Res = llvm::make_unique<X86Operand>(Immediate, StartLoc, EndLoc);
Res->Imm.Val = Val;
return Res;
}
/// Create an absolute memory operand.
- static X86Operand *CreateMem(const MCExpr *Disp, SMLoc StartLoc, SMLoc EndLoc,
- unsigned Size = 0, StringRef SymName = StringRef(),
- void *OpDecl = nullptr) {
- X86Operand *Res = new X86Operand(Memory, StartLoc, EndLoc);
+ static std::unique_ptr<X86Operand>
+ CreateMem(const MCExpr *Disp, SMLoc StartLoc, SMLoc EndLoc, unsigned Size = 0,
+ StringRef SymName = StringRef(), void *OpDecl = nullptr) {
+ auto Res = llvm::make_unique<X86Operand>(Memory, StartLoc, EndLoc);
Res->Mem.SegReg = 0;
Res->Mem.Disp = Disp;
Res->Mem.BaseReg = 0;
@@ -456,12 +457,11 @@ struct X86Operand : public MCParsedAsmOperand {
}
/// Create a generalized memory operand.
- static X86Operand *CreateMem(unsigned SegReg, const MCExpr *Disp,
- unsigned BaseReg, unsigned IndexReg,
- unsigned Scale, SMLoc StartLoc, SMLoc EndLoc,
- unsigned Size = 0,
- StringRef SymName = StringRef(),
- void *OpDecl = nullptr) {
+ static std::unique_ptr<X86Operand>
+ CreateMem(unsigned SegReg, const MCExpr *Disp, unsigned BaseReg,
+ unsigned IndexReg, unsigned Scale, SMLoc StartLoc, SMLoc EndLoc,
+ unsigned Size = 0, StringRef SymName = StringRef(),
+ void *OpDecl = nullptr) {
// We should never just have a displacement, that should be parsed as an
// absolute memory operand.
assert((SegReg || BaseReg || IndexReg) && "Invalid memory operand!");
@@ -469,7 +469,7 @@ struct X86Operand : public MCParsedAsmOperand {
// The scale should always be one of {1,2,4,8}.
assert(((Scale == 1 || Scale == 2 || Scale == 4 || Scale == 8)) &&
"Invalid scale!");
- X86Operand *Res = new X86Operand(Memory, StartLoc, EndLoc);
+ auto Res = llvm::make_unique<X86Operand>(Memory, StartLoc, EndLoc);
Res->Mem.SegReg = SegReg;
Res->Mem.Disp = Disp;
Res->Mem.BaseReg = BaseReg;
diff --git a/lib/Target/X86/CMakeLists.txt b/lib/Target/X86/CMakeLists.txt
index c54fbc1..a09767e 100644
--- a/lib/Target/X86/CMakeLists.txt
+++ b/lib/Target/X86/CMakeLists.txt
@@ -14,6 +14,7 @@ add_public_tablegen_target(X86CommonTableGen)
set(sources
X86AsmPrinter.cpp
+ X86AtomicExpandPass.cpp
X86CodeEmitter.cpp
X86FastISel.cpp
X86FloatingPoint.cpp
diff --git a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp
index 804606d..55587d4 100644
--- a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp
+++ b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp
@@ -1620,7 +1620,8 @@ static int readVVVV(struct InternalInstruction* insn) {
int vvvv;
if (insn->vectorExtensionType == TYPE_EVEX)
- vvvv = vvvvFromEVEX3of4(insn->vectorExtensionPrefix[2]);
+ vvvv = (v2FromEVEX4of4(insn->vectorExtensionPrefix[3]) << 4 |
+ vvvvFromEVEX3of4(insn->vectorExtensionPrefix[2]));
else if (insn->vectorExtensionType == TYPE_VEX_3B)
vvvv = vvvvFromVEX3of3(insn->vectorExtensionPrefix[2]);
else if (insn->vectorExtensionType == TYPE_VEX_2B)
diff --git a/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
index bf30a8e..23bca0d 100644
--- a/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
@@ -73,11 +73,12 @@ public:
};
class X86AsmBackend : public MCAsmBackend {
- StringRef CPU;
+ const StringRef CPU;
bool HasNopl;
+ const uint64_t MaxNopLength;
public:
X86AsmBackend(const Target &T, StringRef _CPU)
- : MCAsmBackend(), CPU(_CPU) {
+ : MCAsmBackend(), CPU(_CPU), MaxNopLength(_CPU == "slm" ? 7 : 15) {
HasNopl = CPU != "generic" && CPU != "i386" && CPU != "i486" &&
CPU != "i586" && CPU != "pentium" && CPU != "pentium-mmx" &&
CPU != "i686" && CPU != "k6" && CPU != "k6-2" && CPU != "k6-3" &&
@@ -331,7 +332,7 @@ bool X86AsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const {
// 15 is the longest single nop instruction. Emit as many 15-byte nops as
// needed, then emit a nop of the remaining length.
do {
- const uint8_t ThisNopLength = (uint8_t) std::min(Count, (uint64_t) 15);
+ const uint8_t ThisNopLength = (uint8_t) std::min(Count, MaxNopLength);
const uint8_t Prefixes = ThisNopLength <= 10 ? 0 : ThisNopLength - 10;
for (uint8_t i = 0; i < Prefixes; i++)
OW->Write8(0x66);
@@ -365,6 +366,17 @@ public:
}
};
+class ELFX86_X32AsmBackend : public ELFX86AsmBackend {
+public:
+ ELFX86_X32AsmBackend(const Target &T, uint8_t OSABI, StringRef CPU)
+ : ELFX86AsmBackend(T, OSABI, CPU) {}
+
+ MCObjectWriter *createObjectWriter(raw_ostream &OS) const override {
+ return createX86ELFObjectWriter(OS, /*IsELF64*/ false, OSABI,
+ ELF::EM_X86_64);
+ }
+};
+
class ELFX86_64AsmBackend : public ELFX86AsmBackend {
public:
ELFX86_64AsmBackend(const Target &T, uint8_t OSABI, StringRef CPU)
@@ -717,11 +729,10 @@ public:
};
class DarwinX86_32AsmBackend : public DarwinX86AsmBackend {
- bool SupportsCU;
public:
DarwinX86_32AsmBackend(const Target &T, const MCRegisterInfo &MRI,
- StringRef CPU, bool SupportsCU)
- : DarwinX86AsmBackend(T, MRI, CPU, false), SupportsCU(SupportsCU) {}
+ StringRef CPU)
+ : DarwinX86AsmBackend(T, MRI, CPU, false) {}
MCObjectWriter *createObjectWriter(raw_ostream &OS) const override {
return createX86MachObjectWriter(OS, /*Is64Bit=*/false,
@@ -732,20 +743,16 @@ public:
/// \brief Generate the compact unwind encoding for the CFI instructions.
uint32_t generateCompactUnwindEncoding(
ArrayRef<MCCFIInstruction> Instrs) const override {
- return SupportsCU ? generateCompactUnwindEncodingImpl(Instrs) : 0;
+ return generateCompactUnwindEncodingImpl(Instrs);
}
};
class DarwinX86_64AsmBackend : public DarwinX86AsmBackend {
- bool SupportsCU;
const MachO::CPUSubTypeX86 Subtype;
public:
DarwinX86_64AsmBackend(const Target &T, const MCRegisterInfo &MRI,
- StringRef CPU, bool SupportsCU,
- MachO::CPUSubTypeX86 st)
- : DarwinX86AsmBackend(T, MRI, CPU, true), SupportsCU(SupportsCU),
- Subtype(st) {
- }
+ StringRef CPU, MachO::CPUSubTypeX86 st)
+ : DarwinX86AsmBackend(T, MRI, CPU, true), Subtype(st) {}
MCObjectWriter *createObjectWriter(raw_ostream &OS) const override {
return createX86MachObjectWriter(OS, /*Is64Bit=*/true,
@@ -788,7 +795,7 @@ public:
/// \brief Generate the compact unwind encoding for the CFI instructions.
uint32_t generateCompactUnwindEncoding(
ArrayRef<MCCFIInstruction> Instrs) const override {
- return SupportsCU ? generateCompactUnwindEncodingImpl(Instrs) : 0;
+ return generateCompactUnwindEncodingImpl(Instrs);
}
};
@@ -801,9 +808,7 @@ MCAsmBackend *llvm::createX86_32AsmBackend(const Target &T,
Triple TheTriple(TT);
if (TheTriple.isOSBinFormatMachO())
- return new DarwinX86_32AsmBackend(T, MRI, CPU,
- TheTriple.isMacOSX() &&
- !TheTriple.isMacOSXVersionLT(10, 7));
+ return new DarwinX86_32AsmBackend(T, MRI, CPU);
if (TheTriple.isOSWindows() && !TheTriple.isOSBinFormatELF())
return new WindowsX86AsmBackend(T, false, CPU);
@@ -823,14 +828,15 @@ MCAsmBackend *llvm::createX86_64AsmBackend(const Target &T,
StringSwitch<MachO::CPUSubTypeX86>(TheTriple.getArchName())
.Case("x86_64h", MachO::CPU_SUBTYPE_X86_64_H)
.Default(MachO::CPU_SUBTYPE_X86_64_ALL);
- return new DarwinX86_64AsmBackend(T, MRI, CPU,
- TheTriple.isMacOSX() &&
- !TheTriple.isMacOSXVersionLT(10, 7), CS);
+ return new DarwinX86_64AsmBackend(T, MRI, CPU, CS);
}
if (TheTriple.isOSWindows() && !TheTriple.isOSBinFormatELF())
return new WindowsX86AsmBackend(T, true, CPU);
uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(TheTriple.getOS());
+
+ if (TheTriple.getEnvironment() == Triple::GNUX32)
+ return new ELFX86_X32AsmBackend(T, OSABI, CPU);
return new ELFX86_64AsmBackend(T, OSABI, CPU);
}
diff --git a/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp b/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp
index 39480ea..83b2777 100644
--- a/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp
@@ -74,8 +74,9 @@ X86MCAsmInfoDarwin::X86MCAsmInfoDarwin(const Triple &T) {
// FIXME: this should not depend on the target OS version, but on the ld64
// version in use. From at least >= ld64-97.17 (Xcode 3.2.6) the abs-ified
- // FDE relocs may be used.
- DwarfFDESymbolsUseAbsDiff = T.isMacOSX() && !T.isMacOSXVersionLT(10, 6);
+ // FDE relocs may be used. We also use them for the ios simulator.
+ DwarfFDESymbolsUseAbsDiff = (T.isMacOSX() && !T.isMacOSXVersionLT(10, 6))
+ || T.isiOS();
UseIntegratedAssembler = true;
}
@@ -142,8 +143,11 @@ getNonexecutableStackSection(MCContext &Ctx) const {
void X86MCAsmInfoMicrosoft::anchor() { }
X86MCAsmInfoMicrosoft::X86MCAsmInfoMicrosoft(const Triple &Triple) {
- if (Triple.getArch() == Triple::x86_64)
+ if (Triple.getArch() == Triple::x86_64) {
PrivateGlobalPrefix = ".L";
+ PointerSize = 8;
+ ExceptionsType = ExceptionHandling::WinEH;
+ }
AssemblerDialect = AsmWriterFlavor;
@@ -157,17 +161,18 @@ X86MCAsmInfoMicrosoft::X86MCAsmInfoMicrosoft(const Triple &Triple) {
void X86MCAsmInfoGNUCOFF::anchor() { }
X86MCAsmInfoGNUCOFF::X86MCAsmInfoGNUCOFF(const Triple &Triple) {
+ assert(Triple.isOSWindows() && "Windows is the only supported COFF target");
if (Triple.getArch() == Triple::x86_64) {
PrivateGlobalPrefix = ".L";
PointerSize = 8;
+ ExceptionsType = ExceptionHandling::WinEH;
+ } else {
+ ExceptionsType = ExceptionHandling::DwarfCFI;
}
AssemblerDialect = AsmWriterFlavor;
TextAlignFillValue = 0x90;
- // Exceptions handling
- ExceptionsType = ExceptionHandling::DwarfCFI;
-
UseIntegratedAssembler = true;
}
diff --git a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
index e63036c..5e29e5c 100644
--- a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
@@ -197,14 +197,13 @@ void X86_MC::DetectFamilyModel(unsigned EAX, unsigned &Family,
}
}
-unsigned X86_MC::getDwarfRegFlavour(StringRef TT, bool isEH) {
- Triple TheTriple(TT);
- if (TheTriple.getArch() == Triple::x86_64)
+unsigned X86_MC::getDwarfRegFlavour(Triple TT, bool isEH) {
+ if (TT.getArch() == Triple::x86_64)
return DWARFFlavour::X86_64;
- if (TheTriple.isOSDarwin())
+ if (TT.isOSDarwin())
return isEH ? DWARFFlavour::X86_32_DarwinEH : DWARFFlavour::X86_32_Generic;
- if (TheTriple.isOSCygMing())
+ if (TT.isOSCygMing())
// Unsupported by now, just quick fallback
return DWARFFlavour::X86_32_Generic;
return DWARFFlavour::X86_32_Generic;
@@ -251,8 +250,8 @@ static MCRegisterInfo *createX86MCRegisterInfo(StringRef TT) {
MCRegisterInfo *X = new MCRegisterInfo();
InitX86MCRegisterInfo(X, RA,
- X86_MC::getDwarfRegFlavour(TT, false),
- X86_MC::getDwarfRegFlavour(TT, true),
+ X86_MC::getDwarfRegFlavour(TheTriple, false),
+ X86_MC::getDwarfRegFlavour(TheTriple, true),
RA);
X86_MC::InitLLVM2SEHRegisterMapping(X);
return X;
diff --git a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h
index 8fe40fd..ebe74cf 100644
--- a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h
+++ b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h
@@ -28,6 +28,7 @@ class MCSubtargetInfo;
class MCRelocationInfo;
class MCStreamer;
class Target;
+class Triple;
class StringRef;
class raw_ostream;
@@ -64,7 +65,7 @@ namespace X86_MC {
void DetectFamilyModel(unsigned EAX, unsigned &Family, unsigned &Model);
- unsigned getDwarfRegFlavour(StringRef TT, bool isEH);
+ unsigned getDwarfRegFlavour(Triple TT, bool isEH);
void InitLLVM2SEHRegisterMapping(MCRegisterInfo *MRI);
diff --git a/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp b/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp
index c62fd0a..7fa4180 100644
--- a/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp
@@ -19,12 +19,12 @@ public:
raw_ostream &OS)
: MCWinCOFFStreamer(C, AB, *CE, OS) { }
- void EmitWin64EHHandlerData() override;
+ void EmitWinEHHandlerData() override;
void FinishImpl() override;
};
-void X86WinCOFFStreamer::EmitWin64EHHandlerData() {
- MCStreamer::EmitWin64EHHandlerData();
+void X86WinCOFFStreamer::EmitWinEHHandlerData() {
+ MCStreamer::EmitWinEHHandlerData();
// We have to emit the unwind info now, because this directive
// actually switches to the .xdata section!
diff --git a/lib/Target/X86/X86.h b/lib/Target/X86/X86.h
index 64e8ea8..d5522ed 100644
--- a/lib/Target/X86/X86.h
+++ b/lib/Target/X86/X86.h
@@ -24,6 +24,10 @@ class ImmutablePass;
class JITCodeEmitter;
class X86TargetMachine;
+/// createX86AtomicExpandPass - This pass expands atomic operations that cannot
+/// be handled natively in terms of a loop using cmpxchg.
+FunctionPass *createX86AtomicExpandPass(const X86TargetMachine *TM);
+
/// createX86ISelDag - This pass converts a legalized DAG into a
/// X86-specific DAG, ready for instruction scheduling.
///
diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td
index 6912b57..93f516a 100644
--- a/lib/Target/X86/X86.td
+++ b/lib/Target/X86/X86.td
@@ -168,6 +168,8 @@ def FeatureLEAUsesAG : SubtargetFeature<"lea-uses-ag", "LEAUsesAG", "true",
"LEA instruction needs inputs at AG stage">;
def FeatureSlowLEA : SubtargetFeature<"slow-lea", "SlowLEA", "true",
"LEA instruction with certain arguments is slow">;
+def FeatureSlowIncDec : SubtargetFeature<"slow-incdec", "SlowIncDec", "true",
+ "INC and DEC instructions are slower than ADD and SUB">;
//===----------------------------------------------------------------------===//
// X86 processors supported.
@@ -228,7 +230,7 @@ def : ProcessorModel<"slm", SLMModel, [ProcIntelSLM,
FeaturePCLMUL, FeatureAES,
FeatureCallRegIndirect,
FeaturePRFCHW,
- FeatureSlowLEA,
+ FeatureSlowLEA, FeatureSlowIncDec,
FeatureSlowBTMem, FeatureFastUAMem]>;
// "Arrandale" along with corei3 and corei5
def : ProcessorModel<"corei7", SandyBridgeModel,
@@ -271,7 +273,8 @@ def : ProcessorModel<"knl", HaswellModel,
FeatureCMPXCHG16B, FeatureFastUAMem, FeaturePOPCNT,
FeatureAES, FeaturePCLMUL, FeatureRDRAND, FeatureF16C,
FeatureFSGSBase, FeatureMOVBE, FeatureLZCNT, FeatureBMI,
- FeatureBMI2, FeatureFMA, FeatureRTM, FeatureHLE]>;
+ FeatureBMI2, FeatureFMA, FeatureRTM, FeatureHLE,
+ FeatureSlowIncDec]>;
def : Proc<"k6", [FeatureMMX]>;
def : Proc<"k6-2", [Feature3DNow]>;
diff --git a/lib/Target/X86/X86AtomicExpandPass.cpp b/lib/Target/X86/X86AtomicExpandPass.cpp
new file mode 100644
index 0000000..61eefbb
--- /dev/null
+++ b/lib/Target/X86/X86AtomicExpandPass.cpp
@@ -0,0 +1,287 @@
+//===-- X86AtomicExpandPass.cpp - Expand illegal atomic instructions --0---===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a pass (at IR level) to replace atomic instructions which
+// cannot be implemented as a single instruction with cmpxchg-based loops.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86.h"
+#include "X86TargetMachine.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetMachine.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "x86-atomic-expand"
+
+namespace {
+ class X86AtomicExpandPass : public FunctionPass {
+ const X86TargetMachine *TM;
+ public:
+ static char ID; // Pass identification, replacement for typeid
+ explicit X86AtomicExpandPass(const X86TargetMachine *TM)
+ : FunctionPass(ID), TM(TM) {}
+
+ bool runOnFunction(Function &F) override;
+ bool expandAtomicInsts(Function &F);
+
+ bool needsCmpXchgNb(Type *MemType);
+
+ /// There are four kinds of atomic operations. Two never need expanding:
+ /// cmpxchg is what we expand the others *to*, and loads are easily handled
+ /// by ISelLowering. Atomicrmw and store can need expanding in some
+ /// circumstances.
+ bool shouldExpand(Instruction *Inst);
+
+ /// 128-bit atomic stores (64-bit on i686) need to be implemented in terms
+ /// of trivial cmpxchg16b loops. A simple store isn't necessarily atomic.
+ bool shouldExpandStore(StoreInst *SI);
+
+ /// Only some atomicrmw instructions need expanding -- some operations
+ /// (e.g. max) have absolutely no architectural support; some (e.g. or) have
+ /// limited support but can't return the previous value; some (e.g. add)
+ /// have complete support in the instruction set.
+ ///
+ /// Also, naturally, 128-bit operations always need to be expanded.
+ bool shouldExpandAtomicRMW(AtomicRMWInst *AI);
+
+ bool expandAtomicRMW(AtomicRMWInst *AI);
+ bool expandAtomicStore(StoreInst *SI);
+ };
+}
+
+char X86AtomicExpandPass::ID = 0;
+
+FunctionPass *llvm::createX86AtomicExpandPass(const X86TargetMachine *TM) {
+ return new X86AtomicExpandPass(TM);
+}
+
+bool X86AtomicExpandPass::runOnFunction(Function &F) {
+ SmallVector<Instruction *, 1> AtomicInsts;
+
+ // Changing control-flow while iterating through it is a bad idea, so gather a
+ // list of all atomic instructions before we start.
+ for (BasicBlock &BB : F)
+ for (Instruction &Inst : BB) {
+ if (isa<AtomicRMWInst>(&Inst) ||
+ (isa<StoreInst>(&Inst) && cast<StoreInst>(&Inst)->isAtomic()))
+ AtomicInsts.push_back(&Inst);
+ }
+
+ bool MadeChange = false;
+ for (Instruction *Inst : AtomicInsts) {
+ if (!shouldExpand(Inst))
+ continue;
+
+ if (AtomicRMWInst *AI = dyn_cast<AtomicRMWInst>(Inst))
+ MadeChange |= expandAtomicRMW(AI);
+ if (StoreInst *SI = dyn_cast<StoreInst>(Inst))
+ MadeChange |= expandAtomicStore(SI);
+
+ assert(MadeChange && "Atomic inst not expanded when it should be?");
+ Inst->eraseFromParent();
+ }
+
+ return MadeChange;
+}
+
+/// Returns true if operations on the given type will need to use either
+/// cmpxchg8b or cmpxchg16b. This occurs if the type is 1 step up from the
+/// native width, and the instructions are available (otherwise we leave them
+/// alone to become __sync_fetch_and_... calls).
+bool X86AtomicExpandPass::needsCmpXchgNb(llvm::Type *MemType) {
+ const X86Subtarget &Subtarget = TM->getSubtarget<X86Subtarget>();
+ if (!Subtarget.hasCmpxchg16b())
+ return false;
+
+ unsigned CmpXchgNbWidth = Subtarget.is64Bit() ? 128 : 64;
+
+ unsigned OpWidth = MemType->getPrimitiveSizeInBits();
+ if (OpWidth == CmpXchgNbWidth)
+ return true;
+
+ return false;
+}
+
+
+bool X86AtomicExpandPass::shouldExpandAtomicRMW(AtomicRMWInst *AI) {
+ const X86Subtarget &Subtarget = TM->getSubtarget<X86Subtarget>();
+ unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
+
+ if (needsCmpXchgNb(AI->getType()))
+ return true;
+
+ if (AI->getType()->getPrimitiveSizeInBits() > NativeWidth)
+ return false;
+
+ AtomicRMWInst::BinOp Op = AI->getOperation();
+ switch (Op) {
+ default:
+ llvm_unreachable("Unknown atomic operation");
+ case AtomicRMWInst::Xchg:
+ case AtomicRMWInst::Add:
+ case AtomicRMWInst::Sub:
+ // It's better to use xadd, xsub or xchg for these in all cases.
+ return false;
+ case AtomicRMWInst::Or:
+ case AtomicRMWInst::And:
+ case AtomicRMWInst::Xor:
+ // If the atomicrmw's result isn't actually used, we can just add a "lock"
+ // prefix to a normal instruction for these operations.
+ return !AI->use_empty();
+ case AtomicRMWInst::Nand:
+ case AtomicRMWInst::Max:
+ case AtomicRMWInst::Min:
+ case AtomicRMWInst::UMax:
+ case AtomicRMWInst::UMin:
+ // These always require a non-trivial set of data operations on x86. We must
+ // use a cmpxchg loop.
+ return true;
+ }
+}
+
+bool X86AtomicExpandPass::shouldExpandStore(StoreInst *SI) {
+ if (needsCmpXchgNb(SI->getValueOperand()->getType()))
+ return true;
+
+ return false;
+}
+
+bool X86AtomicExpandPass::shouldExpand(Instruction *Inst) {
+ if (AtomicRMWInst *AI = dyn_cast<AtomicRMWInst>(Inst))
+ return shouldExpandAtomicRMW(AI);
+ if (StoreInst *SI = dyn_cast<StoreInst>(Inst))
+ return shouldExpandStore(SI);
+ return false;
+}
+
+/// Emit IR to implement the given atomicrmw operation on values in registers,
+/// returning the new value.
+static Value *performAtomicOp(AtomicRMWInst::BinOp Op, IRBuilder<> &Builder,
+ Value *Loaded, Value *Inc) {
+ Value *NewVal;
+ switch (Op) {
+ case AtomicRMWInst::Xchg:
+ return Inc;
+ case AtomicRMWInst::Add:
+ return Builder.CreateAdd(Loaded, Inc, "new");
+ case AtomicRMWInst::Sub:
+ return Builder.CreateSub(Loaded, Inc, "new");
+ case AtomicRMWInst::And:
+ return Builder.CreateAnd(Loaded, Inc, "new");
+ case AtomicRMWInst::Nand:
+ return Builder.CreateNot(Builder.CreateAnd(Loaded, Inc), "new");
+ case AtomicRMWInst::Or:
+ return Builder.CreateOr(Loaded, Inc, "new");
+ case AtomicRMWInst::Xor:
+ return Builder.CreateXor(Loaded, Inc, "new");
+ case AtomicRMWInst::Max:
+ NewVal = Builder.CreateICmpSGT(Loaded, Inc);
+ return Builder.CreateSelect(NewVal, Loaded, Inc, "new");
+ case AtomicRMWInst::Min:
+ NewVal = Builder.CreateICmpSLE(Loaded, Inc);
+ return Builder.CreateSelect(NewVal, Loaded, Inc, "new");
+ case AtomicRMWInst::UMax:
+ NewVal = Builder.CreateICmpUGT(Loaded, Inc);
+ return Builder.CreateSelect(NewVal, Loaded, Inc, "new");
+ case AtomicRMWInst::UMin:
+ NewVal = Builder.CreateICmpULE(Loaded, Inc);
+ return Builder.CreateSelect(NewVal, Loaded, Inc, "new");
+ default:
+ break;
+ }
+ llvm_unreachable("Unknown atomic op");
+}
+
+bool X86AtomicExpandPass::expandAtomicRMW(AtomicRMWInst *AI) {
+ AtomicOrdering Order =
+ AI->getOrdering() == Unordered ? Monotonic : AI->getOrdering();
+ Value *Addr = AI->getPointerOperand();
+ BasicBlock *BB = AI->getParent();
+ Function *F = BB->getParent();
+ LLVMContext &Ctx = F->getContext();
+
+ // Given: atomicrmw some_op iN* %addr, iN %incr ordering
+ //
+ // The standard expansion we produce is:
+ // [...]
+ // %init_loaded = load atomic iN* %addr
+ // br label %loop
+ // loop:
+ // %loaded = phi iN [ %init_loaded, %entry ], [ %new_loaded, %loop ]
+ // %new = some_op iN %loaded, %incr
+ // %pair = cmpxchg iN* %addr, iN %loaded, iN %new
+ // %new_loaded = extractvalue { iN, i1 } %pair, 0
+ // %success = extractvalue { iN, i1 } %pair, 1
+ // br i1 %success, label %atomicrmw.end, label %loop
+ // atomicrmw.end:
+ // [...]
+ BasicBlock *ExitBB = BB->splitBasicBlock(AI, "atomicrmw.end");
+ BasicBlock *LoopBB = BasicBlock::Create(Ctx, "atomicrmw.start", F, ExitBB);
+
+ // This grabs the DebugLoc from AI.
+ IRBuilder<> Builder(AI);
+
+ // The split call above "helpfully" added a branch at the end of BB (to the
+ // wrong place), but we want a load. It's easiest to just remove
+ // the branch entirely.
+ std::prev(BB->end())->eraseFromParent();
+ Builder.SetInsertPoint(BB);
+ LoadInst *InitLoaded = Builder.CreateLoad(Addr);
+ InitLoaded->setAlignment(AI->getType()->getPrimitiveSizeInBits());
+ Builder.CreateBr(LoopBB);
+
+ // Start the main loop block now that we've taken care of the preliminaries.
+ Builder.SetInsertPoint(LoopBB);
+ PHINode *Loaded = Builder.CreatePHI(AI->getType(), 2, "loaded");
+ Loaded->addIncoming(InitLoaded, BB);
+
+ Value *NewVal =
+ performAtomicOp(AI->getOperation(), Builder, Loaded, AI->getValOperand());
+
+ Value *Pair = Builder.CreateAtomicCmpXchg(
+ Addr, Loaded, NewVal, Order,
+ AtomicCmpXchgInst::getStrongestFailureOrdering(Order));
+ Value *NewLoaded = Builder.CreateExtractValue(Pair, 0, "newloaded");
+ Loaded->addIncoming(NewLoaded, LoopBB);
+
+ Value *Success = Builder.CreateExtractValue(Pair, 1, "success");
+ Builder.CreateCondBr(Success, ExitBB, LoopBB);
+
+ AI->replaceAllUsesWith(NewLoaded);
+
+ return true;
+}
+
+bool X86AtomicExpandPass::expandAtomicStore(StoreInst *SI) {
+ // An atomic store might need cmpxchg16b (or 8b on x86) to execute. Express
+ // this in terms of the usual expansion to "atomicrmw xchg".
+ IRBuilder<> Builder(SI);
+ AtomicOrdering Order =
+ SI->getOrdering() == Unordered ? Monotonic : SI->getOrdering();
+ AtomicRMWInst *AI =
+ Builder.CreateAtomicRMW(AtomicRMWInst::Xchg, SI->getPointerOperand(),
+ SI->getValueOperand(), Order);
+
+ // Now we have an appropriate swap instruction, lower it as usual.
+ if (shouldExpandAtomicRMW(AI)) {
+ expandAtomicRMW(AI);
+ AI->eraseFromParent();
+ return true;
+ }
+
+ return AI;
+}
diff --git a/lib/Target/X86/X86CodeEmitter.cpp b/lib/Target/X86/X86CodeEmitter.cpp
index 76718d0..a3ae7ee 100644
--- a/lib/Target/X86/X86CodeEmitter.cpp
+++ b/lib/Target/X86/X86CodeEmitter.cpp
@@ -1113,9 +1113,14 @@ void Emitter<CodeEmitter>::emitInstruction(MachineInstr &MI,
case TargetOpcode::INLINEASM:
// We allow inline assembler nodes with empty bodies - they can
// implicitly define registers, which is ok for JIT.
- if (MI.getOperand(0).getSymbolName()[0])
+ if (MI.getOperand(0).getSymbolName()[0]) {
+ DebugLoc DL = MI.getDebugLoc();
+ DL.print(MI.getParent()->getParent()->getFunction()->getContext(),
+ llvm::errs());
report_fatal_error("JIT does not support inline asm!");
+ }
break;
+ case TargetOpcode::DBG_VALUE:
case TargetOpcode::CFI_INSTRUCTION:
break;
case TargetOpcode::GC_LABEL:
@@ -1126,6 +1131,16 @@ void Emitter<CodeEmitter>::emitInstruction(MachineInstr &MI,
case TargetOpcode::IMPLICIT_DEF:
case TargetOpcode::KILL:
break;
+
+ case X86::SEH_PushReg:
+ case X86::SEH_SaveReg:
+ case X86::SEH_SaveXMM:
+ case X86::SEH_StackAlloc:
+ case X86::SEH_SetFrame:
+ case X86::SEH_PushFrame:
+ case X86::SEH_EndPrologue:
+ break;
+
case X86::MOVPC32r: {
// This emits the "call" portion of this pseudo instruction.
MCE.emitByte(BaseOpcode);
diff --git a/lib/Target/X86/X86FastISel.cpp b/lib/Target/X86/X86FastISel.cpp
index 56bcfa3..ce554ba 100644
--- a/lib/Target/X86/X86FastISel.cpp
+++ b/lib/Target/X86/X86FastISel.cpp
@@ -16,10 +16,12 @@
#include "X86.h"
#include "X86CallingConv.h"
#include "X86InstrBuilder.h"
+#include "X86InstrInfo.h"
#include "X86MachineFunctionInfo.h"
#include "X86RegisterInfo.h"
#include "X86Subtarget.h"
#include "X86TargetMachine.h"
+#include "llvm/Analysis/BranchProbabilityInfo.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/FastISel.h"
#include "llvm/CodeGen/FunctionLoweringInfo.h"
@@ -78,12 +80,14 @@ public:
private:
bool X86FastEmitCompare(const Value *LHS, const Value *RHS, EVT VT);
- bool X86FastEmitLoad(EVT VT, const X86AddressMode &AM, unsigned &RR);
+ bool X86FastEmitLoad(EVT VT, const X86AddressMode &AM, MachineMemOperand *MMO,
+ unsigned &ResultReg);
bool X86FastEmitStore(EVT VT, const Value *Val, const X86AddressMode &AM,
- bool Aligned = false);
- bool X86FastEmitStore(EVT VT, unsigned ValReg, const X86AddressMode &AM,
- bool Aligned = false);
+ MachineMemOperand *MMO = nullptr, bool Aligned = false);
+ bool X86FastEmitStore(EVT VT, unsigned ValReg, bool ValIsKill,
+ const X86AddressMode &AM,
+ MachineMemOperand *MMO = nullptr, bool Aligned = false);
bool X86FastEmitExtend(ISD::NodeType Opc, EVT DstVT, unsigned Src, EVT SrcVT,
unsigned &ResultReg);
@@ -107,6 +111,12 @@ private:
bool X86SelectDivRem(const Instruction *I);
+ bool X86FastEmitCMoveSelect(MVT RetVT, const Instruction *I);
+
+ bool X86FastEmitSSESelect(MVT RetVT, const Instruction *I);
+
+ bool X86FastEmitPseudoSelect(MVT RetVT, const Instruction *I);
+
bool X86SelectSelect(const Instruction *I);
bool X86SelectTrunc(const Instruction *I);
@@ -147,10 +157,182 @@ private:
bool TryEmitSmallMemcpy(X86AddressMode DestAM,
X86AddressMode SrcAM, uint64_t Len);
+
+ bool foldX86XALUIntrinsic(X86::CondCode &CC, const Instruction *I,
+ const Value *Cond);
};
} // end anonymous namespace.
+static CmpInst::Predicate optimizeCmpPredicate(const CmpInst *CI) {
+ // If both operands are the same, then try to optimize or fold the cmp.
+ CmpInst::Predicate Predicate = CI->getPredicate();
+ if (CI->getOperand(0) != CI->getOperand(1))
+ return Predicate;
+
+ switch (Predicate) {
+ default: llvm_unreachable("Invalid predicate!");
+ case CmpInst::FCMP_FALSE: Predicate = CmpInst::FCMP_FALSE; break;
+ case CmpInst::FCMP_OEQ: Predicate = CmpInst::FCMP_ORD; break;
+ case CmpInst::FCMP_OGT: Predicate = CmpInst::FCMP_FALSE; break;
+ case CmpInst::FCMP_OGE: Predicate = CmpInst::FCMP_ORD; break;
+ case CmpInst::FCMP_OLT: Predicate = CmpInst::FCMP_FALSE; break;
+ case CmpInst::FCMP_OLE: Predicate = CmpInst::FCMP_ORD; break;
+ case CmpInst::FCMP_ONE: Predicate = CmpInst::FCMP_FALSE; break;
+ case CmpInst::FCMP_ORD: Predicate = CmpInst::FCMP_ORD; break;
+ case CmpInst::FCMP_UNO: Predicate = CmpInst::FCMP_UNO; break;
+ case CmpInst::FCMP_UEQ: Predicate = CmpInst::FCMP_TRUE; break;
+ case CmpInst::FCMP_UGT: Predicate = CmpInst::FCMP_UNO; break;
+ case CmpInst::FCMP_UGE: Predicate = CmpInst::FCMP_TRUE; break;
+ case CmpInst::FCMP_ULT: Predicate = CmpInst::FCMP_UNO; break;
+ case CmpInst::FCMP_ULE: Predicate = CmpInst::FCMP_TRUE; break;
+ case CmpInst::FCMP_UNE: Predicate = CmpInst::FCMP_UNO; break;
+ case CmpInst::FCMP_TRUE: Predicate = CmpInst::FCMP_TRUE; break;
+
+ case CmpInst::ICMP_EQ: Predicate = CmpInst::FCMP_TRUE; break;
+ case CmpInst::ICMP_NE: Predicate = CmpInst::FCMP_FALSE; break;
+ case CmpInst::ICMP_UGT: Predicate = CmpInst::FCMP_FALSE; break;
+ case CmpInst::ICMP_UGE: Predicate = CmpInst::FCMP_TRUE; break;
+ case CmpInst::ICMP_ULT: Predicate = CmpInst::FCMP_FALSE; break;
+ case CmpInst::ICMP_ULE: Predicate = CmpInst::FCMP_TRUE; break;
+ case CmpInst::ICMP_SGT: Predicate = CmpInst::FCMP_FALSE; break;
+ case CmpInst::ICMP_SGE: Predicate = CmpInst::FCMP_TRUE; break;
+ case CmpInst::ICMP_SLT: Predicate = CmpInst::FCMP_FALSE; break;
+ case CmpInst::ICMP_SLE: Predicate = CmpInst::FCMP_TRUE; break;
+ }
+
+ return Predicate;
+}
+
+static std::pair<X86::CondCode, bool>
+getX86ConditionCode(CmpInst::Predicate Predicate) {
+ X86::CondCode CC = X86::COND_INVALID;
+ bool NeedSwap = false;
+ switch (Predicate) {
+ default: break;
+ // Floating-point Predicates
+ case CmpInst::FCMP_UEQ: CC = X86::COND_E; break;
+ case CmpInst::FCMP_OLT: NeedSwap = true; // fall-through
+ case CmpInst::FCMP_OGT: CC = X86::COND_A; break;
+ case CmpInst::FCMP_OLE: NeedSwap = true; // fall-through
+ case CmpInst::FCMP_OGE: CC = X86::COND_AE; break;
+ case CmpInst::FCMP_UGT: NeedSwap = true; // fall-through
+ case CmpInst::FCMP_ULT: CC = X86::COND_B; break;
+ case CmpInst::FCMP_UGE: NeedSwap = true; // fall-through
+ case CmpInst::FCMP_ULE: CC = X86::COND_BE; break;
+ case CmpInst::FCMP_ONE: CC = X86::COND_NE; break;
+ case CmpInst::FCMP_UNO: CC = X86::COND_P; break;
+ case CmpInst::FCMP_ORD: CC = X86::COND_NP; break;
+ case CmpInst::FCMP_OEQ: // fall-through
+ case CmpInst::FCMP_UNE: CC = X86::COND_INVALID; break;
+
+ // Integer Predicates
+ case CmpInst::ICMP_EQ: CC = X86::COND_E; break;
+ case CmpInst::ICMP_NE: CC = X86::COND_NE; break;
+ case CmpInst::ICMP_UGT: CC = X86::COND_A; break;
+ case CmpInst::ICMP_UGE: CC = X86::COND_AE; break;
+ case CmpInst::ICMP_ULT: CC = X86::COND_B; break;
+ case CmpInst::ICMP_ULE: CC = X86::COND_BE; break;
+ case CmpInst::ICMP_SGT: CC = X86::COND_G; break;
+ case CmpInst::ICMP_SGE: CC = X86::COND_GE; break;
+ case CmpInst::ICMP_SLT: CC = X86::COND_L; break;
+ case CmpInst::ICMP_SLE: CC = X86::COND_LE; break;
+ }
+
+ return std::make_pair(CC, NeedSwap);
+}
+
+static std::pair<unsigned, bool>
+getX86SSEConditionCode(CmpInst::Predicate Predicate) {
+ unsigned CC;
+ bool NeedSwap = false;
+
+ // SSE Condition code mapping:
+ // 0 - EQ
+ // 1 - LT
+ // 2 - LE
+ // 3 - UNORD
+ // 4 - NEQ
+ // 5 - NLT
+ // 6 - NLE
+ // 7 - ORD
+ switch (Predicate) {
+ default: llvm_unreachable("Unexpected predicate");
+ case CmpInst::FCMP_OEQ: CC = 0; break;
+ case CmpInst::FCMP_OGT: NeedSwap = true; // fall-through
+ case CmpInst::FCMP_OLT: CC = 1; break;
+ case CmpInst::FCMP_OGE: NeedSwap = true; // fall-through
+ case CmpInst::FCMP_OLE: CC = 2; break;
+ case CmpInst::FCMP_UNO: CC = 3; break;
+ case CmpInst::FCMP_UNE: CC = 4; break;
+ case CmpInst::FCMP_ULE: NeedSwap = true; // fall-through
+ case CmpInst::FCMP_UGE: CC = 5; break;
+ case CmpInst::FCMP_ULT: NeedSwap = true; // fall-through
+ case CmpInst::FCMP_UGT: CC = 6; break;
+ case CmpInst::FCMP_ORD: CC = 7; break;
+ case CmpInst::FCMP_UEQ:
+ case CmpInst::FCMP_ONE: CC = 8; break;
+ }
+
+ return std::make_pair(CC, NeedSwap);
+}
+
+/// \brief Check if it is possible to fold the condition from the XALU intrinsic
+/// into the user. The condition code will only be updated on success.
+bool X86FastISel::foldX86XALUIntrinsic(X86::CondCode &CC, const Instruction *I,
+ const Value *Cond) {
+ if (!isa<ExtractValueInst>(Cond))
+ return false;
+
+ const auto *EV = cast<ExtractValueInst>(Cond);
+ if (!isa<IntrinsicInst>(EV->getAggregateOperand()))
+ return false;
+
+ const auto *II = cast<IntrinsicInst>(EV->getAggregateOperand());
+ MVT RetVT;
+ const Function *Callee = II->getCalledFunction();
+ Type *RetTy =
+ cast<StructType>(Callee->getReturnType())->getTypeAtIndex(0U);
+ if (!isTypeLegal(RetTy, RetVT))
+ return false;
+
+ if (RetVT != MVT::i32 && RetVT != MVT::i64)
+ return false;
+
+ X86::CondCode TmpCC;
+ switch (II->getIntrinsicID()) {
+ default: return false;
+ case Intrinsic::sadd_with_overflow:
+ case Intrinsic::ssub_with_overflow:
+ case Intrinsic::smul_with_overflow:
+ case Intrinsic::umul_with_overflow: TmpCC = X86::COND_O; break;
+ case Intrinsic::uadd_with_overflow:
+ case Intrinsic::usub_with_overflow: TmpCC = X86::COND_B; break;
+ }
+
+ // Check if both instructions are in the same basic block.
+ if (II->getParent() != I->getParent())
+ return false;
+
+ // Make sure nothing is in the way
+ BasicBlock::const_iterator Start = I;
+ BasicBlock::const_iterator End = II;
+ for (auto Itr = std::prev(Start); Itr != End; --Itr) {
+ // We only expect extractvalue instructions between the intrinsic and the
+ // instruction to be selected.
+ if (!isa<ExtractValueInst>(Itr))
+ return false;
+
+ // Check that the extractvalue operand comes from the intrinsic.
+ const auto *EVI = cast<ExtractValueInst>(Itr);
+ if (EVI->getAggregateOperand() != II)
+ return false;
+ }
+
+ CC = TmpCC;
+ return true;
+}
+
bool X86FastISel::isTypeLegal(Type *Ty, MVT &VT, bool AllowI1) {
EVT evt = TLI.getValueType(Ty, /*HandleUnknown=*/true);
if (evt == MVT::Other || !evt.isSimple())
@@ -180,7 +362,7 @@ bool X86FastISel::isTypeLegal(Type *Ty, MVT &VT, bool AllowI1) {
/// The address is either pre-computed, i.e. Ptr, or a GlobalAddress, i.e. GV.
/// Return true and the result register by reference if it is possible.
bool X86FastISel::X86FastEmitLoad(EVT VT, const X86AddressMode &AM,
- unsigned &ResultReg) {
+ MachineMemOperand *MMO, unsigned &ResultReg) {
// Get opcode and regclass of the output for the given load instruction.
unsigned Opc = 0;
const TargetRegisterClass *RC = nullptr;
@@ -228,8 +410,11 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, const X86AddressMode &AM,
}
ResultReg = createResultReg(RC);
- addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt,
- DbgLoc, TII.get(Opc), ResultReg), AM);
+ MachineInstrBuilder MIB =
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg);
+ addFullAddress(MIB, AM);
+ if (MMO)
+ MIB->addMemOperand(*FuncInfo.MF, MMO);
return true;
}
@@ -237,9 +422,9 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, const X86AddressMode &AM,
/// type VT. The address is either pre-computed, consisted of a base ptr, Ptr
/// and a displacement offset, or a GlobalAddress,
/// i.e. V. Return true if it is possible.
-bool
-X86FastISel::X86FastEmitStore(EVT VT, unsigned ValReg,
- const X86AddressMode &AM, bool Aligned) {
+bool X86FastISel::X86FastEmitStore(EVT VT, unsigned ValReg, bool ValIsKill,
+ const X86AddressMode &AM,
+ MachineMemOperand *MMO, bool Aligned) {
// Get opcode and regclass of the output for the given store instruction.
unsigned Opc = 0;
switch (VT.getSimpleVT().SimpleTy) {
@@ -249,7 +434,8 @@ X86FastISel::X86FastEmitStore(EVT VT, unsigned ValReg,
// Mask out all but lowest bit.
unsigned AndResult = createResultReg(&X86::GR8RegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
- TII.get(X86::AND8ri), AndResult).addReg(ValReg).addImm(1);
+ TII.get(X86::AND8ri), AndResult)
+ .addReg(ValReg, getKillRegState(ValIsKill)).addImm(1);
ValReg = AndResult;
}
// FALLTHROUGH, handling i1 as i8.
@@ -288,13 +474,18 @@ X86FastISel::X86FastEmitStore(EVT VT, unsigned ValReg,
break;
}
- addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt,
- DbgLoc, TII.get(Opc)), AM).addReg(ValReg);
+ MachineInstrBuilder MIB =
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc));
+ addFullAddress(MIB, AM).addReg(ValReg, getKillRegState(ValIsKill));
+ if (MMO)
+ MIB->addMemOperand(*FuncInfo.MF, MMO);
+
return true;
}
bool X86FastISel::X86FastEmitStore(EVT VT, const Value *Val,
- const X86AddressMode &AM, bool Aligned) {
+ const X86AddressMode &AM,
+ MachineMemOperand *MMO, bool Aligned) {
// Handle 'null' like i32/i64 0.
if (isa<ConstantPointerNull>(Val))
Val = Constant::getNullValue(DL.getIntPtrType(Val->getContext()));
@@ -317,10 +508,12 @@ bool X86FastISel::X86FastEmitStore(EVT VT, const Value *Val,
}
if (Opc) {
- addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt,
- DbgLoc, TII.get(Opc)), AM)
- .addImm(Signed ? (uint64_t) CI->getSExtValue() :
- CI->getZExtValue());
+ MachineInstrBuilder MIB =
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc));
+ addFullAddress(MIB, AM).addImm(Signed ? (uint64_t) CI->getSExtValue()
+ : CI->getZExtValue());
+ if (MMO)
+ MIB->addMemOperand(*FuncInfo.MF, MMO);
return true;
}
}
@@ -329,7 +522,8 @@ bool X86FastISel::X86FastEmitStore(EVT VT, const Value *Val,
if (ValReg == 0)
return false;
- return X86FastEmitStore(VT, ValReg, AM, Aligned);
+ bool ValKill = hasTrivialKill(Val);
+ return X86FastEmitStore(VT, ValReg, ValKill, AM, MMO, Aligned);
}
/// X86FastEmitExtend - Emit a machine instruction to extend a value Src of
@@ -355,17 +549,8 @@ bool X86FastISel::handleConstantAddresses(const Value *V, X86AddressMode &AM) {
return false;
// Can't handle TLS yet.
- if (const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV))
- if (GVar->isThreadLocal())
- return false;
-
- // Can't handle TLS yet, part 2 (this is slightly crazy, but this is how
- // it works...).
- if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV))
- if (const GlobalVariable *GVar =
- dyn_cast_or_null<GlobalVariable>(GA->getAliasee()))
- if (GVar->isThreadLocal())
- return false;
+ if (GV->isThreadLocal())
+ return false;
// RIP-relative addresses can't have additional register operands, so if
// we've already folded stuff into the addressing mode, just force the
@@ -696,7 +881,7 @@ bool X86FastISel::X86SelectCallAddress(const Value *V, X86AddressMode &AM) {
(AM.Base.Reg != 0 || AM.IndexReg != 0))
return false;
- // Can't handle DbgLocLImport.
+ // Can't handle DLL Import.
if (GV->hasDLLImportStorageClass())
return false;
@@ -749,19 +934,24 @@ bool X86FastISel::X86SelectStore(const Instruction *I) {
if (S->isAtomic())
return false;
- unsigned SABIAlignment =
- DL.getABITypeAlignment(S->getValueOperand()->getType());
- bool Aligned = S->getAlignment() == 0 || S->getAlignment() >= SABIAlignment;
+ const Value *Val = S->getValueOperand();
+ const Value *Ptr = S->getPointerOperand();
MVT VT;
- if (!isTypeLegal(I->getOperand(0)->getType(), VT, /*AllowI1=*/true))
+ if (!isTypeLegal(Val->getType(), VT, /*AllowI1=*/true))
return false;
+ unsigned Alignment = S->getAlignment();
+ unsigned ABIAlignment = DL.getABITypeAlignment(Val->getType());
+ if (Alignment == 0) // Ensure that codegen never sees alignment 0
+ Alignment = ABIAlignment;
+ bool Aligned = Alignment >= ABIAlignment;
+
X86AddressMode AM;
- if (!X86SelectAddress(I->getOperand(1), AM))
+ if (!X86SelectAddress(Ptr, AM))
return false;
- return X86FastEmitStore(VT, I->getOperand(0), AM, Aligned);
+ return X86FastEmitStore(VT, Val, AM, createMachineMemOperandFor(I), Aligned);
}
/// X86SelectRet - Select and emit code to implement ret instructions.
@@ -896,25 +1086,29 @@ bool X86FastISel::X86SelectRet(const Instruction *I) {
/// X86SelectLoad - Select and emit code to implement load instructions.
///
-bool X86FastISel::X86SelectLoad(const Instruction *I) {
+bool X86FastISel::X86SelectLoad(const Instruction *I) {
+ const LoadInst *LI = cast<LoadInst>(I);
+
// Atomic loads need special handling.
- if (cast<LoadInst>(I)->isAtomic())
+ if (LI->isAtomic())
return false;
MVT VT;
- if (!isTypeLegal(I->getType(), VT, /*AllowI1=*/true))
+ if (!isTypeLegal(LI->getType(), VT, /*AllowI1=*/true))
return false;
+ const Value *Ptr = LI->getPointerOperand();
+
X86AddressMode AM;
- if (!X86SelectAddress(I->getOperand(0), AM))
+ if (!X86SelectAddress(Ptr, AM))
return false;
unsigned ResultReg = 0;
- if (X86FastEmitLoad(VT, AM, ResultReg)) {
- UpdateValueMap(I, ResultReg);
- return true;
- }
- return false;
+ if (!X86FastEmitLoad(VT, AM, createMachineMemOperandFor(LI), ResultReg))
+ return false;
+
+ UpdateValueMap(I, ResultReg);
+ return true;
}
static unsigned X86ChooseCmpOpcode(EVT VT, const X86Subtarget *Subtarget) {
@@ -994,73 +1188,89 @@ bool X86FastISel::X86SelectCmp(const Instruction *I) {
if (!isTypeLegal(I->getOperand(0)->getType(), VT))
return false;
- unsigned ResultReg = createResultReg(&X86::GR8RegClass);
- unsigned SetCCOpc;
- bool SwapArgs; // false -> compare Op0, Op1. true -> compare Op1, Op0.
- switch (CI->getPredicate()) {
- case CmpInst::FCMP_OEQ: {
- if (!X86FastEmitCompare(CI->getOperand(0), CI->getOperand(1), VT))
+ // Try to optimize or fold the cmp.
+ CmpInst::Predicate Predicate = optimizeCmpPredicate(CI);
+ unsigned ResultReg = 0;
+ switch (Predicate) {
+ default: break;
+ case CmpInst::FCMP_FALSE: {
+ ResultReg = createResultReg(&X86::GR32RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOV32r0),
+ ResultReg);
+ ResultReg = FastEmitInst_extractsubreg(MVT::i8, ResultReg, /*Kill=*/true,
+ X86::sub_8bit);
+ if (!ResultReg)
return false;
+ break;
+ }
+ case CmpInst::FCMP_TRUE: {
+ ResultReg = createResultReg(&X86::GR8RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOV8ri),
+ ResultReg).addImm(1);
+ break;
+ }
+ }
- unsigned EReg = createResultReg(&X86::GR8RegClass);
- unsigned NPReg = createResultReg(&X86::GR8RegClass);
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::SETEr), EReg);
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
- TII.get(X86::SETNPr), NPReg);
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
- TII.get(X86::AND8rr), ResultReg).addReg(NPReg).addReg(EReg);
+ if (ResultReg) {
UpdateValueMap(I, ResultReg);
return true;
}
- case CmpInst::FCMP_UNE: {
- if (!X86FastEmitCompare(CI->getOperand(0), CI->getOperand(1), VT))
+
+ const Value *LHS = CI->getOperand(0);
+ const Value *RHS = CI->getOperand(1);
+
+ // The optimizer might have replaced fcmp oeq %x, %x with fcmp ord %x, 0.0.
+ // We don't have to materialize a zero constant for this case and can just use
+ // %x again on the RHS.
+ if (Predicate == CmpInst::FCMP_ORD || Predicate == CmpInst::FCMP_UNO) {
+ const auto *RHSC = dyn_cast<ConstantFP>(RHS);
+ if (RHSC && RHSC->isNullValue())
+ RHS = LHS;
+ }
+
+ // FCMP_OEQ and FCMP_UNE cannot be checked with a single instruction.
+ static unsigned SETFOpcTable[2][3] = {
+ { X86::SETEr, X86::SETNPr, X86::AND8rr },
+ { X86::SETNEr, X86::SETPr, X86::OR8rr }
+ };
+ unsigned *SETFOpc = nullptr;
+ switch (Predicate) {
+ default: break;
+ case CmpInst::FCMP_OEQ: SETFOpc = &SETFOpcTable[0][0]; break;
+ case CmpInst::FCMP_UNE: SETFOpc = &SETFOpcTable[1][0]; break;
+ }
+
+ ResultReg = createResultReg(&X86::GR8RegClass);
+ if (SETFOpc) {
+ if (!X86FastEmitCompare(LHS, RHS, VT))
return false;
- unsigned NEReg = createResultReg(&X86::GR8RegClass);
- unsigned PReg = createResultReg(&X86::GR8RegClass);
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::SETNEr), NEReg);
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::SETPr), PReg);
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::OR8rr),ResultReg)
- .addReg(PReg).addReg(NEReg);
+ unsigned FlagReg1 = createResultReg(&X86::GR8RegClass);
+ unsigned FlagReg2 = createResultReg(&X86::GR8RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SETFOpc[0]),
+ FlagReg1);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SETFOpc[1]),
+ FlagReg2);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SETFOpc[2]),
+ ResultReg).addReg(FlagReg1).addReg(FlagReg2);
UpdateValueMap(I, ResultReg);
return true;
}
- case CmpInst::FCMP_OGT: SwapArgs = false; SetCCOpc = X86::SETAr; break;
- case CmpInst::FCMP_OGE: SwapArgs = false; SetCCOpc = X86::SETAEr; break;
- case CmpInst::FCMP_OLT: SwapArgs = true; SetCCOpc = X86::SETAr; break;
- case CmpInst::FCMP_OLE: SwapArgs = true; SetCCOpc = X86::SETAEr; break;
- case CmpInst::FCMP_ONE: SwapArgs = false; SetCCOpc = X86::SETNEr; break;
- case CmpInst::FCMP_ORD: SwapArgs = false; SetCCOpc = X86::SETNPr; break;
- case CmpInst::FCMP_UNO: SwapArgs = false; SetCCOpc = X86::SETPr; break;
- case CmpInst::FCMP_UEQ: SwapArgs = false; SetCCOpc = X86::SETEr; break;
- case CmpInst::FCMP_UGT: SwapArgs = true; SetCCOpc = X86::SETBr; break;
- case CmpInst::FCMP_UGE: SwapArgs = true; SetCCOpc = X86::SETBEr; break;
- case CmpInst::FCMP_ULT: SwapArgs = false; SetCCOpc = X86::SETBr; break;
- case CmpInst::FCMP_ULE: SwapArgs = false; SetCCOpc = X86::SETBEr; break;
-
- case CmpInst::ICMP_EQ: SwapArgs = false; SetCCOpc = X86::SETEr; break;
- case CmpInst::ICMP_NE: SwapArgs = false; SetCCOpc = X86::SETNEr; break;
- case CmpInst::ICMP_UGT: SwapArgs = false; SetCCOpc = X86::SETAr; break;
- case CmpInst::ICMP_UGE: SwapArgs = false; SetCCOpc = X86::SETAEr; break;
- case CmpInst::ICMP_ULT: SwapArgs = false; SetCCOpc = X86::SETBr; break;
- case CmpInst::ICMP_ULE: SwapArgs = false; SetCCOpc = X86::SETBEr; break;
- case CmpInst::ICMP_SGT: SwapArgs = false; SetCCOpc = X86::SETGr; break;
- case CmpInst::ICMP_SGE: SwapArgs = false; SetCCOpc = X86::SETGEr; break;
- case CmpInst::ICMP_SLT: SwapArgs = false; SetCCOpc = X86::SETLr; break;
- case CmpInst::ICMP_SLE: SwapArgs = false; SetCCOpc = X86::SETLEr; break;
- default:
- return false;
- }
- const Value *Op0 = CI->getOperand(0), *Op1 = CI->getOperand(1);
+ X86::CondCode CC;
+ bool SwapArgs;
+ std::tie(CC, SwapArgs) = getX86ConditionCode(Predicate);
+ assert(CC <= X86::LAST_VALID_COND && "Unexpected condition code.");
+ unsigned Opc = X86::getSETFromCond(CC);
+
if (SwapArgs)
- std::swap(Op0, Op1);
+ std::swap(LHS, RHS);
- // Emit a compare of Op0/Op1.
- if (!X86FastEmitCompare(Op0, Op1, VT))
+ // Emit a compare of LHS/RHS.
+ if (!X86FastEmitCompare(LHS, RHS, VT))
return false;
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SetCCOpc), ResultReg);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg);
UpdateValueMap(I, ResultReg);
return true;
}
@@ -1126,73 +1336,88 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) {
// Fold the common case of a conditional branch with a comparison
// in the same block (values defined on other blocks may not have
// initialized registers).
+ X86::CondCode CC;
if (const CmpInst *CI = dyn_cast<CmpInst>(BI->getCondition())) {
if (CI->hasOneUse() && CI->getParent() == I->getParent()) {
EVT VT = TLI.getValueType(CI->getOperand(0)->getType());
+ // Try to optimize or fold the cmp.
+ CmpInst::Predicate Predicate = optimizeCmpPredicate(CI);
+ switch (Predicate) {
+ default: break;
+ case CmpInst::FCMP_FALSE: FastEmitBranch(FalseMBB, DbgLoc); return true;
+ case CmpInst::FCMP_TRUE: FastEmitBranch(TrueMBB, DbgLoc); return true;
+ }
+
+ const Value *CmpLHS = CI->getOperand(0);
+ const Value *CmpRHS = CI->getOperand(1);
+
+ // The optimizer might have replaced fcmp oeq %x, %x with fcmp ord %x,
+ // 0.0.
+ // We don't have to materialize a zero constant for this case and can just
+ // use %x again on the RHS.
+ if (Predicate == CmpInst::FCMP_ORD || Predicate == CmpInst::FCMP_UNO) {
+ const auto *CmpRHSC = dyn_cast<ConstantFP>(CmpRHS);
+ if (CmpRHSC && CmpRHSC->isNullValue())
+ CmpRHS = CmpLHS;
+ }
+
// Try to take advantage of fallthrough opportunities.
- CmpInst::Predicate Predicate = CI->getPredicate();
if (FuncInfo.MBB->isLayoutSuccessor(TrueMBB)) {
std::swap(TrueMBB, FalseMBB);
Predicate = CmpInst::getInversePredicate(Predicate);
}
- bool SwapArgs; // false -> compare Op0, Op1. true -> compare Op1, Op0.
- unsigned BranchOpc; // Opcode to jump on, e.g. "X86::JA"
-
+ // FCMP_OEQ and FCMP_UNE cannot be expressed with a single flag/condition
+ // code check. Instead two branch instructions are required to check all
+ // the flags. First we change the predicate to a supported condition code,
+ // which will be the first branch. Later one we will emit the second
+ // branch.
+ bool NeedExtraBranch = false;
switch (Predicate) {
+ default: break;
case CmpInst::FCMP_OEQ:
- std::swap(TrueMBB, FalseMBB);
- Predicate = CmpInst::FCMP_UNE;
- // FALL THROUGH
- case CmpInst::FCMP_UNE: SwapArgs = false; BranchOpc = X86::JNE_4; break;
- case CmpInst::FCMP_OGT: SwapArgs = false; BranchOpc = X86::JA_4; break;
- case CmpInst::FCMP_OGE: SwapArgs = false; BranchOpc = X86::JAE_4; break;
- case CmpInst::FCMP_OLT: SwapArgs = true; BranchOpc = X86::JA_4; break;
- case CmpInst::FCMP_OLE: SwapArgs = true; BranchOpc = X86::JAE_4; break;
- case CmpInst::FCMP_ONE: SwapArgs = false; BranchOpc = X86::JNE_4; break;
- case CmpInst::FCMP_ORD: SwapArgs = false; BranchOpc = X86::JNP_4; break;
- case CmpInst::FCMP_UNO: SwapArgs = false; BranchOpc = X86::JP_4; break;
- case CmpInst::FCMP_UEQ: SwapArgs = false; BranchOpc = X86::JE_4; break;
- case CmpInst::FCMP_UGT: SwapArgs = true; BranchOpc = X86::JB_4; break;
- case CmpInst::FCMP_UGE: SwapArgs = true; BranchOpc = X86::JBE_4; break;
- case CmpInst::FCMP_ULT: SwapArgs = false; BranchOpc = X86::JB_4; break;
- case CmpInst::FCMP_ULE: SwapArgs = false; BranchOpc = X86::JBE_4; break;
-
- case CmpInst::ICMP_EQ: SwapArgs = false; BranchOpc = X86::JE_4; break;
- case CmpInst::ICMP_NE: SwapArgs = false; BranchOpc = X86::JNE_4; break;
- case CmpInst::ICMP_UGT: SwapArgs = false; BranchOpc = X86::JA_4; break;
- case CmpInst::ICMP_UGE: SwapArgs = false; BranchOpc = X86::JAE_4; break;
- case CmpInst::ICMP_ULT: SwapArgs = false; BranchOpc = X86::JB_4; break;
- case CmpInst::ICMP_ULE: SwapArgs = false; BranchOpc = X86::JBE_4; break;
- case CmpInst::ICMP_SGT: SwapArgs = false; BranchOpc = X86::JG_4; break;
- case CmpInst::ICMP_SGE: SwapArgs = false; BranchOpc = X86::JGE_4; break;
- case CmpInst::ICMP_SLT: SwapArgs = false; BranchOpc = X86::JL_4; break;
- case CmpInst::ICMP_SLE: SwapArgs = false; BranchOpc = X86::JLE_4; break;
- default:
- return false;
+ std::swap(TrueMBB, FalseMBB); // fall-through
+ case CmpInst::FCMP_UNE:
+ NeedExtraBranch = true;
+ Predicate = CmpInst::FCMP_ONE;
+ break;
}
- const Value *Op0 = CI->getOperand(0), *Op1 = CI->getOperand(1);
+ bool SwapArgs;
+ unsigned BranchOpc;
+ std::tie(CC, SwapArgs) = getX86ConditionCode(Predicate);
+ assert(CC <= X86::LAST_VALID_COND && "Unexpected condition code.");
+
+ BranchOpc = X86::GetCondBranchFromCond(CC);
if (SwapArgs)
- std::swap(Op0, Op1);
+ std::swap(CmpLHS, CmpRHS);
// Emit a compare of the LHS and RHS, setting the flags.
- if (!X86FastEmitCompare(Op0, Op1, VT))
+ if (!X86FastEmitCompare(CmpLHS, CmpRHS, VT))
return false;
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(BranchOpc))
.addMBB(TrueMBB);
- if (Predicate == CmpInst::FCMP_UNE) {
- // X86 requires a second branch to handle UNE (and OEQ,
- // which is mapped to UNE above).
+ // X86 requires a second branch to handle UNE (and OEQ, which is mapped
+ // to UNE above).
+ if (NeedExtraBranch) {
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::JP_4))
.addMBB(TrueMBB);
}
+ // Obtain the branch weight and add the TrueBB to the successor list.
+ uint32_t BranchWeight = 0;
+ if (FuncInfo.BPI)
+ BranchWeight = FuncInfo.BPI->getEdgeWeight(BI->getParent(),
+ TrueMBB->getBasicBlock());
+ FuncInfo.MBB->addSuccessor(TrueMBB, BranchWeight);
+
+ // Emits an unconditional branch to the FalseBB, obtains the branch
+ // weight, and adds it to the successor list.
FastEmitBranch(FalseMBB, DbgLoc);
- FuncInfo.MBB->addSuccessor(TrueMBB);
+
return true;
}
} else if (TruncInst *TI = dyn_cast<TruncInst>(BI->getCondition())) {
@@ -1224,10 +1449,32 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) {
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(JmpOpc))
.addMBB(TrueMBB);
FastEmitBranch(FalseMBB, DbgLoc);
- FuncInfo.MBB->addSuccessor(TrueMBB);
+ uint32_t BranchWeight = 0;
+ if (FuncInfo.BPI)
+ BranchWeight = FuncInfo.BPI->getEdgeWeight(BI->getParent(),
+ TrueMBB->getBasicBlock());
+ FuncInfo.MBB->addSuccessor(TrueMBB, BranchWeight);
return true;
}
}
+ } else if (foldX86XALUIntrinsic(CC, BI, BI->getCondition())) {
+ // Fake request the condition, otherwise the intrinsic might be completely
+ // optimized away.
+ unsigned TmpReg = getRegForValue(BI->getCondition());
+ if (TmpReg == 0)
+ return false;
+
+ unsigned BranchOpc = X86::GetCondBranchFromCond(CC);
+
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(BranchOpc))
+ .addMBB(TrueMBB);
+ FastEmitBranch(FalseMBB, DbgLoc);
+ uint32_t BranchWeight = 0;
+ if (FuncInfo.BPI)
+ BranchWeight = FuncInfo.BPI->getEdgeWeight(BI->getParent(),
+ TrueMBB->getBasicBlock());
+ FuncInfo.MBB->addSuccessor(TrueMBB, BranchWeight);
+ return true;
}
// Otherwise do a clumsy setcc and re-test it.
@@ -1241,7 +1488,11 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) {
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::JNE_4))
.addMBB(TrueMBB);
FastEmitBranch(FalseMBB, DbgLoc);
- FuncInfo.MBB->addSuccessor(TrueMBB);
+ uint32_t BranchWeight = 0;
+ if (FuncInfo.BPI)
+ BranchWeight = FuncInfo.BPI->getEdgeWeight(BI->getParent(),
+ TrueMBB->getBasicBlock());
+ FuncInfo.MBB->addSuccessor(TrueMBB, BranchWeight);
return true;
}
@@ -1478,50 +1729,319 @@ bool X86FastISel::X86SelectDivRem(const Instruction *I) {
return true;
}
-bool X86FastISel::X86SelectSelect(const Instruction *I) {
- MVT VT;
- if (!isTypeLegal(I->getType(), VT))
+/// \brief Emit a conditional move instruction (if the are supported) to lower
+/// the select.
+bool X86FastISel::X86FastEmitCMoveSelect(MVT RetVT, const Instruction *I) {
+ // Check if the subtarget supports these instructions.
+ if (!Subtarget->hasCMov())
return false;
- // We only use cmov here, if we don't have a cmov instruction bail.
- if (!Subtarget->hasCMov()) return false;
+ // FIXME: Add support for i8.
+ if (RetVT < MVT::i16 || RetVT > MVT::i64)
+ return false;
- unsigned Opc = 0;
- const TargetRegisterClass *RC = nullptr;
- if (VT == MVT::i16) {
- Opc = X86::CMOVE16rr;
- RC = &X86::GR16RegClass;
- } else if (VT == MVT::i32) {
- Opc = X86::CMOVE32rr;
- RC = &X86::GR32RegClass;
- } else if (VT == MVT::i64) {
- Opc = X86::CMOVE64rr;
- RC = &X86::GR64RegClass;
- } else {
+ const Value *Cond = I->getOperand(0);
+ const TargetRegisterClass *RC = TLI.getRegClassFor(RetVT);
+ bool NeedTest = true;
+ X86::CondCode CC = X86::COND_NE;
+
+ // Optimize conditions coming from a compare if both instructions are in the
+ // same basic block (values defined in other basic blocks may not have
+ // initialized registers).
+ const auto *CI = dyn_cast<CmpInst>(Cond);
+ if (CI && (CI->getParent() == I->getParent())) {
+ CmpInst::Predicate Predicate = optimizeCmpPredicate(CI);
+
+ // FCMP_OEQ and FCMP_UNE cannot be checked with a single instruction.
+ static unsigned SETFOpcTable[2][3] = {
+ { X86::SETNPr, X86::SETEr , X86::TEST8rr },
+ { X86::SETPr, X86::SETNEr, X86::OR8rr }
+ };
+ unsigned *SETFOpc = nullptr;
+ switch (Predicate) {
+ default: break;
+ case CmpInst::FCMP_OEQ:
+ SETFOpc = &SETFOpcTable[0][0];
+ Predicate = CmpInst::ICMP_NE;
+ break;
+ case CmpInst::FCMP_UNE:
+ SETFOpc = &SETFOpcTable[1][0];
+ Predicate = CmpInst::ICMP_NE;
+ break;
+ }
+
+ bool NeedSwap;
+ std::tie(CC, NeedSwap) = getX86ConditionCode(Predicate);
+ assert(CC <= X86::LAST_VALID_COND && "Unexpected condition code.");
+
+ const Value *CmpLHS = CI->getOperand(0);
+ const Value *CmpRHS = CI->getOperand(1);
+ if (NeedSwap)
+ std::swap(CmpLHS, CmpRHS);
+
+ EVT CmpVT = TLI.getValueType(CmpLHS->getType());
+ // Emit a compare of the LHS and RHS, setting the flags.
+ if (!X86FastEmitCompare(CmpLHS, CmpRHS, CmpVT))
+ return false;
+
+ if (SETFOpc) {
+ unsigned FlagReg1 = createResultReg(&X86::GR8RegClass);
+ unsigned FlagReg2 = createResultReg(&X86::GR8RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SETFOpc[0]),
+ FlagReg1);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SETFOpc[1]),
+ FlagReg2);
+ auto const &II = TII.get(SETFOpc[2]);
+ if (II.getNumDefs()) {
+ unsigned TmpReg = createResultReg(&X86::GR8RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, TmpReg)
+ .addReg(FlagReg2).addReg(FlagReg1);
+ } else {
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II)
+ .addReg(FlagReg2).addReg(FlagReg1);
+ }
+ }
+ NeedTest = false;
+ } else if (foldX86XALUIntrinsic(CC, I, Cond)) {
+ // Fake request the condition, otherwise the intrinsic might be completely
+ // optimized away.
+ unsigned TmpReg = getRegForValue(Cond);
+ if (TmpReg == 0)
+ return false;
+
+ NeedTest = false;
+ }
+
+ if (NeedTest) {
+ // Selects operate on i1, however, CondReg is 8 bits width and may contain
+ // garbage. Indeed, only the less significant bit is supposed to be
+ // accurate. If we read more than the lsb, we may see non-zero values
+ // whereas lsb is zero. Therefore, we have to truncate Op0Reg to i1 for
+ // the select. This is achieved by performing TEST against 1.
+ unsigned CondReg = getRegForValue(Cond);
+ if (CondReg == 0)
+ return false;
+ bool CondIsKill = hasTrivialKill(Cond);
+
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::TEST8ri))
+ .addReg(CondReg, getKillRegState(CondIsKill)).addImm(1);
+ }
+
+ const Value *LHS = I->getOperand(1);
+ const Value *RHS = I->getOperand(2);
+
+ unsigned RHSReg = getRegForValue(RHS);
+ bool RHSIsKill = hasTrivialKill(RHS);
+
+ unsigned LHSReg = getRegForValue(LHS);
+ bool LHSIsKill = hasTrivialKill(LHS);
+
+ if (!LHSReg || !RHSReg)
+ return false;
+
+ unsigned Opc = X86::getCMovFromCond(CC, RC->getSize());
+ unsigned ResultReg = FastEmitInst_rr(Opc, RC, RHSReg, RHSIsKill,
+ LHSReg, LHSIsKill);
+ UpdateValueMap(I, ResultReg);
+ return true;
+}
+
+/// \brief Emit SSE instructions to lower the select.
+///
+/// Try to use SSE1/SSE2 instructions to simulate a select without branches.
+/// This lowers fp selects into a CMP/AND/ANDN/OR sequence when the necessary
+/// SSE instructions are available.
+bool X86FastISel::X86FastEmitSSESelect(MVT RetVT, const Instruction *I) {
+ // Optimize conditions coming from a compare if both instructions are in the
+ // same basic block (values defined in other basic blocks may not have
+ // initialized registers).
+ const auto *CI = dyn_cast<FCmpInst>(I->getOperand(0));
+ if (!CI || (CI->getParent() != I->getParent()))
return false;
+
+ if (I->getType() != CI->getOperand(0)->getType() ||
+ !((Subtarget->hasSSE1() && RetVT == MVT::f32) ||
+ (Subtarget->hasSSE2() && RetVT == MVT::f64) ))
+ return false;
+
+ const Value *CmpLHS = CI->getOperand(0);
+ const Value *CmpRHS = CI->getOperand(1);
+ CmpInst::Predicate Predicate = optimizeCmpPredicate(CI);
+
+ // The optimizer might have replaced fcmp oeq %x, %x with fcmp ord %x, 0.0.
+ // We don't have to materialize a zero constant for this case and can just use
+ // %x again on the RHS.
+ if (Predicate == CmpInst::FCMP_ORD || Predicate == CmpInst::FCMP_UNO) {
+ const auto *CmpRHSC = dyn_cast<ConstantFP>(CmpRHS);
+ if (CmpRHSC && CmpRHSC->isNullValue())
+ CmpRHS = CmpLHS;
}
- unsigned Op0Reg = getRegForValue(I->getOperand(0));
- if (Op0Reg == 0) return false;
- unsigned Op1Reg = getRegForValue(I->getOperand(1));
- if (Op1Reg == 0) return false;
- unsigned Op2Reg = getRegForValue(I->getOperand(2));
- if (Op2Reg == 0) return false;
-
- // Selects operate on i1, however, Op0Reg is 8 bits width and may contain
- // garbage. Indeed, only the less significant bit is supposed to be accurate.
- // If we read more than the lsb, we may see non-zero values whereas lsb
- // is zero. Therefore, we have to truncate Op0Reg to i1 for the select.
- // This is achieved by performing TEST against 1.
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::TEST8ri))
- .addReg(Op0Reg).addImm(1);
- unsigned ResultReg = createResultReg(RC);
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg)
- .addReg(Op1Reg).addReg(Op2Reg);
+ unsigned CC;
+ bool NeedSwap;
+ std::tie(CC, NeedSwap) = getX86SSEConditionCode(Predicate);
+ if (CC > 7)
+ return false;
+
+ if (NeedSwap)
+ std::swap(CmpLHS, CmpRHS);
+
+ static unsigned OpcTable[2][2][4] = {
+ { { X86::CMPSSrr, X86::FsANDPSrr, X86::FsANDNPSrr, X86::FsORPSrr },
+ { X86::VCMPSSrr, X86::VFsANDPSrr, X86::VFsANDNPSrr, X86::VFsORPSrr } },
+ { { X86::CMPSDrr, X86::FsANDPDrr, X86::FsANDNPDrr, X86::FsORPDrr },
+ { X86::VCMPSDrr, X86::VFsANDPDrr, X86::VFsANDNPDrr, X86::VFsORPDrr } }
+ };
+
+ bool HasAVX = Subtarget->hasAVX();
+ unsigned *Opc = nullptr;
+ switch (RetVT.SimpleTy) {
+ default: return false;
+ case MVT::f32: Opc = &OpcTable[0][HasAVX][0]; break;
+ case MVT::f64: Opc = &OpcTable[1][HasAVX][0]; break;
+ }
+
+ const Value *LHS = I->getOperand(1);
+ const Value *RHS = I->getOperand(2);
+
+ unsigned LHSReg = getRegForValue(LHS);
+ bool LHSIsKill = hasTrivialKill(LHS);
+
+ unsigned RHSReg = getRegForValue(RHS);
+ bool RHSIsKill = hasTrivialKill(RHS);
+
+ unsigned CmpLHSReg = getRegForValue(CmpLHS);
+ bool CmpLHSIsKill = hasTrivialKill(CmpLHS);
+
+ unsigned CmpRHSReg = getRegForValue(CmpRHS);
+ bool CmpRHSIsKill = hasTrivialKill(CmpRHS);
+
+ if (!LHSReg || !RHSReg || !CmpLHS || !CmpRHS)
+ return false;
+
+ const TargetRegisterClass *RC = TLI.getRegClassFor(RetVT);
+ unsigned CmpReg = FastEmitInst_rri(Opc[0], RC, CmpLHSReg, CmpLHSIsKill,
+ CmpRHSReg, CmpRHSIsKill, CC);
+ unsigned AndReg = FastEmitInst_rr(Opc[1], RC, CmpReg, /*IsKill=*/false,
+ LHSReg, LHSIsKill);
+ unsigned AndNReg = FastEmitInst_rr(Opc[2], RC, CmpReg, /*IsKill=*/true,
+ RHSReg, RHSIsKill);
+ unsigned ResultReg = FastEmitInst_rr(Opc[3], RC, AndNReg, /*IsKill=*/true,
+ AndReg, /*IsKill=*/true);
+ UpdateValueMap(I, ResultReg);
+ return true;
+}
+
+bool X86FastISel::X86FastEmitPseudoSelect(MVT RetVT, const Instruction *I) {
+ // These are pseudo CMOV instructions and will be later expanded into control-
+ // flow.
+ unsigned Opc;
+ switch (RetVT.SimpleTy) {
+ default: return false;
+ case MVT::i8: Opc = X86::CMOV_GR8; break;
+ case MVT::i16: Opc = X86::CMOV_GR16; break;
+ case MVT::i32: Opc = X86::CMOV_GR32; break;
+ case MVT::f32: Opc = X86::CMOV_FR32; break;
+ case MVT::f64: Opc = X86::CMOV_FR64; break;
+ }
+
+ const Value *Cond = I->getOperand(0);
+ X86::CondCode CC = X86::COND_NE;
+
+ // Optimize conditions coming from a compare if both instructions are in the
+ // same basic block (values defined in other basic blocks may not have
+ // initialized registers).
+ const auto *CI = dyn_cast<CmpInst>(Cond);
+ if (CI && (CI->getParent() == I->getParent())) {
+ bool NeedSwap;
+ std::tie(CC, NeedSwap) = getX86ConditionCode(CI->getPredicate());
+ if (CC > X86::LAST_VALID_COND)
+ return false;
+
+ const Value *CmpLHS = CI->getOperand(0);
+ const Value *CmpRHS = CI->getOperand(1);
+
+ if (NeedSwap)
+ std::swap(CmpLHS, CmpRHS);
+
+ EVT CmpVT = TLI.getValueType(CmpLHS->getType());
+ if (!X86FastEmitCompare(CmpLHS, CmpRHS, CmpVT))
+ return false;
+ } else {
+ unsigned CondReg = getRegForValue(Cond);
+ if (CondReg == 0)
+ return false;
+ bool CondIsKill = hasTrivialKill(Cond);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::TEST8ri))
+ .addReg(CondReg, getKillRegState(CondIsKill)).addImm(1);
+ }
+
+ const Value *LHS = I->getOperand(1);
+ const Value *RHS = I->getOperand(2);
+
+ unsigned LHSReg = getRegForValue(LHS);
+ bool LHSIsKill = hasTrivialKill(LHS);
+
+ unsigned RHSReg = getRegForValue(RHS);
+ bool RHSIsKill = hasTrivialKill(RHS);
+
+ if (!LHSReg || !RHSReg)
+ return false;
+
+ const TargetRegisterClass *RC = TLI.getRegClassFor(RetVT);
+
+ unsigned ResultReg =
+ FastEmitInst_rri(Opc, RC, RHSReg, RHSIsKill, LHSReg, LHSIsKill, CC);
UpdateValueMap(I, ResultReg);
return true;
}
+bool X86FastISel::X86SelectSelect(const Instruction *I) {
+ MVT RetVT;
+ if (!isTypeLegal(I->getType(), RetVT))
+ return false;
+
+ // Check if we can fold the select.
+ if (const auto *CI = dyn_cast<CmpInst>(I->getOperand(0))) {
+ CmpInst::Predicate Predicate = optimizeCmpPredicate(CI);
+ const Value *Opnd = nullptr;
+ switch (Predicate) {
+ default: break;
+ case CmpInst::FCMP_FALSE: Opnd = I->getOperand(2); break;
+ case CmpInst::FCMP_TRUE: Opnd = I->getOperand(1); break;
+ }
+ // No need for a select anymore - this is an unconditional move.
+ if (Opnd) {
+ unsigned OpReg = getRegForValue(Opnd);
+ if (OpReg == 0)
+ return false;
+ bool OpIsKill = hasTrivialKill(Opnd);
+ const TargetRegisterClass *RC = TLI.getRegClassFor(RetVT);
+ unsigned ResultReg = createResultReg(RC);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::COPY), ResultReg)
+ .addReg(OpReg, getKillRegState(OpIsKill));
+ UpdateValueMap(I, ResultReg);
+ return true;
+ }
+ }
+
+ // First try to use real conditional move instructions.
+ if (X86FastEmitCMoveSelect(RetVT, I))
+ return true;
+
+ // Try to use a sequence of SSE instructions to simulate a conditional move.
+ if (X86FastEmitSSESelect(RetVT, I))
+ return true;
+
+ // Fall-back to pseudo conditional move instructions, which will be later
+ // converted to control-flow.
+ if (X86FastEmitPseudoSelect(RetVT, I))
+ return true;
+
+ return false;
+}
+
bool X86FastISel::X86SelectFPExt(const Instruction *I) {
// fpext from float to double.
if (X86ScalarSSEf64 &&
@@ -1633,8 +2153,8 @@ bool X86FastISel::TryEmitSmallMemcpy(X86AddressMode DestAM,
}
unsigned Reg;
- bool RV = X86FastEmitLoad(VT, SrcAM, Reg);
- RV &= X86FastEmitStore(VT, Reg, DestAM);
+ bool RV = X86FastEmitLoad(VT, SrcAM, nullptr, Reg);
+ RV &= X86FastEmitStore(VT, Reg, /*Kill=*/true, DestAM);
assert(RV && "Failed to emit load or store??");
unsigned Size = VT.getSizeInBits()/8;
@@ -1646,10 +2166,74 @@ bool X86FastISel::TryEmitSmallMemcpy(X86AddressMode DestAM,
return true;
}
+static bool isCommutativeIntrinsic(IntrinsicInst const &I) {
+ switch (I.getIntrinsicID()) {
+ case Intrinsic::sadd_with_overflow:
+ case Intrinsic::uadd_with_overflow:
+ case Intrinsic::smul_with_overflow:
+ case Intrinsic::umul_with_overflow:
+ return true;
+ default:
+ return false;
+ }
+}
+
bool X86FastISel::X86VisitIntrinsicCall(const IntrinsicInst &I) {
// FIXME: Handle more intrinsics.
switch (I.getIntrinsicID()) {
default: return false;
+ case Intrinsic::frameaddress: {
+ Type *RetTy = I.getCalledFunction()->getReturnType();
+
+ MVT VT;
+ if (!isTypeLegal(RetTy, VT))
+ return false;
+
+ unsigned Opc;
+ const TargetRegisterClass *RC = nullptr;
+
+ switch (VT.SimpleTy) {
+ default: llvm_unreachable("Invalid result type for frameaddress.");
+ case MVT::i32: Opc = X86::MOV32rm; RC = &X86::GR32RegClass; break;
+ case MVT::i64: Opc = X86::MOV64rm; RC = &X86::GR64RegClass; break;
+ }
+
+ // This needs to be set before we call getFrameRegister, otherwise we get
+ // the wrong frame register.
+ MachineFrameInfo *MFI = FuncInfo.MF->getFrameInfo();
+ MFI->setFrameAddressIsTaken(true);
+
+ const X86RegisterInfo *RegInfo =
+ static_cast<const X86RegisterInfo*>(TM.getRegisterInfo());
+ unsigned FrameReg = RegInfo->getFrameRegister(*(FuncInfo.MF));
+ assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
+ (FrameReg == X86::EBP && VT == MVT::i32)) &&
+ "Invalid Frame Register!");
+
+ // Always make a copy of the frame register to to a vreg first, so that we
+ // never directly reference the frame register (the TwoAddressInstruction-
+ // Pass doesn't like that).
+ unsigned SrcReg = createResultReg(RC);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::COPY), SrcReg).addReg(FrameReg);
+
+ // Now recursively load from the frame address.
+ // movq (%rbp), %rax
+ // movq (%rax), %rax
+ // movq (%rax), %rax
+ // ...
+ unsigned DestReg;
+ unsigned Depth = cast<ConstantInt>(I.getOperand(0))->getZExtValue();
+ while (Depth--) {
+ DestReg = createResultReg(RC);
+ addDirectMem(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(Opc), DestReg), SrcReg);
+ SrcReg = DestReg;
+ }
+
+ UpdateValueMap(&I, SrcReg);
+ return true;
+ }
case Intrinsic::memcpy: {
const MemCpyInst &MCI = cast<MemCpyInst>(I);
// Don't handle volatile or variable length memcpys.
@@ -1726,52 +2310,233 @@ bool X86FastISel::X86VisitIntrinsicCall(const IntrinsicInst &I) {
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::TRAP));
return true;
}
- case Intrinsic::sadd_with_overflow:
- case Intrinsic::uadd_with_overflow: {
- // FIXME: Should fold immediates.
+ case Intrinsic::sqrt: {
+ if (!Subtarget->hasSSE1())
+ return false;
- // Replace "add with overflow" intrinsics with an "add" instruction followed
- // by a seto/setc instruction.
- const Function *Callee = I.getCalledFunction();
- Type *RetTy =
- cast<StructType>(Callee->getReturnType())->getTypeAtIndex(unsigned(0));
+ Type *RetTy = I.getCalledFunction()->getReturnType();
MVT VT;
if (!isTypeLegal(RetTy, VT))
return false;
- const Value *Op1 = I.getArgOperand(0);
- const Value *Op2 = I.getArgOperand(1);
- unsigned Reg1 = getRegForValue(Op1);
- unsigned Reg2 = getRegForValue(Op2);
+ // Unfortunately we can't use FastEmit_r, because the AVX version of FSQRT
+ // is not generated by FastISel yet.
+ // FIXME: Update this code once tablegen can handle it.
+ static const unsigned SqrtOpc[2][2] = {
+ {X86::SQRTSSr, X86::VSQRTSSr},
+ {X86::SQRTSDr, X86::VSQRTSDr}
+ };
+ bool HasAVX = Subtarget->hasAVX();
+ unsigned Opc;
+ const TargetRegisterClass *RC;
+ switch (VT.SimpleTy) {
+ default: return false;
+ case MVT::f32: Opc = SqrtOpc[0][HasAVX]; RC = &X86::FR32RegClass; break;
+ case MVT::f64: Opc = SqrtOpc[1][HasAVX]; RC = &X86::FR64RegClass; break;
+ }
+
+ const Value *SrcVal = I.getArgOperand(0);
+ unsigned SrcReg = getRegForValue(SrcVal);
- if (Reg1 == 0 || Reg2 == 0)
- // FIXME: Handle values *not* in registers.
+ if (SrcReg == 0)
return false;
- unsigned OpC = 0;
- if (VT == MVT::i32)
- OpC = X86::ADD32rr;
- else if (VT == MVT::i64)
- OpC = X86::ADD64rr;
- else
+ unsigned ImplicitDefReg = 0;
+ if (HasAVX) {
+ ImplicitDefReg = createResultReg(RC);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::IMPLICIT_DEF), ImplicitDefReg);
+ }
+
+ unsigned ResultReg = createResultReg(RC);
+ MachineInstrBuilder MIB;
+ MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc),
+ ResultReg);
+
+ if (ImplicitDefReg)
+ MIB.addReg(ImplicitDefReg);
+
+ MIB.addReg(SrcReg);
+
+ UpdateValueMap(&I, ResultReg);
+ return true;
+ }
+ case Intrinsic::sadd_with_overflow:
+ case Intrinsic::uadd_with_overflow:
+ case Intrinsic::ssub_with_overflow:
+ case Intrinsic::usub_with_overflow:
+ case Intrinsic::smul_with_overflow:
+ case Intrinsic::umul_with_overflow: {
+ // This implements the basic lowering of the xalu with overflow intrinsics
+ // into add/sub/mul followed by either seto or setb.
+ const Function *Callee = I.getCalledFunction();
+ auto *Ty = cast<StructType>(Callee->getReturnType());
+ Type *RetTy = Ty->getTypeAtIndex(0U);
+ Type *CondTy = Ty->getTypeAtIndex(1);
+
+ MVT VT;
+ if (!isTypeLegal(RetTy, VT))
+ return false;
+
+ if (VT < MVT::i8 || VT > MVT::i64)
+ return false;
+
+ const Value *LHS = I.getArgOperand(0);
+ const Value *RHS = I.getArgOperand(1);
+
+ // Canonicalize immediate to the RHS.
+ if (isa<ConstantInt>(LHS) && !isa<ConstantInt>(RHS) &&
+ isCommutativeIntrinsic(I))
+ std::swap(LHS, RHS);
+
+ unsigned BaseOpc, CondOpc;
+ switch (I.getIntrinsicID()) {
+ default: llvm_unreachable("Unexpected intrinsic!");
+ case Intrinsic::sadd_with_overflow:
+ BaseOpc = ISD::ADD; CondOpc = X86::SETOr; break;
+ case Intrinsic::uadd_with_overflow:
+ BaseOpc = ISD::ADD; CondOpc = X86::SETBr; break;
+ case Intrinsic::ssub_with_overflow:
+ BaseOpc = ISD::SUB; CondOpc = X86::SETOr; break;
+ case Intrinsic::usub_with_overflow:
+ BaseOpc = ISD::SUB; CondOpc = X86::SETBr; break;
+ case Intrinsic::smul_with_overflow:
+ BaseOpc = X86ISD::SMUL; CondOpc = X86::SETOr; break;
+ case Intrinsic::umul_with_overflow:
+ BaseOpc = X86ISD::UMUL; CondOpc = X86::SETOr; break;
+ }
+
+ unsigned LHSReg = getRegForValue(LHS);
+ if (LHSReg == 0)
return false;
+ bool LHSIsKill = hasTrivialKill(LHS);
- // The call to CreateRegs builds two sequential registers, to store the
- // both the returned values.
- unsigned ResultReg = FuncInfo.CreateRegs(I.getType());
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(OpC), ResultReg)
- .addReg(Reg1).addReg(Reg2);
+ unsigned ResultReg = 0;
+ // Check if we have an immediate version.
+ if (auto const *C = dyn_cast<ConstantInt>(RHS)) {
+ ResultReg = FastEmit_ri(VT, VT, BaseOpc, LHSReg, LHSIsKill,
+ C->getZExtValue());
+ }
- unsigned Opc = X86::SETBr;
- if (I.getIntrinsicID() == Intrinsic::sadd_with_overflow)
- Opc = X86::SETOr;
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc),
- ResultReg + 1);
+ unsigned RHSReg;
+ bool RHSIsKill;
+ if (!ResultReg) {
+ RHSReg = getRegForValue(RHS);
+ if (RHSReg == 0)
+ return false;
+ RHSIsKill = hasTrivialKill(RHS);
+ ResultReg = FastEmit_rr(VT, VT, BaseOpc, LHSReg, LHSIsKill, RHSReg,
+ RHSIsKill);
+ }
+
+ // FastISel doesn't have a pattern for all X86::MUL*r and X86::IMUL*r. Emit
+ // it manually.
+ if (BaseOpc == X86ISD::UMUL && !ResultReg) {
+ static const unsigned MULOpc[] =
+ { X86::MUL8r, X86::MUL16r, X86::MUL32r, X86::MUL64r };
+ static const unsigned Reg[] = { X86::AL, X86::AX, X86::EAX, X86::RAX };
+ // First copy the first operand into RAX, which is an implicit input to
+ // the X86::MUL*r instruction.
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::COPY), Reg[VT.SimpleTy-MVT::i8])
+ .addReg(LHSReg, getKillRegState(LHSIsKill));
+ ResultReg = FastEmitInst_r(MULOpc[VT.SimpleTy-MVT::i8],
+ TLI.getRegClassFor(VT), RHSReg, RHSIsKill);
+ } else if (BaseOpc == X86ISD::SMUL && !ResultReg) {
+ static const unsigned MULOpc[] =
+ { X86::IMUL8r, X86::IMUL16rr, X86::IMUL32rr, X86::IMUL64rr };
+ if (VT == MVT::i8) {
+ // Copy the first operand into AL, which is an implicit input to the
+ // X86::IMUL8r instruction.
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::COPY), X86::AL)
+ .addReg(LHSReg, getKillRegState(LHSIsKill));
+ ResultReg = FastEmitInst_r(MULOpc[0], TLI.getRegClassFor(VT), RHSReg,
+ RHSIsKill);
+ } else
+ ResultReg = FastEmitInst_rr(MULOpc[VT.SimpleTy-MVT::i8],
+ TLI.getRegClassFor(VT), LHSReg, LHSIsKill,
+ RHSReg, RHSIsKill);
+ }
+
+ if (!ResultReg)
+ return false;
+
+ unsigned ResultReg2 = FuncInfo.CreateRegs(CondTy);
+ assert((ResultReg+1) == ResultReg2 && "Nonconsecutive result registers.");
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CondOpc),
+ ResultReg2);
UpdateValueMap(&I, ResultReg, 2);
return true;
}
+ case Intrinsic::x86_sse_cvttss2si:
+ case Intrinsic::x86_sse_cvttss2si64:
+ case Intrinsic::x86_sse2_cvttsd2si:
+ case Intrinsic::x86_sse2_cvttsd2si64: {
+ bool IsInputDouble;
+ switch (I.getIntrinsicID()) {
+ default: llvm_unreachable("Unexpected intrinsic.");
+ case Intrinsic::x86_sse_cvttss2si:
+ case Intrinsic::x86_sse_cvttss2si64:
+ if (!Subtarget->hasSSE1())
+ return false;
+ IsInputDouble = false;
+ break;
+ case Intrinsic::x86_sse2_cvttsd2si:
+ case Intrinsic::x86_sse2_cvttsd2si64:
+ if (!Subtarget->hasSSE2())
+ return false;
+ IsInputDouble = true;
+ break;
+ }
+
+ Type *RetTy = I.getCalledFunction()->getReturnType();
+ MVT VT;
+ if (!isTypeLegal(RetTy, VT))
+ return false;
+
+ static const unsigned CvtOpc[2][2][2] = {
+ { { X86::CVTTSS2SIrr, X86::VCVTTSS2SIrr },
+ { X86::CVTTSS2SI64rr, X86::VCVTTSS2SI64rr } },
+ { { X86::CVTTSD2SIrr, X86::VCVTTSD2SIrr },
+ { X86::CVTTSD2SI64rr, X86::VCVTTSD2SI64rr } }
+ };
+ bool HasAVX = Subtarget->hasAVX();
+ unsigned Opc;
+ switch (VT.SimpleTy) {
+ default: llvm_unreachable("Unexpected result type.");
+ case MVT::i32: Opc = CvtOpc[IsInputDouble][0][HasAVX]; break;
+ case MVT::i64: Opc = CvtOpc[IsInputDouble][1][HasAVX]; break;
+ }
+
+ // Check if we can fold insertelement instructions into the convert.
+ const Value *Op = I.getArgOperand(0);
+ while (auto *IE = dyn_cast<InsertElementInst>(Op)) {
+ const Value *Index = IE->getOperand(2);
+ if (!isa<ConstantInt>(Index))
+ break;
+ unsigned Idx = cast<ConstantInt>(Index)->getZExtValue();
+
+ if (Idx == 0) {
+ Op = IE->getOperand(1);
+ break;
+ }
+ Op = IE->getOperand(0);
+ }
+
+ unsigned Reg = getRegForValue(Op);
+ if (Reg == 0)
+ return false;
+
+ unsigned ResultReg = createResultReg(TLI.getRegClassFor(VT));
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg)
+ .addReg(Reg);
+
+ UpdateValueMap(&I, ResultReg);
+ return true;
+ }
}
}
@@ -1794,31 +2559,43 @@ bool X86FastISel::FastLowerArguments() {
return false;
// Only handle simple cases. i.e. Up to 6 i32/i64 scalar arguments.
- unsigned Idx = 1;
- for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end();
- I != E; ++I, ++Idx) {
- if (Idx > 6)
- return false;
-
+ unsigned GPRCnt = 0;
+ unsigned FPRCnt = 0;
+ unsigned Idx = 0;
+ for (auto const &Arg : F->args()) {
+ // The first argument is at index 1.
+ ++Idx;
if (F->getAttributes().hasAttribute(Idx, Attribute::ByVal) ||
F->getAttributes().hasAttribute(Idx, Attribute::InReg) ||
F->getAttributes().hasAttribute(Idx, Attribute::StructRet) ||
F->getAttributes().hasAttribute(Idx, Attribute::Nest))
return false;
- Type *ArgTy = I->getType();
+ Type *ArgTy = Arg.getType();
if (ArgTy->isStructTy() || ArgTy->isArrayTy() || ArgTy->isVectorTy())
return false;
EVT ArgVT = TLI.getValueType(ArgTy);
if (!ArgVT.isSimple()) return false;
switch (ArgVT.getSimpleVT().SimpleTy) {
+ default: return false;
case MVT::i32:
case MVT::i64:
+ ++GPRCnt;
+ break;
+ case MVT::f32:
+ case MVT::f64:
+ if (!Subtarget->hasSSE1())
+ return false;
+ ++FPRCnt;
break;
- default:
- return false;
}
+
+ if (GPRCnt > 6)
+ return false;
+
+ if (FPRCnt > 8)
+ return false;
}
static const MCPhysReg GPR32ArgRegs[] = {
@@ -1827,24 +2604,33 @@ bool X86FastISel::FastLowerArguments() {
static const MCPhysReg GPR64ArgRegs[] = {
X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8 , X86::R9
};
+ static const MCPhysReg XMMArgRegs[] = {
+ X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
+ X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
+ };
- Idx = 0;
- const TargetRegisterClass *RC32 = TLI.getRegClassFor(MVT::i32);
- const TargetRegisterClass *RC64 = TLI.getRegClassFor(MVT::i64);
- for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end();
- I != E; ++I, ++Idx) {
- bool is32Bit = TLI.getValueType(I->getType()) == MVT::i32;
- const TargetRegisterClass *RC = is32Bit ? RC32 : RC64;
- unsigned SrcReg = is32Bit ? GPR32ArgRegs[Idx] : GPR64ArgRegs[Idx];
+ unsigned GPRIdx = 0;
+ unsigned FPRIdx = 0;
+ for (auto const &Arg : F->args()) {
+ MVT VT = TLI.getSimpleValueType(Arg.getType());
+ const TargetRegisterClass *RC = TLI.getRegClassFor(VT);
+ unsigned SrcReg;
+ switch (VT.SimpleTy) {
+ default: llvm_unreachable("Unexpected value type.");
+ case MVT::i32: SrcReg = GPR32ArgRegs[GPRIdx++]; break;
+ case MVT::i64: SrcReg = GPR64ArgRegs[GPRIdx++]; break;
+ case MVT::f32: // fall-through
+ case MVT::f64: SrcReg = XMMArgRegs[FPRIdx++]; break;
+ }
unsigned DstReg = FuncInfo.MF->addLiveIn(SrcReg, RC);
// FIXME: Unfortunately it's necessary to emit a copy from the livein copy.
// Without this, EmitLiveInCopies may eliminate the livein if its only
// use is a bitcast (which isn't turned into an instruction).
unsigned ResultReg = createResultReg(RC);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
- TII.get(TargetOpcode::COPY),
- ResultReg).addReg(DstReg, getKillRegState(true));
- UpdateValueMap(I, ResultReg);
+ TII.get(TargetOpcode::COPY), ResultReg)
+ .addReg(DstReg, getKillRegState(true));
+ UpdateValueMap(&Arg, ResultReg);
}
return true;
}
@@ -2147,7 +2933,7 @@ bool X86FastISel::DoSelectCall(const Instruction *I, const char *MemIntName) {
if (!X86FastEmitStore(ArgVT, ArgVal, AM))
return false;
} else {
- if (!X86FastEmitStore(ArgVT, Arg, AM))
+ if (!X86FastEmitStore(ArgVT, Arg, /*ValIsKill=*/false, AM))
return false;
}
}
@@ -2430,7 +3216,7 @@ unsigned X86FastISel::TargetMaterializeConstant(const Constant *C) {
return 0;
}
- // Materialize addresses with LEA instructions.
+ // Materialize addresses with LEA/MOV instructions.
if (isa<GlobalValue>(C)) {
X86AddressMode AM;
if (X86SelectAddress(C, AM)) {
@@ -2440,10 +3226,19 @@ unsigned X86FastISel::TargetMaterializeConstant(const Constant *C) {
AM.IndexReg == 0 && AM.Disp == 0 && AM.GV == nullptr)
return AM.Base.Reg;
- Opc = TLI.getPointerTy() == MVT::i32 ? X86::LEA32r : X86::LEA64r;
unsigned ResultReg = createResultReg(RC);
- addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ if (TM.getRelocationModel() == Reloc::Static &&
+ TLI.getPointerTy() == MVT::i64) {
+ // The displacement code be more than 32 bits away so we need to use
+ // an instruction with a 64 bit immediate
+ Opc = X86::MOV64ri;
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(Opc), ResultReg).addGlobalAddress(cast<GlobalValue>(C));
+ } else {
+ Opc = TLI.getPointerTy() == MVT::i32 ? X86::LEA32r : X86::LEA64r;
+ addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(Opc), ResultReg), AM);
+ }
return ResultReg;
}
return 0;
@@ -2544,8 +3339,9 @@ unsigned X86FastISel::TargetMaterializeFloatZero(const ConstantFP *CF) {
bool X86FastISel::tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo,
const LoadInst *LI) {
+ const Value *Ptr = LI->getPointerOperand();
X86AddressMode AM;
- if (!X86SelectAddress(LI->getOperand(0), AM))
+ if (!X86SelectAddress(Ptr, AM))
return false;
const X86InstrInfo &XII = (const X86InstrInfo&)TII;
@@ -2553,13 +3349,18 @@ bool X86FastISel::tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo,
unsigned Size = DL.getTypeAllocSize(LI->getType());
unsigned Alignment = LI->getAlignment();
+ if (Alignment == 0) // Ensure that codegen never sees alignment 0
+ Alignment = DL.getABITypeAlignment(LI->getType());
+
SmallVector<MachineOperand, 8> AddrOps;
AM.getFullAddress(AddrOps);
MachineInstr *Result =
XII.foldMemoryOperandImpl(*FuncInfo.MF, MI, OpNo, AddrOps, Size, Alignment);
- if (!Result) return false;
+ if (!Result)
+ return false;
+ Result->addMemOperand(*FuncInfo.MF, createMachineMemOperandFor(LI));
FuncInfo.MBB->insert(FuncInfo.InsertPt, Result);
MI->eraseFromParent();
return true;
diff --git a/lib/Target/X86/X86FixupLEAs.cpp b/lib/Target/X86/X86FixupLEAs.cpp
index 6c5b86f..4be766a 100644
--- a/lib/Target/X86/X86FixupLEAs.cpp
+++ b/lib/Target/X86/X86FixupLEAs.cpp
@@ -32,86 +32,89 @@ using namespace llvm;
STATISTIC(NumLEAs, "Number of LEA instructions created");
namespace {
- class FixupLEAPass : public MachineFunctionPass {
- enum RegUsageState { RU_NotUsed, RU_Write, RU_Read };
- static char ID;
- /// \brief Loop over all of the instructions in the basic block
- /// replacing applicable instructions with LEA instructions,
- /// where appropriate.
- bool processBasicBlock(MachineFunction &MF, MachineFunction::iterator MFI);
+class FixupLEAPass : public MachineFunctionPass {
+ enum RegUsageState { RU_NotUsed, RU_Write, RU_Read };
+ static char ID;
+ /// \brief Loop over all of the instructions in the basic block
+ /// replacing applicable instructions with LEA instructions,
+ /// where appropriate.
+ bool processBasicBlock(MachineFunction &MF, MachineFunction::iterator MFI);
- const char *getPassName() const override { return "X86 Atom LEA Fixup";}
+ const char *getPassName() const override { return "X86 Atom LEA Fixup"; }
- /// \brief Given a machine register, look for the instruction
- /// which writes it in the current basic block. If found,
- /// try to replace it with an equivalent LEA instruction.
- /// If replacement succeeds, then also process the the newly created
- /// instruction.
- void seekLEAFixup(MachineOperand& p, MachineBasicBlock::iterator& I,
- MachineFunction::iterator MFI);
+ /// \brief Given a machine register, look for the instruction
+ /// which writes it in the current basic block. If found,
+ /// try to replace it with an equivalent LEA instruction.
+ /// If replacement succeeds, then also process the the newly created
+ /// instruction.
+ void seekLEAFixup(MachineOperand &p, MachineBasicBlock::iterator &I,
+ MachineFunction::iterator MFI);
- /// \brief Given a memory access or LEA instruction
- /// whose address mode uses a base and/or index register, look for
- /// an opportunity to replace the instruction which sets the base or index
- /// register with an equivalent LEA instruction.
- void processInstruction(MachineBasicBlock::iterator& I,
- MachineFunction::iterator MFI);
+ /// \brief Given a memory access or LEA instruction
+ /// whose address mode uses a base and/or index register, look for
+ /// an opportunity to replace the instruction which sets the base or index
+ /// register with an equivalent LEA instruction.
+ void processInstruction(MachineBasicBlock::iterator &I,
+ MachineFunction::iterator MFI);
- /// \brief Given a LEA instruction which is unprofitable
- /// on Silvermont try to replace it with an equivalent ADD instruction
- void processInstructionForSLM(MachineBasicBlock::iterator& I,
- MachineFunction::iterator MFI);
+ /// \brief Given a LEA instruction which is unprofitable
+ /// on Silvermont try to replace it with an equivalent ADD instruction
+ void processInstructionForSLM(MachineBasicBlock::iterator &I,
+ MachineFunction::iterator MFI);
- /// \brief Determine if an instruction references a machine register
- /// and, if so, whether it reads or writes the register.
- RegUsageState usesRegister(MachineOperand& p,
- MachineBasicBlock::iterator I);
+ /// \brief Determine if an instruction references a machine register
+ /// and, if so, whether it reads or writes the register.
+ RegUsageState usesRegister(MachineOperand &p, MachineBasicBlock::iterator I);
- /// \brief Step backwards through a basic block, looking
- /// for an instruction which writes a register within
- /// a maximum of INSTR_DISTANCE_THRESHOLD instruction latency cycles.
- MachineBasicBlock::iterator searchBackwards(MachineOperand& p,
- MachineBasicBlock::iterator& I,
- MachineFunction::iterator MFI);
+ /// \brief Step backwards through a basic block, looking
+ /// for an instruction which writes a register within
+ /// a maximum of INSTR_DISTANCE_THRESHOLD instruction latency cycles.
+ MachineBasicBlock::iterator searchBackwards(MachineOperand &p,
+ MachineBasicBlock::iterator &I,
+ MachineFunction::iterator MFI);
- /// \brief if an instruction can be converted to an
- /// equivalent LEA, insert the new instruction into the basic block
- /// and return a pointer to it. Otherwise, return zero.
- MachineInstr* postRAConvertToLEA(MachineFunction::iterator &MFI,
- MachineBasicBlock::iterator &MBBI) const;
+ /// \brief if an instruction can be converted to an
+ /// equivalent LEA, insert the new instruction into the basic block
+ /// and return a pointer to it. Otherwise, return zero.
+ MachineInstr *postRAConvertToLEA(MachineFunction::iterator &MFI,
+ MachineBasicBlock::iterator &MBBI) const;
- public:
- FixupLEAPass() : MachineFunctionPass(ID) {}
+public:
+ FixupLEAPass() : MachineFunctionPass(ID) {}
- /// \brief Loop over all of the basic blocks,
- /// replacing instructions by equivalent LEA instructions
- /// if needed and when possible.
- bool runOnMachineFunction(MachineFunction &MF) override;
+ /// \brief Loop over all of the basic blocks,
+ /// replacing instructions by equivalent LEA instructions
+ /// if needed and when possible.
+ bool runOnMachineFunction(MachineFunction &MF) override;
- private:
- MachineFunction *MF;
- const TargetMachine *TM;
- const X86InstrInfo *TII; // Machine instruction info.
-
- };
- char FixupLEAPass::ID = 0;
+private:
+ MachineFunction *MF;
+ const TargetMachine *TM;
+ const X86InstrInfo *TII; // Machine instruction info.
+};
+char FixupLEAPass::ID = 0;
}
MachineInstr *
FixupLEAPass::postRAConvertToLEA(MachineFunction::iterator &MFI,
MachineBasicBlock::iterator &MBBI) const {
- MachineInstr* MI = MBBI;
- MachineInstr* NewMI;
+ MachineInstr *MI = MBBI;
+ MachineInstr *NewMI;
switch (MI->getOpcode()) {
case X86::MOV32rr:
case X86::MOV64rr: {
- const MachineOperand& Src = MI->getOperand(1);
- const MachineOperand& Dest = MI->getOperand(0);
+ const MachineOperand &Src = MI->getOperand(1);
+ const MachineOperand &Dest = MI->getOperand(0);
NewMI = BuildMI(*MF, MI->getDebugLoc(),
- TII->get( MI->getOpcode() == X86::MOV32rr ? X86::LEA32r : X86::LEA64r))
- .addOperand(Dest)
- .addOperand(Src).addImm(1).addReg(0).addImm(0).addReg(0);
- MFI->insert(MBBI, NewMI); // Insert the new inst
+ TII->get(MI->getOpcode() == X86::MOV32rr ? X86::LEA32r
+ : X86::LEA64r))
+ .addOperand(Dest)
+ .addOperand(Src)
+ .addImm(1)
+ .addReg(0)
+ .addImm(0)
+ .addReg(0);
+ MFI->insert(MBBI, NewMI); // Insert the new inst
return NewMI;
}
case X86::ADD64ri32:
@@ -144,17 +147,16 @@ FixupLEAPass::postRAConvertToLEA(MachineFunction::iterator &MFI,
return TII->convertToThreeAddress(MFI, MBBI, nullptr);
}
-FunctionPass *llvm::createX86FixupLEAs() {
- return new FixupLEAPass();
-}
+FunctionPass *llvm::createX86FixupLEAs() { return new FixupLEAPass(); }
bool FixupLEAPass::runOnMachineFunction(MachineFunction &Func) {
+ MF = &Func;
TM = &Func.getTarget();
const X86Subtarget &ST = TM->getSubtarget<X86Subtarget>();
if (!ST.LEAusesAG() && !ST.slowLEA())
return false;
- TII = static_cast<const X86InstrInfo*>(TM->getInstrInfo());
+ TII = static_cast<const X86InstrInfo *>(TM->getInstrInfo());
DEBUG(dbgs() << "Start X86FixupLEAs\n";);
// Process all basic blocks.
@@ -165,14 +167,14 @@ bool FixupLEAPass::runOnMachineFunction(MachineFunction &Func) {
return true;
}
-FixupLEAPass::RegUsageState FixupLEAPass::usesRegister(MachineOperand& p,
- MachineBasicBlock::iterator I) {
+FixupLEAPass::RegUsageState
+FixupLEAPass::usesRegister(MachineOperand &p, MachineBasicBlock::iterator I) {
RegUsageState RegUsage = RU_NotUsed;
- MachineInstr* MI = I;
+ MachineInstr *MI = I;
for (unsigned int i = 0; i < MI->getNumOperands(); ++i) {
- MachineOperand& opnd = MI->getOperand(i);
- if (opnd.isReg() && opnd.getReg() == p.getReg()){
+ MachineOperand &opnd = MI->getOperand(i);
+ if (opnd.isReg() && opnd.getReg() == p.getReg()) {
if (opnd.isDef())
return RU_Write;
RegUsage = RU_Read;
@@ -185,23 +187,22 @@ FixupLEAPass::RegUsageState FixupLEAPass::usesRegister(MachineOperand& p,
/// block, return a reference to the previous instruction in the block,
/// wrapping around to the last instruction of the block if the block
/// branches to itself.
-static inline bool getPreviousInstr(MachineBasicBlock::iterator& I,
+static inline bool getPreviousInstr(MachineBasicBlock::iterator &I,
MachineFunction::iterator MFI) {
if (I == MFI->begin()) {
if (MFI->isPredecessor(MFI)) {
I = --MFI->end();
return true;
- }
- else
+ } else
return false;
}
--I;
return true;
}
-MachineBasicBlock::iterator FixupLEAPass::searchBackwards(MachineOperand& p,
- MachineBasicBlock::iterator& I,
- MachineFunction::iterator MFI) {
+MachineBasicBlock::iterator
+FixupLEAPass::searchBackwards(MachineOperand &p, MachineBasicBlock::iterator &I,
+ MachineFunction::iterator MFI) {
int InstrDistance = 1;
MachineBasicBlock::iterator CurInst;
static const int INSTR_DISTANCE_THRESHOLD = 5;
@@ -209,12 +210,12 @@ MachineBasicBlock::iterator FixupLEAPass::searchBackwards(MachineOperand& p,
CurInst = I;
bool Found;
Found = getPreviousInstr(CurInst, MFI);
- while( Found && I != CurInst) {
+ while (Found && I != CurInst) {
if (CurInst->isCall() || CurInst->isInlineAsm())
break;
if (InstrDistance > INSTR_DISTANCE_THRESHOLD)
break; // too far back to make a difference
- if (usesRegister(p, CurInst) == RU_Write){
+ if (usesRegister(p, CurInst) == RU_Write) {
return CurInst;
}
InstrDistance += TII->getInstrLatency(TM->getInstrItineraryData(), CurInst);
@@ -223,32 +224,32 @@ MachineBasicBlock::iterator FixupLEAPass::searchBackwards(MachineOperand& p,
return nullptr;
}
-void FixupLEAPass::processInstruction(MachineBasicBlock::iterator& I,
+void FixupLEAPass::processInstruction(MachineBasicBlock::iterator &I,
MachineFunction::iterator MFI) {
// Process a load, store, or LEA instruction.
MachineInstr *MI = I;
int opcode = MI->getOpcode();
- const MCInstrDesc& Desc = MI->getDesc();
+ const MCInstrDesc &Desc = MI->getDesc();
int AddrOffset = X86II::getMemoryOperandNo(Desc.TSFlags, opcode);
if (AddrOffset >= 0) {
AddrOffset += X86II::getOperandBias(Desc);
- MachineOperand& p = MI->getOperand(AddrOffset + X86::AddrBaseReg);
+ MachineOperand &p = MI->getOperand(AddrOffset + X86::AddrBaseReg);
if (p.isReg() && p.getReg() != X86::ESP) {
seekLEAFixup(p, I, MFI);
}
- MachineOperand& q = MI->getOperand(AddrOffset + X86::AddrIndexReg);
+ MachineOperand &q = MI->getOperand(AddrOffset + X86::AddrIndexReg);
if (q.isReg() && q.getReg() != X86::ESP) {
seekLEAFixup(q, I, MFI);
}
}
}
-void FixupLEAPass::seekLEAFixup(MachineOperand& p,
- MachineBasicBlock::iterator& I,
+void FixupLEAPass::seekLEAFixup(MachineOperand &p,
+ MachineBasicBlock::iterator &I,
MachineFunction::iterator MFI) {
MachineBasicBlock::iterator MBI = searchBackwards(p, I, MFI);
if (MBI) {
- MachineInstr* NewMI = postRAConvertToLEA(MFI, MBI);
+ MachineInstr *NewMI = postRAConvertToLEA(MFI, MBI);
if (NewMI) {
++NumLEAs;
DEBUG(dbgs() << "FixLEA: Candidate to replace:"; MBI->dump(););
@@ -256,7 +257,7 @@ void FixupLEAPass::seekLEAFixup(MachineOperand& p,
DEBUG(dbgs() << "FixLEA: Replaced by: "; NewMI->dump(););
MFI->erase(MBI);
MachineBasicBlock::iterator J =
- static_cast<MachineBasicBlock::iterator> (NewMI);
+ static_cast<MachineBasicBlock::iterator>(NewMI);
processInstruction(J, MFI);
}
}
@@ -299,7 +300,7 @@ void FixupLEAPass::processInstructionForSLM(MachineBasicBlock::iterator &I,
}
DEBUG(dbgs() << "FixLEA: Candidate to replace:"; I->dump(););
DEBUG(dbgs() << "FixLEA: Replaced by: ";);
- MachineInstr *NewMI = 0;
+ MachineInstr *NewMI = nullptr;
const MachineOperand &Dst = MI->getOperand(0);
// Make ADD instruction for two registers writing to LEA's destination
if (SrcR1 != 0 && SrcR2 != 0) {
diff --git a/lib/Target/X86/X86FrameLowering.cpp b/lib/Target/X86/X86FrameLowering.cpp
index 4c1374f..8c029a8 100644
--- a/lib/Target/X86/X86FrameLowering.cpp
+++ b/lib/Target/X86/X86FrameLowering.cpp
@@ -29,6 +29,7 @@
#include "llvm/MC/MCSymbol.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Target/TargetOptions.h"
+#include "llvm/Support/Debug.h"
using namespace llvm;
@@ -45,7 +46,7 @@ bool X86FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const {
bool X86FrameLowering::hasFP(const MachineFunction &MF) const {
const MachineFrameInfo *MFI = MF.getFrameInfo();
const MachineModuleInfo &MMI = MF.getMMI();
- const TargetRegisterInfo *RegInfo = TM.getRegisterInfo();
+ const TargetRegisterInfo *RegInfo = MF.getTarget().getRegisterInfo();
return (MF.getTarget().Options.DisableFramePointerElim(MF) ||
RegInfo->needsStackRealignment(MF) ||
@@ -305,65 +306,25 @@ static bool isEAXLiveIn(MachineFunction &MF) {
return false;
}
-void X86FrameLowering::emitCalleeSavedFrameMoves(
- MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, DebugLoc DL,
- unsigned FramePtr) const {
+void
+X86FrameLowering::emitCalleeSavedFrameMoves(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ DebugLoc DL) const {
MachineFunction &MF = *MBB.getParent();
MachineFrameInfo *MFI = MF.getFrameInfo();
MachineModuleInfo &MMI = MF.getMMI();
const MCRegisterInfo *MRI = MMI.getContext().getRegisterInfo();
- const X86InstrInfo &TII = *TM.getInstrInfo();
+ const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
// Add callee saved registers to move list.
const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo();
if (CSI.empty()) return;
- const X86RegisterInfo *RegInfo = TM.getRegisterInfo();
- bool HasFP = hasFP(MF);
-
- // Calculate amount of bytes used for return address storing.
- int stackGrowth = -RegInfo->getSlotSize();
-
- // FIXME: This is dirty hack. The code itself is pretty mess right now.
- // It should be rewritten from scratch and generalized sometimes.
-
- // Determine maximum offset (minimum due to stack growth).
- int64_t MaxOffset = 0;
- for (std::vector<CalleeSavedInfo>::const_iterator
- I = CSI.begin(), E = CSI.end(); I != E; ++I)
- MaxOffset = std::min(MaxOffset,
- MFI->getObjectOffset(I->getFrameIdx()));
-
// Calculate offsets.
- int64_t saveAreaOffset = (HasFP ? 3 : 2) * stackGrowth;
for (std::vector<CalleeSavedInfo>::const_iterator
I = CSI.begin(), E = CSI.end(); I != E; ++I) {
int64_t Offset = MFI->getObjectOffset(I->getFrameIdx());
unsigned Reg = I->getReg();
- Offset = MaxOffset - Offset + saveAreaOffset;
-
- // Don't output a new machine move if we're re-saving the frame
- // pointer. This happens when the PrologEpilogInserter has inserted an extra
- // "PUSH" of the frame pointer -- the "emitPrologue" method automatically
- // generates one when frame pointers are used. If we generate a "machine
- // move" for this extra "PUSH", the linker will lose track of the fact that
- // the frame pointer should have the value of the first "PUSH" when it's
- // trying to unwind.
- //
- // FIXME: This looks inelegant. It's possibly correct, but it's covering up
- // another bug. I.e., one where we generate a prolog like this:
- //
- // pushl %ebp
- // movl %esp, %ebp
- // pushl %ebp
- // pushl %esi
- // ...
- //
- // The immediate re-push of EBP is unnecessary. At the least, it's an
- // optimization bug. EBP can be used as a scratch register in certain
- // cases, but probably not when we have a frame pointer.
- if (HasFP && FramePtr == Reg)
- continue;
unsigned DwarfReg = MRI->getDwarfRegNum(Reg, true);
unsigned CFIIndex =
@@ -395,23 +356,107 @@ static bool usesTheStack(const MachineFunction &MF) {
/// automatically adjust the stack pointer. Adjust the stack pointer to allocate
/// space for local variables. Also emit labels used by the exception handler to
/// generate the exception handling frames.
+
+/*
+ Here's a gist of what gets emitted:
+
+ ; Establish frame pointer, if needed
+ [if needs FP]
+ push %rbp
+ .cfi_def_cfa_offset 16
+ .cfi_offset %rbp, -16
+ .seh_pushreg %rpb
+ mov %rsp, %rbp
+ .cfi_def_cfa_register %rbp
+
+ ; Spill general-purpose registers
+ [for all callee-saved GPRs]
+ pushq %<reg>
+ [if not needs FP]
+ .cfi_def_cfa_offset (offset from RETADDR)
+ .seh_pushreg %<reg>
+
+ ; If the required stack alignment > default stack alignment
+ ; rsp needs to be re-aligned. This creates a "re-alignment gap"
+ ; of unknown size in the stack frame.
+ [if stack needs re-alignment]
+ and $MASK, %rsp
+
+ ; Allocate space for locals
+ [if target is Windows and allocated space > 4096 bytes]
+ ; Windows needs special care for allocations larger
+ ; than one page.
+ mov $NNN, %rax
+ call ___chkstk_ms/___chkstk
+ sub %rax, %rsp
+ [else]
+ sub $NNN, %rsp
+
+ [if needs FP]
+ .seh_stackalloc (size of XMM spill slots)
+ .seh_setframe %rbp, SEHFrameOffset ; = size of all spill slots
+ [else]
+ .seh_stackalloc NNN
+
+ ; Spill XMMs
+ ; Note, that while only Windows 64 ABI specifies XMMs as callee-preserved,
+ ; they may get spilled on any platform, if the current function
+ ; calls @llvm.eh.unwind.init
+ [if needs FP]
+ [for all callee-saved XMM registers]
+ movaps %<xmm reg>, -MMM(%rbp)
+ [for all callee-saved XMM registers]
+ .seh_savexmm %<xmm reg>, (-MMM + SEHFrameOffset)
+ ; i.e. the offset relative to (%rbp - SEHFrameOffset)
+ [else]
+ [for all callee-saved XMM registers]
+ movaps %<xmm reg>, KKK(%rsp)
+ [for all callee-saved XMM registers]
+ .seh_savexmm %<xmm reg>, KKK
+
+ .seh_endprologue
+
+ [if needs base pointer]
+ mov %rsp, %rbx
+
+ ; Emit CFI info
+ [if needs FP]
+ [for all callee-saved registers]
+ .cfi_offset %<reg>, (offset from %rbp)
+ [else]
+ .cfi_def_cfa_offset (offset from RETADDR)
+ [for all callee-saved registers]
+ .cfi_offset %<reg>, (offset from %rsp)
+
+ Notes:
+ - .seh directives are emitted only for Windows 64 ABI
+ - .cfi directives are emitted for all other ABIs
+ - for 32-bit code, substitute %e?? registers for %r??
+*/
+
void X86FrameLowering::emitPrologue(MachineFunction &MF) const {
MachineBasicBlock &MBB = MF.front(); // Prologue goes in entry BB.
MachineBasicBlock::iterator MBBI = MBB.begin();
MachineFrameInfo *MFI = MF.getFrameInfo();
const Function *Fn = MF.getFunction();
- const X86RegisterInfo *RegInfo = TM.getRegisterInfo();
- const X86InstrInfo &TII = *TM.getInstrInfo();
+ const X86RegisterInfo *RegInfo =
+ static_cast<const X86RegisterInfo *>(MF.getTarget().getRegisterInfo());
+ const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
MachineModuleInfo &MMI = MF.getMMI();
X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
- bool needsFrameMoves = MMI.hasDebugInfo() ||
- Fn->needsUnwindTableEntry();
uint64_t MaxAlign = MFI->getMaxAlignment(); // Desired stack alignment.
uint64_t StackSize = MFI->getStackSize(); // Number of bytes to allocate.
bool HasFP = hasFP(MF);
+ const X86Subtarget &STI = MF.getTarget().getSubtarget<X86Subtarget>();
bool Is64Bit = STI.is64Bit();
bool IsLP64 = STI.isTarget64BitLP64();
bool IsWin64 = STI.isTargetWin64();
+ bool IsWinEH =
+ MF.getTarget().getMCAsmInfo()->getExceptionHandlingType() ==
+ ExceptionHandling::WinEH; // Not necessarily synonymous with IsWin64.
+ bool NeedsWinEH = IsWinEH && Fn->needsUnwindTableEntry();
+ bool NeedsDwarfCFI =
+ !IsWinEH && (MMI.hasDebugInfo() || Fn->needsUnwindTableEntry());
bool UseLEA = STI.useLeaForSP();
unsigned StackAlign = getStackAlignment();
unsigned SlotSize = RegInfo->getSlotSize();
@@ -509,7 +554,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const {
.addReg(FramePtr, RegState::Kill)
.setMIFlag(MachineInstr::FrameSetup);
- if (needsFrameMoves) {
+ if (NeedsDwarfCFI) {
// Mark the place where EBP/RBP was saved.
// Define the current CFA rule to use the provided offset.
assert(StackSize);
@@ -527,13 +572,19 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const {
.addCFIIndex(CFIIndex);
}
+ if (NeedsWinEH) {
+ BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_PushReg))
+ .addImm(FramePtr)
+ .setMIFlag(MachineInstr::FrameSetup);
+ }
+
// Update EBP with the new base value.
BuildMI(MBB, MBBI, DL,
TII.get(Is64Bit ? X86::MOV64rr : X86::MOV32rr), FramePtr)
.addReg(StackPtr)
.setMIFlag(MachineInstr::FrameSetup);
- if (needsFrameMoves) {
+ if (NeedsDwarfCFI) {
// Mark effective beginning of when frame pointer becomes valid.
// Define the current CFA to use the EBP/RBP register.
unsigned DwarfFramePtr = RegInfo->getDwarfRegNum(FramePtr, true);
@@ -543,9 +594,8 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const {
.addCFIIndex(CFIIndex);
}
- // Mark the FramePtr as live-in in every block except the entry.
- for (MachineFunction::iterator I = std::next(MF.begin()), E = MF.end();
- I != E; ++I)
+ // Mark the FramePtr as live-in in every block.
+ for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I)
I->addLiveIn(FramePtr);
} else {
NumBytes = StackSize - X86FI->getCalleeSavedFrameSize();
@@ -559,10 +609,10 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const {
(MBBI->getOpcode() == X86::PUSH32r ||
MBBI->getOpcode() == X86::PUSH64r)) {
PushedRegs = true;
- MBBI->setFlag(MachineInstr::FrameSetup);
+ unsigned Reg = MBBI->getOperand(0).getReg();
++MBBI;
- if (!HasFP && needsFrameMoves) {
+ if (!HasFP && NeedsDwarfCFI) {
// Mark callee-saved push instruction.
// Define the current CFA rule to use the provided offset.
assert(StackSize);
@@ -572,16 +622,15 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const {
.addCFIIndex(CFIIndex);
StackOffset += stackGrowth;
}
+
+ if (NeedsWinEH) {
+ BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_PushReg)).addImm(Reg).setMIFlag(
+ MachineInstr::FrameSetup);
+ }
}
// Realign stack after we pushed callee-saved registers (so that we'll be
// able to calculate their offsets from the frame pointer).
-
- // NOTE: We push the registers before realigning the stack, so
- // vector callee-saved (xmm) registers may be saved w/o proper
- // alignment in this way. However, currently these regs are saved in
- // stack slots (see X86FrameLowering::spillCalleeSavedRegisters()), so
- // this shouldn't be a problem.
if (RegInfo->needsStackRealignment(MF)) {
assert(HasFP && "There should be a frame pointer if stack is realigned.");
MachineInstr *MI =
@@ -680,23 +729,88 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const {
MI->setFlag(MachineInstr::FrameSetup);
MBB.insert(MBBI, MI);
}
- } else if (NumBytes)
+ } else if (NumBytes) {
emitSPUpdate(MBB, MBBI, StackPtr, -(int64_t)NumBytes, Is64Bit, IsLP64,
UseLEA, TII, *RegInfo);
+ }
+
+ int SEHFrameOffset = 0;
+ if (NeedsWinEH) {
+ if (HasFP) {
+ // We need to set frame base offset low enough such that all saved
+ // register offsets would be positive relative to it, but we can't
+ // just use NumBytes, because .seh_setframe offset must be <=240.
+ // So we pretend to have only allocated enough space to spill the
+ // non-volatile registers.
+ // We don't care about the rest of stack allocation, because unwinder
+ // will restore SP to (BP - SEHFrameOffset)
+ for (const CalleeSavedInfo &Info : MFI->getCalleeSavedInfo()) {
+ int offset = MFI->getObjectOffset(Info.getFrameIdx());
+ SEHFrameOffset = std::max(SEHFrameOffset, abs(offset));
+ }
+ SEHFrameOffset += SEHFrameOffset % 16; // ensure alignmant
+
+ // This only needs to account for XMM spill slots, GPR slots
+ // are covered by the .seh_pushreg's emitted above.
+ unsigned Size = SEHFrameOffset - X86FI->getCalleeSavedFrameSize();
+ if (Size) {
+ BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_StackAlloc))
+ .addImm(Size)
+ .setMIFlag(MachineInstr::FrameSetup);
+ }
+
+ BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_SetFrame))
+ .addImm(FramePtr)
+ .addImm(SEHFrameOffset)
+ .setMIFlag(MachineInstr::FrameSetup);
+ } else {
+ // SP will be the base register for restoring XMMs
+ if (NumBytes) {
+ BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_StackAlloc))
+ .addImm(NumBytes)
+ .setMIFlag(MachineInstr::FrameSetup);
+ }
+ }
+ }
+
+ // Skip the rest of register spilling code
+ while (MBBI != MBB.end() && MBBI->getFlag(MachineInstr::FrameSetup))
+ ++MBBI;
+
+ // Emit SEH info for non-GPRs
+ if (NeedsWinEH) {
+ for (const CalleeSavedInfo &Info : MFI->getCalleeSavedInfo()) {
+ unsigned Reg = Info.getReg();
+ if (X86::GR64RegClass.contains(Reg) || X86::GR32RegClass.contains(Reg))
+ continue;
+ assert(X86::FR64RegClass.contains(Reg) && "Unexpected register class");
+
+ int Offset = getFrameIndexOffset(MF, Info.getFrameIdx());
+ Offset += SEHFrameOffset;
+
+ BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_SaveXMM))
+ .addImm(Reg)
+ .addImm(Offset)
+ .setMIFlag(MachineInstr::FrameSetup);
+ }
+
+ BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_EndPrologue))
+ .setMIFlag(MachineInstr::FrameSetup);
+ }
// If we need a base pointer, set it up here. It's whatever the value
// of the stack pointer is at this point. Any variable size objects
// will be allocated after this, so we can still use the base pointer
// to reference locals.
if (RegInfo->hasBasePointer(MF)) {
- // Update the frame pointer with the current stack pointer.
+ // Update the base pointer with the current stack pointer.
unsigned Opc = Is64Bit ? X86::MOV64rr : X86::MOV32rr;
BuildMI(MBB, MBBI, DL, TII.get(Opc), BasePtr)
.addReg(StackPtr)
.setMIFlag(MachineInstr::FrameSetup);
}
- if (( (!HasFP && NumBytes) || PushedRegs) && needsFrameMoves) {
+ if (((!HasFP && NumBytes) || PushedRegs) && NeedsDwarfCFI) {
// Mark end of stack pointer adjustment.
if (!HasFP && NumBytes) {
// Define the current CFA rule to use the provided offset.
@@ -711,7 +825,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const {
// Emit DWARF info specifying the offsets of the callee-saved registers.
if (PushedRegs)
- emitCalleeSavedFrameMoves(MBB, MBBI, DL, HasFP ? FramePtr : StackPtr);
+ emitCalleeSavedFrameMoves(MBB, MBBI, DL);
}
}
@@ -719,12 +833,14 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
MachineBasicBlock &MBB) const {
const MachineFrameInfo *MFI = MF.getFrameInfo();
X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
- const X86RegisterInfo *RegInfo = TM.getRegisterInfo();
- const X86InstrInfo &TII = *TM.getInstrInfo();
+ const X86RegisterInfo *RegInfo =
+ static_cast<const X86RegisterInfo *>(MF.getTarget().getRegisterInfo());
+ const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
assert(MBBI != MBB.end() && "Returning block has no instructions");
unsigned RetOpcode = MBBI->getOpcode();
DebugLoc DL = MBBI->getDebugLoc();
+ const X86Subtarget &STI = MF.getTarget().getSubtarget<X86Subtarget>();
bool Is64Bit = STI.is64Bit();
bool IsLP64 = STI.isTarget64BitLP64();
bool UseLEA = STI.useLeaForSP();
@@ -969,46 +1085,97 @@ int X86FrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
return getFrameIndexOffset(MF, FI);
}
-bool X86FrameLowering::spillCalleeSavedRegisters(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MI,
- const std::vector<CalleeSavedInfo> &CSI,
- const TargetRegisterInfo *TRI) const {
- if (CSI.empty())
- return false;
+bool X86FrameLowering::assignCalleeSavedSpillSlots(
+ MachineFunction &MF, const TargetRegisterInfo *TRI,
+ std::vector<CalleeSavedInfo> &CSI) const {
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+ const X86RegisterInfo *RegInfo =
+ static_cast<const X86RegisterInfo *>(MF.getTarget().getRegisterInfo());
+ unsigned SlotSize = RegInfo->getSlotSize();
+ X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
- DebugLoc DL = MBB.findDebugLoc(MI);
+ unsigned CalleeSavedFrameSize = 0;
+ int SpillSlotOffset = getOffsetOfLocalArea() + X86FI->getTCReturnAddrDelta();
- MachineFunction &MF = *MBB.getParent();
+ if (hasFP(MF)) {
+ // emitPrologue always spills frame register the first thing.
+ SpillSlotOffset -= SlotSize;
+ MFI->CreateFixedSpillStackObject(SlotSize, SpillSlotOffset);
+
+ // Since emitPrologue and emitEpilogue will handle spilling and restoring of
+ // the frame register, we can delete it from CSI list and not have to worry
+ // about avoiding it later.
+ unsigned FPReg = RegInfo->getFrameRegister(MF);
+ for (unsigned i = 0; i < CSI.size(); ++i) {
+ if (CSI[i].getReg() == FPReg) {
+ CSI.erase(CSI.begin() + i);
+ break;
+ }
+ }
+ }
+
+ // Assign slots for GPRs. It increases frame size.
+ for (unsigned i = CSI.size(); i != 0; --i) {
+ unsigned Reg = CSI[i - 1].getReg();
+
+ if (!X86::GR64RegClass.contains(Reg) && !X86::GR32RegClass.contains(Reg))
+ continue;
- unsigned SlotSize = STI.is64Bit() ? 8 : 4;
- unsigned FPReg = TRI->getFrameRegister(MF);
- unsigned CalleeFrameSize = 0;
+ SpillSlotOffset -= SlotSize;
+ CalleeSavedFrameSize += SlotSize;
+
+ int SlotIndex = MFI->CreateFixedSpillStackObject(SlotSize, SpillSlotOffset);
+ CSI[i - 1].setFrameIdx(SlotIndex);
+ }
+
+ X86FI->setCalleeSavedFrameSize(CalleeSavedFrameSize);
+
+ // Assign slots for XMMs.
+ for (unsigned i = CSI.size(); i != 0; --i) {
+ unsigned Reg = CSI[i - 1].getReg();
+ if (X86::GR64RegClass.contains(Reg) || X86::GR32RegClass.contains(Reg))
+ continue;
+
+ const TargetRegisterClass *RC = RegInfo->getMinimalPhysRegClass(Reg);
+ // ensure alignment
+ SpillSlotOffset -= abs(SpillSlotOffset) % RC->getAlignment();
+ // spill into slot
+ SpillSlotOffset -= RC->getSize();
+ int SlotIndex =
+ MFI->CreateFixedSpillStackObject(RC->getSize(), SpillSlotOffset);
+ CSI[i - 1].setFrameIdx(SlotIndex);
+ MFI->ensureMaxAlignment(RC->getAlignment());
+ }
+
+ return true;
+}
+bool X86FrameLowering::spillCalleeSavedRegisters(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+ const std::vector<CalleeSavedInfo> &CSI,
+ const TargetRegisterInfo *TRI) const {
+ DebugLoc DL = MBB.findDebugLoc(MI);
+
+ MachineFunction &MF = *MBB.getParent();
const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
- X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
+ const X86Subtarget &STI = MF.getTarget().getSubtarget<X86Subtarget>();
// Push GPRs. It increases frame size.
unsigned Opc = STI.is64Bit() ? X86::PUSH64r : X86::PUSH32r;
for (unsigned i = CSI.size(); i != 0; --i) {
- unsigned Reg = CSI[i-1].getReg();
- if (!X86::GR64RegClass.contains(Reg) &&
- !X86::GR32RegClass.contains(Reg))
+ unsigned Reg = CSI[i - 1].getReg();
+
+ if (!X86::GR64RegClass.contains(Reg) && !X86::GR32RegClass.contains(Reg))
continue;
// Add the callee-saved register as live-in. It's killed at the spill.
MBB.addLiveIn(Reg);
- if (Reg == FPReg)
- // X86RegisterInfo::emitPrologue will handle spilling of frame register.
- continue;
- CalleeFrameSize += SlotSize;
+
BuildMI(MBB, MI, DL, TII.get(Opc)).addReg(Reg, RegState::Kill)
.setMIFlag(MachineInstr::FrameSetup);
}
- X86FI->setCalleeSavedFrameSize(CalleeFrameSize);
-
// Make XMM regs spilled. X86 does not have ability of push/pop XMM.
// It can be done by spilling XMMs to stack frame.
- // Note that only Win64 ABI might spill XMMs.
for (unsigned i = CSI.size(); i != 0; --i) {
unsigned Reg = CSI[i-1].getReg();
if (X86::GR64RegClass.contains(Reg) ||
@@ -1017,8 +1184,12 @@ bool X86FrameLowering::spillCalleeSavedRegisters(MachineBasicBlock &MBB,
// Add the callee-saved register as live-in. It's killed at the spill.
MBB.addLiveIn(Reg);
const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
- TII.storeRegToStackSlot(MBB, MI, Reg, true, CSI[i-1].getFrameIdx(),
- RC, TRI);
+
+ TII.storeRegToStackSlot(MBB, MI, Reg, true, CSI[i - 1].getFrameIdx(), RC,
+ TRI);
+ --MI;
+ MI->setFlag(MachineInstr::FrameSetup);
+ ++MI;
}
return true;
@@ -1035,6 +1206,7 @@ bool X86FrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
MachineFunction &MF = *MBB.getParent();
const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
+ const X86Subtarget &STI = MF.getTarget().getSubtarget<X86Subtarget>();
// Reload XMMs from stack frame.
for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
@@ -1042,22 +1214,19 @@ bool X86FrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
if (X86::GR64RegClass.contains(Reg) ||
X86::GR32RegClass.contains(Reg))
continue;
+
const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
- TII.loadRegFromStackSlot(MBB, MI, Reg, CSI[i].getFrameIdx(),
- RC, TRI);
+ TII.loadRegFromStackSlot(MBB, MI, Reg, CSI[i].getFrameIdx(), RC, TRI);
}
// POP GPRs.
- unsigned FPReg = TRI->getFrameRegister(MF);
unsigned Opc = STI.is64Bit() ? X86::POP64r : X86::POP32r;
for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
unsigned Reg = CSI[i].getReg();
if (!X86::GR64RegClass.contains(Reg) &&
!X86::GR32RegClass.contains(Reg))
continue;
- if (Reg == FPReg)
- // X86RegisterInfo::emitEpilogue will handle restoring of frame register.
- continue;
+
BuildMI(MBB, MI, DL, TII.get(Opc), Reg);
}
return true;
@@ -1065,9 +1234,10 @@ bool X86FrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
void
X86FrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
- RegScavenger *RS) const {
+ RegScavenger *RS) const {
MachineFrameInfo *MFI = MF.getFrameInfo();
- const X86RegisterInfo *RegInfo = TM.getRegisterInfo();
+ const X86RegisterInfo *RegInfo =
+ static_cast<const X86RegisterInfo *>(MF.getTarget().getRegisterInfo());
unsigned SlotSize = RegInfo->getSlotSize();
X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
@@ -1087,22 +1257,6 @@ X86FrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
TailCallReturnAddrDelta - SlotSize, true);
}
- if (hasFP(MF)) {
- assert((TailCallReturnAddrDelta <= 0) &&
- "The Delta should always be zero or negative");
- const TargetFrameLowering &TFI = *MF.getTarget().getFrameLowering();
-
- // Create a frame entry for the EBP register that must be saved.
- int FrameIdx = MFI->CreateFixedObject(SlotSize,
- -(int)SlotSize +
- TFI.getOffsetOfLocalArea() +
- TailCallReturnAddrDelta,
- true);
- assert(FrameIdx == MFI->getObjectIndexBegin() &&
- "Slot for EBP register must be last in order to be found!");
- (void)FrameIdx;
- }
-
// Spill the BasePtr if it's used.
if (RegInfo->hasBasePointer(MF))
MF.getRegInfo().setPhysRegUsed(RegInfo->getBaseRegister());
@@ -1160,8 +1314,9 @@ void
X86FrameLowering::adjustForSegmentedStacks(MachineFunction &MF) const {
MachineBasicBlock &prologueMBB = MF.front();
MachineFrameInfo *MFI = MF.getFrameInfo();
- const X86InstrInfo &TII = *TM.getInstrInfo();
+ const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
uint64_t StackSize;
+ const X86Subtarget &STI = MF.getTarget().getSubtarget<X86Subtarget>();
bool Is64Bit = STI.is64Bit();
unsigned TlsReg, TlsOffset;
DebugLoc DL;
@@ -1368,9 +1523,12 @@ X86FrameLowering::adjustForSegmentedStacks(MachineFunction &MF) const {
/// temp0 = sp - MaxStack
/// if( temp0 < SP_LIMIT(P) ) goto IncStack else goto OldStart
void X86FrameLowering::adjustForHiPEPrologue(MachineFunction &MF) const {
- const X86InstrInfo &TII = *TM.getInstrInfo();
+ const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
MachineFrameInfo *MFI = MF.getFrameInfo();
- const unsigned SlotSize = TM.getRegisterInfo()->getSlotSize();
+ const unsigned SlotSize =
+ static_cast<const X86RegisterInfo *>(MF.getTarget().getRegisterInfo())
+ ->getSlotSize();
+ const X86Subtarget &STI = MF.getTarget().getSubtarget<X86Subtarget>();
const bool Is64Bit = STI.is64Bit();
DebugLoc DL;
// HiPE-specific values
@@ -1499,12 +1657,14 @@ void X86FrameLowering::adjustForHiPEPrologue(MachineFunction &MF) const {
void X86FrameLowering::
eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
MachineBasicBlock::iterator I) const {
- const X86InstrInfo &TII = *TM.getInstrInfo();
- const X86RegisterInfo &RegInfo = *TM.getRegisterInfo();
+ const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
+ const X86RegisterInfo &RegInfo =
+ *static_cast<const X86RegisterInfo *>(MF.getTarget().getRegisterInfo());
unsigned StackPtr = RegInfo.getStackRegister();
bool reseveCallFrame = hasReservedCallFrame(MF);
int Opcode = I->getOpcode();
bool isDestroy = Opcode == TII.getCallFrameDestroyOpcode();
+ const X86Subtarget &STI = MF.getTarget().getSubtarget<X86Subtarget>();
bool IsLP64 = STI.isTarget64BitLP64();
DebugLoc DL = I->getDebugLoc();
uint64_t Amount = !reseveCallFrame ? I->getOperand(0).getImm() : 0;
@@ -1522,7 +1682,8 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
// We need to keep the stack aligned properly. To do this, we round the
// amount of space needed for the outgoing arguments up to the next
// alignment boundary.
- unsigned StackAlign = TM.getFrameLowering()->getStackAlignment();
+ unsigned StackAlign =
+ MF.getTarget().getFrameLowering()->getStackAlignment();
Amount = (Amount + StackAlign - 1) / StackAlign * StackAlign;
MachineInstr *New = nullptr;
diff --git a/lib/Target/X86/X86FrameLowering.h b/lib/Target/X86/X86FrameLowering.h
index 208bb8b..5ad3d4d 100644
--- a/lib/Target/X86/X86FrameLowering.h
+++ b/lib/Target/X86/X86FrameLowering.h
@@ -14,7 +14,6 @@
#ifndef X86_FRAMELOWERING_H
#define X86_FRAMELOWERING_H
-#include "X86Subtarget.h"
#include "llvm/Target/TargetFrameLowering.h"
namespace llvm {
@@ -23,19 +22,13 @@ class MCSymbol;
class X86TargetMachine;
class X86FrameLowering : public TargetFrameLowering {
- const X86TargetMachine &TM;
- const X86Subtarget &STI;
public:
- explicit X86FrameLowering(const X86TargetMachine &tm, const X86Subtarget &sti)
- : TargetFrameLowering(StackGrowsDown,
- sti.getStackAlignment(),
- (sti.is64Bit() ? -8 : -4)),
- TM(tm), STI(sti) {
- }
+ explicit X86FrameLowering(StackDirection D, unsigned StackAl, int LAO)
+ : TargetFrameLowering(StackGrowsDown, StackAl, LAO) {}
void emitCalleeSavedFrameMoves(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI, DebugLoc DL,
- unsigned FramePtr) const;
+ MachineBasicBlock::iterator MBBI,
+ DebugLoc DL) const;
/// emitProlog/emitEpilog - These methods insert prolog and epilog code into
/// the function.
@@ -49,6 +42,11 @@ public:
void processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
RegScavenger *RS = nullptr) const override;
+ bool
+ assignCalleeSavedSpillSlots(MachineFunction &MF,
+ const TargetRegisterInfo *TRI,
+ std::vector<CalleeSavedInfo> &CSI) const override;
+
bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
const std::vector<CalleeSavedInfo> &CSI,
diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp
index 74386d3..ba2f5f6 100644
--- a/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -2126,38 +2126,6 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
return getGlobalBaseReg();
- case X86ISD::ATOMOR64_DAG:
- case X86ISD::ATOMXOR64_DAG:
- case X86ISD::ATOMADD64_DAG:
- case X86ISD::ATOMSUB64_DAG:
- case X86ISD::ATOMNAND64_DAG:
- case X86ISD::ATOMAND64_DAG:
- case X86ISD::ATOMMAX64_DAG:
- case X86ISD::ATOMMIN64_DAG:
- case X86ISD::ATOMUMAX64_DAG:
- case X86ISD::ATOMUMIN64_DAG:
- case X86ISD::ATOMSWAP64_DAG: {
- unsigned Opc;
- switch (Opcode) {
- default: llvm_unreachable("Impossible opcode");
- case X86ISD::ATOMOR64_DAG: Opc = X86::ATOMOR6432; break;
- case X86ISD::ATOMXOR64_DAG: Opc = X86::ATOMXOR6432; break;
- case X86ISD::ATOMADD64_DAG: Opc = X86::ATOMADD6432; break;
- case X86ISD::ATOMSUB64_DAG: Opc = X86::ATOMSUB6432; break;
- case X86ISD::ATOMNAND64_DAG: Opc = X86::ATOMNAND6432; break;
- case X86ISD::ATOMAND64_DAG: Opc = X86::ATOMAND6432; break;
- case X86ISD::ATOMMAX64_DAG: Opc = X86::ATOMMAX6432; break;
- case X86ISD::ATOMMIN64_DAG: Opc = X86::ATOMMIN6432; break;
- case X86ISD::ATOMUMAX64_DAG: Opc = X86::ATOMUMAX6432; break;
- case X86ISD::ATOMUMIN64_DAG: Opc = X86::ATOMUMIN6432; break;
- case X86ISD::ATOMSWAP64_DAG: Opc = X86::ATOMSWAP6432; break;
- }
- SDNode *RetVal = SelectAtomic64(Node, Opc);
- if (RetVal)
- return RetVal;
- break;
- }
-
case ISD::ATOMIC_LOAD_XOR:
case ISD::ATOMIC_LOAD_AND:
case ISD::ATOMIC_LOAD_OR:
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index cbaf44e..5ccff20 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -44,11 +44,13 @@
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Target/TargetOptions.h"
#include <bitset>
+#include <numeric>
#include <cctype>
using namespace llvm;
@@ -56,6 +58,17 @@ using namespace llvm;
STATISTIC(NumTailCalls, "Number of tail calls");
+static cl::opt<bool> ExperimentalVectorWideningLegalization(
+ "x86-experimental-vector-widening-legalization", cl::init(false),
+ cl::desc("Enable an experimental vector type legalization through widening "
+ "rather than promotion."),
+ cl::Hidden);
+
+static cl::opt<bool> ExperimentalVectorShuffleLowering(
+ "x86-experimental-vector-shuffle-lowering", cl::init(false),
+ cl::desc("Enable an experimental vector shuffle lowering code path."),
+ cl::Hidden);
+
// Forward declarations.
static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1,
SDValue V2);
@@ -178,29 +191,28 @@ static SDValue Concat256BitVectors(SDValue V1, SDValue V2, EVT VT,
return Insert256BitVector(V, V2, NumElems/2, DAG, dl);
}
-static TargetLoweringObjectFile *createTLOF(X86TargetMachine &TM) {
- const X86Subtarget *Subtarget = &TM.getSubtarget<X86Subtarget>();
- bool is64Bit = Subtarget->is64Bit();
-
- if (Subtarget->isTargetMacho()) {
- if (is64Bit)
+static TargetLoweringObjectFile *createTLOF(const Triple &TT) {
+ if (TT.isOSBinFormatMachO()) {
+ if (TT.getArch() == Triple::x86_64)
return new X86_64MachoTargetObjectFile();
return new TargetLoweringObjectFileMachO();
}
- if (Subtarget->isTargetLinux())
+ if (TT.isOSLinux())
return new X86LinuxTargetObjectFile();
- if (Subtarget->isTargetELF())
+ if (TT.isOSBinFormatELF())
return new TargetLoweringObjectFileELF();
- if (Subtarget->isTargetKnownWindowsMSVC())
+ if (TT.isKnownWindowsMSVCEnvironment())
return new X86WindowsTargetObjectFile();
- if (Subtarget->isTargetCOFF())
+ if (TT.isOSBinFormatCOFF())
return new TargetLoweringObjectFileCOFF();
llvm_unreachable("unknown subtarget type");
}
+// FIXME: This should stop caching the target machine as soon as
+// we can remove resetOperationActions et al.
X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
- : TargetLowering(TM, createTLOF(TM)) {
+ : TargetLowering(TM, createTLOF(Triple(TM.getTargetTriple()))) {
Subtarget = &TM.getSubtarget<X86Subtarget>();
X86ScalarSSEf64 = Subtarget->hasSSE2();
X86ScalarSSEf32 = Subtarget->hasSSE1();
@@ -443,7 +455,13 @@ void X86TargetLowering::resetOperationActions() {
setOperationAction(ISD::BR_CC , MVT::i16, Expand);
setOperationAction(ISD::BR_CC , MVT::i32, Expand);
setOperationAction(ISD::BR_CC , MVT::i64, Expand);
- setOperationAction(ISD::SELECT_CC , MVT::Other, Expand);
+ setOperationAction(ISD::SELECT_CC , MVT::f32, Expand);
+ setOperationAction(ISD::SELECT_CC , MVT::f64, Expand);
+ setOperationAction(ISD::SELECT_CC , MVT::f80, Expand);
+ setOperationAction(ISD::SELECT_CC , MVT::i8, Expand);
+ setOperationAction(ISD::SELECT_CC , MVT::i16, Expand);
+ setOperationAction(ISD::SELECT_CC , MVT::i32, Expand);
+ setOperationAction(ISD::SELECT_CC , MVT::i64, Expand);
if (Subtarget->is64Bit())
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16 , Legal);
@@ -497,6 +515,14 @@ void X86TargetLowering::resetOperationActions() {
}
}
+ // Special handling for half-precision floating point conversions.
+ // If we don't have F16C support, then lower half float conversions
+ // into library calls.
+ if (TM.Options.UseSoftFloat || !Subtarget->hasF16C()) {
+ setOperationAction(ISD::FP16_TO_FP32, MVT::f32, Expand);
+ setOperationAction(ISD::FP32_TO_FP16, MVT::i16, Expand);
+ }
+
if (Subtarget->hasPOPCNT()) {
setOperationAction(ISD::CTPOP , MVT::i8 , Promote);
} else {
@@ -575,34 +601,18 @@ void X86TargetLowering::resetOperationActions() {
// Expand certain atomics
for (unsigned i = 0; i != array_lengthof(IntVTs); ++i) {
MVT VT = IntVTs[i];
- setOperationAction(ISD::ATOMIC_CMP_SWAP, VT, Custom);
+ setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom);
setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
}
- if (!Subtarget->is64Bit()) {
- setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Custom);
- setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i64, Custom);
- setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom);
- setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i64, Custom);
- setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i64, Custom);
- setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i64, Custom);
- setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i64, Custom);
- setOperationAction(ISD::ATOMIC_SWAP, MVT::i64, Custom);
- setOperationAction(ISD::ATOMIC_LOAD_MAX, MVT::i64, Custom);
- setOperationAction(ISD::ATOMIC_LOAD_MIN, MVT::i64, Custom);
- setOperationAction(ISD::ATOMIC_LOAD_UMAX, MVT::i64, Custom);
- setOperationAction(ISD::ATOMIC_LOAD_UMIN, MVT::i64, Custom);
- }
-
if (Subtarget->hasCmpxchg16b()) {
- setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i128, Custom);
+ setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
}
// FIXME - use subtarget debug flags
- if (!Subtarget->isTargetDarwin() &&
- !Subtarget->isTargetELF() &&
- !Subtarget->isTargetCygMing()) {
+ if (!Subtarget->isTargetDarwin() && !Subtarget->isTargetELF() &&
+ !Subtarget->isTargetCygMing() && !Subtarget->isTargetWin64()) {
setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
}
@@ -861,6 +871,7 @@ void X86TargetLowering::resetOperationActions() {
setOperationAction(ISD::ZERO_EXTEND, VT, Expand);
setOperationAction(ISD::ANY_EXTEND, VT, Expand);
setOperationAction(ISD::VSELECT, VT, Expand);
+ setOperationAction(ISD::SELECT_CC, VT, Expand);
for (int InnerVT = MVT::FIRST_VECTOR_VALUETYPE;
InnerVT <= MVT::LAST_VECTOR_VALUETYPE; ++InnerVT)
setTruncStoreAction(VT,
@@ -1433,6 +1444,11 @@ void X86TargetLowering::resetOperationActions() {
setOperationAction(ISD::OR, MVT::v16i32, Legal);
setOperationAction(ISD::XOR, MVT::v16i32, Legal);
+ if (Subtarget->hasCDI()) {
+ setOperationAction(ISD::CTLZ, MVT::v8i64, Legal);
+ setOperationAction(ISD::CTLZ, MVT::v16i32, Legal);
+ }
+
// Custom lower several nodes.
for (int i = MVT::FIRST_VECTOR_VALUETYPE;
i <= MVT::LAST_VECTOR_VALUETYPE; ++i) {
@@ -1563,6 +1579,7 @@ void X86TargetLowering::resetOperationActions() {
setTargetDAGCombine(ISD::SINT_TO_FP);
setTargetDAGCombine(ISD::SETCC);
setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
+ setTargetDAGCombine(ISD::BUILD_VECTOR);
if (Subtarget->is64Bit())
setTargetDAGCombine(ISD::MUL);
setTargetDAGCombine(ISD::XOR);
@@ -1585,6 +1602,16 @@ void X86TargetLowering::resetOperationActions() {
setPrefFunctionAlignment(4); // 2^4 bytes.
}
+TargetLoweringBase::LegalizeTypeAction
+X86TargetLowering::getPreferredVectorAction(EVT VT) const {
+ if (ExperimentalVectorWideningLegalization &&
+ VT.getVectorNumElements() != 1 &&
+ VT.getVectorElementType().getSimpleVT() != MVT::i1)
+ return TypeWidenVector;
+
+ return TargetLoweringBase::getPreferredVectorAction(VT);
+}
+
EVT X86TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
if (!VT.isVector())
return Subtarget->hasAVX512() ? MVT::i1: MVT::i8;
@@ -1725,7 +1752,7 @@ const MCExpr *
X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
const MachineBasicBlock *MBB,
unsigned uid,MCContext &Ctx) const{
- assert(getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
+ assert(MBB->getParent()->getTarget().getRelocationModel() == Reloc::PIC_ &&
Subtarget->isPICStyleGOT());
// In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
// entries.
@@ -1824,7 +1851,7 @@ X86TargetLowering::CanLowerReturn(CallingConv::ID CallConv,
const SmallVectorImpl<ISD::OutputArg> &Outs,
LLVMContext &Context) const {
SmallVector<CCValAssign, 16> RVLocs;
- CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(),
+ CCState CCInfo(CallConv, isVarArg, MF, MF.getTarget(),
RVLocs, Context);
return CCInfo.CheckReturn(Outs, RetCC_X86);
}
@@ -1844,7 +1871,7 @@ X86TargetLowering::LowerReturn(SDValue Chain,
X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
SmallVector<CCValAssign, 16> RVLocs;
- CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(),
+ CCState CCInfo(CallConv, isVarArg, MF, DAG.getTarget(),
RVLocs, *DAG.getContext());
CCInfo.AnalyzeReturn(Outs, RetCC_X86);
@@ -2016,7 +2043,7 @@ X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
SmallVector<CCValAssign, 16> RVLocs;
bool Is64Bit = Subtarget->is64Bit();
CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
- getTargetMachine(), RVLocs, *DAG.getContext());
+ DAG.getTarget(), RVLocs, *DAG.getContext());
CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
// Copy all of the result registers out of their specified physreg.
@@ -2166,8 +2193,8 @@ X86TargetLowering::LowerMemArgument(SDValue Chain,
unsigned i) const {
// Create the nodes corresponding to a load from this parameter slot.
ISD::ArgFlagsTy Flags = Ins[i].Flags;
- bool AlwaysUseMutable = FuncIsMadeTailCallSafe(CallConv,
- getTargetMachine().Options.GuaranteedTailCallOpt);
+ bool AlwaysUseMutable = FuncIsMadeTailCallSafe(
+ CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt);
bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
EVT ValVT;
@@ -2224,7 +2251,7 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
// Assign locations to all of the incoming arguments.
SmallVector<CCValAssign, 16> ArgLocs;
- CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(),
+ CCState CCInfo(CallConv, isVarArg, MF, DAG.getTarget(),
ArgLocs, *DAG.getContext());
// Allocate shadow area for Win64
@@ -2388,7 +2415,7 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
TotalNumXMMRegs = 0;
if (IsWin64) {
- const TargetFrameLowering &TFI = *getTargetMachine().getFrameLowering();
+ const TargetFrameLowering &TFI = *MF.getTarget().getFrameLowering();
// Get to the caller-allocated home save location. Add 8 to account
// for the return address.
int HomeOffset = TFI.getOffsetOfLocalArea() + 8;
@@ -2587,7 +2614,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// Analyze operands of the call, assigning locations to each operand.
SmallVector<CCValAssign, 16> ArgLocs;
- CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(),
+ CCState CCInfo(CallConv, isVarArg, MF, MF.getTarget(),
ArgLocs, *DAG.getContext());
// Allocate shadow area for Win64
@@ -2602,7 +2629,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// This is a sibcall. The memory operands are available in caller's
// own caller's stack.
NumBytes = 0;
- else if (getTargetMachine().Options.GuaranteedTailCallOpt &&
+ else if (MF.getTarget().Options.GuaranteedTailCallOpt &&
IsTailCallConvention(CallConv))
NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
@@ -2649,7 +2676,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// Walk the register/memloc assignments, inserting copies/loads. In the case
// of tail call optimization arguments are handle later.
const X86RegisterInfo *RegInfo =
- static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo());
+ static_cast<const X86RegisterInfo*>(DAG.getTarget().getRegisterInfo());
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
// Skip inalloca arguments, they have already been written.
ISD::ArgFlagsTy Flags = Outs[i].Flags;
@@ -2840,7 +2867,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
InFlag = Chain.getValue(1);
}
- if (getTargetMachine().getCodeModel() == CodeModel::Large) {
+ if (DAG.getTarget().getCodeModel() == CodeModel::Large) {
assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
// In the 64-bit large code model, we have to make all calls
// through a register, since the call instruction's 32-bit
@@ -2864,7 +2891,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// has hidden or protected visibility, or if it is static or local, then
// we don't need to use the PLT - we can directly call it.
if (Subtarget->isTargetELF() &&
- getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
+ DAG.getTarget().getRelocationModel() == Reloc::PIC_ &&
GV->hasDefaultVisibility() && !GV->hasLocalLinkage()) {
OpFlags = X86II::MO_PLT;
} else if (Subtarget->isPICStyleStubAny() &&
@@ -2906,7 +2933,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// On ELF targets, in either X86-64 or X86-32 mode, direct calls to
// external symbols should go through the PLT.
if (Subtarget->isTargetELF() &&
- getTargetMachine().getRelocationModel() == Reloc::PIC_) {
+ DAG.getTarget().getRelocationModel() == Reloc::PIC_) {
OpFlags = X86II::MO_PLT;
} else if (Subtarget->isPICStyleStubAny() &&
(!Subtarget->getTargetTriple().isMacOSX() ||
@@ -2945,7 +2972,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
RegsToPass[i].second.getValueType()));
// Add a register mask operand representing the call-preserved registers.
- const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
+ const TargetRegisterInfo *TRI = DAG.getTarget().getRegisterInfo();
const uint32_t *Mask = TRI->getCallPreservedMask(CallConv);
assert(Mask && "Missing call preserved mask for calling convention");
Ops.push_back(DAG.getRegisterMask(Mask));
@@ -2969,7 +2996,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// Create the CALLSEQ_END node.
unsigned NumBytesForCalleeToPop;
if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
- getTargetMachine().Options.GuaranteedTailCallOpt))
+ DAG.getTarget().Options.GuaranteedTailCallOpt))
NumBytesForCalleeToPop = NumBytes; // Callee pops everything
else if (!Is64Bit && !IsTailCallConvention(CallConv) &&
!Subtarget->getTargetTriple().isOSMSVCRT() &&
@@ -3140,7 +3167,7 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
bool IsCalleeWin64 = Subtarget->isCallingConvWin64(CalleeCC);
bool IsCallerWin64 = Subtarget->isCallingConvWin64(CallerCC);
- if (getTargetMachine().Options.GuaranteedTailCallOpt) {
+ if (DAG.getTarget().Options.GuaranteedTailCallOpt) {
if (IsTailCallConvention(CalleeCC) && CCMatch)
return true;
return false;
@@ -3152,7 +3179,7 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
// Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
// emit a special epilogue.
const X86RegisterInfo *RegInfo =
- static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo());
+ static_cast<const X86RegisterInfo*>(DAG.getTarget().getRegisterInfo());
if (RegInfo->needsStackRealignment(MF))
return false;
@@ -3181,7 +3208,7 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(),
- getTargetMachine(), ArgLocs, *DAG.getContext());
+ DAG.getTarget(), ArgLocs, *DAG.getContext());
CCInfo.AnalyzeCallOperands(Outs, CC_X86);
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
@@ -3202,7 +3229,7 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
if (Unused) {
SmallVector<CCValAssign, 16> RVLocs;
CCState CCInfo(CalleeCC, false, DAG.getMachineFunction(),
- getTargetMachine(), RVLocs, *DAG.getContext());
+ DAG.getTarget(), RVLocs, *DAG.getContext());
CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
CCValAssign &VA = RVLocs[i];
@@ -3216,12 +3243,12 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
if (!CCMatch) {
SmallVector<CCValAssign, 16> RVLocs1;
CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(),
- getTargetMachine(), RVLocs1, *DAG.getContext());
+ DAG.getTarget(), RVLocs1, *DAG.getContext());
CCInfo1.AnalyzeCallResult(Ins, RetCC_X86);
SmallVector<CCValAssign, 16> RVLocs2;
CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(),
- getTargetMachine(), RVLocs2, *DAG.getContext());
+ DAG.getTarget(), RVLocs2, *DAG.getContext());
CCInfo2.AnalyzeCallResult(Ins, RetCC_X86);
if (RVLocs1.size() != RVLocs2.size())
@@ -3248,7 +3275,7 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
// argument is passed on the stack.
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(),
- getTargetMachine(), ArgLocs, *DAG.getContext());
+ DAG.getTarget(), ArgLocs, *DAG.getContext());
// Allocate shadow area for Win64
if (IsCalleeWin64)
@@ -3265,7 +3292,7 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
MachineFrameInfo *MFI = MF.getFrameInfo();
const MachineRegisterInfo *MRI = &MF.getRegInfo();
const X86InstrInfo *TII =
- ((const X86TargetMachine&)getTargetMachine()).getInstrInfo();
+ static_cast<const X86InstrInfo *>(DAG.getTarget().getInstrInfo());
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
CCValAssign &VA = ArgLocs[i];
SDValue Arg = OutVals[i];
@@ -3288,12 +3315,12 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
if (!Subtarget->is64Bit() &&
((!isa<GlobalAddressSDNode>(Callee) &&
!isa<ExternalSymbolSDNode>(Callee)) ||
- getTargetMachine().getRelocationModel() == Reloc::PIC_)) {
+ DAG.getTarget().getRelocationModel() == Reloc::PIC_)) {
unsigned NumInRegs = 0;
// In PIC we need an extra register to formulate the address computation
// for the callee.
unsigned MaxInRegs =
- (getTargetMachine().getRelocationModel() == Reloc::PIC_) ? 2 : 3;
+ (DAG.getTarget().getRelocationModel() == Reloc::PIC_) ? 2 : 3;
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
CCValAssign &VA = ArgLocs[i];
@@ -3417,7 +3444,7 @@ static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
const X86RegisterInfo *RegInfo =
- static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo());
+ static_cast<const X86RegisterInfo*>(DAG.getTarget().getRegisterInfo());
X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
int ReturnAddrIndex = FuncInfo->getRAIndex();
@@ -3967,14 +3994,22 @@ static bool isINSERTPSMask(ArrayRef<int> Mask, MVT VT) {
unsigned CorrectPosV1 = 0;
unsigned CorrectPosV2 = 0;
- for (int i = 0, e = (int)VT.getVectorNumElements(); i != e; ++i)
+ for (int i = 0, e = (int)VT.getVectorNumElements(); i != e; ++i) {
+ if (Mask[i] == -1) {
+ ++CorrectPosV1;
+ ++CorrectPosV2;
+ continue;
+ }
+
if (Mask[i] == i)
++CorrectPosV1;
else if (Mask[i] == i + 4)
++CorrectPosV2;
+ }
if (CorrectPosV1 == 3 || CorrectPosV2 == 3)
- // We have 3 elements from one vector, and one from another.
+ // We have 3 elements (undefs count as elements from any vector) from one
+ // vector, and one from another.
return true;
return false;
@@ -4823,19 +4858,6 @@ static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2,
return true;
}
-/// isSplatVector - Returns true if N is a BUILD_VECTOR node whose elements are
-/// all the same.
-static bool isSplatVector(SDNode *N) {
- if (N->getOpcode() != ISD::BUILD_VECTOR)
- return false;
-
- SDValue SplatValue = N->getOperand(0);
- for (unsigned i = 1, e = N->getNumOperands(); i != e; ++i)
- if (N->getOperand(i) != SplatValue)
- return false;
- return true;
-}
-
/// isZeroShuffle - Returns true if N is a VECTOR_SHUFFLE that can be resolved
/// to an zero vector.
/// FIXME: move to dag combiner / method on ShuffleVectorSDNode
@@ -5744,18 +5766,22 @@ static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget* Subtarget,
return SDValue();
case ISD::BUILD_VECTOR: {
- // The BUILD_VECTOR node must be a splat.
- if (!isSplatVector(Op.getNode()))
+ auto *BVOp = cast<BuildVectorSDNode>(Op.getNode());
+ BitVector UndefElements;
+ SDValue Splat = BVOp->getSplatValue(&UndefElements);
+
+ // We need a splat of a single value to use broadcast, and it doesn't
+ // make any sense if the value is only in one element of the vector.
+ if (!Splat || (VT.getVectorNumElements() - UndefElements.count()) <= 1)
return SDValue();
- Ld = Op.getOperand(0);
+ Ld = Splat;
ConstSplatVal = (Ld.getOpcode() == ISD::Constant ||
- Ld.getOpcode() == ISD::ConstantFP);
+ Ld.getOpcode() == ISD::ConstantFP);
- // The suspected load node has several users. Make sure that all
- // of its users are from the BUILD_VECTOR node.
- // Constants may have multiple users.
- if (!ConstSplatVal && !Ld->hasNUsesOfValue(VT.getVectorNumElements(), 0))
+ // Make sure that all of the users of a non-constant load are from the
+ // BUILD_VECTOR node.
+ if (!ConstSplatVal && !BVOp->isOnlyUserOf(Ld.getNode()))
return SDValue();
break;
}
@@ -6042,6 +6068,433 @@ X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const {
return DAG.getNode(ISD::BITCAST, dl, VT, Select);
}
+/// \brief Return true if \p N implements a horizontal binop and return the
+/// operands for the horizontal binop into V0 and V1.
+///
+/// This is a helper function of PerformBUILD_VECTORCombine.
+/// This function checks that the build_vector \p N in input implements a
+/// horizontal operation. Parameter \p Opcode defines the kind of horizontal
+/// operation to match.
+/// For example, if \p Opcode is equal to ISD::ADD, then this function
+/// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode
+/// is equal to ISD::SUB, then this function checks if this is a horizontal
+/// arithmetic sub.
+///
+/// This function only analyzes elements of \p N whose indices are
+/// in range [BaseIdx, LastIdx).
+static bool isHorizontalBinOp(const BuildVectorSDNode *N, unsigned Opcode,
+ SelectionDAG &DAG,
+ unsigned BaseIdx, unsigned LastIdx,
+ SDValue &V0, SDValue &V1) {
+ EVT VT = N->getValueType(0);
+
+ assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
+ assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
+ "Invalid Vector in input!");
+
+ bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);
+ bool CanFold = true;
+ unsigned ExpectedVExtractIdx = BaseIdx;
+ unsigned NumElts = LastIdx - BaseIdx;
+ V0 = DAG.getUNDEF(VT);
+ V1 = DAG.getUNDEF(VT);
+
+ // Check if N implements a horizontal binop.
+ for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
+ SDValue Op = N->getOperand(i + BaseIdx);
+
+ // Skip UNDEFs.
+ if (Op->getOpcode() == ISD::UNDEF) {
+ // Update the expected vector extract index.
+ if (i * 2 == NumElts)
+ ExpectedVExtractIdx = BaseIdx;
+ ExpectedVExtractIdx += 2;
+ continue;
+ }
+
+ CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();
+
+ if (!CanFold)
+ break;
+
+ SDValue Op0 = Op.getOperand(0);
+ SDValue Op1 = Op.getOperand(1);
+
+ // Try to match the following pattern:
+ // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))
+ CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+ Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+ Op0.getOperand(0) == Op1.getOperand(0) &&
+ isa<ConstantSDNode>(Op0.getOperand(1)) &&
+ isa<ConstantSDNode>(Op1.getOperand(1)));
+ if (!CanFold)
+ break;
+
+ unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
+ unsigned I1 = cast<ConstantSDNode>(Op1.getOperand(1))->getZExtValue();
+
+ if (i * 2 < NumElts) {
+ if (V0.getOpcode() == ISD::UNDEF)
+ V0 = Op0.getOperand(0);
+ } else {
+ if (V1.getOpcode() == ISD::UNDEF)
+ V1 = Op0.getOperand(0);
+ if (i * 2 == NumElts)
+ ExpectedVExtractIdx = BaseIdx;
+ }
+
+ SDValue Expected = (i * 2 < NumElts) ? V0 : V1;
+ if (I0 == ExpectedVExtractIdx)
+ CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;
+ else if (IsCommutable && I1 == ExpectedVExtractIdx) {
+ // Try to match the following dag sequence:
+ // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))
+ CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;
+ } else
+ CanFold = false;
+
+ ExpectedVExtractIdx += 2;
+ }
+
+ return CanFold;
+}
+
+/// \brief Emit a sequence of two 128-bit horizontal add/sub followed by
+/// a concat_vector.
+///
+/// This is a helper function of PerformBUILD_VECTORCombine.
+/// This function expects two 256-bit vectors called V0 and V1.
+/// At first, each vector is split into two separate 128-bit vectors.
+/// Then, the resulting 128-bit vectors are used to implement two
+/// horizontal binary operations.
+///
+/// The kind of horizontal binary operation is defined by \p X86Opcode.
+///
+/// \p Mode specifies how the 128-bit parts of V0 and V1 are passed in input to
+/// the two new horizontal binop.
+/// When Mode is set, the first horizontal binop dag node would take as input
+/// the lower 128-bit of V0 and the upper 128-bit of V0. The second
+/// horizontal binop dag node would take as input the lower 128-bit of V1
+/// and the upper 128-bit of V1.
+/// Example:
+/// HADD V0_LO, V0_HI
+/// HADD V1_LO, V1_HI
+///
+/// Otherwise, the first horizontal binop dag node takes as input the lower
+/// 128-bit of V0 and the lower 128-bit of V1, and the second horizontal binop
+/// dag node takes the the upper 128-bit of V0 and the upper 128-bit of V1.
+/// Example:
+/// HADD V0_LO, V1_LO
+/// HADD V0_HI, V1_HI
+///
+/// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower
+/// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to
+/// the upper 128-bits of the result.
+static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
+ SDLoc DL, SelectionDAG &DAG,
+ unsigned X86Opcode, bool Mode,
+ bool isUndefLO, bool isUndefHI) {
+ EVT VT = V0.getValueType();
+ assert(VT.is256BitVector() && VT == V1.getValueType() &&
+ "Invalid nodes in input!");
+
+ unsigned NumElts = VT.getVectorNumElements();
+ SDValue V0_LO = Extract128BitVector(V0, 0, DAG, DL);
+ SDValue V0_HI = Extract128BitVector(V0, NumElts/2, DAG, DL);
+ SDValue V1_LO = Extract128BitVector(V1, 0, DAG, DL);
+ SDValue V1_HI = Extract128BitVector(V1, NumElts/2, DAG, DL);
+ EVT NewVT = V0_LO.getValueType();
+
+ SDValue LO = DAG.getUNDEF(NewVT);
+ SDValue HI = DAG.getUNDEF(NewVT);
+
+ if (Mode) {
+ // Don't emit a horizontal binop if the result is expected to be UNDEF.
+ if (!isUndefLO && V0->getOpcode() != ISD::UNDEF)
+ LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI);
+ if (!isUndefHI && V1->getOpcode() != ISD::UNDEF)
+ HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI);
+ } else {
+ // Don't emit a horizontal binop if the result is expected to be UNDEF.
+ if (!isUndefLO && (V0_LO->getOpcode() != ISD::UNDEF ||
+ V1_LO->getOpcode() != ISD::UNDEF))
+ LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO);
+
+ if (!isUndefHI && (V0_HI->getOpcode() != ISD::UNDEF ||
+ V1_HI->getOpcode() != ISD::UNDEF))
+ HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI);
+ }
+
+ return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
+}
+
+/// \brief Try to fold a build_vector that performs an 'addsub' into the
+/// sequence of 'vadd + vsub + blendi'.
+static SDValue matchAddSub(const BuildVectorSDNode *BV, SelectionDAG &DAG,
+ const X86Subtarget *Subtarget) {
+ SDLoc DL(BV);
+ EVT VT = BV->getValueType(0);
+ unsigned NumElts = VT.getVectorNumElements();
+ SDValue InVec0 = DAG.getUNDEF(VT);
+ SDValue InVec1 = DAG.getUNDEF(VT);
+
+ assert((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v4f32 ||
+ VT == MVT::v2f64) && "build_vector with an invalid type found!");
+
+ // Don't try to emit a VSELECT that cannot be lowered into a blend.
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ if (!TLI.isOperationLegalOrCustom(ISD::VSELECT, VT))
+ return SDValue();
+
+ // Odd-numbered elements in the input build vector are obtained from
+ // adding two integer/float elements.
+ // Even-numbered elements in the input build vector are obtained from
+ // subtracting two integer/float elements.
+ unsigned ExpectedOpcode = ISD::FSUB;
+ unsigned NextExpectedOpcode = ISD::FADD;
+ bool AddFound = false;
+ bool SubFound = false;
+
+ for (unsigned i = 0, e = NumElts; i != e; i++) {
+ SDValue Op = BV->getOperand(i);
+
+ // Skip 'undef' values.
+ unsigned Opcode = Op.getOpcode();
+ if (Opcode == ISD::UNDEF) {
+ std::swap(ExpectedOpcode, NextExpectedOpcode);
+ continue;
+ }
+
+ // Early exit if we found an unexpected opcode.
+ if (Opcode != ExpectedOpcode)
+ return SDValue();
+
+ SDValue Op0 = Op.getOperand(0);
+ SDValue Op1 = Op.getOperand(1);
+
+ // Try to match the following pattern:
+ // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
+ // Early exit if we cannot match that sequence.
+ if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+ Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+ !isa<ConstantSDNode>(Op0.getOperand(1)) ||
+ !isa<ConstantSDNode>(Op1.getOperand(1)) ||
+ Op0.getOperand(1) != Op1.getOperand(1))
+ return SDValue();
+
+ unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
+ if (I0 != i)
+ return SDValue();
+
+ // We found a valid add/sub node. Update the information accordingly.
+ if (i & 1)
+ AddFound = true;
+ else
+ SubFound = true;
+
+ // Update InVec0 and InVec1.
+ if (InVec0.getOpcode() == ISD::UNDEF)
+ InVec0 = Op0.getOperand(0);
+ if (InVec1.getOpcode() == ISD::UNDEF)
+ InVec1 = Op1.getOperand(0);
+
+ // Make sure that operands in input to each add/sub node always
+ // come from a same pair of vectors.
+ if (InVec0 != Op0.getOperand(0)) {
+ if (ExpectedOpcode == ISD::FSUB)
+ return SDValue();
+
+ // FADD is commutable. Try to commute the operands
+ // and then test again.
+ std::swap(Op0, Op1);
+ if (InVec0 != Op0.getOperand(0))
+ return SDValue();
+ }
+
+ if (InVec1 != Op1.getOperand(0))
+ return SDValue();
+
+ // Update the pair of expected opcodes.
+ std::swap(ExpectedOpcode, NextExpectedOpcode);
+ }
+
+ // Don't try to fold this build_vector into a VSELECT if it has
+ // too many UNDEF operands.
+ if (AddFound && SubFound && InVec0.getOpcode() != ISD::UNDEF &&
+ InVec1.getOpcode() != ISD::UNDEF) {
+ // Emit a sequence of vector add and sub followed by a VSELECT.
+ // The new VSELECT will be lowered into a BLENDI.
+ // At ISel stage, we pattern-match the sequence 'add + sub + BLENDI'
+ // and emit a single ADDSUB instruction.
+ SDValue Sub = DAG.getNode(ExpectedOpcode, DL, VT, InVec0, InVec1);
+ SDValue Add = DAG.getNode(NextExpectedOpcode, DL, VT, InVec0, InVec1);
+
+ // Construct the VSELECT mask.
+ EVT MaskVT = VT.changeVectorElementTypeToInteger();
+ EVT SVT = MaskVT.getVectorElementType();
+ unsigned SVTBits = SVT.getSizeInBits();
+ SmallVector<SDValue, 8> Ops;
+
+ for (unsigned i = 0, e = NumElts; i != e; ++i) {
+ APInt Value = i & 1 ? APInt::getNullValue(SVTBits) :
+ APInt::getAllOnesValue(SVTBits);
+ SDValue Constant = DAG.getConstant(Value, SVT);
+ Ops.push_back(Constant);
+ }
+
+ SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, DL, MaskVT, Ops);
+ return DAG.getSelect(DL, VT, Mask, Sub, Add);
+ }
+
+ return SDValue();
+}
+
+static SDValue PerformBUILD_VECTORCombine(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget *Subtarget) {
+ SDLoc DL(N);
+ EVT VT = N->getValueType(0);
+ unsigned NumElts = VT.getVectorNumElements();
+ BuildVectorSDNode *BV = cast<BuildVectorSDNode>(N);
+ SDValue InVec0, InVec1;
+
+ // Try to match an ADDSUB.
+ if ((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
+ (Subtarget->hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) {
+ SDValue Value = matchAddSub(BV, DAG, Subtarget);
+ if (Value.getNode())
+ return Value;
+ }
+
+ // Try to match horizontal ADD/SUB.
+ unsigned NumUndefsLO = 0;
+ unsigned NumUndefsHI = 0;
+ unsigned Half = NumElts/2;
+
+ // Count the number of UNDEF operands in the build_vector in input.
+ for (unsigned i = 0, e = Half; i != e; ++i)
+ if (BV->getOperand(i)->getOpcode() == ISD::UNDEF)
+ NumUndefsLO++;
+
+ for (unsigned i = Half, e = NumElts; i != e; ++i)
+ if (BV->getOperand(i)->getOpcode() == ISD::UNDEF)
+ NumUndefsHI++;
+
+ // Early exit if this is either a build_vector of all UNDEFs or all the
+ // operands but one are UNDEF.
+ if (NumUndefsLO + NumUndefsHI + 1 >= NumElts)
+ return SDValue();
+
+ if ((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget->hasSSE3()) {
+ // Try to match an SSE3 float HADD/HSUB.
+ if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
+ return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);
+
+ if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
+ return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
+ } else if ((VT == MVT::v4i32 || VT == MVT::v8i16) && Subtarget->hasSSSE3()) {
+ // Try to match an SSSE3 integer HADD/HSUB.
+ if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
+ return DAG.getNode(X86ISD::HADD, DL, VT, InVec0, InVec1);
+
+ if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
+ return DAG.getNode(X86ISD::HSUB, DL, VT, InVec0, InVec1);
+ }
+
+ if (!Subtarget->hasAVX())
+ return SDValue();
+
+ if ((VT == MVT::v8f32 || VT == MVT::v4f64)) {
+ // Try to match an AVX horizontal add/sub of packed single/double
+ // precision floating point values from 256-bit vectors.
+ SDValue InVec2, InVec3;
+ if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, Half, InVec0, InVec1) &&
+ isHorizontalBinOp(BV, ISD::FADD, DAG, Half, NumElts, InVec2, InVec3) &&
+ ((InVec0.getOpcode() == ISD::UNDEF ||
+ InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
+ ((InVec1.getOpcode() == ISD::UNDEF ||
+ InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
+ return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);
+
+ if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, Half, InVec0, InVec1) &&
+ isHorizontalBinOp(BV, ISD::FSUB, DAG, Half, NumElts, InVec2, InVec3) &&
+ ((InVec0.getOpcode() == ISD::UNDEF ||
+ InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
+ ((InVec1.getOpcode() == ISD::UNDEF ||
+ InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
+ return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
+ } else if (VT == MVT::v8i32 || VT == MVT::v16i16) {
+ // Try to match an AVX2 horizontal add/sub of signed integers.
+ SDValue InVec2, InVec3;
+ unsigned X86Opcode;
+ bool CanFold = true;
+
+ if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, Half, InVec0, InVec1) &&
+ isHorizontalBinOp(BV, ISD::ADD, DAG, Half, NumElts, InVec2, InVec3) &&
+ ((InVec0.getOpcode() == ISD::UNDEF ||
+ InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
+ ((InVec1.getOpcode() == ISD::UNDEF ||
+ InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
+ X86Opcode = X86ISD::HADD;
+ else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, Half, InVec0, InVec1) &&
+ isHorizontalBinOp(BV, ISD::SUB, DAG, Half, NumElts, InVec2, InVec3) &&
+ ((InVec0.getOpcode() == ISD::UNDEF ||
+ InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
+ ((InVec1.getOpcode() == ISD::UNDEF ||
+ InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
+ X86Opcode = X86ISD::HSUB;
+ else
+ CanFold = false;
+
+ if (CanFold) {
+ // Fold this build_vector into a single horizontal add/sub.
+ // Do this only if the target has AVX2.
+ if (Subtarget->hasAVX2())
+ return DAG.getNode(X86Opcode, DL, VT, InVec0, InVec1);
+
+ // Do not try to expand this build_vector into a pair of horizontal
+ // add/sub if we can emit a pair of scalar add/sub.
+ if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
+ return SDValue();
+
+ // Convert this build_vector into a pair of horizontal binop followed by
+ // a concat vector.
+ bool isUndefLO = NumUndefsLO == Half;
+ bool isUndefHI = NumUndefsHI == Half;
+ return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, false,
+ isUndefLO, isUndefHI);
+ }
+ }
+
+ if ((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
+ VT == MVT::v16i16) && Subtarget->hasAVX()) {
+ unsigned X86Opcode;
+ if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
+ X86Opcode = X86ISD::HADD;
+ else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
+ X86Opcode = X86ISD::HSUB;
+ else if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
+ X86Opcode = X86ISD::FHADD;
+ else if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
+ X86Opcode = X86ISD::FHSUB;
+ else
+ return SDValue();
+
+ // Don't try to expand this build_vector into a pair of horizontal add/sub
+ // if we can simply emit a pair of scalar add/sub.
+ if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
+ return SDValue();
+
+ // Convert this build_vector into two horizontal add/sub followed by
+ // a concat vector.
+ bool isUndefLO = NumUndefsLO == Half;
+ bool isUndefHI = NumUndefsHI == Half;
+ return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
+ isUndefLO, isUndefHI);
+ }
+
+ return SDValue();
+}
+
SDValue
X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
SDLoc dl(Op);
@@ -6429,38 +6882,1160 @@ static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
return LowerAVXCONCAT_VECTORS(Op, DAG);
}
-// Try to lower a shuffle node into a simple blend instruction.
-static SDValue
-LowerVECTOR_SHUFFLEtoBlend(ShuffleVectorSDNode *SVOp,
- const X86Subtarget *Subtarget, SelectionDAG &DAG) {
- SDValue V1 = SVOp->getOperand(0);
- SDValue V2 = SVOp->getOperand(1);
- SDLoc dl(SVOp);
- MVT VT = SVOp->getSimpleValueType(0);
+
+//===----------------------------------------------------------------------===//
+// Vector shuffle lowering
+//
+// This is an experimental code path for lowering vector shuffles on x86. It is
+// designed to handle arbitrary vector shuffles and blends, gracefully
+// degrading performance as necessary. It works hard to recognize idiomatic
+// shuffles and lower them to optimal instruction patterns without leaving
+// a framework that allows reasonably efficient handling of all vector shuffle
+// patterns.
+//===----------------------------------------------------------------------===//
+
+/// \brief Tiny helper function to identify a no-op mask.
+///
+/// This is a somewhat boring predicate function. It checks whether the mask
+/// array input, which is assumed to be a single-input shuffle mask of the kind
+/// used by the X86 shuffle instructions (not a fully general
+/// ShuffleVectorSDNode mask) requires any shuffles to occur. Both undef and an
+/// in-place shuffle are 'no-op's.
+static bool isNoopShuffleMask(ArrayRef<int> Mask) {
+ for (int i = 0, Size = Mask.size(); i < Size; ++i)
+ if (Mask[i] != -1 && Mask[i] != i)
+ return false;
+ return true;
+}
+
+/// \brief Helper function to classify a mask as a single-input mask.
+///
+/// This isn't a generic single-input test because in the vector shuffle
+/// lowering we canonicalize single inputs to be the first input operand. This
+/// means we can more quickly test for a single input by only checking whether
+/// an input from the second operand exists. We also assume that the size of
+/// mask corresponds to the size of the input vectors which isn't true in the
+/// fully general case.
+static bool isSingleInputShuffleMask(ArrayRef<int> Mask) {
+ for (int M : Mask)
+ if (M >= (int)Mask.size())
+ return false;
+ return true;
+}
+
+/// \brief Get a 4-lane 8-bit shuffle immediate for a mask.
+///
+/// This helper function produces an 8-bit shuffle immediate corresponding to
+/// the ubiquitous shuffle encoding scheme used in x86 instructions for
+/// shuffling 4 lanes. It can be used with most of the PSHUF instructions for
+/// example.
+///
+/// NB: We rely heavily on "undef" masks preserving the input lane.
+static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask,
+ SelectionDAG &DAG) {
+ assert(Mask.size() == 4 && "Only 4-lane shuffle masks");
+ assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!");
+ assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!");
+ assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!");
+ assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!");
+
+ unsigned Imm = 0;
+ Imm |= (Mask[0] == -1 ? 0 : Mask[0]) << 0;
+ Imm |= (Mask[1] == -1 ? 1 : Mask[1]) << 2;
+ Imm |= (Mask[2] == -1 ? 2 : Mask[2]) << 4;
+ Imm |= (Mask[3] == -1 ? 3 : Mask[3]) << 6;
+ return DAG.getConstant(Imm, MVT::i8);
+}
+
+/// \brief Handle lowering of 2-lane 64-bit floating point shuffles.
+///
+/// This is the basis function for the 2-lane 64-bit shuffles as we have full
+/// support for floating point shuffles but not integer shuffles. These
+/// instructions will incur a domain crossing penalty on some chips though so
+/// it is better to avoid lowering through this for integer vectors where
+/// possible.
+static SDValue lowerV2F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
+ const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ SDLoc DL(Op);
+ assert(Op.getSimpleValueType() == MVT::v2f64 && "Bad shuffle type!");
+ assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
+ assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
+ ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
+ ArrayRef<int> Mask = SVOp->getMask();
+ assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
+
+ if (isSingleInputShuffleMask(Mask)) {
+ // Straight shuffle of a single input vector. Simulate this by using the
+ // single input as both of the "inputs" to this instruction..
+ unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1);
+ return DAG.getNode(X86ISD::SHUFP, SDLoc(Op), MVT::v2f64, V1, V1,
+ DAG.getConstant(SHUFPDMask, MVT::i8));
+ }
+ assert(Mask[0] >= 0 && Mask[0] < 2 && "Non-canonicalized blend!");
+ assert(Mask[1] >= 2 && "Non-canonicalized blend!");
+
+ unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
+ return DAG.getNode(X86ISD::SHUFP, SDLoc(Op), MVT::v2f64, V1, V2,
+ DAG.getConstant(SHUFPDMask, MVT::i8));
+}
+
+/// \brief Handle lowering of 2-lane 64-bit integer shuffles.
+///
+/// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by
+/// the integer unit to minimize domain crossing penalties. However, for blends
+/// it falls back to the floating point shuffle operation with appropriate bit
+/// casting.
+static SDValue lowerV2I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
+ const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ SDLoc DL(Op);
+ assert(Op.getSimpleValueType() == MVT::v2i64 && "Bad shuffle type!");
+ assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
+ assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
+ ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
+ ArrayRef<int> Mask = SVOp->getMask();
+ assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
+
+ if (isSingleInputShuffleMask(Mask)) {
+ // Straight shuffle of a single input vector. For everything from SSE2
+ // onward this has a single fast instruction with no scary immediates.
+ // We have to map the mask as it is actually a v4i32 shuffle instruction.
+ V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V1);
+ int WidenedMask[4] = {
+ std::max(Mask[0], 0) * 2, std::max(Mask[0], 0) * 2 + 1,
+ std::max(Mask[1], 0) * 2, std::max(Mask[1], 0) * 2 + 1};
+ return DAG.getNode(
+ ISD::BITCAST, DL, MVT::v2i64,
+ DAG.getNode(X86ISD::PSHUFD, SDLoc(Op), MVT::v4i32, V1,
+ getV4X86ShuffleImm8ForMask(WidenedMask, DAG)));
+ }
+
+ // We implement this with SHUFPD which is pretty lame because it will likely
+ // incur 2 cycles of stall for integer vectors on Nehalem and older chips.
+ // However, all the alternatives are still more cycles and newer chips don't
+ // have this problem. It would be really nice if x86 had better shuffles here.
+ V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, V1);
+ V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, V2);
+ return DAG.getNode(ISD::BITCAST, DL, MVT::v2i64,
+ DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
+}
+
+/// \brief Lower 4-lane 32-bit floating point shuffles.
+///
+/// Uses instructions exclusively from the floating point unit to minimize
+/// domain crossing penalties, as these are sufficient to implement all v4f32
+/// shuffles.
+static SDValue lowerV4F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
+ const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ SDLoc DL(Op);
+ assert(Op.getSimpleValueType() == MVT::v4f32 && "Bad shuffle type!");
+ assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
+ assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
+ ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
+ ArrayRef<int> Mask = SVOp->getMask();
+ assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
+
+ SDValue LowV = V1, HighV = V2;
+ int NewMask[4] = {Mask[0], Mask[1], Mask[2], Mask[3]};
+
+ int NumV2Elements =
+ std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; });
+
+ if (NumV2Elements == 0)
+ // Straight shuffle of a single input vector. We pass the input vector to
+ // both operands to simulate this with a SHUFPS.
+ return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1,
+ getV4X86ShuffleImm8ForMask(Mask, DAG));
+
+ if (NumV2Elements == 1) {
+ int V2Index =
+ std::find_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; }) -
+ Mask.begin();
+ // Compute the index adjacent to V2Index and in the same half by toggling
+ // the low bit.
+ int V2AdjIndex = V2Index ^ 1;
+
+ if (Mask[V2AdjIndex] == -1) {
+ // Handles all the cases where we have a single V2 element and an undef.
+ // This will only ever happen in the high lanes because we commute the
+ // vector otherwise.
+ if (V2Index < 2)
+ std::swap(LowV, HighV);
+ NewMask[V2Index] -= 4;
+ } else {
+ // Handle the case where the V2 element ends up adjacent to a V1 element.
+ // To make this work, blend them together as the first step.
+ int V1Index = V2AdjIndex;
+ int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};
+ V2 = DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V2, V1,
+ getV4X86ShuffleImm8ForMask(BlendMask, DAG));
+
+ // Now proceed to reconstruct the final blend as we have the necessary
+ // high or low half formed.
+ if (V2Index < 2) {
+ LowV = V2;
+ HighV = V1;
+ } else {
+ HighV = V2;
+ }
+ NewMask[V1Index] = 2; // We put the V1 element in V2[2].
+ NewMask[V2Index] = 0; // We shifted the V2 element into V2[0].
+ }
+ } else if (NumV2Elements == 2) {
+ if (Mask[0] < 4 && Mask[1] < 4) {
+ // Handle the easy case where we have V1 in the low lanes and V2 in the
+ // high lanes. We never see this reversed because we sort the shuffle.
+ NewMask[2] -= 4;
+ NewMask[3] -= 4;
+ } else {
+ // We have a mixture of V1 and V2 in both low and high lanes. Rather than
+ // trying to place elements directly, just blend them and set up the final
+ // shuffle to place them.
+
+ // The first two blend mask elements are for V1, the second two are for
+ // V2.
+ int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1],
+ Mask[2] < 4 ? Mask[2] : Mask[3],
+ (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,
+ (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};
+ V1 = DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V2,
+ getV4X86ShuffleImm8ForMask(BlendMask, DAG));
+
+ // Now we do a normal shuffle of V1 by giving V1 as both operands to
+ // a blend.
+ LowV = HighV = V1;
+ NewMask[0] = Mask[0] < 4 ? 0 : 2;
+ NewMask[1] = Mask[0] < 4 ? 2 : 0;
+ NewMask[2] = Mask[2] < 4 ? 1 : 3;
+ NewMask[3] = Mask[2] < 4 ? 3 : 1;
+ }
+ }
+ return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, LowV, HighV,
+ getV4X86ShuffleImm8ForMask(NewMask, DAG));
+}
+
+/// \brief Lower 4-lane i32 vector shuffles.
+///
+/// We try to handle these with integer-domain shuffles where we can, but for
+/// blends we use the floating point domain blend instructions.
+static SDValue lowerV4I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
+ const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ SDLoc DL(Op);
+ assert(Op.getSimpleValueType() == MVT::v4i32 && "Bad shuffle type!");
+ assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
+ assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
+ ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
+ ArrayRef<int> Mask = SVOp->getMask();
+ assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
+
+ if (isSingleInputShuffleMask(Mask))
+ // Straight shuffle of a single input vector. For everything from SSE2
+ // onward this has a single fast instruction with no scary immediates.
+ return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
+ getV4X86ShuffleImm8ForMask(Mask, DAG));
+
+ // We implement this with SHUFPS because it can blend from two vectors.
+ // Because we're going to eventually use SHUFPS, we use SHUFPS even to build
+ // up the inputs, bypassing domain shift penalties that we would encur if we
+ // directly used PSHUFD on Nehalem and older. For newer chips, this isn't
+ // relevant.
+ return DAG.getNode(ISD::BITCAST, DL, MVT::v4i32,
+ DAG.getVectorShuffle(
+ MVT::v4f32, DL,
+ DAG.getNode(ISD::BITCAST, DL, MVT::v4f32, V1),
+ DAG.getNode(ISD::BITCAST, DL, MVT::v4f32, V2), Mask));
+}
+
+/// \brief Lowering of single-input v8i16 shuffles is the cornerstone of SSE2
+/// shuffle lowering, and the most complex part.
+///
+/// The lowering strategy is to try to form pairs of input lanes which are
+/// targeted at the same half of the final vector, and then use a dword shuffle
+/// to place them onto the right half, and finally unpack the paired lanes into
+/// their final position.
+///
+/// The exact breakdown of how to form these dword pairs and align them on the
+/// correct sides is really tricky. See the comments within the function for
+/// more of the details.
+static SDValue lowerV8I16SingleInputVectorShuffle(
+ SDLoc DL, SDValue V, MutableArrayRef<int> Mask,
+ const X86Subtarget *Subtarget, SelectionDAG &DAG) {
+ assert(V.getSimpleValueType() == MVT::v8i16 && "Bad input type!");
+ MutableArrayRef<int> LoMask = Mask.slice(0, 4);
+ MutableArrayRef<int> HiMask = Mask.slice(4, 4);
+
+ SmallVector<int, 4> LoInputs;
+ std::copy_if(LoMask.begin(), LoMask.end(), std::back_inserter(LoInputs),
+ [](int M) { return M >= 0; });
+ std::sort(LoInputs.begin(), LoInputs.end());
+ LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end());
+ SmallVector<int, 4> HiInputs;
+ std::copy_if(HiMask.begin(), HiMask.end(), std::back_inserter(HiInputs),
+ [](int M) { return M >= 0; });
+ std::sort(HiInputs.begin(), HiInputs.end());
+ HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end());
+ int NumLToL =
+ std::lower_bound(LoInputs.begin(), LoInputs.end(), 4) - LoInputs.begin();
+ int NumHToL = LoInputs.size() - NumLToL;
+ int NumLToH =
+ std::lower_bound(HiInputs.begin(), HiInputs.end(), 4) - HiInputs.begin();
+ int NumHToH = HiInputs.size() - NumLToH;
+ MutableArrayRef<int> LToLInputs(LoInputs.data(), NumLToL);
+ MutableArrayRef<int> LToHInputs(HiInputs.data(), NumLToH);
+ MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
+ MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);
+
+ // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
+ // such inputs we can swap two of the dwords across the half mark and end up
+ // with <=2 inputs to each half in each half. Once there, we can fall through
+ // to the generic code below. For example:
+ //
+ // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
+ // Mask: [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5]
+ //
+ // Before we had 3-1 in the low half and 3-1 in the high half. Afterward, 2-2
+ // and 2-2.
+ auto balanceSides = [&](ArrayRef<int> ThreeInputs, int OneInput,
+ int ThreeInputHalfSum, int OneInputHalfOffset) {
+ // Compute the index of dword with only one word among the three inputs in
+ // a half by taking the sum of the half with three inputs and subtracting
+ // the sum of the actual three inputs. The difference is the remaining
+ // slot.
+ int DWordA = (ThreeInputHalfSum -
+ std::accumulate(ThreeInputs.begin(), ThreeInputs.end(), 0)) /
+ 2;
+ int DWordB = OneInputHalfOffset / 2 + (OneInput / 2 + 1) % 2;
+
+ int PSHUFDMask[] = {0, 1, 2, 3};
+ PSHUFDMask[DWordA] = DWordB;
+ PSHUFDMask[DWordB] = DWordA;
+ V = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
+ DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
+ DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V),
+ getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG)));
+
+ // Adjust the mask to match the new locations of A and B.
+ for (int &M : Mask)
+ if (M != -1 && M/2 == DWordA)
+ M = 2 * DWordB + M % 2;
+ else if (M != -1 && M/2 == DWordB)
+ M = 2 * DWordA + M % 2;
+
+ // Recurse back into this routine to re-compute state now that this isn't
+ // a 3 and 1 problem.
+ return DAG.getVectorShuffle(MVT::v8i16, DL, V, DAG.getUNDEF(MVT::v8i16),
+ Mask);
+ };
+ if (NumLToL == 3 && NumHToL == 1)
+ return balanceSides(LToLInputs, HToLInputs[0], 0 + 1 + 2 + 3, 4);
+ else if (NumLToL == 1 && NumHToL == 3)
+ return balanceSides(HToLInputs, LToLInputs[0], 4 + 5 + 6 + 7, 0);
+ else if (NumLToH == 1 && NumHToH == 3)
+ return balanceSides(HToHInputs, LToHInputs[0], 4 + 5 + 6 + 7, 0);
+ else if (NumLToH == 3 && NumHToH == 1)
+ return balanceSides(LToHInputs, HToHInputs[0], 0 + 1 + 2 + 3, 4);
+
+ // At this point there are at most two inputs to the low and high halves from
+ // each half. That means the inputs can always be grouped into dwords and
+ // those dwords can then be moved to the correct half with a dword shuffle.
+ // We use at most one low and one high word shuffle to collect these paired
+ // inputs into dwords, and finally a dword shuffle to place them.
+ int PSHUFLMask[4] = {-1, -1, -1, -1};
+ int PSHUFHMask[4] = {-1, -1, -1, -1};
+ int PSHUFDMask[4] = {-1, -1, -1, -1};
+
+ // First fix the masks for all the inputs that are staying in their
+ // original halves. This will then dictate the targets of the cross-half
+ // shuffles.
+ auto fixInPlaceInputs = [&PSHUFDMask](
+ ArrayRef<int> InPlaceInputs, MutableArrayRef<int> SourceHalfMask,
+ MutableArrayRef<int> HalfMask, int HalfOffset) {
+ if (InPlaceInputs.empty())
+ return;
+ if (InPlaceInputs.size() == 1) {
+ SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
+ InPlaceInputs[0] - HalfOffset;
+ PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;
+ return;
+ }
+
+ assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!");
+ SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
+ InPlaceInputs[0] - HalfOffset;
+ // Put the second input next to the first so that they are packed into
+ // a dword. We find the adjacent index by toggling the low bit.
+ int AdjIndex = InPlaceInputs[0] ^ 1;
+ SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;
+ std::replace(HalfMask.begin(), HalfMask.end(), InPlaceInputs[1], AdjIndex);
+ PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;
+ };
+ if (!HToLInputs.empty())
+ fixInPlaceInputs(LToLInputs, PSHUFLMask, LoMask, 0);
+ if (!LToHInputs.empty())
+ fixInPlaceInputs(HToHInputs, PSHUFHMask, HiMask, 4);
+
+ // Now gather the cross-half inputs and place them into a free dword of
+ // their target half.
+ // FIXME: This operation could almost certainly be simplified dramatically to
+ // look more like the 3-1 fixing operation.
+ auto moveInputsToRightHalf = [&PSHUFDMask](
+ MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs,
+ MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask,
+ int SourceOffset, int DestOffset) {
+ auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) {
+ return SourceHalfMask[Word] != -1 && SourceHalfMask[Word] != Word;
+ };
+ auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask,
+ int Word) {
+ int LowWord = Word & ~1;
+ int HighWord = Word | 1;
+ return isWordClobbered(SourceHalfMask, LowWord) ||
+ isWordClobbered(SourceHalfMask, HighWord);
+ };
+
+ if (IncomingInputs.empty())
+ return;
+
+ if (ExistingInputs.empty()) {
+ // Map any dwords with inputs from them into the right half.
+ for (int Input : IncomingInputs) {
+ // If the source half mask maps over the inputs, turn those into
+ // swaps and use the swapped lane.
+ if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) {
+ if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] == -1) {
+ SourceHalfMask[SourceHalfMask[Input - SourceOffset]] =
+ Input - SourceOffset;
+ // We have to swap the uses in our half mask in one sweep.
+ for (int &M : HalfMask)
+ if (M == SourceHalfMask[Input - SourceOffset])
+ M = Input;
+ else if (M == Input)
+ M = SourceHalfMask[Input - SourceOffset] + SourceOffset;
+ } else {
+ assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] ==
+ Input - SourceOffset &&
+ "Previous placement doesn't match!");
+ }
+ // Note that this correctly re-maps both when we do a swap and when
+ // we observe the other side of the swap above. We rely on that to
+ // avoid swapping the members of the input list directly.
+ Input = SourceHalfMask[Input - SourceOffset] + SourceOffset;
+ }
+
+ // Map the input's dword into the correct half.
+ if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] == -1)
+ PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2;
+ else
+ assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] ==
+ Input / 2 &&
+ "Previous placement doesn't match!");
+ }
+
+ // And just directly shift any other-half mask elements to be same-half
+ // as we will have mirrored the dword containing the element into the
+ // same position within that half.
+ for (int &M : HalfMask)
+ if (M >= SourceOffset && M < SourceOffset + 4) {
+ M = M - SourceOffset + DestOffset;
+ assert(M >= 0 && "This should never wrap below zero!");
+ }
+ return;
+ }
+
+ // Ensure we have the input in a viable dword of its current half. This
+ // is particularly tricky because the original position may be clobbered
+ // by inputs being moved and *staying* in that half.
+ if (IncomingInputs.size() == 1) {
+ if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
+ int InputFixed = std::find(std::begin(SourceHalfMask),
+ std::end(SourceHalfMask), -1) -
+ std::begin(SourceHalfMask) + SourceOffset;
+ SourceHalfMask[InputFixed - SourceOffset] =
+ IncomingInputs[0] - SourceOffset;
+ std::replace(HalfMask.begin(), HalfMask.end(), IncomingInputs[0],
+ InputFixed);
+ IncomingInputs[0] = InputFixed;
+ }
+ } else if (IncomingInputs.size() == 2) {
+ if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 ||
+ isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
+ int SourceDWordBase = !isDWordClobbered(SourceHalfMask, 0) ? 0 : 2;
+ assert(!isDWordClobbered(SourceHalfMask, SourceDWordBase) &&
+ "Not all dwords can be clobbered!");
+ SourceHalfMask[SourceDWordBase] = IncomingInputs[0] - SourceOffset;
+ SourceHalfMask[SourceDWordBase + 1] = IncomingInputs[1] - SourceOffset;
+ for (int &M : HalfMask)
+ if (M == IncomingInputs[0])
+ M = SourceDWordBase + SourceOffset;
+ else if (M == IncomingInputs[1])
+ M = SourceDWordBase + 1 + SourceOffset;
+ IncomingInputs[0] = SourceDWordBase + SourceOffset;
+ IncomingInputs[1] = SourceDWordBase + 1 + SourceOffset;
+ }
+ } else {
+ llvm_unreachable("Unhandled input size!");
+ }
+
+ // Now hoist the DWord down to the right half.
+ int FreeDWord = (PSHUFDMask[DestOffset / 2] == -1 ? 0 : 1) + DestOffset / 2;
+ assert(PSHUFDMask[FreeDWord] == -1 && "DWord not free");
+ PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2;
+ for (int Input : IncomingInputs)
+ std::replace(HalfMask.begin(), HalfMask.end(), Input,
+ FreeDWord * 2 + Input % 2);
+ };
+ moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask,
+ /*SourceOffset*/ 4, /*DestOffset*/ 0);
+ moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask,
+ /*SourceOffset*/ 0, /*DestOffset*/ 4);
+
+ // Now enact all the shuffles we've computed to move the inputs into their
+ // target half.
+ if (!isNoopShuffleMask(PSHUFLMask))
+ V = DAG.getNode(X86ISD::PSHUFLW, DL, MVT::v8i16, V,
+ getV4X86ShuffleImm8ForMask(PSHUFLMask, DAG));
+ if (!isNoopShuffleMask(PSHUFHMask))
+ V = DAG.getNode(X86ISD::PSHUFHW, DL, MVT::v8i16, V,
+ getV4X86ShuffleImm8ForMask(PSHUFHMask, DAG));
+ if (!isNoopShuffleMask(PSHUFDMask))
+ V = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
+ DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
+ DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V),
+ getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG)));
+
+ // At this point, each half should contain all its inputs, and we can then
+ // just shuffle them into their final position.
+ assert(std::count_if(LoMask.begin(), LoMask.end(),
+ [](int M) { return M >= 4; }) == 0 &&
+ "Failed to lift all the high half inputs to the low mask!");
+ assert(std::count_if(HiMask.begin(), HiMask.end(),
+ [](int M) { return M >= 0 && M < 4; }) == 0 &&
+ "Failed to lift all the low half inputs to the high mask!");
+
+ // Do a half shuffle for the low mask.
+ if (!isNoopShuffleMask(LoMask))
+ V = DAG.getNode(X86ISD::PSHUFLW, DL, MVT::v8i16, V,
+ getV4X86ShuffleImm8ForMask(LoMask, DAG));
+
+ // Do a half shuffle with the high mask after shifting its values down.
+ for (int &M : HiMask)
+ if (M >= 0)
+ M -= 4;
+ if (!isNoopShuffleMask(HiMask))
+ V = DAG.getNode(X86ISD::PSHUFHW, DL, MVT::v8i16, V,
+ getV4X86ShuffleImm8ForMask(HiMask, DAG));
+
+ return V;
+}
+
+/// \brief Detect whether the mask pattern should be lowered through
+/// interleaving.
+///
+/// This essentially tests whether viewing the mask as an interleaving of two
+/// sub-sequences reduces the cross-input traffic of a blend operation. If so,
+/// lowering it through interleaving is a significantly better strategy.
+static bool shouldLowerAsInterleaving(ArrayRef<int> Mask) {
+ int NumEvenInputs[2] = {0, 0};
+ int NumOddInputs[2] = {0, 0};
+ int NumLoInputs[2] = {0, 0};
+ int NumHiInputs[2] = {0, 0};
+ for (int i = 0, Size = Mask.size(); i < Size; ++i) {
+ if (Mask[i] < 0)
+ continue;
+
+ int InputIdx = Mask[i] >= Size;
+
+ if (i < Size / 2)
+ ++NumLoInputs[InputIdx];
+ else
+ ++NumHiInputs[InputIdx];
+
+ if ((i % 2) == 0)
+ ++NumEvenInputs[InputIdx];
+ else
+ ++NumOddInputs[InputIdx];
+ }
+
+ // The minimum number of cross-input results for both the interleaved and
+ // split cases. If interleaving results in fewer cross-input results, return
+ // true.
+ int InterleavedCrosses = std::min(NumEvenInputs[1] + NumOddInputs[0],
+ NumEvenInputs[0] + NumOddInputs[1]);
+ int SplitCrosses = std::min(NumLoInputs[1] + NumHiInputs[0],
+ NumLoInputs[0] + NumHiInputs[1]);
+ return InterleavedCrosses < SplitCrosses;
+}
+
+/// \brief Blend two v8i16 vectors using a naive unpack strategy.
+///
+/// This strategy only works when the inputs from each vector fit into a single
+/// half of that vector, and generally there are not so many inputs as to leave
+/// the in-place shuffles required highly constrained (and thus expensive). It
+/// shifts all the inputs into a single side of both input vectors and then
+/// uses an unpack to interleave these inputs in a single vector. At that
+/// point, we will fall back on the generic single input shuffle lowering.
+static SDValue lowerV8I16BasicBlendVectorShuffle(SDLoc DL, SDValue V1,
+ SDValue V2,
+ MutableArrayRef<int> Mask,
+ const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad input type!");
+ assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad input type!");
+ SmallVector<int, 3> LoV1Inputs, HiV1Inputs, LoV2Inputs, HiV2Inputs;
+ for (int i = 0; i < 8; ++i)
+ if (Mask[i] >= 0 && Mask[i] < 4)
+ LoV1Inputs.push_back(i);
+ else if (Mask[i] >= 4 && Mask[i] < 8)
+ HiV1Inputs.push_back(i);
+ else if (Mask[i] >= 8 && Mask[i] < 12)
+ LoV2Inputs.push_back(i);
+ else if (Mask[i] >= 12)
+ HiV2Inputs.push_back(i);
+
+ int NumV1Inputs = LoV1Inputs.size() + HiV1Inputs.size();
+ int NumV2Inputs = LoV2Inputs.size() + HiV2Inputs.size();
+ (void)NumV1Inputs;
+ (void)NumV2Inputs;
+ assert(NumV1Inputs > 0 && NumV1Inputs <= 3 && "At most 3 inputs supported");
+ assert(NumV2Inputs > 0 && NumV2Inputs <= 3 && "At most 3 inputs supported");
+ assert(NumV1Inputs + NumV2Inputs <= 4 && "At most 4 combined inputs");
+
+ bool MergeFromLo = LoV1Inputs.size() + LoV2Inputs.size() >=
+ HiV1Inputs.size() + HiV2Inputs.size();
+
+ auto moveInputsToHalf = [&](SDValue V, ArrayRef<int> LoInputs,
+ ArrayRef<int> HiInputs, bool MoveToLo,
+ int MaskOffset) {
+ ArrayRef<int> GoodInputs = MoveToLo ? LoInputs : HiInputs;
+ ArrayRef<int> BadInputs = MoveToLo ? HiInputs : LoInputs;
+ if (BadInputs.empty())
+ return V;
+
+ int MoveMask[] = {-1, -1, -1, -1, -1, -1, -1, -1};
+ int MoveOffset = MoveToLo ? 0 : 4;
+
+ if (GoodInputs.empty()) {
+ for (int BadInput : BadInputs) {
+ MoveMask[Mask[BadInput] % 4 + MoveOffset] = Mask[BadInput] - MaskOffset;
+ Mask[BadInput] = Mask[BadInput] % 4 + MoveOffset + MaskOffset;
+ }
+ } else {
+ if (GoodInputs.size() == 2) {
+ // If the low inputs are spread across two dwords, pack them into
+ // a single dword.
+ MoveMask[Mask[GoodInputs[0]] % 2 + MoveOffset] =
+ Mask[GoodInputs[0]] - MaskOffset;
+ MoveMask[Mask[GoodInputs[1]] % 2 + MoveOffset] =
+ Mask[GoodInputs[1]] - MaskOffset;
+ Mask[GoodInputs[0]] = Mask[GoodInputs[0]] % 2 + MoveOffset + MaskOffset;
+ Mask[GoodInputs[1]] = Mask[GoodInputs[0]] % 2 + MoveOffset + MaskOffset;
+ } else {
+ // Otherwise pin the low inputs.
+ for (int GoodInput : GoodInputs)
+ MoveMask[Mask[GoodInput] - MaskOffset] = Mask[GoodInput] - MaskOffset;
+ }
+
+ int MoveMaskIdx =
+ std::find(std::begin(MoveMask) + MoveOffset, std::end(MoveMask), -1) -
+ std::begin(MoveMask);
+ assert(MoveMaskIdx >= MoveOffset && "Established above");
+
+ if (BadInputs.size() == 2) {
+ assert(MoveMask[MoveMaskIdx] == -1 && "Expected empty slot");
+ assert(MoveMask[MoveMaskIdx + 1] == -1 && "Expected empty slot");
+ MoveMask[MoveMaskIdx + Mask[BadInputs[0]] % 2] =
+ Mask[BadInputs[0]] - MaskOffset;
+ MoveMask[MoveMaskIdx + Mask[BadInputs[1]] % 2] =
+ Mask[BadInputs[1]] - MaskOffset;
+ Mask[BadInputs[0]] = MoveMaskIdx + Mask[BadInputs[0]] % 2 + MaskOffset;
+ Mask[BadInputs[1]] = MoveMaskIdx + Mask[BadInputs[1]] % 2 + MaskOffset;
+ } else {
+ assert(BadInputs.size() == 1 && "All sizes handled");
+ MoveMask[MoveMaskIdx] = Mask[BadInputs[0]] - MaskOffset;
+ Mask[BadInputs[0]] = MoveMaskIdx + MaskOffset;
+ }
+ }
+
+ return DAG.getVectorShuffle(MVT::v8i16, DL, V, DAG.getUNDEF(MVT::v8i16),
+ MoveMask);
+ };
+ V1 = moveInputsToHalf(V1, LoV1Inputs, HiV1Inputs, MergeFromLo,
+ /*MaskOffset*/ 0);
+ V2 = moveInputsToHalf(V2, LoV2Inputs, HiV2Inputs, MergeFromLo,
+ /*MaskOffset*/ 8);
+
+ // FIXME: Select an interleaving of the merge of V1 and V2 that minimizes
+ // cross-half traffic in the final shuffle.
+
+ // Munge the mask to be a single-input mask after the unpack merges the
+ // results.
+ for (int &M : Mask)
+ if (M != -1)
+ M = 2 * (M % 4) + (M / 8);
+
+ return DAG.getVectorShuffle(
+ MVT::v8i16, DL, DAG.getNode(MergeFromLo ? X86ISD::UNPCKL : X86ISD::UNPCKH,
+ DL, MVT::v8i16, V1, V2),
+ DAG.getUNDEF(MVT::v8i16), Mask);
+}
+
+/// \brief Generic lowering of 8-lane i16 shuffles.
+///
+/// This handles both single-input shuffles and combined shuffle/blends with
+/// two inputs. The single input shuffles are immediately delegated to
+/// a dedicated lowering routine.
+///
+/// The blends are lowered in one of three fundamental ways. If there are few
+/// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle
+/// of the input is significantly cheaper when lowered as an interleaving of
+/// the two inputs, try to interleave them. Otherwise, blend the low and high
+/// halves of the inputs separately (making them have relatively few inputs)
+/// and then concatenate them.
+static SDValue lowerV8I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
+ const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ SDLoc DL(Op);
+ assert(Op.getSimpleValueType() == MVT::v8i16 && "Bad shuffle type!");
+ assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
+ assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
+ ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
+ ArrayRef<int> OrigMask = SVOp->getMask();
+ int MaskStorage[8] = {OrigMask[0], OrigMask[1], OrigMask[2], OrigMask[3],
+ OrigMask[4], OrigMask[5], OrigMask[6], OrigMask[7]};
+ MutableArrayRef<int> Mask(MaskStorage);
+
+ assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
+
+ auto isV1 = [](int M) { return M >= 0 && M < 8; };
+ auto isV2 = [](int M) { return M >= 8; };
+
+ int NumV1Inputs = std::count_if(Mask.begin(), Mask.end(), isV1);
+ int NumV2Inputs = std::count_if(Mask.begin(), Mask.end(), isV2);
+
+ if (NumV2Inputs == 0)
+ return lowerV8I16SingleInputVectorShuffle(DL, V1, Mask, Subtarget, DAG);
+
+ assert(NumV1Inputs > 0 && "All single-input shuffles should be canonicalized "
+ "to be V1-input shuffles.");
+
+ if (NumV1Inputs + NumV2Inputs <= 4)
+ return lowerV8I16BasicBlendVectorShuffle(DL, V1, V2, Mask, Subtarget, DAG);
+
+ // Check whether an interleaving lowering is likely to be more efficient.
+ // This isn't perfect but it is a strong heuristic that tends to work well on
+ // the kinds of shuffles that show up in practice.
+ //
+ // FIXME: Handle 1x, 2x, and 4x interleaving.
+ if (shouldLowerAsInterleaving(Mask)) {
+ // FIXME: Figure out whether we should pack these into the low or high
+ // halves.
+
+ int EMask[8], OMask[8];
+ for (int i = 0; i < 4; ++i) {
+ EMask[i] = Mask[2*i];
+ OMask[i] = Mask[2*i + 1];
+ EMask[i + 4] = -1;
+ OMask[i + 4] = -1;
+ }
+
+ SDValue Evens = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, EMask);
+ SDValue Odds = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, OMask);
+
+ return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, Evens, Odds);
+ }
+
+ int LoBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
+ int HiBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
+
+ for (int i = 0; i < 4; ++i) {
+ LoBlendMask[i] = Mask[i];
+ HiBlendMask[i] = Mask[i + 4];
+ }
+
+ SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, LoBlendMask);
+ SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, HiBlendMask);
+ LoV = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, LoV);
+ HiV = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, HiV);
+
+ return DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
+ DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, LoV, HiV));
+}
+
+/// \brief Generic lowering of v16i8 shuffles.
+///
+/// This is a hybrid strategy to lower v16i8 vectors. It first attempts to
+/// detect any complexity reducing interleaving. If that doesn't help, it uses
+/// UNPCK to spread the i8 elements across two i16-element vectors, and uses
+/// the existing lowering for v8i16 blends on each half, finally PACK-ing them
+/// back together.
+static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
+ const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ SDLoc DL(Op);
+ assert(Op.getSimpleValueType() == MVT::v16i8 && "Bad shuffle type!");
+ assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
+ assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
+ ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
+ ArrayRef<int> OrigMask = SVOp->getMask();
+ assert(OrigMask.size() == 16 && "Unexpected mask size for v16 shuffle!");
+ int MaskStorage[16] = {
+ OrigMask[0], OrigMask[1], OrigMask[2], OrigMask[3],
+ OrigMask[4], OrigMask[5], OrigMask[6], OrigMask[7],
+ OrigMask[8], OrigMask[9], OrigMask[10], OrigMask[11],
+ OrigMask[12], OrigMask[13], OrigMask[14], OrigMask[15]};
+ MutableArrayRef<int> Mask(MaskStorage);
+ MutableArrayRef<int> LoMask = Mask.slice(0, 8);
+ MutableArrayRef<int> HiMask = Mask.slice(8, 8);
+
+ // For single-input shuffles, there are some nicer lowering tricks we can use.
+ if (isSingleInputShuffleMask(Mask)) {
+ // Check whether we can widen this to an i16 shuffle by duplicating bytes.
+ // Notably, this handles splat and partial-splat shuffles more efficiently.
+ // However, it only makes sense if the pre-duplication shuffle simplifies
+ // things significantly. Currently, this means we need to be able to
+ // express the pre-duplication shuffle as an i16 shuffle.
+ //
+ // FIXME: We should check for other patterns which can be widened into an
+ // i16 shuffle as well.
+ auto canWidenViaDuplication = [](ArrayRef<int> Mask) {
+ for (int i = 0; i < 16; i += 2) {
+ if (Mask[i] != Mask[i + 1])
+ return false;
+ }
+ return true;
+ };
+ auto tryToWidenViaDuplication = [&]() -> SDValue {
+ if (!canWidenViaDuplication(Mask))
+ return SDValue();
+ SmallVector<int, 4> LoInputs;
+ std::copy_if(Mask.begin(), Mask.end(), std::back_inserter(LoInputs),
+ [](int M) { return M >= 0 && M < 8; });
+ std::sort(LoInputs.begin(), LoInputs.end());
+ LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()),
+ LoInputs.end());
+ SmallVector<int, 4> HiInputs;
+ std::copy_if(Mask.begin(), Mask.end(), std::back_inserter(HiInputs),
+ [](int M) { return M >= 8; });
+ std::sort(HiInputs.begin(), HiInputs.end());
+ HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()),
+ HiInputs.end());
+
+ bool TargetLo = LoInputs.size() >= HiInputs.size();
+ ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
+ ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;
+
+ int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
+ SmallDenseMap<int, int, 8> LaneMap;
+ for (int I : InPlaceInputs) {
+ PreDupI16Shuffle[I/2] = I/2;
+ LaneMap[I] = I;
+ }
+ int j = TargetLo ? 0 : 4, je = j + 4;
+ for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) {
+ // Check if j is already a shuffle of this input. This happens when
+ // there are two adjacent bytes after we move the low one.
+ if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {
+ // If we haven't yet mapped the input, search for a slot into which
+ // we can map it.
+ while (j < je && PreDupI16Shuffle[j] != -1)
+ ++j;
+
+ if (j == je)
+ // We can't place the inputs into a single half with a simple i16 shuffle, so bail.
+ return SDValue();
+
+ // Map this input with the i16 shuffle.
+ PreDupI16Shuffle[j] = MovingInputs[i] / 2;
+ }
+
+ // Update the lane map based on the mapping we ended up with.
+ LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;
+ }
+ V1 = DAG.getNode(
+ ISD::BITCAST, DL, MVT::v16i8,
+ DAG.getVectorShuffle(MVT::v8i16, DL,
+ DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1),
+ DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));
+
+ // Unpack the bytes to form the i16s that will be shuffled into place.
+ V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
+ MVT::v16i8, V1, V1);
+
+ int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
+ for (int i = 0; i < 16; i += 2) {
+ if (Mask[i] != -1)
+ PostDupI16Shuffle[i / 2] = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
+ assert(PostDupI16Shuffle[i / 2] < 8 && "Invalid v8 shuffle mask!");
+ }
+ return DAG.getNode(
+ ISD::BITCAST, DL, MVT::v16i8,
+ DAG.getVectorShuffle(MVT::v8i16, DL,
+ DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1),
+ DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));
+ };
+ if (SDValue V = tryToWidenViaDuplication())
+ return V;
+ }
+
+ // Check whether an interleaving lowering is likely to be more efficient.
+ // This isn't perfect but it is a strong heuristic that tends to work well on
+ // the kinds of shuffles that show up in practice.
+ //
+ // FIXME: We need to handle other interleaving widths (i16, i32, ...).
+ if (shouldLowerAsInterleaving(Mask)) {
+ // FIXME: Figure out whether we should pack these into the low or high
+ // halves.
+
+ int EMask[16], OMask[16];
+ for (int i = 0; i < 8; ++i) {
+ EMask[i] = Mask[2*i];
+ OMask[i] = Mask[2*i + 1];
+ EMask[i + 8] = -1;
+ OMask[i + 8] = -1;
+ }
+
+ SDValue Evens = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2, EMask);
+ SDValue Odds = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2, OMask);
+
+ return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, Evens, Odds);
+ }
+
+ int V1LoBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
+ int V1HiBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
+ int V2LoBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
+ int V2HiBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
+
+ auto buildBlendMasks = [](MutableArrayRef<int> HalfMask,
+ MutableArrayRef<int> V1HalfBlendMask,
+ MutableArrayRef<int> V2HalfBlendMask) {
+ for (int i = 0; i < 8; ++i)
+ if (HalfMask[i] >= 0 && HalfMask[i] < 16) {
+ V1HalfBlendMask[i] = HalfMask[i];
+ HalfMask[i] = i;
+ } else if (HalfMask[i] >= 16) {
+ V2HalfBlendMask[i] = HalfMask[i] - 16;
+ HalfMask[i] = i + 8;
+ }
+ };
+ buildBlendMasks(LoMask, V1LoBlendMask, V2LoBlendMask);
+ buildBlendMasks(HiMask, V1HiBlendMask, V2HiBlendMask);
+
+ SDValue Zero = getZeroVector(MVT::v8i16, Subtarget, DAG, DL);
+
+ auto buildLoAndHiV8s = [&](SDValue V, MutableArrayRef<int> LoBlendMask,
+ MutableArrayRef<int> HiBlendMask) {
+ SDValue V1, V2;
+ // Check if any of the odd lanes in the v16i8 are used. If not, we can mask
+ // them out and avoid using UNPCK{L,H} to extract the elements of V as
+ // i16s.
+ if (std::none_of(LoBlendMask.begin(), LoBlendMask.end(),
+ [](int M) { return M >= 0 && M % 2 == 1; }) &&
+ std::none_of(HiBlendMask.begin(), HiBlendMask.end(),
+ [](int M) { return M >= 0 && M % 2 == 1; })) {
+ // Use a mask to drop the high bytes.
+ V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V);
+ V1 = DAG.getNode(ISD::AND, DL, MVT::v8i16, V1,
+ DAG.getConstant(0x00FF, MVT::v8i16));
+
+ // This will be a single vector shuffle instead of a blend so nuke V2.
+ V2 = DAG.getUNDEF(MVT::v8i16);
+
+ // Squash the masks to point directly into V1.
+ for (int &M : LoBlendMask)
+ if (M >= 0)
+ M /= 2;
+ for (int &M : HiBlendMask)
+ if (M >= 0)
+ M /= 2;
+ } else {
+ // Otherwise just unpack the low half of V into V1 and the high half into
+ // V2 so that we can blend them as i16s.
+ V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
+ DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
+ V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
+ DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
+ }
+
+ SDValue BlendedLo = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, LoBlendMask);
+ SDValue BlendedHi = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, HiBlendMask);
+ return std::make_pair(BlendedLo, BlendedHi);
+ };
+ SDValue V1Lo, V1Hi, V2Lo, V2Hi;
+ std::tie(V1Lo, V1Hi) = buildLoAndHiV8s(V1, V1LoBlendMask, V1HiBlendMask);
+ std::tie(V2Lo, V2Hi) = buildLoAndHiV8s(V2, V2LoBlendMask, V2HiBlendMask);
+
+ SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, V1Lo, V2Lo, LoMask);
+ SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, V1Hi, V2Hi, HiMask);
+
+ return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV);
+}
+
+/// \brief Dispatching routine to lower various 128-bit x86 vector shuffles.
+///
+/// This routine breaks down the specific type of 128-bit shuffle and
+/// dispatches to the lowering routines accordingly.
+static SDValue lower128BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2,
+ MVT VT, const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ switch (VT.SimpleTy) {
+ case MVT::v2i64:
+ return lowerV2I64VectorShuffle(Op, V1, V2, Subtarget, DAG);
+ case MVT::v2f64:
+ return lowerV2F64VectorShuffle(Op, V1, V2, Subtarget, DAG);
+ case MVT::v4i32:
+ return lowerV4I32VectorShuffle(Op, V1, V2, Subtarget, DAG);
+ case MVT::v4f32:
+ return lowerV4F32VectorShuffle(Op, V1, V2, Subtarget, DAG);
+ case MVT::v8i16:
+ return lowerV8I16VectorShuffle(Op, V1, V2, Subtarget, DAG);
+ case MVT::v16i8:
+ return lowerV16I8VectorShuffle(Op, V1, V2, Subtarget, DAG);
+
+ default:
+ llvm_unreachable("Unimplemented!");
+ }
+}
+
+/// \brief Tiny helper function to test whether adjacent masks are sequential.
+static bool areAdjacentMasksSequential(ArrayRef<int> Mask) {
+ for (int i = 0, Size = Mask.size(); i < Size; i += 2)
+ if (Mask[i] + 1 != Mask[i+1])
+ return false;
+
+ return true;
+}
+
+/// \brief Top-level lowering for x86 vector shuffles.
+///
+/// This handles decomposition, canonicalization, and lowering of all x86
+/// vector shuffles. Most of the specific lowering strategies are encapsulated
+/// above in helper routines. The canonicalization attempts to widen shuffles
+/// to involve fewer lanes of wider elements, consolidate symmetric patterns
+/// s.t. only one of the two inputs needs to be tested, etc.
+static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
+ ArrayRef<int> Mask = SVOp->getMask();
+ SDValue V1 = Op.getOperand(0);
+ SDValue V2 = Op.getOperand(1);
+ MVT VT = Op.getSimpleValueType();
+ int NumElements = VT.getVectorNumElements();
+ SDLoc dl(Op);
+
+ assert(VT.getSizeInBits() != 64 && "Can't lower MMX shuffles");
+
+ bool V1IsUndef = V1.getOpcode() == ISD::UNDEF;
+ bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
+ if (V1IsUndef && V2IsUndef)
+ return DAG.getUNDEF(VT);
+
+ // When we create a shuffle node we put the UNDEF node to second operand,
+ // but in some cases the first operand may be transformed to UNDEF.
+ // In this case we should just commute the node.
+ if (V1IsUndef)
+ return CommuteVectorShuffle(SVOp, DAG);
+
+ // Check for non-undef masks pointing at an undef vector and make the masks
+ // undef as well. This makes it easier to match the shuffle based solely on
+ // the mask.
+ if (V2IsUndef)
+ for (int M : Mask)
+ if (M >= NumElements) {
+ SmallVector<int, 8> NewMask(Mask.begin(), Mask.end());
+ for (int &M : NewMask)
+ if (M >= NumElements)
+ M = -1;
+ return DAG.getVectorShuffle(VT, dl, V1, V2, NewMask);
+ }
+
+ // For integer vector shuffles, try to collapse them into a shuffle of fewer
+ // lanes but wider integers. We cap this to not form integers larger than i64
+ // but it might be interesting to form i128 integers to handle flipping the
+ // low and high halves of AVX 256-bit vectors.
+ if (VT.isInteger() && VT.getScalarSizeInBits() < 64 &&
+ areAdjacentMasksSequential(Mask)) {
+ SmallVector<int, 8> NewMask;
+ for (int i = 0, Size = Mask.size(); i < Size; i += 2)
+ NewMask.push_back(Mask[i] / 2);
+ MVT NewVT =
+ MVT::getVectorVT(MVT::getIntegerVT(VT.getScalarSizeInBits() * 2),
+ VT.getVectorNumElements() / 2);
+ V1 = DAG.getNode(ISD::BITCAST, dl, NewVT, V1);
+ V2 = DAG.getNode(ISD::BITCAST, dl, NewVT, V2);
+ return DAG.getNode(ISD::BITCAST, dl, VT,
+ DAG.getVectorShuffle(NewVT, dl, V1, V2, NewMask));
+ }
+
+ int NumV1Elements = 0, NumUndefElements = 0, NumV2Elements = 0;
+ for (int M : SVOp->getMask())
+ if (M < 0)
+ ++NumUndefElements;
+ else if (M < NumElements)
+ ++NumV1Elements;
+ else
+ ++NumV2Elements;
+
+ // Commute the shuffle as needed such that more elements come from V1 than
+ // V2. This allows us to match the shuffle pattern strictly on how many
+ // elements come from V1 without handling the symmetric cases.
+ if (NumV2Elements > NumV1Elements)
+ return CommuteVectorShuffle(SVOp, DAG);
+
+ // When the number of V1 and V2 elements are the same, try to minimize the
+ // number of uses of V2 in the low half of the vector.
+ if (NumV1Elements == NumV2Elements) {
+ int LowV1Elements = 0, LowV2Elements = 0;
+ for (int M : SVOp->getMask().slice(0, NumElements / 2))
+ if (M >= NumElements)
+ ++LowV2Elements;
+ else if (M >= 0)
+ ++LowV1Elements;
+ if (LowV2Elements > LowV1Elements)
+ return CommuteVectorShuffle(SVOp, DAG);
+ }
+
+ // For each vector width, delegate to a specialized lowering routine.
+ if (VT.getSizeInBits() == 128)
+ return lower128BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG);
+
+ llvm_unreachable("Unimplemented!");
+}
+
+
+//===----------------------------------------------------------------------===//
+// Legacy vector shuffle lowering
+//
+// This code is the legacy code handling vector shuffles until the above
+// replaces its functionality and performance.
+//===----------------------------------------------------------------------===//
+
+static bool isBlendMask(ArrayRef<int> MaskVals, MVT VT, bool hasSSE41,
+ bool hasInt256, unsigned *MaskOut = nullptr) {
MVT EltVT = VT.getVectorElementType();
- unsigned NumElems = VT.getVectorNumElements();
// There is no blend with immediate in AVX-512.
if (VT.is512BitVector())
- return SDValue();
+ return false;
- if (!Subtarget->hasSSE41() || EltVT == MVT::i8)
- return SDValue();
- if (!Subtarget->hasInt256() && VT == MVT::v16i16)
- return SDValue();
+ if (!hasSSE41 || EltVT == MVT::i8)
+ return false;
+ if (!hasInt256 && VT == MVT::v16i16)
+ return false;
- // Check the mask for BLEND and build the value.
unsigned MaskValue = 0;
+ unsigned NumElems = VT.getVectorNumElements();
// There are 2 lanes if (NumElems > 8), and 1 lane otherwise.
- unsigned NumLanes = (NumElems-1)/8 + 1;
+ unsigned NumLanes = (NumElems - 1) / 8 + 1;
unsigned NumElemsInLane = NumElems / NumLanes;
// Blend for v16i16 should be symetric for the both lanes.
for (unsigned i = 0; i < NumElemsInLane; ++i) {
- int SndLaneEltIdx = (NumLanes == 2) ?
- SVOp->getMaskElt(i + NumElemsInLane) : -1;
- int EltIdx = SVOp->getMaskElt(i);
+ int SndLaneEltIdx = (NumLanes == 2) ? MaskVals[i + NumElemsInLane] : -1;
+ int EltIdx = MaskVals[i];
if ((EltIdx < 0 || EltIdx == (int)i) &&
(SndLaneEltIdx < 0 || SndLaneEltIdx == (int)(i + NumElemsInLane)))
@@ -6469,11 +8044,34 @@ LowerVECTOR_SHUFFLEtoBlend(ShuffleVectorSDNode *SVOp,
if (((unsigned)EltIdx == (i + NumElems)) &&
(SndLaneEltIdx < 0 ||
(unsigned)SndLaneEltIdx == i + NumElems + NumElemsInLane))
- MaskValue |= (1<<i);
+ MaskValue |= (1 << i);
else
- return SDValue();
+ return false;
}
+ if (MaskOut)
+ *MaskOut = MaskValue;
+ return true;
+}
+
+// Try to lower a shuffle node into a simple blend instruction.
+// This function assumes isBlendMask returns true for this
+// SuffleVectorSDNode
+static SDValue LowerVECTOR_SHUFFLEtoBlend(ShuffleVectorSDNode *SVOp,
+ unsigned MaskValue,
+ const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ MVT VT = SVOp->getSimpleValueType(0);
+ MVT EltVT = VT.getVectorElementType();
+ assert(isBlendMask(SVOp->getMask(), VT, Subtarget->hasSSE41(),
+ Subtarget->hasInt256() && "Trying to lower a "
+ "VECTOR_SHUFFLE to a Blend but "
+ "with the wrong mask"));
+ SDValue V1 = SVOp->getOperand(0);
+ SDValue V2 = SVOp->getOperand(1);
+ SDLoc dl(SVOp);
+ unsigned NumElems = VT.getVectorNumElements();
+
// Convert i32 vectors to floating point if it is not AVX2.
// AVX2 introduced VPBLENDD instruction for 128 and 256-bit vectors.
MVT BlendVT = VT;
@@ -7450,8 +9048,9 @@ static SDValue getINSERTPS(ShuffleVectorSDNode *SVOp, SDLoc &dl,
assert((VT == MVT::v4f32 || VT == MVT::v4i32) &&
"unsupported vector type for insertps/pinsrd");
- int FromV1 = std::count_if(Mask.begin(), Mask.end(),
- [](const int &i) { return i < 4; });
+ auto FromV1Predicate = [](const int &i) { return i < 4 && i > -1; };
+ auto FromV2Predicate = [](const int &i) { return i >= 4; };
+ int FromV1 = std::count_if(Mask.begin(), Mask.end(), FromV1Predicate);
SDValue From;
SDValue To;
@@ -7459,23 +9058,26 @@ static SDValue getINSERTPS(ShuffleVectorSDNode *SVOp, SDLoc &dl,
if (FromV1 == 1) {
From = V1;
To = V2;
- DestIndex = std::find_if(Mask.begin(), Mask.end(),
- [](const int &i) { return i < 4; }) -
+ DestIndex = std::find_if(Mask.begin(), Mask.end(), FromV1Predicate) -
Mask.begin();
} else {
+ assert(std::count_if(Mask.begin(), Mask.end(), FromV2Predicate) == 1 &&
+ "More than one element from V1 and from V2, or no elements from one "
+ "of the vectors. This case should not have returned true from "
+ "isINSERTPSMask");
From = V2;
To = V1;
- DestIndex = std::find_if(Mask.begin(), Mask.end(),
- [](const int &i) { return i >= 4; }) -
- Mask.begin();
+ DestIndex =
+ std::find_if(Mask.begin(), Mask.end(), FromV2Predicate) - Mask.begin();
}
+ unsigned SrcIndex = Mask[DestIndex] % 4;
if (MayFoldLoad(From)) {
// Trivial case, when From comes from a load and is only used by the
// shuffle. Make it use insertps from the vector that we need from that
// load.
SDValue NewLoad =
- NarrowVectorLoadToElement(cast<LoadSDNode>(From), DestIndex, DAG);
+ NarrowVectorLoadToElement(cast<LoadSDNode>(From), SrcIndex, DAG);
if (!NewLoad.getNode())
return SDValue();
@@ -7496,7 +9098,6 @@ static SDValue getINSERTPS(ShuffleVectorSDNode *SVOp, SDLoc &dl,
}
// Vector-element-to-vector
- unsigned SrcIndex = Mask[DestIndex] % 4;
SDValue InsertpsMask = DAG.getIntPtrConstant(DestIndex << 4 | SrcIndex << 6);
return DAG.getNode(X86ISD::INSERTPS, dl, VT, To, From, InsertpsMask);
}
@@ -7663,6 +9264,11 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
bool OptForSize = MF.getFunction()->getAttributes().
hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize);
+ // Check if we should use the experimental vector shuffle lowering. If so,
+ // delegate completely to that code path.
+ if (ExperimentalVectorShuffleLowering)
+ return lowerVectorShuffle(Op, Subtarget, DAG);
+
assert(VT.getSizeInBits() != 64 && "Can't lower MMX shuffles");
if (V1IsUndef && V2IsUndef)
@@ -7796,8 +9402,13 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
bool Commuted = false;
// FIXME: This should also accept a bitcast of a splat? Be careful, not
// 1,1,1,1 -> v8i16 though.
- V1IsSplat = isSplatVector(V1.getNode());
- V2IsSplat = isSplatVector(V2.getNode());
+ BitVector UndefElements;
+ if (auto *BVOp = dyn_cast<BuildVectorSDNode>(V1.getNode()))
+ if (BVOp->getConstantSplatNode(&UndefElements) && UndefElements.none())
+ V1IsSplat = true;
+ if (auto *BVOp = dyn_cast<BuildVectorSDNode>(V2.getNode()))
+ if (BVOp->getConstantSplatNode(&UndefElements) && UndefElements.none())
+ V2IsSplat = true;
// Canonicalize the splat or undef, if present, to be on the RHS.
if (!V2IsUndef && V1IsSplat && !V2IsSplat) {
@@ -7873,6 +9484,11 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
getShufflePSHUFLWImmediate(SVOp),
DAG);
+ unsigned MaskValue;
+ if (isBlendMask(M, VT, Subtarget->hasSSE41(), Subtarget->hasInt256(),
+ &MaskValue))
+ return LowerVECTOR_SHUFFLEtoBlend(SVOp, MaskValue, Subtarget, DAG);
+
if (isSHUFPMask(M, VT))
return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V1, V2,
getShuffleSHUFImmediate(SVOp), DAG);
@@ -7910,10 +9526,6 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
return getTargetShuffleNode(X86ISD::VPERM2X128, dl, VT, V1,
V2, getShuffleVPERM2X128Immediate(SVOp), DAG);
- SDValue BlendOp = LowerVECTOR_SHUFFLEtoBlend(SVOp, Subtarget, DAG);
- if (BlendOp.getNode())
- return BlendOp;
-
if (Subtarget->hasSSE41() && isINSERTPSMask(M, VT))
return getINSERTPS(SVOp, dl, DAG);
@@ -8530,7 +10142,7 @@ X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
// global base reg.
unsigned char OpFlag = 0;
unsigned WrapperKind = X86ISD::Wrapper;
- CodeModel::Model M = getTargetMachine().getCodeModel();
+ CodeModel::Model M = DAG.getTarget().getCodeModel();
if (Subtarget->isPICStyleRIPRel() &&
(M == CodeModel::Small || M == CodeModel::Kernel))
@@ -8563,7 +10175,7 @@ SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
// global base reg.
unsigned char OpFlag = 0;
unsigned WrapperKind = X86ISD::Wrapper;
- CodeModel::Model M = getTargetMachine().getCodeModel();
+ CodeModel::Model M = DAG.getTarget().getCodeModel();
if (Subtarget->isPICStyleRIPRel() &&
(M == CodeModel::Small || M == CodeModel::Kernel))
@@ -8596,7 +10208,7 @@ X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const {
// global base reg.
unsigned char OpFlag = 0;
unsigned WrapperKind = X86ISD::Wrapper;
- CodeModel::Model M = getTargetMachine().getCodeModel();
+ CodeModel::Model M = DAG.getTarget().getCodeModel();
if (Subtarget->isPICStyleRIPRel() &&
(M == CodeModel::Small || M == CodeModel::Kernel)) {
@@ -8617,7 +10229,7 @@ X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const {
Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
// With PIC, the address is actually $g + Offset.
- if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
+ if (DAG.getTarget().getRelocationModel() == Reloc::PIC_ &&
!Subtarget->is64Bit()) {
Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
DAG.getNode(X86ISD::GlobalBaseReg,
@@ -8639,7 +10251,7 @@ X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
// Create the TargetBlockAddressAddress node.
unsigned char OpFlags =
Subtarget->ClassifyBlockAddressReference();
- CodeModel::Model M = getTargetMachine().getCodeModel();
+ CodeModel::Model M = DAG.getTarget().getCodeModel();
const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
SDLoc dl(Op);
@@ -8668,8 +10280,8 @@ X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, SDLoc dl,
// Create the TargetGlobalAddress node, folding in the constant
// offset if it is legal.
unsigned char OpFlags =
- Subtarget->ClassifyGlobalReference(GV, getTargetMachine());
- CodeModel::Model M = getTargetMachine().getCodeModel();
+ Subtarget->ClassifyGlobalReference(GV, DAG.getTarget());
+ CodeModel::Model M = DAG.getTarget().getCodeModel();
SDValue Result;
if (OpFlags == X86II::MO_NO_FLAG &&
X86::isOffsetSuitableForCodeModel(Offset, M)) {
@@ -8868,7 +10480,7 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
const GlobalValue *GV = GA->getGlobal();
if (Subtarget->isTargetELF()) {
- TLSModel::Model model = getTargetMachine().getTLSModel(GV);
+ TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
switch (model) {
case TLSModel::GeneralDynamic:
@@ -8880,9 +10492,9 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
Subtarget->is64Bit());
case TLSModel::InitialExec:
case TLSModel::LocalExec:
- return LowerToTLSExecModel(GA, DAG, getPointerTy(), model,
- Subtarget->is64Bit(),
- getTargetMachine().getRelocationModel() == Reloc::PIC_);
+ return LowerToTLSExecModel(
+ GA, DAG, getPointerTy(), model, Subtarget->is64Bit(),
+ DAG.getTarget().getRelocationModel() == Reloc::PIC_);
}
llvm_unreachable("Unknown TLS model.");
}
@@ -8895,8 +10507,8 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
// In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
// global base reg.
- bool PIC32 = (getTargetMachine().getRelocationModel() == Reloc::PIC_) &&
- !Subtarget->is64Bit();
+ bool PIC32 = (DAG.getTarget().getRelocationModel() == Reloc::PIC_) &&
+ !Subtarget->is64Bit();
if (PIC32)
OpFlag = X86II::MO_TLVP_PIC_BASE;
else
@@ -10050,10 +11662,27 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, SDLoc dl,
break;
case X86::COND_G: case X86::COND_GE:
case X86::COND_L: case X86::COND_LE:
- case X86::COND_O: case X86::COND_NO:
- NeedOF = true;
+ case X86::COND_O: case X86::COND_NO: {
+ // Check if we really need to set the
+ // Overflow flag. If NoSignedWrap is present
+ // that is not actually needed.
+ switch (Op->getOpcode()) {
+ case ISD::ADD:
+ case ISD::SUB:
+ case ISD::MUL:
+ case ISD::SHL: {
+ const BinaryWithFlagsSDNode *BinNode =
+ cast<BinaryWithFlagsSDNode>(Op.getNode());
+ if (BinNode->hasNoSignedWrap())
+ break;
+ }
+ default:
+ NeedOF = true;
+ break;
+ }
break;
}
+ }
// See if we can use the EFLAGS value from the operand instead of
// doing a separate TEST. TEST always sets OF and CF to 0, so unless
// we prove that the arithmetic won't overflow, we can't use OF or CF.
@@ -10115,14 +11744,14 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, SDLoc dl,
if (ConstantSDNode *C =
dyn_cast<ConstantSDNode>(ArithOp.getNode()->getOperand(1))) {
// An add of one will be selected as an INC.
- if (C->getAPIntValue() == 1) {
+ if (C->getAPIntValue() == 1 && !Subtarget->slowIncDec()) {
Opcode = X86ISD::INC;
NumOperands = 1;
break;
}
// An add of negative one (subtract of one) will be selected as a DEC.
- if (C->getAPIntValue().isAllOnesValue()) {
+ if (C->getAPIntValue().isAllOnesValue() && !Subtarget->slowIncDec()) {
Opcode = X86ISD::DEC;
NumOperands = 1;
break;
@@ -10138,7 +11767,7 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, SDLoc dl,
// If we have a constant logical shift that's only used in a comparison
// against zero turn it into an equivalent AND. This allows turning it into
// a TEST instruction later.
- if ((X86CC == X86::COND_E || X86CC == X86::COND_NE) &&
+ if ((X86CC == X86::COND_E || X86CC == X86::COND_NE) && Op->hasOneUse() &&
isa<ConstantSDNode>(Op->getOperand(1)) && !hasNonFlagsUse(Op)) {
EVT VT = Op.getValueType();
unsigned BitWidth = VT.getSizeInBits();
@@ -11469,8 +13098,9 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
}
if (addTest) {
- CC = DAG.getConstant(X86::COND_NE, MVT::i8);
- Cond = EmitTest(Cond, X86::COND_NE, dl, DAG);
+ X86::CondCode X86Cond = Inverted ? X86::COND_E : X86::COND_NE;
+ CC = DAG.getConstant(X86Cond, MVT::i8);
+ Cond = EmitTest(Cond, X86Cond, dl, DAG);
}
Cond = ConvertCmpIfNecessary(Cond, DAG);
return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
@@ -11513,7 +13143,7 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
Chain = SP.getValue(1);
unsigned Align = cast<ConstantSDNode>(Tmp3)->getZExtValue();
- const TargetFrameLowering &TFI = *getTargetMachine().getFrameLowering();
+ const TargetFrameLowering &TFI = *DAG.getTarget().getFrameLowering();
unsigned StackAlign = TFI.getStackAlignment();
Tmp1 = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
if (Align > StackAlign)
@@ -11572,7 +13202,7 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Flag);
const X86RegisterInfo *RegInfo =
- static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo());
+ static_cast<const X86RegisterInfo*>(DAG.getTarget().getRegisterInfo());
unsigned SPReg = RegInfo->getStackRegister();
SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy);
Chain = SP.getValue(1);
@@ -11681,7 +13311,7 @@ SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
if (ArgMode == 2) {
// Sanity Check: Make sure using fp_offset makes sense.
- assert(!getTargetMachine().Options.UseSoftFloat &&
+ assert(!DAG.getTarget().Options.UseSoftFloat &&
!(DAG.getMachineFunction()
.getFunction()->getAttributes()
.hasAttribute(AttributeSet::FunctionIndex,
@@ -12158,11 +13788,37 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) {
Op.getOperand(1), Op.getOperand(2));
}
+ case Intrinsic::x86_sse2_packssdw_128:
+ case Intrinsic::x86_sse2_packsswb_128:
+ case Intrinsic::x86_avx2_packssdw:
+ case Intrinsic::x86_avx2_packsswb:
+ return DAG.getNode(X86ISD::PACKSS, dl, Op.getValueType(),
+ Op.getOperand(1), Op.getOperand(2));
+
+ case Intrinsic::x86_sse2_packuswb_128:
+ case Intrinsic::x86_sse41_packusdw:
+ case Intrinsic::x86_avx2_packuswb:
+ case Intrinsic::x86_avx2_packusdw:
+ return DAG.getNode(X86ISD::PACKUS, dl, Op.getValueType(),
+ Op.getOperand(1), Op.getOperand(2));
+
case Intrinsic::x86_ssse3_pshuf_b_128:
case Intrinsic::x86_avx2_pshuf_b:
return DAG.getNode(X86ISD::PSHUFB, dl, Op.getValueType(),
Op.getOperand(1), Op.getOperand(2));
+ case Intrinsic::x86_sse2_pshuf_d:
+ return DAG.getNode(X86ISD::PSHUFD, dl, Op.getValueType(),
+ Op.getOperand(1), Op.getOperand(2));
+
+ case Intrinsic::x86_sse2_pshufl_w:
+ return DAG.getNode(X86ISD::PSHUFLW, dl, Op.getValueType(),
+ Op.getOperand(1), Op.getOperand(2));
+
+ case Intrinsic::x86_sse2_pshufh_w:
+ return DAG.getNode(X86ISD::PSHUFHW, dl, Op.getValueType(),
+ Op.getOperand(1), Op.getOperand(2));
+
case Intrinsic::x86_ssse3_psign_b_128:
case Intrinsic::x86_ssse3_psign_w_128:
case Intrinsic::x86_ssse3_psign_d_128:
@@ -12610,6 +14266,51 @@ static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
return SDValue(Res, 0);
}
+// getReadPerformanceCounter - Handles the lowering of builtin intrinsics that
+// read performance monitor counters (x86_rdpmc).
+static void getReadPerformanceCounter(SDNode *N, SDLoc DL,
+ SelectionDAG &DAG, const X86Subtarget *Subtarget,
+ SmallVectorImpl<SDValue> &Results) {
+ assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
+ SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
+ SDValue LO, HI;
+
+ // The ECX register is used to select the index of the performance counter
+ // to read.
+ SDValue Chain = DAG.getCopyToReg(N->getOperand(0), DL, X86::ECX,
+ N->getOperand(2));
+ SDValue rd = DAG.getNode(X86ISD::RDPMC_DAG, DL, Tys, Chain);
+
+ // Reads the content of a 64-bit performance counter and returns it in the
+ // registers EDX:EAX.
+ if (Subtarget->is64Bit()) {
+ LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1));
+ HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
+ LO.getValue(2));
+ } else {
+ LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1));
+ HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
+ LO.getValue(2));
+ }
+ Chain = HI.getValue(1);
+
+ if (Subtarget->is64Bit()) {
+ // The EAX register is loaded with the low-order 32 bits. The EDX register
+ // is loaded with the supported high-order bits of the counter.
+ SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
+ DAG.getConstant(32, MVT::i8));
+ Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
+ Results.push_back(Chain);
+ return;
+ }
+
+ // Use a buildpair to merge the two 32-bit values into a 64-bit one.
+ SDValue Ops[] = { LO, HI };
+ SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
+ Results.push_back(Pair);
+ Results.push_back(Chain);
+}
+
// getReadTimeStampCounter - Handles the lowering of builtin intrinsics that
// read the time stamp counter (x86_rdtsc and x86_rdtscp). This function is
// also used to custom lower READCYCLECOUNTER nodes.
@@ -12674,7 +14375,7 @@ static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget *Subtarget,
}
enum IntrinsicType {
- GATHER, SCATTER, PREFETCH, RDSEED, RDRAND, RDTSC, XTEST
+ GATHER, SCATTER, PREFETCH, RDSEED, RDRAND, RDPMC, RDTSC, XTEST
};
struct IntrinsicData {
@@ -12768,6 +14469,8 @@ static void InitIntinsicsMap() {
IntrinsicData(RDTSC, X86ISD::RDTSC_DAG, 0)));
IntrMap.insert(std::make_pair(Intrinsic::x86_rdtscp,
IntrinsicData(RDTSC, X86ISD::RDTSCP_DAG, 0)));
+ IntrMap.insert(std::make_pair(Intrinsic::x86_rdpmc,
+ IntrinsicData(RDPMC, X86ISD::RDPMC_DAG, 0)));
Initialized = true;
}
@@ -12826,7 +14529,7 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
case PREFETCH: {
SDValue Hint = Op.getOperand(6);
unsigned HintVal;
- if (dyn_cast<ConstantSDNode> (Hint) == 0 ||
+ if (dyn_cast<ConstantSDNode> (Hint) == nullptr ||
(HintVal = dyn_cast<ConstantSDNode> (Hint)->getZExtValue()) > 1)
llvm_unreachable("Wrong prefetch hint in intrinsic: should be 0 or 1");
unsigned Opcode = (HintVal ? Intr.Opc1 : Intr.Opc0);
@@ -12843,6 +14546,12 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
getReadTimeStampCounter(Op.getNode(), dl, Intr.Opc0, DAG, Subtarget, Results);
return DAG.getMergeValues(Results, dl);
}
+ // Read Performance Monitoring Counters.
+ case RDPMC: {
+ SmallVector<SDValue, 2> Results;
+ getReadPerformanceCounter(Op.getNode(), dl, DAG, Subtarget, Results);
+ return DAG.getMergeValues(Results, dl);
+ }
// XTEST intrinsics.
case XTEST: {
SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
@@ -12873,7 +14582,7 @@ SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
if (Depth > 0) {
SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
const X86RegisterInfo *RegInfo =
- static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo());
+ static_cast<const X86RegisterInfo*>(DAG.getTarget().getRegisterInfo());
SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), PtrVT);
return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
DAG.getNode(ISD::ADD, dl, PtrVT,
@@ -12895,7 +14604,7 @@ SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
SDLoc dl(Op); // FIXME probably not meaningful
unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
const X86RegisterInfo *RegInfo =
- static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo());
+ static_cast<const X86RegisterInfo*>(DAG.getTarget().getRegisterInfo());
unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
(FrameReg == X86::EBP && VT == MVT::i32)) &&
@@ -12924,7 +14633,7 @@ unsigned X86TargetLowering::getRegisterByName(const char* RegName,
SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
SelectionDAG &DAG) const {
const X86RegisterInfo *RegInfo =
- static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo());
+ static_cast<const X86RegisterInfo*>(DAG.getTarget().getRegisterInfo());
return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize());
}
@@ -12936,7 +14645,7 @@ SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
EVT PtrVT = getPointerTy();
const X86RegisterInfo *RegInfo =
- static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo());
+ static_cast<const X86RegisterInfo*>(DAG.getTarget().getRegisterInfo());
unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||
(FrameReg == X86::EBP && PtrVT == MVT::i32)) &&
@@ -12983,7 +14692,7 @@ SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
SDLoc dl (Op);
const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
- const TargetRegisterInfo* TRI = getTargetMachine().getRegisterInfo();
+ const TargetRegisterInfo* TRI = DAG.getTarget().getRegisterInfo();
if (Subtarget->is64Bit()) {
SDValue OutChains[6];
@@ -13431,7 +15140,7 @@ SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) cons
CLI.setDebugLoc(dl).setChain(InChain)
.setCallee(getLibcallCallingConv(LC),
static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()),
- Callee, &Args, 0)
+ Callee, std::move(Args), 0)
.setInRegister().setSExtResult(isSigned).setZExtResult(!isSigned);
std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
@@ -13448,7 +15157,7 @@ static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget *Subtarget,
(VT == MVT::v8i32 && Subtarget->hasInt256()));
// Get the high parts.
- const int Mask[] = {1, 2, 3, 4, 5, 6, 7, 8};
+ const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1};
SDValue Hi0 = DAG.getVectorShuffle(VT, dl, Op0, Op0, Mask);
SDValue Hi1 = DAG.getVectorShuffle(VT, dl, Op1, Op1, Mask);
@@ -13464,10 +15173,18 @@ static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget *Subtarget,
DAG.getNode(Opcode, dl, MulVT, Hi0, Hi1));
// Shuffle it back into the right order.
- const int HighMask[] = {1, 5, 3, 7, 9, 13, 11, 15};
- SDValue Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask);
- const int LowMask[] = {0, 4, 2, 6, 8, 12, 10, 14};
- SDValue Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask);
+ SDValue Highs, Lows;
+ if (VT == MVT::v8i32) {
+ const int HighMask[] = {1, 9, 3, 11, 5, 13, 7, 15};
+ Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask);
+ const int LowMask[] = {0, 8, 2, 10, 4, 12, 6, 14};
+ Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask);
+ } else {
+ const int HighMask[] = {1, 5, 3, 7};
+ Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask);
+ const int LowMask[] = {0, 4, 2, 6};
+ Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask);
+ }
// If we have a signed multiply but no PMULDQ fix up the high parts of a
// unsigned multiply.
@@ -13494,10 +15211,9 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
SDValue Amt = Op.getOperand(1);
// Optimize shl/srl/sra with constant shift amount.
- if (isSplatVector(Amt.getNode())) {
- SDValue SclrAmt = Amt->getOperand(0);
- if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(SclrAmt)) {
- uint64_t ShiftAmt = C->getZExtValue();
+ if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) {
+ if (auto *ShiftConst = BVAmt->getConstantSplatNode()) {
+ uint64_t ShiftAmt = ShiftConst->getZExtValue();
if (VT == MVT::v2i64 || VT == MVT::v4i32 || VT == MVT::v8i16 ||
(Subtarget->hasInt256() &&
@@ -13804,15 +15520,14 @@ static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget,
SelectionDAG &DAG) {
-
MVT VT = Op.getSimpleValueType();
SDLoc dl(Op);
SDValue R = Op.getOperand(0);
SDValue Amt = Op.getOperand(1);
SDValue V;
- if (!Subtarget->hasSSE2())
- return SDValue();
+ assert(VT.isVector() && "Custom lowering only for vector shifts!");
+ assert(Subtarget->hasSSE2() && "Only custom lower when we have SSE2!");
V = LowerScalarImmediateShift(Op, DAG, Subtarget);
if (V.getNode())
@@ -14254,7 +15969,7 @@ static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget *Subtarget,
break;
}
SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg,
- Op.getOperand(2), SDValue());
+ Op.getOperand(2), SDValue());
SDValue Ops[] = { cpIn.getValue(0),
Op.getOperand(1),
Op.getOperand(3),
@@ -14264,9 +15979,18 @@ static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget *Subtarget,
MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys,
Ops, T, MMO);
+
SDValue cpOut =
DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1));
- return cpOut;
+ SDValue EFLAGS = DAG.getCopyFromReg(cpOut.getValue(1), DL, X86::EFLAGS,
+ MVT::i32, cpOut.getValue(2));
+ SDValue Success = DAG.getNode(X86ISD::SETCC, DL, Op->getValueType(1),
+ DAG.getConstant(X86::COND_E, MVT::i8), EFLAGS);
+
+ DAG.ReplaceAllUsesOfValueWith(Op.getValue(0), cpOut);
+ DAG.ReplaceAllUsesOfValueWith(Op.getValue(1), Success);
+ DAG.ReplaceAllUsesOfValueWith(Op.getValue(2), EFLAGS.getValue(1));
+ return SDValue();
}
static SDValue LowerBITCAST(SDValue Op, const X86Subtarget *Subtarget,
@@ -14422,7 +16146,7 @@ static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget *Subtarget,
TargetLowering::CallLoweringInfo CLI(DAG);
CLI.setDebugLoc(dl).setChain(DAG.getEntryNode())
- .setCallee(CallingConv::C, RetTy, Callee, &Args, 0);
+ .setCallee(CallingConv::C, RetTy, Callee, std::move(Args), 0);
std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
@@ -14446,7 +16170,8 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
default: llvm_unreachable("Should not custom lower this!");
case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op,DAG);
case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, Subtarget, DAG);
- case ISD::ATOMIC_CMP_SWAP: return LowerCMP_SWAP(Op, Subtarget, DAG);
+ case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
+ return LowerCMP_SWAP(Op, Subtarget, DAG);
case ISD::ATOMIC_LOAD_SUB: return LowerLOAD_SUB(Op,DAG);
case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op,DAG);
case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG);
@@ -14528,8 +16253,8 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
}
static void ReplaceATOMIC_LOAD(SDNode *Node,
- SmallVectorImpl<SDValue> &Results,
- SelectionDAG &DAG) {
+ SmallVectorImpl<SDValue> &Results,
+ SelectionDAG &DAG) {
SDLoc dl(Node);
EVT VT = cast<AtomicSDNode>(Node)->getMemoryVT();
@@ -14538,38 +16263,16 @@ static void ReplaceATOMIC_LOAD(SDNode *Node,
// (The only way to get a 16-byte load is cmpxchg16b)
// FIXME: 16-byte ATOMIC_CMP_SWAP isn't actually hooked up at the moment.
SDValue Zero = DAG.getConstant(0, VT);
- SDValue Swap = DAG.getAtomic(ISD::ATOMIC_CMP_SWAP, dl, VT,
- Node->getOperand(0),
- Node->getOperand(1), Zero, Zero,
- cast<AtomicSDNode>(Node)->getMemOperand(),
- cast<AtomicSDNode>(Node)->getOrdering(),
- cast<AtomicSDNode>(Node)->getOrdering(),
- cast<AtomicSDNode>(Node)->getSynchScope());
+ SDVTList VTs = DAG.getVTList(VT, MVT::i1, MVT::Other);
+ SDValue Swap =
+ DAG.getAtomicCmpSwap(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, dl, VT, VTs,
+ Node->getOperand(0), Node->getOperand(1), Zero, Zero,
+ cast<AtomicSDNode>(Node)->getMemOperand(),
+ cast<AtomicSDNode>(Node)->getOrdering(),
+ cast<AtomicSDNode>(Node)->getOrdering(),
+ cast<AtomicSDNode>(Node)->getSynchScope());
Results.push_back(Swap.getValue(0));
- Results.push_back(Swap.getValue(1));
-}
-
-static void
-ReplaceATOMIC_BINARY_64(SDNode *Node, SmallVectorImpl<SDValue>&Results,
- SelectionDAG &DAG, unsigned NewOp) {
- SDLoc dl(Node);
- assert (Node->getValueType(0) == MVT::i64 &&
- "Only know how to expand i64 atomics");
-
- SDValue Chain = Node->getOperand(0);
- SDValue In1 = Node->getOperand(1);
- SDValue In2L = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
- Node->getOperand(2), DAG.getIntPtrConstant(0));
- SDValue In2H = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
- Node->getOperand(2), DAG.getIntPtrConstant(1));
- SDValue Ops[] = { Chain, In1, In2L, In2H };
- SDVTList Tys = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other);
- SDValue Result =
- DAG.getMemIntrinsicNode(NewOp, dl, Tys, Ops, MVT::i64,
- cast<MemSDNode>(Node)->getMemOperand());
- SDValue OpsF[] = { Result.getValue(0), Result.getValue(1)};
- Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF));
- Results.push_back(Result.getValue(2));
+ Results.push_back(Swap.getValue(2));
}
/// ReplaceNodeResults - Replace a node with an illegal result type
@@ -14656,13 +16359,15 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
case Intrinsic::x86_rdtscp:
return getReadTimeStampCounter(N, dl, X86ISD::RDTSCP_DAG, DAG, Subtarget,
Results);
+ case Intrinsic::x86_rdpmc:
+ return getReadPerformanceCounter(N, dl, DAG, Subtarget, Results);
}
}
case ISD::READCYCLECOUNTER: {
return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget,
Results);
}
- case ISD::ATOMIC_CMP_SWAP: {
+ case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: {
EVT T = N->getValueType(0);
assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair");
bool Regs64bit = T == MVT::i128;
@@ -14704,61 +16409,33 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
Regs64bit ? X86::RDX : X86::EDX,
HalfT, cpOutL.getValue(2));
SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
+
+ SDValue EFLAGS = DAG.getCopyFromReg(cpOutH.getValue(1), dl, X86::EFLAGS,
+ MVT::i32, cpOutH.getValue(2));
+ SDValue Success =
+ DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
+ DAG.getConstant(X86::COND_E, MVT::i8), EFLAGS);
+ Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1));
+
Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF));
- Results.push_back(cpOutH.getValue(1));
+ Results.push_back(Success);
+ Results.push_back(EFLAGS.getValue(1));
return;
}
+ case ISD::ATOMIC_SWAP:
case ISD::ATOMIC_LOAD_ADD:
+ case ISD::ATOMIC_LOAD_SUB:
case ISD::ATOMIC_LOAD_AND:
- case ISD::ATOMIC_LOAD_NAND:
case ISD::ATOMIC_LOAD_OR:
- case ISD::ATOMIC_LOAD_SUB:
case ISD::ATOMIC_LOAD_XOR:
- case ISD::ATOMIC_LOAD_MAX:
+ case ISD::ATOMIC_LOAD_NAND:
case ISD::ATOMIC_LOAD_MIN:
- case ISD::ATOMIC_LOAD_UMAX:
+ case ISD::ATOMIC_LOAD_MAX:
case ISD::ATOMIC_LOAD_UMIN:
- case ISD::ATOMIC_SWAP: {
- unsigned Opc;
- switch (N->getOpcode()) {
- default: llvm_unreachable("Unexpected opcode");
- case ISD::ATOMIC_LOAD_ADD:
- Opc = X86ISD::ATOMADD64_DAG;
- break;
- case ISD::ATOMIC_LOAD_AND:
- Opc = X86ISD::ATOMAND64_DAG;
- break;
- case ISD::ATOMIC_LOAD_NAND:
- Opc = X86ISD::ATOMNAND64_DAG;
- break;
- case ISD::ATOMIC_LOAD_OR:
- Opc = X86ISD::ATOMOR64_DAG;
- break;
- case ISD::ATOMIC_LOAD_SUB:
- Opc = X86ISD::ATOMSUB64_DAG;
- break;
- case ISD::ATOMIC_LOAD_XOR:
- Opc = X86ISD::ATOMXOR64_DAG;
- break;
- case ISD::ATOMIC_LOAD_MAX:
- Opc = X86ISD::ATOMMAX64_DAG;
- break;
- case ISD::ATOMIC_LOAD_MIN:
- Opc = X86ISD::ATOMMIN64_DAG;
- break;
- case ISD::ATOMIC_LOAD_UMAX:
- Opc = X86ISD::ATOMUMAX64_DAG;
- break;
- case ISD::ATOMIC_LOAD_UMIN:
- Opc = X86ISD::ATOMUMIN64_DAG;
- break;
- case ISD::ATOMIC_SWAP:
- Opc = X86ISD::ATOMSWAP64_DAG;
- break;
- }
- ReplaceATOMIC_BINARY_64(N, Results, DAG, Opc);
- return;
- }
+ case ISD::ATOMIC_LOAD_UMAX:
+ // Delegate to generic TypeLegalization. Situations we can really handle
+ // should have already been dealt with by X86AtomicExpand.cpp.
+ break;
case ISD::ATOMIC_LOAD: {
ReplaceATOMIC_LOAD(N, Results, DAG);
return;
@@ -14779,6 +16456,13 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
MVT::v2f64, N->getOperand(0));
SDValue ToVecInt = DAG.getNode(ISD::BITCAST, dl, WiderVT, Expanded);
+ if (ExperimentalVectorWideningLegalization) {
+ // If we are legalizing vectors by widening, we already have the desired
+ // legal vector type, just return it.
+ Results.push_back(ToVecInt);
+ return;
+ }
+
SmallVector<SDValue, 8> Elts;
for (unsigned i = 0, e = NumElts; i != e; ++i)
Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT,
@@ -14810,6 +16494,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::FST: return "X86ISD::FST";
case X86ISD::CALL: return "X86ISD::CALL";
case X86ISD::RDTSC_DAG: return "X86ISD::RDTSC_DAG";
+ case X86ISD::RDTSCP_DAG: return "X86ISD::RDTSCP_DAG";
+ case X86ISD::RDPMC_DAG: return "X86ISD::RDPMC_DAG";
case X86ISD::BT: return "X86ISD::BT";
case X86ISD::CMP: return "X86ISD::CMP";
case X86ISD::COMI: return "X86ISD::COMI";
@@ -14863,12 +16549,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::FNSTSW16r: return "X86ISD::FNSTSW16r";
case X86ISD::LCMPXCHG_DAG: return "X86ISD::LCMPXCHG_DAG";
case X86ISD::LCMPXCHG8_DAG: return "X86ISD::LCMPXCHG8_DAG";
- case X86ISD::ATOMADD64_DAG: return "X86ISD::ATOMADD64_DAG";
- case X86ISD::ATOMSUB64_DAG: return "X86ISD::ATOMSUB64_DAG";
- case X86ISD::ATOMOR64_DAG: return "X86ISD::ATOMOR64_DAG";
- case X86ISD::ATOMXOR64_DAG: return "X86ISD::ATOMXOR64_DAG";
- case X86ISD::ATOMAND64_DAG: return "X86ISD::ATOMAND64_DAG";
- case X86ISD::ATOMNAND64_DAG: return "X86ISD::ATOMNAND64_DAG";
+ case X86ISD::LCMPXCHG16_DAG: return "X86ISD::LCMPXCHG16_DAG";
case X86ISD::VZEXT_MOVL: return "X86ISD::VZEXT_MOVL";
case X86ISD::VZEXT_LOAD: return "X86ISD::VZEXT_LOAD";
case X86ISD::VZEXT: return "X86ISD::VZEXT";
@@ -14909,6 +16590,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::TESTM: return "X86ISD::TESTM";
case X86ISD::TESTNM: return "X86ISD::TESTNM";
case X86ISD::KORTEST: return "X86ISD::KORTEST";
+ case X86ISD::PACKSS: return "X86ISD::PACKSS";
+ case X86ISD::PACKUS: return "X86ISD::PACKUS";
case X86ISD::PALIGNR: return "X86ISD::PALIGNR";
case X86ISD::PSHUFD: return "X86ISD::PSHUFD";
case X86ISD::PSHUFHW: return "X86ISD::PSHUFHW";
@@ -15173,7 +16856,8 @@ X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
isUNPCKLMask(M, SVT, Subtarget->hasInt256()) ||
isUNPCKHMask(M, SVT, Subtarget->hasInt256()) ||
isUNPCKL_v_undef_Mask(M, SVT, Subtarget->hasInt256()) ||
- isUNPCKH_v_undef_Mask(M, SVT, Subtarget->hasInt256()));
+ isUNPCKH_v_undef_Mask(M, SVT, Subtarget->hasInt256()) ||
+ isBlendMask(M, SVT, Subtarget->hasSSE41(), Subtarget->hasInt256()));
}
bool
@@ -15256,685 +16940,6 @@ static MachineBasicBlock *EmitXBegin(MachineInstr *MI, MachineBasicBlock *MBB,
return sinkMBB;
}
-// Get CMPXCHG opcode for the specified data type.
-static unsigned getCmpXChgOpcode(EVT VT) {
- switch (VT.getSimpleVT().SimpleTy) {
- case MVT::i8: return X86::LCMPXCHG8;
- case MVT::i16: return X86::LCMPXCHG16;
- case MVT::i32: return X86::LCMPXCHG32;
- case MVT::i64: return X86::LCMPXCHG64;
- default:
- break;
- }
- llvm_unreachable("Invalid operand size!");
-}
-
-// Get LOAD opcode for the specified data type.
-static unsigned getLoadOpcode(EVT VT) {
- switch (VT.getSimpleVT().SimpleTy) {
- case MVT::i8: return X86::MOV8rm;
- case MVT::i16: return X86::MOV16rm;
- case MVT::i32: return X86::MOV32rm;
- case MVT::i64: return X86::MOV64rm;
- default:
- break;
- }
- llvm_unreachable("Invalid operand size!");
-}
-
-// Get opcode of the non-atomic one from the specified atomic instruction.
-static unsigned getNonAtomicOpcode(unsigned Opc) {
- switch (Opc) {
- case X86::ATOMAND8: return X86::AND8rr;
- case X86::ATOMAND16: return X86::AND16rr;
- case X86::ATOMAND32: return X86::AND32rr;
- case X86::ATOMAND64: return X86::AND64rr;
- case X86::ATOMOR8: return X86::OR8rr;
- case X86::ATOMOR16: return X86::OR16rr;
- case X86::ATOMOR32: return X86::OR32rr;
- case X86::ATOMOR64: return X86::OR64rr;
- case X86::ATOMXOR8: return X86::XOR8rr;
- case X86::ATOMXOR16: return X86::XOR16rr;
- case X86::ATOMXOR32: return X86::XOR32rr;
- case X86::ATOMXOR64: return X86::XOR64rr;
- }
- llvm_unreachable("Unhandled atomic-load-op opcode!");
-}
-
-// Get opcode of the non-atomic one from the specified atomic instruction with
-// extra opcode.
-static unsigned getNonAtomicOpcodeWithExtraOpc(unsigned Opc,
- unsigned &ExtraOpc) {
- switch (Opc) {
- case X86::ATOMNAND8: ExtraOpc = X86::NOT8r; return X86::AND8rr;
- case X86::ATOMNAND16: ExtraOpc = X86::NOT16r; return X86::AND16rr;
- case X86::ATOMNAND32: ExtraOpc = X86::NOT32r; return X86::AND32rr;
- case X86::ATOMNAND64: ExtraOpc = X86::NOT64r; return X86::AND64rr;
- case X86::ATOMMAX8: ExtraOpc = X86::CMP8rr; return X86::CMOVL32rr;
- case X86::ATOMMAX16: ExtraOpc = X86::CMP16rr; return X86::CMOVL16rr;
- case X86::ATOMMAX32: ExtraOpc = X86::CMP32rr; return X86::CMOVL32rr;
- case X86::ATOMMAX64: ExtraOpc = X86::CMP64rr; return X86::CMOVL64rr;
- case X86::ATOMMIN8: ExtraOpc = X86::CMP8rr; return X86::CMOVG32rr;
- case X86::ATOMMIN16: ExtraOpc = X86::CMP16rr; return X86::CMOVG16rr;
- case X86::ATOMMIN32: ExtraOpc = X86::CMP32rr; return X86::CMOVG32rr;
- case X86::ATOMMIN64: ExtraOpc = X86::CMP64rr; return X86::CMOVG64rr;
- case X86::ATOMUMAX8: ExtraOpc = X86::CMP8rr; return X86::CMOVB32rr;
- case X86::ATOMUMAX16: ExtraOpc = X86::CMP16rr; return X86::CMOVB16rr;
- case X86::ATOMUMAX32: ExtraOpc = X86::CMP32rr; return X86::CMOVB32rr;
- case X86::ATOMUMAX64: ExtraOpc = X86::CMP64rr; return X86::CMOVB64rr;
- case X86::ATOMUMIN8: ExtraOpc = X86::CMP8rr; return X86::CMOVA32rr;
- case X86::ATOMUMIN16: ExtraOpc = X86::CMP16rr; return X86::CMOVA16rr;
- case X86::ATOMUMIN32: ExtraOpc = X86::CMP32rr; return X86::CMOVA32rr;
- case X86::ATOMUMIN64: ExtraOpc = X86::CMP64rr; return X86::CMOVA64rr;
- }
- llvm_unreachable("Unhandled atomic-load-op opcode!");
-}
-
-// Get opcode of the non-atomic one from the specified atomic instruction for
-// 64-bit data type on 32-bit target.
-static unsigned getNonAtomic6432Opcode(unsigned Opc, unsigned &HiOpc) {
- switch (Opc) {
- case X86::ATOMAND6432: HiOpc = X86::AND32rr; return X86::AND32rr;
- case X86::ATOMOR6432: HiOpc = X86::OR32rr; return X86::OR32rr;
- case X86::ATOMXOR6432: HiOpc = X86::XOR32rr; return X86::XOR32rr;
- case X86::ATOMADD6432: HiOpc = X86::ADC32rr; return X86::ADD32rr;
- case X86::ATOMSUB6432: HiOpc = X86::SBB32rr; return X86::SUB32rr;
- case X86::ATOMSWAP6432: HiOpc = X86::MOV32rr; return X86::MOV32rr;
- case X86::ATOMMAX6432: HiOpc = X86::SETLr; return X86::SETLr;
- case X86::ATOMMIN6432: HiOpc = X86::SETGr; return X86::SETGr;
- case X86::ATOMUMAX6432: HiOpc = X86::SETBr; return X86::SETBr;
- case X86::ATOMUMIN6432: HiOpc = X86::SETAr; return X86::SETAr;
- }
- llvm_unreachable("Unhandled atomic-load-op opcode!");
-}
-
-// Get opcode of the non-atomic one from the specified atomic instruction for
-// 64-bit data type on 32-bit target with extra opcode.
-static unsigned getNonAtomic6432OpcodeWithExtraOpc(unsigned Opc,
- unsigned &HiOpc,
- unsigned &ExtraOpc) {
- switch (Opc) {
- case X86::ATOMNAND6432:
- ExtraOpc = X86::NOT32r;
- HiOpc = X86::AND32rr;
- return X86::AND32rr;
- }
- llvm_unreachable("Unhandled atomic-load-op opcode!");
-}
-
-// Get pseudo CMOV opcode from the specified data type.
-static unsigned getPseudoCMOVOpc(EVT VT) {
- switch (VT.getSimpleVT().SimpleTy) {
- case MVT::i8: return X86::CMOV_GR8;
- case MVT::i16: return X86::CMOV_GR16;
- case MVT::i32: return X86::CMOV_GR32;
- default:
- break;
- }
- llvm_unreachable("Unknown CMOV opcode!");
-}
-
-// EmitAtomicLoadArith - emit the code sequence for pseudo atomic instructions.
-// They will be translated into a spin-loop or compare-exchange loop from
-//
-// ...
-// dst = atomic-fetch-op MI.addr, MI.val
-// ...
-//
-// to
-//
-// ...
-// t1 = LOAD MI.addr
-// loop:
-// t4 = phi(t1, t3 / loop)
-// t2 = OP MI.val, t4
-// EAX = t4
-// LCMPXCHG [MI.addr], t2, [EAX is implicitly used & defined]
-// t3 = EAX
-// JNE loop
-// sink:
-// dst = t3
-// ...
-MachineBasicBlock *
-X86TargetLowering::EmitAtomicLoadArith(MachineInstr *MI,
- MachineBasicBlock *MBB) const {
- const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
- DebugLoc DL = MI->getDebugLoc();
-
- MachineFunction *MF = MBB->getParent();
- MachineRegisterInfo &MRI = MF->getRegInfo();
-
- const BasicBlock *BB = MBB->getBasicBlock();
- MachineFunction::iterator I = MBB;
- ++I;
-
- assert(MI->getNumOperands() <= X86::AddrNumOperands + 4 &&
- "Unexpected number of operands");
-
- assert(MI->hasOneMemOperand() &&
- "Expected atomic-load-op to have one memoperand");
-
- // Memory Reference
- MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
- MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
-
- unsigned DstReg, SrcReg;
- unsigned MemOpndSlot;
-
- unsigned CurOp = 0;
-
- DstReg = MI->getOperand(CurOp++).getReg();
- MemOpndSlot = CurOp;
- CurOp += X86::AddrNumOperands;
- SrcReg = MI->getOperand(CurOp++).getReg();
-
- const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
- MVT::SimpleValueType VT = *RC->vt_begin();
- unsigned t1 = MRI.createVirtualRegister(RC);
- unsigned t2 = MRI.createVirtualRegister(RC);
- unsigned t3 = MRI.createVirtualRegister(RC);
- unsigned t4 = MRI.createVirtualRegister(RC);
- unsigned PhyReg = getX86SubSuperRegister(X86::EAX, VT);
-
- unsigned LCMPXCHGOpc = getCmpXChgOpcode(VT);
- unsigned LOADOpc = getLoadOpcode(VT);
-
- // For the atomic load-arith operator, we generate
- //
- // thisMBB:
- // t1 = LOAD [MI.addr]
- // mainMBB:
- // t4 = phi(t1 / thisMBB, t3 / mainMBB)
- // t1 = OP MI.val, EAX
- // EAX = t4
- // LCMPXCHG [MI.addr], t1, [EAX is implicitly used & defined]
- // t3 = EAX
- // JNE mainMBB
- // sinkMBB:
- // dst = t3
-
- MachineBasicBlock *thisMBB = MBB;
- MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
- MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
- MF->insert(I, mainMBB);
- MF->insert(I, sinkMBB);
-
- MachineInstrBuilder MIB;
-
- // Transfer the remainder of BB and its successor edges to sinkMBB.
- sinkMBB->splice(sinkMBB->begin(), MBB,
- std::next(MachineBasicBlock::iterator(MI)), MBB->end());
- sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
-
- // thisMBB:
- MIB = BuildMI(thisMBB, DL, TII->get(LOADOpc), t1);
- for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
- MachineOperand NewMO = MI->getOperand(MemOpndSlot + i);
- if (NewMO.isReg())
- NewMO.setIsKill(false);
- MIB.addOperand(NewMO);
- }
- for (MachineInstr::mmo_iterator MMOI = MMOBegin; MMOI != MMOEnd; ++MMOI) {
- unsigned flags = (*MMOI)->getFlags();
- flags = (flags & ~MachineMemOperand::MOStore) | MachineMemOperand::MOLoad;
- MachineMemOperand *MMO =
- MF->getMachineMemOperand((*MMOI)->getPointerInfo(), flags,
- (*MMOI)->getSize(),
- (*MMOI)->getBaseAlignment(),
- (*MMOI)->getTBAAInfo(),
- (*MMOI)->getRanges());
- MIB.addMemOperand(MMO);
- }
-
- thisMBB->addSuccessor(mainMBB);
-
- // mainMBB:
- MachineBasicBlock *origMainMBB = mainMBB;
-
- // Add a PHI.
- MachineInstr *Phi = BuildMI(mainMBB, DL, TII->get(X86::PHI), t4)
- .addReg(t1).addMBB(thisMBB).addReg(t3).addMBB(mainMBB);
-
- unsigned Opc = MI->getOpcode();
- switch (Opc) {
- default:
- llvm_unreachable("Unhandled atomic-load-op opcode!");
- case X86::ATOMAND8:
- case X86::ATOMAND16:
- case X86::ATOMAND32:
- case X86::ATOMAND64:
- case X86::ATOMOR8:
- case X86::ATOMOR16:
- case X86::ATOMOR32:
- case X86::ATOMOR64:
- case X86::ATOMXOR8:
- case X86::ATOMXOR16:
- case X86::ATOMXOR32:
- case X86::ATOMXOR64: {
- unsigned ARITHOpc = getNonAtomicOpcode(Opc);
- BuildMI(mainMBB, DL, TII->get(ARITHOpc), t2).addReg(SrcReg)
- .addReg(t4);
- break;
- }
- case X86::ATOMNAND8:
- case X86::ATOMNAND16:
- case X86::ATOMNAND32:
- case X86::ATOMNAND64: {
- unsigned Tmp = MRI.createVirtualRegister(RC);
- unsigned NOTOpc;
- unsigned ANDOpc = getNonAtomicOpcodeWithExtraOpc(Opc, NOTOpc);
- BuildMI(mainMBB, DL, TII->get(ANDOpc), Tmp).addReg(SrcReg)
- .addReg(t4);
- BuildMI(mainMBB, DL, TII->get(NOTOpc), t2).addReg(Tmp);
- break;
- }
- case X86::ATOMMAX8:
- case X86::ATOMMAX16:
- case X86::ATOMMAX32:
- case X86::ATOMMAX64:
- case X86::ATOMMIN8:
- case X86::ATOMMIN16:
- case X86::ATOMMIN32:
- case X86::ATOMMIN64:
- case X86::ATOMUMAX8:
- case X86::ATOMUMAX16:
- case X86::ATOMUMAX32:
- case X86::ATOMUMAX64:
- case X86::ATOMUMIN8:
- case X86::ATOMUMIN16:
- case X86::ATOMUMIN32:
- case X86::ATOMUMIN64: {
- unsigned CMPOpc;
- unsigned CMOVOpc = getNonAtomicOpcodeWithExtraOpc(Opc, CMPOpc);
-
- BuildMI(mainMBB, DL, TII->get(CMPOpc))
- .addReg(SrcReg)
- .addReg(t4);
-
- if (Subtarget->hasCMov()) {
- if (VT != MVT::i8) {
- // Native support
- BuildMI(mainMBB, DL, TII->get(CMOVOpc), t2)
- .addReg(SrcReg)
- .addReg(t4);
- } else {
- // Promote i8 to i32 to use CMOV32
- const TargetRegisterInfo* TRI = getTargetMachine().getRegisterInfo();
- const TargetRegisterClass *RC32 =
- TRI->getSubClassWithSubReg(getRegClassFor(MVT::i32), X86::sub_8bit);
- unsigned SrcReg32 = MRI.createVirtualRegister(RC32);
- unsigned AccReg32 = MRI.createVirtualRegister(RC32);
- unsigned Tmp = MRI.createVirtualRegister(RC32);
-
- unsigned Undef = MRI.createVirtualRegister(RC32);
- BuildMI(mainMBB, DL, TII->get(TargetOpcode::IMPLICIT_DEF), Undef);
-
- BuildMI(mainMBB, DL, TII->get(TargetOpcode::INSERT_SUBREG), SrcReg32)
- .addReg(Undef)
- .addReg(SrcReg)
- .addImm(X86::sub_8bit);
- BuildMI(mainMBB, DL, TII->get(TargetOpcode::INSERT_SUBREG), AccReg32)
- .addReg(Undef)
- .addReg(t4)
- .addImm(X86::sub_8bit);
-
- BuildMI(mainMBB, DL, TII->get(CMOVOpc), Tmp)
- .addReg(SrcReg32)
- .addReg(AccReg32);
-
- BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), t2)
- .addReg(Tmp, 0, X86::sub_8bit);
- }
- } else {
- // Use pseudo select and lower them.
- assert((VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32) &&
- "Invalid atomic-load-op transformation!");
- unsigned SelOpc = getPseudoCMOVOpc(VT);
- X86::CondCode CC = X86::getCondFromCMovOpc(CMOVOpc);
- assert(CC != X86::COND_INVALID && "Invalid atomic-load-op transformation!");
- MIB = BuildMI(mainMBB, DL, TII->get(SelOpc), t2)
- .addReg(SrcReg).addReg(t4)
- .addImm(CC);
- mainMBB = EmitLoweredSelect(MIB, mainMBB);
- // Replace the original PHI node as mainMBB is changed after CMOV
- // lowering.
- BuildMI(*origMainMBB, Phi, DL, TII->get(X86::PHI), t4)
- .addReg(t1).addMBB(thisMBB).addReg(t3).addMBB(mainMBB);
- Phi->eraseFromParent();
- }
- break;
- }
- }
-
- // Copy PhyReg back from virtual register.
- BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), PhyReg)
- .addReg(t4);
-
- MIB = BuildMI(mainMBB, DL, TII->get(LCMPXCHGOpc));
- for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
- MachineOperand NewMO = MI->getOperand(MemOpndSlot + i);
- if (NewMO.isReg())
- NewMO.setIsKill(false);
- MIB.addOperand(NewMO);
- }
- MIB.addReg(t2);
- MIB.setMemRefs(MMOBegin, MMOEnd);
-
- // Copy PhyReg back to virtual register.
- BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), t3)
- .addReg(PhyReg);
-
- BuildMI(mainMBB, DL, TII->get(X86::JNE_4)).addMBB(origMainMBB);
-
- mainMBB->addSuccessor(origMainMBB);
- mainMBB->addSuccessor(sinkMBB);
-
- // sinkMBB:
- BuildMI(*sinkMBB, sinkMBB->begin(), DL,
- TII->get(TargetOpcode::COPY), DstReg)
- .addReg(t3);
-
- MI->eraseFromParent();
- return sinkMBB;
-}
-
-// EmitAtomicLoadArith6432 - emit the code sequence for pseudo atomic
-// instructions. They will be translated into a spin-loop or compare-exchange
-// loop from
-//
-// ...
-// dst = atomic-fetch-op MI.addr, MI.val
-// ...
-//
-// to
-//
-// ...
-// t1L = LOAD [MI.addr + 0]
-// t1H = LOAD [MI.addr + 4]
-// loop:
-// t4L = phi(t1L, t3L / loop)
-// t4H = phi(t1H, t3H / loop)
-// t2L = OP MI.val.lo, t4L
-// t2H = OP MI.val.hi, t4H
-// EAX = t4L
-// EDX = t4H
-// EBX = t2L
-// ECX = t2H
-// LCMPXCHG8B [MI.addr], [ECX:EBX & EDX:EAX are implicitly used and EDX:EAX is implicitly defined]
-// t3L = EAX
-// t3H = EDX
-// JNE loop
-// sink:
-// dstL = t3L
-// dstH = t3H
-// ...
-MachineBasicBlock *
-X86TargetLowering::EmitAtomicLoadArith6432(MachineInstr *MI,
- MachineBasicBlock *MBB) const {
- const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
- DebugLoc DL = MI->getDebugLoc();
-
- MachineFunction *MF = MBB->getParent();
- MachineRegisterInfo &MRI = MF->getRegInfo();
-
- const BasicBlock *BB = MBB->getBasicBlock();
- MachineFunction::iterator I = MBB;
- ++I;
-
- assert(MI->getNumOperands() <= X86::AddrNumOperands + 7 &&
- "Unexpected number of operands");
-
- assert(MI->hasOneMemOperand() &&
- "Expected atomic-load-op32 to have one memoperand");
-
- // Memory Reference
- MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
- MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
-
- unsigned DstLoReg, DstHiReg;
- unsigned SrcLoReg, SrcHiReg;
- unsigned MemOpndSlot;
-
- unsigned CurOp = 0;
-
- DstLoReg = MI->getOperand(CurOp++).getReg();
- DstHiReg = MI->getOperand(CurOp++).getReg();
- MemOpndSlot = CurOp;
- CurOp += X86::AddrNumOperands;
- SrcLoReg = MI->getOperand(CurOp++).getReg();
- SrcHiReg = MI->getOperand(CurOp++).getReg();
-
- const TargetRegisterClass *RC = &X86::GR32RegClass;
- const TargetRegisterClass *RC8 = &X86::GR8RegClass;
-
- unsigned t1L = MRI.createVirtualRegister(RC);
- unsigned t1H = MRI.createVirtualRegister(RC);
- unsigned t2L = MRI.createVirtualRegister(RC);
- unsigned t2H = MRI.createVirtualRegister(RC);
- unsigned t3L = MRI.createVirtualRegister(RC);
- unsigned t3H = MRI.createVirtualRegister(RC);
- unsigned t4L = MRI.createVirtualRegister(RC);
- unsigned t4H = MRI.createVirtualRegister(RC);
-
- unsigned LCMPXCHGOpc = X86::LCMPXCHG8B;
- unsigned LOADOpc = X86::MOV32rm;
-
- // For the atomic load-arith operator, we generate
- //
- // thisMBB:
- // t1L = LOAD [MI.addr + 0]
- // t1H = LOAD [MI.addr + 4]
- // mainMBB:
- // t4L = phi(t1L / thisMBB, t3L / mainMBB)
- // t4H = phi(t1H / thisMBB, t3H / mainMBB)
- // t2L = OP MI.val.lo, t4L
- // t2H = OP MI.val.hi, t4H
- // EBX = t2L
- // ECX = t2H
- // LCMPXCHG8B [MI.addr], [ECX:EBX & EDX:EAX are implicitly used and EDX:EAX is implicitly defined]
- // t3L = EAX
- // t3H = EDX
- // JNE loop
- // sinkMBB:
- // dstL = t3L
- // dstH = t3H
-
- MachineBasicBlock *thisMBB = MBB;
- MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
- MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
- MF->insert(I, mainMBB);
- MF->insert(I, sinkMBB);
-
- MachineInstrBuilder MIB;
-
- // Transfer the remainder of BB and its successor edges to sinkMBB.
- sinkMBB->splice(sinkMBB->begin(), MBB,
- std::next(MachineBasicBlock::iterator(MI)), MBB->end());
- sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
-
- // thisMBB:
- // Lo
- MIB = BuildMI(thisMBB, DL, TII->get(LOADOpc), t1L);
- for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
- MachineOperand NewMO = MI->getOperand(MemOpndSlot + i);
- if (NewMO.isReg())
- NewMO.setIsKill(false);
- MIB.addOperand(NewMO);
- }
- for (MachineInstr::mmo_iterator MMOI = MMOBegin; MMOI != MMOEnd; ++MMOI) {
- unsigned flags = (*MMOI)->getFlags();
- flags = (flags & ~MachineMemOperand::MOStore) | MachineMemOperand::MOLoad;
- MachineMemOperand *MMO =
- MF->getMachineMemOperand((*MMOI)->getPointerInfo(), flags,
- (*MMOI)->getSize(),
- (*MMOI)->getBaseAlignment(),
- (*MMOI)->getTBAAInfo(),
- (*MMOI)->getRanges());
- MIB.addMemOperand(MMO);
- };
- MachineInstr *LowMI = MIB;
-
- // Hi
- MIB = BuildMI(thisMBB, DL, TII->get(LOADOpc), t1H);
- for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
- if (i == X86::AddrDisp) {
- MIB.addDisp(MI->getOperand(MemOpndSlot + i), 4); // 4 == sizeof(i32)
- } else {
- MachineOperand NewMO = MI->getOperand(MemOpndSlot + i);
- if (NewMO.isReg())
- NewMO.setIsKill(false);
- MIB.addOperand(NewMO);
- }
- }
- MIB.setMemRefs(LowMI->memoperands_begin(), LowMI->memoperands_end());
-
- thisMBB->addSuccessor(mainMBB);
-
- // mainMBB:
- MachineBasicBlock *origMainMBB = mainMBB;
-
- // Add PHIs.
- MachineInstr *PhiL = BuildMI(mainMBB, DL, TII->get(X86::PHI), t4L)
- .addReg(t1L).addMBB(thisMBB).addReg(t3L).addMBB(mainMBB);
- MachineInstr *PhiH = BuildMI(mainMBB, DL, TII->get(X86::PHI), t4H)
- .addReg(t1H).addMBB(thisMBB).addReg(t3H).addMBB(mainMBB);
-
- unsigned Opc = MI->getOpcode();
- switch (Opc) {
- default:
- llvm_unreachable("Unhandled atomic-load-op6432 opcode!");
- case X86::ATOMAND6432:
- case X86::ATOMOR6432:
- case X86::ATOMXOR6432:
- case X86::ATOMADD6432:
- case X86::ATOMSUB6432: {
- unsigned HiOpc;
- unsigned LoOpc = getNonAtomic6432Opcode(Opc, HiOpc);
- BuildMI(mainMBB, DL, TII->get(LoOpc), t2L).addReg(t4L)
- .addReg(SrcLoReg);
- BuildMI(mainMBB, DL, TII->get(HiOpc), t2H).addReg(t4H)
- .addReg(SrcHiReg);
- break;
- }
- case X86::ATOMNAND6432: {
- unsigned HiOpc, NOTOpc;
- unsigned LoOpc = getNonAtomic6432OpcodeWithExtraOpc(Opc, HiOpc, NOTOpc);
- unsigned TmpL = MRI.createVirtualRegister(RC);
- unsigned TmpH = MRI.createVirtualRegister(RC);
- BuildMI(mainMBB, DL, TII->get(LoOpc), TmpL).addReg(SrcLoReg)
- .addReg(t4L);
- BuildMI(mainMBB, DL, TII->get(HiOpc), TmpH).addReg(SrcHiReg)
- .addReg(t4H);
- BuildMI(mainMBB, DL, TII->get(NOTOpc), t2L).addReg(TmpL);
- BuildMI(mainMBB, DL, TII->get(NOTOpc), t2H).addReg(TmpH);
- break;
- }
- case X86::ATOMMAX6432:
- case X86::ATOMMIN6432:
- case X86::ATOMUMAX6432:
- case X86::ATOMUMIN6432: {
- unsigned HiOpc;
- unsigned LoOpc = getNonAtomic6432Opcode(Opc, HiOpc);
- unsigned cL = MRI.createVirtualRegister(RC8);
- unsigned cH = MRI.createVirtualRegister(RC8);
- unsigned cL32 = MRI.createVirtualRegister(RC);
- unsigned cH32 = MRI.createVirtualRegister(RC);
- unsigned cc = MRI.createVirtualRegister(RC);
- // cl := cmp src_lo, lo
- BuildMI(mainMBB, DL, TII->get(X86::CMP32rr))
- .addReg(SrcLoReg).addReg(t4L);
- BuildMI(mainMBB, DL, TII->get(LoOpc), cL);
- BuildMI(mainMBB, DL, TII->get(X86::MOVZX32rr8), cL32).addReg(cL);
- // ch := cmp src_hi, hi
- BuildMI(mainMBB, DL, TII->get(X86::CMP32rr))
- .addReg(SrcHiReg).addReg(t4H);
- BuildMI(mainMBB, DL, TII->get(HiOpc), cH);
- BuildMI(mainMBB, DL, TII->get(X86::MOVZX32rr8), cH32).addReg(cH);
- // cc := if (src_hi == hi) ? cl : ch;
- if (Subtarget->hasCMov()) {
- BuildMI(mainMBB, DL, TII->get(X86::CMOVE32rr), cc)
- .addReg(cH32).addReg(cL32);
- } else {
- MIB = BuildMI(mainMBB, DL, TII->get(X86::CMOV_GR32), cc)
- .addReg(cH32).addReg(cL32)
- .addImm(X86::COND_E);
- mainMBB = EmitLoweredSelect(MIB, mainMBB);
- }
- BuildMI(mainMBB, DL, TII->get(X86::TEST32rr)).addReg(cc).addReg(cc);
- if (Subtarget->hasCMov()) {
- BuildMI(mainMBB, DL, TII->get(X86::CMOVNE32rr), t2L)
- .addReg(SrcLoReg).addReg(t4L);
- BuildMI(mainMBB, DL, TII->get(X86::CMOVNE32rr), t2H)
- .addReg(SrcHiReg).addReg(t4H);
- } else {
- MIB = BuildMI(mainMBB, DL, TII->get(X86::CMOV_GR32), t2L)
- .addReg(SrcLoReg).addReg(t4L)
- .addImm(X86::COND_NE);
- mainMBB = EmitLoweredSelect(MIB, mainMBB);
- // As the lowered CMOV won't clobber EFLAGS, we could reuse it for the
- // 2nd CMOV lowering.
- mainMBB->addLiveIn(X86::EFLAGS);
- MIB = BuildMI(mainMBB, DL, TII->get(X86::CMOV_GR32), t2H)
- .addReg(SrcHiReg).addReg(t4H)
- .addImm(X86::COND_NE);
- mainMBB = EmitLoweredSelect(MIB, mainMBB);
- // Replace the original PHI node as mainMBB is changed after CMOV
- // lowering.
- BuildMI(*origMainMBB, PhiL, DL, TII->get(X86::PHI), t4L)
- .addReg(t1L).addMBB(thisMBB).addReg(t3L).addMBB(mainMBB);
- BuildMI(*origMainMBB, PhiH, DL, TII->get(X86::PHI), t4H)
- .addReg(t1H).addMBB(thisMBB).addReg(t3H).addMBB(mainMBB);
- PhiL->eraseFromParent();
- PhiH->eraseFromParent();
- }
- break;
- }
- case X86::ATOMSWAP6432: {
- unsigned HiOpc;
- unsigned LoOpc = getNonAtomic6432Opcode(Opc, HiOpc);
- BuildMI(mainMBB, DL, TII->get(LoOpc), t2L).addReg(SrcLoReg);
- BuildMI(mainMBB, DL, TII->get(HiOpc), t2H).addReg(SrcHiReg);
- break;
- }
- }
-
- // Copy EDX:EAX back from HiReg:LoReg
- BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), X86::EAX).addReg(t4L);
- BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), X86::EDX).addReg(t4H);
- // Copy ECX:EBX from t1H:t1L
- BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), X86::EBX).addReg(t2L);
- BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), X86::ECX).addReg(t2H);
-
- MIB = BuildMI(mainMBB, DL, TII->get(LCMPXCHGOpc));
- for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
- MachineOperand NewMO = MI->getOperand(MemOpndSlot + i);
- if (NewMO.isReg())
- NewMO.setIsKill(false);
- MIB.addOperand(NewMO);
- }
- MIB.setMemRefs(MMOBegin, MMOEnd);
-
- // Copy EDX:EAX back to t3H:t3L
- BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), t3L).addReg(X86::EAX);
- BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), t3H).addReg(X86::EDX);
-
- BuildMI(mainMBB, DL, TII->get(X86::JNE_4)).addMBB(origMainMBB);
-
- mainMBB->addSuccessor(origMainMBB);
- mainMBB->addSuccessor(sinkMBB);
-
- // sinkMBB:
- BuildMI(*sinkMBB, sinkMBB->begin(), DL,
- TII->get(TargetOpcode::COPY), DstLoReg)
- .addReg(t3L);
- BuildMI(*sinkMBB, sinkMBB->begin(), DL,
- TII->get(TargetOpcode::COPY), DstHiReg)
- .addReg(t3H);
-
- MI->eraseFromParent();
- return sinkMBB;
-}
-
// FIXME: When we get size specific XMM0 registers, i.e. XMM0_V16I8
// or XMM0_V32I8 in AVX all of this code can be replaced with that
// in the .td file.
@@ -16068,7 +17073,7 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(
MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
// Machine Information
- const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+ const TargetInstrInfo *TII = MBB->getParent()->getTarget().getInstrInfo();
MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
const TargetRegisterClass *AddrRegClass = getRegClassFor(MVT::i64);
const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
@@ -16324,7 +17329,7 @@ X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
XMMSaveMBB->addSuccessor(EndMBB);
// Now add the instructions.
- const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+ const TargetInstrInfo *TII = MBB->getParent()->getTarget().getInstrInfo();
DebugLoc DL = MI->getDebugLoc();
unsigned CountReg = MI->getOperand(0).getReg();
@@ -16407,7 +17412,7 @@ static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
MachineBasicBlock *
X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,
MachineBasicBlock *BB) const {
- const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+ const TargetInstrInfo *TII = BB->getParent()->getTarget().getInstrInfo();
DebugLoc DL = MI->getDebugLoc();
// To "insert" a SELECT_CC instruction, we actually have to insert the
@@ -16433,7 +17438,7 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,
// If the EFLAGS register isn't dead in the terminator, then claim that it's
// live into the sink and copy blocks.
- const TargetRegisterInfo* TRI = getTargetMachine().getRegisterInfo();
+ const TargetRegisterInfo* TRI = BB->getParent()->getTarget().getRegisterInfo();
if (!MI->killsRegister(X86::EFLAGS) &&
!checkAndUpdateEFLAGSKill(MI, BB, TRI)) {
copy0MBB->addLiveIn(X86::EFLAGS);
@@ -16474,9 +17479,9 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,
MachineBasicBlock *
X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI, MachineBasicBlock *BB,
bool Is64Bit) const {
- const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
- DebugLoc DL = MI->getDebugLoc();
MachineFunction *MF = BB->getParent();
+ const TargetInstrInfo *TII = MF->getTarget().getInstrInfo();
+ DebugLoc DL = MI->getDebugLoc();
const BasicBlock *LLVM_BB = BB->getBasicBlock();
assert(MF->shouldSplitStack());
@@ -16546,7 +17551,7 @@ X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI, MachineBasicBlock *BB,
// Calls into a routine in libgcc to allocate more space from the heap.
const uint32_t *RegMask =
- getTargetMachine().getRegisterInfo()->getCallPreservedMask(CallingConv::C);
+ MF->getTarget().getRegisterInfo()->getCallPreservedMask(CallingConv::C);
if (Is64Bit) {
BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI)
.addReg(sizeVReg);
@@ -16594,8 +17599,8 @@ X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI, MachineBasicBlock *BB,
MachineBasicBlock *
X86TargetLowering::EmitLoweredWinAlloca(MachineInstr *MI,
- MachineBasicBlock *BB) const {
- const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+ MachineBasicBlock *BB) const {
+ const TargetInstrInfo *TII = BB->getParent()->getTarget().getInstrInfo();
DebugLoc DL = MI->getDebugLoc();
assert(!Subtarget->isTargetMacho());
@@ -16651,10 +17656,10 @@ X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI,
// our load from the relocation, sticking it in either RDI (x86-64)
// or EAX and doing an indirect call. The return value will then
// be in the normal return register.
+ MachineFunction *F = BB->getParent();
const X86InstrInfo *TII
- = static_cast<const X86InstrInfo*>(getTargetMachine().getInstrInfo());
+ = static_cast<const X86InstrInfo*>(F->getTarget().getInstrInfo());
DebugLoc DL = MI->getDebugLoc();
- MachineFunction *F = BB->getParent();
assert(Subtarget->isTargetDarwin() && "Darwin only instr emitted?");
assert(MI->getOperand(3).isGlobal() && "This should be a global");
@@ -16663,7 +17668,7 @@ X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI,
// FIXME: The 32-bit calls have non-standard calling conventions. Use a
// proper register mask.
const uint32_t *RegMask =
- getTargetMachine().getRegisterInfo()->getCallPreservedMask(CallingConv::C);
+ F->getTarget().getRegisterInfo()->getCallPreservedMask(CallingConv::C);
if (Subtarget->is64Bit()) {
MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
TII->get(X86::MOV64rm), X86::RDI)
@@ -16675,7 +17680,7 @@ X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI,
MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m));
addDirectMem(MIB, X86::RDI);
MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask);
- } else if (getTargetMachine().getRelocationModel() != Reloc::PIC_) {
+ } else if (F->getTarget().getRelocationModel() != Reloc::PIC_) {
MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
TII->get(X86::MOV32rm), X86::EAX)
.addReg(0)
@@ -16707,9 +17712,8 @@ MachineBasicBlock *
X86TargetLowering::emitEHSjLjSetJmp(MachineInstr *MI,
MachineBasicBlock *MBB) const {
DebugLoc DL = MI->getDebugLoc();
- const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
-
MachineFunction *MF = MBB->getParent();
+ const TargetInstrInfo *TII = MF->getTarget().getInstrInfo();
MachineRegisterInfo &MRI = MF->getRegInfo();
const BasicBlock *BB = MBB->getBasicBlock();
@@ -16771,8 +17775,8 @@ X86TargetLowering::emitEHSjLjSetJmp(MachineInstr *MI,
unsigned PtrStoreOpc = 0;
unsigned LabelReg = 0;
const int64_t LabelOffset = 1 * PVT.getStoreSize();
- Reloc::Model RM = getTargetMachine().getRelocationModel();
- bool UseImmLabel = (getTargetMachine().getCodeModel() == CodeModel::Small) &&
+ Reloc::Model RM = MF->getTarget().getRelocationModel();
+ bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
(RM == Reloc::Static || RM == Reloc::DynamicNoPIC);
// Prepare IP either in reg or imm.
@@ -16816,7 +17820,7 @@ X86TargetLowering::emitEHSjLjSetJmp(MachineInstr *MI,
.addMBB(restoreMBB);
const X86RegisterInfo *RegInfo =
- static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo());
+ static_cast<const X86RegisterInfo*>(MF->getTarget().getRegisterInfo());
MIB.addRegMask(RegInfo->getNoPreservedMask());
thisMBB->addSuccessor(mainMBB);
thisMBB->addSuccessor(restoreMBB);
@@ -16845,9 +17849,8 @@ MachineBasicBlock *
X86TargetLowering::emitEHSjLjLongJmp(MachineInstr *MI,
MachineBasicBlock *MBB) const {
DebugLoc DL = MI->getDebugLoc();
- const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
-
MachineFunction *MF = MBB->getParent();
+ const TargetInstrInfo *TII = MF->getTarget().getInstrInfo();
MachineRegisterInfo &MRI = MF->getRegInfo();
// Memory Reference
@@ -16863,7 +17866,7 @@ X86TargetLowering::emitEHSjLjLongJmp(MachineInstr *MI,
unsigned Tmp = MRI.createVirtualRegister(RC);
// Since FP is only updated here but NOT referenced, it's treated as GPR.
const X86RegisterInfo *RegInfo =
- static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo());
+ static_cast<const X86RegisterInfo*>(MF->getTarget().getRegisterInfo());
unsigned FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
unsigned SP = RegInfo->getStackRegister();
@@ -17038,12 +18041,12 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
case X86::FP80_TO_INT16_IN_MEM:
case X86::FP80_TO_INT32_IN_MEM:
case X86::FP80_TO_INT64_IN_MEM: {
- const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+ MachineFunction *F = BB->getParent();
+ const TargetInstrInfo *TII = F->getTarget().getInstrInfo();
DebugLoc DL = MI->getDebugLoc();
// Change the floating point control register to use "round towards zero"
// mode when truncating to an integer value.
- MachineFunction *F = BB->getParent();
int CWFrameIdx = F->getFrameInfo()->CreateStackObject(2, 2, false);
addFrameReference(BuildMI(*BB, MI, DL,
TII->get(X86::FNSTCW16m)), CWFrameIdx);
@@ -17123,7 +18126,7 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
case X86::VPCMPESTRM128MEM:
assert(Subtarget->hasSSE42() &&
"Target must have SSE4.2 or AVX features enabled");
- return EmitPCMPSTRM(MI, BB, getTargetMachine().getInstrInfo());
+ return EmitPCMPSTRM(MI, BB, BB->getParent()->getTarget().getInstrInfo());
// String/text processing lowering.
case X86::PCMPISTRIREG:
@@ -17136,71 +18139,15 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
case X86::VPCMPESTRIMEM:
assert(Subtarget->hasSSE42() &&
"Target must have SSE4.2 or AVX features enabled");
- return EmitPCMPSTRI(MI, BB, getTargetMachine().getInstrInfo());
+ return EmitPCMPSTRI(MI, BB, BB->getParent()->getTarget().getInstrInfo());
// Thread synchronization.
case X86::MONITOR:
- return EmitMonitor(MI, BB, getTargetMachine().getInstrInfo(), Subtarget);
+ return EmitMonitor(MI, BB, BB->getParent()->getTarget().getInstrInfo(), Subtarget);
// xbegin
case X86::XBEGIN:
- return EmitXBegin(MI, BB, getTargetMachine().getInstrInfo());
-
- // Atomic Lowering.
- case X86::ATOMAND8:
- case X86::ATOMAND16:
- case X86::ATOMAND32:
- case X86::ATOMAND64:
- // Fall through
- case X86::ATOMOR8:
- case X86::ATOMOR16:
- case X86::ATOMOR32:
- case X86::ATOMOR64:
- // Fall through
- case X86::ATOMXOR16:
- case X86::ATOMXOR8:
- case X86::ATOMXOR32:
- case X86::ATOMXOR64:
- // Fall through
- case X86::ATOMNAND8:
- case X86::ATOMNAND16:
- case X86::ATOMNAND32:
- case X86::ATOMNAND64:
- // Fall through
- case X86::ATOMMAX8:
- case X86::ATOMMAX16:
- case X86::ATOMMAX32:
- case X86::ATOMMAX64:
- // Fall through
- case X86::ATOMMIN8:
- case X86::ATOMMIN16:
- case X86::ATOMMIN32:
- case X86::ATOMMIN64:
- // Fall through
- case X86::ATOMUMAX8:
- case X86::ATOMUMAX16:
- case X86::ATOMUMAX32:
- case X86::ATOMUMAX64:
- // Fall through
- case X86::ATOMUMIN8:
- case X86::ATOMUMIN16:
- case X86::ATOMUMIN32:
- case X86::ATOMUMIN64:
- return EmitAtomicLoadArith(MI, BB);
-
- // This group does 64-bit operations on a 32-bit host.
- case X86::ATOMAND6432:
- case X86::ATOMOR6432:
- case X86::ATOMXOR6432:
- case X86::ATOMNAND6432:
- case X86::ATOMADD6432:
- case X86::ATOMSUB6432:
- case X86::ATOMMAX6432:
- case X86::ATOMMIN6432:
- case X86::ATOMUMAX6432:
- case X86::ATOMUMIN6432:
- case X86::ATOMSWAP6432:
- return EmitAtomicLoadArith6432(MI, BB);
+ return EmitXBegin(MI, BB, BB->getParent()->getTarget().getInstrInfo());
case X86::VASTART_SAVE_XMM_REGS:
return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB);
@@ -17473,13 +18420,385 @@ static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
+/// \brief Get the PSHUF-style mask from PSHUF node.
+///
+/// This is a very minor wrapper around getTargetShuffleMask to easy forming v4
+/// PSHUF-style masks that can be reused with such instructions.
+static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {
+ SmallVector<int, 4> Mask;
+ bool IsUnary;
+ bool HaveMask = getTargetShuffleMask(N.getNode(), N.getSimpleValueType(), Mask, IsUnary);
+ (void)HaveMask;
+ assert(HaveMask);
+
+ switch (N.getOpcode()) {
+ case X86ISD::PSHUFD:
+ return Mask;
+ case X86ISD::PSHUFLW:
+ Mask.resize(4);
+ return Mask;
+ case X86ISD::PSHUFHW:
+ Mask.erase(Mask.begin(), Mask.begin() + 4);
+ for (int &M : Mask)
+ M -= 4;
+ return Mask;
+ default:
+ llvm_unreachable("No valid shuffle instruction found!");
+ }
+}
+
+/// \brief Search for a combinable shuffle across a chain ending in pshufd.
+///
+/// We walk up the chain and look for a combinable shuffle, skipping over
+/// shuffles that we could hoist this shuffle's transformation past without
+/// altering anything.
+static bool combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
+ SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ assert(N.getOpcode() == X86ISD::PSHUFD &&
+ "Called with something other than an x86 128-bit half shuffle!");
+ SDLoc DL(N);
+
+ // Walk up a single-use chain looking for a combinable shuffle.
+ SDValue V = N.getOperand(0);
+ for (; V.hasOneUse(); V = V.getOperand(0)) {
+ switch (V.getOpcode()) {
+ default:
+ return false; // Nothing combined!
+
+ case ISD::BITCAST:
+ // Skip bitcasts as we always know the type for the target specific
+ // instructions.
+ continue;
+
+ case X86ISD::PSHUFD:
+ // Found another dword shuffle.
+ break;
+
+ case X86ISD::PSHUFLW:
+ // Check that the low words (being shuffled) are the identity in the
+ // dword shuffle, and the high words are self-contained.
+ if (Mask[0] != 0 || Mask[1] != 1 ||
+ !(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4))
+ return false;
+
+ continue;
+
+ case X86ISD::PSHUFHW:
+ // Check that the high words (being shuffled) are the identity in the
+ // dword shuffle, and the low words are self-contained.
+ if (Mask[2] != 2 || Mask[3] != 3 ||
+ !(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2))
+ return false;
+
+ continue;
+
+ case X86ISD::UNPCKL:
+ case X86ISD::UNPCKH:
+ // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword
+ // shuffle into a preceding word shuffle.
+ if (V.getValueType() != MVT::v16i8 && V.getValueType() != MVT::v8i16)
+ return false;
+
+ // Search for a half-shuffle which we can combine with.
+ unsigned CombineOp =
+ V.getOpcode() == X86ISD::UNPCKL ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
+ if (V.getOperand(0) != V.getOperand(1) ||
+ !V->isOnlyUserOf(V.getOperand(0).getNode()))
+ return false;
+ V = V.getOperand(0);
+ do {
+ switch (V.getOpcode()) {
+ default:
+ return false; // Nothing to combine.
+
+ case X86ISD::PSHUFLW:
+ case X86ISD::PSHUFHW:
+ if (V.getOpcode() == CombineOp)
+ break;
+
+ // Fallthrough!
+ case ISD::BITCAST:
+ V = V.getOperand(0);
+ continue;
+ }
+ break;
+ } while (V.hasOneUse());
+ break;
+ }
+ // Break out of the loop if we break out of the switch.
+ break;
+ }
+
+ if (!V.hasOneUse())
+ // We fell out of the loop without finding a viable combining instruction.
+ return false;
+
+ // Record the old value to use in RAUW-ing.
+ SDValue Old = V;
+
+ // Merge this node's mask and our incoming mask.
+ SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
+ for (int &M : Mask)
+ M = VMask[M];
+ V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0),
+ getV4X86ShuffleImm8ForMask(Mask, DAG));
+
+ // It is possible that one of the combinable shuffles was completely absorbed
+ // by the other, just replace it and revisit all users in that case.
+ if (Old.getNode() == V.getNode()) {
+ DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo=*/true);
+ return true;
+ }
+
+ // Replace N with its operand as we're going to combine that shuffle away.
+ DAG.ReplaceAllUsesWith(N, N.getOperand(0));
+
+ // Replace the combinable shuffle with the combined one, updating all users
+ // so that we re-evaluate the chain here.
+ DCI.CombineTo(Old.getNode(), V, /*AddTo*/ true);
+ return true;
+}
+
+/// \brief Search for a combinable shuffle across a chain ending in pshuflw or pshufhw.
+///
+/// We walk up the chain, skipping shuffles of the other half and looking
+/// through shuffles which switch halves trying to find a shuffle of the same
+/// pair of dwords.
+static bool combineRedundantHalfShuffle(SDValue N, MutableArrayRef<int> Mask,
+ SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ assert(
+ (N.getOpcode() == X86ISD::PSHUFLW || N.getOpcode() == X86ISD::PSHUFHW) &&
+ "Called with something other than an x86 128-bit half shuffle!");
+ SDLoc DL(N);
+ unsigned CombineOpcode = N.getOpcode();
+
+ // Walk up a single-use chain looking for a combinable shuffle.
+ SDValue V = N.getOperand(0);
+ for (; V.hasOneUse(); V = V.getOperand(0)) {
+ switch (V.getOpcode()) {
+ default:
+ return false; // Nothing combined!
+
+ case ISD::BITCAST:
+ // Skip bitcasts as we always know the type for the target specific
+ // instructions.
+ continue;
+
+ case X86ISD::PSHUFLW:
+ case X86ISD::PSHUFHW:
+ if (V.getOpcode() == CombineOpcode)
+ break;
+
+ // Other-half shuffles are no-ops.
+ continue;
+
+ case X86ISD::PSHUFD: {
+ // We can only handle pshufd if the half we are combining either stays in
+ // its half, or switches to the other half. Bail if one of these isn't
+ // true.
+ SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
+ int DOffset = CombineOpcode == X86ISD::PSHUFLW ? 0 : 2;
+ if (!((VMask[DOffset + 0] < 2 && VMask[DOffset + 1] < 2) ||
+ (VMask[DOffset + 0] >= 2 && VMask[DOffset + 1] >= 2)))
+ return false;
+
+ // Map the mask through the pshufd and keep walking up the chain.
+ for (int i = 0; i < 4; ++i)
+ Mask[i] = 2 * (VMask[DOffset + Mask[i] / 2] % 2) + Mask[i] % 2;
+
+ // Switch halves if the pshufd does.
+ CombineOpcode =
+ VMask[DOffset + Mask[0] / 2] < 2 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
+ continue;
+ }
+ }
+ // Break out of the loop if we break out of the switch.
+ break;
+ }
+
+ if (!V.hasOneUse())
+ // We fell out of the loop without finding a viable combining instruction.
+ return false;
+
+ // Record the old value to use in RAUW-ing.
+ SDValue Old = V;
+
+ // Merge this node's mask and our incoming mask (adjusted to account for all
+ // the pshufd instructions encountered).
+ SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
+ for (int &M : Mask)
+ M = VMask[M];
+ V = DAG.getNode(V.getOpcode(), DL, MVT::v8i16, V.getOperand(0),
+ getV4X86ShuffleImm8ForMask(Mask, DAG));
+
+ // Replace N with its operand as we're going to combine that shuffle away.
+ DAG.ReplaceAllUsesWith(N, N.getOperand(0));
+
+ // Replace the combinable shuffle with the combined one, updating all users
+ // so that we re-evaluate the chain here.
+ DCI.CombineTo(Old.getNode(), V, /*AddTo*/ true);
+ return true;
+}
+
+/// \brief Try to combine x86 target specific shuffles.
+static SDValue PerformTargetShuffleCombine(SDValue N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget *Subtarget) {
+ SDLoc DL(N);
+ MVT VT = N.getSimpleValueType();
+ SmallVector<int, 4> Mask;
+
+ switch (N.getOpcode()) {
+ case X86ISD::PSHUFD:
+ case X86ISD::PSHUFLW:
+ case X86ISD::PSHUFHW:
+ Mask = getPSHUFShuffleMask(N);
+ assert(Mask.size() == 4);
+ break;
+ default:
+ return SDValue();
+ }
+
+ // Nuke no-op shuffles that show up after combining.
+ if (isNoopShuffleMask(Mask))
+ return DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true);
+
+ // Look for simplifications involving one or two shuffle instructions.
+ SDValue V = N.getOperand(0);
+ switch (N.getOpcode()) {
+ default:
+ break;
+ case X86ISD::PSHUFLW:
+ case X86ISD::PSHUFHW:
+ assert(VT == MVT::v8i16);
+ (void)VT;
+
+ if (combineRedundantHalfShuffle(N, Mask, DAG, DCI))
+ return SDValue(); // We combined away this shuffle, so we're done.
+
+ // See if this reduces to a PSHUFD which is no more expensive and can
+ // combine with more operations.
+ if (Mask[0] % 2 == 0 && Mask[2] % 2 == 0 &&
+ areAdjacentMasksSequential(Mask)) {
+ int DMask[] = {-1, -1, -1, -1};
+ int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2;
+ DMask[DOffset + 0] = DOffset + Mask[0] / 2;
+ DMask[DOffset + 1] = DOffset + Mask[2] / 2;
+ V = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V);
+ DCI.AddToWorklist(V.getNode());
+ V = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V,
+ getV4X86ShuffleImm8ForMask(DMask, DAG));
+ DCI.AddToWorklist(V.getNode());
+ return DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V);
+ }
+
+ // Look for shuffle patterns which can be implemented as a single unpack.
+ // FIXME: This doesn't handle the location of the PSHUFD generically, and
+ // only works when we have a PSHUFD followed by two half-shuffles.
+ if (Mask[0] == Mask[1] && Mask[2] == Mask[3] &&
+ (V.getOpcode() == X86ISD::PSHUFLW ||
+ V.getOpcode() == X86ISD::PSHUFHW) &&
+ V.getOpcode() != N.getOpcode() &&
+ V.hasOneUse()) {
+ SDValue D = V.getOperand(0);
+ while (D.getOpcode() == ISD::BITCAST && D.hasOneUse())
+ D = D.getOperand(0);
+ if (D.getOpcode() == X86ISD::PSHUFD && D.hasOneUse()) {
+ SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
+ SmallVector<int, 4> DMask = getPSHUFShuffleMask(D);
+ int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
+ int VOffset = V.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
+ int WordMask[8];
+ for (int i = 0; i < 4; ++i) {
+ WordMask[i + NOffset] = Mask[i] + NOffset;
+ WordMask[i + VOffset] = VMask[i] + VOffset;
+ }
+ // Map the word mask through the DWord mask.
+ int MappedMask[8];
+ for (int i = 0; i < 8; ++i)
+ MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2;
+ const int UnpackLoMask[] = {0, 0, 1, 1, 2, 2, 3, 3};
+ const int UnpackHiMask[] = {4, 4, 5, 5, 6, 6, 7, 7};
+ if (std::equal(std::begin(MappedMask), std::end(MappedMask),
+ std::begin(UnpackLoMask)) ||
+ std::equal(std::begin(MappedMask), std::end(MappedMask),
+ std::begin(UnpackHiMask))) {
+ // We can replace all three shuffles with an unpack.
+ V = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, D.getOperand(0));
+ DCI.AddToWorklist(V.getNode());
+ return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL
+ : X86ISD::UNPCKH,
+ DL, MVT::v8i16, V, V);
+ }
+ }
+ }
+
+ break;
+
+ case X86ISD::PSHUFD:
+ if (combineRedundantDWordShuffle(N, Mask, DAG, DCI))
+ return SDValue(); // We combined away this shuffle.
+
+ break;
+ }
+
+ return SDValue();
+}
+
/// PerformShuffleCombine - Performs several different shuffle combines.
static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget *Subtarget) {
SDLoc dl(N);
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
EVT VT = N->getValueType(0);
+ // Canonicalize shuffles that perform 'addsub' on packed float vectors
+ // according to the rule:
+ // (shuffle (FADD A, B), (FSUB A, B), Mask) ->
+ // (shuffle (FSUB A, -B), (FADD A, -B), Mask)
+ //
+ // Where 'Mask' is:
+ // <0,5,2,7> -- for v4f32 and v4f64 shuffles;
+ // <0,3> -- for v2f64 shuffles;
+ // <0,9,2,11,4,13,6,15> -- for v8f32 shuffles.
+ //
+ // This helps pattern-matching more SSE3/AVX ADDSUB instructions
+ // during ISel stage.
+ if (N->getOpcode() == ISD::VECTOR_SHUFFLE &&
+ ((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
+ (Subtarget->hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
+ N0->getOpcode() == ISD::FADD && N1->getOpcode() == ISD::FSUB &&
+ // Operands to the FADD and FSUB must be the same.
+ ((N0->getOperand(0) == N1->getOperand(0) &&
+ N0->getOperand(1) == N1->getOperand(1)) ||
+ // FADD is commutable. See if by commuting the operands of the FADD
+ // we would still be able to match the operands of the FSUB dag node.
+ (N0->getOperand(1) == N1->getOperand(0) &&
+ N0->getOperand(0) == N1->getOperand(1))) &&
+ N0->getOperand(0)->getOpcode() != ISD::UNDEF &&
+ N0->getOperand(1)->getOpcode() != ISD::UNDEF) {
+
+ ShuffleVectorSDNode *SV = cast<ShuffleVectorSDNode>(N);
+ unsigned NumElts = VT.getVectorNumElements();
+ ArrayRef<int> Mask = SV->getMask();
+ bool CanFold = true;
+
+ for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i)
+ CanFold = Mask[i] == (int)((i & 1) ? i + NumElts : i);
+
+ if (CanFold) {
+ SDValue Op0 = N1->getOperand(0);
+ SDValue Op1 = DAG.getNode(ISD::FNEG, dl, VT, N1->getOperand(1));
+ SDValue Sub = DAG.getNode(ISD::FSUB, dl, VT, Op0, Op1);
+ SDValue Add = DAG.getNode(ISD::FADD, dl, VT, Op0, Op1);
+ return DAG.getVectorShuffle(VT, dl, Sub, Add, Mask);
+ }
+ }
+
// Don't create instructions with illegal types after legalize types has run.
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (!DCI.isBeforeLegalize() && !TLI.isTypeLegal(VT.getVectorElementType()))
@@ -17490,6 +18809,57 @@ static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
N->getOpcode() == ISD::VECTOR_SHUFFLE)
return PerformShuffleCombine256(N, DAG, DCI, Subtarget);
+ // During Type Legalization, when promoting illegal vector types,
+ // the backend might introduce new shuffle dag nodes and bitcasts.
+ //
+ // This code performs the following transformation:
+ // fold: (shuffle (bitcast (BINOP A, B)), Undef, <Mask>) ->
+ // (shuffle (BINOP (bitcast A), (bitcast B)), Undef, <Mask>)
+ //
+ // We do this only if both the bitcast and the BINOP dag nodes have
+ // one use. Also, perform this transformation only if the new binary
+ // operation is legal. This is to avoid introducing dag nodes that
+ // potentially need to be further expanded (or custom lowered) into a
+ // less optimal sequence of dag nodes.
+ if (!DCI.isBeforeLegalize() && DCI.isBeforeLegalizeOps() &&
+ N1.getOpcode() == ISD::UNDEF && N0.hasOneUse() &&
+ N0.getOpcode() == ISD::BITCAST) {
+ SDValue BC0 = N0.getOperand(0);
+ EVT SVT = BC0.getValueType();
+ unsigned Opcode = BC0.getOpcode();
+ unsigned NumElts = VT.getVectorNumElements();
+
+ if (BC0.hasOneUse() && SVT.isVector() &&
+ SVT.getVectorNumElements() * 2 == NumElts &&
+ TLI.isOperationLegal(Opcode, VT)) {
+ bool CanFold = false;
+ switch (Opcode) {
+ default : break;
+ case ISD::ADD :
+ case ISD::FADD :
+ case ISD::SUB :
+ case ISD::FSUB :
+ case ISD::MUL :
+ case ISD::FMUL :
+ CanFold = true;
+ }
+
+ unsigned SVTNumElts = SVT.getVectorNumElements();
+ ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
+ for (unsigned i = 0, e = SVTNumElts; i != e && CanFold; ++i)
+ CanFold = SVOp->getMaskElt(i) == (int)(i * 2);
+ for (unsigned i = SVTNumElts, e = NumElts; i != e && CanFold; ++i)
+ CanFold = SVOp->getMaskElt(i) < 0;
+
+ if (CanFold) {
+ SDValue BC00 = DAG.getNode(ISD::BITCAST, dl, VT, BC0.getOperand(0));
+ SDValue BC01 = DAG.getNode(ISD::BITCAST, dl, VT, BC0.getOperand(1));
+ SDValue NewBinOp = DAG.getNode(BC0.getOpcode(), dl, VT, BC00, BC01);
+ return DAG.getVectorShuffle(VT, dl, NewBinOp, N1, &SVOp->getMask()[0]);
+ }
+ }
+ }
+
// Only handle 128 wide vector from here on.
if (!VT.is128BitVector())
return SDValue();
@@ -17501,7 +18871,18 @@ static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i)
Elts.push_back(getShuffleScalarElt(N, i, DAG, 0));
- return EltsFromConsecutiveLoads(VT, Elts, dl, DAG, true);
+ SDValue LD = EltsFromConsecutiveLoads(VT, Elts, dl, DAG, true);
+ if (LD.getNode())
+ return LD;
+
+ if (isTargetShuffle(N->getOpcode())) {
+ SDValue Shuffle =
+ PerformTargetShuffleCombine(SDValue(N, 0), DAG, DCI, Subtarget);
+ if (Shuffle.getNode())
+ return Shuffle;
+ }
+
+ return SDValue();
}
/// PerformTruncateCombine - Converts truncate operation to
@@ -18155,28 +19536,34 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
Other->getOpcode() == ISD::SUB && DAG.isEqualTo(OpRHS, CondRHS))
return DAG.getNode(X86ISD::SUBUS, DL, VT, OpLHS, OpRHS);
- // If the RHS is a constant we have to reverse the const canonicalization.
- // x > C-1 ? x+-C : 0 --> subus x, C
- if (CC == ISD::SETUGT && Other->getOpcode() == ISD::ADD &&
- isSplatVector(CondRHS.getNode()) && isSplatVector(OpRHS.getNode())) {
- APInt A = cast<ConstantSDNode>(OpRHS.getOperand(0))->getAPIntValue();
- if (CondRHS.getConstantOperandVal(0) == -A-1)
- return DAG.getNode(X86ISD::SUBUS, DL, VT, OpLHS,
- DAG.getConstant(-A, VT));
- }
-
- // Another special case: If C was a sign bit, the sub has been
- // canonicalized into a xor.
- // FIXME: Would it be better to use computeKnownBits to determine whether
- // it's safe to decanonicalize the xor?
- // x s< 0 ? x^C : 0 --> subus x, C
- if (CC == ISD::SETLT && Other->getOpcode() == ISD::XOR &&
- ISD::isBuildVectorAllZeros(CondRHS.getNode()) &&
- isSplatVector(OpRHS.getNode())) {
- APInt A = cast<ConstantSDNode>(OpRHS.getOperand(0))->getAPIntValue();
- if (A.isSignBit())
- return DAG.getNode(X86ISD::SUBUS, DL, VT, OpLHS, OpRHS);
- }
+ if (auto *OpRHSBV = dyn_cast<BuildVectorSDNode>(OpRHS))
+ if (auto *OpRHSConst = OpRHSBV->getConstantSplatNode()) {
+ if (auto *CondRHSBV = dyn_cast<BuildVectorSDNode>(CondRHS))
+ if (auto *CondRHSConst = CondRHSBV->getConstantSplatNode())
+ // If the RHS is a constant we have to reverse the const
+ // canonicalization.
+ // x > C-1 ? x+-C : 0 --> subus x, C
+ if (CC == ISD::SETUGT && Other->getOpcode() == ISD::ADD &&
+ CondRHSConst->getAPIntValue() ==
+ (-OpRHSConst->getAPIntValue() - 1))
+ return DAG.getNode(
+ X86ISD::SUBUS, DL, VT, OpLHS,
+ DAG.getConstant(-OpRHSConst->getAPIntValue(), VT));
+
+ // Another special case: If C was a sign bit, the sub has been
+ // canonicalized into a xor.
+ // FIXME: Would it be better to use computeKnownBits to determine
+ // whether it's safe to decanonicalize the xor?
+ // x s< 0 ? x^C : 0 --> subus x, C
+ if (CC == ISD::SETLT && Other->getOpcode() == ISD::XOR &&
+ ISD::isBuildVectorAllZeros(CondRHS.getNode()) &&
+ OpRHSConst->getAPIntValue().isSignBit())
+ // Note that we have to rebuild the RHS constant here to ensure we
+ // don't rely on particular values of undef lanes.
+ return DAG.getNode(
+ X86ISD::SUBUS, DL, VT, OpLHS,
+ DAG.getConstant(OpRHSConst->getAPIntValue(), VT));
+ }
}
}
@@ -18743,6 +20130,8 @@ static SDValue PerformINTRINSIC_WO_CHAINCombine(SDNode *N, SelectionDAG &DAG,
if (C->isAllOnesValue())
return Op1;
}
+
+ return SDValue();
}
// Packed SSE2/AVX2 arithmetic shift immediate intrinsics.
@@ -18882,16 +20271,15 @@ static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) {
// vector operations in many cases. Also, on sandybridge ADD is faster than
// shl.
// (shl V, 1) -> add V,V
- if (isSplatVector(N1.getNode())) {
- assert(N0.getValueType().isVector() && "Invalid vector shift type");
- ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1->getOperand(0));
- // We shift all of the values by one. In many cases we do not have
- // hardware support for this operation. This is better expressed as an ADD
- // of two values.
- if (N1C && (1 == N1C->getZExtValue())) {
- return DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N0);
+ if (auto *N1BV = dyn_cast<BuildVectorSDNode>(N1))
+ if (auto *N1SplatC = N1BV->getConstantSplatNode()) {
+ assert(N0.getValueType().isVector() && "Invalid vector shift type");
+ // We shift all of the values by one. In many cases we do not have
+ // hardware support for this operation. This is better expressed as an ADD
+ // of two values.
+ if (N1SplatC->getZExtValue() == 1)
+ return DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N0);
}
- }
return SDValue();
}
@@ -18910,10 +20298,9 @@ static SDValue performShiftToAllZeros(SDNode *N, SelectionDAG &DAG,
SDValue Amt = N->getOperand(1);
SDLoc DL(N);
- if (isSplatVector(Amt.getNode())) {
- SDValue SclrAmt = Amt->getOperand(0);
- if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(SclrAmt)) {
- APInt ShiftAmt = C->getAPIntValue();
+ if (auto *AmtBV = dyn_cast<BuildVectorSDNode>(Amt))
+ if (auto *AmtSplat = AmtBV->getConstantSplatNode()) {
+ APInt ShiftAmt = AmtSplat->getAPIntValue();
unsigned MaxAmount = VT.getVectorElementType().getSizeInBits();
// SSE2/AVX2 logical shifts always return a vector of 0s
@@ -18923,7 +20310,6 @@ static SDValue performShiftToAllZeros(SDNode *N, SelectionDAG &DAG,
if (ShiftAmt.trunc(8).uge(MaxAmount))
return getZeroVector(VT, Subtarget, DAG, DL);
}
- }
return SDValue();
}
@@ -19117,9 +20503,10 @@ static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG,
// The right side has to be a 'trunc' or a constant vector.
bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE;
- bool RHSConst = (isSplatVector(N1.getNode()) &&
- isa<ConstantSDNode>(N1->getOperand(0)));
- if (!RHSTrunc && !RHSConst)
+ ConstantSDNode *RHSConstSplat = nullptr;
+ if (auto *RHSBV = dyn_cast<BuildVectorSDNode>(N1))
+ RHSConstSplat = RHSBV->getConstantSplatNode();
+ if (!RHSTrunc && !RHSConstSplat)
return SDValue();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
@@ -19129,9 +20516,9 @@ static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG,
// Set N0 and N1 to hold the inputs to the new wide operation.
N0 = N0->getOperand(0);
- if (RHSConst) {
+ if (RHSConstSplat) {
N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT.getScalarType(),
- N1->getOperand(0));
+ SDValue(RHSConstSplat, 0));
SmallVector<SDValue, 8> C(WideVT.getVectorNumElements(), N1);
N1 = DAG.getNode(ISD::BUILD_VECTOR, DL, WideVT, C);
} else if (RHSTrunc) {
@@ -19277,12 +20664,9 @@ static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG,
unsigned EltBits = MaskVT.getVectorElementType().getSizeInBits();
unsigned SraAmt = ~0;
if (Mask.getOpcode() == ISD::SRA) {
- SDValue Amt = Mask.getOperand(1);
- if (isSplatVector(Amt.getNode())) {
- SDValue SclrAmt = Amt->getOperand(0);
- if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(SclrAmt))
- SraAmt = C->getZExtValue();
- }
+ if (auto *AmtBV = dyn_cast<BuildVectorSDNode>(Mask.getOperand(1)))
+ if (auto *AmtConst = AmtBV->getConstantSplatNode())
+ SraAmt = AmtConst->getZExtValue();
} else if (Mask.getOpcode() == X86ISD::VSRAI) {
SDValue SraC = Mask.getOperand(1);
SraAmt = cast<ConstantSDNode>(SraC)->getZExtValue();
@@ -20642,6 +22026,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
return PerformINTRINSIC_WO_CHAINCombine(N, DAG, Subtarget);
case X86ISD::INSERTPS:
return PerformINSERTPSCombine(N, DAG, Subtarget);
+ case ISD::BUILD_VECTOR: return PerformBUILD_VECTORCombine(N, DAG, Subtarget);
}
return SDValue();
@@ -21146,8 +22531,8 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
const GlobalValue *GV = GA->getGlobal();
// If we require an extra load to get this address, as in PIC mode, we
// can't accept it.
- if (isGlobalStubReference(Subtarget->ClassifyGlobalReference(GV,
- getTargetMachine())))
+ if (isGlobalStubReference(
+ Subtarget->ClassifyGlobalReference(GV, DAG.getTarget())))
return;
Result = DAG.getTargetGlobalAddress(GV, SDLoc(Op),
@@ -21425,3 +22810,7 @@ int X86TargetLowering::getScalingFactorCost(const AddrMode &AM,
return AM.Scale != 0;
return -1;
}
+
+bool X86TargetLowering::isTargetFTOL() const {
+ return Subtarget->isTargetKnownWindowsMSVC() && !Subtarget->is64Bit();
+}
diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h
index 9f51b53..c8cdce7 100644
--- a/lib/Target/X86/X86ISelLowering.h
+++ b/lib/Target/X86/X86ISelLowering.h
@@ -15,13 +15,13 @@
#ifndef X86ISELLOWERING_H
#define X86ISELLOWERING_H
-#include "X86Subtarget.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/Target/TargetLowering.h"
#include "llvm/Target/TargetOptions.h"
namespace llvm {
+ class X86Subtarget;
class X86TargetMachine;
namespace X86ISD {
@@ -86,6 +86,9 @@ namespace llvm {
/// X86 Read Time-Stamp Counter and Processor ID.
RDTSCP_DAG,
+ /// X86 Read Performance Monitoring Counters.
+ RDPMC_DAG,
+
/// X86 compare and logical compare instructions.
CMP, COMI, UCOMI,
@@ -315,6 +318,8 @@ namespace llvm {
KORTEST,
// Several flavors of instructions with vector shuffle behaviors.
+ PACKSS,
+ PACKUS,
PALIGNR,
PSHUFD,
PSHUFHW,
@@ -400,23 +405,8 @@ namespace llvm {
// XTEST - Test if in transactional execution.
XTEST,
- // ATOMADD64_DAG, ATOMSUB64_DAG, ATOMOR64_DAG, ATOMAND64_DAG,
- // ATOMXOR64_DAG, ATOMNAND64_DAG, ATOMSWAP64_DAG -
- // Atomic 64-bit binary operations.
- ATOMADD64_DAG = ISD::FIRST_TARGET_MEMORY_OPCODE,
- ATOMSUB64_DAG,
- ATOMOR64_DAG,
- ATOMXOR64_DAG,
- ATOMAND64_DAG,
- ATOMNAND64_DAG,
- ATOMMAX64_DAG,
- ATOMMIN64_DAG,
- ATOMUMAX64_DAG,
- ATOMUMIN64_DAG,
- ATOMSWAP64_DAG,
-
// LCMPXCHG_DAG, LCMPXCHG8_DAG, LCMPXCHG16_DAG - Compare and swap.
- LCMPXCHG_DAG,
+ LCMPXCHG_DAG = ISD::FIRST_TARGET_MEMORY_OPCODE,
LCMPXCHG8_DAG,
LCMPXCHG16_DAG,
@@ -766,9 +756,7 @@ namespace llvm {
/// isTargetFTOL - Return true if the target uses the MSVC _ftol2 routine
/// for fptoui.
- bool isTargetFTOL() const {
- return Subtarget->isTargetKnownWindowsMSVC() && !Subtarget->is64Bit();
- }
+ bool isTargetFTOL() const;
/// isIntegerTypeFTOL - Return true if the MSVC _ftol2 routine should be
/// used for fptoui to the given type.
@@ -808,6 +796,9 @@ namespace llvm {
/// \brief Reset the operation actions based on target options.
void resetOperationActions() override;
+ /// \brief Customize the preferred legalization strategy for certain types.
+ LegalizeTypeAction getPreferredVectorAction(EVT VT) const override;
+
protected:
std::pair<const TargetRegisterClass*, uint8_t>
findRepresentativeClass(MVT VT) const override;
diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td
index 37bcc52..41e900e 100644
--- a/lib/Target/X86/X86InstrAVX512.td
+++ b/lib/Target/X86/X86InstrAVX512.td
@@ -476,6 +476,28 @@ defm VPBROADCASTQZ : avx512_int_broadcast_rm<0x59, "vpbroadcastq", i64mem,
loadi64, VR512, v8i64, v2i64, VK8WM>, EVEX_V512, VEX_W,
EVEX_CD8<64, CD8VT1>;
+multiclass avx512_int_subvec_broadcast_rm<bits<8> opc, string OpcodeStr,
+ X86MemOperand x86memop, PatFrag ld_frag,
+ RegisterClass KRC> {
+ let mayLoad = 1 in {
+ def rm : AVX5128I<opc, MRMSrcMem, (outs VR512:$dst), (ins x86memop:$src),
+ !strconcat(OpcodeStr, " \t{$src, $dst|$dst, $src}"),
+ []>, EVEX;
+ def krm : AVX5128I<opc, MRMSrcMem, (outs VR512:$dst), (ins KRC:$mask,
+ x86memop:$src),
+ !strconcat(OpcodeStr,
+ " \t{$src, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src}"),
+ []>, EVEX, EVEX_KZ;
+ }
+}
+
+defm VBROADCASTI32X4 : avx512_int_subvec_broadcast_rm<0x5a, "vbroadcasti32x4",
+ i128mem, loadv2i64, VK16WM>,
+ EVEX_V512, EVEX_CD8<32, CD8VT4>;
+defm VBROADCASTI64X4 : avx512_int_subvec_broadcast_rm<0x5b, "vbroadcasti64x4",
+ i256mem, loadv4i64, VK16WM>, VEX_W,
+ EVEX_V512, EVEX_CD8<64, CD8VT4>;
+
def : Pat<(v16i32 (int_x86_avx512_pbroadcastd_512 (v4i32 VR128X:$src))),
(VPBROADCASTDZrr VR128X:$src)>;
def : Pat<(v8i64 (int_x86_avx512_pbroadcastq_512 (v2i64 VR128X:$src))),
@@ -517,10 +539,12 @@ def rr : AVX512XS8I<opc, MRMDestReg, (outs DstRC:$dst), (ins KRC:$src),
[]>, EVEX;
}
+let Predicates = [HasCDI] in {
defm VPBROADCASTMW2D : avx512_mask_broadcast<0x3A, "vpbroadcastmw2d", VR512,
VK16, v16i32, v16i1>, EVEX_V512;
defm VPBROADCASTMB2Q : avx512_mask_broadcast<0x2A, "vpbroadcastmb2q", VR512,
VK8, v8i64, v8i1>, EVEX_V512, VEX_W;
+}
//===----------------------------------------------------------------------===//
// AVX-512 - VPERM
@@ -585,7 +609,7 @@ defm VPERMPDZ : avx512_perm<0x16, "vpermpd", VR512, memopv8f64, f512mem,
// -- VPERM2I - 3 source operands form --
multiclass avx512_perm_3src<bits<8> opc, string OpcodeStr, RegisterClass RC,
PatFrag mem_frag, X86MemOperand x86memop,
- SDNode OpNode, ValueType OpVT> {
+ SDNode OpNode, ValueType OpVT, RegisterClass KRC> {
let Constraints = "$src1 = $dst" in {
def rr : AVX5128I<opc, MRMSrcReg, (outs RC:$dst),
(ins RC:$src1, RC:$src2, RC:$src3),
@@ -595,48 +619,107 @@ let Constraints = "$src1 = $dst" in {
(OpVT (OpNode RC:$src1, RC:$src2, RC:$src3)))]>,
EVEX_4V;
+ def rrk : AVX5128I<opc, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, KRC:$mask, RC:$src2, RC:$src3),
+ !strconcat(OpcodeStr,
+ " \t{$src3, $src2, $dst {${mask}}|"
+ "$dst {${mask}}, $src2, $src3}"),
+ [(set RC:$dst, (OpVT (vselect KRC:$mask,
+ (OpNode RC:$src1, RC:$src2,
+ RC:$src3),
+ RC:$src1)))]>,
+ EVEX_4V, EVEX_K;
+
+ let AddedComplexity = 30 in // Prefer over VMOV*rrkz Pat<>
+ def rrkz : AVX5128I<opc, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, KRC:$mask, RC:$src2, RC:$src3),
+ !strconcat(OpcodeStr,
+ " \t{$src3, $src2, $dst {${mask}} {z} |",
+ "$dst {${mask}} {z}, $src2, $src3}"),
+ [(set RC:$dst, (OpVT (vselect KRC:$mask,
+ (OpNode RC:$src1, RC:$src2,
+ RC:$src3),
+ (OpVT (bitconvert
+ (v16i32 immAllZerosV))))))]>,
+ EVEX_4V, EVEX_KZ;
+
def rm : AVX5128I<opc, MRMSrcMem, (outs RC:$dst),
(ins RC:$src1, RC:$src2, x86memop:$src3),
!strconcat(OpcodeStr,
" \t{$src3, $src2, $dst|$dst, $src2, $src3}"),
[(set RC:$dst,
- (OpVT (OpNode RC:$src1, RC:$src2,
+ (OpVT (OpNode RC:$src1, RC:$src2,
(mem_frag addr:$src3))))]>, EVEX_4V;
+
+ def rmk : AVX5128I<opc, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, KRC:$mask, RC:$src2, x86memop:$src3),
+ !strconcat(OpcodeStr,
+ " \t{$src3, $src2, $dst {${mask}}|"
+ "$dst {${mask}}, $src2, $src3}"),
+ [(set RC:$dst,
+ (OpVT (vselect KRC:$mask,
+ (OpNode RC:$src1, RC:$src2,
+ (mem_frag addr:$src3)),
+ RC:$src1)))]>,
+ EVEX_4V, EVEX_K;
+
+ let AddedComplexity = 10 in // Prefer over the rrkz variant
+ def rmkz : AVX5128I<opc, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, KRC:$mask, RC:$src2, x86memop:$src3),
+ !strconcat(OpcodeStr,
+ " \t{$src3, $src2, $dst {${mask}} {z}|"
+ "$dst {${mask}} {z}, $src2, $src3}"),
+ [(set RC:$dst,
+ (OpVT (vselect KRC:$mask,
+ (OpNode RC:$src1, RC:$src2,
+ (mem_frag addr:$src3)),
+ (OpVT (bitconvert
+ (v16i32 immAllZerosV))))))]>,
+ EVEX_4V, EVEX_KZ;
}
}
-defm VPERMI2D : avx512_perm_3src<0x76, "vpermi2d", VR512, memopv16i32, i512mem,
- X86VPermiv3, v16i32>, EVEX_V512, EVEX_CD8<32, CD8VF>;
-defm VPERMI2Q : avx512_perm_3src<0x76, "vpermi2q", VR512, memopv8i64, i512mem,
- X86VPermiv3, v8i64>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
-defm VPERMI2PS : avx512_perm_3src<0x77, "vpermi2ps", VR512, memopv16f32, i512mem,
- X86VPermiv3, v16f32>, EVEX_V512, EVEX_CD8<32, CD8VF>;
-defm VPERMI2PD : avx512_perm_3src<0x77, "vpermi2pd", VR512, memopv8f64, i512mem,
- X86VPermiv3, v8f64>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
-
-defm VPERMT2D : avx512_perm_3src<0x7E, "vpermt2d", VR512, memopv16i32, i512mem,
- X86VPermv3, v16i32>, EVEX_V512, EVEX_CD8<32, CD8VF>;
-defm VPERMT2Q : avx512_perm_3src<0x7E, "vpermt2q", VR512, memopv8i64, i512mem,
- X86VPermv3, v8i64>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
-defm VPERMT2PS : avx512_perm_3src<0x7F, "vpermt2ps", VR512, memopv16f32, i512mem,
- X86VPermv3, v16f32>, EVEX_V512, EVEX_CD8<32, CD8VF>;
-defm VPERMT2PD : avx512_perm_3src<0x7F, "vpermt2pd", VR512, memopv8f64, i512mem,
- X86VPermv3, v8f64>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
-
-def : Pat<(v16f32 (int_x86_avx512_mask_vpermt_ps_512 (v16i32 VR512:$idx),
- (v16f32 VR512:$src1), (v16f32 VR512:$src2), (i16 -1))),
- (VPERMT2PSrr VR512:$src1, VR512:$idx, VR512:$src2)>;
-
-def : Pat<(v16i32 (int_x86_avx512_mask_vpermt_d_512 (v16i32 VR512:$idx),
- (v16i32 VR512:$src1), (v16i32 VR512:$src2), (i16 -1))),
- (VPERMT2Drr VR512:$src1, VR512:$idx, VR512:$src2)>;
-
-def : Pat<(v8f64 (int_x86_avx512_mask_vpermt_pd_512 (v8i64 VR512:$idx),
- (v8f64 VR512:$src1), (v8f64 VR512:$src2), (i8 -1))),
- (VPERMT2PDrr VR512:$src1, VR512:$idx, VR512:$src2)>;
-
-def : Pat<(v8i64 (int_x86_avx512_mask_vpermt_q_512 (v8i64 VR512:$idx),
- (v8i64 VR512:$src1), (v8i64 VR512:$src2), (i8 -1))),
- (VPERMT2Qrr VR512:$src1, VR512:$idx, VR512:$src2)>;
+defm VPERMI2D : avx512_perm_3src<0x76, "vpermi2d", VR512, memopv16i32,
+ i512mem, X86VPermiv3, v16i32, VK16WM>,
+ EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VPERMI2Q : avx512_perm_3src<0x76, "vpermi2q", VR512, memopv8i64,
+ i512mem, X86VPermiv3, v8i64, VK8WM>,
+ EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+defm VPERMI2PS : avx512_perm_3src<0x77, "vpermi2ps", VR512, memopv16f32,
+ i512mem, X86VPermiv3, v16f32, VK16WM>,
+ EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VPERMI2PD : avx512_perm_3src<0x77, "vpermi2pd", VR512, memopv8f64,
+ i512mem, X86VPermiv3, v8f64, VK8WM>,
+ EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+
+multiclass avx512_perm_table_3src<bits<8> opc, string Suffix, RegisterClass RC,
+ PatFrag mem_frag, X86MemOperand x86memop,
+ SDNode OpNode, ValueType OpVT, RegisterClass KRC,
+ ValueType MaskVT, RegisterClass MRC> :
+ avx512_perm_3src<opc, "vpermt2"##Suffix, RC, mem_frag, x86memop, OpNode,
+ OpVT, KRC> {
+ def : Pat<(OpVT (!cast<Intrinsic>("int_x86_avx512_mask_vpermt_"##Suffix##"_512")
+ VR512:$idx, VR512:$src1, VR512:$src2, -1)),
+ (!cast<Instruction>(NAME#rr) VR512:$src1, VR512:$idx, VR512:$src2)>;
+
+ def : Pat<(OpVT (!cast<Intrinsic>("int_x86_avx512_mask_vpermt_"##Suffix##"_512")
+ VR512:$idx, VR512:$src1, VR512:$src2, MRC:$mask)),
+ (!cast<Instruction>(NAME#rrk) VR512:$src1,
+ (MaskVT (COPY_TO_REGCLASS MRC:$mask, KRC)), VR512:$idx, VR512:$src2)>;
+}
+
+defm VPERMT2D : avx512_perm_table_3src<0x7E, "d", VR512, memopv16i32, i512mem,
+ X86VPermv3, v16i32, VK16WM, v16i1, GR16>,
+ EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VPERMT2Q : avx512_perm_table_3src<0x7E, "q", VR512, memopv8i64, i512mem,
+ X86VPermv3, v8i64, VK8WM, v8i1, GR8>,
+ EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+defm VPERMT2PS : avx512_perm_table_3src<0x7F, "ps", VR512, memopv16f32, i512mem,
+ X86VPermv3, v16f32, VK16WM, v16i1, GR16>,
+ EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VPERMT2PD : avx512_perm_table_3src<0x7F, "pd", VR512, memopv8f64, i512mem,
+ X86VPermv3, v8f64, VK8WM, v8i1, GR8>,
+ EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+
//===----------------------------------------------------------------------===//
// AVX-512 - BLEND using mask
//
@@ -790,52 +873,61 @@ def : Pat<(v8i1 (X86pcmpeqm (v8i32 VR256X:$src1), (v8i32 VR256X:$src2))),
(v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src1, sub_ymm)),
(v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src2, sub_ymm))), VK8)>;
-multiclass avx512_icmp_cc<bits<8> opc, RegisterClass KRC,
+multiclass avx512_icmp_cc<bits<8> opc, RegisterClass WMRC, RegisterClass KRC,
RegisterClass RC, X86MemOperand x86memop, PatFrag memop_frag,
- SDNode OpNode, ValueType vt, Operand CC, string asm,
- string asm_alt> {
+ SDNode OpNode, ValueType vt, Operand CC, string Suffix> {
def rri : AVX512AIi8<opc, MRMSrcReg,
- (outs KRC:$dst), (ins RC:$src1, RC:$src2, CC:$cc), asm,
+ (outs KRC:$dst), (ins RC:$src1, RC:$src2, CC:$cc),
+ !strconcat("vpcmp${cc}", Suffix,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set KRC:$dst, (OpNode (vt RC:$src1), (vt RC:$src2), imm:$cc))],
IIC_SSE_ALU_F32P_RR>, EVEX_4V;
def rmi : AVX512AIi8<opc, MRMSrcMem,
- (outs KRC:$dst), (ins RC:$src1, x86memop:$src2, CC:$cc), asm,
+ (outs KRC:$dst), (ins RC:$src1, x86memop:$src2, CC:$cc),
+ !strconcat("vpcmp${cc}", Suffix,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set KRC:$dst, (OpNode (vt RC:$src1), (memop_frag addr:$src2),
imm:$cc))], IIC_SSE_ALU_F32P_RM>, EVEX_4V;
// Accept explicit immediate argument form instead of comparison code.
let isAsmParserOnly = 1, hasSideEffects = 0 in {
def rri_alt : AVX512AIi8<opc, MRMSrcReg,
(outs KRC:$dst), (ins RC:$src1, RC:$src2, i8imm:$cc),
- asm_alt, [], IIC_SSE_ALU_F32P_RR>, EVEX_4V;
+ !strconcat("vpcmp", Suffix,
+ "\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}"),
+ [], IIC_SSE_ALU_F32P_RR>, EVEX_4V;
+ def rrik_alt : AVX512AIi8<opc, MRMSrcReg,
+ (outs KRC:$dst), (ins WMRC:$mask, RC:$src1, RC:$src2, i8imm:$cc),
+ !strconcat("vpcmp", Suffix,
+ "\t{$cc, $src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2, $cc}"),
+ [], IIC_SSE_ALU_F32P_RR>, EVEX_4V, EVEX_K;
def rmi_alt : AVX512AIi8<opc, MRMSrcMem,
(outs KRC:$dst), (ins RC:$src1, x86memop:$src2, i8imm:$cc),
- asm_alt, [], IIC_SSE_ALU_F32P_RM>, EVEX_4V;
+ !strconcat("vpcmp", Suffix,
+ "\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}"),
+ [], IIC_SSE_ALU_F32P_RM>, EVEX_4V;
+ def rmik_alt : AVX512AIi8<opc, MRMSrcMem,
+ (outs KRC:$dst), (ins WMRC:$mask, RC:$src1, x86memop:$src2, i8imm:$cc),
+ !strconcat("vpcmp", Suffix,
+ "\t{$cc, $src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2, $cc}"),
+ [], IIC_SSE_ALU_F32P_RM>, EVEX_4V, EVEX_K;
}
}
-defm VPCMPDZ : avx512_icmp_cc<0x1F, VK16, VR512, i512mem, memopv16i32,
- X86cmpm, v16i32, AVXCC,
- "vpcmp${cc}d\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- "vpcmpd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}">,
- EVEX_V512, EVEX_CD8<32, CD8VF>;
-defm VPCMPUDZ : avx512_icmp_cc<0x1E, VK16, VR512, i512mem, memopv16i32,
- X86cmpmu, v16i32, AVXCC,
- "vpcmp${cc}ud\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- "vpcmpud\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}">,
- EVEX_V512, EVEX_CD8<32, CD8VF>;
-
-defm VPCMPQZ : avx512_icmp_cc<0x1F, VK8, VR512, i512mem, memopv8i64,
- X86cmpm, v8i64, AVXCC,
- "vpcmp${cc}q\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- "vpcmpq\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}">,
- VEX_W, EVEX_V512, EVEX_CD8<64, CD8VF>;
-defm VPCMPUQZ : avx512_icmp_cc<0x1E, VK8, VR512, i512mem, memopv8i64,
- X86cmpmu, v8i64, AVXCC,
- "vpcmp${cc}uq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- "vpcmpuq\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}">,
- VEX_W, EVEX_V512, EVEX_CD8<64, CD8VF>;
-
-// avx512_cmp_packed - sse 1 & 2 compare packed instructions
+defm VPCMPDZ : avx512_icmp_cc<0x1F, VK16WM, VK16, VR512, i512mem, memopv16i32,
+ X86cmpm, v16i32, AVXCC, "d">,
+ EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VPCMPUDZ : avx512_icmp_cc<0x1E, VK16WM, VK16, VR512, i512mem, memopv16i32,
+ X86cmpmu, v16i32, AVXCC, "ud">,
+ EVEX_V512, EVEX_CD8<32, CD8VF>;
+
+defm VPCMPQZ : avx512_icmp_cc<0x1F, VK8WM, VK8, VR512, i512mem, memopv8i64,
+ X86cmpm, v8i64, AVXCC, "q">,
+ VEX_W, EVEX_V512, EVEX_CD8<64, CD8VF>;
+defm VPCMPUQZ : avx512_icmp_cc<0x1E, VK8WM, VK8, VR512, i512mem, memopv8i64,
+ X86cmpmu, v8i64, AVXCC, "uq">,
+ VEX_W, EVEX_V512, EVEX_CD8<64, CD8VF>;
+
+// avx512_cmp_packed - compare packed instructions
multiclass avx512_cmp_packed<RegisterClass KRC, RegisterClass RC,
X86MemOperand x86memop, ValueType vt,
string suffix, Domain d> {
@@ -859,11 +951,11 @@ multiclass avx512_cmp_packed<RegisterClass KRC, RegisterClass RC,
// Accept explicit immediate argument form instead of comparison code.
let isAsmParserOnly = 1, hasSideEffects = 0 in {
def rri_alt : AVX512PIi8<0xC2, MRMSrcReg,
- (outs RC:$dst), (ins RC:$src1, RC:$src2, i8imm:$cc),
+ (outs KRC:$dst), (ins RC:$src1, RC:$src2, i8imm:$cc),
!strconcat("vcmp", suffix,
" \t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}"), [], d>;
def rmi_alt : AVX512PIi8<0xC2, MRMSrcMem,
- (outs RC:$dst), (ins RC:$src1, x86memop:$src2, i8imm:$cc),
+ (outs KRC:$dst), (ins RC:$src1, x86memop:$src2, i8imm:$cc),
!strconcat("vcmp", suffix,
" \t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}"), [], d>;
}
@@ -1788,6 +1880,46 @@ def : Pat<(v8i64 (X86Vinsert undef, GR64:$src2, (iPTR 0))),
(SUBREG_TO_REG (i32 0), (VMOV64toPQIZrr GR64:$src2), sub_xmm)>;
//===----------------------------------------------------------------------===//
+// AVX-512 - Non-temporals
+//===----------------------------------------------------------------------===//
+
+def VMOVNTDQAZrm : AVX5128I<0x2A, MRMSrcMem, (outs VR512:$dst),
+ (ins i512mem:$src),
+ "vmovntdqa\t{$src, $dst|$dst, $src}",
+ [(set VR512:$dst,
+ (int_x86_avx512_movntdqa addr:$src))]>,
+ EVEX, EVEX_V512, EVEX_CD8<64, CD8VF>;
+
+// Prefer non-temporal over temporal versions
+let AddedComplexity = 400, SchedRW = [WriteStore] in {
+
+def VMOVNTPSZmr : AVX512PSI<0x2B, MRMDestMem, (outs),
+ (ins f512mem:$dst, VR512:$src),
+ "vmovntps\t{$src, $dst|$dst, $src}",
+ [(alignednontemporalstore (v16f32 VR512:$src),
+ addr:$dst)],
+ IIC_SSE_MOVNT>,
+ EVEX, EVEX_V512, EVEX_CD8<32, CD8VF>;
+
+def VMOVNTPDZmr : AVX512PDI<0x2B, MRMDestMem, (outs),
+ (ins f512mem:$dst, VR512:$src),
+ "vmovntpd\t{$src, $dst|$dst, $src}",
+ [(alignednontemporalstore (v8f64 VR512:$src),
+ addr:$dst)],
+ IIC_SSE_MOVNT>,
+ EVEX, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+
+
+def VMOVNTDQZmr : AVX512BI<0xE7, MRMDestMem, (outs),
+ (ins i512mem:$dst, VR512:$src),
+ "vmovntdq\t{$src, $dst|$dst, $src}",
+ [(alignednontemporalstore (v8i64 VR512:$src),
+ addr:$dst)],
+ IIC_SSE_MOVNT>,
+ EVEX, EVEX_V512, EVEX_CD8<64, CD8VF>;
+}
+
+//===----------------------------------------------------------------------===//
// AVX-512 - Integer arithmetic
//
multiclass avx512_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
@@ -3161,6 +3293,10 @@ def : Pat<(v4f32 (uint_to_fp (v4i32 VR128X:$src1))),
(EXTRACT_SUBREG (v16f32 (VCVTUDQ2PSZrr
(v16i32 (SUBREG_TO_REG (i32 0), VR128X:$src1, sub_xmm)))), sub_xmm)>;
+def : Pat<(v4f64 (uint_to_fp (v4i32 VR128X:$src1))),
+ (EXTRACT_SUBREG (v8f64 (VCVTUDQ2PDZrr
+ (v8i32 (SUBREG_TO_REG (i32 0), VR128X:$src1, sub_xmm)))), sub_ymm)>;
+
def : Pat<(v16f32 (int_x86_avx512_mask_cvtdq2ps_512 (v16i32 VR512:$src),
(bc_v16f32 (v16i32 immAllZerosV)), (i16 -1), imm:$rc)),
(VCVTDQ2PSZrrb VR512:$src, imm:$rc)>;
@@ -4343,6 +4479,37 @@ def : Pat<(int_x86_avx512_mask_conflict_q_512 VR512:$src2, VR512:$src1,
(VPCONFLICTQrrk VR512:$src1,
(v8i1 (COPY_TO_REGCLASS GR8:$mask, VK8WM)), VR512:$src2)>;
+let Predicates = [HasCDI] in {
+defm VPLZCNTD : avx512_conflict<0x44, "vplzcntd", VR512, VK16WM,
+ i512mem, i32mem, "{1to16}">,
+ EVEX_V512, EVEX_CD8<32, CD8VF>;
+
+
+defm VPLZCNTQ : avx512_conflict<0x44, "vplzcntq", VR512, VK8WM,
+ i512mem, i64mem, "{1to8}">,
+ EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+
+}
+
+def : Pat<(int_x86_avx512_mask_lzcnt_d_512 VR512:$src2, VR512:$src1,
+ GR16:$mask),
+ (VPLZCNTDrrk VR512:$src1,
+ (v16i1 (COPY_TO_REGCLASS GR16:$mask, VK16WM)), VR512:$src2)>;
+
+def : Pat<(int_x86_avx512_mask_lzcnt_q_512 VR512:$src2, VR512:$src1,
+ GR8:$mask),
+ (VPLZCNTQrrk VR512:$src1,
+ (v8i1 (COPY_TO_REGCLASS GR8:$mask, VK8WM)), VR512:$src2)>;
+
+def : Pat<(v16i32 (ctlz (memopv16i32 addr:$src))),
+ (VPLZCNTDrm addr:$src)>;
+def : Pat<(v16i32 (ctlz (v16i32 VR512:$src))),
+ (VPLZCNTDrr VR512:$src)>;
+def : Pat<(v8i64 (ctlz (memopv8i64 addr:$src))),
+ (VPLZCNTQrm addr:$src)>;
+def : Pat<(v8i64 (ctlz (v8i64 VR512:$src))),
+ (VPLZCNTQrr VR512:$src)>;
+
def : Pat<(store (i1 -1), addr:$dst), (MOV8mi addr:$dst, (i8 1))>;
def : Pat<(store (i1 1), addr:$dst), (MOV8mi addr:$dst, (i8 1))>;
def : Pat<(store (i1 0), addr:$dst), (MOV8mi addr:$dst, (i8 0))>;
diff --git a/lib/Target/X86/X86InstrArithmetic.td b/lib/Target/X86/X86InstrArithmetic.td
index 368e14b..f2574cc 100644
--- a/lib/Target/X86/X86InstrArithmetic.td
+++ b/lib/Target/X86/X86InstrArithmetic.td
@@ -1278,8 +1278,10 @@ let isCompare = 1 in {
def TEST64mi32 : BinOpMI_F<"test", Xi64, X86testpat, MRM0m, 0xF6>;
// When testing the result of EXTRACT_SUBREG sub_8bit_hi, make sure the
- // register class is constrained to GR8_NOREX.
- let isPseudo = 1 in
+ // register class is constrained to GR8_NOREX. This pseudo is explicitly
+ // marked side-effect free, since it doesn't have an isel pattern like
+ // other test instructions.
+ let isPseudo = 1, hasSideEffects = 0 in
def TEST8ri_NOREX : I<0, Pseudo, (outs), (ins GR8_NOREX:$src, i8imm:$mask),
"", [], IIC_BIN_NONMEM>, Sched<[WriteALU]>;
} // Defs = [EFLAGS]
diff --git a/lib/Target/X86/X86InstrCompiler.td b/lib/Target/X86/X86InstrCompiler.td
index 34d8fb9..ca4f608 100644
--- a/lib/Target/X86/X86InstrCompiler.td
+++ b/lib/Target/X86/X86InstrCompiler.td
@@ -110,7 +110,7 @@ let Defs = [EAX, ESP, EFLAGS], Uses = [ESP] in
// When using segmented stacks these are lowered into instructions which first
// check if the current stacklet has enough free memory. If it does, memory is
-// allocated by bumping the stack pointer. Otherwise memory is allocated from
+// allocated by bumping the stack pointer. Otherwise memory is allocated from
// the heap.
let Defs = [EAX, ESP, EFLAGS], Uses = [ESP] in
@@ -197,6 +197,26 @@ let isBranch = 1, isTerminator = 1, isCodeGenOnly = 1 in {
}
//===----------------------------------------------------------------------===//
+// Pseudo instructions used by unwind info.
+//
+let isPseudo = 1 in {
+ def SEH_PushReg : I<0, Pseudo, (outs), (ins i32imm:$reg),
+ "#SEH_PushReg $reg", []>;
+ def SEH_SaveReg : I<0, Pseudo, (outs), (ins i32imm:$reg, i32imm:$dst),
+ "#SEH_SaveReg $reg, $dst", []>;
+ def SEH_SaveXMM : I<0, Pseudo, (outs), (ins i32imm:$reg, i32imm:$dst),
+ "#SEH_SaveXMM $reg, $dst", []>;
+ def SEH_StackAlloc : I<0, Pseudo, (outs), (ins i32imm:$size),
+ "#SEH_StackAlloc $size", []>;
+ def SEH_SetFrame : I<0, Pseudo, (outs), (ins i32imm:$reg, i32imm:$offset),
+ "#SEH_SetFrame $reg, $offset", []>;
+ def SEH_PushFrame : I<0, Pseudo, (outs), (ins i1imm:$mode),
+ "#SEH_PushFrame $mode", []>;
+ def SEH_EndPrologue : I<0, Pseudo, (outs), (ins),
+ "#SEH_EndPrologue", []>;
+}
+
+//===----------------------------------------------------------------------===//
// Pseudo instructions used by segmented stacks.
//
@@ -371,7 +391,7 @@ let Defs = [RCX,RDI], isCodeGenOnly = 1 in {
def REP_STOSD_64 : I<0xAB, RawFrm, (outs), (ins), "{rep;stosl|rep stosd}",
[(X86rep_stos i32)], IIC_REP_STOS>, REP, OpSize32,
Requires<[In64BitMode]>;
-
+
let Uses = [RAX,RCX,RDI] in
def REP_STOSQ_64 : RI<0xAB, RawFrm, (outs), (ins), "{rep;stosq|rep stosq}",
[(X86rep_stos i64)], IIC_REP_STOS>, REP,
@@ -502,83 +522,6 @@ def CMOV_RFP80 : I<0, Pseudo,
//===----------------------------------------------------------------------===//
-// Atomic Instruction Pseudo Instructions
-//===----------------------------------------------------------------------===//
-
-// Pseudo atomic instructions
-
-multiclass PSEUDO_ATOMIC_LOAD_BINOP<string mnemonic> {
- let usesCustomInserter = 1, mayLoad = 1, mayStore = 1 in {
- let Defs = [EFLAGS, AL] in
- def NAME#8 : I<0, Pseudo, (outs GR8:$dst),
- (ins i8mem:$ptr, GR8:$val),
- !strconcat(mnemonic, "8 PSEUDO!"), []>;
- let Defs = [EFLAGS, AX] in
- def NAME#16 : I<0, Pseudo,(outs GR16:$dst),
- (ins i16mem:$ptr, GR16:$val),
- !strconcat(mnemonic, "16 PSEUDO!"), []>;
- let Defs = [EFLAGS, EAX] in
- def NAME#32 : I<0, Pseudo, (outs GR32:$dst),
- (ins i32mem:$ptr, GR32:$val),
- !strconcat(mnemonic, "32 PSEUDO!"), []>;
- let Defs = [EFLAGS, RAX] in
- def NAME#64 : I<0, Pseudo, (outs GR64:$dst),
- (ins i64mem:$ptr, GR64:$val),
- !strconcat(mnemonic, "64 PSEUDO!"), []>;
- }
-}
-
-multiclass PSEUDO_ATOMIC_LOAD_BINOP_PATS<string name, string frag> {
- def : Pat<(!cast<PatFrag>(frag # "_8") addr:$ptr, GR8:$val),
- (!cast<Instruction>(name # "8") addr:$ptr, GR8:$val)>;
- def : Pat<(!cast<PatFrag>(frag # "_16") addr:$ptr, GR16:$val),
- (!cast<Instruction>(name # "16") addr:$ptr, GR16:$val)>;
- def : Pat<(!cast<PatFrag>(frag # "_32") addr:$ptr, GR32:$val),
- (!cast<Instruction>(name # "32") addr:$ptr, GR32:$val)>;
- def : Pat<(!cast<PatFrag>(frag # "_64") addr:$ptr, GR64:$val),
- (!cast<Instruction>(name # "64") addr:$ptr, GR64:$val)>;
-}
-
-// Atomic exchange, and, or, xor
-defm ATOMAND : PSEUDO_ATOMIC_LOAD_BINOP<"#ATOMAND">;
-defm ATOMOR : PSEUDO_ATOMIC_LOAD_BINOP<"#ATOMOR">;
-defm ATOMXOR : PSEUDO_ATOMIC_LOAD_BINOP<"#ATOMXOR">;
-defm ATOMNAND : PSEUDO_ATOMIC_LOAD_BINOP<"#ATOMNAND">;
-defm ATOMMAX : PSEUDO_ATOMIC_LOAD_BINOP<"#ATOMMAX">;
-defm ATOMMIN : PSEUDO_ATOMIC_LOAD_BINOP<"#ATOMMIN">;
-defm ATOMUMAX : PSEUDO_ATOMIC_LOAD_BINOP<"#ATOMUMAX">;
-defm ATOMUMIN : PSEUDO_ATOMIC_LOAD_BINOP<"#ATOMUMIN">;
-
-defm : PSEUDO_ATOMIC_LOAD_BINOP_PATS<"ATOMAND", "atomic_load_and">;
-defm : PSEUDO_ATOMIC_LOAD_BINOP_PATS<"ATOMOR", "atomic_load_or">;
-defm : PSEUDO_ATOMIC_LOAD_BINOP_PATS<"ATOMXOR", "atomic_load_xor">;
-defm : PSEUDO_ATOMIC_LOAD_BINOP_PATS<"ATOMNAND", "atomic_load_nand">;
-defm : PSEUDO_ATOMIC_LOAD_BINOP_PATS<"ATOMMAX", "atomic_load_max">;
-defm : PSEUDO_ATOMIC_LOAD_BINOP_PATS<"ATOMMIN", "atomic_load_min">;
-defm : PSEUDO_ATOMIC_LOAD_BINOP_PATS<"ATOMUMAX", "atomic_load_umax">;
-defm : PSEUDO_ATOMIC_LOAD_BINOP_PATS<"ATOMUMIN", "atomic_load_umin">;
-
-multiclass PSEUDO_ATOMIC_LOAD_BINOP6432<string mnemonic> {
- let usesCustomInserter = 1, Defs = [EFLAGS, EAX, EDX],
- mayLoad = 1, mayStore = 1, hasSideEffects = 0 in
- def NAME#6432 : I<0, Pseudo, (outs GR32:$dst1, GR32:$dst2),
- (ins i64mem:$ptr, GR32:$val1, GR32:$val2),
- !strconcat(mnemonic, "6432 PSEUDO!"), []>;
-}
-
-defm ATOMAND : PSEUDO_ATOMIC_LOAD_BINOP6432<"#ATOMAND">;
-defm ATOMOR : PSEUDO_ATOMIC_LOAD_BINOP6432<"#ATOMOR">;
-defm ATOMXOR : PSEUDO_ATOMIC_LOAD_BINOP6432<"#ATOMXOR">;
-defm ATOMNAND : PSEUDO_ATOMIC_LOAD_BINOP6432<"#ATOMNAND">;
-defm ATOMADD : PSEUDO_ATOMIC_LOAD_BINOP6432<"#ATOMADD">;
-defm ATOMSUB : PSEUDO_ATOMIC_LOAD_BINOP6432<"#ATOMSUB">;
-defm ATOMMAX : PSEUDO_ATOMIC_LOAD_BINOP6432<"#ATOMMAX">;
-defm ATOMMIN : PSEUDO_ATOMIC_LOAD_BINOP6432<"#ATOMMIN">;
-defm ATOMUMAX : PSEUDO_ATOMIC_LOAD_BINOP6432<"#ATOMUMAX">;
-defm ATOMUMIN : PSEUDO_ATOMIC_LOAD_BINOP6432<"#ATOMUMIN">;
-defm ATOMSWAP : PSEUDO_ATOMIC_LOAD_BINOP6432<"#ATOMSWAP">;
-
-//===----------------------------------------------------------------------===//
// Normal-Instructions-With-Lock-Prefix Pseudo Instructions
//===----------------------------------------------------------------------===//
@@ -1696,20 +1639,34 @@ def : Pat<(mul (loadi64 addr:$src1), i64immSExt32:$src2),
(IMUL64rmi32 addr:$src1, i64immSExt32:$src2)>;
// Increment reg.
-def : Pat<(add GR8 :$src, 1), (INC8r GR8 :$src)>;
-def : Pat<(add GR16:$src, 1), (INC16r GR16:$src)>, Requires<[Not64BitMode]>;
-def : Pat<(add GR16:$src, 1), (INC64_16r GR16:$src)>, Requires<[In64BitMode]>;
-def : Pat<(add GR32:$src, 1), (INC32r GR32:$src)>, Requires<[Not64BitMode]>;
-def : Pat<(add GR32:$src, 1), (INC64_32r GR32:$src)>, Requires<[In64BitMode]>;
-def : Pat<(add GR64:$src, 1), (INC64r GR64:$src)>;
+// Do not make INC if it is slow
+def : Pat<(add GR8:$src, 1),
+ (INC8r GR8:$src)>, Requires<[NotSlowIncDec]>;
+def : Pat<(add GR16:$src, 1),
+ (INC16r GR16:$src)>, Requires<[NotSlowIncDec, Not64BitMode]>;
+def : Pat<(add GR16:$src, 1),
+ (INC64_16r GR16:$src)>, Requires<[NotSlowIncDec, In64BitMode]>;
+def : Pat<(add GR32:$src, 1),
+ (INC32r GR32:$src)>, Requires<[NotSlowIncDec, Not64BitMode]>;
+def : Pat<(add GR32:$src, 1),
+ (INC64_32r GR32:$src)>, Requires<[NotSlowIncDec, In64BitMode]>;
+def : Pat<(add GR64:$src, 1),
+ (INC64r GR64:$src)>, Requires<[NotSlowIncDec]>;
// Decrement reg.
-def : Pat<(add GR8 :$src, -1), (DEC8r GR8 :$src)>;
-def : Pat<(add GR16:$src, -1), (DEC16r GR16:$src)>, Requires<[Not64BitMode]>;
-def : Pat<(add GR16:$src, -1), (DEC64_16r GR16:$src)>, Requires<[In64BitMode]>;
-def : Pat<(add GR32:$src, -1), (DEC32r GR32:$src)>, Requires<[Not64BitMode]>;
-def : Pat<(add GR32:$src, -1), (DEC64_32r GR32:$src)>, Requires<[In64BitMode]>;
-def : Pat<(add GR64:$src, -1), (DEC64r GR64:$src)>;
+// Do not make DEC if it is slow
+def : Pat<(add GR8:$src, -1),
+ (DEC8r GR8:$src)>, Requires<[NotSlowIncDec]>;
+def : Pat<(add GR16:$src, -1),
+ (DEC16r GR16:$src)>, Requires<[NotSlowIncDec, Not64BitMode]>;
+def : Pat<(add GR16:$src, -1),
+ (DEC64_16r GR16:$src)>, Requires<[NotSlowIncDec, In64BitMode]>;
+def : Pat<(add GR32:$src, -1),
+ (DEC32r GR32:$src)>, Requires<[NotSlowIncDec, Not64BitMode]>;
+def : Pat<(add GR32:$src, -1),
+ (DEC64_32r GR32:$src)>, Requires<[NotSlowIncDec, In64BitMode]>;
+def : Pat<(add GR64:$src, -1),
+ (DEC64r GR64:$src)>, Requires<[NotSlowIncDec]>;
// or reg/reg.
def : Pat<(or GR8 :$src1, GR8 :$src2), (OR8rr GR8 :$src1, GR8 :$src2)>;
diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td
index 1582f43..6f0fa94 100644
--- a/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ b/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -224,6 +224,10 @@ def X86Movhlps : SDNode<"X86ISD::MOVHLPS", SDTShuff2Op>;
def X86Movlps : SDNode<"X86ISD::MOVLPS", SDTShuff2Op>;
def X86Movlpd : SDNode<"X86ISD::MOVLPD", SDTShuff2Op>;
+def SDTPack : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, SDTCisSameAs<2, 1>]>;
+def X86Packss : SDNode<"X86ISD::PACKSS", SDTPack>;
+def X86Packus : SDNode<"X86ISD::PACKUS", SDTPack>;
+
def X86Unpckl : SDNode<"X86ISD::UNPCKL", SDTShuff2Op>;
def X86Unpckh : SDNode<"X86ISD::UNPCKH", SDTShuff2Op>;
diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp
index 6993577..0d3afc4 100644
--- a/lib/Target/X86/X86InstrInfo.cpp
+++ b/lib/Target/X86/X86InstrInfo.cpp
@@ -28,6 +28,7 @@
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
@@ -97,14 +98,11 @@ struct X86OpTblEntry {
// Pin the vtable to this file.
void X86InstrInfo::anchor() {}
-X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
- : X86GenInstrInfo((tm.getSubtarget<X86Subtarget>().is64Bit()
- ? X86::ADJCALLSTACKDOWN64
- : X86::ADJCALLSTACKDOWN32),
- (tm.getSubtarget<X86Subtarget>().is64Bit()
- ? X86::ADJCALLSTACKUP64
- : X86::ADJCALLSTACKUP32)),
- TM(tm), RI(tm) {
+X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
+ : X86GenInstrInfo(
+ (STI.is64Bit() ? X86::ADJCALLSTACKDOWN64 : X86::ADJCALLSTACKDOWN32),
+ (STI.is64Bit() ? X86::ADJCALLSTACKUP64 : X86::ADJCALLSTACKUP32)),
+ Subtarget(STI), RI(STI) {
static const X86OpTblEntry OpTbl2Addr[] = {
{ X86::ADC32ri, X86::ADC32mi, 0 },
@@ -1472,7 +1470,7 @@ X86InstrInfo::isCoalescableExtInstr(const MachineInstr &MI,
case X86::MOVSX32rr8:
case X86::MOVZX32rr8:
case X86::MOVSX64rr8:
- if (!TM.getSubtarget<X86Subtarget>().is64Bit())
+ if (!Subtarget.is64Bit())
// It's not always legal to reference the low 8-bit of the larger
// register in 32-bit mode.
return false;
@@ -1950,7 +1948,7 @@ X86InstrInfo::convertToThreeAddressWithLEA(unsigned MIOpc,
MachineRegisterInfo &RegInfo = MFI->getParent()->getRegInfo();
unsigned leaOutReg = RegInfo.createVirtualRegister(&X86::GR32RegClass);
unsigned Opc, leaInReg;
- if (TM.getSubtarget<X86Subtarget>().is64Bit()) {
+ if (Subtarget.is64Bit()) {
Opc = X86::LEA64_32r;
leaInReg = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass);
} else {
@@ -2006,7 +2004,7 @@ X86InstrInfo::convertToThreeAddressWithLEA(unsigned MIOpc,
// just a single insert_subreg.
addRegReg(MIB, leaInReg, true, leaInReg, false);
} else {
- if (TM.getSubtarget<X86Subtarget>().is64Bit())
+ if (Subtarget.is64Bit())
leaInReg2 = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass);
else
leaInReg2 = RegInfo.createVirtualRegister(&X86::GR32_NOSPRegClass);
@@ -2076,13 +2074,13 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
// we have better subtarget support, enable the 16-bit LEA generation here.
// 16-bit LEA is also slow on Core2.
bool DisableLEA16 = true;
- bool is64Bit = TM.getSubtarget<X86Subtarget>().is64Bit();
+ bool is64Bit = Subtarget.is64Bit();
unsigned MIOpc = MI->getOpcode();
switch (MIOpc) {
case X86::SHUFPSrri: {
assert(MI->getNumOperands() == 4 && "Unknown shufps instruction!");
- if (!TM.getSubtarget<X86Subtarget>().hasSSE2()) return nullptr;
+ if (!Subtarget.hasSSE2()) return nullptr;
unsigned B = MI->getOperand(1).getReg();
unsigned C = MI->getOperand(2).getReg();
@@ -2094,7 +2092,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
}
case X86::SHUFPDrri: {
assert(MI->getNumOperands() == 4 && "Unknown shufpd instruction!");
- if (!TM.getSubtarget<X86Subtarget>().hasSSE2()) return nullptr;
+ if (!Subtarget.hasSSE2()) return nullptr;
unsigned B = MI->getOperand(1).getReg();
unsigned C = MI->getOperand(2).getReg();
@@ -2672,8 +2670,7 @@ static X86::CondCode getSwappedCondition(X86::CondCode CC) {
/// getSETFromCond - Return a set opcode for the given condition and
/// whether it has memory operand.
-static unsigned getSETFromCond(X86::CondCode CC,
- bool HasMemoryOperand) {
+unsigned X86::getSETFromCond(CondCode CC, bool HasMemoryOperand) {
static const uint16_t Opc[16][2] = {
{ X86::SETAr, X86::SETAm },
{ X86::SETAEr, X86::SETAEm },
@@ -2693,14 +2690,14 @@ static unsigned getSETFromCond(X86::CondCode CC,
{ X86::SETSr, X86::SETSm }
};
- assert(CC < 16 && "Can only handle standard cond codes");
+ assert(CC <= LAST_VALID_COND && "Can only handle standard cond codes");
return Opc[CC][HasMemoryOperand ? 1 : 0];
}
/// getCMovFromCond - Return a cmov opcode for the given condition,
/// register size in bytes, and operand type.
-static unsigned getCMovFromCond(X86::CondCode CC, unsigned RegBytes,
- bool HasMemoryOperand) {
+unsigned X86::getCMovFromCond(CondCode CC, unsigned RegBytes,
+ bool HasMemoryOperand) {
static const uint16_t Opc[32][3] = {
{ X86::CMOVA16rr, X86::CMOVA32rr, X86::CMOVA64rr },
{ X86::CMOVAE16rr, X86::CMOVAE32rr, X86::CMOVAE64rr },
@@ -2976,7 +2973,7 @@ canInsertSelect(const MachineBasicBlock &MBB,
unsigned TrueReg, unsigned FalseReg,
int &CondCycles, int &TrueCycles, int &FalseCycles) const {
// Not all subtargets have cmov instructions.
- if (!TM.getSubtarget<X86Subtarget>().hasCMov())
+ if (!Subtarget.hasCMov())
return false;
if (Cond.size() != 1)
return false;
@@ -3027,8 +3024,7 @@ static bool isHReg(unsigned Reg) {
// Try and copy between VR128/VR64 and GR64 registers.
static unsigned CopyToFromAsymmetricReg(unsigned DestReg, unsigned SrcReg,
- const X86Subtarget& Subtarget) {
-
+ const X86Subtarget &Subtarget) {
// SrcReg(VR128) -> DestReg(GR64)
// SrcReg(VR64) -> DestReg(GR64)
@@ -3107,8 +3103,8 @@ void X86InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
unsigned DestReg, unsigned SrcReg,
bool KillSrc) const {
// First deal with the normal symmetric copies.
- bool HasAVX = TM.getSubtarget<X86Subtarget>().hasAVX();
- bool HasAVX512 = TM.getSubtarget<X86Subtarget>().hasAVX512();
+ bool HasAVX = Subtarget.hasAVX();
+ bool HasAVX512 = Subtarget.hasAVX512();
unsigned Opc = 0;
if (X86::GR64RegClass.contains(DestReg, SrcReg))
Opc = X86::MOV64rr;
@@ -3120,7 +3116,7 @@ void X86InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
// Copying to or from a physical H register on x86-64 requires a NOREX
// move. Otherwise use a normal move.
if ((isHReg(DestReg) || isHReg(SrcReg)) &&
- TM.getSubtarget<X86Subtarget>().is64Bit()) {
+ Subtarget.is64Bit()) {
Opc = X86::MOV8rr_NOREX;
// Both operands must be encodable without an REX prefix.
assert(X86::GR8_NOREXRegClass.contains(SrcReg, DestReg) &&
@@ -3137,7 +3133,7 @@ void X86InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
else if (X86::VR256RegClass.contains(DestReg, SrcReg))
Opc = X86::VMOVAPSYrr;
if (!Opc)
- Opc = CopyToFromAsymmetricReg(DestReg, SrcReg, TM.getSubtarget<X86Subtarget>());
+ Opc = CopyToFromAsymmetricReg(DestReg, SrcReg, Subtarget);
if (Opc) {
BuildMI(MBB, MI, DL, get(Opc), DestReg)
@@ -3183,9 +3179,9 @@ void X86InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
static unsigned getLoadStoreRegOpcode(unsigned Reg,
const TargetRegisterClass *RC,
bool isStackAligned,
- const TargetMachine &TM,
+ const X86Subtarget &STI,
bool load) {
- if (TM.getSubtarget<X86Subtarget>().hasAVX512()) {
+ if (STI.hasAVX512()) {
if (X86::VK8RegClass.hasSubClassEq(RC) ||
X86::VK16RegClass.hasSubClassEq(RC))
return load ? X86::KMOVWkm : X86::KMOVWmk;
@@ -3197,13 +3193,13 @@ static unsigned getLoadStoreRegOpcode(unsigned Reg,
return load ? X86::VMOVUPSZrm : X86::VMOVUPSZmr;
}
- bool HasAVX = TM.getSubtarget<X86Subtarget>().hasAVX();
+ bool HasAVX = STI.hasAVX();
switch (RC->getSize()) {
default:
llvm_unreachable("Unknown spill size");
case 1:
assert(X86::GR8RegClass.hasSubClassEq(RC) && "Unknown 1-byte regclass");
- if (TM.getSubtarget<X86Subtarget>().is64Bit())
+ if (STI.is64Bit())
// Copying to or from a physical H register on x86-64 requires a NOREX
// move. Otherwise use a normal move.
if (isHReg(Reg) || X86::GR8_ABCD_HRegClass.hasSubClassEq(RC))
@@ -3270,16 +3266,16 @@ static unsigned getLoadStoreRegOpcode(unsigned Reg,
static unsigned getStoreRegOpcode(unsigned SrcReg,
const TargetRegisterClass *RC,
bool isStackAligned,
- TargetMachine &TM) {
- return getLoadStoreRegOpcode(SrcReg, RC, isStackAligned, TM, false);
+ const X86Subtarget &STI) {
+ return getLoadStoreRegOpcode(SrcReg, RC, isStackAligned, STI, false);
}
static unsigned getLoadRegOpcode(unsigned DestReg,
const TargetRegisterClass *RC,
bool isStackAligned,
- const TargetMachine &TM) {
- return getLoadStoreRegOpcode(DestReg, RC, isStackAligned, TM, true);
+ const X86Subtarget &STI) {
+ return getLoadStoreRegOpcode(DestReg, RC, isStackAligned, STI, true);
}
void X86InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
@@ -3291,9 +3287,10 @@ void X86InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
assert(MF.getFrameInfo()->getObjectSize(FrameIdx) >= RC->getSize() &&
"Stack slot too small for store");
unsigned Alignment = std::max<uint32_t>(RC->getSize(), 16);
- bool isAligned = (TM.getFrameLowering()->getStackAlignment() >= Alignment) ||
- RI.canRealignStack(MF);
- unsigned Opc = getStoreRegOpcode(SrcReg, RC, isAligned, TM);
+ bool isAligned =
+ (MF.getTarget().getFrameLowering()->getStackAlignment() >= Alignment) ||
+ RI.canRealignStack(MF);
+ unsigned Opc = getStoreRegOpcode(SrcReg, RC, isAligned, Subtarget);
DebugLoc DL = MBB.findDebugLoc(MI);
addFrameReference(BuildMI(MBB, MI, DL, get(Opc)), FrameIdx)
.addReg(SrcReg, getKillRegState(isKill));
@@ -3309,7 +3306,7 @@ void X86InstrInfo::storeRegToAddr(MachineFunction &MF, unsigned SrcReg,
unsigned Alignment = std::max<uint32_t>(RC->getSize(), 16);
bool isAligned = MMOBegin != MMOEnd &&
(*MMOBegin)->getAlignment() >= Alignment;
- unsigned Opc = getStoreRegOpcode(SrcReg, RC, isAligned, TM);
+ unsigned Opc = getStoreRegOpcode(SrcReg, RC, isAligned, Subtarget);
DebugLoc DL;
MachineInstrBuilder MIB = BuildMI(MF, DL, get(Opc));
for (unsigned i = 0, e = Addr.size(); i != e; ++i)
@@ -3327,9 +3324,10 @@ void X86InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
const TargetRegisterInfo *TRI) const {
const MachineFunction &MF = *MBB.getParent();
unsigned Alignment = std::max<uint32_t>(RC->getSize(), 16);
- bool isAligned = (TM.getFrameLowering()->getStackAlignment() >= Alignment) ||
- RI.canRealignStack(MF);
- unsigned Opc = getLoadRegOpcode(DestReg, RC, isAligned, TM);
+ bool isAligned =
+ (MF.getTarget().getFrameLowering()->getStackAlignment() >= Alignment) ||
+ RI.canRealignStack(MF);
+ unsigned Opc = getLoadRegOpcode(DestReg, RC, isAligned, Subtarget);
DebugLoc DL = MBB.findDebugLoc(MI);
addFrameReference(BuildMI(MBB, MI, DL, get(Opc), DestReg), FrameIdx);
}
@@ -3343,7 +3341,7 @@ void X86InstrInfo::loadRegFromAddr(MachineFunction &MF, unsigned DestReg,
unsigned Alignment = std::max<uint32_t>(RC->getSize(), 16);
bool isAligned = MMOBegin != MMOEnd &&
(*MMOBegin)->getAlignment() >= Alignment;
- unsigned Opc = getLoadRegOpcode(DestReg, RC, isAligned, TM);
+ unsigned Opc = getLoadRegOpcode(DestReg, RC, isAligned, Subtarget);
DebugLoc DL;
MachineInstrBuilder MIB = BuildMI(MF, DL, get(Opc), DestReg);
for (unsigned i = 0, e = Addr.size(); i != e; ++i)
@@ -3741,7 +3739,7 @@ optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2,
continue;
// EFLAGS is used by this instruction.
- X86::CondCode OldCC;
+ X86::CondCode OldCC = X86::COND_INVALID;
bool OpcIsSET = false;
if (IsCmpZero || IsSwapped) {
// We decode the condition code from opcode.
@@ -3964,7 +3962,7 @@ static bool Expand2AddrUndef(MachineInstrBuilder &MIB,
}
bool X86InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
- bool HasAVX = TM.getSubtarget<X86Subtarget>().hasAVX();
+ bool HasAVX = Subtarget.hasAVX();
MachineInstrBuilder MIB(*MI->getParent()->getParent(), MI);
switch (MI->getOpcode()) {
case X86::MOV32r0:
@@ -4075,7 +4073,7 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
unsigned Size, unsigned Align) const {
const DenseMap<unsigned,
std::pair<unsigned,unsigned> > *OpcodeTablePtr = nullptr;
- bool isCallRegIndirect = TM.getSubtarget<X86Subtarget>().callRegIndirect();
+ bool isCallRegIndirect = Subtarget.callRegIndirect();
bool isTwoAddrFold = false;
// Atom favors register form of call. So, we do not fold loads into calls
@@ -4316,7 +4314,7 @@ breakPartialRegDependency(MachineBasicBlock::iterator MI, unsigned OpNum,
if (X86::VR128RegClass.contains(Reg)) {
// These instructions are all floating point domain, so xorps is the best
// choice.
- bool HasAVX = TM.getSubtarget<X86Subtarget>().hasAVX();
+ bool HasAVX = Subtarget.hasAVX();
unsigned Opc = HasAVX ? X86::VXORPSrr : X86::XORPSrr;
BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), get(Opc), Reg)
.addReg(Reg, RegState::Undef).addReg(Reg, RegState::Undef);
@@ -4352,7 +4350,8 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI,
// If the function stack isn't realigned we don't want to fold instructions
// that need increased alignment.
if (!RI.needsStackRealignment(MF))
- Alignment = std::min(Alignment, TM.getFrameLowering()->getStackAlignment());
+ Alignment = std::min(
+ Alignment, MF.getTarget().getFrameLowering()->getStackAlignment());
if (Ops.size() == 2 && Ops[0] == 0 && Ops[1] == 1) {
unsigned NewOpc = 0;
unsigned RCSize = 0;
@@ -4453,14 +4452,14 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
// Create a constant-pool entry and operands to load from it.
// Medium and large mode can't fold loads this way.
- if (TM.getCodeModel() != CodeModel::Small &&
- TM.getCodeModel() != CodeModel::Kernel)
+ if (MF.getTarget().getCodeModel() != CodeModel::Small &&
+ MF.getTarget().getCodeModel() != CodeModel::Kernel)
return nullptr;
// x86-32 PIC requires a PIC base register for constant pools.
unsigned PICBase = 0;
- if (TM.getRelocationModel() == Reloc::PIC_) {
- if (TM.getSubtarget<X86Subtarget>().is64Bit())
+ if (MF.getTarget().getRelocationModel() == Reloc::PIC_) {
+ if (Subtarget.is64Bit())
PICBase = X86::RIP;
else
// FIXME: PICBase = getGlobalBaseReg(&MF);
@@ -4600,7 +4599,7 @@ bool X86InstrInfo::unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI,
const TargetRegisterClass *RC = getRegClass(MCID, Index, &RI, MF);
if (!MI->hasOneMemOperand() &&
RC == &X86::VR128RegClass &&
- !TM.getSubtarget<X86Subtarget>().isUnalignedMemAccessFast())
+ !Subtarget.isUnalignedMemAccessFast())
// Without memoperands, loadRegFromAddr and storeRegToStackSlot will
// conservatively assume the address is unaligned. That's bad for
// performance.
@@ -4748,13 +4747,13 @@ X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N,
cast<MachineSDNode>(N)->memoperands_end());
if (!(*MMOs.first) &&
RC == &X86::VR128RegClass &&
- !TM.getSubtarget<X86Subtarget>().isUnalignedMemAccessFast())
+ !Subtarget.isUnalignedMemAccessFast())
// Do not introduce a slow unaligned load.
return false;
unsigned Alignment = RC->getSize() == 32 ? 32 : 16;
bool isAligned = (*MMOs.first) &&
(*MMOs.first)->getAlignment() >= Alignment;
- Load = DAG.getMachineNode(getLoadRegOpcode(0, RC, isAligned, TM), dl,
+ Load = DAG.getMachineNode(getLoadRegOpcode(0, RC, isAligned, Subtarget), dl,
VT, MVT::Other, AddrOps);
NewNodes.push_back(Load);
@@ -4791,15 +4790,15 @@ X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N,
cast<MachineSDNode>(N)->memoperands_end());
if (!(*MMOs.first) &&
RC == &X86::VR128RegClass &&
- !TM.getSubtarget<X86Subtarget>().isUnalignedMemAccessFast())
+ !Subtarget.isUnalignedMemAccessFast())
// Do not introduce a slow unaligned store.
return false;
unsigned Alignment = RC->getSize() == 32 ? 32 : 16;
bool isAligned = (*MMOs.first) &&
(*MMOs.first)->getAlignment() >= Alignment;
- SDNode *Store = DAG.getMachineNode(getStoreRegOpcode(0, DstRC,
- isAligned, TM),
- dl, MVT::Other, AddrOps);
+ SDNode *Store =
+ DAG.getMachineNode(getStoreRegOpcode(0, DstRC, isAligned, Subtarget),
+ dl, MVT::Other, AddrOps);
NewNodes.push_back(Store);
// Preserve memory reference information.
@@ -4960,7 +4959,7 @@ bool X86InstrInfo::shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2,
default:
// XMM registers. In 64-bit mode we can be a bit more aggressive since we
// have 16 of them to play with.
- if (TM.getSubtargetImpl()->is64Bit()) {
+ if (Subtarget.is64Bit()) {
if (NumLoads >= 3)
return false;
} else if (NumLoads) {
@@ -4986,7 +4985,7 @@ bool X86InstrInfo::shouldScheduleAdjacent(MachineInstr* First,
// Check if this processor supports macro-fusion. Since this is a minor
// heuristic, we haven't specifically reserved a feature. hasAVX is a decent
// proxy for SandyBridge+.
- if (!TM.getSubtarget<X86Subtarget>().hasAVX())
+ if (!Subtarget.hasAVX())
return false;
enum {
@@ -5038,6 +5037,7 @@ bool X86InstrInfo::shouldScheduleAdjacent(MachineInstr* First,
case X86::TEST16rm:
case X86::TEST32rm:
case X86::TEST64rm:
+ case X86::TEST8ri_NOREX:
case X86::AND16i16:
case X86::AND16ri:
case X86::AND16ri8:
@@ -5168,7 +5168,7 @@ isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const {
/// TODO: Eliminate this and move the code to X86MachineFunctionInfo.
///
unsigned X86InstrInfo::getGlobalBaseReg(MachineFunction *MF) const {
- assert(!TM.getSubtarget<X86Subtarget>().is64Bit() &&
+ assert(!Subtarget.is64Bit() &&
"X86-64 PIC uses RIP relative addressing");
X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
@@ -5271,7 +5271,7 @@ static const uint16_t *lookupAVX2(unsigned opcode, unsigned domain) {
std::pair<uint16_t, uint16_t>
X86InstrInfo::getExecutionDomain(const MachineInstr *MI) const {
uint16_t domain = (MI->getDesc().TSFlags >> X86II::SSEDomainShift) & 3;
- bool hasAVX2 = TM.getSubtarget<X86Subtarget>().hasAVX2();
+ bool hasAVX2 = Subtarget.hasAVX2();
uint16_t validDomains = 0;
if (domain && lookup(MI->getOpcode(), domain))
validDomains = 0xe;
@@ -5286,7 +5286,7 @@ void X86InstrInfo::setExecutionDomain(MachineInstr *MI, unsigned Domain) const {
assert(dom && "Not an SSE instruction");
const uint16_t *table = lookup(MI->getOpcode(), dom);
if (!table) { // try the other table
- assert((TM.getSubtarget<X86Subtarget>().hasAVX2() || Domain < 3) &&
+ assert((Subtarget.hasAVX2() || Domain < 3) &&
"256-bit vector operations only available in AVX2");
table = lookupAVX2(MI->getOpcode(), dom);
}
@@ -5299,6 +5299,16 @@ void X86InstrInfo::getNoopForMachoTarget(MCInst &NopInst) const {
NopInst.setOpcode(X86::NOOP);
}
+void X86InstrInfo::getUnconditionalBranch(
+ MCInst &Branch, const MCSymbolRefExpr *BranchTarget) const {
+ Branch.setOpcode(X86::JMP_4);
+ Branch.addOperand(MCOperand::CreateExpr(BranchTarget));
+}
+
+void X86InstrInfo::getTrap(MCInst &MI) const {
+ MI.setOpcode(X86::TRAP);
+}
+
bool X86InstrInfo::isHighLatencyDef(int opc) const {
switch (opc) {
default: return false;
diff --git a/lib/Target/X86/X86InstrInfo.h b/lib/Target/X86/X86InstrInfo.h
index 5f34915..c177e3a 100644
--- a/lib/Target/X86/X86InstrInfo.h
+++ b/lib/Target/X86/X86InstrInfo.h
@@ -24,7 +24,7 @@
namespace llvm {
class X86RegisterInfo;
- class X86TargetMachine;
+ class X86Subtarget;
namespace X86 {
// X86 specific condition code. These correspond to X86_*_COND in
@@ -46,6 +46,7 @@ namespace X86 {
COND_O = 13,
COND_P = 14,
COND_S = 15,
+ LAST_VALID_COND = COND_S,
// Artificial condition codes. These are used by AnalyzeBranch
// to indicate a block terminated with two conditional branches to
@@ -61,12 +62,21 @@ namespace X86 {
// Turn condition code into conditional branch opcode.
unsigned GetCondBranchFromCond(CondCode CC);
+ /// \brief Return a set opcode for the given condition and whether it has
+ /// a memory operand.
+ unsigned getSETFromCond(CondCode CC, bool HasMemoryOperand = false);
+
+ /// \brief Return a cmov opcode for the given condition, register size in
+ /// bytes, and operand type.
+ unsigned getCMovFromCond(CondCode CC, unsigned RegBytes,
+ bool HasMemoryOperand = false);
+
// Turn CMov opcode into condition code.
CondCode getCondFromCMovOpc(unsigned Opc);
/// GetOppositeBranchCondition - Return the inverse of the specified cond,
/// e.g. turning COND_E to COND_NE.
- CondCode GetOppositeBranchCondition(X86::CondCode CC);
+ CondCode GetOppositeBranchCondition(CondCode CC);
} // end namespace X86;
@@ -129,7 +139,7 @@ inline static bool isMem(const MachineInstr *MI, unsigned Op) {
}
class X86InstrInfo final : public X86GenInstrInfo {
- X86TargetMachine &TM;
+ X86Subtarget &Subtarget;
const X86RegisterInfo RI;
/// RegOp2MemOpTable3Addr, RegOp2MemOpTable0, RegOp2MemOpTable1,
@@ -156,7 +166,7 @@ class X86InstrInfo final : public X86GenInstrInfo {
virtual void anchor();
public:
- explicit X86InstrInfo(X86TargetMachine &tm);
+ explicit X86InstrInfo(X86Subtarget &STI);
/// getRegisterInfo - TargetInstrInfo is a superset of MRegister info. As
/// such, whenever a client has an instance of instruction info, it should
@@ -396,6 +406,12 @@ public:
const SmallVectorImpl<MachineOperand> &MOs,
unsigned Size, unsigned Alignment) const;
+ void
+ getUnconditionalBranch(MCInst &Branch,
+ const MCSymbolRefExpr *BranchTarget) const override;
+
+ void getTrap(MCInst &MI) const override;
+
bool isHighLatencyDef(int opc) const override;
bool hasHighOperandLatency(const InstrItineraryData *ItinData,
diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td
index 0d97669..e7b532c 100644
--- a/lib/Target/X86/X86InstrInfo.td
+++ b/lib/Target/X86/X86InstrInfo.td
@@ -155,27 +155,6 @@ def X86cas16 : SDNode<"X86ISD::LCMPXCHG16_DAG", SDTX86caspair,
[SDNPHasChain, SDNPInGlue, SDNPOutGlue, SDNPMayStore,
SDNPMayLoad, SDNPMemOperand]>;
-def X86AtomAdd64 : SDNode<"X86ISD::ATOMADD64_DAG", SDTX86atomicBinary,
- [SDNPHasChain, SDNPMayStore,
- SDNPMayLoad, SDNPMemOperand]>;
-def X86AtomSub64 : SDNode<"X86ISD::ATOMSUB64_DAG", SDTX86atomicBinary,
- [SDNPHasChain, SDNPMayStore,
- SDNPMayLoad, SDNPMemOperand]>;
-def X86AtomOr64 : SDNode<"X86ISD::ATOMOR64_DAG", SDTX86atomicBinary,
- [SDNPHasChain, SDNPMayStore,
- SDNPMayLoad, SDNPMemOperand]>;
-def X86AtomXor64 : SDNode<"X86ISD::ATOMXOR64_DAG", SDTX86atomicBinary,
- [SDNPHasChain, SDNPMayStore,
- SDNPMayLoad, SDNPMemOperand]>;
-def X86AtomAnd64 : SDNode<"X86ISD::ATOMAND64_DAG", SDTX86atomicBinary,
- [SDNPHasChain, SDNPMayStore,
- SDNPMayLoad, SDNPMemOperand]>;
-def X86AtomNand64 : SDNode<"X86ISD::ATOMNAND64_DAG", SDTX86atomicBinary,
- [SDNPHasChain, SDNPMayStore,
- SDNPMayLoad, SDNPMemOperand]>;
-def X86AtomSwap64 : SDNode<"X86ISD::ATOMSWAP64_DAG", SDTX86atomicBinary,
- [SDNPHasChain, SDNPMayStore,
- SDNPMayLoad, SDNPMemOperand]>;
def X86retflag : SDNode<"X86ISD::RET_FLAG", SDTX86Ret,
[SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
@@ -208,6 +187,8 @@ def X86rdtsc : SDNode<"X86ISD::RDTSC_DAG", SDTX86Void,
[SDNPHasChain, SDNPOutGlue, SDNPSideEffect]>;
def X86rdtscp : SDNode<"X86ISD::RDTSCP_DAG", SDTX86Void,
[SDNPHasChain, SDNPOutGlue, SDNPSideEffect]>;
+def X86rdpmc : SDNode<"X86ISD::RDPMC_DAG", SDTX86Void,
+ [SDNPHasChain, SDNPOutGlue, SDNPSideEffect]>;
def X86Wrapper : SDNode<"X86ISD::Wrapper", SDTX86Wrapper>;
def X86WrapperRIP : SDNode<"X86ISD::WrapperRIP", SDTX86Wrapper>;
@@ -795,6 +776,7 @@ def OptForSpeed : Predicate<"!OptForSize">;
def FastBTMem : Predicate<"!Subtarget->isBTMemSlow()">;
def CallImmAddr : Predicate<"Subtarget->IsLegalToCallImmediateAddr(TM)">;
def FavorMemIndirectCall : Predicate<"!Subtarget->callRegIndirect()">;
+def NotSlowIncDec : Predicate<"!Subtarget->slowIncDec()">;
//===----------------------------------------------------------------------===//
// X86 Instruction Format Definitions.
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index 1eb0485..f9a5ae1 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -4337,20 +4337,6 @@ defm PCMPGTD : PDI_binop_all<0x66, "pcmpgtd", X86pcmpgt, v4i32, v8i32,
SSE_INTALU_ITINS_P, 0>;
//===---------------------------------------------------------------------===//
-// SSE2 - Packed Integer Pack Instructions
-//===---------------------------------------------------------------------===//
-
-defm PACKSSWB : PDI_binop_all_int<0x63, "packsswb", int_x86_sse2_packsswb_128,
- int_x86_avx2_packsswb,
- SSE_INTALU_ITINS_SHUFF_P, 0>;
-defm PACKSSDW : PDI_binop_all_int<0x6B, "packssdw", int_x86_sse2_packssdw_128,
- int_x86_avx2_packssdw,
- SSE_INTALU_ITINS_SHUFF_P, 0>;
-defm PACKUSWB : PDI_binop_all_int<0x67, "packuswb", int_x86_sse2_packuswb_128,
- int_x86_avx2_packuswb,
- SSE_INTALU_ITINS_SHUFF_P, 0>;
-
-//===---------------------------------------------------------------------===//
// SSE2 - Packed Integer Shuffle Instructions
//===---------------------------------------------------------------------===//
@@ -4432,6 +4418,136 @@ let Predicates = [UseSSE2] in {
}
//===---------------------------------------------------------------------===//
+// Packed Integer Pack Instructions (SSE & AVX)
+//===---------------------------------------------------------------------===//
+
+let ExeDomain = SSEPackedInt in {
+multiclass sse2_pack<bits<8> opc, string OpcodeStr, ValueType OutVT,
+ ValueType ArgVT, SDNode OpNode, PatFrag bc_frag,
+ bit Is2Addr = 1> {
+ def rr : PDI<opc, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set VR128:$dst,
+ (OutVT (OpNode (ArgVT VR128:$src1), VR128:$src2)))]>,
+ Sched<[WriteShuffle]>;
+ def rm : PDI<opc, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set VR128:$dst,
+ (OutVT (OpNode VR128:$src1,
+ (bc_frag (memopv2i64 addr:$src2)))))]>,
+ Sched<[WriteShuffleLd, ReadAfterLd]>;
+}
+
+multiclass sse2_pack_y<bits<8> opc, string OpcodeStr, ValueType OutVT,
+ ValueType ArgVT, SDNode OpNode, PatFrag bc_frag> {
+ def Yrr : PDI<opc, MRMSrcReg,
+ (outs VR256:$dst), (ins VR256:$src1, VR256:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR256:$dst,
+ (OutVT (OpNode (ArgVT VR256:$src1), VR256:$src2)))]>,
+ Sched<[WriteShuffle]>;
+ def Yrm : PDI<opc, MRMSrcMem,
+ (outs VR256:$dst), (ins VR256:$src1, i256mem:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR256:$dst,
+ (OutVT (OpNode VR256:$src1,
+ (bc_frag (memopv4i64 addr:$src2)))))]>,
+ Sched<[WriteShuffleLd, ReadAfterLd]>;
+}
+
+multiclass sse4_pack<bits<8> opc, string OpcodeStr, ValueType OutVT,
+ ValueType ArgVT, SDNode OpNode, PatFrag bc_frag,
+ bit Is2Addr = 1> {
+ def rr : SS48I<opc, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set VR128:$dst,
+ (OutVT (OpNode (ArgVT VR128:$src1), VR128:$src2)))]>,
+ Sched<[WriteShuffle]>;
+ def rm : SS48I<opc, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set VR128:$dst,
+ (OutVT (OpNode VR128:$src1,
+ (bc_frag (memopv2i64 addr:$src2)))))]>,
+ Sched<[WriteShuffleLd, ReadAfterLd]>;
+}
+
+multiclass sse4_pack_y<bits<8> opc, string OpcodeStr, ValueType OutVT,
+ ValueType ArgVT, SDNode OpNode, PatFrag bc_frag> {
+ def Yrr : SS48I<opc, MRMSrcReg,
+ (outs VR256:$dst), (ins VR256:$src1, VR256:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR256:$dst,
+ (OutVT (OpNode (ArgVT VR256:$src1), VR256:$src2)))]>,
+ Sched<[WriteShuffle]>;
+ def Yrm : SS48I<opc, MRMSrcMem,
+ (outs VR256:$dst), (ins VR256:$src1, i256mem:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR256:$dst,
+ (OutVT (OpNode VR256:$src1,
+ (bc_frag (memopv4i64 addr:$src2)))))]>,
+ Sched<[WriteShuffleLd, ReadAfterLd]>;
+}
+
+let Predicates = [HasAVX] in {
+ defm VPACKSSWB : sse2_pack<0x63, "vpacksswb", v16i8, v8i16, X86Packss,
+ bc_v8i16, 0>, VEX_4V;
+ defm VPACKSSDW : sse2_pack<0x6B, "vpackssdw", v8i16, v4i32, X86Packss,
+ bc_v4i32, 0>, VEX_4V;
+
+ defm VPACKUSWB : sse2_pack<0x67, "vpackuswb", v16i8, v8i16, X86Packus,
+ bc_v8i16, 0>, VEX_4V;
+ defm VPACKUSDW : sse4_pack<0x2B, "vpackusdw", v8i16, v4i32, X86Packus,
+ bc_v4i32, 0>, VEX_4V;
+}
+
+let Predicates = [HasAVX2] in {
+ defm VPACKSSWB : sse2_pack_y<0x63, "vpacksswb", v32i8, v16i16, X86Packss,
+ bc_v16i16>, VEX_4V, VEX_L;
+ defm VPACKSSDW : sse2_pack_y<0x6B, "vpackssdw", v16i16, v8i32, X86Packss,
+ bc_v8i32>, VEX_4V, VEX_L;
+
+ defm VPACKUSWB : sse2_pack_y<0x67, "vpackuswb", v32i8, v16i16, X86Packus,
+ bc_v16i16>, VEX_4V, VEX_L;
+ defm VPACKUSDW : sse4_pack_y<0x2B, "vpackusdw", v16i16, v8i32, X86Packus,
+ bc_v8i32>, VEX_4V, VEX_L;
+}
+
+let Constraints = "$src1 = $dst" in {
+ defm PACKSSWB : sse2_pack<0x63, "packsswb", v16i8, v8i16, X86Packss,
+ bc_v8i16>;
+ defm PACKSSDW : sse2_pack<0x6B, "packssdw", v8i16, v4i32, X86Packss,
+ bc_v4i32>;
+
+ defm PACKUSWB : sse2_pack<0x67, "packuswb", v16i8, v8i16, X86Packus,
+ bc_v8i16>;
+
+ let Predicates = [HasSSE41] in
+ defm PACKUSDW : sse4_pack<0x2B, "packusdw", v8i16, v4i32, X86Packus,
+ bc_v4i32>;
+}
+} // ExeDomain = SSEPackedInt
+
+//===---------------------------------------------------------------------===//
// SSE2 - Packed Integer Unpack Instructions
//===---------------------------------------------------------------------===//
@@ -5239,6 +5355,60 @@ let Constraints = "$src1 = $dst", Predicates = [UseSSE3] in {
f128mem, SSE_ALU_F64P>, PD;
}
+// Patterns used to select 'addsub' instructions.
+let Predicates = [HasAVX] in {
+ // Constant 170 corresponds to the binary mask '10101010'.
+ // When used as a blend mask, it allows selecting eight elements from two
+ // input vectors as follow:
+ // - Even-numbered values in the destination are copied from
+ // the corresponding elements in the first input vector;
+ // - Odd-numbered values in the destination are copied from
+ // the corresponding elements in the second input vector.
+
+ def : Pat<(v8f32 (X86Blendi (v8f32 (fsub VR256:$lhs, VR256:$rhs)),
+ (v8f32 (fadd VR256:$lhs, VR256:$rhs)), (i32 170))),
+ (VADDSUBPSYrr VR256:$lhs, VR256:$rhs)>;
+
+ // Constant 10 corresponds to the binary mask '1010'.
+ // In the two pattens below, constant 10 is used as a blend mask to select
+ // - the 1st and 3rd element from the first input vector (the 'fsub' node);
+ // - the 2nd and 4th element from the second input vector (the 'fadd' node).
+
+ def : Pat<(v4f64 (X86Blendi (v4f64 (fsub VR256:$lhs, VR256:$rhs)),
+ (v4f64 (fadd VR256:$lhs, VR256:$rhs)), (i32 10))),
+ (VADDSUBPDYrr VR256:$lhs, VR256:$rhs)>;
+ def : Pat<(v4f64 (X86Blendi (v4f64 (fsub VR256:$lhs, VR256:$rhs)),
+ (v4f64 (fadd VR256:$lhs, VR256:$rhs)), (i32 10))),
+ (VADDSUBPDYrr VR256:$lhs, VR256:$rhs)>;
+ def : Pat<(v4f32 (X86Blendi (v4f32 (fsub VR128:$lhs, VR128:$rhs)),
+ (v4f32 (fadd VR128:$lhs, VR128:$rhs)), (i32 10))),
+ (VADDSUBPSrr VR128:$lhs, VR128:$rhs)>;
+ def : Pat<(v2f64 (X86Blendi (v2f64 (fsub VR128:$lhs, VR128:$rhs)),
+ (v2f64 (fadd VR128:$lhs, VR128:$rhs)), (i32 2))),
+ (VADDSUBPDrr VR128:$lhs, VR128:$rhs)>;
+ def : Pat<(v2f64 (X86Movsd (v2f64 (fadd VR128:$lhs, VR128:$rhs)),
+ (v2f64 (fsub VR128:$lhs, VR128:$rhs)))),
+ (VADDSUBPDrr VR128:$lhs, VR128:$rhs)>;
+}
+
+let Predicates = [UseSSE3] in {
+ // Constant 10 corresponds to the binary mask '1010'.
+ // In the pattern below, it is used as a blend mask to select:
+ // - the 1st and 3rd element from the first input vector (the fsub node);
+ // - the 2nd and 4th element from the second input vector (the fadd node).
+
+ def : Pat<(v4f32 (X86Blendi (v4f32 (fsub VR128:$lhs, VR128:$rhs)),
+ (v4f32 (fadd VR128:$lhs, VR128:$rhs)), (i32 10))),
+ (ADDSUBPSrr VR128:$lhs, VR128:$rhs)>;
+
+ def : Pat<(v2f64 (X86Blendi (v2f64 (fsub VR128:$lhs, VR128:$rhs)),
+ (v2f64 (fadd VR128:$lhs, VR128:$rhs)), (i32 2))),
+ (ADDSUBPDrr VR128:$lhs, VR128:$rhs)>;
+ def : Pat<(v2f64 (X86Movsd (v2f64 (fadd VR128:$lhs, VR128:$rhs)),
+ (v2f64 (fsub VR128:$lhs, VR128:$rhs)))),
+ (ADDSUBPDrr VR128:$lhs, VR128:$rhs)>;
+}
+
//===---------------------------------------------------------------------===//
// SSE3 Instructions
//===---------------------------------------------------------------------===//
@@ -7053,8 +7223,6 @@ multiclass SS48I_binop_rm2<bits<8> opc, string OpcodeStr, SDNode OpNode,
let Predicates = [HasAVX] in {
let isCommutable = 0 in
- defm VPACKUSDW : SS41I_binop_rm_int<0x2B, "vpackusdw", int_x86_sse41_packusdw,
- 0, DEFAULT_ITINS_SHUFFLESCHED>, VEX_4V;
defm VPMINSB : SS48I_binop_rm<0x38, "vpminsb", X86smin, v16i8, VR128,
loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
VEX_4V;
@@ -7086,9 +7254,6 @@ let Predicates = [HasAVX] in {
let Predicates = [HasAVX2] in {
let isCommutable = 0 in
- defm VPACKUSDW : SS41I_binop_rm_int_y<0x2B, "vpackusdw",
- int_x86_avx2_packusdw, WriteShuffle>,
- VEX_4V, VEX_L;
defm VPMINSBY : SS48I_binop_rm<0x38, "vpminsb", X86smin, v32i8, VR256,
loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
VEX_4V, VEX_L;
@@ -7120,8 +7285,6 @@ let Predicates = [HasAVX2] in {
let Constraints = "$src1 = $dst" in {
let isCommutable = 0 in
- defm PACKUSDW : SS41I_binop_rm_int<0x2B, "packusdw", int_x86_sse41_packusdw,
- 1, DEFAULT_ITINS_SHUFFLESCHED>;
defm PMINSB : SS48I_binop_rm<0x38, "pminsb", X86smin, v16i8, VR128,
memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
defm PMINSD : SS48I_binop_rm<0x39, "pminsd", X86smin, v4i32, VR128,
@@ -7969,6 +8132,16 @@ class avx_broadcast<bits<8> opc, string OpcodeStr, RegisterClass RC,
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
[(set RC:$dst, (Int addr:$src))]>, Sched<[Sched]>, VEX;
+class avx_broadcast_no_int<bits<8> opc, string OpcodeStr, RegisterClass RC,
+ X86MemOperand x86memop, ValueType VT,
+ PatFrag ld_frag, SchedWrite Sched> :
+ AVX8I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set RC:$dst, (VT (X86VBroadcast (ld_frag addr:$src))))]>,
+ Sched<[Sched]>, VEX {
+ let mayLoad = 1;
+}
+
// AVX2 adds register forms
class avx2_broadcast_reg<bits<8> opc, string OpcodeStr, RegisterClass RC,
Intrinsic Int, SchedWrite Sched> :
@@ -7977,16 +8150,15 @@ class avx2_broadcast_reg<bits<8> opc, string OpcodeStr, RegisterClass RC,
[(set RC:$dst, (Int VR128:$src))]>, Sched<[Sched]>, VEX;
let ExeDomain = SSEPackedSingle in {
- def VBROADCASTSSrm : avx_broadcast<0x18, "vbroadcastss", VR128, f32mem,
- int_x86_avx_vbroadcast_ss, WriteLoad>;
- def VBROADCASTSSYrm : avx_broadcast<0x18, "vbroadcastss", VR256, f32mem,
- int_x86_avx_vbroadcast_ss_256,
- WriteFShuffleLd>, VEX_L;
+ def VBROADCASTSSrm : avx_broadcast_no_int<0x18, "vbroadcastss", VR128,
+ f32mem, v4f32, loadf32, WriteLoad>;
+ def VBROADCASTSSYrm : avx_broadcast_no_int<0x18, "vbroadcastss", VR256,
+ f32mem, v8f32, loadf32,
+ WriteFShuffleLd>, VEX_L;
}
let ExeDomain = SSEPackedDouble in
-def VBROADCASTSDYrm : avx_broadcast<0x19, "vbroadcastsd", VR256, f64mem,
- int_x86_avx_vbroadcast_sd_256,
- WriteFShuffleLd>, VEX_L;
+def VBROADCASTSDYrm : avx_broadcast_no_int<0x19, "vbroadcastsd", VR256, f64mem,
+ v4f64, loadf64, WriteFShuffleLd>, VEX_L;
def VBROADCASTF128 : avx_broadcast<0x1A, "vbroadcastf128", VR256, f128mem,
int_x86_avx_vbroadcastf128_pd_256,
WriteFShuffleLd>, VEX_L;
@@ -8366,6 +8538,21 @@ let Predicates = [HasF16C] in {
(VCVTPH2PSrm addr:$src)>;
}
+// Patterns for matching conversions from float to half-float and vice versa.
+let Predicates = [HasF16C] in {
+ def : Pat<(f32_to_f16 FR32:$src),
+ (i16 (EXTRACT_SUBREG (VMOVPDI2DIrr (VCVTPS2PHrr
+ (COPY_TO_REGCLASS FR32:$src, VR128), 0)), sub_16bit))>;
+
+ def : Pat<(f16_to_f32 GR16:$src),
+ (f32 (COPY_TO_REGCLASS (VCVTPH2PSrr
+ (COPY_TO_REGCLASS (MOVSX32rr16 GR16:$src), VR128)), FR32)) >;
+
+ def : Pat<(f16_to_f32 (i16 (f32_to_f16 FR32:$src))),
+ (f32 (COPY_TO_REGCLASS (VCVTPH2PSrr
+ (VCVTPS2PHrr (COPY_TO_REGCLASS FR32:$src, VR128), 0)), FR32)) >;
+}
+
//===----------------------------------------------------------------------===//
// AVX2 Instructions
//===----------------------------------------------------------------------===//
@@ -8543,13 +8730,6 @@ def : Pat<(v4i32 (X86VBroadcast (loadi32 addr:$src))),
}
let Predicates = [HasAVX] in {
-def : Pat<(v8f32 (X86VBroadcast (loadf32 addr:$src))),
- (VBROADCASTSSYrm addr:$src)>;
-def : Pat<(v4f64 (X86VBroadcast (loadf64 addr:$src))),
- (VBROADCASTSDYrm addr:$src)>;
-def : Pat<(v4f32 (X86VBroadcast (loadf32 addr:$src))),
- (VBROADCASTSSrm addr:$src)>;
-
// Provide fallback in case the load node that is used in the patterns above
// is used by additional users, which prevents the pattern selection.
let AddedComplexity = 20 in {
diff --git a/lib/Target/X86/X86InstrSystem.td b/lib/Target/X86/X86InstrSystem.td
index b5595cb..5402780 100644
--- a/lib/Target/X86/X86InstrSystem.td
+++ b/lib/Target/X86/X86InstrSystem.td
@@ -439,7 +439,10 @@ def LLDT16m : I<0x00, MRM2m, (outs), (ins i16mem:$src),
let SchedRW = [WriteSystem] in {
def WRMSR : I<0x30, RawFrm, (outs), (ins), "wrmsr", [], IIC_WRMSR>, TB;
def RDMSR : I<0x32, RawFrm, (outs), (ins), "rdmsr", [], IIC_RDMSR>, TB;
-def RDPMC : I<0x33, RawFrm, (outs), (ins), "rdpmc", [], IIC_RDPMC>, TB;
+
+let Defs = [RAX, RDX], Uses = [ECX] in
+ def RDPMC : I<0x33, RawFrm, (outs), (ins), "rdpmc", [(X86rdpmc)], IIC_RDPMC>,
+ TB;
def SMSW16r : I<0x01, MRM4r, (outs GR16:$dst), (ins),
"smsw{w}\t$dst", [], IIC_SMSW>, OpSize16, TB;
diff --git a/lib/Target/X86/X86JITInfo.cpp b/lib/Target/X86/X86JITInfo.cpp
index e969ef2..a082c4f 100644
--- a/lib/Target/X86/X86JITInfo.cpp
+++ b/lib/Target/X86/X86JITInfo.cpp
@@ -432,7 +432,7 @@ X86JITInfo::getLazyResolverFunction(JITCompilerFn F) {
// SSE Callback should be called for SSE-enabled LLVM.
return X86CompilationCallback_SSE;
#else
- if (Subtarget->hasSSE1())
+ if (useSSE)
return X86CompilationCallback_SSE;
#endif
#endif
@@ -440,8 +440,8 @@ X86JITInfo::getLazyResolverFunction(JITCompilerFn F) {
return X86CompilationCallback;
}
-X86JITInfo::X86JITInfo(X86TargetMachine &tm) : TM(tm) {
- Subtarget = &TM.getSubtarget<X86Subtarget>();
+X86JITInfo::X86JITInfo(bool UseSSE) {
+ useSSE = UseSSE;
useGOT = 0;
TLSOffset = nullptr;
}
diff --git a/lib/Target/X86/X86JITInfo.h b/lib/Target/X86/X86JITInfo.h
index 4d279de..564343f 100644
--- a/lib/Target/X86/X86JITInfo.h
+++ b/lib/Target/X86/X86JITInfo.h
@@ -19,16 +19,14 @@
#include "llvm/Target/TargetJITInfo.h"
namespace llvm {
- class X86TargetMachine;
class X86Subtarget;
class X86JITInfo : public TargetJITInfo {
- X86TargetMachine &TM;
- const X86Subtarget *Subtarget;
uintptr_t PICBase;
- char* TLSOffset;
+ char *TLSOffset;
+ bool useSSE;
public:
- explicit X86JITInfo(X86TargetMachine &tm);
+ explicit X86JITInfo(bool UseSSE);
/// replaceMachineCodeForFunction - Make it so that calling the function
/// whose machine code is at OLD turns into a call to NEW, perhaps by
diff --git a/lib/Target/X86/X86MCInstLower.cpp b/lib/Target/X86/X86MCInstLower.cpp
index 0190080..2bd70a9 100644
--- a/lib/Target/X86/X86MCInstLower.cpp
+++ b/lib/Target/X86/X86MCInstLower.cpp
@@ -13,6 +13,7 @@
//===----------------------------------------------------------------------===//
#include "X86AsmPrinter.h"
+#include "X86RegisterInfo.h"
#include "InstPrinter/X86ATTInstPrinter.h"
#include "MCTargetDesc/X86BaseInfo.h"
#include "llvm/ADT/SmallString.h"
@@ -779,6 +780,9 @@ static void LowerPATCHPOINT(MCStreamer &OS, StackMaps &SM,
void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
X86MCInstLower MCInstLowering(*MF, *this);
+ const X86RegisterInfo *RI =
+ static_cast<const X86RegisterInfo *>(TM.getRegisterInfo());
+
switch (MI->getOpcode()) {
case TargetOpcode::DBG_VALUE:
llvm_unreachable("Should be handled target independently");
@@ -883,6 +887,37 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
.addReg(X86::R10)
.addReg(X86::RAX));
return;
+
+ case X86::SEH_PushReg:
+ OutStreamer.EmitWinCFIPushReg(RI->getSEHRegNum(MI->getOperand(0).getImm()));
+ return;
+
+ case X86::SEH_SaveReg:
+ OutStreamer.EmitWinCFISaveReg(RI->getSEHRegNum(MI->getOperand(0).getImm()),
+ MI->getOperand(1).getImm());
+ return;
+
+ case X86::SEH_SaveXMM:
+ OutStreamer.EmitWinCFISaveXMM(RI->getSEHRegNum(MI->getOperand(0).getImm()),
+ MI->getOperand(1).getImm());
+ return;
+
+ case X86::SEH_StackAlloc:
+ OutStreamer.EmitWinCFIAllocStack(MI->getOperand(0).getImm());
+ return;
+
+ case X86::SEH_SetFrame:
+ OutStreamer.EmitWinCFISetFrame(RI->getSEHRegNum(MI->getOperand(0).getImm()),
+ MI->getOperand(1).getImm());
+ return;
+
+ case X86::SEH_PushFrame:
+ OutStreamer.EmitWinCFIPushFrame(MI->getOperand(0).getImm());
+ return;
+
+ case X86::SEH_EndPrologue:
+ OutStreamer.EmitWinCFIEndProlog();
+ return;
}
MCInst TmpInst;
diff --git a/lib/Target/X86/X86RegisterInfo.cpp b/lib/Target/X86/X86RegisterInfo.cpp
index a83e1e4..e8a7e84 100644
--- a/lib/Target/X86/X86RegisterInfo.cpp
+++ b/lib/Target/X86/X86RegisterInfo.cpp
@@ -53,20 +53,18 @@ static cl::opt<bool>
EnableBasePointer("x86-use-base-pointer", cl::Hidden, cl::init(true),
cl::desc("Enable use of a base pointer for complex stack frames"));
-X86RegisterInfo::X86RegisterInfo(X86TargetMachine &tm)
- : X86GenRegisterInfo((tm.getSubtarget<X86Subtarget>().is64Bit()
- ? X86::RIP : X86::EIP),
- X86_MC::getDwarfRegFlavour(tm.getTargetTriple(), false),
- X86_MC::getDwarfRegFlavour(tm.getTargetTriple(), true),
- (tm.getSubtarget<X86Subtarget>().is64Bit()
- ? X86::RIP : X86::EIP)),
- TM(tm) {
+X86RegisterInfo::X86RegisterInfo(const X86Subtarget &STI)
+ : X86GenRegisterInfo(
+ (STI.is64Bit() ? X86::RIP : X86::EIP),
+ X86_MC::getDwarfRegFlavour(STI.getTargetTriple(), false),
+ X86_MC::getDwarfRegFlavour(STI.getTargetTriple(), true),
+ (STI.is64Bit() ? X86::RIP : X86::EIP)),
+ Subtarget(STI) {
X86_MC::InitLLVM2SEHRegisterMapping(this);
// Cache some information.
- const X86Subtarget *Subtarget = &TM.getSubtarget<X86Subtarget>();
- Is64Bit = Subtarget->is64Bit();
- IsWin64 = Subtarget->isTargetWin64();
+ Is64Bit = Subtarget.is64Bit();
+ IsWin64 = Subtarget.isTargetWin64();
if (Is64Bit) {
SlotSize = 8;
@@ -83,21 +81,6 @@ X86RegisterInfo::X86RegisterInfo(X86TargetMachine &tm)
BasePtr = Is64Bit ? X86::RBX : X86::ESI;
}
-/// getCompactUnwindRegNum - This function maps the register to the number for
-/// compact unwind encoding. Return -1 if the register isn't valid.
-int X86RegisterInfo::getCompactUnwindRegNum(unsigned RegNum, bool isEH) const {
- switch (getLLVMRegNum(RegNum, isEH)) {
- case X86::EBX: case X86::RBX: return 1;
- case X86::ECX: case X86::R12: return 2;
- case X86::EDX: case X86::R13: return 3;
- case X86::EDI: case X86::R14: return 4;
- case X86::ESI: case X86::R15: return 5;
- case X86::EBP: case X86::RBP: return 6;
- }
-
- return -1;
-}
-
bool
X86RegisterInfo::trackLivenessAfterRegAlloc(const MachineFunction &MF) const {
// ExeDepsFixer and PostRAScheduler require liveness.
@@ -173,9 +156,8 @@ X86RegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC) const{
}
const TargetRegisterClass *
-X86RegisterInfo::getPointerRegClass(const MachineFunction &MF, unsigned Kind)
- const {
- const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>();
+X86RegisterInfo::getPointerRegClass(const MachineFunction &MF,
+ unsigned Kind) const {
switch (Kind) {
default: llvm_unreachable("Unexpected Kind in getPointerRegClass!");
case 0: // Normal GPRs.
@@ -225,7 +207,7 @@ X86RegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
case X86::GR64RegClassID:
return 12 - FPDiff;
case X86::VR128RegClassID:
- return TM.getSubtarget<X86Subtarget>().is64Bit() ? 10 : 4;
+ return Subtarget.is64Bit() ? 10 : 4;
case X86::VR64RegClassID:
return 4;
}
@@ -233,8 +215,8 @@ X86RegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
const MCPhysReg *
X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
- bool HasAVX = TM.getSubtarget<X86Subtarget>().hasAVX();
- bool HasAVX512 = TM.getSubtarget<X86Subtarget>().hasAVX512();
+ bool HasAVX = Subtarget.hasAVX();
+ bool HasAVX512 = Subtarget.hasAVX512();
assert(MF && "MachineFunction required");
switch (MF->getFunction()->getCallingConv()) {
@@ -287,8 +269,8 @@ X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
const uint32_t*
X86RegisterInfo::getCallPreservedMask(CallingConv::ID CC) const {
- bool HasAVX = TM.getSubtarget<X86Subtarget>().hasAVX();
- bool HasAVX512 = TM.getSubtarget<X86Subtarget>().hasAVX512();
+ bool HasAVX = Subtarget.hasAVX();
+ bool HasAVX512 = Subtarget.hasAVX512();
switch (CC) {
case CallingConv::GHC:
@@ -406,7 +388,7 @@ BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
Reserved.set(*AI);
}
}
- if (!Is64Bit || !TM.getSubtarget<X86Subtarget>().hasAVX512()) {
+ if (!Is64Bit || !Subtarget.hasAVX512()) {
for (unsigned n = 16; n != 32; ++n) {
for (MCRegAliasIterator AI(X86::XMM0 + n, this, true); AI.isValid(); ++AI)
Reserved.set(*AI);
@@ -459,7 +441,7 @@ bool X86RegisterInfo::canRealignStack(const MachineFunction &MF) const {
bool X86RegisterInfo::needsStackRealignment(const MachineFunction &MF) const {
const MachineFrameInfo *MFI = MF.getFrameInfo();
const Function *F = MF.getFunction();
- unsigned StackAlign = TM.getFrameLowering()->getStackAlignment();
+ unsigned StackAlign = MF.getTarget().getFrameLowering()->getStackAlignment();
bool requiresRealignment =
((MFI->getMaxAlignment() > StackAlign) ||
F->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
diff --git a/lib/Target/X86/X86RegisterInfo.h b/lib/Target/X86/X86RegisterInfo.h
index 2289d91..74efd1f 100644
--- a/lib/Target/X86/X86RegisterInfo.h
+++ b/lib/Target/X86/X86RegisterInfo.h
@@ -22,11 +22,11 @@
namespace llvm {
class Type;
class TargetInstrInfo;
- class X86TargetMachine;
+ class X86Subtarget;
class X86RegisterInfo final : public X86GenRegisterInfo {
public:
- X86TargetMachine &TM;
+ const X86Subtarget &Subtarget;
private:
/// Is64Bit - Is the target 64-bits.
@@ -55,15 +55,11 @@ private:
unsigned BasePtr;
public:
- X86RegisterInfo(X86TargetMachine &tm);
+ X86RegisterInfo(const X86Subtarget &STI);
// FIXME: This should be tablegen'd like getDwarfRegNum is
int getSEHRegNum(unsigned i) const;
- /// getCompactUnwindRegNum - This function maps the register to the number for
- /// compact unwind encoding. Return -1 if the register isn't valid.
- int getCompactUnwindRegNum(unsigned RegNum, bool isEH) const override;
-
/// Code Generation virtual methods...
///
bool trackLivenessAfterRegAlloc(const MachineFunction &MF) const override;
diff --git a/lib/Target/X86/X86SelectionDAGInfo.cpp b/lib/Target/X86/X86SelectionDAGInfo.cpp
index 744890d..a83dd9b 100644
--- a/lib/Target/X86/X86SelectionDAGInfo.cpp
+++ b/lib/Target/X86/X86SelectionDAGInfo.cpp
@@ -11,21 +11,23 @@
//
//===----------------------------------------------------------------------===//
-#include "X86TargetMachine.h"
+#include "X86InstrInfo.h"
+#include "X86ISelLowering.h"
+#include "X86RegisterInfo.h"
+#include "X86Subtarget.h"
+#include "X86SelectionDAGInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/IR/DerivedTypes.h"
+#include "llvm/Target/TargetLowering.h"
+
using namespace llvm;
#define DEBUG_TYPE "x86-selectiondag-info"
-X86SelectionDAGInfo::X86SelectionDAGInfo(const X86TargetMachine &TM) :
- TargetSelectionDAGInfo(TM),
- Subtarget(&TM.getSubtarget<X86Subtarget>()),
- TLI(*TM.getTargetLowering()) {
-}
+X86SelectionDAGInfo::X86SelectionDAGInfo(const DataLayout &DL)
+ : TargetSelectionDAGInfo(&DL) {}
-X86SelectionDAGInfo::~X86SelectionDAGInfo() {
-}
+X86SelectionDAGInfo::~X86SelectionDAGInfo() {}
SDValue
X86SelectionDAGInfo::EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl,
@@ -35,6 +37,7 @@ X86SelectionDAGInfo::EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl,
bool isVolatile,
MachinePointerInfo DstPtrInfo) const {
ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
+ const X86Subtarget &Subtarget = DAG.getTarget().getSubtarget<X86Subtarget>();
// If to a segment-relative address space, use the default lowering.
if (DstPtrInfo.getAddrSpace() >= 256)
@@ -43,16 +46,14 @@ X86SelectionDAGInfo::EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl,
// If not DWORD aligned or size is more than the threshold, call the library.
// The libc version is likely to be faster for these cases. It can use the
// address value and run time information about the CPU.
- if ((Align & 3) != 0 ||
- !ConstantSize ||
- ConstantSize->getZExtValue() >
- Subtarget->getMaxInlineSizeThreshold()) {
+ if ((Align & 3) != 0 || !ConstantSize ||
+ ConstantSize->getZExtValue() > Subtarget.getMaxInlineSizeThreshold()) {
// Check to see if there is a specialized entry-point for memory zeroing.
ConstantSDNode *V = dyn_cast<ConstantSDNode>(Src);
if (const char *bzeroEntry = V &&
- V->isNullValue() ? Subtarget->getBZeroEntry() : nullptr) {
- EVT IntPtr = TLI.getPointerTy();
+ V->isNullValue() ? Subtarget.getBZeroEntry() : nullptr) {
+ EVT IntPtr = DAG.getTargetLoweringInfo().getPointerTy();
Type *IntPtrTy = getDataLayout()->getIntPtrType(*DAG.getContext());
TargetLowering::ArgListTy Args;
TargetLowering::ArgListEntry Entry;
@@ -65,10 +66,11 @@ X86SelectionDAGInfo::EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl,
TargetLowering::CallLoweringInfo CLI(DAG);
CLI.setDebugLoc(dl).setChain(Chain)
.setCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()),
- DAG.getExternalSymbol(bzeroEntry, IntPtr), &Args, 0)
+ DAG.getExternalSymbol(bzeroEntry, IntPtr), std::move(Args),
+ 0)
.setDiscardResult();
- std::pair<SDValue,SDValue> CallResult = TLI.LowerCallTo(CLI);
+ std::pair<SDValue,SDValue> CallResult = DAG.getTargetLoweringInfo().LowerCallTo(CLI);
return CallResult.second;
}
@@ -99,7 +101,7 @@ X86SelectionDAGInfo::EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl,
ValReg = X86::EAX;
Val = (Val << 8) | Val;
Val = (Val << 16) | Val;
- if (Subtarget->is64Bit() && ((Align & 0x7) == 0)) { // QWORD aligned
+ if (Subtarget.is64Bit() && ((Align & 0x7) == 0)) { // QWORD aligned
AVT = MVT::i64;
ValReg = X86::RAX;
Val = (Val << 32) | Val;
@@ -128,13 +130,11 @@ X86SelectionDAGInfo::EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl,
InFlag = Chain.getValue(1);
}
- Chain = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RCX :
- X86::ECX,
- Count, InFlag);
+ Chain = DAG.getCopyToReg(Chain, dl, Subtarget.is64Bit() ? X86::RCX : X86::ECX,
+ Count, InFlag);
InFlag = Chain.getValue(1);
- Chain = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RDI :
- X86::EDI,
- Dst, InFlag);
+ Chain = DAG.getCopyToReg(Chain, dl, Subtarget.is64Bit() ? X86::RDI : X86::EDI,
+ Dst, InFlag);
InFlag = Chain.getValue(1);
SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
@@ -182,10 +182,11 @@ X86SelectionDAGInfo::EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl,
// This requires the copy size to be a constant, preferably
// within a subtarget-specific limit.
ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
+ const X86Subtarget &Subtarget = DAG.getTarget().getSubtarget<X86Subtarget>();
if (!ConstantSize)
return SDValue();
uint64_t SizeVal = ConstantSize->getZExtValue();
- if (!AlwaysInline && SizeVal > Subtarget->getMaxInlineSizeThreshold())
+ if (!AlwaysInline && SizeVal > Subtarget.getMaxInlineSizeThreshold())
return SDValue();
/// If not DWORD aligned, it is more efficient to call the library. However
@@ -218,7 +219,7 @@ X86SelectionDAGInfo::EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl,
AVT = MVT::i32;
else
// QWORD aligned
- AVT = Subtarget->is64Bit() ? MVT::i64 : MVT::i32;
+ AVT = Subtarget.is64Bit() ? MVT::i64 : MVT::i32;
unsigned UBytes = AVT.getSizeInBits() / 8;
unsigned CountVal = SizeVal / UBytes;
@@ -226,15 +227,15 @@ X86SelectionDAGInfo::EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl,
unsigned BytesLeft = SizeVal % UBytes;
SDValue InFlag;
- Chain = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RCX :
+ Chain = DAG.getCopyToReg(Chain, dl, Subtarget.is64Bit() ? X86::RCX :
X86::ECX,
Count, InFlag);
InFlag = Chain.getValue(1);
- Chain = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RDI :
+ Chain = DAG.getCopyToReg(Chain, dl, Subtarget.is64Bit() ? X86::RDI :
X86::EDI,
Dst, InFlag);
InFlag = Chain.getValue(1);
- Chain = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RSI :
+ Chain = DAG.getCopyToReg(Chain, dl, Subtarget.is64Bit() ? X86::RSI :
X86::ESI,
Src, InFlag);
InFlag = Chain.getValue(1);
diff --git a/lib/Target/X86/X86SelectionDAGInfo.h b/lib/Target/X86/X86SelectionDAGInfo.h
index 0d5dc38..c12555a 100644
--- a/lib/Target/X86/X86SelectionDAGInfo.h
+++ b/lib/Target/X86/X86SelectionDAGInfo.h
@@ -23,14 +23,8 @@ class X86TargetMachine;
class X86Subtarget;
class X86SelectionDAGInfo : public TargetSelectionDAGInfo {
- /// Subtarget - Keep a pointer to the X86Subtarget around so that we can
- /// make the right decision when generating code for different targets.
- const X86Subtarget *Subtarget;
-
- const X86TargetLowering &TLI;
-
public:
- explicit X86SelectionDAGInfo(const X86TargetMachine &TM);
+ explicit X86SelectionDAGInfo(const DataLayout &DL);
~X86SelectionDAGInfo();
SDValue EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl,
diff --git a/lib/Target/X86/X86Subtarget.cpp b/lib/Target/X86/X86Subtarget.cpp
index 989e0d6..79b7e68 100644
--- a/lib/Target/X86/X86Subtarget.cpp
+++ b/lib/Target/X86/X86Subtarget.cpp
@@ -291,13 +291,60 @@ void X86Subtarget::initializeEnvironment() {
CallRegIndirect = false;
LEAUsesAG = false;
SlowLEA = false;
+ SlowIncDec = false;
stackAlignment = 4;
// FIXME: this is a known good value for Yonah. How about others?
MaxInlineSizeThreshold = 128;
}
+static std::string computeDataLayout(const X86Subtarget &ST) {
+ // X86 is little endian
+ std::string Ret = "e";
+
+ Ret += DataLayout::getManglingComponent(ST.getTargetTriple());
+ // X86 and x32 have 32 bit pointers.
+ if (ST.isTarget64BitILP32() || !ST.is64Bit())
+ Ret += "-p:32:32";
+
+ // Some ABIs align 64 bit integers and doubles to 64 bits, others to 32.
+ if (ST.is64Bit() || ST.isOSWindows() || ST.isTargetNaCl())
+ Ret += "-i64:64";
+ else
+ Ret += "-f64:32:64";
+
+ // Some ABIs align long double to 128 bits, others to 32.
+ if (ST.isTargetNaCl())
+ ; // No f80
+ else if (ST.is64Bit() || ST.isTargetDarwin())
+ Ret += "-f80:128";
+ else
+ Ret += "-f80:32";
+
+ // The registers can hold 8, 16, 32 or, in x86-64, 64 bits.
+ if (ST.is64Bit())
+ Ret += "-n8:16:32:64";
+ else
+ Ret += "-n8:16:32";
+
+ // The stack is aligned to 32 bits on some ABIs and 128 bits on others.
+ if (!ST.is64Bit() && ST.isOSWindows())
+ Ret += "-S32";
+ else
+ Ret += "-S128";
+
+ return Ret;
+}
+
+X86Subtarget &X86Subtarget::initializeSubtargetDependencies(StringRef CPU,
+ StringRef FS) {
+ initializeEnvironment();
+ resetSubtargetFeatures(CPU, FS);
+ return *this;
+}
+
X86Subtarget::X86Subtarget(const std::string &TT, const std::string &CPU,
- const std::string &FS, unsigned StackAlignOverride)
+ const std::string &FS, X86TargetMachine &TM,
+ unsigned StackAlignOverride)
: X86GenSubtargetInfo(TT, CPU, FS), X86ProcFamily(Others),
PICStyle(PICStyles::None), TargetTriple(TT),
StackAlignOverride(StackAlignOverride),
@@ -305,10 +352,12 @@ X86Subtarget::X86Subtarget(const std::string &TT, const std::string &CPU,
In32BitMode(TargetTriple.getArch() == Triple::x86 &&
TargetTriple.getEnvironment() != Triple::CODE16),
In16BitMode(TargetTriple.getArch() == Triple::x86 &&
- TargetTriple.getEnvironment() == Triple::CODE16) {
- initializeEnvironment();
- resetSubtargetFeatures(CPU, FS);
-}
+ TargetTriple.getEnvironment() == Triple::CODE16),
+ DL(computeDataLayout(*this)), TSInfo(DL),
+ InstrInfo(initializeSubtargetDependencies(CPU, FS)), TLInfo(TM),
+ FrameLowering(TargetFrameLowering::StackGrowsDown, getStackAlignment(),
+ is64Bit() ? -8 : -4),
+ JITInfo(hasSSE1()) {}
bool
X86Subtarget::enablePostRAScheduler(CodeGenOpt::Level OptLevel,
diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h
index 703559a..09db0eb 100644
--- a/lib/Target/X86/X86Subtarget.h
+++ b/lib/Target/X86/X86Subtarget.h
@@ -14,6 +14,11 @@
#ifndef X86SUBTARGET_H
#define X86SUBTARGET_H
+#include "X86FrameLowering.h"
+#include "X86ISelLowering.h"
+#include "X86InstrInfo.h"
+#include "X86JITInfo.h"
+#include "X86SelectionDAGInfo.h"
#include "llvm/ADT/Triple.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/Target/TargetSubtargetInfo.h"
@@ -40,6 +45,7 @@ enum Style {
}
class X86Subtarget final : public X86GenSubtargetInfo {
+
protected:
enum X86SSEEnum {
NoMMXSSE, MMX, SSE1, SSE2, SSE3, SSSE3, SSE41, SSE42, AVX, AVX2, AVX512F
@@ -181,6 +187,9 @@ protected:
/// SlowLEA - True if the LEA instruction with certain arguments is slow
bool SlowLEA;
+ /// SlowIncDec - True if INC and DEC instructions are slow when writing to flags
+ bool SlowIncDec;
+
/// Processor has AVX-512 PreFetch Instructions
bool HasPFI;
@@ -217,14 +226,31 @@ private:
/// In16BitMode - True if compiling for 16-bit, false for 32-bit or 64-bit.
bool In16BitMode;
+ // Calculates type size & alignment
+ const DataLayout DL;
+ X86SelectionDAGInfo TSInfo;
+ // Ordering here is important. X86InstrInfo initializes X86RegisterInfo which
+ // X86TargetLowering needs.
+ X86InstrInfo InstrInfo;
+ X86TargetLowering TLInfo;
+ X86FrameLowering FrameLowering;
+ X86JITInfo JITInfo;
+
public:
/// This constructor initializes the data members to match that
/// of the specified triple.
///
X86Subtarget(const std::string &TT, const std::string &CPU,
- const std::string &FS,
+ const std::string &FS, X86TargetMachine &TM,
unsigned StackAlignOverride);
+ const X86TargetLowering *getTargetLowering() const { return &TLInfo; }
+ const X86InstrInfo *getInstrInfo() const { return &InstrInfo; }
+ const DataLayout *getDataLayout() const { return &DL; }
+ const X86FrameLowering *getFrameLowering() const { return &FrameLowering; }
+ const X86SelectionDAGInfo *getSelectionDAGInfo() const { return &TSInfo; }
+ X86JITInfo *getJITInfo() { return &JITInfo; }
+
/// getStackAlignment - Returns the minimum alignment known to hold of the
/// stack frame on entry to the function and which must be maintained by every
/// function for this subtarget.
@@ -241,6 +267,9 @@ public:
/// \brief Reset the features for the X86 target.
void resetSubtargetFeatures(const MachineFunction *MF) override;
private:
+ /// \brief Initialize the full set of dependencies so we can use an initializer
+ /// list for X86Subtarget.
+ X86Subtarget &initializeSubtargetDependencies(StringRef CPU, StringRef FS);
void initializeEnvironment();
void resetSubtargetFeatures(StringRef CPU, StringRef FS);
public:
@@ -319,6 +348,7 @@ public:
bool callRegIndirect() const { return CallRegIndirect; }
bool LEAusesAG() const { return LEAUsesAG; }
bool slowLEA() const { return SlowLEA; }
+ bool slowIncDec() const { return SlowIncDec; }
bool hasCDI() const { return HasCDI; }
bool hasPFI() const { return HasPFI; }
bool hasERI() const { return HasERI; }
diff --git a/lib/Target/X86/X86TargetMachine.cpp b/lib/Target/X86/X86TargetMachine.cpp
index 93760ef..f12140f 100644
--- a/lib/Target/X86/X86TargetMachine.cpp
+++ b/lib/Target/X86/X86TargetMachine.cpp
@@ -29,61 +29,14 @@ extern "C" void LLVMInitializeX86Target() {
void X86TargetMachine::anchor() { }
-static std::string computeDataLayout(const X86Subtarget &ST) {
- // X86 is little endian
- std::string Ret = "e";
-
- Ret += DataLayout::getManglingComponent(ST.getTargetTriple());
- // X86 and x32 have 32 bit pointers.
- if (ST.isTarget64BitILP32() || !ST.is64Bit())
- Ret += "-p:32:32";
-
- // Some ABIs align 64 bit integers and doubles to 64 bits, others to 32.
- if (ST.is64Bit() || ST.isTargetCygMing() || ST.isTargetKnownWindowsMSVC() ||
- ST.isTargetNaCl())
- Ret += "-i64:64";
- else
- Ret += "-f64:32:64";
-
- // Some ABIs align long double to 128 bits, others to 32.
- if (ST.isTargetNaCl())
- ; // No f80
- else if (ST.is64Bit() || ST.isTargetDarwin())
- Ret += "-f80:128";
- else
- Ret += "-f80:32";
-
- // The registers can hold 8, 16, 32 or, in x86-64, 64 bits.
- if (ST.is64Bit())
- Ret += "-n8:16:32:64";
- else
- Ret += "-n8:16:32";
-
- // The stack is aligned to 32 bits on some ABIs and 128 bits on others.
- if (!ST.is64Bit() && (ST.isTargetCygMing() || ST.isTargetKnownWindowsMSVC()))
- Ret += "-S32";
- else
- Ret += "-S128";
-
- return Ret;
-}
-
/// X86TargetMachine ctor - Create an X86 target.
///
-X86TargetMachine::X86TargetMachine(const Target &T, StringRef TT,
- StringRef CPU, StringRef FS,
- const TargetOptions &Options,
+X86TargetMachine::X86TargetMachine(const Target &T, StringRef TT, StringRef CPU,
+ StringRef FS, const TargetOptions &Options,
Reloc::Model RM, CodeModel::Model CM,
CodeGenOpt::Level OL)
- : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL),
- Subtarget(TT, CPU, FS, Options.StackAlignmentOverride),
- FrameLowering(*this, Subtarget),
- InstrItins(Subtarget.getInstrItineraryData()),
- DL(computeDataLayout(*getSubtargetImpl())),
- InstrInfo(*this),
- TLInfo(*this),
- TSInfo(*this),
- JITInfo(*this) {
+ : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL),
+ Subtarget(TT, CPU, FS, *this, Options.StackAlignmentOverride) {
// Determine the PICStyle based on the target selected.
if (getRelocationModel() == Reloc::Static) {
// Unless we're in PIC or DynamicNoPIC mode, set the PIC style to None.
@@ -158,6 +111,7 @@ public:
return *getX86TargetMachine().getSubtargetImpl();
}
+ void addIRPasses() override;
bool addInstSelector() override;
bool addILPOpts() override;
bool addPreRegAlloc() override;
@@ -170,6 +124,12 @@ TargetPassConfig *X86TargetMachine::createPassConfig(PassManagerBase &PM) {
return new X86PassConfig(this, PM);
}
+void X86PassConfig::addIRPasses() {
+ addPass(createX86AtomicExpandPass(&getX86TargetMachine()));
+
+ TargetPassConfig::addIRPasses();
+}
+
bool X86PassConfig::addInstSelector() {
// Install an instruction selector.
addPass(createX86ISelDag(getX86TargetMachine(), getOptLevel()));
diff --git a/lib/Target/X86/X86TargetMachine.h b/lib/Target/X86/X86TargetMachine.h
index 57e6eda..41d5157 100644
--- a/lib/Target/X86/X86TargetMachine.h
+++ b/lib/Target/X86/X86TargetMachine.h
@@ -13,12 +13,7 @@
#ifndef X86TARGETMACHINE_H
#define X86TARGETMACHINE_H
-
-#include "X86FrameLowering.h"
-#include "X86ISelLowering.h"
#include "X86InstrInfo.h"
-#include "X86JITInfo.h"
-#include "X86SelectionDAGInfo.h"
#include "X86Subtarget.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/Target/TargetMachine.h"
@@ -30,13 +25,6 @@ class StringRef;
class X86TargetMachine final : public LLVMTargetMachine {
virtual void anchor();
X86Subtarget Subtarget;
- X86FrameLowering FrameLowering;
- InstrItineraryData InstrItins;
- const DataLayout DL; // Calculates type size & alignment
- X86InstrInfo InstrInfo;
- X86TargetLowering TLInfo;
- X86SelectionDAGInfo TSInfo;
- X86JITInfo JITInfo;
public:
X86TargetMachine(const Target &T, StringRef TT,
@@ -44,28 +32,28 @@ public:
Reloc::Model RM, CodeModel::Model CM,
CodeGenOpt::Level OL);
- const DataLayout *getDataLayout() const override { return &DL; }
+ const DataLayout *getDataLayout() const override {
+ return getSubtargetImpl()->getDataLayout();
+ }
const X86InstrInfo *getInstrInfo() const override {
- return &InstrInfo;
+ return getSubtargetImpl()->getInstrInfo();
}
const TargetFrameLowering *getFrameLowering() const override {
- return &FrameLowering;
- }
- X86JITInfo *getJITInfo() override {
- return &JITInfo;
+ return getSubtargetImpl()->getFrameLowering();
}
+ X86JITInfo *getJITInfo() override { return Subtarget.getJITInfo(); }
const X86Subtarget *getSubtargetImpl() const override { return &Subtarget; }
const X86TargetLowering *getTargetLowering() const override {
- return &TLInfo;
+ return getSubtargetImpl()->getTargetLowering();
}
const X86SelectionDAGInfo *getSelectionDAGInfo() const override {
- return &TSInfo;
+ return getSubtargetImpl()->getSelectionDAGInfo();
}
const X86RegisterInfo *getRegisterInfo() const override {
return &getInstrInfo()->getRegisterInfo();
}
const InstrItineraryData *getInstrItineraryData() const override {
- return &InstrItins;
+ return &getSubtargetImpl()->getInstrItineraryData();
}
/// \brief Register X86 analysis passes with a pass manager.
diff --git a/lib/Target/X86/X86TargetTransformInfo.cpp b/lib/Target/X86/X86TargetTransformInfo.cpp
index 91b9d40..c961e2f 100644
--- a/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -102,6 +102,8 @@ public:
unsigned getReductionCost(unsigned Opcode, Type *Ty,
bool IsPairwiseForm) const override;
+ unsigned getIntImmCost(int64_t) const;
+
unsigned getIntImmCost(const APInt &Imm, Type *Ty) const override;
unsigned getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
@@ -142,13 +144,17 @@ unsigned X86TTI::getNumberOfRegisters(bool Vector) const {
if (Vector && !ST->hasSSE1())
return 0;
- if (ST->is64Bit())
+ if (ST->is64Bit()) {
+ if (Vector && ST->hasAVX512())
+ return 32;
return 16;
+ }
return 8;
}
unsigned X86TTI::getRegisterBitWidth(bool Vector) const {
if (Vector) {
+ if (ST->hasAVX512()) return 512;
if (ST->hasAVX()) return 256;
if (ST->hasSSE1()) return 128;
return 0;
@@ -400,17 +406,117 @@ unsigned X86TTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
unsigned X86TTI::getShuffleCost(ShuffleKind Kind, Type *Tp, int Index,
Type *SubTp) const {
- // We only estimate the cost of reverse shuffles.
- if (Kind != SK_Reverse)
+ // We only estimate the cost of reverse and alternate shuffles.
+ if (Kind != SK_Reverse && Kind != SK_Alternate)
return TargetTransformInfo::getShuffleCost(Kind, Tp, Index, SubTp);
- std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Tp);
- unsigned Cost = 1;
- if (LT.second.getSizeInBits() > 128)
- Cost = 3; // Extract + insert + copy.
+ if (Kind == SK_Reverse) {
+ std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Tp);
+ unsigned Cost = 1;
+ if (LT.second.getSizeInBits() > 128)
+ Cost = 3; // Extract + insert + copy.
+
+ // Multiple by the number of parts.
+ return Cost * LT.first;
+ }
+
+ if (Kind == SK_Alternate) {
+ // 64-bit packed float vectors (v2f32) are widened to type v4f32.
+ // 64-bit packed integer vectors (v2i32) are promoted to type v2i64.
+ std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Tp);
+
+ // The backend knows how to generate a single VEX.256 version of
+ // instruction VPBLENDW if the target supports AVX2.
+ if (ST->hasAVX2() && LT.second == MVT::v16i16)
+ return LT.first;
+
+ static const CostTblEntry<MVT::SimpleValueType> AVXAltShuffleTbl[] = {
+ {ISD::VECTOR_SHUFFLE, MVT::v4i64, 1}, // vblendpd
+ {ISD::VECTOR_SHUFFLE, MVT::v4f64, 1}, // vblendpd
+
+ {ISD::VECTOR_SHUFFLE, MVT::v8i32, 1}, // vblendps
+ {ISD::VECTOR_SHUFFLE, MVT::v8f32, 1}, // vblendps
+
+ // This shuffle is custom lowered into a sequence of:
+ // 2x vextractf128 , 2x vpblendw , 1x vinsertf128
+ {ISD::VECTOR_SHUFFLE, MVT::v16i16, 5},
+
+ // This shuffle is custom lowered into a long sequence of:
+ // 2x vextractf128 , 4x vpshufb , 2x vpor , 1x vinsertf128
+ {ISD::VECTOR_SHUFFLE, MVT::v32i8, 9}
+ };
+
+ if (ST->hasAVX()) {
+ int Idx = CostTableLookup(AVXAltShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second);
+ if (Idx != -1)
+ return LT.first * AVXAltShuffleTbl[Idx].Cost;
+ }
+
+ static const CostTblEntry<MVT::SimpleValueType> SSE41AltShuffleTbl[] = {
+ // These are lowered into movsd.
+ {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
+ {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
+
+ // packed float vectors with four elements are lowered into BLENDI dag
+ // nodes. A v4i32/v4f32 BLENDI generates a single 'blendps'/'blendpd'.
+ {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1},
+ {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1},
+
+ // This shuffle generates a single pshufw.
+ {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1},
+
+ // There is no instruction that matches a v16i8 alternate shuffle.
+ // The backend will expand it into the sequence 'pshufb + pshufb + or'.
+ {ISD::VECTOR_SHUFFLE, MVT::v16i8, 3}
+ };
+
+ if (ST->hasSSE41()) {
+ int Idx = CostTableLookup(SSE41AltShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second);
+ if (Idx != -1)
+ return LT.first * SSE41AltShuffleTbl[Idx].Cost;
+ }
+
+ static const CostTblEntry<MVT::SimpleValueType> SSSE3AltShuffleTbl[] = {
+ {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1}, // movsd
+ {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1}, // movsd
+
+ // SSE3 doesn't have 'blendps'. The following shuffles are expanded into
+ // the sequence 'shufps + pshufd'
+ {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2},
+ {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2},
- // Multiple by the number of parts.
- return Cost * LT.first;
+ {ISD::VECTOR_SHUFFLE, MVT::v8i16, 3}, // pshufb + pshufb + or
+ {ISD::VECTOR_SHUFFLE, MVT::v16i8, 3} // pshufb + pshufb + or
+ };
+
+ if (ST->hasSSSE3()) {
+ int Idx = CostTableLookup(SSSE3AltShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second);
+ if (Idx != -1)
+ return LT.first * SSSE3AltShuffleTbl[Idx].Cost;
+ }
+
+ static const CostTblEntry<MVT::SimpleValueType> SSEAltShuffleTbl[] = {
+ {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1}, // movsd
+ {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1}, // movsd
+
+ {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2}, // shufps + pshufd
+ {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2}, // shufps + pshufd
+
+ // This is expanded into a long sequence of four extract + four insert.
+ {ISD::VECTOR_SHUFFLE, MVT::v8i16, 8}, // 4 x pextrw + 4 pinsrw.
+
+ // 8 x (pinsrw + pextrw + and + movb + movzb + or)
+ {ISD::VECTOR_SHUFFLE, MVT::v16i8, 48}
+ };
+
+ // Fall-back (SSE3 and SSE2).
+ int Idx = CostTableLookup(SSEAltShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second);
+ if (Idx != -1)
+ return LT.first * SSEAltShuffleTbl[Idx].Cost;
+ return TargetTransformInfo::getShuffleCost(Kind, Tp, Index, SubTp);
+ }
+
+ return TargetTransformInfo::getShuffleCost(Kind, Tp, Index, SubTp);
}
unsigned X86TTI::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) const {
@@ -808,6 +914,19 @@ unsigned X86TTI::getReductionCost(unsigned Opcode, Type *ValTy,
return TargetTransformInfo::getReductionCost(Opcode, ValTy, IsPairwise);
}
+/// \brief Calculate the cost of materializing a 64-bit value. This helper
+/// method might only calculate a fraction of a larger immediate. Therefore it
+/// is valid to return a cost of ZERO.
+unsigned X86TTI::getIntImmCost(int64_t Val) const {
+ if (Val == 0)
+ return TCC_Free;
+
+ if (isInt<32>(Val))
+ return TCC_Basic;
+
+ return 2 * TCC_Basic;
+}
+
unsigned X86TTI::getIntImmCost(const APInt &Imm, Type *Ty) const {
assert(Ty->isIntegerTy());
@@ -825,11 +944,21 @@ unsigned X86TTI::getIntImmCost(const APInt &Imm, Type *Ty) const {
if (Imm == 0)
return TCC_Free;
- if (Imm.getBitWidth() <= 64 &&
- (isInt<32>(Imm.getSExtValue()) || isUInt<32>(Imm.getZExtValue())))
- return TCC_Basic;
- else
- return 2 * TCC_Basic;
+ // Sign-extend all constants to a multiple of 64-bit.
+ APInt ImmVal = Imm;
+ if (BitSize & 0x3f)
+ ImmVal = Imm.sext((BitSize + 63) & ~0x3fU);
+
+ // Split the constant into 64-bit chunks and calculate the cost for each
+ // chunk.
+ unsigned Cost = 0;
+ for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
+ APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
+ int64_t Val = Tmp.getSExtValue();
+ Cost += getIntImmCost(Val);
+ }
+ // We need at least one instruction to materialze the constant.
+ return std::max(1U, Cost);
}
unsigned X86TTI::getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
@@ -889,9 +1018,13 @@ unsigned X86TTI::getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
break;
}
- if ((Idx == ImmIdx) &&
- Imm.getBitWidth() <= 64 && isInt<32>(Imm.getSExtValue()))
- return TCC_Free;
+ if (Idx == ImmIdx) {
+ unsigned NumConstants = (BitSize + 63) / 64;
+ unsigned Cost = X86TTI::getIntImmCost(Imm, Ty);
+ return (Cost <= NumConstants * TCC_Basic)
+ ? static_cast<unsigned>(TCC_Free)
+ : Cost;
+ }
return X86TTI::getIntImmCost(Imm, Ty);
}