author     Stephen Hines <srhines@google.com>  2014-05-29 02:49:00 -0700
committer  Stephen Hines <srhines@google.com>  2014-05-29 02:49:00 -0700
commit     dce4a407a24b04eebc6a376f8e62b41aaa7b071f (patch)
tree       dcebc53f2b182f145a2e659393bf9a0472cedf23 /lib/Target/X86
parent     220b921aed042f9e520c26cffd8282a94c66c3d5 (diff)
Update LLVM for 3.5 rebase (r209712).
Change-Id: I149556c940fb7dc92d075273c87ff584f400941f
Diffstat (limited to 'lib/Target/X86')
-rw-r--r--  lib/Target/X86/Android.mk | 1
-rw-r--r--  lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp | 122
-rw-r--r--  lib/Target/X86/AsmParser/X86AsmInstrumentation.h | 17
-rw-r--r--  lib/Target/X86/AsmParser/X86AsmParser.cpp | 93
-rw-r--r--  lib/Target/X86/AsmParser/X86Operand.h | 6
-rw-r--r--  lib/Target/X86/CMakeLists.txt | 1
-rw-r--r--  lib/Target/X86/Disassembler/Android.mk | 3
-rw-r--r--  lib/Target/X86/Disassembler/CMakeLists.txt | 2
-rw-r--r--  lib/Target/X86/Disassembler/Makefile | 4
-rw-r--r--  lib/Target/X86/Disassembler/X86Disassembler.cpp | 51
-rw-r--r--  lib/Target/X86/Disassembler/X86Disassembler.h | 17
-rw-r--r--  lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp (renamed from lib/Target/X86/Disassembler/X86DisassemblerDecoder.c) | 181
-rw-r--r--  lib/Target/X86/Disassembler/X86DisassemblerDecoder.h | 362
-rw-r--r--  lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h | 221
-rw-r--r--  lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp | 3
-rw-r--r--  lib/Target/X86/InstPrinter/X86ATTInstPrinter.h | 2
-rw-r--r--  lib/Target/X86/InstPrinter/X86InstComments.cpp | 4
-rw-r--r--  lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp | 3
-rw-r--r--  lib/Target/X86/MCTargetDesc/Android.mk | 3
-rw-r--r--  lib/Target/X86/MCTargetDesc/CMakeLists.txt | 1
-rw-r--r--  lib/Target/X86/MCTargetDesc/LLVMBuild.txt | 2
-rw-r--r--  lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp | 16
-rw-r--r--  lib/Target/X86/MCTargetDesc/X86BaseInfo.h | 9
-rw-r--r--  lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp | 8
-rw-r--r--  lib/Target/X86/MCTargetDesc/X86ELFRelocationInfo.cpp | 2
-rw-r--r--  lib/Target/X86/MCTargetDesc/X86FixupKinds.h | 1
-rw-r--r--  lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp | 8
-rw-r--r--  lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp | 19
-rw-r--r--  lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp | 42
-rw-r--r--  lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h | 9
-rw-r--r--  lib/Target/X86/MCTargetDesc/X86MachORelocationInfo.cpp | 2
-rw-r--r--  lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp | 23
-rw-r--r--  lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp | 72
-rw-r--r--  lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp | 51
-rw-r--r--  lib/Target/X86/X86.h | 4
-rw-r--r--  lib/Target/X86/X86.td | 27
-rw-r--r--  lib/Target/X86/X86AsmPrinter.cpp | 192
-rw-r--r--  lib/Target/X86/X86AsmPrinter.h | 4
-rw-r--r--  lib/Target/X86/X86COFFMachineModuleInfo.cpp | 19
-rw-r--r--  lib/Target/X86/X86COFFMachineModuleInfo.h | 46
-rw-r--r--  lib/Target/X86/X86CallingConv.h | 27
-rw-r--r--  lib/Target/X86/X86CallingConv.td | 10
-rw-r--r--  lib/Target/X86/X86CodeEmitter.cpp | 9
-rw-r--r--  lib/Target/X86/X86FastISel.cpp | 36
-rw-r--r--  lib/Target/X86/X86FixupLEAs.cpp | 107
-rw-r--r--  lib/Target/X86/X86FloatingPoint.cpp | 5
-rw-r--r--  lib/Target/X86/X86FrameLowering.cpp | 91
-rw-r--r--  lib/Target/X86/X86FrameLowering.h | 2
-rw-r--r--  lib/Target/X86/X86ISelDAGToDAG.cpp | 98
-rw-r--r--  lib/Target/X86/X86ISelLowering.cpp | 2193
-rw-r--r--  lib/Target/X86/X86ISelLowering.h | 42
-rw-r--r--  lib/Target/X86/X86InstrAVX512.td | 206
-rw-r--r--  lib/Target/X86/X86InstrBuilder.h | 3
-rw-r--r--  lib/Target/X86/X86InstrCompiler.td | 4
-rw-r--r--  lib/Target/X86/X86InstrFMA.td | 46
-rw-r--r--  lib/Target/X86/X86InstrFragmentsSIMD.td | 5
-rw-r--r--  lib/Target/X86/X86InstrInfo.cpp | 262
-rw-r--r--  lib/Target/X86/X86InstrInfo.h | 9
-rw-r--r--  lib/Target/X86/X86InstrInfo.td | 200
-rw-r--r--  lib/Target/X86/X86InstrMMX.td | 8
-rw-r--r--  lib/Target/X86/X86InstrSSE.td | 147
-rw-r--r--  lib/Target/X86/X86InstrSystem.td | 2
-rw-r--r--  lib/Target/X86/X86JITInfo.cpp | 10
-rw-r--r--  lib/Target/X86/X86MCInstLower.cpp | 8
-rw-r--r--  lib/Target/X86/X86PadShortFunction.cpp | 9
-rw-r--r--  lib/Target/X86/X86RegisterInfo.cpp | 8
-rw-r--r--  lib/Target/X86/X86RegisterInfo.h | 4
-rw-r--r--  lib/Target/X86/X86SchedHaswell.td | 3
-rw-r--r--  lib/Target/X86/X86SchedSandyBridge.td | 3
-rw-r--r--  lib/Target/X86/X86ScheduleAtom.td | 4
-rw-r--r--  lib/Target/X86/X86ScheduleSLM.td | 849
-rw-r--r--  lib/Target/X86/X86SelectionDAGInfo.cpp | 36
-rw-r--r--  lib/Target/X86/X86Subtarget.cpp | 364
-rw-r--r--  lib/Target/X86/X86Subtarget.h | 11
-rw-r--r--  lib/Target/X86/X86TargetMachine.cpp | 33
-rw-r--r--  lib/Target/X86/X86TargetObjectFile.cpp | 14
-rw-r--r--  lib/Target/X86/X86TargetTransformInfo.cpp | 159
-rw-r--r--  lib/Target/X86/X86VZeroUpper.cpp | 6
78 files changed, 3701 insertions, 2985 deletions
diff --git a/lib/Target/X86/Android.mk b/lib/Target/X86/Android.mk
index 73031de..0d0a9ca 100644
--- a/lib/Target/X86/Android.mk
+++ b/lib/Target/X86/Android.mk
@@ -12,7 +12,6 @@ x86_codegen_TBLGEN_TABLES := \
x86_codegen_SRC_FILES := \
X86AsmPrinter.cpp \
- X86COFFMachineModuleInfo.cpp \
X86CodeEmitter.cpp \
X86FastISel.cpp \
X86FixupLEAs.cpp \
diff --git a/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp b/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp
index db29228..f3e6b3f 100644
--- a/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp
+++ b/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp
@@ -11,21 +11,25 @@
#include "X86AsmInstrumentation.h"
#include "X86Operand.h"
#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/IR/Function.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCInstBuilder.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCTargetOptions.h"
#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Compiler.h"
-#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
namespace llvm {
namespace {
-static cl::opt<bool> ClAsanInstrumentInlineAssembly(
- "asan-instrument-inline-assembly", cl::desc("instrument inline assembly"),
- cl::Hidden, cl::init(false));
+static cl::opt<bool> ClAsanInstrumentAssembly(
+ "asan-instrument-assembly",
+ cl::desc("instrument assembly with AddressSanitizer checks"), cl::Hidden,
+ cl::init(false));
bool IsStackReg(unsigned Reg) {
return Reg == X86::RSP || Reg == X86::ESP || Reg == X86::SP;
@@ -38,14 +42,14 @@ std::string FuncName(unsigned AccessSize, bool IsWrite) {
class X86AddressSanitizer : public X86AsmInstrumentation {
public:
- X86AddressSanitizer(MCSubtargetInfo &sti) : STI(sti) {}
+ X86AddressSanitizer(const MCSubtargetInfo &STI) : STI(STI) {}
virtual ~X86AddressSanitizer() {}
// X86AsmInstrumentation implementation:
virtual void InstrumentInstruction(
const MCInst &Inst, SmallVectorImpl<MCParsedAsmOperand *> &Operands,
- MCContext &Ctx, MCStreamer &Out) override {
- InstrumentMOV(Inst, Operands, Ctx, Out);
+ MCContext &Ctx, const MCInstrInfo &MII, MCStreamer &Out) override {
+ InstrumentMOV(Inst, Operands, Ctx, MII, Out);
}
// Should be implemented differently in x86_32 and x86_64 subclasses.
@@ -57,13 +61,13 @@ public:
bool IsWrite, MCContext &Ctx, MCStreamer &Out);
void InstrumentMOV(const MCInst &Inst,
SmallVectorImpl<MCParsedAsmOperand *> &Operands,
- MCContext &Ctx, MCStreamer &Out);
+ MCContext &Ctx, const MCInstrInfo &MII, MCStreamer &Out);
void EmitInstruction(MCStreamer &Out, const MCInst &Inst) {
Out.EmitInstruction(Inst, STI);
}
protected:
- MCSubtargetInfo &STI;
+ const MCSubtargetInfo &STI;
};
void X86AddressSanitizer::InstrumentMemOperand(
@@ -83,68 +87,53 @@ void X86AddressSanitizer::InstrumentMemOperand(
void X86AddressSanitizer::InstrumentMOV(
const MCInst &Inst, SmallVectorImpl<MCParsedAsmOperand *> &Operands,
- MCContext &Ctx, MCStreamer &Out) {
+ MCContext &Ctx, const MCInstrInfo &MII, MCStreamer &Out) {
// Access size in bytes.
unsigned AccessSize = 0;
- unsigned long OpIx = Operands.size();
+
switch (Inst.getOpcode()) {
case X86::MOV8mi:
case X86::MOV8mr:
- AccessSize = 1;
- OpIx = 2;
- break;
case X86::MOV8rm:
AccessSize = 1;
- OpIx = 1;
break;
case X86::MOV16mi:
case X86::MOV16mr:
- AccessSize = 2;
- OpIx = 2;
- break;
case X86::MOV16rm:
AccessSize = 2;
- OpIx = 1;
break;
case X86::MOV32mi:
case X86::MOV32mr:
- AccessSize = 4;
- OpIx = 2;
- break;
case X86::MOV32rm:
AccessSize = 4;
- OpIx = 1;
break;
case X86::MOV64mi32:
case X86::MOV64mr:
- AccessSize = 8;
- OpIx = 2;
- break;
case X86::MOV64rm:
AccessSize = 8;
- OpIx = 1;
break;
case X86::MOVAPDmr:
case X86::MOVAPSmr:
- AccessSize = 16;
- OpIx = 2;
- break;
case X86::MOVAPDrm:
case X86::MOVAPSrm:
AccessSize = 16;
- OpIx = 1;
break;
- }
- if (OpIx >= Operands.size())
+ default:
return;
+ }
- const bool IsWrite = (OpIx != 1);
- InstrumentMemOperand(Operands[OpIx], AccessSize, IsWrite, Ctx, Out);
+ const bool IsWrite = MII.get(Inst.getOpcode()).mayStore();
+ for (unsigned Ix = 0; Ix < Operands.size(); ++Ix) {
+ MCParsedAsmOperand *Op = Operands[Ix];
+ if (Op && Op->isMem())
+ InstrumentMemOperand(Op, AccessSize, IsWrite, Ctx, Out);
+ }
}
class X86AddressSanitizer32 : public X86AddressSanitizer {
public:
- X86AddressSanitizer32(MCSubtargetInfo &sti) : X86AddressSanitizer(sti) {}
+ X86AddressSanitizer32(const MCSubtargetInfo &STI)
+ : X86AddressSanitizer(STI) {}
virtual ~X86AddressSanitizer32() {}
virtual void InstrumentMemOperandImpl(X86Operand *Op, unsigned AccessSize,
@@ -172,14 +161,14 @@ void X86AddressSanitizer32::InstrumentMemOperandImpl(
MCSymbolRefExpr::Create(FuncSym, MCSymbolRefExpr::VK_PLT, Ctx);
EmitInstruction(Out, MCInstBuilder(X86::CALLpcrel32).addExpr(FuncExpr));
}
- EmitInstruction(Out, MCInstBuilder(X86::ADD32ri).addReg(X86::ESP)
- .addReg(X86::ESP).addImm(4));
+ EmitInstruction(Out, MCInstBuilder(X86::POP32r).addReg(X86::EAX));
EmitInstruction(Out, MCInstBuilder(X86::POP32r).addReg(X86::EAX));
}
class X86AddressSanitizer64 : public X86AddressSanitizer {
public:
- X86AddressSanitizer64(MCSubtargetInfo &sti) : X86AddressSanitizer(sti) {}
+ X86AddressSanitizer64(const MCSubtargetInfo &STI)
+ : X86AddressSanitizer(STI) {}
virtual ~X86AddressSanitizer64() {}
virtual void InstrumentMemOperandImpl(X86Operand *Op, unsigned AccessSize,
@@ -187,13 +176,26 @@ public:
MCStreamer &Out) override;
};
-void X86AddressSanitizer64::InstrumentMemOperandImpl(
- X86Operand *Op, unsigned AccessSize, bool IsWrite, MCContext &Ctx,
- MCStreamer &Out) {
+void X86AddressSanitizer64::InstrumentMemOperandImpl(X86Operand *Op,
+ unsigned AccessSize,
+ bool IsWrite,
+ MCContext &Ctx,
+ MCStreamer &Out) {
// FIXME: emit .cfi directives for correct stack unwinding.
- // Set %rsp below current red zone (128 bytes wide)
- EmitInstruction(Out, MCInstBuilder(X86::SUB64ri32).addReg(X86::RSP)
- .addReg(X86::RSP).addImm(128));
+
+ // Set %rsp below current red zone (128 bytes wide) using LEA instruction to
+ // preserve flags.
+ {
+ MCInst Inst;
+ Inst.setOpcode(X86::LEA64r);
+ Inst.addOperand(MCOperand::CreateReg(X86::RSP));
+
+ const MCExpr *Disp = MCConstantExpr::Create(-128, Ctx);
+ std::unique_ptr<X86Operand> Op(
+ X86Operand::CreateMem(0, Disp, X86::RSP, 0, 1, SMLoc(), SMLoc()));
+ Op->addMemOperands(Inst, 5);
+ EmitInstruction(Out, Inst);
+ }
EmitInstruction(Out, MCInstBuilder(X86::PUSH64r).addReg(X86::RDI));
{
MCInst Inst;
@@ -210,8 +212,19 @@ void X86AddressSanitizer64::InstrumentMemOperandImpl(
EmitInstruction(Out, MCInstBuilder(X86::CALL64pcrel32).addExpr(FuncExpr));
}
EmitInstruction(Out, MCInstBuilder(X86::POP64r).addReg(X86::RDI));
- EmitInstruction(Out, MCInstBuilder(X86::ADD64ri32).addReg(X86::RSP)
- .addReg(X86::RSP).addImm(128));
+
+ // Restore old %rsp value.
+ {
+ MCInst Inst;
+ Inst.setOpcode(X86::LEA64r);
+ Inst.addOperand(MCOperand::CreateReg(X86::RSP));
+
+ const MCExpr *Disp = MCConstantExpr::Create(128, Ctx);
+ std::unique_ptr<X86Operand> Op(
+ X86Operand::CreateMem(0, Disp, X86::RSP, 0, 1, SMLoc(), SMLoc()));
+ Op->addMemOperands(Inst, 5);
+ EmitInstruction(Out, Inst);
+ }
}
} // End anonymous namespace
@@ -221,10 +234,15 @@ X86AsmInstrumentation::~X86AsmInstrumentation() {}
void X86AsmInstrumentation::InstrumentInstruction(
const MCInst &Inst, SmallVectorImpl<MCParsedAsmOperand *> &Operands,
- MCContext &Ctx, MCStreamer &Out) {}
-
-X86AsmInstrumentation *CreateX86AsmInstrumentation(MCSubtargetInfo &STI) {
- if (ClAsanInstrumentInlineAssembly) {
+ MCContext &Ctx, const MCInstrInfo &MII, MCStreamer &Out) {}
+
+X86AsmInstrumentation *
+CreateX86AsmInstrumentation(const MCTargetOptions &MCOptions,
+ const MCContext &Ctx, const MCSubtargetInfo &STI) {
+ Triple T(STI.getTargetTriple());
+ const bool hasCompilerRTSupport = T.isOSLinux();
+ if (ClAsanInstrumentAssembly && hasCompilerRTSupport &&
+ MCOptions.SanitizeAddress) {
if ((STI.getFeatureBits() & X86::Mode32Bit) != 0)
return new X86AddressSanitizer32(STI);
if ((STI.getFeatureBits() & X86::Mode64Bit) != 0)
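Note on the hunks above: InstrumentMOV no longer hard-codes which operand index holds the memory reference; it derives the access direction from MII.get(Inst.getOpcode()).mayStore() and instruments every operand for which isMem() holds. The x86-64 prologue/epilogue also switches from SUB64ri32/ADD64ri32 to LEA64r so that stepping %rsp over the 128-byte red zone does not clobber EFLAGS. A condensed sketch of that emission, equivalent to the raw-MCInst blocks in the hunk (it assumes the usual base/scale/index/displacement/segment order of X86 memory suboperands and uses the class's own EmitInstruction helper):

    // leaq -128(%rsp), %rsp -- adjusts the stack pointer without
    // touching EFLAGS, unlike "subq $128, %rsp".
    EmitInstruction(Out, MCInstBuilder(X86::LEA64r)
                             .addReg(X86::RSP)  // destination
                             .addReg(X86::RSP)  // base register
                             .addImm(1)         // scale
                             .addReg(0)         // no index register
                             .addImm(-128)      // displacement: skip the red zone
                             .addReg(0));       // no segment register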
diff --git a/lib/Target/X86/AsmParser/X86AsmInstrumentation.h b/lib/Target/X86/AsmParser/X86AsmInstrumentation.h
index c783a78..0369b14 100644
--- a/lib/Target/X86/AsmParser/X86AsmInstrumentation.h
+++ b/lib/Target/X86/AsmParser/X86AsmInstrumentation.h
@@ -16,13 +16,17 @@ namespace llvm {
class MCContext;
class MCInst;
+class MCInstrInfo;
class MCParsedAsmOperand;
class MCStreamer;
class MCSubtargetInfo;
+class MCTargetOptions;
class X86AsmInstrumentation;
-X86AsmInstrumentation *CreateX86AsmInstrumentation(MCSubtargetInfo &STI);
+X86AsmInstrumentation *
+CreateX86AsmInstrumentation(const MCTargetOptions &MCOptions,
+ const MCContext &Ctx, const MCSubtargetInfo &STI);
class X86AsmInstrumentation {
public:
@@ -32,15 +36,18 @@ public:
// instruction is sent to Out.
virtual void InstrumentInstruction(
const MCInst &Inst, SmallVectorImpl<MCParsedAsmOperand *> &Operands,
- MCContext &Ctx, MCStreamer &Out);
+ MCContext &Ctx,
+ const MCInstrInfo &MII,
+ MCStreamer &Out);
protected:
friend X86AsmInstrumentation *
- CreateX86AsmInstrumentation(MCSubtargetInfo &STI);
+ CreateX86AsmInstrumentation(const MCTargetOptions &MCOptions,
+ const MCContext &Ctx, const MCSubtargetInfo &STI);
X86AsmInstrumentation();
};
-} // End llvm namespace
+} // End llvm namespace
-#endif // X86_ASM_INSTRUMENTATION_H
+#endif // X86_ASM_INSTRUMENTATION_H
diff --git a/lib/Target/X86/AsmParser/X86AsmParser.cpp b/lib/Target/X86/AsmParser/X86AsmParser.cpp
index 9eddc74..d3e695e 100644
--- a/lib/Target/X86/AsmParser/X86AsmParser.cpp
+++ b/lib/Target/X86/AsmParser/X86AsmParser.cpp
@@ -20,6 +20,7 @@
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCParser/MCAsmLexer.h"
#include "llvm/MC/MCParser/MCAsmParser.h"
#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
@@ -55,6 +56,7 @@ static const char OpPrecedence[] = {
class X86AsmParser : public MCTargetAsmParser {
MCSubtargetInfo &STI;
MCAsmParser &Parser;
+ const MCInstrInfo &MII;
ParseInstructionInfo *InstInfo;
std::unique_ptr<X86AsmInstrumentation> Instrumentation;
private:
@@ -257,7 +259,7 @@ private:
public:
IntelExprStateMachine(int64_t imm, bool stoponlbrac, bool addimmprefix) :
State(IES_PLUS), PrevState(IES_ERROR), BaseReg(0), IndexReg(0), TmpReg(0),
- Scale(1), Imm(imm), Sym(0), StopOnLBrac(stoponlbrac),
+ Scale(1), Imm(imm), Sym(nullptr), StopOnLBrac(stoponlbrac),
AddImmPrefix(addimmprefix) { Info.clear(); }
unsigned getBaseReg() { return BaseReg; }
@@ -618,7 +620,7 @@ private:
X86Operand *ErrorOperand(SMLoc Loc, StringRef Msg) {
Error(Loc, Msg);
- return 0;
+ return nullptr;
}
X86Operand *DefaultMemSIOperand(SMLoc Loc);
@@ -710,13 +712,17 @@ private:
public:
X86AsmParser(MCSubtargetInfo &sti, MCAsmParser &parser,
- const MCInstrInfo &MII)
- : MCTargetAsmParser(), STI(sti), Parser(parser), InstInfo(0) {
+ const MCInstrInfo &mii,
+ const MCTargetOptions &Options)
+ : MCTargetAsmParser(), STI(sti), Parser(parser), MII(mii),
+ InstInfo(nullptr) {
// Initialize the set of available features.
setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits()));
- Instrumentation.reset(CreateX86AsmInstrumentation(STI));
+ Instrumentation.reset(
+ CreateX86AsmInstrumentation(Options, Parser.getContext(), STI));
}
+
bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) override;
bool
@@ -1173,9 +1179,9 @@ X86Operand *X86AsmParser::ParseIntelBracExpression(unsigned SegReg, SMLoc Start,
// expression.
IntelExprStateMachine SM(ImmDisp, /*StopOnLBrac=*/false, /*AddImmPrefix=*/true);
if (ParseIntelExpression(SM, End))
- return 0;
+ return nullptr;
- const MCExpr *Disp = 0;
+ const MCExpr *Disp = nullptr;
if (const MCExpr *Sym = SM.getSym()) {
// A symbolic displacement.
Disp = Sym;
@@ -1199,7 +1205,7 @@ X86Operand *X86AsmParser::ParseIntelBracExpression(unsigned SegReg, SMLoc Start,
if (Tok.getString().find('.') != StringRef::npos) {
const MCExpr *NewDisp;
if (ParseIntelDotOperator(Disp, NewDisp))
- return 0;
+ return nullptr;
End = Tok.getEndLoc();
Parser.Lex(); // Eat the field.
@@ -1220,7 +1226,7 @@ X86Operand *X86AsmParser::ParseIntelBracExpression(unsigned SegReg, SMLoc Start,
StringRef ErrMsg;
if (CheckBaseRegAndIndexReg(BaseReg, IndexReg, ErrMsg)) {
Error(StartInBrac, ErrMsg);
- return 0;
+ return nullptr;
}
return X86Operand::CreateMem(SegReg, Disp, BaseReg, IndexReg, Scale, Start,
End, Size);
@@ -1237,7 +1243,7 @@ bool X86AsmParser::ParseIntelIdentifier(const MCExpr *&Val,
InlineAsmIdentifierInfo &Info,
bool IsUnevaluatedOperand, SMLoc &End) {
assert (isParsingInlineAsm() && "Expected to be parsing inline assembly.");
- Val = 0;
+ Val = nullptr;
StringRef LineBuf(Identifier.data());
SemaCallback->LookupInlineAsmIdentifier(LineBuf, Info, IsUnevaluatedOperand);
@@ -1309,7 +1315,7 @@ X86Operand *X86AsmParser::ParseIntelSegmentOverride(unsigned SegReg,
StringRef Identifier = Tok.getString();
if (ParseIntelIdentifier(Val, Identifier, Info,
/*Unevaluated=*/false, End))
- return 0;
+ return nullptr;
return CreateMemForInlineAsm(/*SegReg=*/0, Val, /*BaseReg=*/0,/*IndexReg=*/0,
/*Scale=*/1, Start, End, Size, Identifier, Info);
}
@@ -1337,7 +1343,7 @@ X86Operand *X86AsmParser::ParseIntelMemOperand(int64_t ImmDisp, SMLoc Start,
StringRef Identifier = Tok.getString();
if (ParseIntelIdentifier(Val, Identifier, Info,
/*Unevaluated=*/false, End))
- return 0;
+ return nullptr;
if (!getLexer().is(AsmToken::LBrac))
return CreateMemForInlineAsm(/*SegReg=*/0, Val, /*BaseReg=*/0, /*IndexReg=*/0,
@@ -1349,19 +1355,19 @@ X86Operand *X86AsmParser::ParseIntelMemOperand(int64_t ImmDisp, SMLoc Start,
IntelExprStateMachine SM(/*ImmDisp=*/0, /*StopOnLBrac=*/true,
/*AddImmPrefix=*/false);
if (ParseIntelExpression(SM, End))
- return 0;
+ return nullptr;
if (SM.getSym()) {
Error(Start, "cannot use more than one symbol in memory operand");
- return 0;
+ return nullptr;
}
if (SM.getBaseReg()) {
Error(Start, "cannot use base register with variable reference");
- return 0;
+ return nullptr;
}
if (SM.getIndexReg()) {
Error(Start, "cannot use index register with variable reference");
- return 0;
+ return nullptr;
}
const MCExpr *Disp = MCConstantExpr::Create(SM.getImm(), getContext());
@@ -1430,7 +1436,7 @@ X86Operand *X86AsmParser::ParseIntelOffsetOfOperator() {
StringRef Identifier = Tok.getString();
if (ParseIntelIdentifier(Val, Identifier, Info,
/*Unevaluated=*/false, End))
- return 0;
+ return nullptr;
// Don't emit the offset operator.
InstInfo->AsmRewrites->push_back(AsmRewrite(AOK_Skip, OffsetOfLoc, 7));
@@ -1461,13 +1467,13 @@ X86Operand *X86AsmParser::ParseIntelOperator(unsigned OpKind) {
SMLoc TypeLoc = Tok.getLoc();
Parser.Lex(); // Eat operator.
- const MCExpr *Val = 0;
+ const MCExpr *Val = nullptr;
InlineAsmIdentifierInfo Info;
SMLoc Start = Tok.getLoc(), End;
StringRef Identifier = Tok.getString();
if (ParseIntelIdentifier(Val, Identifier, Info,
/*Unevaluated=*/true, End))
- return 0;
+ return nullptr;
if (!Info.OpDecl)
return ErrorOperand(Start, "unable to lookup expression");
@@ -1522,7 +1528,7 @@ X86Operand *X86AsmParser::ParseIntelOperand() {
IntelExprStateMachine SM(/*Imm=*/0, /*StopOnLBrac=*/true,
/*AddImmPrefix=*/false);
if (ParseIntelExpression(SM, End))
- return 0;
+ return nullptr;
int64_t Imm = SM.getImm();
if (isParsingInlineAsm()) {
@@ -1580,11 +1586,11 @@ X86Operand *X86AsmParser::ParseATTOperand() {
// Read the register.
unsigned RegNo;
SMLoc Start, End;
- if (ParseRegister(RegNo, Start, End)) return 0;
+ if (ParseRegister(RegNo, Start, End)) return nullptr;
if (RegNo == X86::EIZ || RegNo == X86::RIZ) {
Error(Start, "%eiz and %riz can only be used as index registers",
SMRange(Start, End));
- return 0;
+ return nullptr;
}
// If this is a segment register followed by a ':', then this is the start
@@ -1601,7 +1607,7 @@ X86Operand *X86AsmParser::ParseATTOperand() {
Parser.Lex();
const MCExpr *Val;
if (getParser().parseExpression(Val, End))
- return 0;
+ return nullptr;
return X86Operand::CreateImm(Val, Start, End);
}
}
@@ -1630,7 +1636,7 @@ X86AsmParser::HandleAVX512Operand(SmallVectorImpl<MCParsedAsmOperand*> &Operands
StringSwitch<const char*>(getLexer().getTok().getIdentifier())
.Case("to8", "{1to8}")
.Case("to16", "{1to16}")
- .Default(0);
+ .Default(nullptr);
if (!BroadcastPrimitive)
return !ErrorAndEatStatement(getLexer().getLoc(),
"Invalid memory broadcast primitive.");
@@ -1685,7 +1691,7 @@ X86Operand *X86AsmParser::ParseMemOperand(unsigned SegReg, SMLoc MemStart) {
const MCExpr *Disp = MCConstantExpr::Create(0, getParser().getContext());
if (getLexer().isNot(AsmToken::LParen)) {
SMLoc ExprEnd;
- if (getParser().parseExpression(Disp, ExprEnd)) return 0;
+ if (getParser().parseExpression(Disp, ExprEnd)) return nullptr;
// After parsing the base expression we could either have a parenthesized
// memory address or not. If not, return now. If so, eat the (.
@@ -1712,7 +1718,7 @@ X86Operand *X86AsmParser::ParseMemOperand(unsigned SegReg, SMLoc MemStart) {
// It must be an parenthesized expression, parse it now.
if (getParser().parseParenExpression(Disp, ExprEnd))
- return 0;
+ return nullptr;
// After parsing the base expression we could either have a parenthesized
// memory address or not. If not, return now. If so, eat the (.
@@ -1736,11 +1742,11 @@ X86Operand *X86AsmParser::ParseMemOperand(unsigned SegReg, SMLoc MemStart) {
if (getLexer().is(AsmToken::Percent)) {
SMLoc StartLoc, EndLoc;
BaseLoc = Parser.getTok().getLoc();
- if (ParseRegister(BaseReg, StartLoc, EndLoc)) return 0;
+ if (ParseRegister(BaseReg, StartLoc, EndLoc)) return nullptr;
if (BaseReg == X86::EIZ || BaseReg == X86::RIZ) {
Error(StartLoc, "eiz and riz can only be used as index registers",
SMRange(StartLoc, EndLoc));
- return 0;
+ return nullptr;
}
}
@@ -1756,7 +1762,7 @@ X86Operand *X86AsmParser::ParseMemOperand(unsigned SegReg, SMLoc MemStart) {
// like "1(%eax,,1)", the assembler doesn't. Use "eiz" or "riz" for this.
if (getLexer().is(AsmToken::Percent)) {
SMLoc L;
- if (ParseRegister(IndexReg, L, L)) return 0;
+ if (ParseRegister(IndexReg, L, L)) return nullptr;
if (getLexer().isNot(AsmToken::RParen)) {
// Parse the scale amount:
@@ -1764,7 +1770,7 @@ X86Operand *X86AsmParser::ParseMemOperand(unsigned SegReg, SMLoc MemStart) {
if (getLexer().isNot(AsmToken::Comma)) {
Error(Parser.getTok().getLoc(),
"expected comma in scale expression");
- return 0;
+ return nullptr;
}
Parser.Lex(); // Eat the comma.
@@ -1774,18 +1780,18 @@ X86Operand *X86AsmParser::ParseMemOperand(unsigned SegReg, SMLoc MemStart) {
int64_t ScaleVal;
if (getParser().parseAbsoluteExpression(ScaleVal)){
Error(Loc, "expected scale expression");
- return 0;
+ return nullptr;
}
// Validate the scale amount.
if (X86MCRegisterClasses[X86::GR16RegClassID].contains(BaseReg) &&
ScaleVal != 1) {
Error(Loc, "scale factor in 16-bit address must be 1");
- return 0;
+ return nullptr;
}
if (ScaleVal != 1 && ScaleVal != 2 && ScaleVal != 4 && ScaleVal != 8){
Error(Loc, "scale factor in address must be 1, 2, 4 or 8");
- return 0;
+ return nullptr;
}
Scale = (unsigned)ScaleVal;
}
@@ -1797,7 +1803,7 @@ X86Operand *X86AsmParser::ParseMemOperand(unsigned SegReg, SMLoc MemStart) {
int64_t Value;
if (getParser().parseAbsoluteExpression(Value))
- return 0;
+ return nullptr;
if (Value != 1)
Warning(Loc, "scale factor without index register is ignored");
@@ -1808,7 +1814,7 @@ X86Operand *X86AsmParser::ParseMemOperand(unsigned SegReg, SMLoc MemStart) {
// Ok, we've eaten the memory operand, verify we have a ')' and eat it too.
if (getLexer().isNot(AsmToken::RParen)) {
Error(Parser.getTok().getLoc(), "unexpected token in memory operand");
- return 0;
+ return nullptr;
}
SMLoc MemEnd = Parser.getTok().getEndLoc();
Parser.Lex(); // Eat the ')'.
@@ -1821,18 +1827,18 @@ X86Operand *X86AsmParser::ParseMemOperand(unsigned SegReg, SMLoc MemStart) {
BaseReg != X86::SI && BaseReg != X86::DI)) &&
BaseReg != X86::DX) {
Error(BaseLoc, "invalid 16-bit base register");
- return 0;
+ return nullptr;
}
if (BaseReg == 0 &&
X86MCRegisterClasses[X86::GR16RegClassID].contains(IndexReg)) {
Error(IndexLoc, "16-bit memory operand may not include only index register");
- return 0;
+ return nullptr;
}
StringRef ErrMsg;
if (CheckBaseRegAndIndexReg(BaseReg, IndexReg, ErrMsg)) {
Error(BaseLoc, ErrMsg);
- return 0;
+ return nullptr;
}
return X86Operand::CreateMem(SegReg, Disp, BaseReg, IndexReg, Scale,
@@ -1851,7 +1857,7 @@ ParseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc,
PatchedName = PatchedName.substr(0, Name.size()-1);
// FIXME: Hack to recognize cmp<comparison code>{ss,sd,ps,pd}.
- const MCExpr *ExtraImmOp = 0;
+ const MCExpr *ExtraImmOp = nullptr;
if ((PatchedName.startswith("cmp") || PatchedName.startswith("vcmp")) &&
(PatchedName.endswith("ss") || PatchedName.endswith("sd") ||
PatchedName.endswith("ps") || PatchedName.endswith("pd"))) {
@@ -2070,8 +2076,10 @@ ParseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc,
(Name == "smov" || Name == "smovb" || Name == "smovw" ||
Name == "smovl" || Name == "smovd" || Name == "smovq"))) {
if (Operands.size() == 1) {
- if (Name == "movsd")
+ if (Name == "movsd") {
+ delete Operands.back();
Operands.back() = X86Operand::CreateToken("movsl", NameLoc);
+ }
if (isParsingIntelSyntax()) {
Operands.push_back(DefaultMemDIOperand(NameLoc));
Operands.push_back(DefaultMemSIOperand(NameLoc));
@@ -2253,7 +2261,8 @@ static const char *getSubtargetFeatureName(unsigned Val);
void X86AsmParser::EmitInstruction(
MCInst &Inst, SmallVectorImpl<MCParsedAsmOperand *> &Operands,
MCStreamer &Out) {
- Instrumentation->InstrumentInstruction(Inst, Operands, getContext(), Out);
+ Instrumentation->InstrumentInstruction(Inst, Operands, getContext(), MII,
+ Out);
Out.EmitInstruction(Inst, STI);
}
@@ -2291,7 +2300,7 @@ MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
.Case("fstsw", "fnstsw")
.Case("fstsww", "fnstsw")
.Case("fclex", "fnclex")
- .Default(0);
+ .Default(nullptr);
assert(Repl && "Unknown wait-prefixed instruction");
delete Operands[0];
Operands[0] = X86Operand::CreateToken(Repl, IDLoc);
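The movsd hunk earlier in this file and the wait-prefix hunk just above fix the same ownership bug: at this stage of parsing, Operands holds raw X86Operand pointers owned by the parser, so overwriting a slot without freeing the old pointer leaked it. The pattern, with only the comments added to lines taken from the hunk:

    delete Operands.back();  // free the owned "movsd" token before replacing it
    Operands.back() = X86Operand::CreateToken("movsl", NameLoc);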
diff --git a/lib/Target/X86/AsmParser/X86Operand.h b/lib/Target/X86/AsmParser/X86Operand.h
index 45fe2a9..de3be38 100644
--- a/lib/Target/X86/AsmParser/X86Operand.h
+++ b/lib/Target/X86/AsmParser/X86Operand.h
@@ -422,7 +422,7 @@ struct X86Operand : public MCParsedAsmOperand {
bool AddressOf = false,
SMLoc OffsetOfLoc = SMLoc(),
StringRef SymName = StringRef(),
- void *OpDecl = 0) {
+ void *OpDecl = nullptr) {
X86Operand *Res = new X86Operand(Register, StartLoc, EndLoc);
Res->Reg.RegNo = RegNo;
Res->AddressOf = AddressOf;
@@ -441,7 +441,7 @@ struct X86Operand : public MCParsedAsmOperand {
/// Create an absolute memory operand.
static X86Operand *CreateMem(const MCExpr *Disp, SMLoc StartLoc, SMLoc EndLoc,
unsigned Size = 0, StringRef SymName = StringRef(),
- void *OpDecl = 0) {
+ void *OpDecl = nullptr) {
X86Operand *Res = new X86Operand(Memory, StartLoc, EndLoc);
Res->Mem.SegReg = 0;
Res->Mem.Disp = Disp;
@@ -461,7 +461,7 @@ struct X86Operand : public MCParsedAsmOperand {
unsigned Scale, SMLoc StartLoc, SMLoc EndLoc,
unsigned Size = 0,
StringRef SymName = StringRef(),
- void *OpDecl = 0) {
+ void *OpDecl = nullptr) {
// We should never just have a displacement, that should be parsed as an
// absolute memory operand.
assert((SegReg || BaseReg || IndexReg) && "Invalid memory operand!");
diff --git a/lib/Target/X86/CMakeLists.txt b/lib/Target/X86/CMakeLists.txt
index 206b651..c54fbc1 100644
--- a/lib/Target/X86/CMakeLists.txt
+++ b/lib/Target/X86/CMakeLists.txt
@@ -14,7 +14,6 @@ add_public_tablegen_target(X86CommonTableGen)
set(sources
X86AsmPrinter.cpp
- X86COFFMachineModuleInfo.cpp
X86CodeEmitter.cpp
X86FastISel.cpp
X86FloatingPoint.cpp
diff --git a/lib/Target/X86/Disassembler/Android.mk b/lib/Target/X86/Disassembler/Android.mk
index 3984266..0b3b8a5 100644
--- a/lib/Target/X86/Disassembler/Android.mk
+++ b/lib/Target/X86/Disassembler/Android.mk
@@ -8,7 +8,8 @@ x86_disassembler_TBLGEN_TABLES := \
x86_disassembler_SRC_FILES := \
X86Disassembler.cpp \
- X86DisassemblerDecoder.c
+ X86DisassemblerDecoder.cpp
+
# For the device
# =====================================================
diff --git a/lib/Target/X86/Disassembler/CMakeLists.txt b/lib/Target/X86/Disassembler/CMakeLists.txt
index deed115..4370282 100644
--- a/lib/Target/X86/Disassembler/CMakeLists.txt
+++ b/lib/Target/X86/Disassembler/CMakeLists.txt
@@ -1,4 +1,4 @@
add_llvm_library(LLVMX86Disassembler
X86Disassembler.cpp
- X86DisassemblerDecoder.c
+ X86DisassemblerDecoder.cpp
)
diff --git a/lib/Target/X86/Disassembler/Makefile b/lib/Target/X86/Disassembler/Makefile
index 8669fd8..51e7b82 100644
--- a/lib/Target/X86/Disassembler/Makefile
+++ b/lib/Target/X86/Disassembler/Makefile
@@ -10,7 +10,9 @@
LEVEL = ../../../..
LIBRARYNAME = LLVMX86Disassembler
-# Hack: we need to include 'main' x86 target directory to grab private headers
+# Hack: we need to include 'main' x86 target directory to grab private headers.
CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
include $(LEVEL)/Makefile.common
+
+.PHONY: $(PROJ_SRC_DIR)/X86DisassemblerDecoder.c
diff --git a/lib/Target/X86/Disassembler/X86Disassembler.cpp b/lib/Target/X86/Disassembler/X86Disassembler.cpp
index d5759cd..c366725 100644
--- a/lib/Target/X86/Disassembler/X86Disassembler.cpp
+++ b/lib/Target/X86/Disassembler/X86Disassembler.cpp
@@ -27,6 +27,11 @@
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+using namespace llvm::X86Disassembler;
+
+#define DEBUG_TYPE "x86-disassembler"
+
#define GET_REGINFO_ENUM
#include "X86GenRegisterInfo.inc"
#define GET_INSTRINFO_ENUM
@@ -34,21 +39,18 @@
#define GET_SUBTARGETINFO_ENUM
#include "X86GenSubtargetInfo.inc"
-using namespace llvm;
-using namespace llvm::X86Disassembler;
-
-void x86DisassemblerDebug(const char *file,
- unsigned line,
- const char *s) {
+void llvm::X86Disassembler::Debug(const char *file, unsigned line,
+ const char *s) {
dbgs() << file << ":" << line << ": " << s;
}
-const char *x86DisassemblerGetInstrName(unsigned Opcode, const void *mii) {
+const char *llvm::X86Disassembler::GetInstrName(unsigned Opcode,
+ const void *mii) {
const MCInstrInfo *MII = static_cast<const MCInstrInfo *>(mii);
return MII->getName(Opcode);
}
-#define debug(s) DEBUG(x86DisassemblerDebug(__FILE__, __LINE__, s));
+#define debug(s) DEBUG(Debug(__FILE__, __LINE__, s));
namespace llvm {
@@ -74,9 +76,11 @@ static bool translateInstruction(MCInst &target,
InternalInstruction &source,
const MCDisassembler *Dis);
-X86GenericDisassembler::X86GenericDisassembler(const MCSubtargetInfo &STI,
- const MCInstrInfo *MII)
- : MCDisassembler(STI), MII(MII) {
+X86GenericDisassembler::X86GenericDisassembler(
+ const MCSubtargetInfo &STI,
+ MCContext &Ctx,
+ std::unique_ptr<const MCInstrInfo> MII)
+ : MCDisassembler(STI, Ctx), MII(std::move(MII)) {
switch (STI.getFeatureBits() &
(X86::Mode16Bit | X86::Mode32Bit | X86::Mode64Bit)) {
case X86::Mode16Bit:
@@ -93,10 +97,6 @@ X86GenericDisassembler::X86GenericDisassembler(const MCSubtargetInfo &STI,
}
}
-X86GenericDisassembler::~X86GenericDisassembler() {
- delete MII;
-}
-
/// regionReader - a callback function that wraps the readByte method from
/// MemoryObject.
///
@@ -140,14 +140,14 @@ X86GenericDisassembler::getInstruction(MCInst &instr,
dlog_t loggerFn = logger;
if (&vStream == &nulls())
- loggerFn = 0; // Disable logging completely if it's going to nulls().
+ loggerFn = nullptr; // Disable logging completely if it's going to nulls().
int ret = decodeInstruction(&internalInstr,
regionReader,
(const void*)&region,
loggerFn,
(void*)&vStream,
- (const void*)MII,
+ (const void*)MII.get(),
address,
fMode);
@@ -319,7 +319,7 @@ static void translateImmediate(MCInst &mcInst, uint64_t immediate,
}
// By default sign-extend all X86 immediates based on their encoding.
else if (type == TYPE_IMM8 || type == TYPE_IMM16 || type == TYPE_IMM32 ||
- type == TYPE_IMM64) {
+ type == TYPE_IMM64 || type == TYPE_IMMv) {
uint32_t Opcode = mcInst.getOpcode();
switch (operand.encoding) {
default:
@@ -787,13 +787,11 @@ static bool translateInstruction(MCInst &mcInst,
mcInst.setOpcode(X86::XACQUIRE_PREFIX);
}
- int index;
-
insn.numImmediatesTranslated = 0;
- for (index = 0; index < X86_MAX_OPERANDS; ++index) {
- if (insn.operands[index].encoding != ENCODING_NONE) {
- if (translateOperand(mcInst, insn.operands[index], insn, Dis)) {
+ for (const auto &Op : insn.operands) {
+ if (Op.encoding != ENCODING_NONE) {
+ if (translateOperand(mcInst, Op, insn, Dis)) {
return true;
}
}
@@ -803,9 +801,10 @@ static bool translateInstruction(MCInst &mcInst,
}
static MCDisassembler *createX86Disassembler(const Target &T,
- const MCSubtargetInfo &STI) {
- return new X86Disassembler::X86GenericDisassembler(STI,
- T.createMCInstrInfo());
+ const MCSubtargetInfo &STI,
+ MCContext &Ctx) {
+ std::unique_ptr<const MCInstrInfo> MII(T.createMCInstrInfo());
+ return new X86Disassembler::X86GenericDisassembler(STI, Ctx, std::move(MII));
}
extern "C" void LLVMInitializeX86Disassembler() {
diff --git a/lib/Target/X86/Disassembler/X86Disassembler.h b/lib/Target/X86/Disassembler/X86Disassembler.h
index 4e6e297..4dc7c29 100644
--- a/lib/Target/X86/Disassembler/X86Disassembler.h
+++ b/lib/Target/X86/Disassembler/X86Disassembler.h
@@ -74,17 +74,7 @@
#ifndef X86DISASSEMBLER_H
#define X86DISASSEMBLER_H
-#define INSTRUCTION_SPECIFIER_FIELDS \
- uint16_t operands;
-
-#define INSTRUCTION_IDS \
- uint16_t instructionIDs;
-
#include "X86DisassemblerDecoderCommon.h"
-
-#undef INSTRUCTION_SPECIFIER_FIELDS
-#undef INSTRUCTION_IDS
-
#include "llvm/MC/MCDisassembler.h"
namespace llvm {
@@ -101,13 +91,12 @@ namespace X86Disassembler {
/// All each platform class should have to do is subclass the constructor, and
/// provide a different disassemblerMode value.
class X86GenericDisassembler : public MCDisassembler {
- const MCInstrInfo *MII;
+ std::unique_ptr<const MCInstrInfo> MII;
public:
/// Constructor - Initializes the disassembler.
///
- X86GenericDisassembler(const MCSubtargetInfo &STI, const MCInstrInfo *MII);
-private:
- ~X86GenericDisassembler();
+ X86GenericDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx,
+ std::unique_ptr<const MCInstrInfo> MII);
public:
/// getInstruction - See MCDisassembler.
diff --git a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.c b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp
index 0801c96..804606d 100644
--- a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.c
+++ b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp
@@ -1,17 +1,17 @@
-/*===-- X86DisassemblerDecoder.c - Disassembler decoder ------------*- C -*-===*
- *
- * The LLVM Compiler Infrastructure
- *
- * This file is distributed under the University of Illinois Open Source
- * License. See LICENSE.TXT for details.
- *
- *===----------------------------------------------------------------------===*
- *
- * This file is part of the X86 Disassembler.
- * It contains the implementation of the instruction decoder.
- * Documentation for the disassembler can be found in X86Disassembler.h.
- *
- *===----------------------------------------------------------------------===*/
+//===-- X86DisassemblerDecoder.c - Disassembler decoder -------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is part of the X86 Disassembler.
+// It contains the implementation of the instruction decoder.
+// Documentation for the disassembler can be found in X86Disassembler.h.
+//
+//===----------------------------------------------------------------------===//
#include <stdarg.h> /* for va_*() */
#include <stdio.h> /* for vsnprintf() */
@@ -20,13 +20,35 @@
#include "X86DisassemblerDecoder.h"
-#include "X86GenDisassemblerTables.inc"
+using namespace llvm::X86Disassembler;
+
+/// Specifies whether a ModR/M byte is needed and (if so) which
+/// instruction each possible value of the ModR/M byte corresponds to. Once
+/// this information is known, we have narrowed down to a single instruction.
+struct ModRMDecision {
+ uint8_t modrm_type;
+ uint16_t instructionIDs;
+};
+
+/// Specifies which set of ModR/M->instruction tables to look at
+/// given a particular opcode.
+struct OpcodeDecision {
+ ModRMDecision modRMDecisions[256];
+};
+
+/// Specifies which opcode->instruction tables to look at given
+/// a particular context (set of attributes). Since there are many possible
+/// contexts, the decoder first uses CONTEXTS_SYM to determine which context
+/// applies given a specific set of attributes. Hence there are only IC_max
+/// entries in this table, rather than 2^(ATTR_max).
+struct ContextDecision {
+ OpcodeDecision opcodeDecisions[IC_max];
+};
-#define TRUE 1
-#define FALSE 0
+#include "X86GenDisassemblerTables.inc"
#ifndef NDEBUG
-#define debug(s) do { x86DisassemblerDebug(__FILE__, __LINE__, s); } while (0)
+#define debug(s) do { Debug(__FILE__, __LINE__, s); } while (0)
#else
#define debug(s) do { } while (0)
#endif
@@ -41,7 +63,7 @@
* an instruction with these attributes.
*/
static InstructionContext contextForAttrs(uint16_t attrMask) {
- return CONTEXTS_SYM[attrMask];
+ return static_cast<InstructionContext>(CONTEXTS_SYM[attrMask]);
}
/*
@@ -53,12 +75,12 @@ static InstructionContext contextForAttrs(uint16_t attrMask) {
* contextForAttrs.
* @param opcode - The last byte of the instruction's opcode, not counting
* ModR/M extensions and escapes.
- * @return - TRUE if the ModR/M byte is required, FALSE otherwise.
+ * @return - true if the ModR/M byte is required, false otherwise.
*/
static int modRMRequired(OpcodeType type,
InstructionContext insnContext,
uint16_t opcode) {
- const struct ContextDecision* decision = 0;
+ const struct ContextDecision* decision = nullptr;
switch (type) {
case ONEBYTE:
@@ -102,7 +124,7 @@ static InstrUID decode(OpcodeType type,
InstructionContext insnContext,
uint8_t opcode,
uint8_t modRM) {
- const struct ModRMDecision* dec = 0;
+ const struct ModRMDecision* dec = nullptr;
switch (type) {
case ONEBYTE:
@@ -284,15 +306,15 @@ static void setPrefixPresent(struct InternalInstruction* insn,
* @param location - The location to query.
* @return - Whether the prefix is at that location.
*/
-static BOOL isPrefixAtLocation(struct InternalInstruction* insn,
+static bool isPrefixAtLocation(struct InternalInstruction* insn,
uint8_t prefix,
uint64_t location)
{
if (insn->prefixPresent[prefix] == 1 &&
insn->prefixLocations[prefix] == location)
- return TRUE;
+ return true;
else
- return FALSE;
+ return false;
}
/*
@@ -305,14 +327,14 @@ static BOOL isPrefixAtLocation(struct InternalInstruction* insn,
* bytes, and no prefixes conflicted; nonzero otherwise.
*/
static int readPrefixes(struct InternalInstruction* insn) {
- BOOL isPrefix = TRUE;
- BOOL prefixGroups[4] = { FALSE };
+ bool isPrefix = true;
+ bool prefixGroups[4] = { false };
uint64_t prefixLocation;
uint8_t byte = 0;
uint8_t nextByte;
- BOOL hasAdSize = FALSE;
- BOOL hasOpSize = FALSE;
+ bool hasAdSize = false;
+ bool hasOpSize = false;
dbgprintf(insn, "readPrefixes()");
@@ -344,7 +366,7 @@ static int readPrefixes(struct InternalInstruction* insn) {
if ((byte == 0xf2 || byte == 0xf3) &&
((nextByte == 0xf0) |
((nextByte & 0xfe) == 0x86 || (nextByte & 0xf8) == 0x90)))
- insn->xAcquireRelease = TRUE;
+ insn->xAcquireRelease = true;
/*
* Also if the byte is 0xf3, and the following condition is met:
* - it is followed by a "mov mem, reg" (opcode 0x88/0x89) or
@@ -354,7 +376,7 @@ static int readPrefixes(struct InternalInstruction* insn) {
if (byte == 0xf3 &&
(nextByte == 0x88 || nextByte == 0x89 ||
nextByte == 0xc6 || nextByte == 0xc7))
- insn->xAcquireRelease = TRUE;
+ insn->xAcquireRelease = true;
if (insn->mode == MODE_64BIT && (nextByte & 0xf0) == 0x40) {
if (consumeByte(insn, &nextByte))
return -1;
@@ -372,7 +394,7 @@ static int readPrefixes(struct InternalInstruction* insn) {
case 0xf3: /* REP or REPE/REPZ */
if (prefixGroups[0])
dbgprintf(insn, "Redundant Group 1 prefix");
- prefixGroups[0] = TRUE;
+ prefixGroups[0] = true;
setPrefixPresent(insn, byte, prefixLocation);
break;
case 0x2e: /* CS segment override -OR- Branch not taken */
@@ -406,25 +428,25 @@ static int readPrefixes(struct InternalInstruction* insn) {
}
if (prefixGroups[1])
dbgprintf(insn, "Redundant Group 2 prefix");
- prefixGroups[1] = TRUE;
+ prefixGroups[1] = true;
setPrefixPresent(insn, byte, prefixLocation);
break;
case 0x66: /* Operand-size override */
if (prefixGroups[2])
dbgprintf(insn, "Redundant Group 3 prefix");
- prefixGroups[2] = TRUE;
- hasOpSize = TRUE;
+ prefixGroups[2] = true;
+ hasOpSize = true;
setPrefixPresent(insn, byte, prefixLocation);
break;
case 0x67: /* Address-size override */
if (prefixGroups[3])
dbgprintf(insn, "Redundant Group 4 prefix");
- prefixGroups[3] = TRUE;
- hasAdSize = TRUE;
+ prefixGroups[3] = true;
+ hasAdSize = true;
setPrefixPresent(insn, byte, prefixLocation);
break;
default: /* Not a prefix byte */
- isPrefix = FALSE;
+ isPrefix = false;
break;
}
@@ -549,7 +571,7 @@ static int readPrefixes(struct InternalInstruction* insn) {
default:
break;
case VEX_PREFIX_66:
- hasOpSize = TRUE;
+ hasOpSize = true;
break;
}
@@ -595,7 +617,7 @@ static int readPrefixes(struct InternalInstruction* insn) {
default:
break;
case VEX_PREFIX_66:
- hasOpSize = TRUE;
+ hasOpSize = true;
break;
}
@@ -790,11 +812,9 @@ static int readModRM(struct InternalInstruction* insn);
static int getIDWithAttrMask(uint16_t* instructionID,
struct InternalInstruction* insn,
uint16_t attrMask) {
- BOOL hasModRMExtension;
-
- uint16_t instructionClass;
+ bool hasModRMExtension;
- instructionClass = contextForAttrs(attrMask);
+ InstructionContext instructionClass = contextForAttrs(attrMask);
hasModRMExtension = modRMRequired(insn->opcodeType,
instructionClass,
@@ -825,14 +845,14 @@ static int getIDWithAttrMask(uint16_t* instructionID,
* @param orig - The instruction that is not 16-bit
* @param equiv - The instruction that is 16-bit
*/
-static BOOL is16BitEquivalent(const char* orig, const char* equiv) {
+static bool is16BitEquivalent(const char* orig, const char* equiv) {
off_t i;
for (i = 0;; i++) {
if (orig[i] == '\0' && equiv[i] == '\0')
- return TRUE;
+ return true;
if (orig[i] == '\0' || equiv[i] == '\0')
- return FALSE;
+ return false;
if (orig[i] != equiv[i]) {
if ((orig[i] == 'Q' || orig[i] == 'L') && equiv[i] == 'W')
continue;
@@ -840,7 +860,7 @@ static BOOL is16BitEquivalent(const char* orig, const char* equiv) {
continue;
if ((orig[i] == '4' || orig[i] == '2') && equiv[i] == '6')
continue;
- return FALSE;
+ return false;
}
}
}
@@ -1011,9 +1031,8 @@ static int getID(struct InternalInstruction* insn, const void *miiArg) {
return 0;
}
- specName = x86DisassemblerGetInstrName(instructionID, miiArg);
- specWithOpSizeName =
- x86DisassemblerGetInstrName(instructionIDWithOpsize, miiArg);
+ specName = GetInstrName(instructionID, miiArg);
+ specWithOpSizeName = GetInstrName(instructionIDWithOpsize, miiArg);
if (is16BitEquivalent(specName, specWithOpSizeName) &&
(insn->mode == MODE_16BIT) ^ insn->prefixPresent[0x66]) {
@@ -1077,8 +1096,8 @@ static int getID(struct InternalInstruction* insn, const void *miiArg) {
* @return - 0 if the SIB byte was successfully read; nonzero otherwise.
*/
static int readSIB(struct InternalInstruction* insn) {
- SIBIndex sibIndexBase = 0;
- SIBBase sibBaseBase = 0;
+ SIBIndex sibIndexBase = SIB_INDEX_NONE;
+ SIBBase sibBaseBase = SIB_BASE_NONE;
uint8_t index, base;
dbgprintf(insn, "readSIB()");
@@ -1086,7 +1105,7 @@ static int readSIB(struct InternalInstruction* insn) {
if (insn->consumedSIB)
return 0;
- insn->consumedSIB = TRUE;
+ insn->consumedSIB = true;
switch (insn->addressSize) {
case 2:
@@ -1184,12 +1203,12 @@ static int readDisplacement(struct InternalInstruction* insn) {
if (insn->consumedDisplacement)
return 0;
- insn->consumedDisplacement = TRUE;
+ insn->consumedDisplacement = true;
insn->displacementOffset = insn->readerCursor - insn->startLocation;
switch (insn->eaDisplacement) {
case EA_DISP_NONE:
- insn->consumedDisplacement = FALSE;
+ insn->consumedDisplacement = false;
break;
case EA_DISP_8:
if (consumeInt8(insn, &d8))
@@ -1208,7 +1227,7 @@ static int readDisplacement(struct InternalInstruction* insn) {
break;
}
- insn->consumedDisplacement = TRUE;
+ insn->consumedDisplacement = true;
return 0;
}
@@ -1229,7 +1248,7 @@ static int readModRM(struct InternalInstruction* insn) {
if (consumeByte(insn, &insn->modRM))
return -1;
- insn->consumedModRM = TRUE;
+ insn->consumedModRM = true;
mod = modFromModRM(insn->modRM);
rm = rmFromModRM(insn->modRM);
@@ -1599,20 +1618,22 @@ static int readImmediate(struct InternalInstruction* insn, uint8_t size) {
static int readVVVV(struct InternalInstruction* insn) {
dbgprintf(insn, "readVVVV()");
+ int vvvv;
if (insn->vectorExtensionType == TYPE_EVEX)
- insn->vvvv = vvvvFromEVEX3of4(insn->vectorExtensionPrefix[2]);
+ vvvv = vvvvFromEVEX3of4(insn->vectorExtensionPrefix[2]);
else if (insn->vectorExtensionType == TYPE_VEX_3B)
- insn->vvvv = vvvvFromVEX3of3(insn->vectorExtensionPrefix[2]);
+ vvvv = vvvvFromVEX3of3(insn->vectorExtensionPrefix[2]);
else if (insn->vectorExtensionType == TYPE_VEX_2B)
- insn->vvvv = vvvvFromVEX2of2(insn->vectorExtensionPrefix[1]);
+ vvvv = vvvvFromVEX2of2(insn->vectorExtensionPrefix[1]);
else if (insn->vectorExtensionType == TYPE_XOP)
- insn->vvvv = vvvvFromXOP3of3(insn->vectorExtensionPrefix[2]);
+ vvvv = vvvvFromXOP3of3(insn->vectorExtensionPrefix[2]);
else
return -1;
if (insn->mode != MODE_64BIT)
- insn->vvvv &= 0x7;
+ vvvv &= 0x7;
+ insn->vvvv = static_cast<Reg>(vvvv);
return 0;
}
@@ -1629,7 +1650,8 @@ static int readMaskRegister(struct InternalInstruction* insn) {
if (insn->vectorExtensionType != TYPE_EVEX)
return -1;
- insn->writemask = aaaFromEVEX4of4(insn->vectorExtensionPrefix[3]);
+ insn->writemask =
+ static_cast<Reg>(aaaFromEVEX4of4(insn->vectorExtensionPrefix[3]));
return 0;
}
@@ -1641,7 +1663,6 @@ static int readMaskRegister(struct InternalInstruction* insn) {
* @return - 0 if all operands could be read; nonzero otherwise.
*/
static int readOperands(struct InternalInstruction* insn) {
- int index;
int hasVVVV, needVVVV;
int sawRegImm = 0;
@@ -1652,8 +1673,8 @@ static int readOperands(struct InternalInstruction* insn) {
hasVVVV = !readVVVV(insn);
needVVVV = hasVVVV && (insn->vvvv != 0);
- for (index = 0; index < X86_MAX_OPERANDS; ++index) {
- switch (x86OperandSets[insn->spec->operands][index].encoding) {
+ for (const auto &Op : x86OperandSets[insn->spec->operands]) {
+ switch (Op.encoding) {
case ENCODING_NONE:
case ENCODING_SI:
case ENCODING_DI:
@@ -1662,7 +1683,7 @@ static int readOperands(struct InternalInstruction* insn) {
case ENCODING_RM:
if (readModRM(insn))
return -1;
- if (fixupReg(insn, &x86OperandSets[insn->spec->operands][index]))
+ if (fixupReg(insn, &Op))
return -1;
break;
case ENCODING_CB:
@@ -1684,14 +1705,14 @@ static int readOperands(struct InternalInstruction* insn) {
}
if (readImmediate(insn, 1))
return -1;
- if (x86OperandSets[insn->spec->operands][index].type == TYPE_IMM3 &&
+ if (Op.type == TYPE_IMM3 &&
insn->immediates[insn->numImmediatesConsumed - 1] > 7)
return -1;
- if (x86OperandSets[insn->spec->operands][index].type == TYPE_IMM5 &&
+ if (Op.type == TYPE_IMM5 &&
insn->immediates[insn->numImmediatesConsumed - 1] > 31)
return -1;
- if (x86OperandSets[insn->spec->operands][index].type == TYPE_XMM128 ||
- x86OperandSets[insn->spec->operands][index].type == TYPE_XMM256)
+ if (Op.type == TYPE_XMM128 ||
+ Op.type == TYPE_XMM256)
sawRegImm = 1;
break;
case ENCODING_IW:
@@ -1740,7 +1761,7 @@ static int readOperands(struct InternalInstruction* insn) {
needVVVV = 0; /* Mark that we have found a VVVV operand. */
if (!hasVVVV)
return -1;
- if (fixupReg(insn, &x86OperandSets[insn->spec->operands][index]))
+ if (fixupReg(insn, &Op))
return -1;
break;
case ENCODING_WRITEMASK:
@@ -1781,14 +1802,10 @@ static int readOperands(struct InternalInstruction* insn) {
* @return - 0 if the instruction's memory could be read; nonzero if
* not.
*/
-int decodeInstruction(struct InternalInstruction* insn,
- byteReader_t reader,
- const void* readerArg,
- dlog_t logger,
- void* loggerArg,
- const void* miiArg,
- uint64_t startLoc,
- DisassemblerMode mode) {
+int llvm::X86Disassembler::decodeInstruction(
+ struct InternalInstruction *insn, byteReader_t reader,
+ const void *readerArg, dlog_t logger, void *loggerArg, const void *miiArg,
+ uint64_t startLoc, DisassemblerMode mode) {
memset(insn, 0, sizeof(struct InternalInstruction));
insn->reader = reader;
@@ -1807,7 +1824,7 @@ int decodeInstruction(struct InternalInstruction* insn,
readOperands(insn))
return -1;
- insn->operands = &x86OperandSets[insn->spec->operands][0];
+ insn->operands = x86OperandSets[insn->spec->operands];
insn->length = insn->readerCursor - insn->startLocation;
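Most of this rename from .c to .cpp is mechanical C-to-C++ conversion: the BOOL typedef and TRUE/FALSE macros become bool/true/false, untyped enum assignments gain static_casts (vvvv, writemask), and indexed walks over x86OperandSets become range-based for loops. A self-contained toy (not LLVM code; the names are illustrative only) showing the loop conversion:

    #include <cstdint>
    #include <cstdio>

    struct OperandSpecifier { uint8_t encoding; };  // stand-in for the real type
    static const OperandSpecifier Table[4] = {{1}, {0}, {3}, {0}};

    int main() {
      int N = 0;
      // Range-for binds each specifier once, replacing the C version's
      // repeated x86OperandSets[row][index] subscripting.
      for (const auto &Op : Table)
        if (Op.encoding != 0)  // 0 plays the role of ENCODING_NONE
          ++N;
      std::printf("%d encoded operands\n", N);  // prints "2 encoded operands"
    }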
diff --git a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h
index ac3b39d..8c45402 100644
--- a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h
+++ b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h
@@ -1,39 +1,28 @@
-/*===-- X86DisassemblerDecoderInternal.h - Disassembler decoder ---*- C -*-===*
- *
- * The LLVM Compiler Infrastructure
- *
- * This file is distributed under the University of Illinois Open Source
- * License. See LICENSE.TXT for details.
- *
- *===----------------------------------------------------------------------===*
- *
- * This file is part of the X86 Disassembler.
- * It contains the public interface of the instruction decoder.
- * Documentation for the disassembler can be found in X86Disassembler.h.
- *
- *===----------------------------------------------------------------------===*/
+//===-- X86DisassemblerDecoderInternal.h - Disassembler decoder -*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is part of the X86 Disassembler.
+// It contains the public interface of the instruction decoder.
+// Documentation for the disassembler can be found in X86Disassembler.h.
+//
+//===----------------------------------------------------------------------===//
#ifndef X86DISASSEMBLERDECODER_H
#define X86DISASSEMBLERDECODER_H
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#define INSTRUCTION_SPECIFIER_FIELDS \
- uint16_t operands;
-
-#define INSTRUCTION_IDS \
- uint16_t instructionIDs;
-
#include "X86DisassemblerDecoderCommon.h"
+#include "llvm/ADT/ArrayRef.h"
-#undef INSTRUCTION_SPECIFIER_FIELDS
-#undef INSTRUCTION_IDS
+namespace llvm {
+namespace X86Disassembler {
-/*
- * Accessor functions for various fields of an Intel instruction
- */
+// Accessor functions for various fields of an Intel instruction
#define modFromModRM(modRM) (((modRM) & 0xc0) >> 6)
#define regFromModRM(modRM) (((modRM) & 0x38) >> 3)
#define rmFromModRM(modRM) ((modRM) & 0x7)
@@ -83,10 +72,7 @@ extern "C" {
#define lFromXOP3of3(xop) (((xop) & 0x4) >> 2)
#define ppFromXOP3of3(xop) ((xop) & 0x3)
-/*
- * These enums represent Intel registers for use by the decoder.
- */
-
+// These enums represent Intel registers for use by the decoder.
#define REGS_8BIT \
ENTRY(AL) \
ENTRY(CL) \
@@ -392,13 +378,11 @@ extern "C" {
REGS_CONTROL \
ENTRY(RIP)
-/*
- * EABase - All possible values of the base field for effective-address
- * computations, a.k.a. the Mod and R/M fields of the ModR/M byte. We
- * distinguish between bases (EA_BASE_*) and registers that just happen to be
- * referred to when Mod == 0b11 (EA_REG_*).
- */
-typedef enum {
+/// \brief All possible values of the base field for effective-address
+/// computations, a.k.a. the Mod and R/M fields of the ModR/M byte.
+/// We distinguish between bases (EA_BASE_*) and registers that just happen
+/// to be referred to when Mod == 0b11 (EA_REG_*).
+enum EABase {
EA_BASE_NONE,
#define ENTRY(x) EA_BASE_##x,
ALL_EA_BASES
@@ -407,15 +391,13 @@ typedef enum {
ALL_REGS
#undef ENTRY
EA_max
-} EABase;
-
-/*
- * SIBIndex - All possible values of the SIB index field.
- * Borrows entries from ALL_EA_BASES with the special case that
- * sib is synonymous with NONE.
- * Vector SIB: index can be XMM or YMM.
- */
-typedef enum {
+};
+
+/// \brief All possible values of the SIB index field.
+/// borrows entries from ALL_EA_BASES with the special case that
+/// sib is synonymous with NONE.
+/// Vector SIB: index can be XMM or YMM.
+enum SIBIndex {
SIB_INDEX_NONE,
#define ENTRY(x) SIB_INDEX_##x,
ALL_EA_BASES
@@ -424,23 +406,18 @@ typedef enum {
REGS_ZMM
#undef ENTRY
SIB_INDEX_max
-} SIBIndex;
+};
-/*
- * SIBBase - All possible values of the SIB base field.
- */
-typedef enum {
+/// \brief All possible values of the SIB base field.
+enum SIBBase {
SIB_BASE_NONE,
#define ENTRY(x) SIB_BASE_##x,
ALL_SIB_BASES
#undef ENTRY
SIB_BASE_max
-} SIBBase;
+};
-/*
- * EADisplacement - Possible displacement types for effective-address
- * computations.
- */
+/// \brief Possible displacement types for effective-address computations.
typedef enum {
EA_DISP_NONE,
EA_DISP_8,
@@ -448,20 +425,16 @@ typedef enum {
EA_DISP_32
} EADisplacement;
-/*
- * Reg - All possible values of the reg field in the ModR/M byte.
- */
-typedef enum {
+/// \brief All possible values of the reg field in the ModR/M byte.
+enum Reg {
#define ENTRY(x) MODRM_REG_##x,
ALL_REGS
#undef ENTRY
MODRM_REG_max
-} Reg;
+};
-/*
- * SegmentOverride - All possible segment overrides.
- */
-typedef enum {
+/// \brief All possible segment overrides.
+enum SegmentOverride {
SEG_OVERRIDE_NONE,
SEG_OVERRIDE_CS,
SEG_OVERRIDE_SS,
@@ -470,235 +443,220 @@ typedef enum {
SEG_OVERRIDE_FS,
SEG_OVERRIDE_GS,
SEG_OVERRIDE_max
-} SegmentOverride;
-
-/*
- * VEXLeadingOpcodeByte - Possible values for the VEX.m-mmmm field
- */
+};
-typedef enum {
+/// \brief Possible values for the VEX.m-mmmm field
+enum VEXLeadingOpcodeByte {
VEX_LOB_0F = 0x1,
VEX_LOB_0F38 = 0x2,
VEX_LOB_0F3A = 0x3
-} VEXLeadingOpcodeByte;
+};
-typedef enum {
+enum XOPMapSelect {
XOP_MAP_SELECT_8 = 0x8,
XOP_MAP_SELECT_9 = 0x9,
XOP_MAP_SELECT_A = 0xA
-} XOPMapSelect;
-
-/*
- * VEXPrefixCode - Possible values for the VEX.pp/EVEX.pp field
- */
+};
-typedef enum {
+/// \brief Possible values for the VEX.pp/EVEX.pp field
+enum VEXPrefixCode {
VEX_PREFIX_NONE = 0x0,
VEX_PREFIX_66 = 0x1,
VEX_PREFIX_F3 = 0x2,
VEX_PREFIX_F2 = 0x3
-} VEXPrefixCode;
+};
-typedef enum {
+enum VectorExtensionType {
TYPE_NO_VEX_XOP = 0x0,
TYPE_VEX_2B = 0x1,
TYPE_VEX_3B = 0x2,
TYPE_EVEX = 0x3,
TYPE_XOP = 0x4
-} VectorExtensionType;
-
-typedef uint8_t BOOL;
-
-/*
- * byteReader_t - Type for the byte reader that the consumer must provide to
- * the decoder. Reads a single byte from the instruction's address space.
- * @param arg - A baton that the consumer can associate with any internal
- * state that it needs.
- * @param byte - A pointer to a single byte in memory that should be set to
- * contain the value at address.
- * @param address - The address in the instruction's address space that should
- * be read from.
- * @return - -1 if the byte cannot be read for any reason; 0 otherwise.
- */
-typedef int (*byteReader_t)(const void* arg, uint8_t* byte, uint64_t address);
-
-/*
- * dlog_t - Type for the logging function that the consumer can provide to
- * get debugging output from the decoder.
- * @param arg - A baton that the consumer can associate with any internal
- * state that it needs.
- * @param log - A string that contains the message. Will be reused after
- * the logger returns.
- */
-typedef void (*dlog_t)(void* arg, const char *log);
-
-/*
- * The x86 internal instruction, which is produced by the decoder.
- */
+};
+
+/// \brief Type for the byte reader that the consumer must provide to
+/// the decoder. Reads a single byte from the instruction's address space.
+/// \param arg A baton that the consumer can associate with any internal
+/// state that it needs.
+/// \param byte A pointer to a single byte in memory that should be set to
+/// contain the value at address.
+/// \param address The address in the instruction's address space that should
+/// be read from.
+/// \return -1 if the byte cannot be read for any reason; 0 otherwise.
+typedef int (*byteReader_t)(const void *arg, uint8_t *byte, uint64_t address);
+
+/// \brief Type for the logging function that the consumer can provide to
+/// get debugging output from the decoder.
+/// \param arg A baton that the consumer can associate with any internal
+/// state that it needs.
+/// \param log A string that contains the message. Will be reused after
+/// the logger returns.
+typedef void (*dlog_t)(void *arg, const char *log);
+
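A minimal sketch of these two callbacks, assuming the instruction bytes sit in a plain memory buffer; Region, bufferByteReader, and stderrLogger are invented names for illustration, not part of this patch:

#include <cstdint>
#include <cstdio>

struct Region {
  const uint8_t *Bytes; // the raw instruction bytes
  uint64_t Base;        // address of Bytes[0] in the instruction's space
  uint64_t Size;        // number of valid bytes
};

static int bufferByteReader(const void *Arg, uint8_t *Byte, uint64_t Address) {
  const Region *R = static_cast<const Region *>(Arg);
  if (Address - R->Base >= R->Size)
    return -1; // unreadable; the decoder treats this as failure
  *Byte = R->Bytes[Address - R->Base];
  return 0;
}

static void stderrLogger(void *, const char *Log) {
  fprintf(stderr, "x86 decoder: %s\n", Log); // Log is reused after we return
}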
+/// The specification for how to extract and interpret a full instruction and
+/// its operands.
+struct InstructionSpecifier {
+ uint16_t operands;
+};
+
+/// The x86 internal instruction, which is produced by the decoder.
struct InternalInstruction {
- /* Reader interface (C) */
+ // Reader interface (C)
byteReader_t reader;
- /* Opaque value passed to the reader */
+ // Opaque value passed to the reader
const void* readerArg;
- /* The address of the next byte to read via the reader */
+ // The address of the next byte to read via the reader
uint64_t readerCursor;
- /* Logger interface (C) */
+ // Logger interface (C)
dlog_t dlog;
- /* Opaque value passed to the logger */
+ // Opaque value passed to the logger
void* dlogArg;
- /* General instruction information */
+ // General instruction information
- /* The mode to disassemble for (64-bit, protected, real) */
+ // The mode to disassemble for (64-bit, protected, real)
DisassemblerMode mode;
- /* The start of the instruction, usable with the reader */
+ // The start of the instruction, usable with the reader
uint64_t startLocation;
- /* The length of the instruction, in bytes */
+ // The length of the instruction, in bytes
size_t length;
- /* Prefix state */
+ // Prefix state
- /* 1 if the prefix byte corresponding to the entry is present; 0 if not */
+ // 1 if the prefix byte corresponding to the entry is present; 0 if not
uint8_t prefixPresent[0x100];
- /* contains the location (for use with the reader) of the prefix byte */
+  // Contains the location (for use with the reader) of the prefix byte
uint64_t prefixLocations[0x100];
- /* The value of the vector extension prefix(EVEX/VEX/XOP), if present */
+  // The value of the vector extension prefix (EVEX/VEX/XOP), if present
uint8_t vectorExtensionPrefix[4];
- /* The type of the vector extension prefix */
+ // The type of the vector extension prefix
VectorExtensionType vectorExtensionType;
- /* The value of the REX prefix, if present */
+ // The value of the REX prefix, if present
uint8_t rexPrefix;
- /* The location where a mandatory prefix would have to be (i.e., right before
- the opcode, or right before the REX prefix if one is present) */
+ // The location where a mandatory prefix would have to be (i.e., right before
+ // the opcode, or right before the REX prefix if one is present).
uint64_t necessaryPrefixLocation;
- /* The segment override type */
+ // The segment override type
SegmentOverride segmentOverride;
- /* 1 if the prefix byte, 0xf2 or 0xf3 is xacquire or xrelease */
- BOOL xAcquireRelease;
+  // True if the prefix byte (0xf2 or 0xf3) is xacquire or xrelease
+ bool xAcquireRelease;
- /* Sizes of various critical pieces of data, in bytes */
+ // Sizes of various critical pieces of data, in bytes
uint8_t registerSize;
uint8_t addressSize;
uint8_t displacementSize;
uint8_t immediateSize;
- /* Offsets from the start of the instruction to the pieces of data, which is
- needed to find relocation entries for adding symbolic operands */
+  // Offsets from the start of the instruction to the pieces of data, which are
+  // needed to find relocation entries for adding symbolic operands.
uint8_t displacementOffset;
uint8_t immediateOffset;
- /* opcode state */
+ // opcode state
- /* The last byte of the opcode, not counting any ModR/M extension */
+ // The last byte of the opcode, not counting any ModR/M extension
uint8_t opcode;
- /* The ModR/M byte of the instruction, if it is an opcode extension */
+ // The ModR/M byte of the instruction, if it is an opcode extension
uint8_t modRMExtension;
- /* decode state */
+ // decode state
- /* The type of opcode, used for indexing into the array of decode tables */
+ // The type of opcode, used for indexing into the array of decode tables
OpcodeType opcodeType;
- /* The instruction ID, extracted from the decode table */
+ // The instruction ID, extracted from the decode table
uint16_t instructionID;
- /* The specifier for the instruction, from the instruction info table */
- const struct InstructionSpecifier *spec;
+ // The specifier for the instruction, from the instruction info table
+ const InstructionSpecifier *spec;
- /* state for additional bytes, consumed during operand decode. Pattern:
- consumed___ indicates that the byte was already consumed and does not
- need to be consumed again */
+ // state for additional bytes, consumed during operand decode. Pattern:
+ // consumed___ indicates that the byte was already consumed and does not
+ // need to be consumed again.
- /* The VEX.vvvv field, which contains a third register operand for some AVX
- instructions */
+ // The VEX.vvvv field, which contains a third register operand for some AVX
+ // instructions.
Reg vvvv;
- /* The writemask for AVX-512 instructions which is contained in EVEX.aaa */
+  // The writemask for AVX-512 instructions, which is contained in EVEX.aaa
Reg writemask;
- /* The ModR/M byte, which contains most register operands and some portion of
- all memory operands */
- BOOL consumedModRM;
+ // The ModR/M byte, which contains most register operands and some portion of
+ // all memory operands.
+ bool consumedModRM;
uint8_t modRM;
- /* The SIB byte, used for more complex 32- or 64-bit memory operands */
- BOOL consumedSIB;
+ // The SIB byte, used for more complex 32- or 64-bit memory operands
+ bool consumedSIB;
uint8_t sib;
- /* The displacement, used for memory operands */
- BOOL consumedDisplacement;
+ // The displacement, used for memory operands
+ bool consumedDisplacement;
int32_t displacement;
- /* Immediates. There can be two in some cases */
+ // Immediates. There can be two in some cases
uint8_t numImmediatesConsumed;
uint8_t numImmediatesTranslated;
uint64_t immediates[2];
- /* A register or immediate operand encoded into the opcode */
+ // A register or immediate operand encoded into the opcode
Reg opcodeRegister;
- /* Portions of the ModR/M byte */
+ // Portions of the ModR/M byte
- /* These fields determine the allowable values for the ModR/M fields, which
- depend on operand and address widths */
+ // These fields determine the allowable values for the ModR/M fields, which
+ // depend on operand and address widths.
EABase eaBaseBase;
EABase eaRegBase;
Reg regBase;
- /* The Mod and R/M fields can encode a base for an effective address, or a
- register. These are separated into two fields here */
+ // The Mod and R/M fields can encode a base for an effective address, or a
+ // register. These are separated into two fields here.
EABase eaBase;
EADisplacement eaDisplacement;
- /* The reg field always encodes a register */
+ // The reg field always encodes a register
Reg reg;
- /* SIB state */
+ // SIB state
SIBIndex sibIndex;
uint8_t sibScale;
SIBBase sibBase;
- const struct OperandSpecifier *operands;
+ ArrayRef<OperandSpecifier> operands;
};
-/* decodeInstruction - Decode one instruction and store the decoding results in
- * a buffer provided by the consumer.
- * @param insn - The buffer to store the instruction in. Allocated by the
- * consumer.
- * @param reader - The byteReader_t for the bytes to be read.
- * @param readerArg - An argument to pass to the reader for storing context
- * specific to the consumer. May be NULL.
- * @param logger - The dlog_t to be used in printing status messages from the
- * disassembler. May be NULL.
- * @param loggerArg - An argument to pass to the logger for storing context
- * specific to the logger. May be NULL.
- * @param startLoc - The address (in the reader's address space) of the first
- * byte in the instruction.
- * @param mode - The mode (16-bit, 32-bit, 64-bit) to decode in.
- * @return - Nonzero if there was an error during decode, 0 otherwise.
- */
-int decodeInstruction(struct InternalInstruction* insn,
+/// \brief Decode one instruction and store the decoding results in
+/// a buffer provided by the consumer.
+/// \param insn The buffer to store the instruction in. Allocated by the
+/// consumer.
+/// \param reader The byteReader_t for the bytes to be read.
+/// \param readerArg An argument to pass to the reader for storing context
+/// specific to the consumer. May be NULL.
+/// \param logger The dlog_t to be used in printing status messages from the
+/// disassembler. May be NULL.
+/// \param loggerArg An argument to pass to the logger for storing context
+/// specific to the logger. May be NULL.
+/// \param startLoc The address (in the reader's address space) of the first
+/// byte in the instruction.
+/// \param mode The mode (16-bit, 32-bit, 64-bit) to decode in.
+/// \return Nonzero if there was an error during decode, 0 otherwise.
+int decodeInstruction(InternalInstruction *insn,
byteReader_t reader,
- const void* readerArg,
+ const void *readerArg,
dlog_t logger,
- void* loggerArg,
- const void* miiArg,
+ void *loggerArg,
+ const void *miiArg,
uint64_t startLoc,
DisassemblerMode mode);
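A hedged sketch of a call, reusing the hypothetical Region, bufferByteReader, and stderrLogger helpers sketched earlier; passing a null miiArg is assumed to be acceptable when instruction names are never queried:

static void decodeExample(const uint8_t *Code, uint64_t CodeSize) {
  InternalInstruction Insn;
  Region R = {Code, /*Base=*/0x1000, CodeSize};
  if (decodeInstruction(&Insn, bufferByteReader, &R, stderrLogger,
                        /*loggerArg=*/nullptr, /*miiArg=*/nullptr,
                        /*startLoc=*/0x1000, MODE_64BIT) == 0)
    fprintf(stderr, "decoded %u byte(s), opcode 0x%02x\n",
            (unsigned)Insn.length, (unsigned)Insn.opcode);
}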
-/* x86DisassemblerDebug - C-accessible function for printing a message to
- * debugs()
- * @param file - The name of the file printing the debug message.
- * @param line - The line number that printed the debug message.
- * @param s - The message to print.
- */
+/// \brief Print a message to dbgs().
+/// \param file The name of the file printing the debug message.
+/// \param line The line number that printed the debug message.
+/// \param s The message to print.
+void Debug(const char *file, unsigned line, const char *s);
-void x86DisassemblerDebug(const char *file,
- unsigned line,
- const char *s);
+const char *GetInstrName(unsigned Opcode, const void *mii);
-const char *x86DisassemblerGetInstrName(unsigned Opcode, const void *mii);
-
-#ifdef __cplusplus
-}
-#endif
+} // namespace X86Disassembler
+} // namespace llvm
#endif
diff --git a/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h b/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h
index 523ae99..f59e0b6 100644
--- a/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h
+++ b/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h
@@ -1,29 +1,27 @@
-/*===-- X86DisassemblerDecoderCommon.h - Disassembler decoder -----*- C -*-===*
- *
- * The LLVM Compiler Infrastructure
- *
- * This file is distributed under the University of Illinois Open Source
- * License. See LICENSE.TXT for details.
- *
- *===----------------------------------------------------------------------===*
- *
- * This file is part of the X86 Disassembler.
- * It contains common definitions used by both the disassembler and the table
- * generator.
- * Documentation for the disassembler can be found in X86Disassembler.h.
- *
- *===----------------------------------------------------------------------===*/
-
-/*
- * This header file provides those definitions that need to be shared between
- * the decoder and the table generator in a C-friendly manner.
- */
+//===-- X86DisassemblerDecoderCommon.h - Disassembler decoder ---*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is part of the X86 Disassembler.
+// It contains common definitions used by both the disassembler and the table
+// generator.
+// Documentation for the disassembler can be found in X86Disassembler.h.
+//
+//===----------------------------------------------------------------------===//
#ifndef X86DISASSEMBLERDECODERCOMMON_H
#define X86DISASSEMBLERDECODERCOMMON_H
#include "llvm/Support/DataTypes.h"
+namespace llvm {
+namespace X86Disassembler {
+
#define INSTRUCTIONS_SYM x86DisassemblerInstrSpecifiers
#define CONTEXTS_SYM x86DisassemblerContexts
#define ONEBYTE_SYM x86DisassemblerOneByteOpcodes
@@ -44,11 +42,9 @@
#define XOP9_MAP_STR "x86DisassemblerXOP9Opcodes"
#define XOPA_MAP_STR "x86DisassemblerXOPAOpcodes"
-/*
- * Attributes of an instruction that must be known before the opcode can be
- * processed correctly. Most of these indicate the presence of particular
- * prefixes, but ATTR_64BIT is simply an attribute of the decoding context.
- */
+// Attributes of an instruction that must be known before the opcode can be
+// processed correctly. Most of these indicate the presence of particular
+// prefixes, but ATTR_64BIT is simply an attribute of the decoding context.
#define ATTRIBUTE_BITS \
ENUM_ENTRY(ATTR_NONE, 0x00) \
ENUM_ENTRY(ATTR_64BIT, (0x1 << 0)) \
@@ -73,13 +69,11 @@ enum attributeBits {
};
#undef ENUM_ENTRY
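To make the lookup concrete: the attribute bits are OR'd into an index for the CONTEXTS_SYM table, which collapses every combination to one of the InstructionContext classes defined just below. A sketch; ATTR_REXW and ATTR_OPSIZE are further ATTRIBUTE_BITS entries that this hunk elides:

// E.g. a 64-bit instruction carrying REX.W and the 0x66 operand-size prefix:
uint16_t attrMask = ATTR_64BIT | ATTR_REXW | ATTR_OPSIZE;
InstructionContext IC =
    static_cast<InstructionContext>(CONTEXTS_SYM[attrMask]);
// Only IC_max distinct contexts exist, not 2^(number of attribute bits).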
-/*
- * Combinations of the above attributes that are relevant to instruction
- * decode. Although other combinations are possible, they can be reduced to
- * these without affecting the ultimately decoded instruction.
- */
+// Combinations of the above attributes that are relevant to instruction
+// decode. Although other combinations are possible, they can be reduced to
+// these without affecting the ultimately decoded instruction.
-/* Class name Rank Rationale for rank assignment */
+// Class name Rank Rationale for rank assignment
#define INSTRUCTION_CONTEXTS \
ENUM_ENTRY(IC, 0, "says nothing about the instruction") \
ENUM_ENTRY(IC_64BIT, 1, "says the instruction applies in " \
@@ -274,17 +268,15 @@ enum attributeBits {
ENUM_ENTRY(IC_EVEX_L2_W_OPSIZE_KZ, 4, "requires EVEX_KZ, L2, W and OpSize")
#define ENUM_ENTRY(n, r, d) n,
-typedef enum {
+enum InstructionContext {
INSTRUCTION_CONTEXTS
IC_max
-} InstructionContext;
+};
#undef ENUM_ENTRY
-/*
- * Opcode types, which determine which decode table to use, both in the Intel
- * manual and also for the decoder.
- */
-typedef enum {
+// Opcode types, which determine which decode table to use, both in the Intel
+// manual and also for the decoder.
+enum OpcodeType {
ONEBYTE = 0,
TWOBYTE = 1,
THREEBYTE_38 = 2,
@@ -292,39 +284,33 @@ typedef enum {
XOP8_MAP = 4,
XOP9_MAP = 5,
XOPA_MAP = 6
-} OpcodeType;
-
-/*
- * The following structs are used for the hierarchical decode table. After
- * determining the instruction's class (i.e., which IC_* constant applies to
- * it), the decoder reads the opcode. Some instructions require specific
- * values of the ModR/M byte, so the ModR/M byte indexes into the final table.
- *
- * If a ModR/M byte is not required, "required" is left unset, and the values
- * for each instructionID are identical.
- */
+};
+// The following structs are used for the hierarchical decode table. After
+// determining the instruction's class (i.e., which IC_* constant applies to
+// it), the decoder reads the opcode. Some instructions require specific
+// values of the ModR/M byte, so the ModR/M byte indexes into the final table.
+//
+// If a ModR/M byte is not required, "required" is left unset, and the values
+// for each instructionID are identical.
typedef uint16_t InstrUID;
-/*
- * ModRMDecisionType - describes the type of ModR/M decision, allowing the
- * consumer to determine the number of entries in it.
- *
- * MODRM_ONEENTRY - No matter what the value of the ModR/M byte is, the decoded
- * instruction is the same.
- * MODRM_SPLITRM - If the ModR/M byte is between 0x00 and 0xbf, the opcode
- * corresponds to one instruction; otherwise, it corresponds to
- * a different instruction.
- * MODRM_SPLITMISC- If the ModR/M byte is between 0x00 and 0xbf, ModR/M byte
- * divided by 8 is used to select instruction; otherwise, each
- * value of the ModR/M byte could correspond to a different
- * instruction.
- * MODRM_SPLITREG - ModR/M byte divided by 8 is used to select instruction. This
- corresponds to instructions that use reg field as opcode
- * MODRM_FULL - Potentially, each value of the ModR/M byte could correspond
- * to a different instruction.
- */
-
+// ModRMDecisionType - describes the type of ModR/M decision, allowing the
+// consumer to determine the number of entries in it.
+//
+// MODRM_ONEENTRY - No matter what the value of the ModR/M byte is, the decoded
+// instruction is the same.
+// MODRM_SPLITRM - If the ModR/M byte is between 0x00 and 0xbf, the opcode
+// corresponds to one instruction; otherwise, it corresponds to
+// a different instruction.
+// MODRM_SPLITMISC - If the ModR/M byte is between 0x00 and 0xbf, the ModR/M
+//                  byte divided by 8 selects the instruction; otherwise, each
+//                  value of the ModR/M byte could correspond to a different
+//                  instruction.
+// MODRM_SPLITREG - The ModR/M byte divided by 8 selects the instruction;
+//                  such instructions use the reg field as an opcode extension.
+// MODRM_FULL - Potentially, each value of the ModR/M byte could correspond
+// to a different instruction.
#define MODRMTYPES \
ENUM_ENTRY(MODRM_ONEENTRY) \
ENUM_ENTRY(MODRM_SPLITRM) \
@@ -333,47 +319,13 @@ typedef uint16_t InstrUID;
ENUM_ENTRY(MODRM_FULL)
#define ENUM_ENTRY(n) n,
-typedef enum {
+enum ModRMDecisionType {
MODRMTYPES
MODRM_max
-} ModRMDecisionType;
-#undef ENUM_ENTRY
-
-/*
- * ModRMDecision - Specifies whether a ModR/M byte is needed and (if so) which
- * instruction each possible value of the ModR/M byte corresponds to. Once
- * this information is known, we have narrowed down to a single instruction.
- */
-struct ModRMDecision {
- uint8_t modrm_type;
-
- /* The macro below must be defined wherever this file is included. */
- INSTRUCTION_IDS
-};
-
-/*
- * OpcodeDecision - Specifies which set of ModR/M->instruction tables to look at
- * given a particular opcode.
- */
-struct OpcodeDecision {
- struct ModRMDecision modRMDecisions[256];
-};
-
-/*
- * ContextDecision - Specifies which opcode->instruction tables to look at given
- * a particular context (set of attributes). Since there are many possible
- * contexts, the decoder first uses CONTEXTS_SYM to determine which context
- * applies given a specific set of attributes. Hence there are only IC_max
- * entries in this table, rather than 2^(ATTR_max).
- */
-struct ContextDecision {
- struct OpcodeDecision opcodeDecisions[IC_max];
};
+#undef ENUM_ENTRY
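How a ModR/M byte selects an entry under each decision type can be restated as an index computation. The sketch below follows the comment above; the +8 packing offsets for the register forms are assumptions about table layout, not code from this patch:

static unsigned modRMTableIndex(ModRMDecisionType Type, uint8_t ModRM) {
  bool RegForm = (ModRM & 0xc0) == 0xc0; // mod field == 0b11
  switch (Type) {
  case MODRM_ONEENTRY:
    return 0;                                        // ModR/M is irrelevant
  case MODRM_SPLITRM:
    return RegForm ? 1 : 0;                          // two entries total
  case MODRM_SPLITREG:
    return ((ModRM & 0x38) >> 3) + (RegForm ? 8 : 0);
  case MODRM_SPLITMISC:
    return RegForm ? (ModRM & 0x3f) + 8 : (ModRM & 0x38) >> 3;
  case MODRM_FULL:
    return ModRM;                                    // 256 entries
  default:
    return 0;                                        // MODRM_max: unreachable
  }
}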
-/*
- * Physical encodings of instruction operands.
- */
-
+// Physical encodings of instruction operands.
#define ENCODINGS \
ENUM_ENTRY(ENCODING_NONE, "") \
ENUM_ENTRY(ENCODING_REG, "Register operand in ModR/M byte.") \
@@ -408,16 +360,13 @@ struct ContextDecision {
ENUM_ENTRY(ENCODING_DI, "Destination index; encoded in prefixes")
#define ENUM_ENTRY(n, d) n,
- typedef enum {
- ENCODINGS
- ENCODING_max
- } OperandEncoding;
+enum OperandEncoding {
+ ENCODINGS
+ ENCODING_max
+};
#undef ENUM_ENTRY
-/*
- * Semantic interpretations of instruction operands.
- */
-
+// Semantic interpretations of instruction operands.
#define TYPES \
ENUM_ENTRY(TYPE_NONE, "") \
ENUM_ENTRY(TYPE_REL8, "1-byte immediate address") \
@@ -508,56 +457,42 @@ struct ContextDecision {
ENUM_ENTRY(TYPE_M512, "512-bit FPU/MMX/XMM/MXCSR state")
#define ENUM_ENTRY(n, d) n,
-typedef enum {
+enum OperandType {
TYPES
TYPE_max
-} OperandType;
+};
#undef ENUM_ENTRY
-/*
- * OperandSpecifier - The specification for how to extract and interpret one
- * operand.
- */
+/// \brief The specification for how to extract and interpret one operand.
struct OperandSpecifier {
uint8_t encoding;
uint8_t type;
};
-/*
- * Indicates where the opcode modifier (if any) is to be found. Extended
- * opcodes with AddRegFrm have the opcode modifier in the ModR/M byte.
- */
-
+// Indicates where the opcode modifier (if any) is to be found. Extended
+// opcodes with AddRegFrm have the opcode modifier in the ModR/M byte.
#define MODIFIER_TYPES \
ENUM_ENTRY(MODIFIER_NONE)
#define ENUM_ENTRY(n) n,
-typedef enum {
+enum ModifierType {
MODIFIER_TYPES
MODIFIER_max
-} ModifierType;
+};
#undef ENUM_ENTRY
-#define X86_MAX_OPERANDS 5
-
-/*
- * The specification for how to extract and interpret a full instruction and
- * its operands.
- */
-struct InstructionSpecifier {
- /* The macro below must be defined wherever this file is included. */
- INSTRUCTION_SPECIFIER_FIELDS
-};
+static const unsigned X86_MAX_OPERANDS = 5;
-/*
- * Decoding mode for the Intel disassembler. 16-bit, 32-bit, and 64-bit mode
- * are supported, and represent real mode, IA-32e, and IA-32e in 64-bit mode,
- * respectively.
- */
-typedef enum {
+/// Decoding mode for the Intel disassembler. 16-bit, 32-bit, and 64-bit mode
+/// are supported, and represent real mode, IA-32e, and IA-32e in 64-bit mode,
+/// respectively.
+enum DisassemblerMode {
MODE_16BIT,
MODE_32BIT,
MODE_64BIT
-} DisassemblerMode;
+};
+
+} // namespace X86Disassembler
+} // namespace llvm
#endif
diff --git a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp
index eea0a76..b45b118 100644
--- a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp
+++ b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp
@@ -12,7 +12,6 @@
//
//===----------------------------------------------------------------------===//
-#define DEBUG_TYPE "asm-printer"
#include "X86ATTInstPrinter.h"
#include "MCTargetDesc/X86BaseInfo.h"
#include "MCTargetDesc/X86MCTargetDesc.h"
@@ -28,6 +27,8 @@
#include <map>
using namespace llvm;
+#define DEBUG_TYPE "asm-printer"
+
// Include the auto-generated portion of the assembly writer.
#define PRINT_ALIAS_INSTR
#include "X86GenAsmWriter.inc"
diff --git a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h
index f34e633..531183b 100644
--- a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h
+++ b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h
@@ -32,6 +32,8 @@ public:
// Autogenerated by tblgen, returns true if we successfully printed an
// alias.
bool printAliasInstr(const MCInst *MI, raw_ostream &OS);
+ void printCustomAliasOperand(const MCInst *MI, unsigned OpIdx,
+ unsigned PrintMethodIdx, raw_ostream &O);
// Autogenerated by tblgen.
void printInstruction(const MCInst *MI, raw_ostream &OS);
diff --git a/lib/Target/X86/InstPrinter/X86InstComments.cpp b/lib/Target/X86/InstPrinter/X86InstComments.cpp
index db61fb0..baf6507 100644
--- a/lib/Target/X86/InstPrinter/X86InstComments.cpp
+++ b/lib/Target/X86/InstPrinter/X86InstComments.cpp
@@ -32,7 +32,7 @@ void llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
const char *(*getRegName)(unsigned)) {
// If this is a shuffle operation, the switch should fill in this state.
SmallVector<int, 8> ShuffleMask;
- const char *DestName = 0, *Src1Name = 0, *Src2Name = 0;
+ const char *DestName = nullptr, *Src1Name = nullptr, *Src2Name = nullptr;
switch (MI->getOpcode()) {
case X86::INSERTPSrr:
@@ -492,7 +492,7 @@ void llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
// If this was a shuffle operation, print the shuffle mask.
if (!ShuffleMask.empty()) {
- if (DestName == 0) DestName = Src1Name;
+ if (!DestName) DestName = Src1Name;
OS << (DestName ? DestName : "mem") << " = ";
// If the two sources are the same, canonicalize the input elements to be
diff --git a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp b/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp
index 1c95d37..1c8466b 100644
--- a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp
+++ b/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp
@@ -12,7 +12,6 @@
//
//===----------------------------------------------------------------------===//
-#define DEBUG_TYPE "asm-printer"
#include "X86IntelInstPrinter.h"
#include "MCTargetDesc/X86BaseInfo.h"
#include "MCTargetDesc/X86MCTargetDesc.h"
@@ -25,6 +24,8 @@
#include <cctype>
using namespace llvm;
+#define DEBUG_TYPE "asm-printer"
+
#include "X86GenAsmWriter1.inc"
void X86IntelInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const {
diff --git a/lib/Target/X86/MCTargetDesc/Android.mk b/lib/Target/X86/MCTargetDesc/Android.mk
index ee37c27..a3c9bc8 100644
--- a/lib/Target/X86/MCTargetDesc/Android.mk
+++ b/lib/Target/X86/MCTargetDesc/Android.mk
@@ -14,7 +14,8 @@ x86_mc_desc_SRC_FILES := \
X86MCCodeEmitter.cpp \
X86MachORelocationInfo.cpp \
X86MachObjectWriter.cpp \
- X86WinCOFFObjectWriter.cpp
+ X86WinCOFFObjectWriter.cpp \
+ X86WinCOFFStreamer.cpp
# For the host
# =====================================================
diff --git a/lib/Target/X86/MCTargetDesc/CMakeLists.txt b/lib/Target/X86/MCTargetDesc/CMakeLists.txt
index 3f5a0e2..129c28d 100644
--- a/lib/Target/X86/MCTargetDesc/CMakeLists.txt
+++ b/lib/Target/X86/MCTargetDesc/CMakeLists.txt
@@ -5,6 +5,7 @@ add_llvm_library(LLVMX86Desc
X86MCCodeEmitter.cpp
X86MachObjectWriter.cpp
X86ELFObjectWriter.cpp
+ X86WinCOFFStreamer.cpp
X86WinCOFFObjectWriter.cpp
X86MachORelocationInfo.cpp
X86ELFRelocationInfo.cpp
diff --git a/lib/Target/X86/MCTargetDesc/LLVMBuild.txt b/lib/Target/X86/MCTargetDesc/LLVMBuild.txt
index 9e1d29c..146d111 100644
--- a/lib/Target/X86/MCTargetDesc/LLVMBuild.txt
+++ b/lib/Target/X86/MCTargetDesc/LLVMBuild.txt
@@ -19,5 +19,5 @@
type = Library
name = X86Desc
parent = X86
-required_libraries = MC Support X86AsmPrinter X86Info
+required_libraries = MC Object Support X86AsmPrinter X86Info
add_to_library_groups = X86
diff --git a/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
index 23763f7..bf30a8e 100644
--- a/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
@@ -37,23 +37,29 @@ MCDisableArithRelaxation("mc-x86-disable-arith-relaxation",
static unsigned getFixupKindLog2Size(unsigned Kind) {
switch (Kind) {
- default: llvm_unreachable("invalid fixup kind!");
+ default:
+ llvm_unreachable("invalid fixup kind!");
case FK_PCRel_1:
case FK_SecRel_1:
- case FK_Data_1: return 0;
+ case FK_Data_1:
+ return 0;
case FK_PCRel_2:
case FK_SecRel_2:
- case FK_Data_2: return 1;
+ case FK_Data_2:
+ return 1;
case FK_PCRel_4:
case X86::reloc_riprel_4byte:
case X86::reloc_riprel_4byte_movq_load:
case X86::reloc_signed_4byte:
case X86::reloc_global_offset_table:
case FK_SecRel_4:
- case FK_Data_4: return 2;
+ case FK_Data_4:
+ return 2;
case FK_PCRel_8:
case FK_SecRel_8:
- case FK_Data_8: return 3;
+ case FK_Data_8:
+ case X86::reloc_global_offset_table8:
+ return 3;
}
}
diff --git a/lib/Target/X86/MCTargetDesc/X86BaseInfo.h b/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
index 38fab15..6aeb1f2 100644
--- a/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
+++ b/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
@@ -643,6 +643,10 @@ namespace X86II {
/// counted as one operand.
///
inline int getMemoryOperandNo(uint64_t TSFlags, unsigned Opcode) {
+ bool HasVEX_4V = (TSFlags >> X86II::VEXShift) & X86II::VEX_4V;
+ bool HasMemOp4 = (TSFlags >> X86II::VEXShift) & X86II::MemOp4;
+ bool HasEVEX_K = ((TSFlags >> X86II::VEXShift) & X86II::EVEX_K);
+
switch (TSFlags & X86II::FormMask) {
default: llvm_unreachable("Unknown FormMask value in getMemoryOperandNo!");
case X86II::Pseudo:
@@ -660,9 +664,6 @@ namespace X86II {
case X86II::MRMDestMem:
return 0;
case X86II::MRMSrcMem: {
- bool HasVEX_4V = (TSFlags >> X86II::VEXShift) & X86II::VEX_4V;
- bool HasMemOp4 = (TSFlags >> X86II::VEXShift) & X86II::MemOp4;
- bool HasEVEX_K = ((TSFlags >> X86II::VEXShift) & X86II::EVEX_K);
unsigned FirstMemOp = 1;
if (HasVEX_4V)
++FirstMemOp;// Skip the register source (which is encoded in VEX_VVVV).
@@ -690,6 +691,8 @@ namespace X86II {
unsigned FirstMemOp = 0;
if (HasVEX_4V)
++FirstMemOp;// Skip the register dest (which is encoded in VEX_VVVV).
+ if (HasEVEX_K)
+ ++FirstMemOp;// Skip the mask register
return FirstMemOp;
}
case X86II::MRM_C0: case X86II::MRM_C1: case X86II::MRM_C2:
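The effect of hoisting those flags is easiest to see in the MRMSrcMem case: each implicit register operand that precedes the memory reference pushes it back one slot. A condensed restatement; the MemOp4 line mirrors the VEX_4V pattern and is an assumption about lines this hunk elides:

unsigned FirstMemOp = 1;  // slot 0 holds the register destination
if (HasVEX_4V)
  ++FirstMemOp;           // skip the source register encoded in VEX_VVVV
if (HasEVEX_K)
  ++FirstMemOp;           // skip the EVEX.aaa writemask register
if (HasMemOp4)
  ++FirstMemOp;           // skip the register encoded in the immediate byte
return FirstMemOp;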
diff --git a/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp b/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp
index c44d88d..3fdec87 100644
--- a/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp
@@ -43,7 +43,7 @@ unsigned X86ELFObjectWriter::GetRelocType(const MCValue &Target,
bool IsPCRel) const {
// determine the type of the relocation
- MCSymbolRefExpr::VariantKind Modifier = Fixup.getAccessVariant();
+ MCSymbolRefExpr::VariantKind Modifier = Target.getAccessVariant();
unsigned Type;
if (getEMachine() == ELF::EM_X86_64) {
if (IsPCRel) {
@@ -98,6 +98,12 @@ unsigned X86ELFObjectWriter::GetRelocType(const MCValue &Target,
} else {
switch ((unsigned)Fixup.getKind()) {
default: llvm_unreachable("invalid fixup kind!");
+ case X86::reloc_global_offset_table8:
+ Type = ELF::R_X86_64_GOTPC64;
+ break;
+ case X86::reloc_global_offset_table:
+ Type = ELF::R_X86_64_GOTPC32;
+ break;
case FK_Data_8:
switch (Modifier) {
default:
diff --git a/lib/Target/X86/MCTargetDesc/X86ELFRelocationInfo.cpp b/lib/Target/X86/MCTargetDesc/X86ELFRelocationInfo.cpp
index 4fa519c..b679316 100644
--- a/lib/Target/X86/MCTargetDesc/X86ELFRelocationInfo.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86ELFRelocationInfo.cpp
@@ -39,7 +39,7 @@ public:
if (Sym->isVariable() == false)
Sym->setVariableValue(MCConstantExpr::Create(SymAddr, Ctx));
- const MCExpr *Expr = 0;
+ const MCExpr *Expr = nullptr;
// If hasAddend is true, then we need to add Addend (r_addend) to Expr.
bool hasAddend = false;
diff --git a/lib/Target/X86/MCTargetDesc/X86FixupKinds.h b/lib/Target/X86/MCTargetDesc/X86FixupKinds.h
index f2e34cb..09396b7 100644
--- a/lib/Target/X86/MCTargetDesc/X86FixupKinds.h
+++ b/lib/Target/X86/MCTargetDesc/X86FixupKinds.h
@@ -23,6 +23,7 @@ enum Fixups {
reloc_global_offset_table, // 32-bit, relative to the start
// of the instruction. Used only
// for _GLOBAL_OFFSET_TABLE_.
+ reloc_global_offset_table8, // 64-bit variant.
// Marker
LastTargetFixupKind,
NumTargetFixupKinds = LastTargetFixupKind - FirstTargetFixupKind
diff --git a/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp b/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp
index 6561804..39480ea 100644
--- a/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp
@@ -51,7 +51,7 @@ X86MCAsmInfoDarwin::X86MCAsmInfoDarwin(const Triple &T) {
TextAlignFillValue = 0x90;
if (!is64Bit)
- Data64bitsDirective = 0; // we can't emit a 64-bit unit
+ Data64bitsDirective = nullptr; // we can't emit a 64-bit unit
// Use ## as a comment string so that .s files generated by llvm can go
// through the GCC preprocessor without causing an error. This is needed
@@ -115,7 +115,7 @@ X86ELFMCAsmInfo::X86ELFMCAsmInfo(const Triple &T) {
// into two .words.
if ((T.getOS() == Triple::OpenBSD || T.getOS() == Triple::Bitrig) &&
T.getArch() == Triple::x86)
- Data64bitsDirective = 0;
+ Data64bitsDirective = nullptr;
// Always enable the integrated assembler by default.
// Clang also enabled it when the OS is Solaris but that is redundant here.
@@ -157,8 +157,10 @@ X86MCAsmInfoMicrosoft::X86MCAsmInfoMicrosoft(const Triple &Triple) {
void X86MCAsmInfoGNUCOFF::anchor() { }
X86MCAsmInfoGNUCOFF::X86MCAsmInfoGNUCOFF(const Triple &Triple) {
- if (Triple.getArch() == Triple::x86_64)
+ if (Triple.getArch() == Triple::x86_64) {
PrivateGlobalPrefix = ".L";
+ PointerSize = 8;
+ }
AssemblerDialect = AsmWriterFlavor;
diff --git a/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp b/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
index e6fb037..2152b21 100644
--- a/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
@@ -11,7 +11,6 @@
//
//===----------------------------------------------------------------------===//
-#define DEBUG_TYPE "mccodeemitter"
#include "MCTargetDesc/X86MCTargetDesc.h"
#include "MCTargetDesc/X86BaseInfo.h"
#include "MCTargetDesc/X86FixupKinds.h"
@@ -27,6 +26,8 @@
using namespace llvm;
+#define DEBUG_TYPE "mccodeemitter"
+
namespace {
class X86MCCodeEmitter : public MCCodeEmitter {
X86MCCodeEmitter(const X86MCCodeEmitter &) LLVM_DELETED_FUNCTION;
@@ -285,7 +286,7 @@ enum GlobalOffsetTableExprKind {
};
static GlobalOffsetTableExprKind
StartsWithGlobalOffsetTable(const MCExpr *Expr) {
- const MCExpr *RHS = 0;
+ const MCExpr *RHS = nullptr;
if (Expr->getKind() == MCExpr::Binary) {
const MCBinaryExpr *BE = static_cast<const MCBinaryExpr *>(Expr);
Expr = BE->getLHS();
@@ -316,7 +317,7 @@ void X86MCCodeEmitter::
EmitImmediate(const MCOperand &DispOp, SMLoc Loc, unsigned Size,
MCFixupKind FixupKind, unsigned &CurByte, raw_ostream &OS,
SmallVectorImpl<MCFixup> &Fixups, int ImmOffset) const {
- const MCExpr *Expr = NULL;
+ const MCExpr *Expr = nullptr;
if (DispOp.isImm()) {
// If this is a simple integer displacement that doesn't require a
// relocation, emit it now.
@@ -339,7 +340,13 @@ EmitImmediate(const MCOperand &DispOp, SMLoc Loc, unsigned Size,
if (Kind != GOT_None) {
assert(ImmOffset == 0);
- FixupKind = MCFixupKind(X86::reloc_global_offset_table);
+ if (Size == 8) {
+ FixupKind = MCFixupKind(X86::reloc_global_offset_table8);
+ } else {
+ assert(Size == 4);
+ FixupKind = MCFixupKind(X86::reloc_global_offset_table);
+ }
+
if (Kind == GOT_Normal)
ImmOffset = CurByte;
} else if (Expr->getKind() == MCExpr::SymbolRef) {
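Tying this to the X86ELFObjectWriter hunk earlier in the patch, the two fixup sizes map to GOT-relative relocations as follows; the movabs example is an assumption about where an 8-byte _GLOBAL_OFFSET_TABLE_ immediate typically arises:

// 4-byte immediate (e.g. lea _GLOBAL_OFFSET_TABLE_(%rip), %rax):
//   X86::reloc_global_offset_table  -> ELF::R_X86_64_GOTPC32
// 8-byte immediate (e.g. movabsq $_GLOBAL_OFFSET_TABLE_, %r11):
//   X86::reloc_global_offset_table8 -> ELF::R_X86_64_GOTPC64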
@@ -1421,6 +1428,8 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS,
case X86II::MRM6r: case X86II::MRM7r: {
if (HasVEX_4V) // Skip the register dst (which is encoded in VEX_VVVV).
++CurOp;
+ if (HasEVEX_K) // Skip writemask
+ ++CurOp;
EmitByte(BaseOpcode, CurByte, OS);
uint64_t Form = TSFlags & X86II::FormMask;
EmitRegModRMByte(MI.getOperand(CurOp++),
@@ -1436,6 +1445,8 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS,
case X86II::MRM6m: case X86II::MRM7m: {
if (HasVEX_4V) // Skip the register dst (which is encoded in VEX_VVVV).
++CurOp;
+ if (HasEVEX_K) // Skip writemask
+ ++CurOp;
EmitByte(BaseOpcode, CurByte, OS);
uint64_t Form = TSFlags & X86II::FormMask;
EmitMemModRMByte(MI, CurOp, (Form == X86II::MRMXm) ? 0 : Form-X86II::MRM0m,
diff --git a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
index 09fdb9c..e63036c 100644
--- a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
@@ -27,6 +27,12 @@
#include "llvm/Support/Host.h"
#include "llvm/Support/TargetRegistry.h"
+#if _MSC_VER
+#include <intrin.h>
+#endif
+
+using namespace llvm;
+
#define GET_REGINFO_MC_DESC
#include "X86GenRegisterInfo.inc"
@@ -36,13 +42,6 @@
#define GET_SUBTARGETINFO_MC_DESC
#include "X86GenSubtargetInfo.inc"
-#if _MSC_VER
-#include <intrin.h>
-#endif
-
-using namespace llvm;
-
-
std::string X86_MC::ParseX86Triple(StringRef TT) {
Triple TheTriple(TT);
std::string FS;
@@ -230,14 +229,8 @@ MCSubtargetInfo *X86_MC::createX86MCSubtargetInfo(StringRef TT, StringRef CPU,
}
std::string CPUName = CPU;
- if (CPUName.empty()) {
-#if defined(i386) || defined(__i386__) || defined(__x86__) || defined(_M_IX86)\
- || defined(__x86_64__) || defined(_M_AMD64) || defined (_M_X64)
- CPUName = sys::getHostCPUName();
-#else
+ if (CPUName.empty())
CPUName = "generic";
-#endif
- }
MCSubtargetInfo *X = new MCSubtargetInfo();
InitX86MCSubtargetInfo(X, TT, CPUName, ArchFS);
@@ -294,13 +287,13 @@ static MCAsmInfo *createX86MCAsmInfo(const MCRegisterInfo &MRI, StringRef TT) {
// Initial state of the frame pointer is esp+stackGrowth.
unsigned StackPtr = is64Bit ? X86::RSP : X86::ESP;
MCCFIInstruction Inst = MCCFIInstruction::createDefCfa(
- 0, MRI.getDwarfRegNum(StackPtr, true), -stackGrowth);
+ nullptr, MRI.getDwarfRegNum(StackPtr, true), -stackGrowth);
MAI->addInitialFrameState(Inst);
// Add return address to move list
unsigned InstPtr = is64Bit ? X86::RIP : X86::EIP;
MCCFIInstruction Inst2 = MCCFIInstruction::createOffset(
- 0, MRI.getDwarfRegNum(InstPtr, true), stackGrowth);
+ nullptr, MRI.getDwarfRegNum(InstPtr, true), stackGrowth);
MAI->addInitialFrameState(Inst2);
return MAI;
@@ -365,13 +358,16 @@ static MCStreamer *createMCStreamer(const Target &T, StringRef TT,
bool NoExecStack) {
Triple TheTriple(TT);
- if (TheTriple.isOSBinFormatMachO())
+ switch (TheTriple.getObjectFormat()) {
+ default: llvm_unreachable("unsupported object format");
+ case Triple::MachO:
return createMachOStreamer(Ctx, MAB, _OS, _Emitter, RelaxAll);
-
- if (TheTriple.isOSWindows() && !TheTriple.isOSBinFormatELF())
- return createWinCOFFStreamer(Ctx, MAB, *_Emitter, _OS, RelaxAll);
-
- return createELFStreamer(Ctx, MAB, _OS, _Emitter, RelaxAll, NoExecStack);
+ case Triple::COFF:
+ assert(TheTriple.isOSWindows() && "only Windows COFF is supported");
+ return createX86WinCOFFStreamer(Ctx, MAB, _Emitter, _OS, RelaxAll);
+ case Triple::ELF:
+ return createELFStreamer(Ctx, MAB, _OS, _Emitter, RelaxAll, NoExecStack);
+ }
}
static MCInstPrinter *createX86MCInstPrinter(const Target &T,
@@ -384,7 +380,7 @@ static MCInstPrinter *createX86MCInstPrinter(const Target &T,
return new X86ATTInstPrinter(MAI, MII, MRI);
if (SyntaxVariant == 1)
return new X86IntelInstPrinter(MAI, MII, MRI);
- return 0;
+ return nullptr;
}
static MCRelocationInfo *createX86MCRelocationInfo(StringRef TT,
diff --git a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h
index 41ae435..8fe40fd 100644
--- a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h
+++ b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h
@@ -26,6 +26,7 @@ class MCObjectWriter;
class MCRegisterInfo;
class MCSubtargetInfo;
class MCRelocationInfo;
+class MCStreamer;
class Target;
class StringRef;
class raw_ostream;
@@ -84,6 +85,14 @@ MCAsmBackend *createX86_32AsmBackend(const Target &T, const MCRegisterInfo &MRI,
MCAsmBackend *createX86_64AsmBackend(const Target &T, const MCRegisterInfo &MRI,
StringRef TT, StringRef CPU);
+/// createX86WinCOFFStreamer - Construct an X86 Windows COFF machine code
+/// streamer which will generate PE/COFF format object files.
+///
+/// Takes ownership of \p AB and \p CE.
+MCStreamer *createX86WinCOFFStreamer(MCContext &C, MCAsmBackend &AB,
+ MCCodeEmitter *CE, raw_ostream &OS,
+ bool RelaxAll);
+
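For reference, this is the call site added in the X86MCTargetDesc.cpp hunk earlier in this patch:

case Triple::COFF:
  assert(TheTriple.isOSWindows() && "only Windows COFF is supported");
  return createX86WinCOFFStreamer(Ctx, MAB, _Emitter, _OS, RelaxAll);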
/// createX86MachObjectWriter - Construct an X86 Mach-O object writer.
MCObjectWriter *createX86MachObjectWriter(raw_ostream &OS,
bool Is64Bit,
diff --git a/lib/Target/X86/MCTargetDesc/X86MachORelocationInfo.cpp b/lib/Target/X86/MCTargetDesc/X86MachORelocationInfo.cpp
index f2023e3..3b81d53 100644
--- a/lib/Target/X86/MCTargetDesc/X86MachORelocationInfo.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86MachORelocationInfo.cpp
@@ -40,7 +40,7 @@ public:
// FIXME: check that the value is actually the same.
if (Sym->isVariable() == false)
Sym->setVariableValue(MCConstantExpr::Create(SymAddr, Ctx));
- const MCExpr *Expr = 0;
+ const MCExpr *Expr = nullptr;
switch(RelType) {
case X86_64_RELOC_TLV:
diff --git a/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp b/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp
index 1a35ced..ead3338 100644
--- a/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp
@@ -146,13 +146,13 @@ void X86MachObjectWriter::RecordX86_64Relocation(MachObjectWriter *Writer,
const MCSymbol *A = &Target.getSymA()->getSymbol();
if (A->isTemporary())
A = &A->AliasedSymbol();
- MCSymbolData &A_SD = Asm.getSymbolData(*A);
+ const MCSymbolData &A_SD = Asm.getSymbolData(*A);
const MCSymbolData *A_Base = Asm.getAtom(&A_SD);
const MCSymbol *B = &Target.getSymB()->getSymbol();
if (B->isTemporary())
B = &B->AliasedSymbol();
- MCSymbolData &B_SD = Asm.getSymbolData(*B);
+ const MCSymbolData &B_SD = Asm.getSymbolData(*B);
const MCSymbolData *B_Base = Asm.getAtom(&B_SD);
// Neither symbol can be modified.
@@ -186,9 +186,9 @@ void X86MachObjectWriter::RecordX86_64Relocation(MachObjectWriter *Writer,
false);
Value += Writer->getSymbolAddress(&A_SD, Layout) -
- (A_Base == NULL ? 0 : Writer->getSymbolAddress(A_Base, Layout));
+ (!A_Base ? 0 : Writer->getSymbolAddress(A_Base, Layout));
Value -= Writer->getSymbolAddress(&B_SD, Layout) -
- (B_Base == NULL ? 0 : Writer->getSymbolAddress(B_Base, Layout));
+ (!B_Base ? 0 : Writer->getSymbolAddress(B_Base, Layout));
if (A_Base) {
Index = A_Base->getIndex();
@@ -220,7 +220,7 @@ void X86MachObjectWriter::RecordX86_64Relocation(MachObjectWriter *Writer,
Type = MachO::X86_64_RELOC_SUBTRACTOR;
} else {
const MCSymbol *Symbol = &Target.getSymA()->getSymbol();
- MCSymbolData &SD = Asm.getSymbolData(*Symbol);
+ const MCSymbolData &SD = Asm.getSymbolData(*Symbol);
const MCSymbolData *Base = Asm.getAtom(&SD);
// Relocations inside debug sections always use local relocations when
@@ -231,7 +231,7 @@ void X86MachObjectWriter::RecordX86_64Relocation(MachObjectWriter *Writer,
const MCSectionMachO &Section = static_cast<const MCSectionMachO&>(
Fragment->getParent()->getSection());
if (Section.hasAttribute(MachO::S_ATTR_DEBUG))
- Base = 0;
+ Base = nullptr;
}
// x86_64 almost always uses external relocations, except when there is no
@@ -369,7 +369,7 @@ bool X86MachObjectWriter::RecordScatteredRelocation(MachObjectWriter *Writer,
// See <reloc.h>.
const MCSymbol *A = &Target.getSymA()->getSymbol();
- MCSymbolData *A_SD = &Asm.getSymbolData(*A);
+ const MCSymbolData *A_SD = &Asm.getSymbolData(*A);
if (!A_SD->getFragment())
report_fatal_error("symbol '" + A->getName() +
@@ -382,7 +382,7 @@ bool X86MachObjectWriter::RecordScatteredRelocation(MachObjectWriter *Writer,
uint32_t Value2 = 0;
if (const MCSymbolRefExpr *B = Target.getSymB()) {
- MCSymbolData *B_SD = &Asm.getSymbolData(B->getSymbol());
+ const MCSymbolData *B_SD = &Asm.getSymbolData(B->getSymbol());
if (!B_SD->getFragment())
report_fatal_error("symbol '" + B->getSymbol().getName() +
@@ -465,7 +465,7 @@ void X86MachObjectWriter::RecordTLVPRelocation(MachObjectWriter *Writer,
unsigned IsPCRel = 0;
// Get the symbol data.
- MCSymbolData *SD_A = &Asm.getSymbolData(Target.getSymA()->getSymbol());
+ const MCSymbolData *SD_A = &Asm.getSymbolData(Target.getSymA()->getSymbol());
unsigned Index = SD_A->getIndex();
// We're only going to have a second symbol in pic mode and it'll be a
@@ -476,7 +476,8 @@ void X86MachObjectWriter::RecordTLVPRelocation(MachObjectWriter *Writer,
// If this is a subtraction then we're pcrel.
uint32_t FixupAddress =
Writer->getFragmentAddress(Fragment, Layout) + Fixup.getOffset();
- MCSymbolData *SD_B = &Asm.getSymbolData(Target.getSymB()->getSymbol());
+ const MCSymbolData *SD_B =
+ &Asm.getSymbolData(Target.getSymB()->getSymbol());
IsPCRel = 1;
FixedValue = (FixupAddress - Writer->getSymbolAddress(SD_B, Layout) +
Target.getConstant());
@@ -524,7 +525,7 @@ void X86MachObjectWriter::RecordX86Relocation(MachObjectWriter *Writer,
}
// Get the symbol data, if any.
- MCSymbolData *SD = 0;
+ const MCSymbolData *SD = nullptr;
if (Target.getSymA())
SD = &Asm.getSymbolData(Target.getSymA()->getSymbol());
diff --git a/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp b/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp
index ffc9e8d..40af822 100644
--- a/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp
@@ -23,10 +23,8 @@ namespace llvm {
namespace {
class X86WinCOFFObjectWriter : public MCWinCOFFObjectTargetWriter {
- const bool Is64Bit;
-
public:
- X86WinCOFFObjectWriter(bool Is64Bit_);
+ X86WinCOFFObjectWriter(bool Is64Bit);
virtual ~X86WinCOFFObjectWriter();
unsigned getRelocType(const MCValue &Target, const MCFixup &Fixup,
@@ -34,10 +32,9 @@ namespace {
};
}
-X86WinCOFFObjectWriter::X86WinCOFFObjectWriter(bool Is64Bit_)
- : MCWinCOFFObjectTargetWriter(Is64Bit_ ? COFF::IMAGE_FILE_MACHINE_AMD64 :
- COFF::IMAGE_FILE_MACHINE_I386),
- Is64Bit(Is64Bit_) {}
+X86WinCOFFObjectWriter::X86WinCOFFObjectWriter(bool Is64Bit)
+ : MCWinCOFFObjectTargetWriter(Is64Bit ? COFF::IMAGE_FILE_MACHINE_AMD64
+ : COFF::IMAGE_FILE_MACHINE_I386) {}
X86WinCOFFObjectWriter::~X86WinCOFFObjectWriter() {}
@@ -49,29 +46,46 @@ unsigned X86WinCOFFObjectWriter::getRelocType(const MCValue &Target,
MCSymbolRefExpr::VariantKind Modifier = Target.isAbsolute() ?
MCSymbolRefExpr::VK_None : Target.getSymA()->getKind();
- switch (FixupKind) {
- case FK_PCRel_4:
- case X86::reloc_riprel_4byte:
- case X86::reloc_riprel_4byte_movq_load:
- return Is64Bit ? COFF::IMAGE_REL_AMD64_REL32 : COFF::IMAGE_REL_I386_REL32;
- case FK_Data_4:
- case X86::reloc_signed_4byte:
- if (Modifier == MCSymbolRefExpr::VK_COFF_IMGREL32)
- return Is64Bit ? COFF::IMAGE_REL_AMD64_ADDR32NB :
- COFF::IMAGE_REL_I386_DIR32NB;
- return Is64Bit ? COFF::IMAGE_REL_AMD64_ADDR32 : COFF::IMAGE_REL_I386_DIR32;
- case FK_Data_8:
- if (Is64Bit)
+ if (getMachine() == COFF::IMAGE_FILE_MACHINE_AMD64) {
+ switch (FixupKind) {
+ case FK_PCRel_4:
+ case X86::reloc_riprel_4byte:
+ case X86::reloc_riprel_4byte_movq_load:
+ return COFF::IMAGE_REL_AMD64_REL32;
+ case FK_Data_4:
+ case X86::reloc_signed_4byte:
+ if (Modifier == MCSymbolRefExpr::VK_COFF_IMGREL32)
+ return COFF::IMAGE_REL_AMD64_ADDR32NB;
+ return COFF::IMAGE_REL_AMD64_ADDR32;
+ case FK_Data_8:
return COFF::IMAGE_REL_AMD64_ADDR64;
- llvm_unreachable("unsupported relocation type");
- case FK_SecRel_2:
- return Is64Bit ? COFF::IMAGE_REL_AMD64_SECTION
- : COFF::IMAGE_REL_I386_SECTION;
- case FK_SecRel_4:
- return Is64Bit ? COFF::IMAGE_REL_AMD64_SECREL : COFF::IMAGE_REL_I386_SECREL;
- default:
- llvm_unreachable("unsupported relocation type");
- }
+ case FK_SecRel_2:
+ return COFF::IMAGE_REL_AMD64_SECTION;
+ case FK_SecRel_4:
+ return COFF::IMAGE_REL_AMD64_SECREL;
+ default:
+ llvm_unreachable("unsupported relocation type");
+ }
+ } else if (getMachine() == COFF::IMAGE_FILE_MACHINE_I386) {
+ switch (FixupKind) {
+ case FK_PCRel_4:
+ case X86::reloc_riprel_4byte:
+ case X86::reloc_riprel_4byte_movq_load:
+ return COFF::IMAGE_REL_I386_REL32;
+ case FK_Data_4:
+ case X86::reloc_signed_4byte:
+ if (Modifier == MCSymbolRefExpr::VK_COFF_IMGREL32)
+ return COFF::IMAGE_REL_I386_DIR32NB;
+ return COFF::IMAGE_REL_I386_DIR32;
+ case FK_SecRel_2:
+ return COFF::IMAGE_REL_I386_SECTION;
+ case FK_SecRel_4:
+ return COFF::IMAGE_REL_I386_SECREL;
+ default:
+ llvm_unreachable("unsupported relocation type");
+ }
+ } else
+ llvm_unreachable("Unsupported COFF machine type.");
}
MCObjectWriter *llvm::createX86WinCOFFObjectWriter(raw_ostream &OS,
diff --git a/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp b/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp
new file mode 100644
index 0000000..c62fd0a
--- /dev/null
+++ b/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp
@@ -0,0 +1,51 @@
+//===-- X86WinCOFFStreamer.cpp - X86 Target WinCOFF Streamer ----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86MCTargetDesc.h"
+#include "llvm/MC/MCWinCOFFStreamer.h"
+
+using namespace llvm;
+
+namespace {
+class X86WinCOFFStreamer : public MCWinCOFFStreamer {
+public:
+ X86WinCOFFStreamer(MCContext &C, MCAsmBackend &AB, MCCodeEmitter *CE,
+ raw_ostream &OS)
+ : MCWinCOFFStreamer(C, AB, *CE, OS) { }
+
+ void EmitWin64EHHandlerData() override;
+ void FinishImpl() override;
+};
+
+void X86WinCOFFStreamer::EmitWin64EHHandlerData() {
+ MCStreamer::EmitWin64EHHandlerData();
+
+ // We have to emit the unwind info now, because this directive
+ // actually switches to the .xdata section!
+ MCWin64EHUnwindEmitter::EmitUnwindInfo(*this, getCurrentW64UnwindInfo());
+}
+
+void X86WinCOFFStreamer::FinishImpl() {
+ EmitFrames(nullptr);
+ EmitW64Tables();
+
+ MCWinCOFFStreamer::FinishImpl();
+}
+}
+
+namespace llvm {
+MCStreamer *createX86WinCOFFStreamer(MCContext &C, MCAsmBackend &AB,
+ MCCodeEmitter *CE, raw_ostream &OS,
+ bool RelaxAll) {
+ X86WinCOFFStreamer *S = new X86WinCOFFStreamer(C, AB, CE, OS);
+ S->getAssembler().setRelaxAll(RelaxAll);
+ return S;
+}
+}
+
diff --git a/lib/Target/X86/X86.h b/lib/Target/X86/X86.h
index 18e6845..64e8ea8 100644
--- a/lib/Target/X86/X86.h
+++ b/lib/Target/X86/X86.h
@@ -30,9 +30,9 @@ class X86TargetMachine;
FunctionPass *createX86ISelDag(X86TargetMachine &TM,
CodeGenOpt::Level OptLevel);
-/// createGlobalBaseRegPass - This pass initializes a global base
+/// createX86GlobalBaseRegPass - This pass initializes a global base
/// register for PIC on x86-32.
-FunctionPass* createGlobalBaseRegPass();
+FunctionPass* createX86GlobalBaseRegPass();
/// createCleanupLocalDynamicTLSPass() - This pass combines multiple accesses
/// to local-dynamic TLS variables so that the TLS base address for the module
diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td
index 78edcf0..6912b57 100644
--- a/lib/Target/X86/X86.td
+++ b/lib/Target/X86/X86.td
@@ -166,6 +166,8 @@ def FeatureCallRegIndirect : SubtargetFeature<"call-reg-indirect",
"Call register indirect">;
def FeatureLEAUsesAG : SubtargetFeature<"lea-uses-ag", "LEAUsesAG", "true",
"LEA instruction needs inputs at AG stage">;
+def FeatureSlowLEA : SubtargetFeature<"slow-lea", "SlowLEA", "true",
+ "LEA instruction with certain arguments is slow">;
//===----------------------------------------------------------------------===//
// X86 processors supported.
@@ -195,8 +197,7 @@ def : Proc<"pentium3m", [FeatureSSE1, FeatureSlowBTMem]>;
def : Proc<"pentium-m", [FeatureSSE2, FeatureSlowBTMem]>;
def : Proc<"pentium4", [FeatureSSE2]>;
def : Proc<"pentium4m", [FeatureSSE2, FeatureSlowBTMem]>;
-def : Proc<"x86-64", [FeatureSSE2, Feature64Bit, FeatureSlowBTMem,
- FeatureFastUAMem]>;
+
// Intel Core Duo.
def : ProcessorModel<"yonah", SandyBridgeModel,
[FeatureSSE3, FeatureSlowBTMem]>;
@@ -227,6 +228,7 @@ def : ProcessorModel<"slm", SLMModel, [ProcIntelSLM,
FeaturePCLMUL, FeatureAES,
FeatureCallRegIndirect,
FeaturePRFCHW,
+ FeatureSlowLEA,
FeatureSlowBTMem, FeatureFastUAMem]>;
// "Arrandale" along with corei3 and corei5
def : ProcessorModel<"corei7", SandyBridgeModel,
@@ -329,6 +331,13 @@ def : Proc<"bdver3", [FeatureXOP, FeatureFMA4, FeatureCMPXCHG16B,
FeaturePOPCNT, FeatureBMI, FeatureTBM,
FeatureFMA, FeatureFSGSBase]>;
+// Excavator
+def : Proc<"bdver4", [FeatureAVX2, FeatureXOP, FeatureFMA4,
+ FeatureCMPXCHG16B, FeatureAES, FeaturePRFCHW,
+ FeaturePCLMUL, FeatureF16C, FeatureLZCNT,
+ FeaturePOPCNT, FeatureBMI, FeatureBMI2,
+ FeatureTBM, FeatureFMA, FeatureFSGSBase]>;
+
def : Proc<"geode", [Feature3DNowA]>;
def : Proc<"winchip-c6", [FeatureMMX]>;
@@ -336,6 +345,20 @@ def : Proc<"winchip2", [Feature3DNow]>;
def : Proc<"c3", [Feature3DNow]>;
def : Proc<"c3-2", [FeatureSSE1]>;
+// We also provide a generic, 64-bit-specific x86 processor model which tries
+// to be good for modern chips without enabling instruction set encodings past
+// the basic SSE2 and 64-bit ones. It disables slow things from any mainstream,
+// modern 64-bit x86 chip, and enables features that are generally beneficial.
+//
+// We currently use the Sandy Bridge model as the default scheduling model
+// because we use it across Nehalem, Westmere, Sandy Bridge, and Ivy Bridge,
+// which covers a huge swath of x86 processors. If there are specific
+// scheduling knobs which need to be tuned differently for AMD chips, we might
+// consider forming a common base for them.
+def : ProcessorModel<"x86-64", SandyBridgeModel,
+ [FeatureSSE2, Feature64Bit, FeatureSlowBTMem,
+ FeatureFastUAMem]>;
+
//===----------------------------------------------------------------------===//
// Register File Description
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/X86/X86AsmPrinter.cpp b/lib/Target/X86/X86AsmPrinter.cpp
index fb66acc..1dca568 100644
--- a/lib/Target/X86/X86AsmPrinter.cpp
+++ b/lib/Target/X86/X86AsmPrinter.cpp
@@ -15,7 +15,6 @@
#include "X86AsmPrinter.h"
#include "InstPrinter/X86ATTInstPrinter.h"
#include "MCTargetDesc/X86BaseInfo.h"
-#include "X86COFFMachineModuleInfo.h"
#include "X86InstrInfo.h"
#include "X86MachineFunctionInfo.h"
#include "llvm/ADT/SmallString.h"
@@ -102,7 +101,7 @@ static void printSymbolOperand(X86AsmPrinter &P, const MachineOperand &MO,
MCSymbol *Sym = P.getSymbolWithGlobalValueBase(GV, "$non_lazy_ptr");
MachineModuleInfoImpl::StubValueTy &StubSym =
P.MMI->getObjFileInfo<MachineModuleInfoMachO>().getGVStubEntry(Sym);
- if (StubSym.getPointer() == 0)
+ if (!StubSym.getPointer())
StubSym = MachineModuleInfoImpl::
StubValueTy(P.getSymbol(GV), !GV->hasInternalLinkage());
} else if (MO.getTargetFlags() == X86II::MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE){
@@ -110,14 +109,14 @@ static void printSymbolOperand(X86AsmPrinter &P, const MachineOperand &MO,
MachineModuleInfoImpl::StubValueTy &StubSym =
P.MMI->getObjFileInfo<MachineModuleInfoMachO>().getHiddenGVStubEntry(
Sym);
- if (StubSym.getPointer() == 0)
+ if (!StubSym.getPointer())
StubSym = MachineModuleInfoImpl::
StubValueTy(P.getSymbol(GV), !GV->hasInternalLinkage());
} else if (MO.getTargetFlags() == X86II::MO_DARWIN_STUB) {
MCSymbol *Sym = P.getSymbolWithGlobalValueBase(GV, "$stub");
MachineModuleInfoImpl::StubValueTy &StubSym =
P.MMI->getObjFileInfo<MachineModuleInfoMachO>().getFnStubEntry(Sym);
- if (StubSym.getPointer() == 0)
+ if (!StubSym.getPointer())
StubSym = MachineModuleInfoImpl::
StubValueTy(P.getSymbol(GV), !GV->hasInternalLinkage());
}
@@ -174,7 +173,7 @@ static void printSymbolOperand(X86AsmPrinter &P, const MachineOperand &MO,
static void printOperand(X86AsmPrinter &P, const MachineInstr *MI,
unsigned OpNo, raw_ostream &O,
- const char *Modifier = 0, unsigned AsmVariant = 0);
+ const char *Modifier = nullptr, unsigned AsmVariant = 0);
/// printPCRelImm - This is used to print an immediate value that ends up
/// being encoded as a pc-relative value. These print slightly differently, for
@@ -232,7 +231,7 @@ static void printOperand(X86AsmPrinter &P, const MachineInstr *MI,
static void printLeaMemReference(X86AsmPrinter &P, const MachineInstr *MI,
unsigned Op, raw_ostream &O,
- const char *Modifier = NULL) {
+ const char *Modifier = nullptr) {
const MachineOperand &BaseReg = MI->getOperand(Op+X86::AddrBaseReg);
const MachineOperand &IndexReg = MI->getOperand(Op+X86::AddrIndexReg);
const MachineOperand &DispSpec = MI->getOperand(Op+X86::AddrDisp);
@@ -284,7 +283,7 @@ static void printLeaMemReference(X86AsmPrinter &P, const MachineInstr *MI,
static void printMemReference(X86AsmPrinter &P, const MachineInstr *MI,
unsigned Op, raw_ostream &O,
- const char *Modifier = NULL) {
+ const char *Modifier = nullptr) {
assert(isMem(MI, Op) && "Invalid memory reference!");
const MachineOperand &Segment = MI->getOperand(Op+X86::AddrSegmentReg);
if (Segment.getReg()) {
@@ -296,7 +295,7 @@ static void printMemReference(X86AsmPrinter &P, const MachineInstr *MI,
static void printIntelMemReference(X86AsmPrinter &P, const MachineInstr *MI,
unsigned Op, raw_ostream &O,
- const char *Modifier = NULL,
+ const char *Modifier = nullptr,
unsigned AsmVariant = 1) {
const MachineOperand &BaseReg = MI->getOperand(Op+X86::AddrBaseReg);
unsigned ScaleVal = MI->getOperand(Op+X86::AddrScaleAmt).getImm();
@@ -464,7 +463,7 @@ bool X86AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
}
}
- printOperand(*this, MI, OpNo, O, /*Modifier*/ 0, AsmVariant);
+ printOperand(*this, MI, OpNo, O, /*Modifier*/ nullptr, AsmVariant);
return false;
}
@@ -527,6 +526,55 @@ void X86AsmPrinter::EmitStartOfAsmFile(Module &M) {
}
}
+static void
+emitNonLazySymbolPointer(MCStreamer &OutStreamer, MCSymbol *StubLabel,
+ MachineModuleInfoImpl::StubValueTy &MCSym) {
+ // L_foo$stub:
+ OutStreamer.EmitLabel(StubLabel);
+ // .indirect_symbol _foo
+ OutStreamer.EmitSymbolAttribute(MCSym.getPointer(), MCSA_IndirectSymbol);
+
+ if (MCSym.getInt())
+ // External to current translation unit.
+ OutStreamer.EmitIntValue(0, 4/*size*/);
+ else
+ // Internal to current translation unit.
+ //
+ // When we place the LSDA into the TEXT section, the type info
+ // pointers need to be indirect and pc-rel. We accomplish this by
+ // using NLPs; however, sometimes the types are local to the file.
+ // We need to fill in the value for the NLP in those cases.
+ OutStreamer.EmitValue(
+ MCSymbolRefExpr::Create(MCSym.getPointer(), OutStreamer.getContext()),
+ 4 /*size*/);
+}
+
+void X86AsmPrinter::GenerateExportDirective(const MCSymbol *Sym, bool IsData) {
+ SmallString<128> Directive;
+ raw_svector_ostream OS(Directive);
+ StringRef Name = Sym->getName();
+
+ if (Subtarget->isTargetKnownWindowsMSVC())
+ OS << " /EXPORT:";
+ else
+ OS << " -export:";
+
+ if ((Subtarget->isTargetWindowsGNU() || Subtarget->isTargetWindowsCygwin()) &&
+ (Name[0] == getDataLayout().getGlobalPrefix()))
+ Name = Name.drop_front();
+
+ OS << Name;
+
+ if (IsData) {
+ if (Subtarget->isTargetKnownWindowsMSVC())
+ OS << ",DATA";
+ else
+ OS << ",data";
+ }
+
+ OS.flush();
+ OutStreamer.EmitBytes(Directive);
+}
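GenerateExportDirective replaces the duplicated string building removed near the end of this file's diff. A standalone sketch of the strings it emits (the bool parameters stand in for the Subtarget queries and the data-layout global prefix; both are assumptions of this sketch):

#include <iostream>
#include <string>

static std::string exportDirective(std::string Name, bool IsMSVC, bool IsData,
                                   bool HasGlobalPrefix) {
  std::string Out = IsMSVC ? " /EXPORT:" : " -export:";
  if (!IsMSVC && HasGlobalPrefix && !Name.empty() && Name[0] == '_')
    Name.erase(0, 1);        // MinGW/Cygwin drop the leading global prefix
  Out += Name;
  if (IsData)
    Out += IsMSVC ? ",DATA" : ",data";
  return Out;
}

int main() {
  std::cout << exportDirective("_exported_var", true, true, true) << '\n';
  std::cout << exportDirective("_exported_var", false, true, true) << '\n';
  // Prints " /EXPORT:_exported_var,DATA" and " -export:exported_var,data".
  return 0;
}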
void X86AsmPrinter::EmitEndOfAsmFile(Module &M) {
if (Subtarget->isTargetMacho()) {
@@ -547,11 +595,11 @@ void X86AsmPrinter::EmitEndOfAsmFile(Module &M) {
5, SectionKind::getMetadata());
OutStreamer.SwitchSection(TheSection);
- for (unsigned i = 0, e = Stubs.size(); i != e; ++i) {
+ for (const auto &Stub : Stubs) {
// L_foo$stub:
- OutStreamer.EmitLabel(Stubs[i].first);
+ OutStreamer.EmitLabel(Stub.first);
// .indirect_symbol _foo
- OutStreamer.EmitSymbolAttribute(Stubs[i].second.getPointer(),
+ OutStreamer.EmitSymbolAttribute(Stub.second.getPointer(),
MCSA_IndirectSymbol);
// hlt; hlt; hlt; hlt; hlt   (hlt = 0xf4).
const char HltInsts[] = "\xf4\xf4\xf4\xf4\xf4";
@@ -571,44 +619,24 @@ void X86AsmPrinter::EmitEndOfAsmFile(Module &M) {
SectionKind::getMetadata());
OutStreamer.SwitchSection(TheSection);
- for (unsigned i = 0, e = Stubs.size(); i != e; ++i) {
- // L_foo$non_lazy_ptr:
- OutStreamer.EmitLabel(Stubs[i].first);
- // .indirect_symbol _foo
- MachineModuleInfoImpl::StubValueTy &MCSym = Stubs[i].second;
- OutStreamer.EmitSymbolAttribute(MCSym.getPointer(),
- MCSA_IndirectSymbol);
- // .long 0
- if (MCSym.getInt())
- // External to current translation unit.
- OutStreamer.EmitIntValue(0, 4/*size*/);
- else
- // Internal to current translation unit.
- //
- // When we place the LSDA into the TEXT section, the type info
- // pointers need to be indirect and pc-rel. We accomplish this by
- // using NLPs. However, sometimes the types are local to the file. So
- // we need to fill in the value for the NLP in those cases.
- OutStreamer.EmitValue(MCSymbolRefExpr::Create(MCSym.getPointer(),
- OutContext), 4/*size*/);
- }
+ for (auto &Stub : Stubs)
+ emitNonLazySymbolPointer(OutStreamer, Stub.first, Stub.second);
+
Stubs.clear();
OutStreamer.AddBlankLine();
}
Stubs = MMIMacho.GetHiddenGVStubList();
if (!Stubs.empty()) {
- OutStreamer.SwitchSection(getObjFileLowering().getDataSection());
- EmitAlignment(2);
-
- for (unsigned i = 0, e = Stubs.size(); i != e; ++i) {
- // L_foo$non_lazy_ptr:
- OutStreamer.EmitLabel(Stubs[i].first);
- // .long _foo
- OutStreamer.EmitValue(MCSymbolRefExpr::
- Create(Stubs[i].second.getPointer(),
- OutContext), 4/*size*/);
- }
+ const MCSection *TheSection =
+ OutContext.getMachOSection("__IMPORT", "__pointers",
+ MachO::S_NON_LAZY_SYMBOL_POINTERS,
+ SectionKind::getMetadata());
+ OutStreamer.SwitchSection(TheSection);
+
+ for (auto &Stub : Stubs)
+ emitNonLazySymbolPointer(OutStreamer, Stub.first, Stub.second);
+
Stubs.clear();
OutStreamer.AddBlankLine();
}
@@ -630,46 +658,25 @@ void X86AsmPrinter::EmitEndOfAsmFile(Module &M) {
}
if (Subtarget->isTargetCOFF()) {
- X86COFFMachineModuleInfo &COFFMMI =
- MMI->getObjFileInfo<X86COFFMachineModuleInfo>();
-
- // Emit type information for external functions
- typedef X86COFFMachineModuleInfo::externals_iterator externals_iterator;
- for (externals_iterator I = COFFMMI.externals_begin(),
- E = COFFMMI.externals_end();
- I != E; ++I) {
- OutStreamer.BeginCOFFSymbolDef(CurrentFnSym);
- OutStreamer.EmitCOFFSymbolStorageClass(COFF::IMAGE_SYM_CLASS_EXTERNAL);
- OutStreamer.EmitCOFFSymbolType(COFF::IMAGE_SYM_DTYPE_FUNCTION
- << COFF::SCT_COMPLEX_TYPE_SHIFT);
- OutStreamer.EndCOFFSymbolDef();
- }
-
// Necessary for dllexport support
std::vector<const MCSymbol*> DLLExportedFns, DLLExportedGlobals;
- for (Module::const_iterator I = M.begin(), E = M.end(); I != E; ++I)
- if (I->hasDLLExportStorageClass())
- DLLExportedFns.push_back(getSymbol(I));
+ for (const auto &Function : M)
+ if (Function.hasDLLExportStorageClass())
+ DLLExportedFns.push_back(getSymbol(&Function));
- for (Module::const_global_iterator I = M.global_begin(),
- E = M.global_end(); I != E; ++I)
- if (I->hasDLLExportStorageClass())
- DLLExportedGlobals.push_back(getSymbol(I));
+ for (const auto &Global : M.globals())
+ if (Global.hasDLLExportStorageClass())
+ DLLExportedGlobals.push_back(getSymbol(&Global));
- for (Module::const_alias_iterator I = M.alias_begin(), E = M.alias_end();
- I != E; ++I) {
- const GlobalValue *GV = I;
- if (!GV->hasDLLExportStorageClass())
+ for (const auto &Alias : M.aliases()) {
+ if (!Alias.hasDLLExportStorageClass())
continue;
- while (const GlobalAlias *A = dyn_cast<GlobalAlias>(GV))
- GV = A->getAliasedGlobal();
-
- if (isa<Function>(GV))
- DLLExportedFns.push_back(getSymbol(I));
- else if (isa<GlobalVariable>(GV))
- DLLExportedGlobals.push_back(getSymbol(I));
+ if (Alias.getType()->getElementType()->isFunctionTy())
+ DLLExportedFns.push_back(getSymbol(&Alias));
+ else
+ DLLExportedGlobals.push_back(getSymbol(&Alias));
}
// Output linker support code for dllexported globals on windows.
@@ -678,28 +685,11 @@ void X86AsmPrinter::EmitEndOfAsmFile(Module &M) {
static_cast<const TargetLoweringObjectFileCOFF&>(getObjFileLowering());
OutStreamer.SwitchSection(TLOFCOFF.getDrectveSection());
- SmallString<128> name;
- for (unsigned i = 0, e = DLLExportedGlobals.size(); i != e; ++i) {
- if (Subtarget->isTargetKnownWindowsMSVC())
- name = " /EXPORT:";
- else
- name = " -export:";
- name += DLLExportedGlobals[i]->getName();
- if (Subtarget->isTargetKnownWindowsMSVC())
- name += ",DATA";
- else
- name += ",data";
- OutStreamer.EmitBytes(name);
- }
- for (unsigned i = 0, e = DLLExportedFns.size(); i != e; ++i) {
- if (Subtarget->isTargetKnownWindowsMSVC())
- name = " /EXPORT:";
- else
- name = " -export:";
- name += DLLExportedFns[i]->getName();
- OutStreamer.EmitBytes(name);
- }
+ for (auto &Symbol : DLLExportedGlobals)
+ GenerateExportDirective(Symbol, /*IsData=*/true);
+ for (auto &Symbol : DLLExportedFns)
+ GenerateExportDirective(Symbol, /*IsData=*/false);
}
}
@@ -715,9 +705,9 @@ void X86AsmPrinter::EmitEndOfAsmFile(Module &M) {
OutStreamer.SwitchSection(TLOFELF.getDataRelSection());
const DataLayout *TD = TM.getDataLayout();
- for (unsigned i = 0, e = Stubs.size(); i != e; ++i) {
- OutStreamer.EmitLabel(Stubs[i].first);
- OutStreamer.EmitSymbolValue(Stubs[i].second.getPointer(),
+ for (const auto &Stub : Stubs) {
+ OutStreamer.EmitLabel(Stub.first);
+ OutStreamer.EmitSymbolValue(Stub.second.getPointer(),
TD->getPointerSize());
}
Stubs.clear();
diff --git a/lib/Target/X86/X86AsmPrinter.h b/lib/Target/X86/X86AsmPrinter.h
index 3308cc2..e4eef5d 100644
--- a/lib/Target/X86/X86AsmPrinter.h
+++ b/lib/Target/X86/X86AsmPrinter.h
@@ -16,13 +16,15 @@
#include "llvm/Target/TargetMachine.h"
namespace llvm {
-
class MCStreamer;
+class MCSymbol;
class LLVM_LIBRARY_VISIBILITY X86AsmPrinter : public AsmPrinter {
const X86Subtarget *Subtarget;
StackMaps SM;
+ void GenerateExportDirective(const MCSymbol *Sym, bool IsData);
+
public:
explicit X86AsmPrinter(TargetMachine &TM, MCStreamer &Streamer)
: AsmPrinter(TM, Streamer), SM(*this) {
diff --git a/lib/Target/X86/X86COFFMachineModuleInfo.cpp b/lib/Target/X86/X86COFFMachineModuleInfo.cpp
deleted file mode 100644
index 6a6125b..0000000
--- a/lib/Target/X86/X86COFFMachineModuleInfo.cpp
+++ /dev/null
@@ -1,19 +0,0 @@
-//===-- X86COFFMachineModuleInfo.cpp - X86 COFF MMI Impl ------------------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This is an MMI implementation for X86 COFF (windows) targets.
-//
-//===----------------------------------------------------------------------===//
-
-#include "X86COFFMachineModuleInfo.h"
-using namespace llvm;
-
-
-X86COFFMachineModuleInfo::~X86COFFMachineModuleInfo() {
-}
diff --git a/lib/Target/X86/X86COFFMachineModuleInfo.h b/lib/Target/X86/X86COFFMachineModuleInfo.h
deleted file mode 100644
index 0dfeb42..0000000
--- a/lib/Target/X86/X86COFFMachineModuleInfo.h
+++ /dev/null
@@ -1,46 +0,0 @@
-//===-- X86coffmachinemoduleinfo.h - X86 COFF MMI Impl ----------*- C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This is an MMI implementation for X86 COFF (windows) targets.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef X86COFF_MACHINEMODULEINFO_H
-#define X86COFF_MACHINEMODULEINFO_H
-
-#include "X86MachineFunctionInfo.h"
-#include "llvm/ADT/DenseSet.h"
-#include "llvm/CodeGen/MachineModuleInfo.h"
-
-namespace llvm {
- class X86MachineFunctionInfo;
- class DataLayout;
-
-/// X86COFFMachineModuleInfo - This is a MachineModuleInfoImpl implementation
-/// for X86 COFF targets.
-class X86COFFMachineModuleInfo : public MachineModuleInfoImpl {
- DenseSet<MCSymbol const *> Externals;
-public:
- X86COFFMachineModuleInfo(const MachineModuleInfo &) {}
- virtual ~X86COFFMachineModuleInfo();
-
- void addExternalFunction(MCSymbol* Symbol) {
- Externals.insert(Symbol);
- }
-
- typedef DenseSet<MCSymbol const *>::const_iterator externals_iterator;
- externals_iterator externals_begin() const { return Externals.begin(); }
- externals_iterator externals_end() const { return Externals.end(); }
-};
-
-
-
-} // end namespace llvm
-
-#endif
diff --git a/lib/Target/X86/X86CallingConv.h b/lib/Target/X86/X86CallingConv.h
index 040da35..e76f9fd 100644
--- a/lib/Target/X86/X86CallingConv.h
+++ b/lib/Target/X86/X86CallingConv.h
@@ -29,33 +29,6 @@ inline bool CC_X86_AnyReg_Error(unsigned &, MVT &, MVT &,
return false;
}
-inline bool CC_X86_CDeclMethod_SRet(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
- CCValAssign::LocInfo &LocInfo,
- ISD::ArgFlagsTy &ArgFlags, CCState &State) {
- // Swap the order of the first two parameters if the first parameter is sret.
- if (ArgFlags.isSRet()) {
- assert(ValNo == 0);
- assert(ValVT == MVT::i32);
- State.AllocateStack(8, 4);
- State.addLoc(CCValAssign::getCustomMem(ValNo, ValVT, 4, LocVT, LocInfo));
-
- // Indicate that we need to swap the order of the first and second
- // parameters by "allocating" register zero. There are no register
- // parameters with cdecl methods, so we can use this to communicate to the
- // next call.
- State.AllocateReg(1);
- return true;
- } else if (ValNo == 1 && State.isAllocated(1)) {
- assert(ValVT == MVT::i32 && "non-i32-sized this param unsupported");
- // Stack was already allocated while processing sret.
- State.addLoc(CCValAssign::getCustomMem(ValNo, ValVT, 0, LocVT, LocInfo));
- return true;
- }
-
- // All other args use the C calling convention.
- return false;
-}
-
} // End llvm namespace
#endif
diff --git a/lib/Target/X86/X86CallingConv.td b/lib/Target/X86/X86CallingConv.td
index 1cfd827..0824d4e 100644
--- a/lib/Target/X86/X86CallingConv.td
+++ b/lib/Target/X86/X86CallingConv.td
@@ -485,15 +485,6 @@ def CC_X86_32_ThisCall_Win : CallingConv<[
CCDelegateTo<CC_X86_32_ThisCall_Common>
]>;
-def CC_X86_CDeclMethod : CallingConv<[
- // Promote i8/i16 arguments to i32.
- CCIfType<[i8, i16], CCPromoteToType<i32>>,
-
- CCCustom<"CC_X86_CDeclMethod_SRet">,
-
- CCDelegateTo<CC_X86_32_Common>
-]>;
-
def CC_X86_32_ThisCall : CallingConv<[
CCIfSubtarget<"isTargetCygMing()", CCDelegateTo<CC_X86_32_ThisCall_Mingw>>,
CCDelegateTo<CC_X86_32_ThisCall_Win>
@@ -583,7 +574,6 @@ def CC_Intel_OCL_BI : CallingConv<[
def CC_X86_32 : CallingConv<[
CCIfCC<"CallingConv::X86_FastCall", CCDelegateTo<CC_X86_32_FastCall>>,
CCIfCC<"CallingConv::X86_ThisCall", CCDelegateTo<CC_X86_32_ThisCall>>,
- CCIfCC<"CallingConv::X86_CDeclMethod", CCDelegateTo<CC_X86_CDeclMethod>>,
CCIfCC<"CallingConv::Fast", CCDelegateTo<CC_X86_32_FastCC>>,
CCIfCC<"CallingConv::GHC", CCDelegateTo<CC_X86_32_GHC>>,
CCIfCC<"CallingConv::HiPE", CCDelegateTo<CC_X86_32_HiPE>>,
diff --git a/lib/Target/X86/X86CodeEmitter.cpp b/lib/Target/X86/X86CodeEmitter.cpp
index f6c4c2e..76718d0 100644
--- a/lib/Target/X86/X86CodeEmitter.cpp
+++ b/lib/Target/X86/X86CodeEmitter.cpp
@@ -12,7 +12,6 @@
//
//===----------------------------------------------------------------------===//
-#define DEBUG_TYPE "x86-emitter"
#include "X86.h"
#include "X86InstrInfo.h"
#include "X86JITInfo.h"
@@ -36,6 +35,8 @@
#include "llvm/Target/TargetOptions.h"
using namespace llvm;
+#define DEBUG_TYPE "x86-emitter"
+
STATISTIC(NumEmitted, "Number of machine instructions emitted");
namespace {
@@ -52,7 +53,7 @@ namespace {
public:
static char ID;
explicit Emitter(X86TargetMachine &tm, CodeEmitter &mce)
- : MachineFunctionPass(ID), II(0), TD(0), TM(tm),
+ : MachineFunctionPass(ID), II(nullptr), TD(nullptr), TM(tm),
MCE(mce), PICBaseOffset(0), Is64BitMode(false),
IsPIC(TM.getRelocationModel() == Reloc::PIC_) {}
@@ -450,7 +451,7 @@ void Emitter<CodeEmitter>::emitMemModRMByte(const MachineInstr &MI,
intptr_t PCAdj) {
const MachineOperand &Op3 = MI.getOperand(Op+3);
int DispVal = 0;
- const MachineOperand *DispForReloc = 0;
+ const MachineOperand *DispForReloc = nullptr;
// Figure out what sort of displacement we have to handle here.
if (Op3.isGlobal()) {
@@ -1475,7 +1476,7 @@ void Emitter<CodeEmitter>::emitInstruction(MachineInstr &MI,
#ifndef NDEBUG
dbgs() << "Cannot encode all operands of: " << MI << "\n";
#endif
- llvm_unreachable(0);
+ llvm_unreachable(nullptr);
}
MCE.processDebugLoc(MI.getDebugLoc(), false);
diff --git a/lib/Target/X86/X86FastISel.cpp b/lib/Target/X86/X86FastISel.cpp
index 1aab1ea..56bcfa3 100644
--- a/lib/Target/X86/X86FastISel.cpp
+++ b/lib/Target/X86/X86FastISel.cpp
@@ -183,7 +183,7 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, const X86AddressMode &AM,
unsigned &ResultReg) {
// Get opcode and regclass of the output for the given load instruction.
unsigned Opc = 0;
- const TargetRegisterClass *RC = NULL;
+ const TargetRegisterClass *RC = nullptr;
switch (VT.getSimpleVT().SimpleTy) {
default: return false;
case MVT::i1:
@@ -363,7 +363,7 @@ bool X86FastISel::handleConstantAddresses(const Value *V, X86AddressMode &AM) {
// it works...).
if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV))
if (const GlobalVariable *GVar =
- dyn_cast_or_null<GlobalVariable>(GA->getAliasedGlobal()))
+ dyn_cast_or_null<GlobalVariable>(GA->getAliasee()))
if (GVar->isThreadLocal())
return false;
@@ -406,7 +406,7 @@ bool X86FastISel::handleConstantAddresses(const Value *V, X86AddressMode &AM) {
} else {
// Issue load from stub.
unsigned Opc = 0;
- const TargetRegisterClass *RC = NULL;
+ const TargetRegisterClass *RC = nullptr;
X86AddressMode StubAM;
StubAM.Base.Reg = AM.Base.Reg;
StubAM.GV = GV;
@@ -441,7 +441,7 @@ bool X86FastISel::handleConstantAddresses(const Value *V, X86AddressMode &AM) {
// Now construct the final address. Note that the Disp, Scale,
// and Index values may already be set here.
AM.Base.Reg = LoadReg;
- AM.GV = 0;
+ AM.GV = nullptr;
return true;
}
}
@@ -467,7 +467,7 @@ bool X86FastISel::handleConstantAddresses(const Value *V, X86AddressMode &AM) {
bool X86FastISel::X86SelectAddress(const Value *V, X86AddressMode &AM) {
SmallVector<const Value *, 32> GEPs;
redo_gep:
- const User *U = NULL;
+ const User *U = nullptr;
unsigned Opcode = Instruction::UserOp1;
if (const Instruction *I = dyn_cast<Instruction>(V)) {
// Don't walk into other basic blocks; it's possible we haven't
@@ -626,7 +626,7 @@ redo_gep:
/// X86SelectCallAddress - Attempt to fill in an address from the given value.
///
bool X86FastISel::X86SelectCallAddress(const Value *V, X86AddressMode &AM) {
- const User *U = NULL;
+ const User *U = nullptr;
unsigned Opcode = Instruction::UserOp1;
const Instruction *I = dyn_cast<Instruction>(V);
// Record if the value is defined in the same basic block.
@@ -1247,7 +1247,7 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) {
bool X86FastISel::X86SelectShift(const Instruction *I) {
unsigned CReg = 0, OpReg = 0;
- const TargetRegisterClass *RC = NULL;
+ const TargetRegisterClass *RC = nullptr;
if (I->getType()->isIntegerTy(8)) {
CReg = X86::CL;
RC = &X86::GR8RegClass;
@@ -1487,7 +1487,7 @@ bool X86FastISel::X86SelectSelect(const Instruction *I) {
if (!Subtarget->hasCMov()) return false;
unsigned Opc = 0;
- const TargetRegisterClass *RC = NULL;
+ const TargetRegisterClass *RC = nullptr;
if (VT == MVT::i16) {
Opc = X86::CMOVE16rr;
RC = &X86::GR16RegClass;
@@ -1821,10 +1821,10 @@ bool X86FastISel::FastLowerArguments() {
}
}
- static const uint16_t GPR32ArgRegs[] = {
+ static const MCPhysReg GPR32ArgRegs[] = {
X86::EDI, X86::ESI, X86::EDX, X86::ECX, X86::R8D, X86::R9D
};
- static const uint16_t GPR64ArgRegs[] = {
+ static const MCPhysReg GPR64ArgRegs[] = {
X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8 , X86::R9
};
@@ -1865,7 +1865,7 @@ bool X86FastISel::X86SelectCall(const Instruction *I) {
if (cast<CallInst>(I)->isTailCall())
return false;
- return DoSelectCall(I, 0);
+ return DoSelectCall(I, nullptr);
}
static unsigned computeBytesPoppedByCallee(const X86Subtarget &Subtarget,
@@ -1936,8 +1936,8 @@ bool X86FastISel::DoSelectCall(const Instruction *I, const char *MemIntName) {
if (!X86SelectCallAddress(Callee, CalleeAM))
return false;
unsigned CalleeOp = 0;
- const GlobalValue *GV = 0;
- if (CalleeAM.GV != 0) {
+ const GlobalValue *GV = nullptr;
+ if (CalleeAM.GV != nullptr) {
GV = CalleeAM.GV;
} else if (CalleeAM.Base.Reg != 0) {
CalleeOp = CalleeAM.Base.Reg;
@@ -2163,7 +2163,7 @@ bool X86FastISel::DoSelectCall(const Instruction *I, const char *MemIntName) {
if (Subtarget->is64Bit() && isVarArg && !isWin64) {
// Count the number of XMM registers allocated.
- static const uint16_t XMMArgRegs[] = {
+ static const MCPhysReg XMMArgRegs[] = {
X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
};
@@ -2387,7 +2387,7 @@ unsigned X86FastISel::TargetMaterializeConstant(const Constant *C) {
// Get opcode and regclass of the output for the given load instruction.
unsigned Opc = 0;
- const TargetRegisterClass *RC = NULL;
+ const TargetRegisterClass *RC = nullptr;
switch (VT.SimpleTy) {
default: return 0;
case MVT::i8:
@@ -2437,7 +2437,7 @@ unsigned X86FastISel::TargetMaterializeConstant(const Constant *C) {
// If the expression is just a basereg, then we're done, otherwise we need
// to emit an LEA.
if (AM.BaseType == X86AddressMode::RegBase &&
- AM.IndexReg == 0 && AM.Disp == 0 && AM.GV == 0)
+ AM.IndexReg == 0 && AM.Disp == 0 && AM.GV == nullptr)
return AM.Base.Reg;
Opc = TLI.getPointerTy() == MVT::i32 ? X86::LEA32r : X86::LEA64r;
@@ -2510,7 +2510,7 @@ unsigned X86FastISel::TargetMaterializeFloatZero(const ConstantFP *CF) {
// Get opcode and regclass for the given zero.
unsigned Opc = 0;
- const TargetRegisterClass *RC = NULL;
+ const TargetRegisterClass *RC = nullptr;
switch (VT.SimpleTy) {
default: return 0;
case MVT::f32:
@@ -2558,7 +2558,7 @@ bool X86FastISel::tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo,
MachineInstr *Result =
XII.foldMemoryOperandImpl(*FuncInfo.MF, MI, OpNo, AddrOps, Size, Alignment);
- if (Result == 0) return false;
+ if (!Result) return false;
FuncInfo.MBB->insert(FuncInfo.InsertPt, Result);
MI->eraseFromParent();
diff --git a/lib/Target/X86/X86FixupLEAs.cpp b/lib/Target/X86/X86FixupLEAs.cpp
index c2c234b..6c5b86f 100644
--- a/lib/Target/X86/X86FixupLEAs.cpp
+++ b/lib/Target/X86/X86FixupLEAs.cpp
@@ -13,7 +13,6 @@
//
//===----------------------------------------------------------------------===//
-#define DEBUG_TYPE "x86-fixup-LEAs"
#include "X86.h"
#include "X86InstrInfo.h"
#include "X86Subtarget.h"
@@ -28,6 +27,8 @@
#include "llvm/Target/TargetInstrInfo.h"
using namespace llvm;
+#define DEBUG_TYPE "x86-fixup-LEAs"
+
STATISTIC(NumLEAs, "Number of LEA instructions created");
namespace {
@@ -56,6 +57,11 @@ namespace {
void processInstruction(MachineBasicBlock::iterator& I,
MachineFunction::iterator MFI);
+ /// \brief Given a LEA instruction which is unprofitable on Silvermont,
+ /// try to replace it with an equivalent ADD instruction.
+ void processInstructionForSLM(MachineBasicBlock::iterator& I,
+ MachineFunction::iterator MFI);
+
/// \brief Determine if an instruction references a machine register
/// and, if so, whether it reads or writes the register.
RegUsageState usesRegister(MachineOperand& p,
@@ -85,7 +91,7 @@ namespace {
private:
MachineFunction *MF;
const TargetMachine *TM;
- const TargetInstrInfo *TII; // Machine instruction info.
+ const X86InstrInfo *TII; // Machine instruction info.
};
char FixupLEAPass::ID = 0;
@@ -97,7 +103,7 @@ FixupLEAPass::postRAConvertToLEA(MachineFunction::iterator &MFI,
MachineInstr* MI = MBBI;
MachineInstr* NewMI;
switch (MI->getOpcode()) {
- case X86::MOV32rr:
+ case X86::MOV32rr:
case X86::MOV64rr: {
const MachineOperand& Src = MI->getOperand(1);
const MachineOperand& Dest = MI->getOperand(0);
@@ -123,7 +129,7 @@ FixupLEAPass::postRAConvertToLEA(MachineFunction::iterator &MFI,
if (!MI->getOperand(2).isImm()) {
// convertToThreeAddress will call getImm()
// which requires isImm() to be true
- return 0;
+ return nullptr;
}
break;
case X86::ADD16rr:
@@ -132,10 +138,10 @@ FixupLEAPass::postRAConvertToLEA(MachineFunction::iterator &MFI,
// if src1 != src2, then convertToThreeAddress will
// need to create a Virtual register, which we cannot do
// after register allocation.
- return 0;
+ return nullptr;
}
}
- return TII->convertToThreeAddress(MFI, MBBI, 0);
+ return TII->convertToThreeAddress(MFI, MBBI, nullptr);
}
FunctionPass *llvm::createX86FixupLEAs() {
@@ -143,9 +149,12 @@ FunctionPass *llvm::createX86FixupLEAs() {
}
bool FixupLEAPass::runOnMachineFunction(MachineFunction &Func) {
- MF = &Func;
- TM = &MF->getTarget();
- TII = TM->getInstrInfo();
+ TM = &Func.getTarget();
+ const X86Subtarget &ST = TM->getSubtarget<X86Subtarget>();
+ if (!ST.LEAusesAG() && !ST.slowLEA())
+ return false;
+
+ TII = static_cast<const X86InstrInfo*>(TM->getInstrInfo());
DEBUG(dbgs() << "Start X86FixupLEAs\n";);
// Process all basic blocks.
@@ -211,7 +220,7 @@ MachineBasicBlock::iterator FixupLEAPass::searchBackwards(MachineOperand& p,
InstrDistance += TII->getInstrLatency(TM->getInstrItineraryData(), CurInst);
Found = getPreviousInstr(CurInst, MFI);
}
- return 0;
+ return nullptr;
}
void FixupLEAPass::processInstruction(MachineBasicBlock::iterator& I,
@@ -242,9 +251,9 @@ void FixupLEAPass::seekLEAFixup(MachineOperand& p,
MachineInstr* NewMI = postRAConvertToLEA(MFI, MBI);
if (NewMI) {
++NumLEAs;
- DEBUG(dbgs() << "Candidate to replace:"; MBI->dump(););
+ DEBUG(dbgs() << "FixLEA: Candidate to replace:"; MBI->dump(););
// now to replace with an equivalent LEA...
- DEBUG(dbgs() << "Replaced by: "; NewMI->dump(););
+ DEBUG(dbgs() << "FixLEA: Replaced by: "; NewMI->dump(););
MFI->erase(MBI);
MachineBasicBlock::iterator J =
static_cast<MachineBasicBlock::iterator> (NewMI);
@@ -253,10 +262,80 @@ void FixupLEAPass::seekLEAFixup(MachineOperand& p,
}
}
+void FixupLEAPass::processInstructionForSLM(MachineBasicBlock::iterator &I,
+ MachineFunction::iterator MFI) {
+ MachineInstr *MI = I;
+ const int opcode = MI->getOpcode();
+ if (opcode != X86::LEA16r && opcode != X86::LEA32r && opcode != X86::LEA64r &&
+ opcode != X86::LEA64_32r)
+ return;
+ if (MI->getOperand(5).getReg() != 0 || !MI->getOperand(4).isImm() ||
+ !TII->isSafeToClobberEFLAGS(*MFI, I))
+ return;
+ const unsigned DstR = MI->getOperand(0).getReg();
+ const unsigned SrcR1 = MI->getOperand(1).getReg();
+ const unsigned SrcR2 = MI->getOperand(3).getReg();
+ if ((SrcR1 == 0 || SrcR1 != DstR) && (SrcR2 == 0 || SrcR2 != DstR))
+ return;
+ if (MI->getOperand(2).getImm() > 1)
+ return;
+ int addrr_opcode, addri_opcode;
+ switch (opcode) {
+ case X86::LEA16r:
+ addrr_opcode = X86::ADD16rr;
+ addri_opcode = X86::ADD16ri;
+ break;
+ case X86::LEA32r:
+ addrr_opcode = X86::ADD32rr;
+ addri_opcode = X86::ADD32ri;
+ break;
+ case X86::LEA64_32r:
+ case X86::LEA64r:
+ addrr_opcode = X86::ADD64rr;
+ addri_opcode = X86::ADD64ri32;
+ break;
+ default:
+ llvm_unreachable("Unexpected LEA instruction");
+ }
+ DEBUG(dbgs() << "FixLEA: Candidate to replace:"; I->dump(););
+ DEBUG(dbgs() << "FixLEA: Replaced by: ";);
+ MachineInstr *NewMI = nullptr;
+ const MachineOperand &Dst = MI->getOperand(0);
+ // Make ADD instruction for two registers writing to LEA's destination
+ if (SrcR1 != 0 && SrcR2 != 0) {
+ const MachineOperand &Src1 = MI->getOperand(SrcR1 == DstR ? 1 : 3);
+ const MachineOperand &Src2 = MI->getOperand(SrcR1 == DstR ? 3 : 1);
+ NewMI = BuildMI(*MF, MI->getDebugLoc(), TII->get(addrr_opcode))
+ .addOperand(Dst)
+ .addOperand(Src1)
+ .addOperand(Src2);
+ MFI->insert(I, NewMI);
+ DEBUG(NewMI->dump(););
+ }
+ // Make ADD instruction for immediate
+ if (MI->getOperand(4).getImm() != 0) {
+ const MachineOperand &SrcR = MI->getOperand(SrcR1 == DstR ? 1 : 3);
+ NewMI = BuildMI(*MF, MI->getDebugLoc(), TII->get(addri_opcode))
+ .addOperand(Dst)
+ .addOperand(SrcR)
+ .addImm(MI->getOperand(4).getImm());
+ MFI->insert(I, NewMI);
+ DEBUG(NewMI->dump(););
+ }
+ if (NewMI) {
+ MFI->erase(I);
+ I = static_cast<MachineBasicBlock::iterator>(NewMI);
+ }
+}
+
bool FixupLEAPass::processBasicBlock(MachineFunction &MF,
MachineFunction::iterator MFI) {
- for (MachineBasicBlock::iterator I = MFI->begin(); I != MFI->end(); ++I)
- processInstruction(I, MFI);
+ for (MachineBasicBlock::iterator I = MFI->begin(); I != MFI->end(); ++I) {
+ if (TM->getSubtarget<X86Subtarget>().isSLM())
+ processInstructionForSLM(I, MFI);
+ else
+ processInstruction(I, MFI);
+ }
return false;
}
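processInstructionForSLM rewrites slow LEAs into ADDs when the destination register aliases one of the sources. A simplified standalone model of that decision (plain ints for registers; the two-operand "add dst, src" output only illustrates the x86 result, not MachineInstr building):

#include <cstdio>

// dst = base + index*scale + disp becomes at most two ADDs when dst
// aliases a source. The pass additionally requires no segment register
// and that EFLAGS is safe to clobber; both checks are omitted here.
struct Lea { int Dst, Base, Index, Scale, Disp; };

static void rewriteForSLM(const Lea &L) {
  bool DstIsBase = L.Base != 0 && L.Base == L.Dst;
  bool DstIsIndex = L.Index != 0 && L.Index == L.Dst;
  if ((!DstIsBase && !DstIsIndex) || L.Scale > 1)
    return;                          // same bail-outs as the pass
  if (L.Base != 0 && L.Index != 0)   // ADDrr: fold the other source register
    std::printf("add r%d, r%d\n", L.Dst, DstIsBase ? L.Index : L.Base);
  if (L.Disp != 0)                   // ADDri: fold the displacement
    std::printf("add r%d, %d\n", L.Dst, L.Disp);
}

int main() {
  rewriteForSLM({/*Dst=*/1, /*Base=*/1, /*Index=*/2, /*Scale=*/1, /*Disp=*/8});
  return 0; // prints "add r1, r2" then "add r1, 8"
}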
diff --git a/lib/Target/X86/X86FloatingPoint.cpp b/lib/Target/X86/X86FloatingPoint.cpp
index 7955ade..c8a3ab3 100644
--- a/lib/Target/X86/X86FloatingPoint.cpp
+++ b/lib/Target/X86/X86FloatingPoint.cpp
@@ -23,7 +23,6 @@
//
//===----------------------------------------------------------------------===//
-#define DEBUG_TYPE "x86-codegen"
#include "X86.h"
#include "X86InstrInfo.h"
#include "llvm/ADT/DepthFirstIterator.h"
@@ -45,6 +44,8 @@
#include <algorithm>
using namespace llvm;
+#define DEBUG_TYPE "x86-codegen"
+
STATISTIC(NumFXCH, "Number of fxch instructions inserted");
STATISTIC(NumFP , "Number of floating point instructions");
@@ -430,7 +431,7 @@ bool FPS::processBasicBlock(MachineFunction &MF, MachineBasicBlock &BB) {
if (FPInstClass == X86II::NotFP)
continue; // Efficiently ignore non-fp insts!
- MachineInstr *PrevMI = 0;
+ MachineInstr *PrevMI = nullptr;
if (I != BB.begin())
PrevMI = std::prev(I);
diff --git a/lib/Target/X86/X86FrameLowering.cpp b/lib/Target/X86/X86FrameLowering.cpp
index f0ad4d1..4c1374f 100644
--- a/lib/Target/X86/X86FrameLowering.cpp
+++ b/lib/Target/X86/X86FrameLowering.cpp
@@ -182,7 +182,7 @@ void emitSPUpdate(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI,
}
}
- MachineInstr *MI = NULL;
+ MachineInstr *MI = nullptr;
if (UseLEA) {
MI = addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr),
@@ -204,7 +204,7 @@ void emitSPUpdate(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI,
/// mergeSPUpdatesUp - Merge two stack-manipulating instructions upper iterator.
static
void mergeSPUpdatesUp(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI,
- unsigned StackPtr, uint64_t *NumBytes = NULL) {
+ unsigned StackPtr, uint64_t *NumBytes = nullptr) {
if (MBBI == MBB.begin()) return;
MachineBasicBlock::iterator PI = std::prev(MBBI);
@@ -225,11 +225,12 @@ void mergeSPUpdatesUp(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI,
}
}
-/// mergeSPUpdatesDown - Merge two stack-manipulating instructions lower iterator.
+/// mergeSPUpdatesDown - Merge two stack-manipulating instructions lower
+/// iterator.
static
void mergeSPUpdatesDown(MachineBasicBlock &MBB,
MachineBasicBlock::iterator &MBBI,
- unsigned StackPtr, uint64_t *NumBytes = NULL) {
+ unsigned StackPtr, uint64_t *NumBytes = nullptr) {
// FIXME: THIS ISN'T RUN!!!
return;
@@ -257,19 +258,19 @@ void mergeSPUpdatesDown(MachineBasicBlock &MBB,
}
/// mergeSPUpdates - Checks the instruction before/after the passed
-/// instruction. If it is an ADD/SUB/LEA instruction it is deleted argument and the
-/// stack adjustment is returned as a positive value for ADD/LEA and a negative for
-/// SUB.
+/// instruction. If it is an ADD/SUB/LEA instruction it is deleted and the
+/// stack adjustment is returned as a positive value for ADD/LEA and a
+/// negative one for SUB.
static int mergeSPUpdates(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator &MBBI,
- unsigned StackPtr,
- bool doMergeWithPrevious) {
+ MachineBasicBlock::iterator &MBBI, unsigned StackPtr,
+ bool doMergeWithPrevious) {
if ((doMergeWithPrevious && MBBI == MBB.begin()) ||
(!doMergeWithPrevious && MBBI == MBB.end()))
return 0;
MachineBasicBlock::iterator PI = doMergeWithPrevious ? std::prev(MBBI) : MBBI;
- MachineBasicBlock::iterator NI = doMergeWithPrevious ? 0 : std::next(MBBI);
+ MachineBasicBlock::iterator NI = doMergeWithPrevious ? nullptr
+ : std::next(MBBI);
unsigned Opc = PI->getOpcode();
int Offset = 0;
@@ -366,8 +367,10 @@ void X86FrameLowering::emitCalleeSavedFrameMoves(
unsigned DwarfReg = MRI->getDwarfRegNum(Reg, true);
unsigned CFIIndex =
- MMI.addFrameInst(MCCFIInstruction::createOffset(0, DwarfReg, Offset));
- BuildMI(MBB, MBBI, DL, TII.get(X86::CFI_INSTRUCTION)).addCFIIndex(CFIIndex);
+ MMI.addFrameInst(MCCFIInstruction::createOffset(nullptr, DwarfReg,
+ Offset));
+ BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex);
}
}
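The hunks here also switch from the X86-specific X86::CFI_INSTRUCTION pseudo to the target-independent TargetOpcode::CFI_INSTRUCTION. For readers decoding the createOffset/createDefCfaOffset calls, a small standalone sketch of the arithmetic behind a standard x86-64 frame-pointer prologue (slot size and growth direction are x86-64 assumptions):

#include <cstdio>

int main() {
  const int SlotSize = 8;        // x86-64 assumption
  int CFAOffset = SlotSize;      // after 'call', the return address is pushed
  CFAOffset += SlotSize;         // 'push %rbp' grows the frame by one slot
  // Matches createDefCfaOffset(nullptr, 2 * stackGrowth), stackGrowth = -8;
  // the directive records how far the CFA sits above the stack pointer:
  std::printf(".cfi_def_cfa_offset %d\n", CFAOffset);  // 16
  // Matches createOffset(nullptr, DwarfFramePtr, 2 * stackGrowth):
  std::printf(".cfi_offset %%rbp, %d\n", -CFAOffset);  // rbp saved at CFA-16
  return 0;
}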
@@ -446,7 +449,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const {
!MFI->adjustsStack() && // No calls.
!IsWin64 && // Win64 has no Red Zone
!usesTheStack(MF) && // Don't push and pop.
- !MF.getTarget().Options.EnableSegmentedStacks) { // Regular stack
+ !MF.shouldSplitStack()) { // Regular stack
uint64_t MinSize = X86FI->getCalleeSavedFrameSize();
if (HasFP) MinSize += SlotSize;
StackSize = std::max(MinSize, StackSize > 128 ? StackSize - 128 : 0);
@@ -511,15 +514,16 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const {
// Define the current CFA rule to use the provided offset.
assert(StackSize);
unsigned CFIIndex = MMI.addFrameInst(
- MCCFIInstruction::createDefCfaOffset(0, 2 * stackGrowth));
- BuildMI(MBB, MBBI, DL, TII.get(X86::CFI_INSTRUCTION))
+ MCCFIInstruction::createDefCfaOffset(nullptr, 2 * stackGrowth));
+ BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
.addCFIIndex(CFIIndex);
// Change the rule for the FramePtr to be an "offset" rule.
unsigned DwarfFramePtr = RegInfo->getDwarfRegNum(FramePtr, true);
CFIIndex = MMI.addFrameInst(
- MCCFIInstruction::createOffset(0, DwarfFramePtr, 2 * stackGrowth));
- BuildMI(MBB, MBBI, DL, TII.get(X86::CFI_INSTRUCTION))
+ MCCFIInstruction::createOffset(nullptr,
+ DwarfFramePtr, 2 * stackGrowth));
+ BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
.addCFIIndex(CFIIndex);
}
@@ -534,8 +538,8 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const {
// Define the current CFA to use the EBP/RBP register.
unsigned DwarfFramePtr = RegInfo->getDwarfRegNum(FramePtr, true);
unsigned CFIIndex = MMI.addFrameInst(
- MCCFIInstruction::createDefCfaRegister(0, DwarfFramePtr));
- BuildMI(MBB, MBBI, DL, TII.get(X86::CFI_INSTRUCTION))
+ MCCFIInstruction::createDefCfaRegister(nullptr, DwarfFramePtr));
+ BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
.addCFIIndex(CFIIndex);
}
@@ -564,7 +568,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const {
assert(StackSize);
unsigned CFIIndex = MMI.addFrameInst(
MCCFIInstruction::createDefCfaOffset(nullptr, StackOffset));
- BuildMI(MBB, MBBI, DL, TII.get(X86::CFI_INSTRUCTION))
+ BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
.addCFIIndex(CFIIndex);
StackOffset += stackGrowth;
}
@@ -698,9 +702,10 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const {
// Define the current CFA rule to use the provided offset.
assert(StackSize);
unsigned CFIIndex = MMI.addFrameInst(
- MCCFIInstruction::createDefCfaOffset(0, -StackSize + stackGrowth));
+ MCCFIInstruction::createDefCfaOffset(nullptr,
+ -StackSize + stackGrowth));
- BuildMI(MBB, MBBI, DL, TII.get(X86::CFI_INSTRUCTION))
+ BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
.addCFIIndex(CFIIndex);
}
@@ -905,7 +910,8 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
}
}
-int X86FrameLowering::getFrameIndexOffset(const MachineFunction &MF, int FI) const {
+int X86FrameLowering::getFrameIndexOffset(const MachineFunction &MF,
+ int FI) const {
const X86RegisterInfo *RegInfo =
static_cast<const X86RegisterInfo*>(MF.getTarget().getRegisterInfo());
const MachineFrameInfo *MFI = MF.getFrameInfo();
@@ -1170,6 +1176,15 @@ X86FrameLowering::adjustForSegmentedStacks(MachineFunction &MF) const {
!STI.isTargetWin32() && !STI.isTargetWin64() && !STI.isTargetFreeBSD())
report_fatal_error("Segmented stacks not supported on this platform.");
+ // Eventually StackSize will be calculated by a link-time pass, which will
+ // also decide whether checking code needs to be injected into this particular
+ // prologue.
+ StackSize = MFI->getStackSize();
+
+ // Do not generate a prologue for functions with a stack of size zero
+ if (StackSize == 0)
+ return;
+
MachineBasicBlock *allocMBB = MF.CreateMachineBasicBlock();
MachineBasicBlock *checkMBB = MF.CreateMachineBasicBlock();
X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
@@ -1194,11 +1209,6 @@ X86FrameLowering::adjustForSegmentedStacks(MachineFunction &MF) const {
MF.push_front(allocMBB);
MF.push_front(checkMBB);
- // Eventually StackSize will be calculated by a link-time pass; which will
- // also decide whether checking code needs to be injected into this particular
- // prologue.
- StackSize = MFI->getStackSize();
-
// When the frame size is less than 256 we just compare the stack
// boundary directly to the value of the stack pointer, per gcc.
bool CompareStackPointer = StackSize < kSplitStackAvailable;
@@ -1256,22 +1266,23 @@ X86FrameLowering::adjustForSegmentedStacks(MachineFunction &MF) const {
.addReg(0).addImm(0).addReg(0).addImm(TlsOffset).addReg(TlsReg);
} else if (STI.isTargetDarwin()) {
- // TlsOffset doesn't fit into a mod r/m byte so we need an extra register
+ // TlsOffset doesn't fit into a mod r/m byte so we need an extra register.
unsigned ScratchReg2;
bool SaveScratch2;
if (CompareStackPointer) {
- // The primary scratch register is available for holding the TLS offset
+ // The primary scratch register is available for holding the TLS offset.
ScratchReg2 = GetScratchRegister(Is64Bit, MF, true);
SaveScratch2 = false;
} else {
// Need to use a second register to hold the TLS offset
ScratchReg2 = GetScratchRegister(Is64Bit, MF, false);
- // Unfortunately, with fastcc the second scratch register may hold an arg
+ // Unfortunately, with fastcc the second scratch register may hold an
+ // argument.
SaveScratch2 = MF.getRegInfo().isLiveIn(ScratchReg2);
}
- // If Scratch2 is live-in then it needs to be saved
+ // If Scratch2 is live-in then it needs to be saved.
assert((!MF.getRegInfo().isLiveIn(ScratchReg2) || SaveScratch2) &&
"Scratch register is live-in and not saved");
@@ -1348,14 +1359,14 @@ X86FrameLowering::adjustForSegmentedStacks(MachineFunction &MF) const {
/// http://publications.uu.se/uu/fulltext/nbn_se_uu_diva-2688.pdf)
///
/// CheckStack:
-/// temp0 = sp - MaxStack
-/// if( temp0 < SP_LIMIT(P) ) goto IncStack else goto OldStart
+/// temp0 = sp - MaxStack
+/// if( temp0 < SP_LIMIT(P) ) goto IncStack else goto OldStart
/// OldStart:
-/// ...
+/// ...
/// IncStack:
-/// call inc_stack # doubles the stack space
-/// temp0 = sp - MaxStack
-/// if( temp0 < SP_LIMIT(P) ) goto IncStack else goto OldStart
+/// call inc_stack # doubles the stack space
+/// temp0 = sp - MaxStack
+/// if( temp0 < SP_LIMIT(P) ) goto IncStack else goto OldStart
void X86FrameLowering::adjustForHiPEPrologue(MachineFunction &MF) const {
const X86InstrInfo &TII = *TM.getInstrInfo();
MachineFrameInfo *MFI = MF.getFrameInfo();
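A host-side simulation of the CheckStack/IncStack scheme sketched in the comment above (all numbers are invented; inc_stack is modeled as doubling a counter rather than calling the HiPE runtime):

#include <cstdio>

int main() {
  long Sp = 1000, Limit = 900, MaxStack = 300, StackSpace = 200;
  while (Sp - MaxStack < Limit) { // temp0 = sp - MaxStack; compare to limit
    StackSpace *= 2;              // "call inc_stack": doubles the space
    Limit = Sp - StackSpace;      // new limit leaves the headroom below sp
    std::printf("grew stack to %ld\n", StackSpace);
  }
  std::printf("enough headroom, falling through to OldStart\n");
  return 0;
}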
@@ -1514,7 +1525,7 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
unsigned StackAlign = TM.getFrameLowering()->getStackAlignment();
Amount = (Amount + StackAlign - 1) / StackAlign * StackAlign;
- MachineInstr *New = 0;
+ MachineInstr *New = nullptr;
if (Opcode == TII.getCallFrameSetupOpcode()) {
New = BuildMI(MF, DL, TII.get(getSUBriOpcode(IsLP64, Amount)),
StackPtr)
diff --git a/lib/Target/X86/X86FrameLowering.h b/lib/Target/X86/X86FrameLowering.h
index f0db8cb..208bb8b 100644
--- a/lib/Target/X86/X86FrameLowering.h
+++ b/lib/Target/X86/X86FrameLowering.h
@@ -47,7 +47,7 @@ public:
void adjustForHiPEPrologue(MachineFunction &MF) const override;
void processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
- RegScavenger *RS = NULL) const override;
+ RegScavenger *RS = nullptr) const override;
bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp
index 3e45adb..74386d3 100644
--- a/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -12,7 +12,6 @@
//
//===----------------------------------------------------------------------===//
-#define DEBUG_TYPE "x86-isel"
#include "X86.h"
#include "X86InstrBuilder.h"
#include "X86MachineFunctionInfo.h"
@@ -36,6 +35,8 @@
#include "llvm/Target/TargetOptions.h"
using namespace llvm;
+#define DEBUG_TYPE "x86-isel"
+
STATISTIC(NumLoadMoved, "Number of loads moved below TokenFactor");
//===----------------------------------------------------------------------===//
@@ -70,17 +71,18 @@ namespace {
X86ISelAddressMode()
: BaseType(RegBase), Base_FrameIndex(0), Scale(1), IndexReg(), Disp(0),
- Segment(), GV(0), CP(0), BlockAddr(0), ES(0), JT(-1), Align(0),
- SymbolFlags(X86II::MO_NO_FLAG) {
+ Segment(), GV(nullptr), CP(nullptr), BlockAddr(nullptr), ES(nullptr),
+ JT(-1), Align(0), SymbolFlags(X86II::MO_NO_FLAG) {
}
bool hasSymbolicDisplacement() const {
- return GV != 0 || CP != 0 || ES != 0 || JT != -1 || BlockAddr != 0;
+ return GV != nullptr || CP != nullptr || ES != nullptr ||
+ JT != -1 || BlockAddr != nullptr;
}
bool hasBaseOrIndexReg() const {
return BaseType == FrameIndexBase ||
- IndexReg.getNode() != 0 || Base_Reg.getNode() != 0;
+ IndexReg.getNode() != nullptr || Base_Reg.getNode() != nullptr;
}
/// isRIPRelative - Return true if this addressing mode is already RIP
@@ -102,14 +104,14 @@ namespace {
void dump() {
dbgs() << "X86ISelAddressMode " << this << '\n';
dbgs() << "Base_Reg ";
- if (Base_Reg.getNode() != 0)
+ if (Base_Reg.getNode())
Base_Reg.getNode()->dump();
else
dbgs() << "nul";
dbgs() << " Base.FrameIndex " << Base_FrameIndex << '\n'
<< " Scale" << Scale << '\n'
<< "IndexReg ";
- if (IndexReg.getNode() != 0)
+ if (IndexReg.getNode())
IndexReg.getNode()->dump();
else
dbgs() << "nul";
@@ -160,6 +162,13 @@ namespace {
return "X86 DAG->DAG Instruction Selection";
}
+ bool runOnMachineFunction(MachineFunction &MF) override {
+ // Reset the subtarget each time through.
+ Subtarget = &TM.getSubtarget<X86Subtarget>();
+ SelectionDAGISel::runOnMachineFunction(MF);
+ return true;
+ }
+
void EmitFunctionEntryCode() override;
bool IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const override;
@@ -374,14 +383,13 @@ static void MoveBelowOrigChain(SelectionDAG *CurDAG, SDValue Load,
else
Ops.push_back(Chain.getOperand(i));
SDValue NewChain =
- CurDAG->getNode(ISD::TokenFactor, SDLoc(Load),
- MVT::Other, &Ops[0], Ops.size());
+ CurDAG->getNode(ISD::TokenFactor, SDLoc(Load), MVT::Other, Ops);
Ops.clear();
Ops.push_back(NewChain);
}
for (unsigned i = 1, e = OrigChain.getNumOperands(); i != e; ++i)
Ops.push_back(OrigChain.getOperand(i));
- CurDAG->UpdateNodeOperands(OrigChain.getNode(), &Ops[0], Ops.size());
+ CurDAG->UpdateNodeOperands(OrigChain.getNode(), Ops);
CurDAG->UpdateNodeOperands(Load.getNode(), Call.getOperand(0),
Load.getOperand(1), Load.getOperand(2));
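This hunk is one of several below that migrate (&Ops[0], Ops.size()) pairs to a single container argument, matching SelectionDAG's new ArrayRef-based signatures. A minimal stand-in (not llvm::ArrayRef itself) showing the API shape:

#include <cstddef>
#include <cstdio>
#include <vector>

// Minimal stand-in for llvm::ArrayRef: a cheap, non-owning view that a
// std::vector (or SmallVector) converts to implicitly.
template <typename T> struct ArrayRefLike {
  const T *Data;
  std::size_t Size;
  ArrayRefLike(const std::vector<T> &V) : Data(V.data()), Size(V.size()) {}
};

// Hypothetical consumer in the style of the new UpdateNodeOperands.
static void updateNodeOperands(ArrayRefLike<int> Ops) {
  for (std::size_t i = 0; i != Ops.Size; ++i)
    std::printf("operand %d\n", Ops.Data[i]);
}

int main() {
  std::vector<int> Ops = {1, 2, 3};
  updateNodeOperands(Ops); // no more &Ops[0] / Ops.size() at every call site
  return 0;
}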
@@ -390,7 +398,7 @@ static void MoveBelowOrigChain(SelectionDAG *CurDAG, SDValue Load,
Ops.push_back(SDValue(Load.getNode(), 1));
for (unsigned i = 1, e = NumOps; i != e; ++i)
Ops.push_back(Call.getOperand(i));
- CurDAG->UpdateNodeOperands(Call.getNode(), &Ops[0], NumOps);
+ CurDAG->UpdateNodeOperands(Call.getNode(), Ops);
}
/// isCalleeLoad - Return true if call address is a load and it can be
@@ -612,7 +620,7 @@ bool X86DAGToDAGISel::MatchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM){
// gs:0 (or fs:0 on X86-64) contains its own address.
// For more information see http://people.redhat.com/drepper/tls.pdf
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Address))
- if (C->getSExtValue() == 0 && AM.Segment.getNode() == 0 &&
+ if (C->getSExtValue() == 0 && AM.Segment.getNode() == nullptr &&
Subtarget->isTargetLinux())
switch (N->getPointerInfo().getAddrSpace()) {
case 256:
@@ -733,7 +741,7 @@ bool X86DAGToDAGISel::MatchAddress(SDValue N, X86ISelAddressMode &AM) {
// a smaller encoding and avoids a scaled-index.
if (AM.Scale == 2 &&
AM.BaseType == X86ISelAddressMode::RegBase &&
- AM.Base_Reg.getNode() == 0) {
+ AM.Base_Reg.getNode() == nullptr) {
AM.Base_Reg = AM.IndexReg;
AM.Scale = 1;
}
@@ -745,8 +753,8 @@ bool X86DAGToDAGISel::MatchAddress(SDValue N, X86ISelAddressMode &AM) {
Subtarget->is64Bit() &&
AM.Scale == 1 &&
AM.BaseType == X86ISelAddressMode::RegBase &&
- AM.Base_Reg.getNode() == 0 &&
- AM.IndexReg.getNode() == 0 &&
+ AM.Base_Reg.getNode() == nullptr &&
+ AM.IndexReg.getNode() == nullptr &&
AM.SymbolFlags == X86II::MO_NO_FLAG &&
AM.hasSymbolicDisplacement())
AM.Base_Reg = CurDAG->getRegister(X86::RIP, MVT::i64);
@@ -926,7 +934,7 @@ static bool FoldMaskAndShiftToScale(SelectionDAG &DAG, SDValue N,
APInt MaskedHighBits =
APInt::getHighBitsSet(X.getSimpleValueType().getSizeInBits(), MaskLZ);
APInt KnownZero, KnownOne;
- DAG.ComputeMaskedBits(X, KnownZero, KnownOne);
+ DAG.computeKnownBits(X, KnownZero, KnownOne);
if (MaskedHighBits != KnownZero) return true;
// We've identified a pattern that can be transformed into a single shift
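Besides the nullptr changes, this hunk picks up the ComputeMaskedBits-to-computeKnownBits rename. The surrounding fold only fires when the bits above the mask are provably zero; a standalone sketch of that check (KnownZero is supplied directly here instead of being computed from the DAG):

#include <cstdint>
#include <cstdio>

// Mirrors the guard around the call: build the set of bits above the
// mask's top set bit (MaskLZ leading zeros) and require that exactly
// those bits are known zero, as the pass conservatively does.
static bool foldIsSafe(uint64_t KnownZero, unsigned MaskLZ) {
  uint64_t MaskedHighBits =
      MaskLZ == 0 ? 0 : ~uint64_t(0) << (64 - MaskLZ);
  return MaskedHighBits == KnownZero; // any mismatch means bail out
}

int main() {
  // A zero-extended 8-bit value: bits 8..63 known zero, mask has 56
  // leading zeros, so the shift can be folded into the address scale.
  std::printf("%d\n", foldIsSafe(~uint64_t(0) << 8, 56)); // 1: safe
  std::printf("%d\n", foldIsSafe(0, 56));                 // 0: nothing known
  return 0;
}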
@@ -1009,7 +1017,7 @@ bool X86DAGToDAGISel::MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
case ISD::FrameIndex:
if (AM.BaseType == X86ISelAddressMode::RegBase &&
- AM.Base_Reg.getNode() == 0 &&
+ AM.Base_Reg.getNode() == nullptr &&
(!Subtarget->is64Bit() || isDispSafeForFrameIndex(AM.Disp))) {
AM.BaseType = X86ISelAddressMode::FrameIndexBase;
AM.Base_FrameIndex = cast<FrameIndexSDNode>(N)->getIndex();
@@ -1018,7 +1026,7 @@ bool X86DAGToDAGISel::MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
break;
case ISD::SHL:
- if (AM.IndexReg.getNode() != 0 || AM.Scale != 1)
+ if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1)
break;
if (ConstantSDNode
@@ -1052,7 +1060,7 @@ bool X86DAGToDAGISel::MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
case ISD::SRL: {
// Scale must not be used already.
- if (AM.IndexReg.getNode() != 0 || AM.Scale != 1) break;
+ if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break;
SDValue And = N.getOperand(0);
if (And.getOpcode() != ISD::AND) break;
@@ -1086,8 +1094,8 @@ bool X86DAGToDAGISel::MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
case X86ISD::MUL_IMM:
// X*[3,5,9] -> X+X*[2,4,8]
if (AM.BaseType == X86ISelAddressMode::RegBase &&
- AM.Base_Reg.getNode() == 0 &&
- AM.IndexReg.getNode() == 0) {
+ AM.Base_Reg.getNode() == nullptr &&
+ AM.IndexReg.getNode() == nullptr) {
if (ConstantSDNode
*CN = dyn_cast<ConstantSDNode>(N.getNode()->getOperand(1)))
if (CN->getZExtValue() == 3 || CN->getZExtValue() == 5 ||
@@ -1237,7 +1245,7 @@ bool X86DAGToDAGISel::MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
// with a constant to enable use of the scaled offset field.
// Scale must not be used already.
- if (AM.IndexReg.getNode() != 0 || AM.Scale != 1) break;
+ if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break;
SDValue Shift = N.getOperand(0);
if (Shift.getOpcode() != ISD::SRL && Shift.getOpcode() != ISD::SHL) break;
@@ -1276,7 +1284,7 @@ bool X86DAGToDAGISel::MatchAddressBase(SDValue N, X86ISelAddressMode &AM) {
// Is the base register already occupied?
if (AM.BaseType != X86ISelAddressMode::RegBase || AM.Base_Reg.getNode()) {
// If so, check to see if the scale index register is set.
- if (AM.IndexReg.getNode() == 0) {
+ if (!AM.IndexReg.getNode()) {
AM.IndexReg = N;
AM.Scale = 1;
return false;
@@ -1567,7 +1575,7 @@ SDNode *X86DAGToDAGISel::SelectAtomic64(SDNode *Node, unsigned Opc) {
SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
if (!SelectAddr(Node, In1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4))
- return NULL;
+ return nullptr;
MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
MemOp[0] = cast<MemSDNode>(Node)->getMemOperand();
const SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, In2L, In2H, Chain};
@@ -1756,7 +1764,7 @@ static SDValue getAtomicLoadArithTargetConstant(SelectionDAG *CurDAG,
SDNode *X86DAGToDAGISel::SelectAtomicLoadArith(SDNode *Node, MVT NVT) {
if (Node->hasAnyUseOfValue(0))
- return 0;
+ return nullptr;
SDLoc dl(Node);
@@ -1768,13 +1776,13 @@ SDNode *X86DAGToDAGISel::SelectAtomicLoadArith(SDNode *Node, MVT NVT) {
SDValue Val = Node->getOperand(2);
SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
if (!SelectAddr(Node, Ptr, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4))
- return 0;
+ return nullptr;
// Which index into the table.
enum AtomicOpc Op;
switch (Node->getOpcode()) {
default:
- return 0;
+ return nullptr;
case ISD::ATOMIC_LOAD_OR:
Op = OR;
break;
@@ -1795,7 +1803,7 @@ SDNode *X86DAGToDAGISel::SelectAtomicLoadArith(SDNode *Node, MVT NVT) {
unsigned Opc = 0;
switch (NVT.SimpleTy) {
- default: return 0;
+ default: return nullptr;
case MVT::i8:
if (isCN)
Opc = AtomicOpcTbl[Op][ConstantI8];
@@ -1847,7 +1855,7 @@ SDNode *X86DAGToDAGISel::SelectAtomicLoadArith(SDNode *Node, MVT NVT) {
}
cast<MachineSDNode>(Ret)->setMemRefs(MemOp, MemOp + 1);
SDValue RetVals[] = { Undef, Ret };
- return CurDAG->getMergeValues(RetVals, 2, dl).getNode();
+ return CurDAG->getMergeValues(RetVals, dl).getNode();
}
/// HasNoSignedComparisonUses - Test whether the given X86ISD::CMP node has
@@ -1990,7 +1998,7 @@ static bool isLoadIncOrDecStore(StoreSDNode *StoreNode, unsigned Opc,
// Make a new TokenFactor with all the other input chains except
// for the load.
InputChain = CurDAG->getNode(ISD::TokenFactor, SDLoc(Chain),
- MVT::Other, &ChainOps[0], ChainOps.size());
+ MVT::Other, ChainOps);
}
if (!ChainCheck)
return false;
@@ -2027,7 +2035,7 @@ SDNode *X86DAGToDAGISel::SelectGather(SDNode *Node, unsigned Opc) {
SDValue VMask = Node->getOperand(5);
ConstantSDNode *Scale = dyn_cast<ConstantSDNode>(Node->getOperand(6));
if (!Scale)
- return 0;
+ return nullptr;
SDVTList VTs = CurDAG->getVTList(VSrc.getValueType(), VSrc.getValueType(),
MVT::Other);
@@ -2058,7 +2066,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
if (Node->isMachineOpcode()) {
DEBUG(dbgs() << "== "; Node->dump(CurDAG); dbgs() << '\n');
Node->setNodeId(-1);
- return NULL; // Already selected.
+ return nullptr; // Already selected.
}
switch (Opcode) {
@@ -2108,7 +2116,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
SDNode *RetVal = SelectGather(Node, Opc);
if (RetVal)
// We already called ReplaceUses inside SelectGather.
- return NULL;
+ return nullptr;
break;
}
}
@@ -2259,7 +2267,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
ReplaceUses(SDValue(Node, 1), SDValue(CNode, 1));
ReplaceUses(SDValue(Node, 2), SDValue(CNode, 2));
- return NULL;
+ return nullptr;
}
case ISD::SMUL_LOHI:
@@ -2386,7 +2394,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
}
// Copy the low half of the result, if it is needed.
if (!SDValue(Node, 0).use_empty()) {
- if (ResLo.getNode() == 0) {
+ if (!ResLo.getNode()) {
assert(LoReg && "Register for low half is not defined!");
ResLo = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, LoReg, NVT,
InFlag);
@@ -2397,7 +2405,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
}
// Copy the high half of the result, if it is needed.
if (!SDValue(Node, 1).use_empty()) {
- if (ResHi.getNode() == 0) {
+ if (!ResHi.getNode()) {
assert(HiReg && "Register for high half is not defined!");
ResHi = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, HiReg, NVT,
InFlag);
@@ -2407,7 +2415,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
DEBUG(dbgs() << "=> "; ResHi.getNode()->dump(CurDAG); dbgs() << '\n');
}
- return NULL;
+ return nullptr;
}
case ISD::SDIVREM:
@@ -2575,7 +2583,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
ReplaceUses(SDValue(Node, 1), Result);
DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG); dbgs() << '\n');
}
- return NULL;
+ return nullptr;
}
case X86ISD::CMP:
@@ -2632,7 +2640,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
// one, do not call ReplaceAllUsesWith.
ReplaceUses(SDValue(Node, (Opcode == X86ISD::SUB ? 1 : 0)),
SDValue(NewNode, 0));
- return NULL;
+ return nullptr;
}
// For example, "testl %eax, $2048" to "testb %ah, $8".
@@ -2669,7 +2677,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
// one, do not call ReplaceAllUsesWith.
ReplaceUses(SDValue(Node, (Opcode == X86ISD::SUB ? 1 : 0)),
SDValue(NewNode, 0));
- return NULL;
+ return nullptr;
}
// For example, "testl %eax, $32776" to "testw %ax, $32776".
@@ -2691,7 +2699,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
// one, do not call ReplaceAllUsesWith.
ReplaceUses(SDValue(Node, (Opcode == X86ISD::SUB ? 1 : 0)),
SDValue(NewNode, 0));
- return NULL;
+ return nullptr;
}
// For example, "testq %rax, $268468232" to "testl %eax, $268468232".
@@ -2713,7 +2721,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
// one, do not call ReplaceAllUsesWith.
ReplaceUses(SDValue(Node, (Opcode == X86ISD::SUB ? 1 : 0)),
SDValue(NewNode, 0));
- return NULL;
+ return nullptr;
}
}
break;
@@ -2740,7 +2748,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
SDValue StoredVal = StoreNode->getOperand(1);
unsigned Opc = StoredVal->getOpcode();
- LoadSDNode *LoadNode = 0;
+ LoadSDNode *LoadNode = nullptr;
SDValue InputChain;
if (!isLoadIncOrDecStore(StoreNode, Opc, StoredVal, CurDAG,
LoadNode, InputChain))
@@ -2772,7 +2780,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
SDNode *ResNode = SelectCode(Node);
DEBUG(dbgs() << "=> ";
- if (ResNode == NULL || ResNode == Node)
+ if (ResNode == nullptr || ResNode == Node)
Node->dump(CurDAG);
else
ResNode->dump(CurDAG);
@@ -2790,7 +2798,7 @@ SelectInlineAsmMemoryOperand(const SDValue &Op, char ConstraintCode,
case 'v': // not offsetable ??
default: return true;
case 'm': // memory
- if (!SelectAddr(0, Op, Op0, Op1, Op2, Op3, Op4))
+ if (!SelectAddr(nullptr, Op, Op0, Op1, Op2, Op3, Op4))
return true;
break;
}
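SelectInlineAsmMemoryOperand now passes nullptr for the root node when folding an address for an 'm' constraint. For context, the kind of user code that reaches this path (GCC/Clang extended asm; guarded so the example still builds elsewhere):

#include <cstdio>

int main() {
  int X = 41;
  // The 'm' constraint hands the compiler a memory operand; on x86 this
  // exercises the SelectAddr path shown above.
#if defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__))
  __asm__("incl %0" : "+m"(X));
#else
  ++X; // fallback so the example still runs on other targets
#endif
  std::printf("%d\n", X); // prints 42
  return 0;
}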
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 2a35061..cbaf44e 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -12,7 +12,6 @@
//
//===----------------------------------------------------------------------===//
-#define DEBUG_TYPE "x86-isel"
#include "X86ISelLowering.h"
#include "Utils/X86ShuffleDecode.h"
#include "X86CallingConv.h"
@@ -23,6 +22,7 @@
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringSwitch.h"
#include "llvm/ADT/VariadicFunction.h"
#include "llvm/CodeGen/IntrinsicLowering.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
@@ -52,6 +52,8 @@
#include <cctype>
using namespace llvm;
+#define DEBUG_TYPE "x86-isel"
+
STATISTIC(NumTailCalls, "Number of tail calls");
// Forward declarations.
@@ -84,7 +86,8 @@ static SDValue ExtractSubVector(SDValue Vec, unsigned IdxVal,
// If the input is a buildvector just emit a smaller one.
if (Vec.getOpcode() == ISD::BUILD_VECTOR)
return DAG.getNode(ISD::BUILD_VECTOR, dl, ResultVT,
- Vec->op_begin()+NormalizedIdxVal, ElemsPerChunk);
+ makeArrayRef(Vec->op_begin()+NormalizedIdxVal,
+ ElemsPerChunk));
SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal);
SDValue Result = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec,
@@ -265,10 +268,10 @@ void X86TargetLowering::resetOperationActions() {
// The _ftol2 runtime function has an unusual calling conv, which
// is modeled by a special pseudo-instruction.
- setLibcallName(RTLIB::FPTOUINT_F64_I64, 0);
- setLibcallName(RTLIB::FPTOUINT_F32_I64, 0);
- setLibcallName(RTLIB::FPTOUINT_F64_I32, 0);
- setLibcallName(RTLIB::FPTOUINT_F32_I32, 0);
+ setLibcallName(RTLIB::FPTOUINT_F64_I64, nullptr);
+ setLibcallName(RTLIB::FPTOUINT_F32_I64, nullptr);
+ setLibcallName(RTLIB::FPTOUINT_F64_I32, nullptr);
+ setLibcallName(RTLIB::FPTOUINT_F32_I32, nullptr);
}
if (Subtarget->isTargetDarwin()) {
@@ -635,15 +638,8 @@ void X86TargetLowering::resetOperationActions() {
setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
- if (Subtarget->isOSWindows() && !Subtarget->isTargetMacho())
- setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ?
- MVT::i64 : MVT::i32, Custom);
- else if (TM.Options.EnableSegmentedStacks)
- setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ?
- MVT::i64 : MVT::i32, Custom);
- else
- setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ?
- MVT::i64 : MVT::i32, Expand);
+ setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ?
+ MVT::i64 : MVT::i32, Custom);
if (!TM.Options.UseSoftFloat && X86ScalarSSEf64) {
// f32 and f64 use SSE.
@@ -832,7 +828,9 @@ void X86TargetLowering::resetOperationActions() {
setOperationAction(ISD::FRINT, VT, Expand);
setOperationAction(ISD::FNEARBYINT, VT, Expand);
setOperationAction(ISD::SMUL_LOHI, VT, Expand);
+ setOperationAction(ISD::MULHS, VT, Expand);
setOperationAction(ISD::UMUL_LOHI, VT, Expand);
+ setOperationAction(ISD::MULHU, VT, Expand);
setOperationAction(ISD::SDIVREM, VT, Expand);
setOperationAction(ISD::UDIVREM, VT, Expand);
setOperationAction(ISD::FPOW, VT, Expand);
@@ -944,6 +942,10 @@ void X86TargetLowering::resetOperationActions() {
setOperationAction(ISD::ADD, MVT::v2i64, Legal);
setOperationAction(ISD::MUL, MVT::v4i32, Custom);
setOperationAction(ISD::MUL, MVT::v2i64, Custom);
+ setOperationAction(ISD::UMUL_LOHI, MVT::v4i32, Custom);
+ setOperationAction(ISD::SMUL_LOHI, MVT::v4i32, Custom);
+ setOperationAction(ISD::MULHU, MVT::v8i16, Legal);
+ setOperationAction(ISD::MULHS, MVT::v8i16, Legal);
setOperationAction(ISD::SUB, MVT::v16i8, Legal);
setOperationAction(ISD::SUB, MVT::v8i16, Legal);
setOperationAction(ISD::SUB, MVT::v4i32, Legal);
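The new MULHU/MULHS (and *MUL_LOHI) entries let the DAG use SSE2's pmulhw/pmulhuw directly. A scalar model of what those nodes compute per v8i16 lane:

#include <cstdint>
#include <cstdio>

// ISD::MULHS / ISD::MULHU per lane: the high 16 bits of a widened
// 16x16->32 multiply, signed vs. unsigned (pmulhw / pmulhuw on SSE2).
static int16_t mulhs(int16_t a, int16_t b) {
  return (int16_t)(((int32_t)a * b) >> 16);
}
static uint16_t mulhu(uint16_t a, uint16_t b) {
  return (uint16_t)(((uint32_t)a * b) >> 16);
}

int main() {
  std::printf("%d\n", mulhs(-32768, 2)); // -1: sign-extended high half
  std::printf("%u\n", mulhu(0x8000, 2)); //  1: unsigned high half
  return 0;
}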
@@ -1036,6 +1038,10 @@ void X86TargetLowering::resetOperationActions() {
setOperationAction(ISD::FP_ROUND, MVT::v2f32, Custom);
setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, Legal);
+
+ setOperationAction(ISD::BITCAST, MVT::v2i32, Custom);
+ setOperationAction(ISD::BITCAST, MVT::v4i16, Custom);
+ setOperationAction(ISD::BITCAST, MVT::v8i8, Custom);
}
if (!TM.Options.UseSoftFloat && Subtarget->hasSSE41()) {
@@ -1064,11 +1070,14 @@ void X86TargetLowering::resetOperationActions() {
// FIXME: Do we need to handle scalar-to-vector here?
setOperationAction(ISD::MUL, MVT::v4i32, Legal);
- setOperationAction(ISD::VSELECT, MVT::v2f64, Legal);
- setOperationAction(ISD::VSELECT, MVT::v2i64, Legal);
+ setOperationAction(ISD::VSELECT, MVT::v2f64, Custom);
+ setOperationAction(ISD::VSELECT, MVT::v2i64, Custom);
+ setOperationAction(ISD::VSELECT, MVT::v4i32, Custom);
+ setOperationAction(ISD::VSELECT, MVT::v4f32, Custom);
+ setOperationAction(ISD::VSELECT, MVT::v8i16, Custom);
+ // There is no BLENDI for byte vectors, so we don't need to custom
+ // lower v16i8 vselects for now.
setOperationAction(ISD::VSELECT, MVT::v16i8, Legal);
- setOperationAction(ISD::VSELECT, MVT::v4i32, Legal);
- setOperationAction(ISD::VSELECT, MVT::v4f32, Legal);
// i8 and i16 vectors are custom, because the source register and source
// memory operand types are not the same width. f32 vectors are
@@ -1111,9 +1120,6 @@ void X86TargetLowering::resetOperationActions() {
setOperationAction(ISD::SHL, MVT::v4i32, Custom);
setOperationAction(ISD::SRA, MVT::v4i32, Custom);
-
- setOperationAction(ISD::SDIV, MVT::v8i16, Custom);
- setOperationAction(ISD::SDIV, MVT::v4i32, Custom);
}
if (!TM.Options.UseSoftFloat && Subtarget->hasFp256()) {
@@ -1178,8 +1184,6 @@ void X86TargetLowering::resetOperationActions() {
setOperationAction(ISD::SRA, MVT::v16i16, Custom);
setOperationAction(ISD::SRA, MVT::v32i8, Custom);
- setOperationAction(ISD::SDIV, MVT::v16i16, Custom);
-
setOperationAction(ISD::SETCC, MVT::v32i8, Custom);
setOperationAction(ISD::SETCC, MVT::v16i16, Custom);
setOperationAction(ISD::SETCC, MVT::v8i32, Custom);
@@ -1189,10 +1193,10 @@ void X86TargetLowering::resetOperationActions() {
setOperationAction(ISD::SELECT, MVT::v4i64, Custom);
setOperationAction(ISD::SELECT, MVT::v8f32, Custom);
- setOperationAction(ISD::VSELECT, MVT::v4f64, Legal);
- setOperationAction(ISD::VSELECT, MVT::v4i64, Legal);
- setOperationAction(ISD::VSELECT, MVT::v8i32, Legal);
- setOperationAction(ISD::VSELECT, MVT::v8f32, Legal);
+ setOperationAction(ISD::VSELECT, MVT::v4f64, Custom);
+ setOperationAction(ISD::VSELECT, MVT::v4i64, Custom);
+ setOperationAction(ISD::VSELECT, MVT::v8i32, Custom);
+ setOperationAction(ISD::VSELECT, MVT::v8f32, Custom);
setOperationAction(ISD::SIGN_EXTEND, MVT::v4i64, Custom);
setOperationAction(ISD::SIGN_EXTEND, MVT::v8i32, Custom);
@@ -1232,9 +1236,13 @@ void X86TargetLowering::resetOperationActions() {
setOperationAction(ISD::MUL, MVT::v16i16, Legal);
// Don't lower v32i8 because there is no 128-bit byte mul
- setOperationAction(ISD::VSELECT, MVT::v32i8, Legal);
+ setOperationAction(ISD::UMUL_LOHI, MVT::v8i32, Custom);
+ setOperationAction(ISD::SMUL_LOHI, MVT::v8i32, Custom);
+ setOperationAction(ISD::MULHU, MVT::v16i16, Legal);
+ setOperationAction(ISD::MULHS, MVT::v16i16, Legal);
- setOperationAction(ISD::SDIV, MVT::v8i32, Custom);
+ setOperationAction(ISD::VSELECT, MVT::v16i16, Custom);
+ setOperationAction(ISD::VSELECT, MVT::v32i8, Legal);
} else {
setOperationAction(ISD::ADD, MVT::v4i64, Custom);
setOperationAction(ISD::ADD, MVT::v8i32, Custom);
@@ -1343,7 +1351,6 @@ void X86TargetLowering::resetOperationActions() {
setOperationAction(ISD::FNEG, MVT::v8f64, Custom);
setOperationAction(ISD::FMA, MVT::v8f64, Legal);
setOperationAction(ISD::FMA, MVT::v16f32, Legal);
- setOperationAction(ISD::SDIV, MVT::v16i32, Custom);
setOperationAction(ISD::FP_TO_SINT, MVT::i32, Legal);
setOperationAction(ISD::FP_TO_UINT, MVT::i32, Legal);
@@ -1358,9 +1365,11 @@ void X86TargetLowering::resetOperationActions() {
setOperationAction(ISD::FP_TO_SINT, MVT::v16i32, Legal);
setOperationAction(ISD::FP_TO_UINT, MVT::v16i32, Legal);
setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Legal);
+ setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal);
setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Legal);
setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Legal);
setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Legal);
+ setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Legal);
setOperationAction(ISD::FP_ROUND, MVT::v8f32, Legal);
setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Legal);
@@ -1392,6 +1401,8 @@ void X86TargetLowering::resetOperationActions() {
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i1, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i1, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i1, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i1, Custom);
setOperationAction(ISD::BUILD_VECTOR, MVT::v8i1, Custom);
setOperationAction(ISD::BUILD_VECTOR, MVT::v16i1, Custom);
setOperationAction(ISD::SELECT, MVT::v8f64, Custom);
@@ -1474,6 +1485,8 @@ void X86TargetLowering::resetOperationActions() {
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
+ if (!Subtarget->is64Bit())
+ setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom);
// Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
// handle type legalization for these operations here.
@@ -1498,9 +1511,9 @@ void X86TargetLowering::resetOperationActions() {
if (!Subtarget->is64Bit()) {
// These libcalls are not available in 32-bit.
- setLibcallName(RTLIB::SHL_I128, 0);
- setLibcallName(RTLIB::SRL_I128, 0);
- setLibcallName(RTLIB::SRA_I128, 0);
+ setLibcallName(RTLIB::SHL_I128, nullptr);
+ setLibcallName(RTLIB::SRL_I128, nullptr);
+ setLibcallName(RTLIB::SRA_I128, nullptr);
}
// Combine sin / cos into one node or libcall if possible.
@@ -1516,6 +1529,15 @@ void X86TargetLowering::resetOperationActions() {
}
}
+ if (Subtarget->isTargetWin64()) {
+ setOperationAction(ISD::SDIV, MVT::i128, Custom);
+ setOperationAction(ISD::UDIV, MVT::i128, Custom);
+ setOperationAction(ISD::SREM, MVT::i128, Custom);
+ setOperationAction(ISD::UREM, MVT::i128, Custom);
+ setOperationAction(ISD::SDIVREM, MVT::i128, Custom);
+ setOperationAction(ISD::UDIVREM, MVT::i128, Custom);
+ }
+
// We have target-specific dag combine patterns for the following nodes:
setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
@@ -1540,6 +1562,7 @@ void X86TargetLowering::resetOperationActions() {
setTargetDAGCombine(ISD::TRUNCATE);
setTargetDAGCombine(ISD::SINT_TO_FP);
setTargetDAGCombine(ISD::SETCC);
+ setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
if (Subtarget->is64Bit())
setTargetDAGCombine(ISD::MUL);
setTargetDAGCombine(ISD::XOR);
@@ -1738,7 +1761,7 @@ getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
// FIXME: Why this routine is here? Move to RegInfo!
std::pair<const TargetRegisterClass*, uint8_t>
X86TargetLowering::findRepresentativeClass(MVT VT) const{
- const TargetRegisterClass *RRC = 0;
+ const TargetRegisterClass *RRC = nullptr;
uint8_t Cost = 1;
switch (VT.SimpleTy) {
default:
@@ -1806,8 +1829,8 @@ X86TargetLowering::CanLowerReturn(CallingConv::ID CallConv,
return CCInfo.CheckReturn(Outs, RetCC_X86);
}
-const uint16_t *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
- static const uint16_t ScratchRegs[] = { X86::R11, 0 };
+const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
+ static const MCPhysReg ScratchRegs[] = { X86::R11, 0 };
return ScratchRegs;
}
@@ -1930,8 +1953,7 @@ X86TargetLowering::LowerReturn(SDValue Chain,
if (Flag.getNode())
RetOps.push_back(Flag);
- return DAG.getNode(X86ISD::RET_FLAG, dl,
- MVT::Other, &RetOps[0], RetOps.size());
+ return DAG.getNode(X86ISD::RET_FLAG, dl, MVT::Other, RetOps);
}
bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
@@ -2285,22 +2307,25 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
InVals.push_back(ArgValue);
}
- // The x86-64 ABIs require that for returning structs by value we copy
- // the sret argument into %rax/%eax (depending on ABI) for the return.
- // Win32 requires us to put the sret argument to %eax as well.
- // Save the argument into a virtual register so that we can access it
- // from the return points.
- if (MF.getFunction()->hasStructRetAttr() &&
- (Subtarget->is64Bit() || Subtarget->isTargetKnownWindowsMSVC())) {
- X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
- unsigned Reg = FuncInfo->getSRetReturnReg();
- if (!Reg) {
- MVT PtrTy = getPointerTy();
- Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
- FuncInfo->setSRetReturnReg(Reg);
+ if (Subtarget->is64Bit() || Subtarget->isTargetKnownWindowsMSVC()) {
+ for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+ // The x86-64 ABIs require that for returning structs by value we copy
+ // the sret argument into %rax/%eax (depending on ABI) for the return.
+ // Win32 requires us to put the sret argument in %eax as well.
+ // Save the argument into a virtual register so that we can access it
+ // from the return points.
+ if (Ins[i].Flags.isSRet()) {
+ unsigned Reg = FuncInfo->getSRetReturnReg();
+ if (!Reg) {
+ MVT PtrTy = getPointerTy();
+ Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
+ FuncInfo->setSRetReturnReg(Reg);
+ }
+ SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[i]);
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
+ break;
+ }
}
- SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[0]);
- Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
}
unsigned StackSize = CCInfo.getNextStackOffset();
@@ -2320,17 +2345,17 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
unsigned TotalNumIntRegs = 0, TotalNumXMMRegs = 0;
// FIXME: We should really autogenerate these arrays
- static const uint16_t GPR64ArgRegsWin64[] = {
+ static const MCPhysReg GPR64ArgRegsWin64[] = {
X86::RCX, X86::RDX, X86::R8, X86::R9
};
- static const uint16_t GPR64ArgRegs64Bit[] = {
+ static const MCPhysReg GPR64ArgRegs64Bit[] = {
X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
};
- static const uint16_t XMMArgRegs64Bit[] = {
+ static const MCPhysReg XMMArgRegs64Bit[] = {
X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
};
- const uint16_t *GPR64ArgRegs;
+ const MCPhysReg *GPR64ArgRegs;
unsigned NumXMMRegs = 0;
if (IsWin64) {
@@ -2424,13 +2449,11 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
SaveXMMOps.push_back(Val);
}
MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl,
- MVT::Other,
- &SaveXMMOps[0], SaveXMMOps.size()));
+ MVT::Other, SaveXMMOps));
}
if (!MemOps.empty())
- Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
- &MemOps[0], MemOps.size());
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
}
}
@@ -2497,10 +2520,10 @@ X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG,
/// EmitTailCallStoreRetAddr - Emit a store of the return address if tail call
/// optimization is performed and it is required (FPDiff!=0).
-static SDValue
-EmitTailCallStoreRetAddr(SelectionDAG & DAG, MachineFunction &MF,
- SDValue Chain, SDValue RetAddrFrIdx, EVT PtrVT,
- unsigned SlotSize, int FPDiff, SDLoc dl) {
+static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
+ SDValue Chain, SDValue RetAddrFrIdx,
+ EVT PtrVT, unsigned SlotSize,
+ int FPDiff, SDLoc dl) {
// Store the return address to the appropriate stack slot.
if (!FPDiff) return Chain;
// Calculate the new stack slot for the return address.
@@ -2537,7 +2560,13 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
if (MF.getTarget().Options.DisableTailCalls)
isTailCall = false;
- if (isTailCall) {
+ bool IsMustTail = CLI.CS && CLI.CS->isMustTailCall();
+ if (IsMustTail) {
+ // Force this to be a tail call. The verifier rules are enough to ensure
+ // that we can lower this successfully without moving the return address
+ // around.
+ isTailCall = true;
+ } else if (isTailCall) {
// Check if it's really possible to do a tail call.
isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
isVarArg, SR != NotStructReturn,
@@ -2578,7 +2607,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
int FPDiff = 0;
- if (isTailCall && !IsSibcall) {
+ if (isTailCall && !IsSibcall && !IsMustTail) {
// Lower arguments at fp - stackoffset + fpdiff.
X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();
@@ -2683,7 +2712,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
}
} else if (!IsSibcall && (!isTailCall || isByVal)) {
assert(VA.isMemLoc());
- if (StackPtr.getNode() == 0)
+ if (!StackPtr.getNode())
StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
getPointerTy());
MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
@@ -2692,8 +2721,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
}
if (!MemOpChains.empty())
- Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
- &MemOpChains[0], MemOpChains.size());
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
if (Subtarget->isPICStyleGOT()) {
// ELF / PIC requires GOT in the EBX register before function calls via PLT
@@ -2730,7 +2758,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// registers used and is in the range 0 - 8 inclusive.
// Count the number of XMM registers allocated.
- static const uint16_t XMMArgRegs[] = {
+ static const MCPhysReg XMMArgRegs[] = {
X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
};
@@ -2742,8 +2770,10 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
DAG.getConstant(NumXMMRegs, MVT::i8)));
}
- // For tail calls lower the arguments to the 'real' stack slot.
- if (isTailCall) {
+ // For tail calls lower the arguments to the 'real' stack slots. Sibcalls
+ // don't need this because the eligibility check rejects calls that require
+ // shuffling arguments passed in memory.
+ if (!IsSibcall && isTailCall) {
// Force all the incoming stack arguments to be loaded from the stack
// before any new outgoing arguments are stored to the stack, because the
// outgoing stack slots may alias the incoming argument stack slots, and
@@ -2755,45 +2785,45 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
SmallVector<SDValue, 8> MemOpChains2;
SDValue FIN;
int FI = 0;
- if (getTargetMachine().Options.GuaranteedTailCallOpt) {
- for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
- CCValAssign &VA = ArgLocs[i];
- if (VA.isRegLoc())
- continue;
- assert(VA.isMemLoc());
- SDValue Arg = OutVals[i];
- ISD::ArgFlagsTy Flags = Outs[i].Flags;
- // Create frame index.
- int32_t Offset = VA.getLocMemOffset()+FPDiff;
- uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
- FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true);
- FIN = DAG.getFrameIndex(FI, getPointerTy());
-
- if (Flags.isByVal()) {
- // Copy relative to framepointer.
- SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset());
- if (StackPtr.getNode() == 0)
- StackPtr = DAG.getCopyFromReg(Chain, dl,
- RegInfo->getStackRegister(),
- getPointerTy());
- Source = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, Source);
-
- MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
- ArgChain,
- Flags, DAG, dl));
- } else {
- // Store relative to framepointer.
- MemOpChains2.push_back(
- DAG.getStore(ArgChain, dl, Arg, FIN,
- MachinePointerInfo::getFixedStack(FI),
- false, false, 0));
- }
+ for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+ CCValAssign &VA = ArgLocs[i];
+ if (VA.isRegLoc())
+ continue;
+ assert(VA.isMemLoc());
+ SDValue Arg = OutVals[i];
+ ISD::ArgFlagsTy Flags = Outs[i].Flags;
+ // Skip inalloca arguments. They don't require any work.
+ if (Flags.isInAlloca())
+ continue;
+ // Create frame index.
+ int32_t Offset = VA.getLocMemOffset()+FPDiff;
+ uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
+ FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true);
+ FIN = DAG.getFrameIndex(FI, getPointerTy());
+
+ if (Flags.isByVal()) {
+ // Copy relative to framepointer.
+ SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset());
+ if (!StackPtr.getNode())
+ StackPtr = DAG.getCopyFromReg(Chain, dl,
+ RegInfo->getStackRegister(),
+ getPointerTy());
+ Source = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, Source);
+
+ MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
+ ArgChain,
+ Flags, DAG, dl));
+ } else {
+ // Store relative to framepointer.
+ MemOpChains2.push_back(
+ DAG.getStore(ArgChain, dl, Arg, FIN,
+ MachinePointerInfo::getFixedStack(FI),
+ false, false, 0));
}
}
if (!MemOpChains2.empty())
- Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
- &MemOpChains2[0], MemOpChains2.size());
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2);
// Store the return address to the appropriate stack slot.
Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
@@ -2930,10 +2960,10 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// This isn't right, although it's probably harmless on x86; liveouts
// should be computed from returns not tail calls. Consider a void
// function making a tail call to a function returning int.
- return DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, &Ops[0], Ops.size());
+ return DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops);
}
- Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, &Ops[0], Ops.size());
+ Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops);
InFlag = Chain.getValue(1);
// Create the CALLSEQ_END node.
@@ -3927,6 +3957,29 @@ static bool isMOVLHPSMask(ArrayRef<int> Mask, MVT VT) {
return true;
}
+/// isINSERTPSMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to INSERTPS.
+/// i. e: If all but one element come from the same vector.
+static bool isINSERTPSMask(ArrayRef<int> Mask, MVT VT) {
+ // TODO: Deal with AVX's VINSERTPS
+ if (!VT.is128BitVector() || (VT != MVT::v4f32 && VT != MVT::v4i32))
+ return false;
+
+ unsigned CorrectPosV1 = 0;
+ unsigned CorrectPosV2 = 0;
+ for (int i = 0, e = (int)VT.getVectorNumElements(); i != e; ++i)
+ if (Mask[i] == i)
+ ++CorrectPosV1;
+ else if (Mask[i] == i + 4)
+ ++CorrectPosV2;
+
+ if (CorrectPosV1 == 3 || CorrectPosV2 == 3)
+ // We have 3 elements from one vector, and one from another.
+ return true;
+
+ return false;
+}
+
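
A standalone restatement of the predicate above (plain C++; LLVM shuffle masks use 0-3 for the first source, 4-7 for the second, and this sketch ignores undef lanes):

    #include <array>

    // Mirrors the check above: a 4-lane shuffle is an INSERTPS candidate
    // when exactly three lanes already sit in place in one of the two sources.
    bool isInsertPSMask(const std::array<int, 4> &Mask) {
      unsigned InPlaceV1 = 0, InPlaceV2 = 0;
      for (int i = 0; i != 4; ++i) {
        if (Mask[i] == i)
          ++InPlaceV1;          // lane i taken from V1's lane i
        else if (Mask[i] == i + 4)
          ++InPlaceV2;          // lane i taken from V2's lane i
      }
      return InPlaceV1 == 3 || InPlaceV2 == 3;
    }

    int main() {
      // <0,1,2,7>: lanes 0-2 from V1 in place, lane 3 is V2[3] -> insertable.
      return isInsertPSMask({0, 1, 2, 7}) ? 0 : 1;
    }
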
//
// Some special combinations that can be optimized.
//
@@ -4146,6 +4199,29 @@ static bool isUNPCKH_v_undef_Mask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
return true;
}
+// Match for the INSERTI64x4/INSERTF64x4 instructions (src0[0], src1[0]) or
+// (src1[0], src0[1]), i.e. manipulation of 256-bit sub-vectors.
+static bool isINSERT64x4Mask(ArrayRef<int> Mask, MVT VT, unsigned int *Imm) {
+ if (!VT.is512BitVector())
+ return false;
+
+ unsigned NumElts = VT.getVectorNumElements();
+ unsigned HalfSize = NumElts/2;
+ if (isSequentialOrUndefInRange(Mask, 0, HalfSize, 0)) {
+ if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, NumElts)) {
+ *Imm = 1;
+ return true;
+ }
+ }
+ if (isSequentialOrUndefInRange(Mask, 0, HalfSize, NumElts)) {
+ if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, HalfSize)) {
+ *Imm = 0;
+ return true;
+ }
+ }
+ return false;
+}
+
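
Concretely, for v8i64 (NumElts = 8, HalfSize = 4) the two accepted shapes are <0,1,2,3,8,9,10,11> (Imm = 1) and <8,9,10,11,4,5,6,7> (Imm = 0), with undefs allowed anywhere. A standalone checker under those assumptions (plain C++, -1 denoting an undef lane):

    #include <array>

    // True when the four lanes at Pos count up from Low, undefs permitted.
    bool seqOrUndef(const std::array<int, 8> &M, int Pos, int Low) {
      for (int i = 0; i != 4; ++i)
        if (M[Pos + i] != -1 && M[Pos + i] != Low + i)
          return false;
      return true;
    }

    bool isInsert64x4(const std::array<int, 8> &M, unsigned &Imm) {
      if (seqOrUndef(M, 0, 0) && seqOrUndef(M, 4, 8)) { Imm = 1; return true; }
      if (seqOrUndef(M, 0, 8) && seqOrUndef(M, 4, 4)) { Imm = 0; return true; }
      return false;
    }

    int main() {
      unsigned Imm;
      bool A = isInsert64x4({0, 1, 2, 3, 8, 9, 10, 11}, Imm) && Imm == 1;
      bool B = isInsert64x4({8, 9, -1, 11, 4, 5, 6, 7}, Imm) && Imm == 0;
      return (A && B) ? 0 : 1;
    }
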
/// isMOVLMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to MOVSS,
/// MOVSD, and MOVD, i.e. setting the lowest element.
@@ -4624,11 +4700,17 @@ unsigned X86::getInsertVINSERT256Immediate(SDNode *N) {
return getInsertVINSERTImmediate(N, 256);
}
+/// isZero - Returns true if Elt is a constant integer zero.
+static bool isZero(SDValue V) {
+ ConstantSDNode *C = dyn_cast<ConstantSDNode>(V);
+ return C && C->isNullValue();
+}
+
/// isZeroNode - Returns true if Elt is a constant zero or a floating point
/// constant +0.0.
bool X86::isZeroNode(SDValue Elt) {
- if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Elt))
- return CN->isNullValue();
+ if (isZero(Elt))
+ return true;
if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Elt))
return CFP->getValueAPF().isPosZero();
return false;
@@ -4677,7 +4759,7 @@ static bool ShouldXformToMOVHLPS(ArrayRef<int> Mask, MVT VT) {
/// isScalarLoadToVector - Returns true if the node is a scalar load that
/// is promoted to a vector. It also returns the LoadSDNode by reference if
/// required.
-static bool isScalarLoadToVector(SDNode *N, LoadSDNode **LD = NULL) {
+static bool isScalarLoadToVector(SDNode *N, LoadSDNode **LD = nullptr) {
if (N->getOpcode() != ISD::SCALAR_TO_VECTOR)
return false;
N = N->getOperand(0).getNode();
@@ -4803,28 +4885,24 @@ static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget,
if (Subtarget->hasInt256()) { // AVX2
SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
- Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops,
- array_lengthof(Ops));
+ Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops);
} else {
// 256-bit logic and arithmetic instructions in AVX are all
// floating-point, no support for integer ops. Emit fp zeroed vectors.
SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32);
SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
- Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops,
- array_lengthof(Ops));
+ Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops);
}
} else if (VT.is512BitVector()) { // AVX-512
SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst,
Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
- Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i32, Ops, 16);
+ Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i32, Ops);
} else if (VT.getScalarType() == MVT::i1) {
assert(VT.getVectorNumElements() <= 16 && "Unexpected vector type");
SDValue Cst = DAG.getTargetConstant(0, MVT::i1);
- SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst,
- Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
- return DAG.getNode(ISD::BUILD_VECTOR, dl, VT,
- Ops, VT.getVectorNumElements());
+ SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Cst);
+ return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
} else
llvm_unreachable("Unexpected vector type");
@@ -4844,8 +4922,7 @@ static SDValue getOnesVector(MVT VT, bool HasInt256, SelectionDAG &DAG,
if (VT.is256BitVector()) {
if (HasInt256) { // AVX2
SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
- Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops,
- array_lengthof(Ops));
+ Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops);
} else { // AVX
Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
Vec = Concat128BitVectors(Vec, Vec, MVT::v8i32, 8, DAG, dl);
@@ -5307,7 +5384,7 @@ static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
return SDValue();
SDLoc dl(Op);
- SDValue V(0, 0);
+ SDValue V;
bool First = true;
for (unsigned i = 0; i < 16; ++i) {
bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;
@@ -5320,7 +5397,7 @@ static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
}
if ((i & 1) != 0) {
- SDValue ThisElt(0, 0), LastElt(0, 0);
+ SDValue ThisElt, LastElt;
bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0;
if (LastIsNonZero) {
LastElt = DAG.getNode(ISD::ZERO_EXTEND, dl,
@@ -5355,7 +5432,7 @@ static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
return SDValue();
SDLoc dl(Op);
- SDValue V(0, 0);
+ SDValue V;
bool First = true;
for (unsigned i = 0; i < 8; ++i) {
bool isNonZero = (NonZeros & (1 << i)) != 0;
@@ -5376,6 +5453,79 @@ static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
return V;
}
+/// LowerBuildVectorv4x32 - Custom lower build_vector of v4i32 or v4f32.
+static SDValue LowerBuildVectorv4x32(SDValue Op, unsigned NumElems,
+ unsigned NonZeros, unsigned NumNonZero,
+ unsigned NumZero, SelectionDAG &DAG,
+ const X86Subtarget *Subtarget,
+ const TargetLowering &TLI) {
+ // We know there's at least one non-zero element
+ unsigned FirstNonZeroIdx = 0;
+ SDValue FirstNonZero = Op->getOperand(FirstNonZeroIdx);
+ while (FirstNonZero.getOpcode() == ISD::UNDEF ||
+ X86::isZeroNode(FirstNonZero)) {
+ ++FirstNonZeroIdx;
+ FirstNonZero = Op->getOperand(FirstNonZeroIdx);
+ }
+
+ if (FirstNonZero.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+ !isa<ConstantSDNode>(FirstNonZero.getOperand(1)))
+ return SDValue();
+
+ SDValue V = FirstNonZero.getOperand(0);
+ MVT VVT = V.getSimpleValueType();
+ if (!Subtarget->hasSSE41() || (VVT != MVT::v4f32 && VVT != MVT::v4i32))
+ return SDValue();
+
+ unsigned FirstNonZeroDst =
+ cast<ConstantSDNode>(FirstNonZero.getOperand(1))->getZExtValue();
+ // CorrectIdx counts elements that are already in place; it starts at 1 if
+ // the first non-zero element extracts from its own position.
+ unsigned CorrectIdx = FirstNonZeroDst == FirstNonZeroIdx;
+ unsigned IncorrectIdx = CorrectIdx ? -1U : FirstNonZeroIdx;
+ unsigned IncorrectDst = CorrectIdx ? -1U : FirstNonZeroDst;
+
+ for (unsigned Idx = FirstNonZeroIdx + 1; Idx < NumElems; ++Idx) {
+ SDValue Elem = Op.getOperand(Idx);
+ if (Elem.getOpcode() == ISD::UNDEF || X86::isZeroNode(Elem))
+ continue;
+
+ // TODO: What else can be here? Deal with it.
+ if (Elem.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+ return SDValue();
+
+ // TODO: Some optimizations are still possible here
+ // ex: Getting one element from a vector, and the rest from another.
+ if (Elem.getOperand(0) != V)
+ return SDValue();
+
+ unsigned Dst = cast<ConstantSDNode>(Elem.getOperand(1))->getZExtValue();
+ if (Dst == Idx)
+ ++CorrectIdx;
+ else if (IncorrectIdx == -1U) {
+ IncorrectIdx = Idx;
+ IncorrectDst = Dst;
+ } else
+ // There was already one element with an incorrect index.
+ // We can't optimize this case to an insertps.
+ return SDValue();
+ }
+
+ if (NumNonZero == CorrectIdx || NumNonZero == CorrectIdx + 1) {
+ SDLoc dl(Op);
+ EVT VT = Op.getSimpleValueType();
+ unsigned ElementMoveMask = 0;
+ if (IncorrectIdx == -1U)
+ ElementMoveMask = FirstNonZeroIdx << 6 | FirstNonZeroIdx << 4;
+ else
+ ElementMoveMask = IncorrectDst << 6 | IncorrectIdx << 4;
+
+ SDValue InsertpsMask =
+ DAG.getIntPtrConstant(ElementMoveMask | (~NonZeros & 0xf));
+ return DAG.getNode(X86ISD::INSERTPS, dl, VT, V, V, InsertpsMask);
+ }
+
+ return SDValue();
+}
+
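
The immediate built above packs the three INSERTPS fields: bits 7:6 select the source lane, bits 5:4 the destination lane, and bits 3:0 zero out destination lanes, which is why ~NonZeros & 0xf is ORed in. A scalar model of the instruction (standalone C++, not the LLVM API):

    #include <array>
    #include <cstdio>

    // dst[(imm >> 4) & 3] = src[(imm >> 6) & 3], then every lane whose bit is
    // set in imm[3:0] is forced to zero -- the INSERTPS semantics.
    std::array<float, 4> insertps(std::array<float, 4> dst,
                                  const std::array<float, 4> &src, unsigned imm) {
      dst[(imm >> 4) & 3] = src[(imm >> 6) & 3];
      for (int i = 0; i != 4; ++i)
        if ((imm >> i) & 1)
          dst[i] = 0.0f;
      return dst;
    }

    int main() {
      std::array<float, 4> v{1, 2, 3, 4};
      // Move lane 2 of v into lane 1 and zero lanes 0 and 3:
      std::array<float, 4> r = insertps(v, v, 2 << 6 | 1 << 4 | 0x9);
      std::printf("%g %g %g %g\n", r[0], r[1], r[2], r[3]); // 0 3 3 0
      return 0;
    }
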
/// getVShift - Return a vector logical shift node.
///
static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp,
@@ -5480,7 +5630,7 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts,
EVT EltVT = VT.getVectorElementType();
unsigned NumElems = Elts.size();
- LoadSDNode *LDBase = NULL;
+ LoadSDNode *LDBase = nullptr;
unsigned LastLoadedElt = -1U;
// For each element in the initializer, see if we've found a load or an undef.
@@ -5545,8 +5695,7 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts,
SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other);
SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
SDValue ResNode =
- DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops,
- array_lengthof(Ops), MVT::i64,
+ DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, MVT::i64,
LDBase->getPointerInfo(),
LDBase->getAlignment(),
false/*isVolatile*/, true/*ReadMem*/,
@@ -5661,7 +5810,7 @@ static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget* Subtarget,
unsigned ScalarSize = CVT.getSizeInBits();
if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)) {
- const Constant *C = 0;
+ const Constant *C = nullptr;
if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
C = CI->getConstantIntValue();
else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
@@ -5706,6 +5855,41 @@ static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget* Subtarget,
return SDValue();
}
+/// \brief For an EXTRACT_VECTOR_ELT with a constant index return the real
+/// underlying vector and index.
+///
+/// Modifies \p ExtractedFromVec to the real vector and returns the real
+/// index.
+static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
+ SDValue ExtIdx) {
+ int Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue();
+ if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))
+ return Idx;
+
+ // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
+ // lowered this:
+ // (extract_vector_elt (v8f32 %vreg1), Constant<6>)
+ // to:
+ // (extract_vector_elt (vector_shuffle<2,u,u,u>
+ // (extract_subvector (v8f32 %vreg0), Constant<4>),
+ // undef)
+ // Constant<0>)
+ // In this case the vector is the extract_subvector expression and the index
+ // is 2, as specified by the shuffle.
+ ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec);
+ SDValue ShuffleVec = SVOp->getOperand(0);
+ MVT ShuffleVecVT = ShuffleVec.getSimpleValueType();
+ assert(ShuffleVecVT.getVectorElementType() ==
+ ExtractedFromVec.getSimpleValueType().getVectorElementType());
+
+ int ShuffleIdx = SVOp->getMaskElt(Idx);
+ if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) {
+ ExtractedFromVec = ShuffleVec;
+ return ShuffleIdx;
+ }
+ return Idx;
+}
+
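
A concrete instance of the comment above: extracting element 0 of vector_shuffle<2,u,u,u>(W, undef) is the same as extracting element 2 of W directly. A minimal standalone helper expressing that rewrite on a plain mask array:

    #include <array>

    // Given an extract index into shuffle(W, undef) with the given mask, return
    // the equivalent index into W, or -1 when the lane is undef or out of range.
    int resolveThroughShuffle(const std::array<int, 4> &ShuffleMask, int Idx) {
      int M = ShuffleMask[Idx];
      return (M >= 0 && M < 4) ? M : -1;
    }

    int main() {
      std::array<int, 4> Mask{2, -1, -1, -1}; // vector_shuffle<2,u,u,u>
      return resolveThroughShuffle(Mask, 0) == 2 ? 0 : 1;
    }
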
static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
@@ -5739,34 +5923,32 @@ static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
SDValue ExtIdx = Op.getOperand(i).getOperand(1);
+ // Quit if non-constant index.
+ if (!isa<ConstantSDNode>(ExtIdx))
+ return SDValue();
+ int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx);
// Quit if extracted from vector of different type.
if (ExtractedFromVec.getValueType() != VT)
return SDValue();
- // Quit if non-constant index.
- if (!isa<ConstantSDNode>(ExtIdx))
- return SDValue();
-
- if (VecIn1.getNode() == 0)
+ if (!VecIn1.getNode())
VecIn1 = ExtractedFromVec;
else if (VecIn1 != ExtractedFromVec) {
- if (VecIn2.getNode() == 0)
+ if (!VecIn2.getNode())
VecIn2 = ExtractedFromVec;
else if (VecIn2 != ExtractedFromVec)
// Quit if more than 2 vectors to shuffle
return SDValue();
}
- unsigned Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue();
-
if (ExtractedFromVec == VecIn1)
Mask[i] = Idx;
else if (ExtractedFromVec == VecIn2)
Mask[i] = Idx + NumElems;
}
- if (VecIn1.getNode() == 0)
+ if (!VecIn1.getNode())
return SDValue();
VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
@@ -5791,24 +5973,22 @@ X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const {
SDLoc dl(Op);
if (ISD::isBuildVectorAllZeros(Op.getNode())) {
SDValue Cst = DAG.getTargetConstant(0, MVT::i1);
- SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst,
- Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
- return DAG.getNode(ISD::BUILD_VECTOR, dl, VT,
- Ops, VT.getVectorNumElements());
+ SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Cst);
+ return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
}
if (ISD::isBuildVectorAllOnes(Op.getNode())) {
SDValue Cst = DAG.getTargetConstant(1, MVT::i1);
- SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst,
- Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
- return DAG.getNode(ISD::BUILD_VECTOR, dl, VT,
- Ops, VT.getVectorNumElements());
+ SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Cst);
+ return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
}
bool AllContants = true;
uint64_t Immediate = 0;
int NonConstIdx = -1;
bool IsSplat = true;
+ unsigned NumNonConsts = 0;
+ unsigned NumConsts = 0;
for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
SDValue In = Op.getOperand(idx);
if (In.getOpcode() == ISD::UNDEF)
@@ -5816,9 +5996,13 @@ X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const {
if (!isa<ConstantSDNode>(In)) {
AllContants = false;
NonConstIdx = idx;
+ NumNonConsts++;
}
- else if (cast<ConstantSDNode>(In)->getZExtValue())
+ else {
+ NumConsts++;
+ if (cast<ConstantSDNode>(In)->getZExtValue())
Immediate |= (1ULL << idx);
+ }
if (In != Op.getOperand(0))
IsSplat = false;
}
@@ -5830,6 +6014,19 @@ X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const {
DAG.getIntPtrConstant(0));
}
+ if (NumNonConsts == 1 && NonConstIdx != 0) {
+ SDValue DstVec;
+ if (NumConsts) {
+ SDValue VecAsImm = DAG.getConstant(Immediate,
+ MVT::getIntegerVT(VT.getSizeInBits()));
+ DstVec = DAG.getNode(ISD::BITCAST, dl, VT, VecAsImm);
+ } else
+ DstVec = DAG.getUNDEF(VT);
+ return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
+ Op.getOperand(NonConstIdx),
+ DAG.getIntPtrConstant(NonConstIdx));
+ }
if (!IsSplat && (NonConstIdx != 0))
llvm_unreachable("Unsupported BUILD_VECTOR operation");
MVT SelectVT = (VT == MVT::v16i1)? MVT::i16 : MVT::i8;
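
The Immediate accumulator above simply packs the constant lanes of the v16i1/v8i1 into the bits of an i16/i8, which is then bitcast back to the mask type before the single variable lane is inserted. A standalone sketch of the packing step (plain C++; -1 marks the hypothetical non-constant lane):

    #include <array>
    #include <cstdint>

    // Pack constant boolean lanes into an integer immediate, one bit per lane,
    // skipping the single non-constant lane.
    uint16_t packMaskImmediate(const std::array<int, 16> &Lanes) {
      uint16_t Imm = 0;
      for (unsigned i = 0; i != 16; ++i)
        if (Lanes[i] == 1)
          Imm |= uint16_t(1u << i);
      return Imm;
    }

    int main() {
      std::array<int, 16> L{};
      L[0] = 1; L[3] = 1; L[5] = -1; // lane 5 carries the runtime value
      return packMaskImmediate(L) == 0x0009 ? 0 : 1;
    }
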
@@ -6043,9 +6240,10 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElems/2);
// Build both the lower and upper subvector.
- SDValue Lower = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT, &V[0], NumElems/2);
- SDValue Upper = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT, &V[NumElems / 2],
- NumElems/2);
+ SDValue Lower = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT,
+ makeArrayRef(&V[0], NumElems/2));
+ SDValue Upper = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT,
+ makeArrayRef(&V[NumElems / 2], NumElems/2));
// Recreate the wider vector with the lower and upper part.
if (VT.is256BitVector())
@@ -6078,6 +6276,14 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
if (V.getNode()) return V;
}
+ // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS
+ if (EVTBits == 32 && NumElems == 4) {
+ SDValue V = LowerBuildVectorv4x32(Op, NumElems, NonZeros, NumNonZero,
+ NumZero, DAG, Subtarget, *this);
+ if (V.getNode())
+ return V;
+ }
+
// If element VT is == 32 bits, turn it into a number of shuffles.
SmallVector<SDValue, 8> V(NumElems);
if (NumElems == 4 && NumZero > 0) {
@@ -6332,8 +6538,7 @@ static SDValue getPSHUFB(ArrayRef<int> MaskVals, SDValue V1, SDLoc &dl,
if (ShufVT != VT)
V1 = DAG.getNode(ISD::BITCAST, dl, ShufVT, V1);
return DAG.getNode(X86ISD::PSHUFB, dl, ShufVT, V1,
- DAG.getNode(ISD::BUILD_VECTOR, dl, ShufVT,
- PshufbMask.data(), PshufbMask.size()));
+ DAG.getNode(ISD::BUILD_VECTOR, dl, ShufVT, PshufbMask));
}
// v8i16 shuffles - Prefer shuffles in the following order:
@@ -6516,7 +6721,7 @@ LowerVECTOR_SHUFFLEv8i16(SDValue Op, const X86Subtarget *Subtarget,
NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16),
&MaskV[0]);
- if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSSE3()) {
+ if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSE2()) {
ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode());
NewV = getTargetShuffleNode(X86ISD::PSHUFLW, dl, MVT::v8i16,
NewV.getOperand(0),
@@ -6540,7 +6745,7 @@ LowerVECTOR_SHUFFLEv8i16(SDValue Op, const X86Subtarget *Subtarget,
NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16),
&MaskV[0]);
- if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSSE3()) {
+ if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSE2()) {
ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode());
NewV = getTargetShuffleNode(X86ISD::PSHUFHW, dl, MVT::v8i16,
NewV.getOperand(0),
@@ -6635,7 +6840,7 @@ static SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp,
}
V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1,
DAG.getNode(ISD::BUILD_VECTOR, dl,
- MVT::v16i8, &pshufbMask[0], 16));
+ MVT::v16i8, pshufbMask));
// As PSHUFB will zero elements with negative indices, it's safe to ignore
// the 2nd operand if it's undefined or zero.
@@ -6653,7 +6858,7 @@ static SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp,
}
V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2,
DAG.getNode(ISD::BUILD_VECTOR, dl,
- MVT::v16i8, &pshufbMask[0], 16));
+ MVT::v16i8, pshufbMask));
return DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2);
}
@@ -6771,6 +6976,9 @@ SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp,
unsigned Scale;
switch (VT.SimpleTy) {
default: llvm_unreachable("Unexpected!");
+ case MVT::v2i64:
+ case MVT::v2f64:
+ return SDValue(SVOp, 0);
case MVT::v4f32: NewVT = MVT::v2f64; Scale = 2; break;
case MVT::v4i32: NewVT = MVT::v2i64; Scale = 2; break;
case MVT::v8i16: NewVT = MVT::v4i32; Scale = 2; break;
@@ -6805,7 +7013,7 @@ static SDValue getVZextMovL(MVT VT, MVT OpVT,
SDValue SrcOp, SelectionDAG &DAG,
const X86Subtarget *Subtarget, SDLoc dl) {
if (VT == MVT::v2f64 || VT == MVT::v4f32) {
- LoadSDNode *LD = NULL;
+ LoadSDNode *LD = nullptr;
if (!isScalarLoadToVector(SrcOp.getNode(), &LD))
LD = dyn_cast<LoadSDNode>(SrcOp);
if (!LD) {
@@ -6924,8 +7132,7 @@ LowerVECTOR_SHUFFLE_256(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
}
// Construct the output using a BUILD_VECTOR.
- Output[l] = DAG.getNode(ISD::BUILD_VECTOR, dl, NVT, &SVOps[0],
- SVOps.size());
+ Output[l] = DAG.getNode(ISD::BUILD_VECTOR, dl, NVT, SVOps);
} else if (InputUsed[0] < 0) {
// No input vectors were used! The result is undefined.
Output[l] = DAG.getUNDEF(NVT);
@@ -7207,6 +7414,93 @@ SDValue getMOVLP(SDValue &Op, SDLoc &dl, SelectionDAG &DAG, bool HasSSE2) {
getShuffleSHUFImmediate(SVOp), DAG);
}
+static SDValue NarrowVectorLoadToElement(LoadSDNode *Load, unsigned Index,
+ SelectionDAG &DAG) {
+ SDLoc dl(Load);
+ MVT VT = Load->getSimpleValueType(0);
+ MVT EVT = VT.getVectorElementType();
+ SDValue Addr = Load->getOperand(1);
+ SDValue NewAddr = DAG.getNode(
+ ISD::ADD, dl, Addr.getSimpleValueType(), Addr,
+ DAG.getConstant(Index * EVT.getStoreSize(), Addr.getSimpleValueType()));
+
+ SDValue NewLoad =
+ DAG.getLoad(EVT, dl, Load->getChain(), NewAddr,
+ DAG.getMachineFunction().getMachineMemOperand(
+ Load->getMemOperand(), 0, EVT.getStoreSize()));
+ return NewLoad;
+}
+
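
The new address above is plain base-plus-offset arithmetic, Index * EVT.getStoreSize() past the original pointer, so only the needed lane is loaded. A trivial standalone analogue (plain C++):

    #include <cassert>
    #include <cstring>

    // Read element `Index` of a v4f32 stored at `Base` without touching the
    // other lanes, mirroring NewAddr = Addr + Index * EVT.getStoreSize().
    float loadLane(const unsigned char *Base, unsigned Index) {
      float F;
      std::memcpy(&F, Base + Index * sizeof(float), sizeof(float));
      return F;
    }

    int main() {
      float V[4] = {1.0f, 2.0f, 3.0f, 4.0f};
      assert(loadLane(reinterpret_cast<unsigned char *>(V), 2) == 3.0f);
      return 0;
    }
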
+// It is only safe to call this function if isINSERTPSMask is true for
+// this shufflevector mask.
+static SDValue getINSERTPS(ShuffleVectorSDNode *SVOp, SDLoc &dl,
+ SelectionDAG &DAG) {
+ // Generate an insertps instruction when inserting an f32 from memory into a
+ // v4f32 or when copying a member from one v4f32 to another.
+ // We also use it for transferring an i32 from one register to another,
+ // since it simply copies the same bits.
+ // If we're transferring an i32 from memory to a specific element in a
+ // register, we output a generic DAG that will match the PINSRD
+ // instruction.
+ MVT VT = SVOp->getSimpleValueType(0);
+ MVT EVT = VT.getVectorElementType();
+ SDValue V1 = SVOp->getOperand(0);
+ SDValue V2 = SVOp->getOperand(1);
+ auto Mask = SVOp->getMask();
+ assert((VT == MVT::v4f32 || VT == MVT::v4i32) &&
+ "unsupported vector type for insertps/pinsrd");
+
+ int FromV1 = std::count_if(Mask.begin(), Mask.end(),
+ [](const int &i) { return i < 4; });
+
+ SDValue From;
+ SDValue To;
+ unsigned DestIndex;
+ if (FromV1 == 1) {
+ From = V1;
+ To = V2;
+ DestIndex = std::find_if(Mask.begin(), Mask.end(),
+ [](const int &i) { return i < 4; }) -
+ Mask.begin();
+ } else {
+ From = V2;
+ To = V1;
+ DestIndex = std::find_if(Mask.begin(), Mask.end(),
+ [](const int &i) { return i >= 4; }) -
+ Mask.begin();
+ }
+
+ if (MayFoldLoad(From)) {
+ // Trivial case, when From comes from a load and is only used by the
+ // shuffle. Make it use insertps from the vector that we need from that
+ // load.
+ SDValue NewLoad =
+ NarrowVectorLoadToElement(cast<LoadSDNode>(From), DestIndex, DAG);
+ if (!NewLoad.getNode())
+ return SDValue();
+
+ if (EVT == MVT::f32) {
+ // Create this as a scalar to vector to match the instruction pattern.
+ SDValue LoadScalarToVector =
+ DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, NewLoad);
+ SDValue InsertpsMask = DAG.getIntPtrConstant(DestIndex << 4);
+ return DAG.getNode(X86ISD::INSERTPS, dl, VT, To, LoadScalarToVector,
+ InsertpsMask);
+ } else { // EVT == MVT::i32
+ // If we're getting an i32 from memory, use an INSERT_VECTOR_ELT
+ // instruction, to match the PINSRD instruction, which loads an i32 to a
+ // certain vector element.
+ return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, To, NewLoad,
+ DAG.getConstant(DestIndex, MVT::i32));
+ }
+ }
+
+ // Vector-element-to-vector
+ unsigned SrcIndex = Mask[DestIndex] % 4;
+ SDValue InsertpsMask = DAG.getIntPtrConstant(DestIndex << 4 | SrcIndex << 6);
+ return DAG.getNode(X86ISD::INSERTPS, dl, VT, To, From, InsertpsMask);
+}
+
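
To make the operand selection above concrete: with a fully defined mask like <0,1,2,7>, three elements index below 4, so the single inserted lane comes from V2, DestIndex is 3, SrcIndex is Mask[3] % 4 = 3, and the register-to-register immediate is DestIndex << 4 | SrcIndex << 6. A standalone sketch of that analysis (plain C++, assuming no undef lanes; a scalar model of the instruction itself is sketched earlier):

    #include <algorithm>
    #include <array>
    #include <cstdio>

    // Decide which operand supplies the single inserted lane and build the
    // INSERTPS immediate, following the same count_if/find_if structure.
    unsigned insertpsImm(const std::array<int, 4> &Mask, bool &FromIsV1) {
      long FromV1 = std::count_if(Mask.begin(), Mask.end(),
                                  [](int i) { return i < 4; });
      unsigned DestIndex;
      if (FromV1 == 1) {
        FromIsV1 = true; // the single lane comes from V1, the rest from V2
        DestIndex = std::find_if(Mask.begin(), Mask.end(),
                                 [](int i) { return i < 4; }) - Mask.begin();
      } else {
        FromIsV1 = false; // the single lane comes from V2
        DestIndex = std::find_if(Mask.begin(), Mask.end(),
                                 [](int i) { return i >= 4; }) - Mask.begin();
      }
      unsigned SrcIndex = Mask[DestIndex] % 4;
      return DestIndex << 4 | SrcIndex << 6;
    }

    int main() {
      bool FromIsV1;
      unsigned Imm = insertpsImm({0, 1, 2, 7}, FromIsV1);
      std::printf("imm=0x%X fromV1=%d\n", Imm, int(FromIsV1)); // imm=0xF0 fromV1=0
      return 0;
    }
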
// Reduce a vector shuffle to zext.
static SDValue LowerVectorIntExtend(SDValue Op, const X86Subtarget *Subtarget,
SelectionDAG &DAG) {
@@ -7295,9 +7589,8 @@ static SDValue LowerVectorIntExtend(SDValue Op, const X86Subtarget *Subtarget,
DAG.getNode(X86ISD::VZEXT, DL, NVT, V1));
}
-static SDValue
-NormalizeVectorShuffle(SDValue Op, const X86Subtarget *Subtarget,
- SelectionDAG &DAG) {
+static SDValue NormalizeVectorShuffle(SDValue Op, const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
MVT VT = Op.getSimpleValueType();
SDLoc dl(Op);
@@ -7322,31 +7615,29 @@ NormalizeVectorShuffle(SDValue Op, const X86Subtarget *Subtarget,
// If the shuffle can be profitably rewritten as a narrower shuffle, then
// do it!
- if (VT == MVT::v8i16 || VT == MVT::v16i8 ||
- VT == MVT::v16i16 || VT == MVT::v32i8) {
+ if (VT == MVT::v8i16 || VT == MVT::v16i8 || VT == MVT::v16i16 ||
+ VT == MVT::v32i8) {
SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG);
if (NewOp.getNode())
return DAG.getNode(ISD::BITCAST, dl, VT, NewOp);
- } else if ((VT == MVT::v4i32 ||
- (VT == MVT::v4f32 && Subtarget->hasSSE2()))) {
+ } else if (VT.is128BitVector() && Subtarget->hasSSE2()) {
// FIXME: Figure out a cleaner way to do this.
- // Try to make use of movq to zero out the top part.
if (ISD::isBuildVectorAllZeros(V2.getNode())) {
SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG);
if (NewOp.getNode()) {
MVT NewVT = NewOp.getSimpleValueType();
if (isCommutedMOVLMask(cast<ShuffleVectorSDNode>(NewOp)->getMask(),
NewVT, true, false))
- return getVZextMovL(VT, NewVT, NewOp.getOperand(0),
- DAG, Subtarget, dl);
+ return getVZextMovL(VT, NewVT, NewOp.getOperand(0), DAG, Subtarget,
+ dl);
}
} else if (ISD::isBuildVectorAllZeros(V1.getNode())) {
SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG);
if (NewOp.getNode()) {
MVT NewVT = NewOp.getSimpleValueType();
if (isMOVLMask(cast<ShuffleVectorSDNode>(NewOp)->getMask(), NewVT))
- return getVZextMovL(VT, NewVT, NewOp.getOperand(1),
- DAG, Subtarget, dl);
+ return getVZextMovL(VT, NewVT, NewOp.getOperand(1), DAG, Subtarget,
+ dl);
}
}
}
@@ -7609,6 +7900,11 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
getShuffleSHUFImmediate(SVOp), DAG);
}
+ unsigned Idx;
+ if (VT.is512BitVector() && isINSERT64x4Mask(M, VT, &Idx))
+ return Insert256BitVector(V1, Extract256BitVector(V2, 0, DAG, dl),
+ Idx*(NumElems/2), DAG, dl);
+
// Handle VPERM2F128/VPERM2I128 permutations
if (isVPERM2X128Mask(M, VT, HasFp256))
return getTargetShuffleNode(X86ISD::VPERM2X128, dl, VT, V1,
@@ -7618,6 +7914,9 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
if (BlendOp.getNode())
return BlendOp;
+ if (Subtarget->hasSSE41() && isINSERTPSMask(M, VT))
+ return getINSERTPS(SVOp, dl, DAG);
+
unsigned Imm8;
if (V2IsUndef && HasInt256 && isPermImmMask(M, VT, Imm8))
return getTargetShuffleNode(X86ISD::VPERMI, dl, VT, V1, Imm8, DAG);
@@ -7631,8 +7930,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
permclMask.push_back(DAG.getConstant((M[i]>=0) ? M[i] : 0, MaskEltVT));
}
- SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, MaskVectorVT,
- &permclMask[0], NumElems);
+ SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, MaskVectorVT, permclMask);
if (V2IsUndef)
// Bitcast is for VPERMPS since mask is v8i32 but node takes v8f32
return DAG.getNode(X86ISD::VPERMV, dl, VT,
@@ -7684,6 +7982,109 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
return SDValue();
}
+// This function assumes its argument is a BUILD_VECTOR of constants or
+// undef SDNodes. i.e: ISD::isBuildVectorOfConstantSDNodes(BuildVector) is
+// true.
+static bool BUILD_VECTORtoBlendMask(BuildVectorSDNode *BuildVector,
+ unsigned &MaskValue) {
+ MaskValue = 0;
+ unsigned NumElems = BuildVector->getNumOperands();
+ // There are 2 lanes if (NumElems > 8), and 1 lane otherwise.
+ unsigned NumLanes = (NumElems - 1) / 8 + 1;
+ unsigned NumElemsInLane = NumElems / NumLanes;
+
+ // Blend for v16i16 should be symmetric for both lanes.
+ for (unsigned i = 0; i < NumElemsInLane; ++i) {
+ SDValue EltCond = BuildVector->getOperand(i);
+ SDValue SndLaneEltCond =
+ (NumLanes == 2) ? BuildVector->getOperand(i + NumElemsInLane) : EltCond;
+
+ int Lane1Cond = -1, Lane2Cond = -1;
+ if (isa<ConstantSDNode>(EltCond))
+ Lane1Cond = !isZero(EltCond);
+ if (isa<ConstantSDNode>(SndLaneEltCond))
+ Lane2Cond = !isZero(SndLaneEltCond);
+
+ if (Lane1Cond == Lane2Cond || Lane2Cond < 0)
+ // Lane1Cond != 0 means we want the first argument.
+ // Lane1Cond == 0 means we want the second argument.
+ // The encoding of this argument is 0 for the first argument, 1
+ // for the second. Therefore, invert the condition.
+ MaskValue |= !Lane1Cond << i;
+ else if (Lane1Cond < 0)
+ MaskValue |= !Lane2Cond << i;
+ else
+ return false;
+ }
+ return true;
+}
+
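
A standalone restatement of the mask construction above for the fully constant case (plain C++17): one bit per lane, inverted because a clear immediate bit selects the first operand, and for sixteen lanes the two 8-lane halves must agree since the immediate only covers eight bits:

    #include <array>
    #include <optional>

    // Build an 8-bit blend immediate from 16 constant lane conditions. A true
    // condition selects the first operand, which the immediate encodes as 0,
    // hence the inversion. Returns nullopt when the two halves disagree.
    std::optional<unsigned> blendMask16(const std::array<bool, 16> &Cond) {
      unsigned Mask = 0;
      for (unsigned i = 0; i != 8; ++i) {
        if (Cond[i] != Cond[i + 8])
          return std::nullopt; // not symmetric across the two 128-bit lanes
        Mask |= unsigned(!Cond[i]) << i;
      }
      return Mask;
    }

    int main() {
      std::array<bool, 16> C{};
      for (unsigned i = 0; i != 16; ++i)
        C[i] = (i % 8) < 4; // lanes 0-3 of each half take the first operand
      return blendMask16(C) == 0xF0u ? 0 : 1;
    }
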
+// Try to lower a vselect node into a simple blend instruction.
+static SDValue LowerVSELECTtoBlend(SDValue Op, const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ SDValue Cond = Op.getOperand(0);
+ SDValue LHS = Op.getOperand(1);
+ SDValue RHS = Op.getOperand(2);
+ SDLoc dl(Op);
+ MVT VT = Op.getSimpleValueType();
+ MVT EltVT = VT.getVectorElementType();
+ unsigned NumElems = VT.getVectorNumElements();
+
+ // There is no blend with immediate in AVX-512.
+ if (VT.is512BitVector())
+ return SDValue();
+
+ if (!Subtarget->hasSSE41() || EltVT == MVT::i8)
+ return SDValue();
+ if (!Subtarget->hasInt256() && VT == MVT::v16i16)
+ return SDValue();
+
+ if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
+ return SDValue();
+
+ // Check the mask for BLEND and build the value.
+ unsigned MaskValue = 0;
+ if (!BUILD_VECTORtoBlendMask(cast<BuildVectorSDNode>(Cond), MaskValue))
+ return SDValue();
+
+ // Convert i32 vectors to floating point if it is not AVX2.
+ // AVX2 introduced VPBLENDD instruction for 128 and 256-bit vectors.
+ MVT BlendVT = VT;
+ if (EltVT == MVT::i64 || (EltVT == MVT::i32 && !Subtarget->hasInt256())) {
+ BlendVT = MVT::getVectorVT(MVT::getFloatingPointVT(EltVT.getSizeInBits()),
+ NumElems);
+ LHS = DAG.getNode(ISD::BITCAST, dl, BlendVT, LHS);
+ RHS = DAG.getNode(ISD::BITCAST, dl, BlendVT, RHS);
+ }
+
+ SDValue Ret = DAG.getNode(X86ISD::BLENDI, dl, BlendVT, LHS, RHS,
+ DAG.getConstant(MaskValue, MVT::i32));
+ return DAG.getNode(ISD::BITCAST, dl, VT, Ret);
+}
+
+SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
+ SDValue BlendOp = LowerVSELECTtoBlend(Op, Subtarget, DAG);
+ if (BlendOp.getNode())
+ return BlendOp;
+
+ // Some types for vselect were previously set to Expand, not Legal or
+ // Custom. Return an empty SDValue so we fall through to Expand, after
+ // the Custom lowering phase.
+ MVT VT = Op.getSimpleValueType();
+ switch (VT.SimpleTy) {
+ default:
+ break;
+ case MVT::v8i16:
+ case MVT::v16i16:
+ return SDValue();
+ }
+
+ // We couldn't create a "Blend with immediate" node.
+ // This node should still be legal, but we'll have to emit a blendv*
+ // instruction.
+ return Op;
+}
+
static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
SDLoc dl(Op);
@@ -7946,10 +8347,47 @@ static SDValue LowerINSERT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
return SDValue();
}
+/// Insert one bit to mask vector, like v16i1 or v8i1.
+/// AVX-512 feature.
+SDValue
+X86TargetLowering::InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG) const {
+ SDLoc dl(Op);
+ SDValue Vec = Op.getOperand(0);
+ SDValue Elt = Op.getOperand(1);
+ SDValue Idx = Op.getOperand(2);
+ MVT VecVT = Vec.getSimpleValueType();
+
+ if (!isa<ConstantSDNode>(Idx)) {
+ // Non-constant index. Extend source and destination,
+ // insert element and then truncate the result.
+ MVT ExtVecVT = (VecVT == MVT::v8i1 ? MVT::v8i64 : MVT::v16i32);
+ MVT ExtEltVT = (VecVT == MVT::v8i1 ? MVT::i64 : MVT::i32);
+ SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,
+ DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVecVT, Vec),
+ DAG.getNode(ISD::ZERO_EXTEND, dl, ExtEltVT, Elt), Idx);
+ return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp);
+ }
+
+ unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
+ SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Elt);
+ if (Vec.getOpcode() == ISD::UNDEF)
+ return DAG.getNode(X86ISD::VSHLI, dl, VecVT, EltInVec,
+ DAG.getConstant(IdxVal, MVT::i8));
+ const TargetRegisterClass* rc = getRegClassFor(VecVT);
+ unsigned MaxShift = rc->getSize()*8 - 1;
+ EltInVec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, EltInVec,
+ DAG.getConstant(MaxShift, MVT::i8));
+ EltInVec = DAG.getNode(X86ISD::VSRLI, dl, VecVT, EltInVec,
+ DAG.getConstant(MaxShift - IdxVal, MVT::i8));
+ return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec);
+}
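
The shift pair above isolates bit 0 of the new element and lands it at position IdxVal: shifting left by W-1 clears everything above it, and shifting right by W-1-IdxVal drops it onto the target bit; the final OR merges it in (note that OR alone cannot clear a bit already set in Vec). A scalar sketch for a 16-bit mask register (plain C++):

    #include <cassert>
    #include <cstdint>

    // Place bit 0 of Elt at position Idx of a 16-lane mask, the way the
    // VSHLI/VSRLI pair above does (W = 16, MaxShift = 15).
    uint16_t insertMaskBit(uint16_t Vec, uint16_t Elt, unsigned Idx) {
      uint16_t EltInVec = uint16_t(Elt << 15);     // isolate bit 0 at the top
      EltInVec = uint16_t(EltInVec >> (15 - Idx)); // move it down to bit Idx
      return Vec | EltInVec;                       // merge; cannot clear a bit
    }

    int main() {
      assert(insertMaskBit(0x0001, 1, 3) == 0x0009);
      assert(insertMaskBit(0x0001, 0, 3) == 0x0001);
      return 0;
    }
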
SDValue
X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const {
MVT VT = Op.getSimpleValueType();
MVT EltVT = VT.getVectorElementType();
+
+ if (EltVT == MVT::i1)
+ return InsertBitToMaskVector(Op, DAG);
SDLoc dl(Op);
SDValue N0 = Op.getOperand(0);
@@ -8294,10 +8732,10 @@ GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
if (InFlag) {
SDValue Ops[] = { Chain, TGA, *InFlag };
- Chain = DAG.getNode(CallType, dl, NodeTys, Ops, array_lengthof(Ops));
+ Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
} else {
SDValue Ops[] = { Chain, TGA };
- Chain = DAG.getNode(CallType, dl, NodeTys, Ops, array_lengthof(Ops));
+ Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
}
// TLSADDR will be codegen'ed as call. Inform MFI that function has calls.
@@ -8325,7 +8763,7 @@ LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
static SDValue
LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
const EVT PtrVT) {
- return GetTLSADDR(DAG, DAG.getEntryNode(), GA, NULL, PtrVT,
+ return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT,
X86::RAX, X86II::MO_TLSGD);
}
@@ -8342,7 +8780,7 @@ static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
SDValue Base;
if (is64Bit) {
- Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, NULL, PtrVT, X86::RAX,
+ Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, X86::RAX,
X86II::MO_TLSLD, /*LocalDynamic=*/true);
} else {
SDValue InFlag;
@@ -8481,7 +8919,7 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
SDValue Chain = DAG.getEntryNode();
SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
SDValue Args[] = { Chain, Offset };
- Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args, 2);
+ Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
// TLSCALL will be codegen'ed as call. Inform MFI that function has calls.
MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
@@ -8507,10 +8945,6 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
// Windows 64bit: gs:0x58
// Windows 32bit: fs:__tls_array
- // If GV is an alias then use the aliasee for determining
- // thread-localness.
- if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV))
- GV = GA->getAliasedGlobal();
SDLoc dl(GA);
SDValue Chain = DAG.getEntryNode();
@@ -8609,15 +9043,15 @@ static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
SDValue Ops1[4] = { Tmp3, Tmp1, CC, Cond };
if (Op.getOpcode() == ISD::SHL_PARTS) {
- Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4);
- Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4);
+ Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0);
+ Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1);
} else {
- Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4);
- Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4);
+ Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0);
+ Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1);
}
SDValue Ops[2] = { Lo, Hi };
- return DAG.getMergeValues(Ops, array_lengthof(Ops), dl);
+ return DAG.getMergeValues(Ops, dl);
}
SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
@@ -8680,8 +9114,7 @@ SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(SrcVT) };
SDValue Result = DAG.getMemIntrinsicNode(useSSE ? X86ISD::FILD_FLAG :
X86ISD::FILD, DL,
- Tys, Ops, array_lengthof(Ops),
- SrcVT, MMO);
+ Tys, Ops, SrcVT, MMO);
if (useSSE) {
Chain = Result.getValue(1);
@@ -8704,8 +9137,7 @@ SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
MachineMemOperand::MOStore, SSFISize, SSFISize);
Chain = DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys,
- Ops, array_lengthof(Ops),
- Op.getValueType(), MMO);
+ Ops, Op.getValueType(), MMO);
Result = DAG.getLoad(Op.getValueType(), DL, Chain, StackSlot,
MachinePointerInfo::getFixedStack(SSFI),
false, false, false, 0);
@@ -8900,7 +9332,7 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
SDValue Ops[] = { Store, StackSlot, DAG.getValueType(MVT::i64) };
SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops,
- array_lengthof(Ops), MVT::i64, MMO);
+ MVT::i64, MMO);
APInt FF(32, 0x5F800000ULL);
@@ -8993,8 +9425,7 @@ X86TargetLowering:: FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
MachineMemOperand *MMO =
MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
MachineMemOperand::MOLoad, MemSize, MemSize);
- Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops,
- array_lengthof(Ops), DstTy, MMO);
+ Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, DstTy, MMO);
Chain = Value.getValue(1);
SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false);
StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
@@ -9008,8 +9439,7 @@ X86TargetLowering:: FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
// Build the FP_TO_INT*_IN_MEM
SDValue Ops[] = { Chain, Value, StackSlot };
SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other),
- Ops, array_lengthof(Ops), DstTy,
- MMO);
+ Ops, DstTy, MMO);
return std::make_pair(FIST, StackSlot);
} else {
SDValue ftol = DAG.getNode(X86ISD::WIN_FTOL, DL,
@@ -9021,8 +9451,8 @@ X86TargetLowering:: FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
MVT::i32, eax.getValue(2));
SDValue Ops[] = { eax, edx };
SDValue pair = IsReplace
- ? DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops, array_lengthof(Ops))
- : DAG.getMergeValues(Ops, array_lengthof(Ops), DL);
+ ? DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops)
+ : DAG.getMergeValues(Ops, DL);
return std::make_pair(pair, SDValue());
}
}
@@ -9217,8 +9647,7 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
for (unsigned j = 0; j < 8; ++j)
pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
}
- SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8,
- &pshufbMask[0], 32);
+ SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, pshufbMask);
In = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v32i8, In, BV);
In = DAG.getNode(ISD::BITCAST, DL, MVT::v4i64, In);
@@ -9284,7 +9713,7 @@ SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op,
/*IsSigned=*/ true, /*IsReplace=*/ false);
SDValue FIST = Vals.first, StackSlot = Vals.second;
// If FP_TO_INTHelper failed, the node is actually supposed to be Legal.
- if (FIST.getNode() == 0) return Op;
+ if (!FIST.getNode()) return Op;
if (StackSlot.getNode())
// Load the result.
@@ -9581,12 +10010,29 @@ static SDValue LowerVectorAllZeroTest(SDValue Op, const X86Subtarget *Subtarget,
VecIns.back(), VecIns.back());
}
+/// \brief Return true if \c Op has a use that doesn't just read flags.
+static bool hasNonFlagsUse(SDValue Op) {
+ for (SDNode::use_iterator UI = Op->use_begin(), UE = Op->use_end(); UI != UE;
+ ++UI) {
+ SDNode *User = *UI;
+ unsigned UOpNo = UI.getOperandNo();
+ if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
+ // Look past the truncate.
+ UOpNo = User->use_begin().getOperandNo();
+ User = *User->use_begin();
+ }
+
+ if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC &&
+ !(User->getOpcode() == ISD::SELECT && UOpNo == 0))
+ return true;
+ }
+ return false;
+}
+
/// Emit nodes that will be selected as "test Op0,Op0", or something
/// equivalent.
-SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC,
+SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, SDLoc dl,
SelectionDAG &DAG) const {
- SDLoc dl(Op);
-
if (Op.getValueType() == MVT::i1)
// KORTEST instruction should be selected
return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
@@ -9687,31 +10133,35 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC,
Opcode = X86ISD::ADD;
NumOperands = 2;
break;
- case ISD::AND: {
- // If the primary and result isn't used, don't bother using X86ISD::AND,
- // because a TEST instruction will be better.
- bool NonFlagUse = false;
- for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
- UE = Op.getNode()->use_end(); UI != UE; ++UI) {
- SDNode *User = *UI;
- unsigned UOpNo = UI.getOperandNo();
- if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
- // Look pass truncate.
- UOpNo = User->use_begin().getOperandNo();
- User = *User->use_begin();
- }
-
- if (User->getOpcode() != ISD::BRCOND &&
- User->getOpcode() != ISD::SETCC &&
- !(User->getOpcode() == ISD::SELECT && UOpNo == 0)) {
- NonFlagUse = true;
+ case ISD::SHL:
+ case ISD::SRL:
+ // If we have a constant logical shift that's only used in a comparison
+ // against zero, turn it into an equivalent AND. This allows turning it
+ // into a TEST instruction later (see the sketch after this function).
+ if ((X86CC == X86::COND_E || X86CC == X86::COND_NE) &&
+ isa<ConstantSDNode>(Op->getOperand(1)) && !hasNonFlagsUse(Op)) {
+ EVT VT = Op.getValueType();
+ unsigned BitWidth = VT.getSizeInBits();
+ unsigned ShAmt = Op->getConstantOperandVal(1);
+ if (ShAmt >= BitWidth) // Avoid undefined shifts.
break;
- }
+ APInt Mask = ArithOp.getOpcode() == ISD::SRL
+ ? APInt::getHighBitsSet(BitWidth, BitWidth - ShAmt)
+ : APInt::getLowBitsSet(BitWidth, BitWidth - ShAmt);
+ if (!Mask.isSignedIntN(32)) // Avoid large immediates.
+ break;
+ SDValue New = DAG.getNode(ISD::AND, dl, VT, Op->getOperand(0),
+ DAG.getConstant(Mask, VT));
+ DAG.ReplaceAllUsesWith(Op, New);
+ Op = New;
}
+ break;
- if (!NonFlagUse)
+ case ISD::AND:
+ // If the primary and result isn't used, don't bother using X86ISD::AND,
+ // because a TEST instruction will be better.
+ if (!hasNonFlagsUse(Op))
break;
- }
// FALL THROUGH
case ISD::SUB:
case ISD::OR:
@@ -9794,7 +10244,7 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC,
for (unsigned i = 0; i != NumOperands; ++i)
Ops.push_back(Op.getOperand(i));
- SDValue New = DAG.getNode(Opcode, dl, VTs, &Ops[0], NumOperands);
+ SDValue New = DAG.getNode(Opcode, dl, VTs, Ops);
DAG.ReplaceAllUsesWith(Op, New);
return SDValue(New.getNode(), 1);
}
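
The SHL/SRL rewrite above rests on a simple identity: a logical shift is zero exactly when the bits it keeps are all zero, so (x >> c) == 0 becomes (x & high-bits-mask) == 0 and (x << c) == 0 becomes (x & low-bits-mask) == 0, which the AND/TEST selection can then exploit. A standalone check of the identity (plain C++, 32-bit case):

    #include <cassert>
    #include <cstdint>

    int main() {
      for (unsigned c = 1; c < 32; ++c) {
        uint32_t HighMask = ~uint32_t(0) << c; // bits kept by x >> c
        uint32_t LowMask = ~uint32_t(0) >> c;  // bits kept by x << c
        for (uint32_t x : {0u, 1u, 0x80000000u, 0xDEADBEEFu, ~0u}) {
          assert(((x >> c) == 0) == ((x & HighMask) == 0));
          assert(((x << c) == 0) == ((x & LowMask) == 0));
        }
      }
      return 0;
    }
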
@@ -9802,11 +10252,10 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC,
/// Emit nodes that will be selected as "cmp Op0,Op1", or something
/// equivalent.
SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
- SelectionDAG &DAG) const {
- SDLoc dl(Op0);
+ SDLoc dl, SelectionDAG &DAG) const {
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op1)) {
if (C->getAPIntValue() == 0)
- return EmitTest(Op0, X86CC, DAG);
+ return EmitTest(Op0, X86CC, dl, DAG);
if (Op0.getValueType() == MVT::i1)
llvm_unreachable("Unexpected comparison operation for MVT::i1 operands");
@@ -9888,7 +10337,7 @@ SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC,
unsigned AndBitWidth = And.getValueSizeInBits();
if (BitWidth > AndBitWidth) {
APInt Zeros, Ones;
- DAG.ComputeMaskedBits(Op0, Zeros, Ones);
+ DAG.computeKnownBits(Op0, Zeros, Ones);
if (Zeros.countLeadingOnes() < BitWidth - AndBitWidth)
return SDValue();
}
@@ -10054,7 +10503,7 @@ static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG,
/// \brief Try to turn a VSETULT into a VSETULE by modifying its second
/// operand \p Op1. If non-trivial (for example because it's not constant)
/// return an empty value.
-static SDValue ChangeVSETULTtoVSETULE(SDValue Op1, SelectionDAG &DAG)
+static SDValue ChangeVSETULTtoVSETULE(SDLoc dl, SDValue Op1, SelectionDAG &DAG)
{
BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op1.getNode());
if (!BV)
@@ -10078,8 +10527,7 @@ static SDValue ChangeVSETULTtoVSETULE(SDValue Op1, SelectionDAG &DAG)
ULTOp1.push_back(DAG.getConstant(Val - 1, EVT));
}
- return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(Op1), VT, ULTOp1.data(),
- ULTOp1.size());
+ return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, ULTOp1);
}
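
The transformation is sound because, for unsigned values and a nonzero constant C, x < C holds exactly when x <= C - 1, and the ULE form can then be matched as a saturating subtract compared against zero (the PSUBUS/PCMPEQ pattern used below). A scalar sanity check:

    #include <cassert>
    #include <cstdint>

    // For unsigned x and nonzero C:  (x < C) == (x <= C - 1), and
    // x <= y can be tested as subus(x, y) == 0.
    int main() {
      const uint8_t C = 17;
      for (unsigned i = 0; i != 256; ++i) {
        uint8_t x = uint8_t(i);
        uint8_t subus = x > uint8_t(C - 1) ? uint8_t(x - (C - 1)) : 0;
        assert((x < C) == (subus == 0));
      }
    }
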
static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget,
@@ -10204,7 +10652,7 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget,
// Only do this pre-AVX since vpcmp* is no longer destructive.
if (Subtarget->hasAVX())
break;
- SDValue ULEOp1 = ChangeVSETULTtoVSETULE(Op1, DAG);
+ SDValue ULEOp1 = ChangeVSETULTtoVSETULE(dl, Op1, DAG);
if (ULEOp1.getNode()) {
Op1 = ULEOp1;
Subus = true; Invert = false; Swap = false;
@@ -10383,7 +10831,7 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
if (X86CC == X86::COND_INVALID)
return SDValue();
- SDValue EFLAGS = EmitCmp(Op0, Op1, X86CC, DAG);
+ SDValue EFLAGS = EmitCmp(Op0, Op1, X86CC, dl, DAG);
EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG);
SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
DAG.getConstant(X86CC, MVT::i8), EFLAGS);
@@ -10418,11 +10866,6 @@ static bool isX86LogicalCmp(SDValue Op) {
return false;
}
-static bool isZero(SDValue V) {
- ConstantSDNode *C = dyn_cast<ConstantSDNode>(V);
- return C && C->isNullValue();
-}
-
static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
if (V.getOpcode() != ISD::TRUNCATE)
return false;
@@ -10517,7 +10960,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
Res = DAG.getNOT(DL, Res, Res.getValueType());
ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(Op2);
- if (N2C == 0 || !N2C->isNullValue())
+ if (!N2C || !N2C->isNullValue())
Res = DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y);
return Res;
}
@@ -10606,7 +11049,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
if (addTest) {
CC = DAG.getConstant(X86::COND_NE, MVT::i8);
- Cond = EmitTest(Cond, X86::COND_NE, DAG);
+ Cond = EmitTest(Cond, X86::COND_NE, DL, DAG);
}
// a < b ? -1 : 0 -> RES = ~setcc_carry
@@ -10646,7 +11089,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
// condition is true.
SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue);
SDValue Ops[] = { Op2, Op1, CC, Cond };
- return DAG.getNode(X86ISD::CMOV, DL, VTs, Ops, array_lengthof(Ops));
+ return DAG.getNode(X86ISD::CMOV, DL, VTs, Ops);
}
static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op, SelectionDAG &DAG) {
@@ -11027,7 +11470,7 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
if (addTest) {
CC = DAG.getConstant(X86::COND_NE, MVT::i8);
- Cond = EmitTest(Cond, X86::COND_NE, DAG);
+ Cond = EmitTest(Cond, X86::COND_NE, dl, DAG);
}
Cond = ConvertCmpIfNecessary(Cond, DAG);
return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
@@ -11042,13 +11485,50 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
SDValue
X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
SelectionDAG &DAG) const {
- assert((Subtarget->isOSWindows() ||
- getTargetMachine().Options.EnableSegmentedStacks) &&
- "This should be used only on Windows targets or when segmented stacks "
- "are being used");
- assert(!Subtarget->isTargetMacho() && "Not implemented");
+ MachineFunction &MF = DAG.getMachineFunction();
+ bool SplitStack = MF.shouldSplitStack();
+ bool Lower = (Subtarget->isOSWindows() && !Subtarget->isTargetMacho()) ||
+ SplitStack;
SDLoc dl(Op);
+ if (!Lower) {
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ SDNode* Node = Op.getNode();
+
+ unsigned SPReg = TLI.getStackPointerRegisterToSaveRestore();
+ assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
+ " not tell us which reg is the stack pointer!");
+ EVT VT = Node->getValueType(0);
+ SDValue Tmp1 = SDValue(Node, 0);
+ SDValue Tmp2 = SDValue(Node, 1);
+ SDValue Tmp3 = Node->getOperand(2);
+ SDValue Chain = Tmp1.getOperand(0);
+
+ // Chain the dynamic stack allocation so that it doesn't modify the stack
+ // pointer when other instructions are using the stack.
+ Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, true),
+ SDLoc(Node));
+
+ SDValue Size = Tmp2.getOperand(1);
+ SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
+ Chain = SP.getValue(1);
+ unsigned Align = cast<ConstantSDNode>(Tmp3)->getZExtValue();
+ const TargetFrameLowering &TFI = *getTargetMachine().getFrameLowering();
+ unsigned StackAlign = TFI.getStackAlignment();
+ Tmp1 = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
+ if (Align > StackAlign)
+ Tmp1 = DAG.getNode(ISD::AND, dl, VT, Tmp1,
+ DAG.getConstant(-(uint64_t)Align, VT));
+ Chain = DAG.getCopyToReg(Chain, dl, SPReg, Tmp1); // Output chain
+
+ Tmp2 = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, true),
+ DAG.getIntPtrConstant(0, true), SDValue(),
+ SDLoc(Node));
+
+ SDValue Ops[2] = { Tmp1, Tmp2 };
+ return DAG.getMergeValues(Ops, dl);
+ }
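
The generic expansion computes the new stack pointer with a subtract followed, when the requested alignment exceeds the stack alignment, by a downward-rounding AND. The same arithmetic in scalar form (a sketch; Align is assumed to be a power of two):

    #include <cstdint>

    // New stack pointer for a dynamic alloca: subtract the size, then round
    // down to the requested alignment. AND with -(uint64_t)Align is the same
    // as AND with ~(Align - 1) when Align is a power of two.
    uint64_t allocaSP(uint64_t SP, uint64_t Size, uint64_t Align) {
      uint64_t NewSP = SP - Size;
      return NewSP & -(uint64_t)Align;
    }
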
+
// Get the inputs.
SDValue Chain = Op.getOperand(0);
SDValue Size = Op.getOperand(1);
@@ -11058,8 +11538,7 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
bool Is64Bit = Subtarget->is64Bit();
EVT SPTy = Is64Bit ? MVT::i64 : MVT::i32;
- if (getTargetMachine().Options.EnableSegmentedStacks) {
- MachineFunction &MF = DAG.getMachineFunction();
+ if (SplitStack) {
MachineRegisterInfo &MRI = MF.getRegInfo();
if (Is64Bit) {
@@ -11081,7 +11560,7 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
SDValue Value = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain,
DAG.getRegister(Vreg, SPTy));
SDValue Ops1[2] = { Value, Chain };
- return DAG.getMergeValues(Ops1, 2, dl);
+ return DAG.getMergeValues(Ops1, dl);
} else {
SDValue Flag;
unsigned Reg = (Subtarget->is64Bit() ? X86::RAX : X86::EAX);
@@ -11105,7 +11584,7 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
}
SDValue Ops1[2] = { SP, Chain };
- return DAG.getMergeValues(Ops1, 2, dl);
+ return DAG.getMergeValues(Ops1, dl);
}
}
@@ -11166,8 +11645,7 @@ SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
Store = DAG.getStore(Op.getOperand(0), DL, RSFIN, FIN,
MachinePointerInfo(SV, 16), false, false, 0);
MemOps.push_back(Store);
- return DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
- &MemOps[0], MemOps.size());
+ return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
}
SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
@@ -11221,8 +11699,7 @@ SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
InstOps.push_back(DAG.getConstant(Align, MVT::i32));
SDVTList VTs = DAG.getVTList(getPointerTy(), MVT::Other);
SDValue VAARG = DAG.getMemIntrinsicNode(X86ISD::VAARG_64, dl,
- VTs, &InstOps[0], InstOps.size(),
- MVT::i64,
+ VTs, InstOps, MVT::i64,
MachinePointerInfo(SV),
/*Align=*/0,
/*Volatile=*/false,
@@ -11262,6 +11739,10 @@ static SDValue getTargetVShiftByConstNode(unsigned Opc, SDLoc dl, MVT VT,
SelectionDAG &DAG) {
MVT ElementType = VT.getVectorElementType();
+ // Fold this packed shift into its first operand if ShiftAmt is 0.
+ if (ShiftAmt == 0)
+ return SrcOp;
+
// Check for ShiftAmt >= element width
if (ShiftAmt >= ElementType.getSizeInBits()) {
if (Opc == X86ISD::VSRAI)
@@ -11282,7 +11763,7 @@ static SDValue getTargetVShiftByConstNode(unsigned Opc, SDLoc dl, MVT VT,
ConstantSDNode *ND;
switch(Opc) {
- default: llvm_unreachable(0);
+ default: llvm_unreachable(nullptr);
case X86ISD::VSHLI:
for (unsigned i=0; i!=NumElts; ++i) {
SDValue CurrentOp = SrcOp->getOperand(i);
@@ -11321,7 +11802,7 @@ static SDValue getTargetVShiftByConstNode(unsigned Opc, SDLoc dl, MVT VT,
break;
}
- return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &Elts[0], NumElts);
+ return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Elts);
}
return DAG.getNode(Opc, dl, VT, SrcOp, DAG.getConstant(ShiftAmt, MVT::i8));
@@ -11353,7 +11834,7 @@ static SDValue getTargetVShiftNode(unsigned Opc, SDLoc dl, MVT VT,
ShOps[0] = ShAmt;
ShOps[1] = DAG.getConstant(0, MVT::i32);
ShOps[2] = ShOps[3] = DAG.getUNDEF(MVT::i32);
- ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, &ShOps[0], 4);
+ ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, ShOps);
// The return type has to be a 128-bit type with the same element
// type as the input type.
@@ -11476,6 +11957,21 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) {
return DAG.getNode(X86ISD::PMULUDQ, dl, Op.getValueType(),
Op.getOperand(1), Op.getOperand(2));
+ case Intrinsic::x86_sse41_pmuldq:
+ case Intrinsic::x86_avx2_pmul_dq:
+ return DAG.getNode(X86ISD::PMULDQ, dl, Op.getValueType(),
+ Op.getOperand(1), Op.getOperand(2));
+
+ case Intrinsic::x86_sse2_pmulhu_w:
+ case Intrinsic::x86_avx2_pmulhu_w:
+ return DAG.getNode(ISD::MULHU, dl, Op.getValueType(),
+ Op.getOperand(1), Op.getOperand(2));
+
+ case Intrinsic::x86_sse2_pmulh_w:
+ case Intrinsic::x86_avx2_pmulh_w:
+ return DAG.getNode(ISD::MULHS, dl, Op.getValueType(),
+ Op.getOperand(1), Op.getOperand(2));
+
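ISD::MULHU and ISD::MULHS model exactly what pmulhu.w/pmulh.w compute per 16-bit lane: the upper half of the widened product. Scalar reference semantics (a sketch):

    #include <cstdint>

    // Per-lane semantics of pmulhuw / pmulhw: the high 16 bits of the
    // widened 32-bit product.
    uint16_t mulhu16(uint16_t a, uint16_t b) {
      return uint16_t((uint32_t(a) * uint32_t(b)) >> 16);
    }
    int16_t mulhs16(int16_t a, int16_t b) {
      return int16_t((int32_t(a) * int32_t(b)) >> 16);
    }
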
// SSE2/AVX2 sub with unsigned saturation intrinsics
case Intrinsic::x86_sse2_psubus_b:
case Intrinsic::x86_sse2_psubus_w:
@@ -11927,7 +12423,7 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) {
}
SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
- SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps.data(), NewOps.size());
+ SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps);
SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
DAG.getConstant(X86CC, MVT::i8),
SDValue(PCMP.getNode(), 1));
@@ -11944,7 +12440,7 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) {
SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
- return DAG.getNode(Opcode, dl, VTs, NewOps.data(), NewOps.size());
+ return DAG.getNode(Opcode, dl, VTs, NewOps);
}
case Intrinsic::x86_fma_vfmadd_ps:
case Intrinsic::x86_fma_vfmadd_pd:
@@ -12042,27 +12538,6 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) {
}
static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
- SDValue Base, SDValue Index,
- SDValue ScaleOp, SDValue Chain,
- const X86Subtarget * Subtarget) {
- SDLoc dl(Op);
- ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp);
- assert(C && "Invalid scale type");
- SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), MVT::i8);
- SDValue Src = getZeroVector(Op.getValueType(), Subtarget, DAG, dl);
- EVT MaskVT = MVT::getVectorVT(MVT::i1,
- Index.getSimpleValueType().getVectorNumElements());
- SDValue MaskInReg = DAG.getConstant(~0, MaskVT);
- SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
- SDValue Disp = DAG.getTargetConstant(0, MVT::i32);
- SDValue Segment = DAG.getRegister(0, MVT::i32);
- SDValue Ops[] = {Src, MaskInReg, Base, Scale, Index, Disp, Segment, Chain};
- SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
- SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) };
- return DAG.getMergeValues(RetOps, array_lengthof(RetOps), dl);
-}
-
-static SDValue getMGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
SDValue Src, SDValue Mask, SDValue Base,
SDValue Index, SDValue ScaleOp, SDValue Chain,
const X86Subtarget * Subtarget) {
@@ -12072,7 +12547,12 @@ static SDValue getMGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), MVT::i8);
EVT MaskVT = MVT::getVectorVT(MVT::i1,
Index.getSimpleValueType().getVectorNumElements());
- SDValue MaskInReg = DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask);
+ SDValue MaskInReg;
+ ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask);
+ if (MaskC)
+ MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), MaskVT);
+ else
+ MaskInReg = DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask);
SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
SDValue Disp = DAG.getTargetConstant(0, MVT::i32);
SDValue Segment = DAG.getRegister(0, MVT::i32);
@@ -12081,12 +12561,12 @@ static SDValue getMGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
SDValue Ops[] = {Src, MaskInReg, Base, Scale, Index, Disp, Segment, Chain};
SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) };
- return DAG.getMergeValues(RetOps, array_lengthof(RetOps), dl);
+ return DAG.getMergeValues(RetOps, dl);
}
static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
- SDValue Src, SDValue Base, SDValue Index,
- SDValue ScaleOp, SDValue Chain) {
+ SDValue Src, SDValue Mask, SDValue Base,
+ SDValue Index, SDValue ScaleOp, SDValue Chain) {
SDLoc dl(Op);
ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp);
assert(C && "Invalid scale type");
@@ -12095,52 +12575,218 @@ static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
SDValue Segment = DAG.getRegister(0, MVT::i32);
EVT MaskVT = MVT::getVectorVT(MVT::i1,
Index.getSimpleValueType().getVectorNumElements());
- SDValue MaskInReg = DAG.getConstant(~0, MaskVT);
+ SDValue MaskInReg;
+ ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask);
+ if (MaskC)
+ MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), MaskVT);
+ else
+ MaskInReg = DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask);
SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other);
SDValue Ops[] = {Base, Scale, Index, Disp, Segment, MaskInReg, Src, Chain};
SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
return SDValue(Res, 1);
}
-static SDValue getMScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
- SDValue Src, SDValue Mask, SDValue Base,
- SDValue Index, SDValue ScaleOp, SDValue Chain) {
+static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
+ SDValue Mask, SDValue Base, SDValue Index,
+ SDValue ScaleOp, SDValue Chain) {
SDLoc dl(Op);
ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp);
assert(C && "Invalid scale type");
SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), MVT::i8);
SDValue Disp = DAG.getTargetConstant(0, MVT::i32);
SDValue Segment = DAG.getRegister(0, MVT::i32);
- EVT MaskVT = MVT::getVectorVT(MVT::i1,
- Index.getSimpleValueType().getVectorNumElements());
- SDValue MaskInReg = DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask);
- SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other);
- SDValue Ops[] = {Base, Scale, Index, Disp, Segment, MaskInReg, Src, Chain};
- SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
- return SDValue(Res, 1);
+ EVT MaskVT =
+ MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements());
+ SDValue MaskInReg;
+ ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask);
+ if (MaskC)
+ MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), MaskVT);
+ else
+ MaskInReg = DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask);
+ SDValue Ops[] = {MaskInReg, Base, Scale, Index, Disp, Segment, Chain};
+ SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops);
+ return SDValue(Res, 0);
+}
+
+// getReadTimeStampCounter - Handles the lowering of builtin intrinsics that
+// read the time stamp counter (x86_rdtsc and x86_rdtscp). This function is
+// also used to custom lower READCYCLECOUNTER nodes.
+static void getReadTimeStampCounter(SDNode *N, SDLoc DL, unsigned Opcode,
+ SelectionDAG &DAG, const X86Subtarget *Subtarget,
+ SmallVectorImpl<SDValue> &Results) {
+ SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
+ SDValue rd = DAG.getNode(Opcode, DL, Tys, N->getOperand(0));
+ SDValue LO, HI;
+
+ // The processor's time-stamp counter (a 64-bit MSR) is stored into the
+ // EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR
+ // and the EAX register is loaded with the low-order 32 bits.
+ if (Subtarget->is64Bit()) {
+ LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1));
+ HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
+ LO.getValue(2));
+ } else {
+ LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1));
+ HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
+ LO.getValue(2));
+ }
+ SDValue Chain = HI.getValue(1);
+
+ if (Opcode == X86ISD::RDTSCP_DAG) {
+ assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
+
+ // Instruction RDTSCP loads the IA32_TSC_AUX MSR (address C000_0103H) into
+ // the ECX register. Add 'ecx' explicitly to the chain.
+ SDValue ecx = DAG.getCopyFromReg(Chain, DL, X86::ECX, MVT::i32,
+ HI.getValue(2));
+ // Explicitly store the content of ECX at the location passed to the
+ // 'rdtscp' intrinsic.
+ Chain = DAG.getStore(ecx.getValue(1), DL, ecx, N->getOperand(2),
+ MachinePointerInfo(), false, false, 0);
+ }
+
+ if (Subtarget->is64Bit()) {
+ // The EDX register is loaded with the high-order 32 bits of the MSR, and
+ // the EAX register is loaded with the low-order 32 bits.
+ SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
+ DAG.getConstant(32, MVT::i8));
+ Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
+ Results.push_back(Chain);
+ return;
+ }
+
+ // Use a buildpair to merge the two 32-bit values into a 64-bit one.
+ SDValue Ops[] = { LO, HI };
+ SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
+ Results.push_back(Pair);
+ Results.push_back(Chain);
+}
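
Whichever path is taken, the two 32-bit halves are reassembled into one 64-bit counter, either with SHL/OR or with a BUILD_PAIR. The underlying arithmetic (a sketch):

    #include <cstdint>

    // RDTSC leaves the counter split across EDX (high half) and EAX (low
    // half); the lowering reassembles it as (hi << 32) | lo.
    uint64_t combineTSC(uint32_t lo, uint32_t hi) {
      return (uint64_t(hi) << 32) | lo;
    }
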
+
+static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ SmallVector<SDValue, 2> Results;
+ SDLoc DL(Op);
+ getReadTimeStampCounter(Op.getNode(), DL, X86ISD::RDTSC_DAG, DAG, Subtarget,
+ Results);
+ return DAG.getMergeValues(Results, DL);
+}
+
+enum IntrinsicType {
+ GATHER, SCATTER, PREFETCH, RDSEED, RDRAND, RDTSC, XTEST
+};
+
+struct IntrinsicData {
+ IntrinsicData(IntrinsicType IType, unsigned IOpc0, unsigned IOpc1)
+ :Type(IType), Opc0(IOpc0), Opc1(IOpc1) {}
+ IntrinsicType Type;
+ unsigned Opc0;
+ unsigned Opc1;
+};
+
+static std::map<unsigned, IntrinsicData> IntrMap;
+static void InitIntrinsicsMap() {
+ static bool Initialized = false;
+ if (Initialized)
+ return;
+ IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gather_qps_512,
+ IntrinsicData(GATHER, X86::VGATHERQPSZrm, 0)));
+ IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gather_qpd_512,
+ IntrinsicData(GATHER, X86::VGATHERQPDZrm, 0)));
+ IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gather_dpd_512,
+ IntrinsicData(GATHER, X86::VGATHERDPDZrm, 0)));
+ IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gather_dps_512,
+ IntrinsicData(GATHER, X86::VGATHERDPSZrm, 0)));
+ IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gather_qpi_512,
+ IntrinsicData(GATHER, X86::VPGATHERQDZrm, 0)));
+ IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gather_qpq_512,
+ IntrinsicData(GATHER, X86::VPGATHERQQZrm, 0)));
+ IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gather_dpi_512,
+ IntrinsicData(GATHER, X86::VPGATHERDDZrm, 0)));
+ IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gather_dpq_512,
+ IntrinsicData(GATHER, X86::VPGATHERDQZrm, 0)));
+
+ IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatter_qps_512,
+ IntrinsicData(SCATTER, X86::VSCATTERQPSZmr, 0)));
+ IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatter_qpd_512,
+ IntrinsicData(SCATTER, X86::VSCATTERQPDZmr, 0)));
+ IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatter_dpd_512,
+ IntrinsicData(SCATTER, X86::VSCATTERDPDZmr, 0)));
+ IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatter_dps_512,
+ IntrinsicData(SCATTER, X86::VSCATTERDPSZmr, 0)));
+ IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatter_qpi_512,
+ IntrinsicData(SCATTER, X86::VPSCATTERQDZmr, 0)));
+ IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatter_qpq_512,
+ IntrinsicData(SCATTER, X86::VPSCATTERQQZmr, 0)));
+ IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatter_dpi_512,
+ IntrinsicData(SCATTER, X86::VPSCATTERDDZmr, 0)));
+ IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatter_dpq_512,
+ IntrinsicData(SCATTER, X86::VPSCATTERDQZmr, 0)));
+
+ IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gatherpf_qps_512,
+ IntrinsicData(PREFETCH, X86::VGATHERPF0QPSm,
+ X86::VGATHERPF1QPSm)));
+ IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gatherpf_qpd_512,
+ IntrinsicData(PREFETCH, X86::VGATHERPF0QPDm,
+ X86::VGATHERPF1QPDm)));
+ IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gatherpf_dpd_512,
+ IntrinsicData(PREFETCH, X86::VGATHERPF0DPDm,
+ X86::VGATHERPF1DPDm)));
+ IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gatherpf_dps_512,
+ IntrinsicData(PREFETCH, X86::VGATHERPF0DPSm,
+ X86::VGATHERPF1DPSm)));
+ IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatterpf_qps_512,
+ IntrinsicData(PREFETCH, X86::VSCATTERPF0QPSm,
+ X86::VSCATTERPF1QPSm)));
+ IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatterpf_qpd_512,
+ IntrinsicData(PREFETCH, X86::VSCATTERPF0QPDm,
+ X86::VSCATTERPF1QPDm)));
+ IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatterpf_dpd_512,
+ IntrinsicData(PREFETCH, X86::VSCATTERPF0DPDm,
+ X86::VSCATTERPF1DPDm)));
+ IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatterpf_dps_512,
+ IntrinsicData(PREFETCH, X86::VSCATTERPF0DPSm,
+ X86::VSCATTERPF1DPSm)));
+ IntrMap.insert(std::make_pair(Intrinsic::x86_rdrand_16,
+ IntrinsicData(RDRAND, X86ISD::RDRAND, 0)));
+ IntrMap.insert(std::make_pair(Intrinsic::x86_rdrand_32,
+ IntrinsicData(RDRAND, X86ISD::RDRAND, 0)));
+ IntrMap.insert(std::make_pair(Intrinsic::x86_rdrand_64,
+ IntrinsicData(RDRAND, X86ISD::RDRAND, 0)));
+ IntrMap.insert(std::make_pair(Intrinsic::x86_rdseed_16,
+ IntrinsicData(RDSEED, X86ISD::RDSEED, 0)));
+ IntrMap.insert(std::make_pair(Intrinsic::x86_rdseed_32,
+ IntrinsicData(RDSEED, X86ISD::RDSEED, 0)));
+ IntrMap.insert(std::make_pair(Intrinsic::x86_rdseed_64,
+ IntrinsicData(RDSEED, X86ISD::RDSEED, 0)));
+ IntrMap.insert(std::make_pair(Intrinsic::x86_xtest,
+ IntrinsicData(XTEST, X86ISD::XTEST, 0)));
+ IntrMap.insert(std::make_pair(Intrinsic::x86_rdtsc,
+ IntrinsicData(RDTSC, X86ISD::RDTSC_DAG, 0)));
+ IntrMap.insert(std::make_pair(Intrinsic::x86_rdtscp,
+ IntrinsicData(RDTSC, X86ISD::RDTSCP_DAG, 0)));
+ Initialized = true;
}
static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
SelectionDAG &DAG) {
- SDLoc dl(Op);
+ InitIntrinsicsMap();
unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
- switch (IntNo) {
- default: return SDValue(); // Don't custom lower most intrinsics.
+ std::map<unsigned, IntrinsicData>::const_iterator itr = IntrMap.find(IntNo);
+ if (itr == IntrMap.end())
+ return SDValue();
- // RDRAND/RDSEED intrinsics.
- case Intrinsic::x86_rdrand_16:
- case Intrinsic::x86_rdrand_32:
- case Intrinsic::x86_rdrand_64:
- case Intrinsic::x86_rdseed_16:
- case Intrinsic::x86_rdseed_32:
- case Intrinsic::x86_rdseed_64: {
- unsigned Opcode = (IntNo == Intrinsic::x86_rdseed_16 ||
- IntNo == Intrinsic::x86_rdseed_32 ||
- IntNo == Intrinsic::x86_rdseed_64) ? X86ISD::RDSEED :
- X86ISD::RDRAND;
+ SDLoc dl(Op);
+ IntrinsicData Intr = itr->second;
+ switch(Intr.Type) {
+ case RDSEED:
+ case RDRAND: {
// Emit the node with the right value type.
SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Glue, MVT::Other);
- SDValue Result = DAG.getNode(Opcode, dl, VTs, Op.getOperand(0));
+ SDValue Result = DAG.getNode(Intr.Opc0, dl, VTs, Op.getOperand(0));
// If the value returned by RDRAND/RDSEED was valid (CF=1), return 1.
// Otherwise return the value from Rand, which is always 0, casted to i32.
@@ -12150,152 +12796,55 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
SDValue(Result.getNode(), 1) };
SDValue isValid = DAG.getNode(X86ISD::CMOV, dl,
DAG.getVTList(Op->getValueType(1), MVT::Glue),
- Ops, array_lengthof(Ops));
+ Ops);
// Return { result, isValid, chain }.
return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
SDValue(Result.getNode(), 2));
}
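
The CMOV keyed on COND_B encodes the documented RDRAND/RDSEED contract: CF=1 means the destination holds a valid value, and on failure the destination is zero. A model of that contract (a hypothetical stand-in for the hardware step, not a real intrinsic):

    #include <cstdint>

    // Hypothetical model of one RDRAND/RDSEED step: CF=1 means 'out' holds
    // a valid random value; on failure the destination register is zero,
    // which is what the CMOV above selects.
    int rdrandStepModel(uint32_t *out, bool cf, uint32_t hwValue) {
      *out = cf ? hwValue : 0;
      return cf ? 1 : 0;
    }
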
- //int_gather(index, base, scale);
- case Intrinsic::x86_avx512_gather_qpd_512:
- case Intrinsic::x86_avx512_gather_qps_512:
- case Intrinsic::x86_avx512_gather_dpd_512:
- case Intrinsic::x86_avx512_gather_qpi_512:
- case Intrinsic::x86_avx512_gather_qpq_512:
- case Intrinsic::x86_avx512_gather_dpq_512:
- case Intrinsic::x86_avx512_gather_dps_512:
- case Intrinsic::x86_avx512_gather_dpi_512: {
- unsigned Opc;
- switch (IntNo) {
- default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
- case Intrinsic::x86_avx512_gather_qps_512: Opc = X86::VGATHERQPSZrm; break;
- case Intrinsic::x86_avx512_gather_qpd_512: Opc = X86::VGATHERQPDZrm; break;
- case Intrinsic::x86_avx512_gather_dpd_512: Opc = X86::VGATHERDPDZrm; break;
- case Intrinsic::x86_avx512_gather_dps_512: Opc = X86::VGATHERDPSZrm; break;
- case Intrinsic::x86_avx512_gather_qpi_512: Opc = X86::VPGATHERQDZrm; break;
- case Intrinsic::x86_avx512_gather_qpq_512: Opc = X86::VPGATHERQQZrm; break;
- case Intrinsic::x86_avx512_gather_dpi_512: Opc = X86::VPGATHERDDZrm; break;
- case Intrinsic::x86_avx512_gather_dpq_512: Opc = X86::VPGATHERDQZrm; break;
- }
- SDValue Chain = Op.getOperand(0);
- SDValue Index = Op.getOperand(2);
- SDValue Base = Op.getOperand(3);
- SDValue Scale = Op.getOperand(4);
- return getGatherNode(Opc, Op, DAG, Base, Index, Scale, Chain, Subtarget);
- }
- //int_gather_mask(v1, mask, index, base, scale);
- case Intrinsic::x86_avx512_gather_qps_mask_512:
- case Intrinsic::x86_avx512_gather_qpd_mask_512:
- case Intrinsic::x86_avx512_gather_dpd_mask_512:
- case Intrinsic::x86_avx512_gather_dps_mask_512:
- case Intrinsic::x86_avx512_gather_qpi_mask_512:
- case Intrinsic::x86_avx512_gather_qpq_mask_512:
- case Intrinsic::x86_avx512_gather_dpi_mask_512:
- case Intrinsic::x86_avx512_gather_dpq_mask_512: {
- unsigned Opc;
- switch (IntNo) {
- default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
- case Intrinsic::x86_avx512_gather_qps_mask_512:
- Opc = X86::VGATHERQPSZrm; break;
- case Intrinsic::x86_avx512_gather_qpd_mask_512:
- Opc = X86::VGATHERQPDZrm; break;
- case Intrinsic::x86_avx512_gather_dpd_mask_512:
- Opc = X86::VGATHERDPDZrm; break;
- case Intrinsic::x86_avx512_gather_dps_mask_512:
- Opc = X86::VGATHERDPSZrm; break;
- case Intrinsic::x86_avx512_gather_qpi_mask_512:
- Opc = X86::VPGATHERQDZrm; break;
- case Intrinsic::x86_avx512_gather_qpq_mask_512:
- Opc = X86::VPGATHERQQZrm; break;
- case Intrinsic::x86_avx512_gather_dpi_mask_512:
- Opc = X86::VPGATHERDDZrm; break;
- case Intrinsic::x86_avx512_gather_dpq_mask_512:
- Opc = X86::VPGATHERDQZrm; break;
- }
+ case GATHER: {
+ // gather(v1, mask, index, base, scale);
SDValue Chain = Op.getOperand(0);
SDValue Src = Op.getOperand(2);
- SDValue Mask = Op.getOperand(3);
+ SDValue Base = Op.getOperand(3);
SDValue Index = Op.getOperand(4);
- SDValue Base = Op.getOperand(5);
+ SDValue Mask = Op.getOperand(5);
SDValue Scale = Op.getOperand(6);
- return getMGatherNode(Opc, Op, DAG, Src, Mask, Base, Index, Scale, Chain,
+ return getGatherNode(Intr.Opc0, Op, DAG, Src, Mask, Base, Index, Scale, Chain,
Subtarget);
}
- //int_scatter(base, index, v1, scale);
- case Intrinsic::x86_avx512_scatter_qpd_512:
- case Intrinsic::x86_avx512_scatter_qps_512:
- case Intrinsic::x86_avx512_scatter_dpd_512:
- case Intrinsic::x86_avx512_scatter_qpi_512:
- case Intrinsic::x86_avx512_scatter_qpq_512:
- case Intrinsic::x86_avx512_scatter_dpq_512:
- case Intrinsic::x86_avx512_scatter_dps_512:
- case Intrinsic::x86_avx512_scatter_dpi_512: {
- unsigned Opc;
- switch (IntNo) {
- default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
- case Intrinsic::x86_avx512_scatter_qpd_512:
- Opc = X86::VSCATTERQPDZmr; break;
- case Intrinsic::x86_avx512_scatter_qps_512:
- Opc = X86::VSCATTERQPSZmr; break;
- case Intrinsic::x86_avx512_scatter_dpd_512:
- Opc = X86::VSCATTERDPDZmr; break;
- case Intrinsic::x86_avx512_scatter_dps_512:
- Opc = X86::VSCATTERDPSZmr; break;
- case Intrinsic::x86_avx512_scatter_qpi_512:
- Opc = X86::VPSCATTERQDZmr; break;
- case Intrinsic::x86_avx512_scatter_qpq_512:
- Opc = X86::VPSCATTERQQZmr; break;
- case Intrinsic::x86_avx512_scatter_dpq_512:
- Opc = X86::VPSCATTERDQZmr; break;
- case Intrinsic::x86_avx512_scatter_dpi_512:
- Opc = X86::VPSCATTERDDZmr; break;
- }
- SDValue Chain = Op.getOperand(0);
- SDValue Base = Op.getOperand(2);
- SDValue Index = Op.getOperand(3);
- SDValue Src = Op.getOperand(4);
- SDValue Scale = Op.getOperand(5);
- return getScatterNode(Opc, Op, DAG, Src, Base, Index, Scale, Chain);
- }
- //int_scatter_mask(base, mask, index, v1, scale);
- case Intrinsic::x86_avx512_scatter_qps_mask_512:
- case Intrinsic::x86_avx512_scatter_qpd_mask_512:
- case Intrinsic::x86_avx512_scatter_dpd_mask_512:
- case Intrinsic::x86_avx512_scatter_dps_mask_512:
- case Intrinsic::x86_avx512_scatter_qpi_mask_512:
- case Intrinsic::x86_avx512_scatter_qpq_mask_512:
- case Intrinsic::x86_avx512_scatter_dpi_mask_512:
- case Intrinsic::x86_avx512_scatter_dpq_mask_512: {
- unsigned Opc;
- switch (IntNo) {
- default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
- case Intrinsic::x86_avx512_scatter_qpd_mask_512:
- Opc = X86::VSCATTERQPDZmr; break;
- case Intrinsic::x86_avx512_scatter_qps_mask_512:
- Opc = X86::VSCATTERQPSZmr; break;
- case Intrinsic::x86_avx512_scatter_dpd_mask_512:
- Opc = X86::VSCATTERDPDZmr; break;
- case Intrinsic::x86_avx512_scatter_dps_mask_512:
- Opc = X86::VSCATTERDPSZmr; break;
- case Intrinsic::x86_avx512_scatter_qpi_mask_512:
- Opc = X86::VPSCATTERQDZmr; break;
- case Intrinsic::x86_avx512_scatter_qpq_mask_512:
- Opc = X86::VPSCATTERQQZmr; break;
- case Intrinsic::x86_avx512_scatter_dpq_mask_512:
- Opc = X86::VPSCATTERDQZmr; break;
- case Intrinsic::x86_avx512_scatter_dpi_mask_512:
- Opc = X86::VPSCATTERDDZmr; break;
- }
+ case SCATTER: {
+ // scatter(base, mask, index, v1, scale);
SDValue Chain = Op.getOperand(0);
SDValue Base = Op.getOperand(2);
SDValue Mask = Op.getOperand(3);
SDValue Index = Op.getOperand(4);
SDValue Src = Op.getOperand(5);
SDValue Scale = Op.getOperand(6);
- return getMScatterNode(Opc, Op, DAG, Src, Mask, Base, Index, Scale, Chain);
+ return getScatterNode(Intr.Opc0, Op, DAG, Src, Mask, Base, Index, Scale, Chain);
+ }
+ case PREFETCH: {
+ SDValue Hint = Op.getOperand(6);
+ ConstantSDNode *HintC = dyn_cast<ConstantSDNode>(Hint);
+ if (!HintC || HintC->getZExtValue() > 1)
+ llvm_unreachable("Wrong prefetch hint in intrinsic: should be 0 or 1");
+ unsigned HintVal = HintC->getZExtValue();
+ unsigned Opcode = (HintVal ? Intr.Opc1 : Intr.Opc0);
+ SDValue Chain = Op.getOperand(0);
+ SDValue Mask = Op.getOperand(2);
+ SDValue Index = Op.getOperand(3);
+ SDValue Base = Op.getOperand(4);
+ SDValue Scale = Op.getOperand(5);
+ return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain);
+ }
+ // Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP).
+ case RDTSC: {
+ SmallVector<SDValue, 2> Results;
+ getReadTimeStampCounter(Op.getNode(), dl, Intr.Opc0, DAG, Subtarget, Results);
+ return DAG.getMergeValues(Results, dl);
}
// XTEST intrinsics.
- case Intrinsic::x86_xtest: {
+ case XTEST: {
SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
SDValue InTrans = DAG.getNode(X86ISD::XTEST, dl, VTs, Op.getOperand(0));
SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
@@ -12306,6 +12855,7 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
Ret, SDValue(InTrans.getNode(), 1));
}
}
+ llvm_unreachable("Unknown Intrinsic Type");
}
SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
@@ -12358,6 +12908,19 @@ SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
return FrameAddr;
}
+// FIXME? Maybe this could be a TableGen attribute on some registers and
+// this table could be generated automatically from RegInfo.
+unsigned X86TargetLowering::getRegisterByName(const char* RegName,
+ EVT VT) const {
+ unsigned Reg = StringSwitch<unsigned>(RegName)
+ .Case("esp", X86::ESP)
+ .Case("rsp", X86::RSP)
+ .Default(0);
+ if (Reg)
+ return Reg;
+ report_fatal_error("Invalid register name global variable");
+}
+
SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
SelectionDAG &DAG) const {
const X86RegisterInfo *RegInfo =
@@ -12477,7 +13040,7 @@ SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
MachinePointerInfo(TrmpAddr, 22),
false, false, 0);
- return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 6);
+ return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
} else {
const Function *Func =
cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
@@ -12557,7 +13120,7 @@ SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
MachinePointerInfo(TrmpAddr, 6),
false, false, 1);
- return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 4);
+ return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
}
}
@@ -12600,8 +13163,7 @@ SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
SDValue Ops[] = { DAG.getEntryNode(), StackSlot };
SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL,
DAG.getVTList(MVT::Other),
- Ops, array_lengthof(Ops), MVT::i16,
- MMO);
+ Ops, MVT::i16, MMO);
// Load FP Control Word from stack slot
SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot,
@@ -12654,7 +13216,7 @@ static SDValue LowerCTLZ(SDValue Op, SelectionDAG &DAG) {
DAG.getConstant(X86::COND_E, MVT::i8),
Op.getValue(1)
};
- Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops, array_lengthof(Ops));
+ Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops);
// Finally xor with NumBits-1.
Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, DAG.getConstant(NumBits-1, OpVT));
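
The final XOR is the BSR-to-CTLZ conversion: BSR yields the index of the highest set bit, and for a 32-bit value ctlz = 31 - index, which equals 31 ^ index because 31 is all ones in its low five bits. A quick check of that identity:

    #include <cassert>

    // ctlz(x) = 31 - bsr(x) = 31 ^ bsr(x) for 32-bit x, since the BSR
    // result never exceeds 31.
    int main() {
      for (unsigned idx = 0; idx != 32; ++idx)
        assert(31 - idx == (31 ^ idx));
    }
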
@@ -12706,7 +13268,7 @@ static SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) {
DAG.getConstant(X86::COND_E, MVT::i8),
Op.getValue(1)
};
- return DAG.getNode(X86ISD::CMOV, dl, VT, Ops, array_lengthof(Ops));
+ return DAG.getNode(X86ISD::CMOV, dl, VT, Ops);
}
// Lower256IntArith - Break a 256-bit integer operation into two new 128-bit
@@ -12824,59 +13386,104 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget,
return DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo);
}
-static SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG) {
- MVT VT = Op.getSimpleValueType();
- MVT EltTy = VT.getVectorElementType();
- unsigned NumElts = VT.getVectorNumElements();
- SDValue N0 = Op.getOperand(0);
+SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const {
+ assert(Subtarget->isTargetWin64() && "Unexpected target");
+ EVT VT = Op.getValueType();
+ assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
+ "Unexpected return type for lowering");
+
+ RTLIB::Libcall LC;
+ bool isSigned;
+ switch (Op->getOpcode()) {
+ default: llvm_unreachable("Unexpected request for libcall!");
+ case ISD::SDIV: isSigned = true; LC = RTLIB::SDIV_I128; break;
+ case ISD::UDIV: isSigned = false; LC = RTLIB::UDIV_I128; break;
+ case ISD::SREM: isSigned = true; LC = RTLIB::SREM_I128; break;
+ case ISD::UREM: isSigned = false; LC = RTLIB::UREM_I128; break;
+ case ISD::SDIVREM: isSigned = true; LC = RTLIB::SDIVREM_I128; break;
+ case ISD::UDIVREM: isSigned = false; LC = RTLIB::UDIVREM_I128; break;
+ }
+
SDLoc dl(Op);
+ SDValue InChain = DAG.getEntryNode();
- // Lower sdiv X, pow2-const.
- BuildVectorSDNode *C = dyn_cast<BuildVectorSDNode>(Op.getOperand(1));
- if (!C)
- return SDValue();
+ TargetLowering::ArgListTy Args;
+ TargetLowering::ArgListEntry Entry;
+ for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) {
+ EVT ArgVT = Op->getOperand(i).getValueType();
+ assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
+ "Unexpected argument type for lowering");
+ SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
+ Entry.Node = StackPtr;
+ InChain = DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr, MachinePointerInfo(),
+ false, false, 16);
+ Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
+ Entry.Ty = PointerType::get(ArgTy,0);
+ Entry.isSExt = false;
+ Entry.isZExt = false;
+ Args.push_back(Entry);
+ }
+
+ SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
+ getPointerTy());
- APInt SplatValue, SplatUndef;
- unsigned SplatBitSize;
- bool HasAnyUndefs;
- if (!C->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
- HasAnyUndefs) ||
- EltTy.getSizeInBits() < SplatBitSize)
- return SDValue();
+ TargetLowering::CallLoweringInfo CLI(DAG);
+ CLI.setDebugLoc(dl).setChain(InChain)
+ .setCallee(getLibcallCallingConv(LC),
+ static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()),
+ Callee, &Args, 0)
+ .setInRegister().setSExtResult(isSigned).setZExtResult(!isSigned);
+
+ std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
+ return DAG.getNode(ISD::BITCAST, dl, VT, CallInfo.first);
+}
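
Win64 has no native 128-bit division, so each operand is spilled to a 16-byte-aligned stack slot and the RTLIB-selected compiler-rt routine (e.g. __divti3 for SDIV_I128) is invoked with pointer arguments, with the result coming back in an XMM register, hence the v2i64 return type and trailing bitcast. A sketch of the by-pointer shape with a hypothetical helper name (assumes clang/GCC __int128; not the exact ABI):

    #include <cstring>

    // Hypothetical stand-in for the RTLIB routine; only the by-pointer
    // argument shape is the point here.
    extern "C" void sdiv_i128_byptr(void *a, const void *b) {
      __int128 x, y;
      std::memcpy(&x, a, sizeof x);
      std::memcpy(&y, b, sizeof y);
      x /= y;                              // the operation the libcall performs
      std::memcpy(a, &x, sizeof x);
    }

    void demo(__int128 A, __int128 B, __int128 *Out) {
      alignas(16) __int128 TA = A, TB = B; // 16-byte-aligned stack temporaries
      sdiv_i128_byptr(&TA, &TB);           // operands passed by pointer
      *Out = TA;
    }
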
+
+static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ SDValue Op0 = Op.getOperand(0), Op1 = Op.getOperand(1);
+ EVT VT = Op0.getValueType();
+ SDLoc dl(Op);
- if ((SplatValue != 0) &&
- (SplatValue.isPowerOf2() || (-SplatValue).isPowerOf2())) {
- unsigned Lg2 = SplatValue.countTrailingZeros();
- // Splat the sign bit.
- SmallVector<SDValue, 16> Sz(NumElts,
- DAG.getConstant(EltTy.getSizeInBits() - 1,
- EltTy));
- SDValue SGN = DAG.getNode(ISD::SRA, dl, VT, N0,
- DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &Sz[0],
- NumElts));
- // Add (N0 < 0) ? abs2 - 1 : 0;
- SmallVector<SDValue, 16> Amt(NumElts,
- DAG.getConstant(EltTy.getSizeInBits() - Lg2,
- EltTy));
- SDValue SRL = DAG.getNode(ISD::SRL, dl, VT, SGN,
- DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &Amt[0],
- NumElts));
- SDValue ADD = DAG.getNode(ISD::ADD, dl, VT, N0, SRL);
- SmallVector<SDValue, 16> Lg2Amt(NumElts, DAG.getConstant(Lg2, EltTy));
- SDValue SRA = DAG.getNode(ISD::SRA, dl, VT, ADD,
- DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &Lg2Amt[0],
- NumElts));
-
- // If we're dividing by a positive value, we're done. Otherwise, we must
- // negate the result.
- if (SplatValue.isNonNegative())
- return SRA;
-
- SmallVector<SDValue, 16> V(NumElts, DAG.getConstant(0, EltTy));
- SDValue Zero = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], NumElts);
- return DAG.getNode(ISD::SUB, dl, VT, Zero, SRA);
+ assert((VT == MVT::v4i32 && Subtarget->hasSSE2()) ||
+ (VT == MVT::v8i32 && Subtarget->hasInt256()));
+
+ // Get the high parts.
+ const int Mask[] = {1, 2, 3, 4, 5, 6, 7, 8};
+ SDValue Hi0 = DAG.getVectorShuffle(VT, dl, Op0, Op0, Mask);
+ SDValue Hi1 = DAG.getVectorShuffle(VT, dl, Op1, Op1, Mask);
+
+ // Emit two multiplies, one for the even-indexed elements and one for the
+ // odd-indexed elements.
+ MVT MulVT = VT == MVT::v4i32 ? MVT::v2i64 : MVT::v4i64;
+ bool IsSigned = Op->getOpcode() == ISD::SMUL_LOHI;
+ unsigned Opcode =
+ (!IsSigned || !Subtarget->hasSSE41()) ? X86ISD::PMULUDQ : X86ISD::PMULDQ;
+ SDValue Mul1 = DAG.getNode(ISD::BITCAST, dl, VT,
+ DAG.getNode(Opcode, dl, MulVT, Op0, Op1));
+ SDValue Mul2 = DAG.getNode(ISD::BITCAST, dl, VT,
+ DAG.getNode(Opcode, dl, MulVT, Hi0, Hi1));
+
+ // Shuffle it back into the right order.
+ const int HighMask[] = {1, 5, 3, 7, 9, 13, 11, 15};
+ SDValue Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask);
+ const int LowMask[] = {0, 4, 2, 6, 8, 12, 10, 14};
+ SDValue Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask);
+
+ // If we have a signed multiply but no PMULDQ, fix up the high parts of an
+ // unsigned multiply.
+ if (IsSigned && !Subtarget->hasSSE41()) {
+ SDValue ShAmt =
+ DAG.getConstant(31, DAG.getTargetLoweringInfo().getShiftAmountTy(VT));
+ SDValue T1 = DAG.getNode(ISD::AND, dl, VT,
+ DAG.getNode(ISD::SRA, dl, VT, Op0, ShAmt), Op1);
+ SDValue T2 = DAG.getNode(ISD::AND, dl, VT,
+ DAG.getNode(ISD::SRA, dl, VT, Op1, ShAmt), Op0);
+
+ SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2);
+ Highs = DAG.getNode(ISD::SUB, dl, VT, Highs, Fixup);
}
- return SDValue();
+
+ return DAG.getNode(ISD::MERGE_VALUES, dl, Op.getValueType(), Highs, Lows);
}
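
The SRA/AND fixup uses the identity mulhs(a, b) = mulhu(a, b) - (a < 0 ? b : 0) - (b < 0 ? a : 0), taken modulo 2^32 per lane; the SRA-by-31 produces an all-ones mask exactly for the negative operands. A scalar spot check:

    #include <cassert>
    #include <cstdint>

    // The conditional terms below are what the SRA/AND pair above builds.
    int main() {
      int32_t a = -7, b = 100000;
      uint32_t hu = uint32_t((uint64_t(uint32_t(a)) * uint32_t(b)) >> 32);
      uint32_t fix = (a < 0 ? uint32_t(b) : 0) + (b < 0 ? uint32_t(a) : 0);
      int32_t hs = int32_t((int64_t(a) * int64_t(b)) >> 32);
      assert(int32_t(hu - fix) == hs);
    }
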
static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
@@ -12920,7 +13527,7 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
DAG.getConstant(uint8_t(-1U << ShiftAmt),
MVT::i8));
return DAG.getNode(ISD::AND, dl, VT, SHL,
- DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 16));
+ DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V));
}
if (Op.getOpcode() == ISD::SRL) {
// Make a large shift.
@@ -12933,7 +13540,7 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
DAG.getConstant(uint8_t(-1U) >> ShiftAmt,
MVT::i8));
return DAG.getNode(ISD::AND, dl, VT, SRL,
- DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 16));
+ DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V));
}
if (Op.getOpcode() == ISD::SRA) {
if (ShiftAmt == 7) {
@@ -12946,7 +13553,7 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
SmallVector<SDValue, 16> V(16, DAG.getConstant(128 >> ShiftAmt,
MVT::i8));
- SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 16);
+ SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V);
Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
return Res;
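
SSE has no arithmetic byte shift, so the SRA is synthesized from a logical one: with m = 0x80 >> n, the expression ((x >>u n) ^ m) - m flips the shifted-down sign bit and re-extends it. An exhaustive scalar check (relies on >> of a negative int being arithmetic, as on mainstream compilers):

    #include <cassert>
    #include <cstdint>

    // Arithmetic right shift of a byte built from a logical shift:
    //   m = 0x80 >> n;   sra(x, n) == ((x >>u n) ^ m) - m
    int main() {
      for (int v = -128; v != 128; ++v)
        for (unsigned n = 0; n != 8; ++n) {
          uint8_t x = uint8_t(v);
          uint8_t m = uint8_t(0x80u >> n);
          int8_t viaSrl = int8_t(uint8_t(((x >> n) ^ m) - m));
          assert(viaSrl == int8_t(v >> n));
        }
    }
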
@@ -12966,7 +13573,7 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
DAG.getConstant(uint8_t(-1U << ShiftAmt),
MVT::i8));
return DAG.getNode(ISD::AND, dl, VT, SHL,
- DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 32));
+ DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V));
}
if (Op.getOpcode() == ISD::SRL) {
// Make a large shift.
@@ -12979,7 +13586,7 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
DAG.getConstant(uint8_t(-1U) >> ShiftAmt,
MVT::i8));
return DAG.getNode(ISD::AND, dl, VT, SRL,
- DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 32));
+ DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V));
}
if (Op.getOpcode() == ISD::SRA) {
if (ShiftAmt == 7) {
@@ -12992,7 +13599,7 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
SmallVector<SDValue, 32> V(32, DAG.getConstant(128 >> ShiftAmt,
MVT::i8));
- SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 32);
+ SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V);
Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
return Res;
@@ -13014,7 +13621,7 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
uint64_t ShiftAmt = 0;
for (unsigned i = 0; i != Ratio; ++i) {
ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i));
- if (C == 0)
+ if (!C)
return SDValue();
// 6 == Log2(64)
ShiftAmt |= C->getZExtValue() << (i * (1 << (6 - RatioInLog2)));
@@ -13025,7 +13632,7 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
for (unsigned j = 0; j != Ratio; ++j) {
ConstantSDNode *C =
dyn_cast<ConstantSDNode>(Amt.getOperand(i + j));
- if (C == 0)
+ if (!C)
return SDValue();
// 6 == Log2(64)
ShAmt |= C->getZExtValue() << (j * (1 << (6 - RatioInLog2)));
@@ -13107,7 +13714,7 @@ static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
BaseShAmt = InVec.getOperand(1);
}
}
- if (BaseShAmt.getNode() == 0)
+ if (!BaseShAmt.getNode())
BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Amt,
DAG.getIntPtrConstant(0));
}
@@ -13260,7 +13867,7 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget,
}
Elts.push_back(DAG.getConstant(One.shl(ShAmt), SVT));
}
- SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &Elts[0], NumElems);
+ SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Elts);
return DAG.getNode(ISD::MUL, dl, VT, R, BV);
}
@@ -13274,6 +13881,79 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget,
return DAG.getNode(ISD::MUL, dl, VT, Op, R);
}
+ // If possible, lower this shift as a sequence of two shifts by
+ // constant plus a MOVSS/MOVSD instead of scalarizing it.
+ // Example:
+ // (v4i32 (srl A, (build_vector < X, Y, Y, Y>)))
+ //
+ // Could be rewritten as:
+ // (v4i32 (MOVSS (srl A, <Y,Y,Y,Y>), (srl A, <X,X,X,X>)))
+ //
+ // The advantage is that the two shifts from the example would be
+ // lowered as X86ISD::VSRLI nodes. This would be cheaper than scalarizing
+ // the vector shift into four scalar shifts plus four pairs of vector
+ // insert/extract.
+ if ((VT == MVT::v8i16 || VT == MVT::v4i32) &&
+ ISD::isBuildVectorOfConstantSDNodes(Amt.getNode())) {
+ unsigned TargetOpcode = X86ISD::MOVSS;
+ bool CanBeSimplified;
+ // The splat value for the first packed shift (the 'X' from the example).
+ SDValue Amt1 = Amt->getOperand(0);
+ // The splat value for the second packed shift (the 'Y' from the example).
+ SDValue Amt2 = (VT == MVT::v4i32) ? Amt->getOperand(1) :
+ Amt->getOperand(2);
+
+ // See if it is possible to replace this node with a sequence of
+ // two shifts followed by a MOVSS/MOVSD.
+ if (VT == MVT::v4i32) {
+ // Check if it is legal to use a MOVSS.
+ CanBeSimplified = Amt2 == Amt->getOperand(2) &&
+ Amt2 == Amt->getOperand(3);
+ if (!CanBeSimplified) {
+ // Otherwise, check if we can still simplify this node using a MOVSD.
+ CanBeSimplified = Amt1 == Amt->getOperand(1) &&
+ Amt->getOperand(2) == Amt->getOperand(3);
+ TargetOpcode = X86ISD::MOVSD;
+ Amt2 = Amt->getOperand(2);
+ }
+ } else {
+ // Do similar checks for the case where the machine value type
+ // is MVT::v8i16.
+ CanBeSimplified = Amt1 == Amt->getOperand(1);
+ for (unsigned i=3; i != 8 && CanBeSimplified; ++i)
+ CanBeSimplified = Amt2 == Amt->getOperand(i);
+
+ if (!CanBeSimplified) {
+ TargetOpcode = X86ISD::MOVSD;
+ CanBeSimplified = true;
+ Amt2 = Amt->getOperand(4);
+ for (unsigned i=0; i != 4 && CanBeSimplified; ++i)
+ CanBeSimplified = Amt1 == Amt->getOperand(i);
+ for (unsigned j=4; j != 8 && CanBeSimplified; ++j)
+ CanBeSimplified = Amt2 == Amt->getOperand(j);
+ }
+ }
+
+ if (CanBeSimplified && isa<ConstantSDNode>(Amt1) &&
+ isa<ConstantSDNode>(Amt2)) {
+ // Replace this node with two shifts followed by a MOVSS/MOVSD.
+ EVT CastVT = MVT::v4i32;
+ SDValue Splat1 =
+ DAG.getConstant(cast<ConstantSDNode>(Amt1)->getAPIntValue(), VT);
+ SDValue Shift1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat1);
+ SDValue Splat2 =
+ DAG.getConstant(cast<ConstantSDNode>(Amt2)->getAPIntValue(), VT);
+ SDValue Shift2 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat2);
+ if (TargetOpcode == X86ISD::MOVSD)
+ CastVT = MVT::v2i64;
+ SDValue BitCast1 = DAG.getNode(ISD::BITCAST, dl, CastVT, Shift1);
+ SDValue BitCast2 = DAG.getNode(ISD::BITCAST, dl, CastVT, Shift2);
+ SDValue Result = getTargetShuffleNode(TargetOpcode, dl, CastVT, BitCast2,
+ BitCast1, DAG);
+ return DAG.getNode(ISD::BITCAST, dl, VT, Result);
+ }
+ }
+
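The blends used above have fixed lane semantics: MOVSS takes lane 0 from its second source and the remaining lanes from the first (MOVSD does the same on 64-bit halves). A reference model of the v4i32 case (a sketch):

    #include <array>
    #include <cstdint>

    // movss(A, B) = <B0, A1, A2, A3>. The lowering passes the 'Y' shift as A
    // and the 'X' shift as B, so lane 0 is shifted by X and lanes 1-3 by Y.
    std::array<uint32_t, 4> movss(std::array<uint32_t, 4> A,
                                  std::array<uint32_t, 4> B) {
      return {B[0], A[1], A[2], A[3]};
    }
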
if (VT == MVT::v16i8 && Op->getOpcode() == ISD::SHL) {
assert(Subtarget->hasSSE2() && "Need SSE2 for pslli/pcmpeq.");
@@ -13351,10 +14031,8 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget,
for (unsigned i = NumElems/2; i != NumElems; ++i)
Amt2Csts.push_back(Amt->getOperand(i));
- Amt1 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT,
- &Amt1Csts[0], NumElems/2);
- Amt2 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT,
- &Amt2Csts[0], NumElems/2);
+ Amt1 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, Amt1Csts);
+ Amt2 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, Amt2Csts);
} else {
// Variable shift amount
Amt1 = Extract128BitVector(Amt, 0, DAG, dl);
@@ -13585,35 +14263,47 @@ static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget *Subtarget,
SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys,
- Ops, array_lengthof(Ops), T, MMO);
+ Ops, T, MMO);
SDValue cpOut =
DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1));
return cpOut;
}
-static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget *Subtarget,
- SelectionDAG &DAG) {
- assert(Subtarget->is64Bit() && "Result not type legalized?");
- SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
- SDValue TheChain = Op.getOperand(0);
- SDLoc dl(Op);
- SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1);
- SDValue rax = DAG.getCopyFromReg(rd, dl, X86::RAX, MVT::i64, rd.getValue(1));
- SDValue rdx = DAG.getCopyFromReg(rax.getValue(1), dl, X86::RDX, MVT::i64,
- rax.getValue(2));
- SDValue Tmp = DAG.getNode(ISD::SHL, dl, MVT::i64, rdx,
- DAG.getConstant(32, MVT::i8));
- SDValue Ops[] = {
- DAG.getNode(ISD::OR, dl, MVT::i64, rax, Tmp),
- rdx.getValue(1)
- };
- return DAG.getMergeValues(Ops, array_lengthof(Ops), dl);
-}
-
static SDValue LowerBITCAST(SDValue Op, const X86Subtarget *Subtarget,
SelectionDAG &DAG) {
MVT SrcVT = Op.getOperand(0).getSimpleValueType();
MVT DstVT = Op.getSimpleValueType();
+
+ if (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8) {
+ assert(Subtarget->hasSSE2() && "Requires at least SSE2!");
+ if (DstVT != MVT::f64)
+ // This conversion needs to be expanded.
+ return SDValue();
+
+ SDValue InVec = Op->getOperand(0);
+ SDLoc dl(Op);
+ unsigned NumElts = SrcVT.getVectorNumElements();
+ EVT SVT = SrcVT.getVectorElementType();
+
+ // Widen the input vector; for example, from MVT::v2i32 to MVT::v4i32.
+ SmallVector<SDValue, 16> Elts;
+ for (unsigned i = 0, e = NumElts; i != e; ++i)
+ Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT, InVec,
+ DAG.getIntPtrConstant(i)));
+
+ // Explicitly mark the extra elements as Undef.
+ SDValue Undef = DAG.getUNDEF(SVT);
+ for (unsigned i = NumElts, e = NumElts * 2; i != e; ++i)
+ Elts.push_back(Undef);
+
+ EVT NewVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
+ SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, Elts);
+ SDValue ToV2F64 = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, BV);
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, ToV2F64,
+ DAG.getIntPtrConstant(0));
+ }
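
The widening makes the bitcast legal because v4i32 and v2f64 have the same 128-bit size; lane 0 of the f64 result is just the raw bytes of the two original i32 lanes. In scalar terms (little-endian lane order assumed):

    #include <cstdint>
    #include <cstring>

    // Bitcasting <2 x i32> <lo, hi> to f64 reinterprets the same eight
    // bytes: the result's bit pattern is (hi << 32) | lo.
    double bitcastV2I32ToF64(uint32_t lo, uint32_t hi) {
      uint64_t bits = (uint64_t(hi) << 32) | lo;
      double d;
      std::memcpy(&d, &bits, sizeof d);
      return d;
    }
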
+
assert(Subtarget->is64Bit() && !Subtarget->hasSSE2() &&
Subtarget->hasMMX() && "Unexpected custom BITCAST");
assert((DstVT == MVT::i64 ||
@@ -13641,8 +14331,7 @@ static SDValue LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) {
cast<AtomicSDNode>(Node)->getMemoryVT(),
Node->getOperand(0),
Node->getOperand(1), negOp,
- cast<AtomicSDNode>(Node)->getSrcValue(),
- cast<AtomicSDNode>(Node)->getAlignment(),
+ cast<AtomicSDNode>(Node)->getMemOperand(),
cast<AtomicSDNode>(Node)->getOrdering(),
cast<AtomicSDNode>(Node)->getSynchScope());
}
@@ -13730,12 +14419,11 @@ static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget *Subtarget,
Type *RetTy = isF64
? (Type*)StructType::get(ArgTy, ArgTy, NULL)
: (Type*)VectorType::get(ArgTy, 4);
- TargetLowering::
- CallLoweringInfo CLI(DAG.getEntryNode(), RetTy,
- false, false, false, false, 0,
- CallingConv::C, /*isTaillCall=*/false,
- /*doesNotRet=*/false, /*isReturnValueUsed*/true,
- Callee, Args, DAG, dl);
+
+ TargetLowering::CallLoweringInfo CLI(DAG);
+ CLI.setDebugLoc(dl).setChain(DAG.getEntryNode())
+ .setCallee(CallingConv::C, RetTy, Callee, &Args, 0);
+
std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
if (isF64)
@@ -13764,6 +14452,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG);
case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG);
+ case ISD::VSELECT: return LowerVSELECT(Op, DAG);
case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG);
@@ -13815,6 +14504,8 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ_ZERO_UNDEF(Op, DAG);
case ISD::CTTZ: return LowerCTTZ(Op, DAG);
case ISD::MUL: return LowerMUL(Op, Subtarget, DAG);
+ case ISD::UMUL_LOHI:
+ case ISD::SMUL_LOHI: return LowerMUL_LOHI(Op, Subtarget, DAG);
case ISD::SRA:
case ISD::SRL:
case ISD::SHL: return LowerShift(Op, Subtarget, DAG);
@@ -13832,7 +14523,6 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::SUBE: return LowerADDC_ADDE_SUBC_SUBE(Op, DAG);
case ISD::ADD: return LowerADD(Op, DAG);
case ISD::SUB: return LowerSUB(Op, DAG);
- case ISD::SDIV: return LowerSDIV(Op, DAG);
case ISD::FSINCOS: return LowerFSINCOS(Op, Subtarget, DAG);
}
}
@@ -13875,10 +14565,10 @@ ReplaceATOMIC_BINARY_64(SDNode *Node, SmallVectorImpl<SDValue>&Results,
SDValue Ops[] = { Chain, In1, In2L, In2H };
SDVTList Tys = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other);
SDValue Result =
- DAG.getMemIntrinsicNode(NewOp, dl, Tys, Ops, array_lengthof(Ops), MVT::i64,
+ DAG.getMemIntrinsicNode(NewOp, dl, Tys, Ops, MVT::i64,
cast<MemSDNode>(Node)->getMemOperand());
SDValue OpsF[] = { Result.getValue(0), Result.getValue(1)};
- Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF, 2));
+ Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF));
Results.push_back(Result.getValue(2));
}
@@ -13899,6 +14589,16 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
case ISD::SUBE:
// We don't want to expand or promote these.
return;
+ case ISD::SDIV:
+ case ISD::UDIV:
+ case ISD::SREM:
+ case ISD::UREM:
+ case ISD::SDIVREM:
+ case ISD::UDIVREM: {
+ SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG);
+ Results.push_back(V);
+ return;
+ }
case ISD::FP_TO_SINT:
case ISD::FP_TO_UINT: {
bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT;
@@ -13909,10 +14609,10 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
std::pair<SDValue,SDValue> Vals =
FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, /*IsReplace=*/ true);
SDValue FIST = Vals.first, StackSlot = Vals.second;
- if (FIST.getNode() != 0) {
+ if (FIST.getNode()) {
EVT VT = N->getValueType(0);
// Return a load from the stack slot.
- if (StackSlot.getNode() != 0)
+ if (StackSlot.getNode())
Results.push_back(DAG.getLoad(VT, dl, FIST, StackSlot,
MachinePointerInfo(),
false, false, false, 0));
@@ -13945,20 +14645,22 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
Results.push_back(V);
return;
}
+ case ISD::INTRINSIC_W_CHAIN: {
+ unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
+ switch (IntNo) {
+ default: llvm_unreachable("Do not know how to custom type "
+ "legalize this intrinsic operation!");
+ case Intrinsic::x86_rdtsc:
+ return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget,
+ Results);
+ case Intrinsic::x86_rdtscp:
+ return getReadTimeStampCounter(N, dl, X86ISD::RDTSCP_DAG, DAG, Subtarget,
+ Results);
+ }
+ }
case ISD::READCYCLECOUNTER: {
- SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
- SDValue TheChain = N->getOperand(0);
- SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1);
- SDValue eax = DAG.getCopyFromReg(rd, dl, X86::EAX, MVT::i32,
- rd.getValue(1));
- SDValue edx = DAG.getCopyFromReg(eax.getValue(1), dl, X86::EDX, MVT::i32,
- eax.getValue(2));
- // Use a buildpair to merge the two 32-bit values into a 64-bit one.
- SDValue Ops[] = { eax, edx };
- Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Ops,
- array_lengthof(Ops)));
- Results.push_back(edx.getValue(1));
- return;
+ return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget,
+ Results);
}
case ISD::ATOMIC_CMP_SWAP: {
EVT T = N->getValueType(0);
@@ -13994,8 +14696,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
unsigned Opcode = Regs64bit ? X86ISD::LCMPXCHG16_DAG :
X86ISD::LCMPXCHG8_DAG;
- SDValue Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys,
- Ops, array_lengthof(Ops), T, MMO);
+ SDValue Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO);
SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
Regs64bit ? X86::RAX : X86::EAX,
HalfT, Result.getValue(1));
@@ -14003,7 +14704,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
Regs64bit ? X86::RDX : X86::EDX,
HalfT, cpOutL.getValue(2));
SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
- Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF, 2));
+ Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF));
Results.push_back(cpOutH.getValue(1));
return;
}
@@ -14058,14 +14759,39 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
ReplaceATOMIC_BINARY_64(N, Results, DAG, Opc);
return;
}
- case ISD::ATOMIC_LOAD:
+ case ISD::ATOMIC_LOAD: {
ReplaceATOMIC_LOAD(N, Results, DAG);
+ return;
+ }
+ case ISD::BITCAST: {
+ assert(Subtarget->hasSSE2() && "Requires at least SSE2!");
+ EVT DstVT = N->getValueType(0);
+ EVT SrcVT = N->getOperand(0)->getValueType(0);
+
+ if (SrcVT != MVT::f64 ||
+ (DstVT != MVT::v2i32 && DstVT != MVT::v4i16 && DstVT != MVT::v8i8))
+ return;
+
+ unsigned NumElts = DstVT.getVectorNumElements();
+ EVT SVT = DstVT.getVectorElementType();
+ EVT WiderVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
+ SDValue Expanded = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
+ MVT::v2f64, N->getOperand(0));
+ SDValue ToVecInt = DAG.getNode(ISD::BITCAST, dl, WiderVT, Expanded);
+
+ SmallVector<SDValue, 8> Elts;
+ for (unsigned i = 0, e = NumElts; i != e; ++i)
+ Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT,
+ ToVecInt, DAG.getIntPtrConstant(i)));
+
+ Results.push_back(DAG.getNode(ISD::BUILD_VECTOR, dl, DstVT, Elts));
+ }
}
}
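
Annotation: the new ISD::BITCAST case legalizes an f64-to-small-int-vector bitcast by widening: put the scalar into a v2f64, bitcast to a twice-as-wide integer vector, then extract the low NumElts lanes. A standalone sketch of the same bit-level reinterpretation in plain C++ rather than DAG nodes, mirroring the v2i32 case and assuming x86's little-endian lane order:

#include <cstdint>
#include <cstring>
#include <iostream>
#include <vector>

// Reinterpret an f64 as two i32 lanes, mirroring the DAG sequence
// scalar_to_vector -> bitcast-to-wider-int-vector -> extract low lanes.
static std::vector<uint32_t> bitcastF64ToV2I32(double D) {
  uint64_t Wide;                      // the "wider vector" holding the bits
  std::memcpy(&Wide, &D, sizeof(D));  // bit-preserving, like ISD::BITCAST
  std::vector<uint32_t> Elts(2);
  Elts[0] = static_cast<uint32_t>(Wide);        // extract lane 0
  Elts[1] = static_cast<uint32_t>(Wide >> 32);  // extract lane 1
  return Elts;
}

int main() {
  std::vector<uint32_t> V = bitcastF64ToV2I32(1.0);
  std::cout << std::hex << V[1] << " " << V[0] << "\n"; // 3ff00000 0
}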
const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
switch (Opcode) {
- default: return NULL;
+ default: return nullptr;
case X86ISD::BSF: return "X86ISD::BSF";
case X86ISD::BSR: return "X86ISD::BSR";
case X86ISD::SHLD: return "X86ISD::SHLD";
@@ -14176,7 +14902,6 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::OR: return "X86ISD::OR";
case X86ISD::XOR: return "X86ISD::XOR";
case X86ISD::AND: return "X86ISD::AND";
- case X86ISD::BZHI: return "X86ISD::BZHI";
case X86ISD::BEXTR: return "X86ISD::BEXTR";
case X86ISD::MUL_IMM: return "X86ISD::MUL_IMM";
case X86ISD::PTEST: return "X86ISD::PTEST";
@@ -14203,6 +14928,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::UNPCKH: return "X86ISD::UNPCKH";
case X86ISD::VBROADCAST: return "X86ISD::VBROADCAST";
case X86ISD::VBROADCASTM: return "X86ISD::VBROADCASTM";
+ case X86ISD::VEXTRACT: return "X86ISD::VEXTRACT";
case X86ISD::VPERMILP: return "X86ISD::VPERMILP";
case X86ISD::VPERM2X128: return "X86ISD::VPERM2X128";
case X86ISD::VPERMV: return "X86ISD::VPERMV";
@@ -14210,6 +14936,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::VPERMIV3: return "X86ISD::VPERMIV3";
case X86ISD::VPERMI: return "X86ISD::VPERMI";
case X86ISD::PMULUDQ: return "X86ISD::PMULUDQ";
+ case X86ISD::PMULDQ: return "X86ISD::PMULDQ";
case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS";
case X86ISD::VAARG_64: return "X86ISD::VAARG_64";
case X86ISD::WIN_ALLOCA: return "X86ISD::WIN_ALLOCA";
@@ -14240,7 +14967,7 @@ bool X86TargetLowering::isLegalAddressingMode(const AddrMode &AM,
Reloc::Model R = getTargetMachine().getRelocationModel();
// X86 allows a sign-extended 32-bit immediate field as a displacement.
- if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != NULL))
+ if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != nullptr))
return false;
if (AM.BaseGV) {
@@ -14418,7 +15145,23 @@ X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
if (VT.getSizeInBits() == 64)
return false;
- // FIXME: pshufb, blends, shifts.
+ // If this is a single-input shuffle with no 128-bit lane crossings, we can
+ // lower it into pshufb.
+ if ((SVT.is128BitVector() && Subtarget->hasSSSE3()) ||
+ (SVT.is256BitVector() && Subtarget->hasInt256())) {
+ bool isLegal = true;
+ for (unsigned I = 0, E = M.size(); I != E; ++I) {
+ if (M[I] >= (int)SVT.getVectorNumElements() ||
+ ShuffleCrosses128bitLane(SVT, I, M[I])) {
+ isLegal = false;
+ break;
+ }
+ }
+ if (isLegal)
+ return true;
+ }
+
+ // FIXME: blends, shifts.
return (SVT.getVectorNumElements() == 2 ||
ShuffleVectorSDNode::isSplatMask(&M[0], VT) ||
isMOVLMask(M, SVT) ||
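
Annotation: ShuffleCrosses128bitLane is not shown in this hunk; a sketch of the contract it is assumed to check here, namely that pshufb can only move bytes within their own 128-bit lane:

#include <iostream>

// Assumed contract of the lane test this hunk relies on: element SrcIdx
// may feed destination slot DstIdx under pshufb only if both fall in the
// same 128-bit lane.
static bool crosses128bitLane(unsigned EltBits, unsigned DstIdx,
                              int SrcIdx) {
  if (SrcIdx < 0)                       // undef lanes never cross
    return false;
  unsigned LaneElts = 128 / EltBits;    // elements per 128-bit lane
  return (DstIdx / LaneElts) != (unsigned(SrcIdx) / LaneElts);
}

int main() {
  // v8i32 (32-bit elements): lane 0 holds slots 0-3, lane 1 holds 4-7.
  std::cout << crosses128bitLane(32, 1, 2) << "\n"; // 0: same lane
  std::cout << crosses128bitLane(32, 1, 5) << "\n"; // 1: crosses lanes
}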
@@ -15366,7 +16109,7 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(
OffsetDestReg = 0; // unused
OverflowDestReg = DestReg;
- offsetMBB = NULL;
+ offsetMBB = nullptr;
overflowMBB = thisMBB;
endMBB = thisMBB;
} else {
@@ -15736,7 +16479,7 @@ X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI, MachineBasicBlock *BB,
MachineFunction *MF = BB->getParent();
const BasicBlock *LLVM_BB = BB->getBasicBlock();
- assert(getTargetMachine().Options.EnableSegmentedStacks);
+ assert(MF->shouldSplitStack());
unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
unsigned TlsOffset = Is64Bit ? 0x70 : 0x30;
@@ -16509,11 +17252,11 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
// X86 Optimization Hooks
//===----------------------------------------------------------------------===//
-void X86TargetLowering::computeMaskedBitsForTargetNode(const SDValue Op,
- APInt &KnownZero,
- APInt &KnownOne,
- const SelectionDAG &DAG,
- unsigned Depth) const {
+void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
+ APInt &KnownZero,
+ APInt &KnownOne,
+ const SelectionDAG &DAG,
+ unsigned Depth) const {
unsigned BitWidth = KnownZero.getBitWidth();
unsigned Opc = Op.getOpcode();
assert((Opc >= ISD::BUILTIN_OP_END ||
@@ -16576,8 +17319,10 @@ void X86TargetLowering::computeMaskedBitsForTargetNode(const SDValue Op,
}
}
-unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(SDValue Op,
- unsigned Depth) const {
+unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
+ SDValue Op,
+ const SelectionDAG &,
+ unsigned Depth) const {
// SETCC_CARRY sets the dest to ~0 for true or 0 for false.
if (Op.getOpcode() == X86ISD::SETCC_CARRY)
return Op.getValueType().getScalarType().getSizeInBits();
@@ -16679,7 +17424,6 @@ static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG,
SDValue Ops[] = { Ld->getChain(), Ld->getBasePtr() };
SDValue ResNode =
DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
- array_lengthof(Ops),
Ld->getMemoryVT(),
Ld->getPointerInfo(),
Ld->getAlignment(),
@@ -17036,6 +17780,51 @@ matchIntegerMINMAX(SDValue Cond, EVT VT, SDValue LHS, SDValue RHS,
return std::make_pair(Opc, NeedSplit);
}
+static SDValue
+TransformVSELECTtoBlendVECTOR_SHUFFLE(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget *Subtarget) {
+ SDLoc dl(N);
+ SDValue Cond = N->getOperand(0);
+ SDValue LHS = N->getOperand(1);
+ SDValue RHS = N->getOperand(2);
+
+ if (Cond.getOpcode() == ISD::SIGN_EXTEND) {
+ SDValue CondSrc = Cond->getOperand(0);
+ if (CondSrc->getOpcode() == ISD::SIGN_EXTEND_INREG)
+ Cond = CondSrc->getOperand(0);
+ }
+
+ MVT VT = N->getSimpleValueType(0);
+ MVT EltVT = VT.getVectorElementType();
+ unsigned NumElems = VT.getVectorNumElements();
+ // There is no blend with immediate in AVX-512.
+ if (VT.is512BitVector())
+ return SDValue();
+
+ if (!Subtarget->hasSSE41() || EltVT == MVT::i8)
+ return SDValue();
+ if (!Subtarget->hasInt256() && VT == MVT::v16i16)
+ return SDValue();
+
+ if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
+ return SDValue();
+
+ unsigned MaskValue = 0;
+ if (!BUILD_VECTORtoBlendMask(cast<BuildVectorSDNode>(Cond), MaskValue))
+ return SDValue();
+
+ SmallVector<int, 8> ShuffleMask(NumElems, -1);
+ for (unsigned i = 0; i < NumElems; ++i) {
+ // Be sure we emit undef where we can.
+ if (Cond.getOperand(i)->getOpcode() == ISD::UNDEF)
+ ShuffleMask[i] = -1;
+ else
+ ShuffleMask[i] = i + NumElems * ((MaskValue >> i) & 1);
+ }
+
+ return DAG.getVectorShuffle(VT, dl, LHS, RHS, &ShuffleMask[0]);
+}
+
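Annotation: the mask construction above turns a constant blend condition into a two-input shuffle mask, where slot i picks element i of LHS when the blend bit is 0 and element i of RHS, encoded as i + NumElems, when it is 1. A standalone restatement of that loop (BUILD_VECTORtoBlendMask is assumed to produce one bit per element):

#include <iostream>
#include <vector>

// Build a two-input shuffle mask from a per-element blend bitmask:
// bit i == 0 -> take LHS element i; bit i == 1 -> take RHS element i,
// encoded as i + NumElems (the second input's index space).
static std::vector<int> blendMaskToShuffle(unsigned MaskValue,
                                           unsigned NumElems) {
  std::vector<int> ShuffleMask(NumElems, -1);
  for (unsigned i = 0; i < NumElems; ++i)
    ShuffleMask[i] = i + NumElems * ((MaskValue >> i) & 1);
  return ShuffleMask;
}

int main() {
  // 4 elements, mask 0b0101: take RHS in slots 0 and 2, LHS elsewhere.
  for (int Idx : blendMaskToShuffle(0x5, 4))
    std::cout << Idx << " ";            // prints: 4 1 6 3
  std::cout << "\n";
}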
/// PerformSELECTCombine - Do target-specific dag combines on SELECT and VSELECT
/// nodes.
static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
@@ -17378,7 +18167,7 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
// Another special case: If C was a sign bit, the sub has been
// canonicalized into a xor.
- // FIXME: Would it be better to use ComputeMaskedBits to determine whether
+ // FIXME: Would it be better to use computeKnownBits to determine whether
// it's safe to decanonicalize the xor?
// x s< 0 ? x^C : 0 --> subus x, C
if (CC == ISD::SETLT && Other->getOpcode() == ISD::XOR &&
@@ -17544,7 +18333,13 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
// depend on the highest bit in each word. Try to use SimplifyDemandedBits
// to simplify previous instructions.
if (N->getOpcode() == ISD::VSELECT && DCI.isBeforeLegalizeOps() &&
- !DCI.isBeforeLegalize() && TLI.isOperationLegal(ISD::VSELECT, VT)) {
+ !DCI.isBeforeLegalize() &&
+ // We explicitly check against v8i16 and v16i16 because, although
+ // they're marked as Custom, they might only be legal when Cond is a
+ // build_vector of constants. This will be taken care of in a later
+ // condition.
+ (TLI.isOperationLegalOrCustom(ISD::VSELECT, VT) && VT != MVT::v16i16 &&
+ VT != MVT::v8i16)) {
unsigned BitWidth = Cond.getValueType().getScalarType().getSizeInBits();
// Don't optimize vector selects that map to mask-registers.
@@ -17571,6 +18366,23 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
DCI.CommitTargetLoweringOpt(TLO);
}
+ // We should generate an X86ISD::BLENDI from a vselect if its argument
+ // is a sign_extend_inreg of an any_extend of a BUILD_VECTOR of
+ // constants. This specific pattern gets generated when we split a
+ // selector for a 512 bit vector in a machine without AVX512 (but with
+ // 256-bit vectors), during legalization:
+ //
+ // (vselect (sign_extend (any_extend (BUILD_VECTOR)) i1) LHS RHS)
+ //
+ // If we find this pattern and the build_vectors are built from
+ // constants, we translate the vselect into a shuffle_vector that we
+ // know will be matched by LowerVECTOR_SHUFFLEtoBlend.
+ if (N->getOpcode() == ISD::VSELECT && !DCI.isBeforeLegalize()) {
+ SDValue Shuffle = TransformVSELECTtoBlendVECTOR_SHUFFLE(N, DAG, Subtarget);
+ if (Shuffle.getNode())
+ return Shuffle;
+ }
+
return SDValue();
}
@@ -17605,7 +18417,7 @@ static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
SDValue Op2 = Cmp.getOperand(1);
SDValue SetCC;
- const ConstantSDNode* C = 0;
+ const ConstantSDNode* C = nullptr;
bool needOppositeCond = (CC == X86::COND_E);
bool checkAgainstTrue = false; // Is it a comparison against 1?
@@ -17740,8 +18552,7 @@ static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
(FalseOp.getValueType() != MVT::f80 || hasFPCMov(CC))) {
SDValue Ops[] = { FalseOp, TrueOp,
DAG.getConstant(CC, MVT::i8), Flags };
- return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(),
- Ops, array_lengthof(Ops));
+ return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops);
}
// If this is a select between two integer constants, try to do some
@@ -17856,7 +18667,7 @@ static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
// the DCI.xxxx conditions are provided to postpone the optimization as
// late as possible.
- ConstantSDNode *CmpAgainst = 0;
+ ConstantSDNode *CmpAgainst = nullptr;
if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) &&
(CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) &&
!isa<ConstantSDNode>(Cond.getOperand(0))) {
@@ -17871,8 +18682,7 @@ static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) {
SDValue Ops[] = { FalseOp, Cond.getOperand(0),
DAG.getConstant(CC, MVT::i8), Cond };
- return DAG.getNode(X86ISD::CMOV, DL, N->getVTList (), Ops,
- array_lengthof(Ops));
+ return DAG.getNode(X86ISD::CMOV, DL, N->getVTList (), Ops);
}
}
}
@@ -17880,6 +18690,106 @@ static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
+static SDValue PerformINTRINSIC_WO_CHAINCombine(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget *Subtarget) {
+ unsigned IntNo = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
+ switch (IntNo) {
+ default: return SDValue();
+ // SSE/AVX/AVX2 blend intrinsics.
+ case Intrinsic::x86_avx2_pblendvb:
+ case Intrinsic::x86_avx2_pblendw:
+ case Intrinsic::x86_avx2_pblendd_128:
+ case Intrinsic::x86_avx2_pblendd_256:
+ // Don't try to simplify this intrinsic if we don't have AVX2.
+ if (!Subtarget->hasAVX2())
+ return SDValue();
+ // FALL-THROUGH
+ case Intrinsic::x86_avx_blend_pd_256:
+ case Intrinsic::x86_avx_blend_ps_256:
+ case Intrinsic::x86_avx_blendv_pd_256:
+ case Intrinsic::x86_avx_blendv_ps_256:
+ // Don't try to simplify this intrinsic if we don't have AVX.
+ if (!Subtarget->hasAVX())
+ return SDValue();
+ // FALL-THROUGH
+ case Intrinsic::x86_sse41_pblendw:
+ case Intrinsic::x86_sse41_blendpd:
+ case Intrinsic::x86_sse41_blendps:
+ case Intrinsic::x86_sse41_blendvps:
+ case Intrinsic::x86_sse41_blendvpd:
+ case Intrinsic::x86_sse41_pblendvb: {
+ SDValue Op0 = N->getOperand(1);
+ SDValue Op1 = N->getOperand(2);
+ SDValue Mask = N->getOperand(3);
+
+ // Don't try to simplify this intrinsic if we don't have SSE4.1.
+ if (!Subtarget->hasSSE41())
+ return SDValue();
+
+ // fold (blend A, A, Mask) -> A
+ if (Op0 == Op1)
+ return Op0;
+ // fold (blend A, B, allZeros) -> A
+ if (ISD::isBuildVectorAllZeros(Mask.getNode()))
+ return Op0;
+ // fold (blend A, B, allOnes) -> B
+ if (ISD::isBuildVectorAllOnes(Mask.getNode()))
+ return Op1;
+
+ // Simplify the case where the mask is a constant i32 value.
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Mask)) {
+ if (C->isNullValue())
+ return Op0;
+ if (C->isAllOnesValue())
+ return Op1;
+ }
+ }
+
+ // Packed SSE2/AVX2 arithmetic shift immediate intrinsics.
+ case Intrinsic::x86_sse2_psrai_w:
+ case Intrinsic::x86_sse2_psrai_d:
+ case Intrinsic::x86_avx2_psrai_w:
+ case Intrinsic::x86_avx2_psrai_d:
+ case Intrinsic::x86_sse2_psra_w:
+ case Intrinsic::x86_sse2_psra_d:
+ case Intrinsic::x86_avx2_psra_w:
+ case Intrinsic::x86_avx2_psra_d: {
+ SDValue Op0 = N->getOperand(1);
+ SDValue Op1 = N->getOperand(2);
+ EVT VT = Op0.getValueType();
+ assert(VT.isVector() && "Expected a vector type!");
+
+ if (isa<BuildVectorSDNode>(Op1))
+ Op1 = Op1.getOperand(0);
+
+ if (!isa<ConstantSDNode>(Op1))
+ return SDValue();
+
+ EVT SVT = VT.getVectorElementType();
+ unsigned SVTBits = SVT.getSizeInBits();
+
+ ConstantSDNode *CND = cast<ConstantSDNode>(Op1);
+ const APInt &C = APInt(SVTBits, CND->getAPIntValue().getZExtValue());
+ uint64_t ShAmt = C.getZExtValue();
+
+ // Don't try to convert this shift into an ISD::SRA if the shift
+ // count is bigger than or equal to the element size.
+ if (ShAmt >= SVTBits)
+ return SDValue();
+
+ // Trivial case: if the shift count is zero, then fold this
+ // into the first operand.
+ if (ShAmt == 0)
+ return Op0;
+
+ // Replace this packed shift intrinsic with a target-independent
+ // shift DAG node.
+ SDValue Splat = DAG.getConstant(C, VT);
+ return DAG.getNode(ISD::SRA, SDLoc(N), VT, Op0, Splat);
+ }
+ }
+}
+
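Annotation: the shift-immediate cases above refuse to form ISD::SRA when ShAmt >= the element width: the x86 packed arithmetic shifts clamp oversized counts, filling each element with sign bits, whereas a generic SRA by at least the bit width is undefined, so the two are not interchangeable there. A scalar illustration of the clamped semantics the guard preserves:

#include <cstdint>
#include <iostream>

// x86 psraw/psrad clamp the count: shifting by >= the element width
// fills the element with sign bits instead of being undefined.
static int16_t psraiLane(int16_t V, unsigned Count) {
  if (Count >= 16)               // hardware clamp for 16-bit lanes
    return V < 0 ? -1 : 0;       // all sign bits
  return V >> Count;             // safe: Count < width, like ISD::SRA
}

int main() {
  std::cout << psraiLane(-32768, 3)  << "\n"; // -4096
  std::cout << psraiLane(-32768, 20) << "\n"; // -1 (clamped), not UB
}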
/// PerformMulCombine - Optimize a single multiply with constant into two
/// in order to implement it with two cheaper instructions, e.g.
/// LEA + SHL, LEA + LEA.
@@ -18223,7 +19133,7 @@ static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG,
N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT.getScalarType(),
N1->getOperand(0));
SmallVector<SDValue, 8> C(WideVT.getVectorNumElements(), N1);
- N1 = DAG.getNode(ISD::BUILD_VECTOR, DL, WideVT, &C[0], C.size());
+ N1 = DAG.getNode(ISD::BUILD_VECTOR, DL, WideVT, C);
} else if (RHSTrunc) {
N1 = N1->getOperand(0);
}
@@ -18260,40 +19170,13 @@ static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG,
if (R.getNode())
return R;
- // Create BEXTR and BZHI instructions
- // BZHI is X & ((1 << Y) - 1)
+ // Create BEXTR instructions
// BEXTR is ((X >> imm) & (2**size-1))
if (VT == MVT::i32 || VT == MVT::i64) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
SDLoc DL(N);
- if (Subtarget->hasBMI2()) {
- // Check for (and (add (shl 1, Y), -1), X)
- if (N0.getOpcode() == ISD::ADD && isAllOnes(N0.getOperand(1))) {
- SDValue N00 = N0.getOperand(0);
- if (N00.getOpcode() == ISD::SHL) {
- SDValue N001 = N00.getOperand(1);
- assert(N001.getValueType() == MVT::i8 && "unexpected type");
- ConstantSDNode *C = dyn_cast<ConstantSDNode>(N00.getOperand(0));
- if (C && C->getZExtValue() == 1)
- return DAG.getNode(X86ISD::BZHI, DL, VT, N1, N001);
- }
- }
-
- // Check for (and X, (add (shl 1, Y), -1))
- if (N1.getOpcode() == ISD::ADD && isAllOnes(N1.getOperand(1))) {
- SDValue N10 = N1.getOperand(0);
- if (N10.getOpcode() == ISD::SHL) {
- SDValue N101 = N10.getOperand(1);
- assert(N101.getValueType() == MVT::i8 && "unexpected type");
- ConstantSDNode *C = dyn_cast<ConstantSDNode>(N10.getOperand(0));
- if (C && C->getZExtValue() == 1)
- return DAG.getNode(X86ISD::BZHI, DL, VT, N0, N101);
- }
- }
- }
-
// Check for BEXTR.
if ((Subtarget->hasBMI() || Subtarget->hasTBM()) &&
(N0.getOpcode() == ISD::SRA || N0.getOpcode() == ISD::SRL)) {
@@ -18533,8 +19416,7 @@ static SDValue performIntegerAbsCombine(SDNode *N, SelectionDAG &DAG) {
SDValue Ops[] = { N0.getOperand(0), Neg,
DAG.getConstant(X86::COND_GE, MVT::i8),
SDValue(Neg.getNode(), 1) };
- return DAG.getNode(X86ISD::CMOV, DL, DAG.getVTList(VT, MVT::Glue),
- Ops, array_lengthof(Ops));
+ return DAG.getNode(X86ISD::CMOV, DL, DAG.getVTList(VT, MVT::Glue), Ops);
}
return SDValue();
}
@@ -18691,8 +19573,7 @@ static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG,
Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
}
- SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &Chains[0],
- Chains.size());
+ SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
// Bitcast the loaded value to a vector of the original element type, in
// the size of the target vector type.
@@ -18867,8 +19748,7 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
Chains.push_back(Ch);
}
- return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &Chains[0],
- Chains.size());
+ return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
}
// Turn load->store of MMX types into GPR load/stores. This avoids clobbering
@@ -18891,7 +19771,7 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
!cast<LoadSDNode>(St->getValue())->isVolatile() &&
St->getChain().hasOneUse() && !St->isVolatile()) {
SDNode* LdVal = St->getValue().getNode();
- LoadSDNode *Ld = 0;
+ LoadSDNode *Ld = nullptr;
int TokenFactorIndex = -1;
SmallVector<SDValue, 8> Ops;
SDNode* ChainVal = St->getChain().getNode();
@@ -18934,8 +19814,7 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
SDValue NewChain = NewLd.getValue(1);
if (TokenFactorIndex != -1) {
Ops.push_back(NewChain);
- NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0],
- Ops.size());
+ NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, Ops);
}
return DAG.getStore(NewChain, StDL, NewLd, St->getBasePtr(),
St->getPointerInfo(),
@@ -18962,8 +19841,7 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
if (TokenFactorIndex != -1) {
Ops.push_back(LoLd);
Ops.push_back(HiLd);
- NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0],
- Ops.size());
+ NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, Ops);
}
LoAddr = St->getBasePtr();
@@ -19432,6 +20310,33 @@ static SDValue PerformISDSETCCCombine(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
+static SDValue PerformINSERTPSCombine(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget *Subtarget) {
+ SDLoc dl(N);
+ MVT VT = N->getOperand(1)->getSimpleValueType(0);
+ assert((VT == MVT::v4f32 || VT == MVT::v4i32) &&
+ "X86insertps is only defined for v4x32");
+
+ SDValue Ld = N->getOperand(1);
+ if (MayFoldLoad(Ld)) {
+ // Extract the countS bits from the immediate so we can get the proper
+ // address when narrowing the vector load to a specific element.
+ // When the second source op is a memory address, insertps doesn't use
+ // countS and just gets an f32 from that address.
+ unsigned DestIndex =
+ cast<ConstantSDNode>(N->getOperand(2))->getZExtValue() >> 6;
+ Ld = NarrowVectorLoadToElement(cast<LoadSDNode>(Ld), DestIndex, DAG);
+ } else
+ return SDValue();
+
+ // Create this as a scalar to vector to match the instruction pattern.
+ SDValue LoadScalarToVector = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Ld);
+ // countS bits are ignored when loading from memory on insertps, which
+ // means we don't need to explicitly set them to 0.
+ return DAG.getNode(X86ISD::INSERTPS, dl, VT, N->getOperand(0),
+ LoadScalarToVector, N->getOperand(2));
+}
+
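Annotation: PerformINSERTPSCombine extracts countS by shifting the 8-bit insertps immediate right by 6. A sketch of the control byte's field layout as used here, per the SSE4.1 encoding (bits [7:6] countS, [5:4] countD, [3:0] zero mask):

#include <cstdint>
#include <iostream>

// Decode the SSE4.1 insertps control byte:
//   bits [7:6] countS - source element (ignored for memory operands),
//   bits [5:4] countD - destination element,
//   bits [3:0] zmask  - lanes forced to zero.
struct InsertPSImm {
  unsigned CountS, CountD, ZMask;
};

static InsertPSImm decodeInsertPS(uint8_t Imm) {
  return { unsigned(Imm >> 6), unsigned((Imm >> 4) & 0x3),
           unsigned(Imm & 0xF) };
}

int main() {
  InsertPSImm F = decodeInsertPS(0x9A);     // 0b10'01'1010
  std::cout << F.CountS << " " << F.CountD  // 2 1
            << " " << F.ZMask << "\n";      // 10
}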
// Helper function of PerformSETCCCombine. It is to materialize "setb reg"
// as "sbb reg,reg", since it can be extended without zext and produces
// an all-ones bit which is more useful than 0/1 in some cases.
@@ -19711,7 +20616,8 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case ISD::ANY_EXTEND:
case ISD::ZERO_EXTEND: return PerformZExtCombine(N, DAG, DCI, Subtarget);
case ISD::SIGN_EXTEND: return PerformSExtCombine(N, DAG, DCI, Subtarget);
- case ISD::SIGN_EXTEND_INREG: return PerformSIGN_EXTEND_INREGCombine(N, DAG, Subtarget);
+ case ISD::SIGN_EXTEND_INREG:
+ return PerformSIGN_EXTEND_INREGCombine(N, DAG, Subtarget);
case ISD::TRUNCATE: return PerformTruncateCombine(N, DAG,DCI,Subtarget);
case ISD::SETCC: return PerformISDSETCCCombine(N, DAG, Subtarget);
case X86ISD::SETCC: return PerformSETCCCombine(N, DAG, DCI, Subtarget);
@@ -19732,6 +20638,10 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case X86ISD::VPERM2X128:
case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, DCI,Subtarget);
case ISD::FMA: return PerformFMACombine(N, DAG, Subtarget);
+ case ISD::INTRINSIC_WO_CHAIN:
+ return PerformINTRINSIC_WO_CHAINCombine(N, DAG, Subtarget);
+ case X86ISD::INSERTPS:
+ return PerformINSERTPSCombine(N, DAG, Subtarget);
}
return SDValue();
@@ -20006,7 +20916,7 @@ TargetLowering::ConstraintWeight
Value *CallOperandVal = info.CallOperandVal;
// If we don't have a value, we can't do a match,
// but allow it at the lowest weight.
- if (CallOperandVal == NULL)
+ if (!CallOperandVal)
return CW_Default;
Type *type = CallOperandVal->getType();
// Look at the constraint type.
@@ -20124,7 +21034,7 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
std::string &Constraint,
std::vector<SDValue>&Ops,
SelectionDAG &DAG) const {
- SDValue Result(0, 0);
+ SDValue Result;
// Only support length 1 constraints for now.
if (Constraint.length() > 1) return;
@@ -20207,7 +21117,7 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
// If we are in non-pic codegen mode, we allow the address of a global (with
// an optional displacement) to be used with 'i'.
- GlobalAddressSDNode *GA = 0;
+ GlobalAddressSDNode *GA = nullptr;
int64_t Offset = 0;
// Match either (GA), (GA+C), (GA+C1+C2), etc.
@@ -20363,7 +21273,7 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
Res = TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
// Not found as a standard register?
- if (Res.second == 0) {
+ if (!Res.second) {
// Map st(0) -> st(7) -> ST0
if (Constraint.size() == 7 && Constraint[0] == '{' &&
tolower(Constraint[1]) == 's' &&
@@ -20488,3 +21398,30 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
return Res;
}
+
+int X86TargetLowering::getScalingFactorCost(const AddrMode &AM,
+ Type *Ty) const {
+ // Scaling factors are not free at all.
+ // An indexed folded instruction, i.e., inst (reg1, reg2, scale),
+ // will take 2 allocations in the out-of-order engine instead of 1
+ // for a plain addressing mode, i.e., inst (reg1).
+ // E.g.,
+ // vaddps (%rsi,%rdx), %ymm0, %ymm1
+ // Requires two allocations (one for the load, one for the computation)
+ // whereas:
+ // vaddps (%rsi), %ymm0, %ymm1
+ // Requires just 1 allocation, i.e., freeing allocations for other operations
+ // and having fewer micro-operations to execute.
+ //
+ // For some X86 architectures, this is even worse because, for instance,
+ // for stores the complex addressing mode forces the instruction to use the
+ // "load" ports instead of the dedicated "store" port.
+ // E.g., on Haswell:
+ // vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3.
+ // vmovaps %ymm1, (%r8) can use port 2, 3, or 7.
+ if (isLegalAddressingMode(AM, Ty))
+ // Scale represents reg2 * scale, thus account for 1
+ // as soon as we use a second register.
+ return AM.Scale != 0;
+ return -1;
+}
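
Annotation: getScalingFactorCost encodes the register-allocation penalty described in its comments: a legal mode costs 1 extra when a scaled index register is in play, 0 otherwise, and an unsupported mode returns a negative value. A compilable sketch of the same decision table; the AddrMode struct here is a simplified stand-in for TargetLowering::AddrMode:

#include <iostream>

// Simplified stand-in for TargetLowering::AddrMode.
struct AddrMode {
  long BaseOffs = 0;
  bool HasBaseReg = false;
  long Scale = 0;   // 0 means no scaled index register
};

// Mirror of the hook's contract: >= 0 if the mode is supported
// (1 when a second, scaled register is in play), negative otherwise.
static int scalingFactorCost(const AddrMode &AM, bool IsLegal) {
  if (IsLegal)
    return AM.Scale != 0; // reg1 + reg2*scale needs one extra allocation
  return -1;
}

int main() {
  AddrMode Plain;               // inst (reg1)
  AddrMode Scaled;
  Scaled.Scale = 2;             // inst (reg1, reg2, 2)
  std::cout << scalingFactorCost(Plain, true)  << "\n"; // 0
  std::cout << scalingFactorCost(Scaled, true) << "\n"; // 1
}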
diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h
index 0f0d17b..9f51b53 100644
--- a/lib/Target/X86/X86ISelLowering.h
+++ b/lib/Target/X86/X86ISelLowering.h
@@ -83,6 +83,9 @@ namespace llvm {
/// readcyclecounter
RDTSC_DAG,
+ /// X86 Read Time-Stamp Counter and Processor ID.
+ RDTSCP_DAG,
+
/// X86 compare and logical compare instructions.
CMP, COMI, UCOMI,
@@ -291,7 +294,6 @@ namespace llvm {
ADD, SUB, ADC, SBB, SMUL,
INC, DEC, OR, XOR, AND,
- BZHI, // BZHI - Zero high bits
BEXTR, // BEXTR - Bit field extract
UMUL, // LOW, HI, FLAGS = umul LHS, RHS
@@ -345,6 +347,8 @@ namespace llvm {
// PMULUDQ - Vector multiply packed unsigned doubleword integers
PMULUDQ,
+ // PMULDQ - Vector multiply packed signed doubleword integers
+ PMULDQ,
// FMA nodes
FMADD,
@@ -614,18 +618,19 @@ namespace llvm {
/// getSetCCResultType - Return the value type to use for ISD::SETCC.
EVT getSetCCResultType(LLVMContext &Context, EVT VT) const override;
- /// computeMaskedBitsForTargetNode - Determine which of the bits specified
+ /// computeKnownBitsForTargetNode - Determine which of the bits specified
/// in Mask are known to be either zero or one and return them in the
/// KnownZero/KnownOne bitsets.
- void computeMaskedBitsForTargetNode(const SDValue Op,
- APInt &KnownZero,
- APInt &KnownOne,
- const SelectionDAG &DAG,
- unsigned Depth = 0) const override;
+ void computeKnownBitsForTargetNode(const SDValue Op,
+ APInt &KnownZero,
+ APInt &KnownOne,
+ const SelectionDAG &DAG,
+ unsigned Depth = 0) const override;
// ComputeNumSignBitsForTargetNode - Determine the number of bits in the
// operation that are sign bits.
unsigned ComputeNumSignBitsForTargetNode(SDValue Op,
+ const SelectionDAG &DAG,
unsigned Depth) const override;
bool isGAPlusOffset(SDNode *N, const GlobalValue* &GA,
@@ -679,6 +684,12 @@ namespace llvm {
/// the immediate into a register.
bool isLegalAddImmediate(int64_t Imm) const override;
+ /// \brief Return the cost of the scaling factor used in the addressing
+ /// mode represented by AM for this target, for a load/store
+ /// of the specified type.
+ /// If the AM is supported, the return value must be >= 0.
+ /// If the AM is not supported, it returns a negative value.
+ int getScalingFactorCost(const AddrMode &AM, Type *Ty) const override;
bool isVectorShiftByScalarCheap(Type *Ty) const override;
@@ -771,10 +782,12 @@ namespace llvm {
Type *Ty) const override;
/// Intel processors have a unified instruction and data cache
- const char * getClearCacheBuiltinName() const {
- return 0; // nothing to do, move along.
+ const char * getClearCacheBuiltinName() const override {
+ return nullptr; // nothing to do, move along.
}
+ unsigned getRegisterByName(const char* RegName, EVT VT) const override;
+
/// createFastISel - This method returns a target specific FastISel object,
/// or null if the target does not support "fast" ISel.
FastISel *createFastISel(FunctionLoweringInfo &funcInfo,
@@ -871,8 +884,11 @@ namespace llvm {
SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerVSELECT(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG) const;
+ SDValue InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG) const;
+
SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const;
@@ -908,6 +924,7 @@ namespace llvm {
SDValue LowerINIT_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const;
SDValue
LowerFormalArguments(SDValue Chain,
@@ -936,7 +953,7 @@ namespace llvm {
const SmallVectorImpl<ISD::OutputArg> &Outs,
LLVMContext &Context) const override;
- const uint16_t *getScratchRegisters(CallingConv::ID CC) const override;
+ const MCPhysReg *getScratchRegisters(CallingConv::ID CC) const override;
/// Utility function to emit atomic-load-arith operations (and, or, xor,
/// nand, max, min, umax, umin). It takes the corresponding instruction to
@@ -987,11 +1004,12 @@ namespace llvm {
/// Emit nodes that will be selected as "test Op0,Op0", or something
/// equivalent, for use with the given x86 condition code.
- SDValue EmitTest(SDValue Op0, unsigned X86CC, SelectionDAG &DAG) const;
+ SDValue EmitTest(SDValue Op0, unsigned X86CC, SDLoc dl,
+ SelectionDAG &DAG) const;
/// Emit nodes that will be selected as "cmp Op0,Op1", or something
/// equivalent, for use with the given x86 condition code.
- SDValue EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
+ SDValue EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC, SDLoc dl,
SelectionDAG &DAG) const;
/// Convert a comparison if required by the subtarget.
diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td
index 2c5edf6..37bcc52 100644
--- a/lib/Target/X86/X86InstrAVX512.td
+++ b/lib/Target/X86/X86InstrAVX512.td
@@ -209,12 +209,12 @@ def : Pat<(vinsert256_insert:$ins (v16i32 VR512:$src1),
def VINSERTPSzrr : AVX512AIi8<0x21, MRMSrcReg, (outs VR128X:$dst),
(ins VR128X:$src1, VR128X:$src2, u32u8imm:$src3),
"vinsertps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
- [(set VR128X:$dst, (X86insrtps VR128X:$src1, VR128X:$src2, imm:$src3))]>,
+ [(set VR128X:$dst, (X86insertps VR128X:$src1, VR128X:$src2, imm:$src3))]>,
EVEX_4V;
def VINSERTPSzrm: AVX512AIi8<0x21, MRMSrcMem, (outs VR128X:$dst),
(ins VR128X:$src1, f32mem:$src2, u32u8imm:$src3),
"vinsertps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
- [(set VR128X:$dst, (X86insrtps VR128X:$src1,
+ [(set VR128X:$dst, (X86insertps VR128X:$src1,
(v4f32 (scalar_to_vector (loadf32 addr:$src2))),
imm:$src3))]>, EVEX_4V, EVEX_CD8<32, CD8VT1>;
@@ -621,6 +621,22 @@ defm VPERMT2PS : avx512_perm_3src<0x7F, "vpermt2ps", VR512, memopv16f32, i512me
X86VPermv3, v16f32>, EVEX_V512, EVEX_CD8<32, CD8VF>;
defm VPERMT2PD : avx512_perm_3src<0x7F, "vpermt2pd", VR512, memopv8f64, i512mem,
X86VPermv3, v8f64>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+
+def : Pat<(v16f32 (int_x86_avx512_mask_vpermt_ps_512 (v16i32 VR512:$idx),
+ (v16f32 VR512:$src1), (v16f32 VR512:$src2), (i16 -1))),
+ (VPERMT2PSrr VR512:$src1, VR512:$idx, VR512:$src2)>;
+
+def : Pat<(v16i32 (int_x86_avx512_mask_vpermt_d_512 (v16i32 VR512:$idx),
+ (v16i32 VR512:$src1), (v16i32 VR512:$src2), (i16 -1))),
+ (VPERMT2Drr VR512:$src1, VR512:$idx, VR512:$src2)>;
+
+def : Pat<(v8f64 (int_x86_avx512_mask_vpermt_pd_512 (v8i64 VR512:$idx),
+ (v8f64 VR512:$src1), (v8f64 VR512:$src2), (i8 -1))),
+ (VPERMT2PDrr VR512:$src1, VR512:$idx, VR512:$src2)>;
+
+def : Pat<(v8i64 (int_x86_avx512_mask_vpermt_q_512 (v8i64 VR512:$idx),
+ (v8i64 VR512:$src1), (v8i64 VR512:$src2), (i8 -1))),
+ (VPERMT2Qrr VR512:$src1, VR512:$idx, VR512:$src2)>;
//===----------------------------------------------------------------------===//
// AVX-512 - BLEND using mask
//
@@ -984,6 +1000,10 @@ let Predicates = [HasAVX512] in {
(EXTRACT_SUBREG
(AND32ri (KMOVWrk (COPY_TO_REGCLASS VK1:$src, VK16)), (i32 1)),
sub_16bit)>;
+ def : Pat<(v16i1 (scalar_to_vector VK1:$src)),
+ (COPY_TO_REGCLASS VK1:$src, VK16)>;
+ def : Pat<(v8i1 (scalar_to_vector VK1:$src)),
+ (COPY_TO_REGCLASS VK1:$src, VK8)>;
}
// With AVX-512 only, 8-bit mask is promoted to 16-bit mask.
let Predicates = [HasAVX512] in {
@@ -1356,6 +1376,23 @@ defm VMOVDQU64: avx512_load<0x6F, VR512, VK8WM, i512mem, load,
"vmovdqu64", SSEPackedInt, v8i64>,
XS, VEX_W, EVEX_V512, EVEX_CD8<64, CD8VF>;
+def: Pat<(v16i32 (int_x86_avx512_mask_loadu_d_512 addr:$ptr,
+ (v16i32 immAllZerosV), GR16:$mask)),
+ (VMOVDQU32rmkz (v16i1 (COPY_TO_REGCLASS GR16:$mask, VK16WM)), addr:$ptr)>;
+
+def: Pat<(v8i64 (int_x86_avx512_mask_loadu_q_512 addr:$ptr,
+ (bc_v8i64 (v16i32 immAllZerosV)), GR8:$mask)),
+ (VMOVDQU64rmkz (v8i1 (COPY_TO_REGCLASS GR8:$mask, VK8WM)), addr:$ptr)>;
+
+def: Pat<(int_x86_avx512_mask_storeu_d_512 addr:$ptr, (v16i32 VR512:$src),
+ GR16:$mask),
+ (VMOVDQU32mrk addr:$ptr, (v16i1 (COPY_TO_REGCLASS GR16:$mask, VK16WM)),
+ VR512:$src)>;
+def: Pat<(int_x86_avx512_mask_storeu_q_512 addr:$ptr, (v8i64 VR512:$src),
+ GR8:$mask),
+ (VMOVDQU64mrk addr:$ptr, (v8i1 (COPY_TO_REGCLASS GR8:$mask, VK8WM)),
+ VR512:$src)>;
+
let AddedComplexity = 20 in {
def : Pat<(v8i64 (vselect VK8WM:$mask, (v8i64 VR512:$src),
(bc_v8i64 (v16i32 immAllZerosV)))),
@@ -3112,6 +3149,17 @@ def : Pat<(v8i32 (fp_to_uint (v8f32 VR256X:$src1))),
(EXTRACT_SUBREG (v16i32 (VCVTTPS2UDQZrr
(v16f32 (SUBREG_TO_REG (i32 0), VR256X:$src1, sub_ymm)))), sub_ymm)>;
+def : Pat<(v4i32 (fp_to_uint (v4f32 VR128X:$src1))),
+ (EXTRACT_SUBREG (v16i32 (VCVTTPS2UDQZrr
+ (v16f32 (SUBREG_TO_REG (i32 0), VR128X:$src1, sub_xmm)))), sub_xmm)>;
+
+def : Pat<(v8f32 (uint_to_fp (v8i32 VR256X:$src1))),
+ (EXTRACT_SUBREG (v16f32 (VCVTUDQ2PSZrr
+ (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src1, sub_ymm)))), sub_ymm)>;
+
+def : Pat<(v4f32 (uint_to_fp (v4i32 VR128X:$src1))),
+ (EXTRACT_SUBREG (v16f32 (VCVTUDQ2PSZrr
+ (v16i32 (SUBREG_TO_REG (i32 0), VR128X:$src1, sub_xmm)))), sub_xmm)>;
def : Pat<(v16f32 (int_x86_avx512_mask_cvtdq2ps_512 (v16i32 VR512:$src),
(bc_v16f32 (v16i32 immAllZerosV)), (i16 -1), imm:$rc)),
@@ -3715,7 +3763,7 @@ defm VRNDSCALEPSZ : avx512_rndscale<0x08, "vrndscaleps", f512mem, VR512,
EVEX_CD8<32, CD8VF>;
def : Pat<(v16f32 (int_x86_avx512_mask_rndscale_ps_512 (v16f32 VR512:$src1),
- imm:$src2, (bc_v16f32 (v16i32 immAllZerosV)), (i16 -1),
+ imm:$src2, (v16f32 VR512:$src1), (i16 -1),
FROUND_CURRENT)),
(VRNDSCALEPSZr VR512:$src1, imm:$src2)>;
@@ -3725,7 +3773,7 @@ defm VRNDSCALEPDZ : avx512_rndscale<0x09, "vrndscalepd", f512mem, VR512,
VEX_W, EVEX_CD8<64, CD8VF>;
def : Pat<(v8f64 (int_x86_avx512_mask_rndscale_pd_512 (v8f64 VR512:$src1),
- imm:$src2, (bc_v8f64 (v16i32 immAllZerosV)), (i8 -1),
+ imm:$src2, (v8f64 VR512:$src1), (i8 -1),
FROUND_CURRENT)),
(VRNDSCALEPDZr VR512:$src1, imm:$src2)>;
@@ -3807,7 +3855,13 @@ multiclass avx512_trunc_sat<bits<8> opc, string OpcodeStr,
!strconcat(OpcodeStr," \t{$src, $dst|$dst, $src}"),
[]>, EVEX;
- def krr : AVX512XS8I<opc, MRMDestReg, (outs dstRC:$dst),
+ def rrk : AVX512XS8I<opc, MRMDestReg, (outs dstRC:$dst),
+ (ins KRC:$mask, srcRC:$src),
+ !strconcat(OpcodeStr,
+ " \t{$src, ${dst} {${mask}}|${dst} {${mask}}, $src}"),
+ []>, EVEX, EVEX_K;
+
+ def rrkz : AVX512XS8I<opc, MRMDestReg, (outs dstRC:$dst),
(ins KRC:$mask, srcRC:$src),
!strconcat(OpcodeStr,
" \t{$src, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src}"),
@@ -3816,6 +3870,12 @@ multiclass avx512_trunc_sat<bits<8> opc, string OpcodeStr,
def mr : AVX512XS8I<opc, MRMDestMem, (outs), (ins x86memop:$dst, srcRC:$src),
!strconcat(OpcodeStr, " \t{$src, $dst|$dst, $src}"),
[]>, EVEX;
+
+ def mrk : AVX512XS8I<opc, MRMDestMem, (outs),
+ (ins x86memop:$dst, KRC:$mask, srcRC:$src),
+ !strconcat(OpcodeStr, " \t{$src, $dst {${mask}}|${dst} {${mask}}, $src}"),
+ []>, EVEX, EVEX_K;
+
}
defm VPMOVQB : avx512_trunc_sat<0x32, "vpmovqb", VR128X, VR512, VK8WM,
i128mem>, EVEX_V512, EVEX_CD8<8, CD8VO>;
@@ -3855,60 +3915,86 @@ def : Pat<(v16i8 (X86vtrunc (v16i32 VR512:$src))), (VPMOVDBrr VR512:$src)>;
def : Pat<(v8i32 (X86vtrunc (v8i64 VR512:$src))), (VPMOVQDrr VR512:$src)>;
def : Pat<(v16i8 (X86vtruncm VK16WM:$mask, (v16i32 VR512:$src))),
- (VPMOVDBkrr VK16WM:$mask, VR512:$src)>;
+ (VPMOVDBrrkz VK16WM:$mask, VR512:$src)>;
def : Pat<(v16i16 (X86vtruncm VK16WM:$mask, (v16i32 VR512:$src))),
- (VPMOVDWkrr VK16WM:$mask, VR512:$src)>;
+ (VPMOVDWrrkz VK16WM:$mask, VR512:$src)>;
def : Pat<(v8i16 (X86vtruncm VK8WM:$mask, (v8i64 VR512:$src))),
- (VPMOVQWkrr VK8WM:$mask, VR512:$src)>;
+ (VPMOVQWrrkz VK8WM:$mask, VR512:$src)>;
def : Pat<(v8i32 (X86vtruncm VK8WM:$mask, (v8i64 VR512:$src))),
- (VPMOVQDkrr VK8WM:$mask, VR512:$src)>;
+ (VPMOVQDrrkz VK8WM:$mask, VR512:$src)>;
-multiclass avx512_extend<bits<8> opc, string OpcodeStr, RegisterClass DstRC,
- RegisterClass SrcRC, SDNode OpNode, PatFrag mem_frag,
- X86MemOperand x86memop, ValueType OpVT, ValueType InVT> {
+multiclass avx512_extend<bits<8> opc, string OpcodeStr, RegisterClass KRC,
+ RegisterClass DstRC, RegisterClass SrcRC, SDNode OpNode,
+ PatFrag mem_frag, X86MemOperand x86memop,
+ ValueType OpVT, ValueType InVT> {
def rr : AVX5128I<opc, MRMSrcReg, (outs DstRC:$dst),
(ins SrcRC:$src),
!strconcat(OpcodeStr, " \t{$src, $dst|$dst, $src}"),
[(set DstRC:$dst, (OpVT (OpNode (InVT SrcRC:$src))))]>, EVEX;
- def rm : AVX5128I<opc, MRMSrcMem, (outs DstRC:$dst),
+
+ def rrk : AVX5128I<opc, MRMSrcReg, (outs DstRC:$dst),
+ (ins KRC:$mask, SrcRC:$src),
+ !strconcat(OpcodeStr, " \t{$src, $dst {${mask}} |$dst {${mask}}, $src}"),
+ []>, EVEX, EVEX_K;
+
+ def rrkz : AVX5128I<opc, MRMSrcReg, (outs DstRC:$dst),
+ (ins KRC:$mask, SrcRC:$src),
+ !strconcat(OpcodeStr, " \t{$src, $dst {${mask}} {z}|$dst {${mask}} {z}, $src}"),
+ []>, EVEX, EVEX_KZ;
+
+ let mayLoad = 1 in {
+ def rm : AVX5128I<opc, MRMSrcMem, (outs DstRC:$dst),
(ins x86memop:$src),
!strconcat(OpcodeStr," \t{$src, $dst|$dst, $src}"),
[(set DstRC:$dst,
(OpVT (OpNode (InVT (bitconvert (mem_frag addr:$src))))))]>,
EVEX;
+
+ def rmk : AVX5128I<opc, MRMSrcMem, (outs DstRC:$dst),
+ (ins KRC:$mask, x86memop:$src),
+ !strconcat(OpcodeStr," \t{$src, $dst {${mask}} |$dst {${mask}}, $src}"),
+ []>,
+ EVEX, EVEX_K;
+
+ def rmkz : AVX5128I<opc, MRMSrcMem, (outs DstRC:$dst),
+ (ins KRC:$mask, x86memop:$src),
+ !strconcat(OpcodeStr," \t{$src, $dst {${mask}} {z}|$dst {${mask}} {z}, $src}"),
+ []>,
+ EVEX, EVEX_KZ;
+ }
}
-defm VPMOVZXBDZ: avx512_extend<0x31, "vpmovzxbd", VR512, VR128X, X86vzext,
+defm VPMOVZXBDZ: avx512_extend<0x31, "vpmovzxbd", VK16WM, VR512, VR128X, X86vzext,
memopv2i64, i128mem, v16i32, v16i8>, EVEX_V512,
EVEX_CD8<8, CD8VQ>;
-defm VPMOVZXBQZ: avx512_extend<0x32, "vpmovzxbq", VR512, VR128X, X86vzext,
+defm VPMOVZXBQZ: avx512_extend<0x32, "vpmovzxbq", VK8WM, VR512, VR128X, X86vzext,
memopv2i64, i128mem, v8i64, v16i8>, EVEX_V512,
EVEX_CD8<8, CD8VO>;
-defm VPMOVZXWDZ: avx512_extend<0x33, "vpmovzxwd", VR512, VR256X, X86vzext,
+defm VPMOVZXWDZ: avx512_extend<0x33, "vpmovzxwd", VK16WM, VR512, VR256X, X86vzext,
memopv4i64, i256mem, v16i32, v16i16>, EVEX_V512,
EVEX_CD8<16, CD8VH>;
-defm VPMOVZXWQZ: avx512_extend<0x34, "vpmovzxwq", VR512, VR128X, X86vzext,
+defm VPMOVZXWQZ: avx512_extend<0x34, "vpmovzxwq", VK8WM, VR512, VR128X, X86vzext,
memopv2i64, i128mem, v8i64, v8i16>, EVEX_V512,
EVEX_CD8<16, CD8VQ>;
-defm VPMOVZXDQZ: avx512_extend<0x35, "vpmovzxdq", VR512, VR256X, X86vzext,
+defm VPMOVZXDQZ: avx512_extend<0x35, "vpmovzxdq", VK8WM, VR512, VR256X, X86vzext,
memopv4i64, i256mem, v8i64, v8i32>, EVEX_V512,
EVEX_CD8<32, CD8VH>;
-
-defm VPMOVSXBDZ: avx512_extend<0x21, "vpmovsxbd", VR512, VR128X, X86vsext,
+
+defm VPMOVSXBDZ: avx512_extend<0x21, "vpmovsxbd", VK16WM, VR512, VR128X, X86vsext,
memopv2i64, i128mem, v16i32, v16i8>, EVEX_V512,
EVEX_CD8<8, CD8VQ>;
-defm VPMOVSXBQZ: avx512_extend<0x22, "vpmovsxbq", VR512, VR128X, X86vsext,
+defm VPMOVSXBQZ: avx512_extend<0x22, "vpmovsxbq", VK8WM, VR512, VR128X, X86vsext,
memopv2i64, i128mem, v8i64, v16i8>, EVEX_V512,
EVEX_CD8<8, CD8VO>;
-defm VPMOVSXWDZ: avx512_extend<0x23, "vpmovsxwd", VR512, VR256X, X86vsext,
+defm VPMOVSXWDZ: avx512_extend<0x23, "vpmovsxwd", VK16WM, VR512, VR256X, X86vsext,
memopv4i64, i256mem, v16i32, v16i16>, EVEX_V512,
EVEX_CD8<16, CD8VH>;
-defm VPMOVSXWQZ: avx512_extend<0x24, "vpmovsxwq", VR512, VR128X, X86vsext,
+defm VPMOVSXWQZ: avx512_extend<0x24, "vpmovsxwq", VK8WM, VR512, VR128X, X86vsext,
memopv2i64, i128mem, v8i64, v8i16>, EVEX_V512,
EVEX_CD8<16, CD8VQ>;
-defm VPMOVSXDQZ: avx512_extend<0x25, "vpmovsxdq", VR512, VR256X, X86vsext,
+defm VPMOVSXDQZ: avx512_extend<0x25, "vpmovsxdq", VK8WM, VR512, VR256X, X86vsext,
memopv4i64, i256mem, v8i64, v8i32>, EVEX_V512,
EVEX_CD8<32, CD8VH>;
@@ -3984,6 +4070,62 @@ defm VPSCATTERQQZ : avx512_scatter<0xA1, "vpscatterqq", VK8WM, VR512, vz64mem>,
defm VPSCATTERQDZ : avx512_scatter<0xA1, "vpscatterqd", VK8WM, VR256X, vz64mem>,
EVEX_V512, EVEX_CD8<32, CD8VT1>;
+// prefetch
+multiclass avx512_gather_scatter_prefetch<bits<8> opc, Format F, string OpcodeStr,
+ RegisterClass KRC, X86MemOperand memop> {
+ let Predicates = [HasPFI], hasSideEffects = 1 in
+ def m : AVX5128I<opc, F, (outs), (ins KRC:$mask, memop:$src),
+ !strconcat(OpcodeStr, " \t{$src {${mask}}|{${mask}}, $src}"),
+ []>, EVEX, EVEX_K;
+}
+
+defm VGATHERPF0DPS: avx512_gather_scatter_prefetch<0xC6, MRM1m, "vgatherpf0dps",
+ VK16WM, vz32mem>, EVEX_V512, EVEX_CD8<32, CD8VT1>;
+
+defm VGATHERPF0QPS: avx512_gather_scatter_prefetch<0xC7, MRM1m, "vgatherpf0qps",
+ VK8WM, vz64mem>, EVEX_V512, EVEX_CD8<64, CD8VT1>;
+
+defm VGATHERPF0DPD: avx512_gather_scatter_prefetch<0xC6, MRM1m, "vgatherpf0dpd",
+ VK8WM, vy32mem>, EVEX_V512, VEX_W, EVEX_CD8<32, CD8VT1>;
+
+defm VGATHERPF0QPD: avx512_gather_scatter_prefetch<0xC7, MRM1m, "vgatherpf0qpd",
+ VK8WM, vz64mem>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>;
+
+defm VGATHERPF1DPS: avx512_gather_scatter_prefetch<0xC6, MRM2m, "vgatherpf1dps",
+ VK16WM, vz32mem>, EVEX_V512, EVEX_CD8<32, CD8VT1>;
+
+defm VGATHERPF1QPS: avx512_gather_scatter_prefetch<0xC7, MRM2m, "vgatherpf1qps",
+ VK8WM, vz64mem>, EVEX_V512, EVEX_CD8<64, CD8VT1>;
+
+defm VGATHERPF1DPD: avx512_gather_scatter_prefetch<0xC6, MRM2m, "vgatherpf1dpd",
+ VK8WM, vy32mem>, EVEX_V512, VEX_W, EVEX_CD8<32, CD8VT1>;
+
+defm VGATHERPF1QPD: avx512_gather_scatter_prefetch<0xC7, MRM2m, "vgatherpf1qpd",
+ VK8WM, vz64mem>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>;
+
+defm VSCATTERPF0DPS: avx512_gather_scatter_prefetch<0xC6, MRM5m, "vscatterpf0dps",
+ VK16WM, vz32mem>, EVEX_V512, EVEX_CD8<32, CD8VT1>;
+
+defm VSCATTERPF0QPS: avx512_gather_scatter_prefetch<0xC7, MRM5m, "vscatterpf0qps",
+ VK8WM, vz64mem>, EVEX_V512, EVEX_CD8<64, CD8VT1>;
+
+defm VSCATTERPF0DPD: avx512_gather_scatter_prefetch<0xC6, MRM5m, "vscatterpf0dpd",
+ VK8WM, vy32mem>, EVEX_V512, VEX_W, EVEX_CD8<32, CD8VT1>;
+
+defm VSCATTERPF0QPD: avx512_gather_scatter_prefetch<0xC7, MRM5m, "vscatterpf0qpd",
+ VK8WM, vz64mem>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>;
+
+defm VSCATTERPF1DPS: avx512_gather_scatter_prefetch<0xC6, MRM6m, "vscatterpf1dps",
+ VK16WM, vz32mem>, EVEX_V512, EVEX_CD8<32, CD8VT1>;
+
+defm VSCATTERPF1QPS: avx512_gather_scatter_prefetch<0xC7, MRM6m, "vscatterpf1qps",
+ VK8WM, vz64mem>, EVEX_V512, EVEX_CD8<64, CD8VT1>;
+
+defm VSCATTERPF1DPD: avx512_gather_scatter_prefetch<0xC6, MRM6m, "vscatterpf1dpd",
+ VK8WM, vy32mem>, EVEX_V512, VEX_W, EVEX_CD8<32, CD8VT1>;
+
+defm VSCATTERPF1QPD: avx512_gather_scatter_prefetch<0xC7, MRM6m, "vscatterpf1qpd",
+ VK8WM, vz64mem>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>;
//===----------------------------------------------------------------------===//
// VSHUFPS - VSHUFPD Operations
@@ -4200,3 +4342,19 @@ def : Pat<(int_x86_avx512_mask_conflict_q_512 VR512:$src2, VR512:$src1,
GR8:$mask),
(VPCONFLICTQrrk VR512:$src1,
(v8i1 (COPY_TO_REGCLASS GR8:$mask, VK8WM)), VR512:$src2)>;
+
+def : Pat<(store (i1 -1), addr:$dst), (MOV8mi addr:$dst, (i8 1))>;
+def : Pat<(store (i1 1), addr:$dst), (MOV8mi addr:$dst, (i8 1))>;
+def : Pat<(store (i1 0), addr:$dst), (MOV8mi addr:$dst, (i8 0))>;
+
+def : Pat<(store VK1:$src, addr:$dst),
+ (KMOVWmk addr:$dst, (COPY_TO_REGCLASS VK1:$src, VK16))>;
+
+def truncstorei1 : PatFrag<(ops node:$val, node:$ptr),
+ (truncstore node:$val, node:$ptr), [{
+ return cast<StoreSDNode>(N)->getMemoryVT() == MVT::i1;
+}]>;
+
+def : Pat<(truncstorei1 GR8:$src, addr:$dst),
+ (MOV8mr addr:$dst, GR8:$src)>;
+
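Annotation: the new patterns give i1 stores a concrete memory form: the constant patterns canonicalize both true encodings (i1 -1 and i1 1) to the byte 1, a VK1 mask bit is spilled through a kmov-backed copy, and an i1-truncating GR8 store degenerates to a plain byte store. A scalar sketch of the constant-store canonicalization:

#include <cstdint>
#include <iostream>

// Constant i1 stores occupy a full byte; both "true" encodings
// (all-ones and 1) are stored as the byte value 1.
static uint8_t encodeI1ForStore(int V) {
  return (V != 0) ? 1 : 0;
}

int main() {
  std::cout << int(encodeI1ForStore(-1)) << " "   // 1
            << int(encodeI1ForStore(1))  << " "   // 1
            << int(encodeI1ForStore(0))  << "\n"; // 0
}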
diff --git a/lib/Target/X86/X86InstrBuilder.h b/lib/Target/X86/X86InstrBuilder.h
index aaef4a4..e421f8c 100644
--- a/lib/Target/X86/X86InstrBuilder.h
+++ b/lib/Target/X86/X86InstrBuilder.h
@@ -52,7 +52,8 @@ struct X86AddressMode {
unsigned GVOpFlags;
X86AddressMode()
- : BaseType(RegBase), Scale(1), IndexReg(0), Disp(0), GV(0), GVOpFlags(0) {
+ : BaseType(RegBase), Scale(1), IndexReg(0), Disp(0), GV(nullptr),
+ GVOpFlags(0) {
Base.Reg = 0;
}
diff --git a/lib/Target/X86/X86InstrCompiler.td b/lib/Target/X86/X86InstrCompiler.td
index 401849f..34d8fb9 100644
--- a/lib/Target/X86/X86InstrCompiler.td
+++ b/lib/Target/X86/X86InstrCompiler.td
@@ -1187,9 +1187,9 @@ def or_is_add : PatFrag<(ops node:$lhs, node:$rhs), (or node:$lhs, node:$rhs),[{
return CurDAG->MaskedValueIsZero(N->getOperand(0), CN->getAPIntValue());
APInt KnownZero0, KnownOne0;
- CurDAG->ComputeMaskedBits(N->getOperand(0), KnownZero0, KnownOne0, 0);
+ CurDAG->computeKnownBits(N->getOperand(0), KnownZero0, KnownOne0, 0);
APInt KnownZero1, KnownOne1;
- CurDAG->ComputeMaskedBits(N->getOperand(1), KnownZero1, KnownOne1, 0);
+ CurDAG->computeKnownBits(N->getOperand(1), KnownZero1, KnownOne1, 0);
return (~KnownZero0 & ~KnownZero1) == 0;
}]>;
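
Annotation: the or_is_add fragment accepts an OR as an ADD when the operands share no possibly-one bits; with disjoint bit sets no carry can occur, so the two operations agree. A quick standalone check of the predicate (~KnownZero0 & ~KnownZero1) == 0 at the value level:

#include <cstdint>
#include <iostream>

// If at every bit position at least one operand is known zero,
// OR and ADD produce identical results (no carries can occur).
static bool orActsAsAdd(uint8_t KnownZero0, uint8_t KnownZero1) {
  return uint8_t(~KnownZero0 & ~KnownZero1) == 0;
}

int main() {
  // a in {0..15} (high nibble known zero), b a multiple of 16
  // (low nibble known zero): disjoint bits, so a|b == a+b.
  uint8_t a = 0x0B, b = 0xA0;
  std::cout << orActsAsAdd(/*KnownZero0=*/0xF0, /*KnownZero1=*/0x0F)
            << "\n";                                // 1
  std::cout << ((a | b) == uint8_t(a + b)) << "\n"; // 1
}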
diff --git a/lib/Target/X86/X86InstrFMA.td b/lib/Target/X86/X86InstrFMA.td
index df6c9da..c0a6864 100644
--- a/lib/Target/X86/X86InstrFMA.td
+++ b/lib/Target/X86/X86InstrFMA.td
@@ -19,8 +19,9 @@ let Constraints = "$src1 = $dst" in {
multiclass fma3p_rm<bits<8> opc, string OpcodeStr,
PatFrag MemFrag128, PatFrag MemFrag256,
ValueType OpVT128, ValueType OpVT256,
+ bit IsRVariantCommutable = 0, bit IsMVariantCommutable = 0,
SDPatternOperator Op = null_frag> {
- let usesCustomInserter = 1 in
+ let usesCustomInserter = 1, isCommutable = IsRVariantCommutable in
def r : FMA3<opc, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2, VR128:$src3),
!strconcat(OpcodeStr,
@@ -28,7 +29,7 @@ multiclass fma3p_rm<bits<8> opc, string OpcodeStr,
[(set VR128:$dst, (OpVT128 (Op VR128:$src2,
VR128:$src1, VR128:$src3)))]>;
- let mayLoad = 1 in
+ let mayLoad = 1, isCommutable = IsMVariantCommutable in
def m : FMA3<opc, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2, f128mem:$src3),
!strconcat(OpcodeStr,
@@ -36,7 +37,7 @@ multiclass fma3p_rm<bits<8> opc, string OpcodeStr,
[(set VR128:$dst, (OpVT128 (Op VR128:$src2, VR128:$src1,
(MemFrag128 addr:$src3))))]>;
- let usesCustomInserter = 1 in
+ let usesCustomInserter = 1, isCommutable = IsRVariantCommutable in
def rY : FMA3<opc, MRMSrcReg, (outs VR256:$dst),
(ins VR256:$src1, VR256:$src2, VR256:$src3),
!strconcat(OpcodeStr,
@@ -44,7 +45,7 @@ multiclass fma3p_rm<bits<8> opc, string OpcodeStr,
[(set VR256:$dst, (OpVT256 (Op VR256:$src2, VR256:$src1,
VR256:$src3)))]>, VEX_L;
- let mayLoad = 1 in
+ let mayLoad = 1, isCommutable = IsMVariantCommutable in
def mY : FMA3<opc, MRMSrcMem, (outs VR256:$dst),
(ins VR256:$src1, VR256:$src2, f256mem:$src3),
!strconcat(OpcodeStr,
@@ -59,18 +60,27 @@ multiclass fma3p_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231,
string OpcodeStr, string PackTy,
PatFrag MemFrag128, PatFrag MemFrag256,
SDNode Op, ValueType OpTy128, ValueType OpTy256> {
- let isCommutable = 1 in
+ // For 213, both the register and memory variants are commutable.
+ // Indeed, the commutable operands are 1 and 2 and both live in registers
+ // for both variants.
defm r213 : fma3p_rm<opc213,
!strconcat(OpcodeStr, "213", PackTy),
- MemFrag128, MemFrag256, OpTy128, OpTy256, Op>;
+ MemFrag128, MemFrag256, OpTy128, OpTy256,
+ /* IsRVariantCommutable */ 1,
+ /* IsMVariantCommutable */ 1,
+ Op>;
let neverHasSideEffects = 1 in {
defm r132 : fma3p_rm<opc132,
!strconcat(OpcodeStr, "132", PackTy),
MemFrag128, MemFrag256, OpTy128, OpTy256>;
- let isCommutable = 1 in
+ // For 231, only the register variant is commutable.
+ // For the memory variant, the folded operand must be operand 3. Thus,
+ // in that case, it cannot be swapped with operand 2.
defm r231 : fma3p_rm<opc231,
!strconcat(OpcodeStr, "231", PackTy),
- MemFrag128, MemFrag256, OpTy128, OpTy256>;
+ MemFrag128, MemFrag256, OpTy128, OpTy256,
+ /* IsRVariantCommutable */ 1,
+ /* IsMVariantCommutable */ 0>;
} // neverHasSideEffects = 1
}
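
Annotation: the commutability split above rests on one algebraic fact: an FMA's two multiplicands can swap freely while the addend cannot, so a variant may only be marked commutable when both exchangeable operands stay in registers (a folded memory operand is pinned to slot 3). A minimal numeric check of that fact using std::fma:

#include <cmath>
#include <iostream>

int main() {
  double a = 1.5, b = -2.25, c = 0.125;
  // Multiplicands commute: fma(a, b, c) == fma(b, a, c).
  std::cout << (std::fma(a, b, c) == std::fma(b, a, c)) << "\n"; // 1
  // The addend does not: fma(a, b, c) != fma(a, c, b) in general.
  std::cout << (std::fma(a, b, c) == std::fma(a, c, b)) << "\n"; // 0
}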
@@ -119,8 +129,9 @@ let ExeDomain = SSEPackedDouble in {
let Constraints = "$src1 = $dst" in {
multiclass fma3s_rm<bits<8> opc, string OpcodeStr, X86MemOperand x86memop,
RegisterClass RC, ValueType OpVT, PatFrag mem_frag,
+ bit IsRVariantCommutable = 0, bit IsMVariantCommutable = 0,
SDPatternOperator OpNode = null_frag> {
- let usesCustomInserter = 1 in
+ let usesCustomInserter = 1, isCommutable = IsRVariantCommutable in
def r : FMA3<opc, MRMSrcReg, (outs RC:$dst),
(ins RC:$src1, RC:$src2, RC:$src3),
!strconcat(OpcodeStr,
@@ -128,7 +139,7 @@ multiclass fma3s_rm<bits<8> opc, string OpcodeStr, X86MemOperand x86memop,
[(set RC:$dst,
(OpVT (OpNode RC:$src2, RC:$src1, RC:$src3)))]>;
- let mayLoad = 1 in
+ let mayLoad = 1, isCommutable = IsMVariantCommutable in
def m : FMA3<opc, MRMSrcMem, (outs RC:$dst),
(ins RC:$src1, RC:$src2, x86memop:$src3),
!strconcat(OpcodeStr,
@@ -147,14 +158,21 @@ multiclass fma3s_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231,
let neverHasSideEffects = 1 in {
defm r132 : fma3s_rm<opc132, !strconcat(OpStr, "132", PackTy),
x86memop, RC, OpVT, mem_frag>;
- let isCommutable = 1 in
+ // See the other defm of r231 for the explanation regarding the
+ // commutable flags.
defm r231 : fma3s_rm<opc231, !strconcat(OpStr, "231", PackTy),
- x86memop, RC, OpVT, mem_frag>;
+ x86memop, RC, OpVT, mem_frag,
+ /* IsRVariantCommutable */ 1,
+ /* IsMVariantCommutable */ 0>;
}
-let isCommutable = 1 in
+// See the other defm of r213 for the explanation regarding the
+// commutable flags.
defm r213 : fma3s_rm<opc213, !strconcat(OpStr, "213", PackTy),
- x86memop, RC, OpVT, mem_frag, OpNode>;
+ x86memop, RC, OpVT, mem_frag,
+ /* IsRVariantCommutable */ 1,
+ /* IsMVariantCommutable */ 1,
+ OpNode>;
}
multiclass fma3s<bits<8> opc132, bits<8> opc213, bits<8> opc231,
diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td
index 486e5a9..1582f43 100644
--- a/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ b/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -81,7 +81,7 @@ def X86pinsrb : SDNode<"X86ISD::PINSRB",
def X86pinsrw : SDNode<"X86ISD::PINSRW",
SDTypeProfile<1, 3, [SDTCisVT<0, v8i16>, SDTCisSameAs<0,1>,
SDTCisVT<2, i32>, SDTCisPtrTy<3>]>>;
-def X86insrtps : SDNode<"X86ISD::INSERTPS",
+def X86insertps : SDNode<"X86ISD::INSERTPS",
SDTypeProfile<1, 3, [SDTCisVT<0, v4f32>, SDTCisSameAs<0,1>,
SDTCisVT<2, v4f32>, SDTCisPtrTy<3>]>>;
def X86vzmovl : SDNode<"X86ISD::VZEXT_MOVL",
@@ -175,6 +175,9 @@ def X86select : SDNode<"X86ISD::SELECT" , SDTSelect>;
def X86pmuludq : SDNode<"X86ISD::PMULUDQ",
SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>,
SDTCisSameAs<1,2>]>>;
+def X86pmuldq : SDNode<"X86ISD::PMULDQ",
+ SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>,
+ SDTCisSameAs<1,2>]>>;
// Specific shuffle nodes - At some point ISD::VECTOR_SHUFFLE will always get
// translated into one of the target nodes below during lowering.
diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp
index 6450f2a..6993577 100644
--- a/lib/Target/X86/X86InstrInfo.cpp
+++ b/lib/Target/X86/X86InstrInfo.cpp
@@ -36,11 +36,13 @@
#include "llvm/Target/TargetOptions.h"
#include <limits>
+using namespace llvm;
+
+#define DEBUG_TYPE "x86-instr-info"
+
#define GET_INSTRINFO_CTOR_DTOR
#include "X86GenInstrInfo.inc"
-using namespace llvm;
-
static cl::opt<bool>
NoFusing("disable-spill-fusing",
cl::desc("Disable fusing of spill code into instructions"));
@@ -1511,12 +1513,14 @@ X86InstrInfo::isCoalescableExtInstr(const MachineInstr &MI,
/// operand and follow operands form a reference to the stack frame.
bool X86InstrInfo::isFrameOperand(const MachineInstr *MI, unsigned int Op,
int &FrameIndex) const {
- if (MI->getOperand(Op).isFI() && MI->getOperand(Op+1).isImm() &&
- MI->getOperand(Op+2).isReg() && MI->getOperand(Op+3).isImm() &&
- MI->getOperand(Op+1).getImm() == 1 &&
- MI->getOperand(Op+2).getReg() == 0 &&
- MI->getOperand(Op+3).getImm() == 0) {
- FrameIndex = MI->getOperand(Op).getIndex();
+ if (MI->getOperand(Op+X86::AddrBaseReg).isFI() &&
+ MI->getOperand(Op+X86::AddrScaleAmt).isImm() &&
+ MI->getOperand(Op+X86::AddrIndexReg).isReg() &&
+ MI->getOperand(Op+X86::AddrDisp).isImm() &&
+ MI->getOperand(Op+X86::AddrScaleAmt).getImm() == 1 &&
+ MI->getOperand(Op+X86::AddrIndexReg).getReg() == 0 &&
+ MI->getOperand(Op+X86::AddrDisp).getImm() == 0) {
+ FrameIndex = MI->getOperand(Op+X86::AddrBaseReg).getIndex();
return true;
}
return false;
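
Annotation: the isFrameOperand rewrite replaces magic operand offsets with the symbolic X86::Addr* indices into the standard five-operand x86 memory reference. A sketch of that layout; the index values are stated as an assumption matching X86BaseInfo.h:

#include <iostream>

// The five machine operands of an x86 memory reference,
// [BaseReg + ScaleAmt * IndexReg + Disp], plus the segment register.
// Values mirror llvm::X86::AddrBaseReg et al. (assumed layout).
enum X86AddrOperand {
  AddrBaseReg     = 0,
  AddrScaleAmt    = 1,
  AddrIndexReg    = 2,
  AddrDisp        = 3,
  AddrSegmentReg  = 4,
  AddrNumOperands = 5
};

int main() {
  // A frame reference looks like: fi#N + 1*%noreg + 0, so the checks in
  // isFrameOperand test ScaleAmt == 1, IndexReg == 0 and Disp == 0.
  std::cout << "operands per memory reference: " << AddrNumOperands << "\n";
}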
@@ -1680,15 +1684,16 @@ X86InstrInfo::isReallyTriviallyReMaterializable(const MachineInstr *MI,
case X86::FsMOVAPSrm:
case X86::FsMOVAPDrm: {
// Loads from constant pools are trivially rematerializable.
- if (MI->getOperand(1).isReg() &&
- MI->getOperand(2).isImm() &&
- MI->getOperand(3).isReg() && MI->getOperand(3).getReg() == 0 &&
+ if (MI->getOperand(1+X86::AddrBaseReg).isReg() &&
+ MI->getOperand(1+X86::AddrScaleAmt).isImm() &&
+ MI->getOperand(1+X86::AddrIndexReg).isReg() &&
+ MI->getOperand(1+X86::AddrIndexReg).getReg() == 0 &&
MI->isInvariantLoad(AA)) {
- unsigned BaseReg = MI->getOperand(1).getReg();
+ unsigned BaseReg = MI->getOperand(1+X86::AddrBaseReg).getReg();
if (BaseReg == 0 || BaseReg == X86::RIP)
return true;
// Allow re-materialization of PIC load.
- if (!ReMatPICStubLoad && MI->getOperand(4).isGlobal())
+ if (!ReMatPICStubLoad && MI->getOperand(1+X86::AddrDisp).isGlobal())
return false;
const MachineFunction &MF = *MI->getParent()->getParent();
const MachineRegisterInfo &MRI = MF.getRegInfo();
@@ -1699,13 +1704,14 @@ X86InstrInfo::isReallyTriviallyReMaterializable(const MachineInstr *MI,
case X86::LEA32r:
case X86::LEA64r: {
- if (MI->getOperand(2).isImm() &&
- MI->getOperand(3).isReg() && MI->getOperand(3).getReg() == 0 &&
- !MI->getOperand(4).isReg()) {
+ if (MI->getOperand(1+X86::AddrScaleAmt).isImm() &&
+ MI->getOperand(1+X86::AddrIndexReg).isReg() &&
+ MI->getOperand(1+X86::AddrIndexReg).getReg() == 0 &&
+ !MI->getOperand(1+X86::AddrDisp).isReg()) {
// lea fi#, lea GV, etc. are all rematerializable.
- if (!MI->getOperand(1).isReg())
+ if (!MI->getOperand(1+X86::AddrBaseReg).isReg())
return true;
- unsigned BaseReg = MI->getOperand(1).getReg();
+ unsigned BaseReg = MI->getOperand(1+X86::AddrBaseReg).getReg();
if (BaseReg == 0)
return true;
// Allow re-materialization of lea PICBase + x.
@@ -1722,12 +1728,8 @@ X86InstrInfo::isReallyTriviallyReMaterializable(const MachineInstr *MI,
return true;
}
-/// isSafeToClobberEFLAGS - Return true if it's safe insert an instruction that
-/// would clobber the EFLAGS condition register. Note the result may be
-/// conservative. If it cannot definitely determine the safety after visiting
-/// a few instructions in each direction it assumes it's not safe.
-static bool isSafeToClobberEFLAGS(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I) {
+bool X86InstrInfo::isSafeToClobberEFLAGS(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I) const {
MachineBasicBlock::iterator E = MBB.end();
// For compile time consideration, if we are not able to determine the
@@ -1998,7 +2000,7 @@ X86InstrInfo::convertToThreeAddressWithLEA(unsigned MIOpc,
unsigned Src2 = MI->getOperand(2).getReg();
bool isKill2 = MI->getOperand(2).isKill();
unsigned leaInReg2 = 0;
- MachineInstr *InsMI2 = 0;
+ MachineInstr *InsMI2 = nullptr;
if (Src == Src2) {
// ADD16rr %reg1028<kill>, %reg1028
// just a single insert_subreg.
@@ -2062,14 +2064,14 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
// convert them to equivalent lea if the condition code register def's
// are dead!
if (hasLiveCondCodeDef(MI))
- return 0;
+ return nullptr;
MachineFunction &MF = *MI->getParent()->getParent();
// All instructions input are two-addr instructions. Get the known operands.
const MachineOperand &Dest = MI->getOperand(0);
const MachineOperand &Src = MI->getOperand(1);
- MachineInstr *NewMI = NULL;
+ MachineInstr *NewMI = nullptr;
// FIXME: 16-bit LEA's are really slow on Athlons, but not bad on P4's. When
// we have better subtarget support, enable the 16-bit LEA generation here.
// 16-bit LEA is also slow on Core2.
@@ -2080,11 +2082,11 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
switch (MIOpc) {
case X86::SHUFPSrri: {
assert(MI->getNumOperands() == 4 && "Unknown shufps instruction!");
- if (!TM.getSubtarget<X86Subtarget>().hasSSE2()) return 0;
+ if (!TM.getSubtarget<X86Subtarget>().hasSSE2()) return nullptr;
unsigned B = MI->getOperand(1).getReg();
unsigned C = MI->getOperand(2).getReg();
- if (B != C) return 0;
+ if (B != C) return nullptr;
unsigned M = MI->getOperand(3).getImm();
NewMI = BuildMI(MF, MI->getDebugLoc(), get(X86::PSHUFDri))
.addOperand(Dest).addOperand(Src).addImm(M);
@@ -2092,11 +2094,11 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
}
case X86::SHUFPDrri: {
assert(MI->getNumOperands() == 4 && "Unknown shufpd instruction!");
- if (!TM.getSubtarget<X86Subtarget>().hasSSE2()) return 0;
+ if (!TM.getSubtarget<X86Subtarget>().hasSSE2()) return nullptr;
unsigned B = MI->getOperand(1).getReg();
unsigned C = MI->getOperand(2).getReg();
- if (B != C) return 0;
+ if (B != C) return nullptr;
unsigned M = MI->getOperand(3).getImm();
// Convert to PSHUFD mask.
@@ -2109,13 +2111,13 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
case X86::SHL64ri: {
assert(MI->getNumOperands() >= 3 && "Unknown shift instruction!");
unsigned ShAmt = getTruncatedShiftCount(MI, 2);
- if (!isTruncatedShiftCountForLEA(ShAmt)) return 0;
+ if (!isTruncatedShiftCountForLEA(ShAmt)) return nullptr;
// LEA can't handle RSP.
if (TargetRegisterInfo::isVirtualRegister(Src.getReg()) &&
!MF.getRegInfo().constrainRegClass(Src.getReg(),
&X86::GR64_NOSPRegClass))
- return 0;
+ return nullptr;
NewMI = BuildMI(MF, MI->getDebugLoc(), get(X86::LEA64r))
.addOperand(Dest)
@@ -2125,7 +2127,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
case X86::SHL32ri: {
assert(MI->getNumOperands() >= 3 && "Unknown shift instruction!");
unsigned ShAmt = getTruncatedShiftCount(MI, 2);
- if (!isTruncatedShiftCountForLEA(ShAmt)) return 0;
+ if (!isTruncatedShiftCountForLEA(ShAmt)) return nullptr;
unsigned Opc = is64Bit ? X86::LEA64_32r : X86::LEA32r;
@@ -2135,7 +2137,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ false,
SrcReg, isKill, isUndef, ImplicitOp))
- return 0;
+ return nullptr;
MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(), get(Opc))
.addOperand(Dest)
@@ -2151,10 +2153,10 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
case X86::SHL16ri: {
assert(MI->getNumOperands() >= 3 && "Unknown shift instruction!");
unsigned ShAmt = getTruncatedShiftCount(MI, 2);
- if (!isTruncatedShiftCountForLEA(ShAmt)) return 0;
+ if (!isTruncatedShiftCountForLEA(ShAmt)) return nullptr;
if (DisableLEA16)
- return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV) : 0;
+      return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV)
+                       : nullptr;
NewMI = BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r))
.addOperand(Dest)
.addReg(0).addImm(1 << ShAmt).addOperand(Src).addImm(0).addReg(0);
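The shift cases above replace a flag-clobbering shift with an LEA whose scale encodes the shift amount; a hedged before/after sketch with hypothetical registers (only shift amounts 1-3 qualify, since LEA scales are limited to 1, 2, 4, and 8, which is what isTruncatedShiftCountForLEA enforces):

```cpp
// The rewrite performed for SHL32ri with ShAmt == 3:
//   shll $3, %eax          ; two-address, defines EFLAGS
//   leal (,%eax,8), %ecx   ; three-address, EFLAGS untouched
unsigned shl3(unsigned x) { return x << 3; } // eligible: scale 1 << 3 == 8
```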
@@ -2163,7 +2165,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
default: {
switch (MIOpc) {
- default: return 0;
+ default: return nullptr;
case X86::INC64r:
case X86::INC32r:
case X86::INC64_32r: {
@@ -2175,7 +2177,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ false,
SrcReg, isKill, isUndef, ImplicitOp))
- return 0;
+ return nullptr;
MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(), get(Opc))
.addOperand(Dest)
@@ -2189,7 +2191,8 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
case X86::INC16r:
case X86::INC64_16r:
if (DisableLEA16)
- return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV) : 0;
+ return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV)
+ : nullptr;
assert(MI->getNumOperands() >= 2 && "Unknown inc instruction!");
NewMI = addOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r))
.addOperand(Dest).addOperand(Src), 1);
@@ -2206,7 +2209,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ false,
SrcReg, isKill, isUndef, ImplicitOp))
- return 0;
+ return nullptr;
MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(), get(Opc))
.addOperand(Dest)
@@ -2221,7 +2224,8 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
case X86::DEC16r:
case X86::DEC64_16r:
if (DisableLEA16)
- return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV) : 0;
+ return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV)
+ : nullptr;
assert(MI->getNumOperands() >= 2 && "Unknown dec instruction!");
NewMI = addOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r))
.addOperand(Dest).addOperand(Src), -1);
@@ -2242,7 +2246,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ true,
SrcReg, isKill, isUndef, ImplicitOp))
- return 0;
+ return nullptr;
const MachineOperand &Src2 = MI->getOperand(2);
bool isKill2, isUndef2;
@@ -2250,7 +2254,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
MachineOperand ImplicitOp2 = MachineOperand::CreateReg(0, false);
if (!classifyLEAReg(MI, Src2, Opc, /*AllowSP=*/ false,
SrcReg2, isKill2, isUndef2, ImplicitOp2))
- return 0;
+ return nullptr;
MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(), get(Opc))
.addOperand(Dest);
@@ -2272,7 +2276,8 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
case X86::ADD16rr:
case X86::ADD16rr_DB: {
if (DisableLEA16)
- return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV) : 0;
+ return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV)
+ : nullptr;
assert(MI->getNumOperands() >= 3 && "Unknown add instruction!");
unsigned Src2 = MI->getOperand(2).getReg();
bool isKill2 = MI->getOperand(2).isKill();
@@ -2311,7 +2316,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ true,
SrcReg, isKill, isUndef, ImplicitOp))
- return 0;
+ return nullptr;
MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(), get(Opc))
.addOperand(Dest)
@@ -2327,7 +2332,8 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
case X86::ADD16ri_DB:
case X86::ADD16ri8_DB:
if (DisableLEA16)
- return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV) : 0;
+ return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV)
+ : nullptr;
assert(MI->getNumOperands() >= 3 && "Unknown add instruction!");
NewMI = addOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r))
.addOperand(Dest).addOperand(Src),
@@ -2337,7 +2343,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
}
}
- if (!NewMI) return 0;
+ if (!NewMI) return nullptr;
if (LV) { // Update live variables
if (Src.isKill())
@@ -2789,11 +2795,11 @@ bool X86InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
std::next(I)->eraseFromParent();
Cond.clear();
- FBB = 0;
+ FBB = nullptr;
// Delete the JMP if it's equivalent to a fall-through.
if (MBB.isLayoutSuccessor(I->getOperand(0).getMBB())) {
- TBB = 0;
+ TBB = nullptr;
I->eraseFromParent();
I = MBB.end();
UnCondBrIter = MBB.end();
@@ -3549,6 +3555,26 @@ inline static bool isDefConvertible(MachineInstr *MI) {
}
}
+/// isUseDefConvertible - Check whether the use can be converted
+/// to remove a comparison against zero.
+static X86::CondCode isUseDefConvertible(MachineInstr *MI) {
+ switch (MI->getOpcode()) {
+ default: return X86::COND_INVALID;
+ case X86::LZCNT16rr: case X86::LZCNT16rm:
+ case X86::LZCNT32rr: case X86::LZCNT32rm:
+ case X86::LZCNT64rr: case X86::LZCNT64rm:
+ return X86::COND_B;
+ case X86::POPCNT16rr:case X86::POPCNT16rm:
+ case X86::POPCNT32rr:case X86::POPCNT32rm:
+ case X86::POPCNT64rr:case X86::POPCNT64rm:
+ return X86::COND_E;
+ case X86::TZCNT16rr: case X86::TZCNT16rm:
+ case X86::TZCNT32rr: case X86::TZCNT32rm:
+ case X86::TZCNT64rr: case X86::TZCNT64rm:
+ return X86::COND_B;
+ }
+}
+
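The condition codes returned above come straight from the flag behavior of these instructions; a simplified model, keeping only the one flag each conversion relies on (the remaining flags are irrelevant to this transform):

```cpp
#include <cstdint>

// popcnt: ZF = (src == 0) -> test with COND_E
// lzcnt:  CF = (src == 0) -> test with COND_B
// tzcnt:  CF = (src == 0) -> test with COND_B
bool popcntZF(uint64_t src) { return src == 0; } // ZF after popcnt
bool lzcntCF(uint64_t src)  { return src == 0; } // CF after lzcnt
bool tzcntCF(uint64_t src)  { return src == 0; } // CF after tzcnt
```

So a later compare of the same source against zero can be deleted, and its users retargeted to ZF (COND_E) or CF (COND_B).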
/// optimizeCompareInstr - Check if there exists an earlier instruction that
/// operates on the same source operands and sets flags in the same way as
/// Compare; remove Compare if possible.
@@ -3615,13 +3641,38 @@ optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2,
// If we are comparing against zero, check whether we can use MI to update
// EFLAGS. If MI is not in the same BB as CmpInstr, do not optimize.
bool IsCmpZero = (SrcReg2 == 0 && CmpValue == 0);
- if (IsCmpZero && (MI->getParent() != CmpInstr->getParent() ||
- !isDefConvertible(MI)))
+ if (IsCmpZero && MI->getParent() != CmpInstr->getParent())
return false;
+ // If we have a use of the source register between the def and our compare
+ // instruction we can eliminate the compare iff the use sets EFLAGS in the
+ // right way.
+ bool ShouldUpdateCC = false;
+ X86::CondCode NewCC = X86::COND_INVALID;
+ if (IsCmpZero && !isDefConvertible(MI)) {
+    // Scan forward from the def until we hit the use we're looking for or the
+    // compare instruction.
+ for (MachineBasicBlock::iterator J = MI;; ++J) {
+ // Do we have a convertible instruction?
+ NewCC = isUseDefConvertible(J);
+ if (NewCC != X86::COND_INVALID && J->getOperand(1).isReg() &&
+ J->getOperand(1).getReg() == SrcReg) {
+ assert(J->definesRegister(X86::EFLAGS) && "Must be an EFLAGS def!");
+ ShouldUpdateCC = true; // Update CC later on.
+ // This is not a def of SrcReg, but still a def of EFLAGS. Keep going
+ // with the new def.
+ MI = Def = J;
+ break;
+ }
+
+ if (J == I)
+ return false;
+ }
+ }
+
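A source-level sketch of the shape this scan recognizes (hypothetical example; the guard below is the compare the pass deletes, because popcnt already left ZF = (x == 0)):

```cpp
#include <cstdint>

int countOrFlag(uint64_t x) {
  int n = __builtin_popcountll(x); // popcnt: leaves ZF = (x == 0)
  if (x == 0)                      // test/je pair made redundant above
    return -1;
  return n;
}
```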
// We are searching for an earlier instruction that can make CmpInstr
// redundant and that instruction will be saved in Sub.
- MachineInstr *Sub = NULL;
+ MachineInstr *Sub = nullptr;
const TargetRegisterInfo *TRI = &getRegisterInfo();
// We iterate backward, starting from the instruction before CmpInstr and
@@ -3634,7 +3685,7 @@ optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2,
RE = CmpInstr->getParent() == MI->getParent() ?
MachineBasicBlock::reverse_iterator(++Def) /* points to MI */ :
CmpInstr->getParent()->rend();
- MachineInstr *Movr0Inst = 0;
+ MachineInstr *Movr0Inst = nullptr;
for (; RI != RE; ++RI) {
MachineInstr *Instr = &*RI;
// Check whether CmpInstr can be made redundant by the current instruction.
@@ -3716,13 +3767,28 @@ optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2,
// CF and OF are used, we can't perform this optimization.
return false;
}
+
+ // If we're updating the condition code check if we have to reverse the
+ // condition.
+ if (ShouldUpdateCC)
+ switch (OldCC) {
+ default:
+ return false;
+ case X86::COND_E:
+ break;
+ case X86::COND_NE:
+ NewCC = GetOppositeBranchCondition(NewCC);
+ break;
+ }
} else if (IsSwapped) {
// If we have SUB(r1, r2) and CMP(r2, r1), the condition code needs
// to be changed from r2 > r1 to r1 < r2, from r2 < r1 to r1 > r2, etc.
// We swap the condition code and synthesize the new opcode.
- X86::CondCode NewCC = getSwappedCondition(OldCC);
+ NewCC = getSwappedCondition(OldCC);
if (NewCC == X86::COND_INVALID) return false;
+ }
+ if ((ShouldUpdateCC || IsSwapped) && NewCC != OldCC) {
// Synthesize the new opcode.
bool HasMemoryOperand = Instr.hasOneMemOperand();
unsigned NewOpc;
@@ -3809,19 +3875,19 @@ optimizeLoadInstr(MachineInstr *MI, const MachineRegisterInfo *MRI,
unsigned &FoldAsLoadDefReg,
MachineInstr *&DefMI) const {
if (FoldAsLoadDefReg == 0)
- return 0;
+ return nullptr;
// To be conservative, if there exists another load, clear the load candidate.
if (MI->mayLoad()) {
FoldAsLoadDefReg = 0;
- return 0;
+ return nullptr;
}
// Check whether we can move DefMI here.
DefMI = MRI->getVRegDef(FoldAsLoadDefReg);
assert(DefMI);
bool SawStore = false;
- if (!DefMI->isSafeToMove(this, 0, SawStore))
- return 0;
+ if (!DefMI->isSafeToMove(this, nullptr, SawStore))
+ return nullptr;
// We try to commute MI if possible.
unsigned IdxEnd = (MI->isCommutable()) ? 2 : 1;
@@ -3838,12 +3904,12 @@ optimizeLoadInstr(MachineInstr *MI, const MachineRegisterInfo *MRI,
continue;
// Do not fold if we have a subreg use or a def or multiple uses.
if (MO.getSubReg() || MO.isDef() || FoundSrcOperand)
- return 0;
+ return nullptr;
SrcOperandId = i;
FoundSrcOperand = true;
}
- if (!FoundSrcOperand) return 0;
+ if (!FoundSrcOperand) return nullptr;
// Check whether we can fold the def into SrcOperandId.
SmallVector<unsigned, 8> Ops;
@@ -3857,22 +3923,22 @@ optimizeLoadInstr(MachineInstr *MI, const MachineRegisterInfo *MRI,
if (Idx == 1) {
// MI was changed but it didn't help, commute it back!
commuteInstruction(MI, false);
- return 0;
+ return nullptr;
}
// Check whether we can commute MI and enable folding.
if (MI->isCommutable()) {
MachineInstr *NewMI = commuteInstruction(MI, false);
// Unable to commute.
- if (!NewMI) return 0;
+ if (!NewMI) return nullptr;
if (NewMI != MI) {
// New instruction. It doesn't need to be kept.
NewMI->eraseFromParent();
- return 0;
+ return nullptr;
}
}
}
- return 0;
+ return nullptr;
}
/// Expand2AddrUndef - Expand a single-def pseudo instruction to a two-addr
@@ -4007,7 +4073,8 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
MachineInstr *MI, unsigned i,
const SmallVectorImpl<MachineOperand> &MOs,
unsigned Size, unsigned Align) const {
- const DenseMap<unsigned, std::pair<unsigned,unsigned> > *OpcodeTablePtr = 0;
+ const DenseMap<unsigned,
+ std::pair<unsigned,unsigned> > *OpcodeTablePtr = nullptr;
bool isCallRegIndirect = TM.getSubtarget<X86Subtarget>().callRegIndirect();
bool isTwoAddrFold = false;
@@ -4015,7 +4082,7 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
// when X86Subtarget is Atom.
if (isCallRegIndirect &&
(MI->getOpcode() == X86::CALL32r || MI->getOpcode() == X86::CALL64r)) {
- return NULL;
+ return nullptr;
}
unsigned NumOps = MI->getDesc().getNumOperands();
@@ -4026,9 +4093,9 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
// X86II::MO_GOT_ABSOLUTE_ADDRESS after folding.
if (MI->getOpcode() == X86::ADD32ri &&
MI->getOperand(2).getTargetFlags() == X86II::MO_GOT_ABSOLUTE_ADDRESS)
- return NULL;
+ return nullptr;
- MachineInstr *NewMI = NULL;
+ MachineInstr *NewMI = nullptr;
// Folding a memory location into the two-address part of a two-address
// instruction is different than folding it other places. It requires
// replacing the *two* registers with the memory location.
@@ -4063,7 +4130,7 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
unsigned Opcode = I->second.first;
unsigned MinAlign = (I->second.second & TB_ALIGN_MASK) >> TB_ALIGN_SHIFT;
if (Align < MinAlign)
- return NULL;
+ return nullptr;
bool NarrowToMOV32rm = false;
if (Size) {
unsigned RCSize = getRegClass(MI->getDesc(), i, &RI, MF)->getSize();
@@ -4071,12 +4138,12 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
// Check if it's safe to fold the load. If the size of the object is
// narrower than the load width, then it's not.
if (Opcode != X86::MOV64rm || RCSize != 8 || Size != 4)
- return NULL;
+ return nullptr;
// If this is a 64-bit load, but the spill slot is 32, then we can do
// a 32-bit load which is implicitly zero-extended. This likely is due
// to liveintervalanalysis remat'ing a load from stack slot.
if (MI->getOperand(0).getSubReg() || MI->getOperand(1).getSubReg())
- return NULL;
+ return nullptr;
Opcode = X86::MOV32rm;
NarrowToMOV32rm = true;
}
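The narrowing case relies on x86-64's implicit zero-extension of 32-bit register writes; a small illustration (the 4-byte slot cannot legally be read with an 8-byte load, but a 32-bit load fills the full 64-bit register for free):

```cpp
#include <cstdint>

// Reloading a 32-bit spill slot for a 64-bit use:
//   movl (%rdi), %eax   ; upper half of %rax is cleared automatically
uint64_t reload32(const uint32_t *slot) { return *slot; }
```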
@@ -4105,7 +4172,7 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
// No fusion
if (PrintFailedFusing && !MI->isCopy())
dbgs() << "We failed to fuse operand " << i << " in " << *MI;
- return NULL;
+ return nullptr;
}
/// hasPartialRegUpdate - Return true for all instructions that only update
@@ -4270,14 +4337,14 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI,
const SmallVectorImpl<unsigned> &Ops,
int FrameIndex) const {
// Check switch flag
- if (NoFusing) return NULL;
+ if (NoFusing) return nullptr;
// Unless optimizing for size, don't fold to avoid partial
// register update stalls
if (!MF.getFunction()->getAttributes().
hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize) &&
hasPartialRegUpdate(MI->getOpcode()))
- return 0;
+ return nullptr;
const MachineFrameInfo *MFI = MF.getFrameInfo();
unsigned Size = MFI->getObjectSize(FrameIndex);
@@ -4290,7 +4357,7 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI,
unsigned NewOpc = 0;
unsigned RCSize = 0;
switch (MI->getOpcode()) {
- default: return NULL;
+ default: return nullptr;
case X86::TEST8rr: NewOpc = X86::CMP8ri; RCSize = 1; break;
case X86::TEST16rr: NewOpc = X86::CMP16ri8; RCSize = 2; break;
case X86::TEST32rr: NewOpc = X86::CMP32ri8; RCSize = 4; break;
@@ -4299,12 +4366,12 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI,
// Check if it's safe to fold the load. If the size of the object is
// narrower than the load width, then it's not.
if (Size < RCSize)
- return NULL;
+ return nullptr;
// Change to CMPXXri r, 0 first.
MI->setDesc(get(NewOpc));
MI->getOperand(1).ChangeToImmediate(0);
} else if (Ops.size() != 1)
- return NULL;
+ return nullptr;
SmallVector<MachineOperand,4> MOs;
MOs.push_back(MachineOperand::CreateFI(FrameIndex));
@@ -4322,14 +4389,14 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
return foldMemoryOperandImpl(MF, MI, Ops, FrameIndex);
// Check switch flag
- if (NoFusing) return NULL;
+ if (NoFusing) return nullptr;
// Unless optimizing for size, don't fold to avoid partial
// register update stalls
if (!MF.getFunction()->getAttributes().
hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize) &&
hasPartialRegUpdate(MI->getOpcode()))
- return 0;
+ return nullptr;
// Determine the alignment of the load.
unsigned Alignment = 0;
@@ -4352,12 +4419,12 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
Alignment = 4;
break;
default:
- return 0;
+ return nullptr;
}
if (Ops.size() == 2 && Ops[0] == 0 && Ops[1] == 1) {
unsigned NewOpc = 0;
switch (MI->getOpcode()) {
- default: return NULL;
+ default: return nullptr;
case X86::TEST8rr: NewOpc = X86::CMP8ri; break;
case X86::TEST16rr: NewOpc = X86::CMP16ri8; break;
case X86::TEST32rr: NewOpc = X86::CMP32ri8; break;
@@ -4367,12 +4434,12 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
MI->setDesc(get(NewOpc));
MI->getOperand(1).ChangeToImmediate(0);
} else if (Ops.size() != 1)
- return NULL;
+ return nullptr;
// Make sure the subregisters match.
// Otherwise we risk changing the size of the load.
if (LoadMI->getOperand(0).getSubReg() != MI->getOperand(Ops[0]).getSubReg())
- return NULL;
+ return nullptr;
SmallVector<MachineOperand,X86::AddrNumOperands> MOs;
switch (LoadMI->getOpcode()) {
@@ -4388,7 +4455,7 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
// Medium and large mode can't fold loads this way.
if (TM.getCodeModel() != CodeModel::Small &&
TM.getCodeModel() != CodeModel::Kernel)
- return NULL;
+ return nullptr;
// x86-32 PIC requires a PIC base register for constant pools.
unsigned PICBase = 0;
@@ -4400,7 +4467,7 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
// This doesn't work for several reasons.
// 1. GlobalBaseReg may have been spilled.
// 2. It may not be live at MI.
- return NULL;
+ return nullptr;
}
// Create a constant-pool entry.
@@ -4436,14 +4503,14 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
> 4)
// These instructions only load 32 bits, we can't fold them if the
// destination register is wider than 32 bits (4 bytes).
- return NULL;
+ return nullptr;
if ((LoadMI->getOpcode() == X86::MOVSDrm ||
LoadMI->getOpcode() == X86::VMOVSDrm) &&
MF.getRegInfo().getRegClass(LoadMI->getOperand(0).getReg())->getSize()
> 8)
// These instructions only load 64 bits, we can't fold them if the
// destination register is wider than 64 bits (8 bytes).
- return NULL;
+ return nullptr;
// Folding a normal load. Just copy the load's address operands.
for (unsigned i = NumOps - X86::AddrNumOperands; i != NumOps; ++i)
@@ -4489,7 +4556,8 @@ bool X86InstrInfo::canFoldMemoryOperand(const MachineInstr *MI,
// Folding a memory location into the two-address part of a two-address
// instruction is different than folding it other places. It requires
// replacing the *two* registers with the memory location.
- const DenseMap<unsigned, std::pair<unsigned,unsigned> > *OpcodeTablePtr = 0;
+ const DenseMap<unsigned,
+ std::pair<unsigned,unsigned> > *OpcodeTablePtr = nullptr;
if (isTwoAddr && NumOps >= 2 && OpNum < 2) {
OpcodeTablePtr = &RegOp2MemOpTable2Addr;
} else if (OpNum == 0) { // If operand 0
@@ -4671,7 +4739,7 @@ X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N,
AddrOps.push_back(Chain);
// Emit the load instruction.
- SDNode *Load = 0;
+ SDNode *Load = nullptr;
if (FoldedLoad) {
EVT VT = *RC->vt_begin();
std::pair<MachineInstr::mmo_iterator,
@@ -4696,7 +4764,7 @@ X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N,
// Emit the data processing instruction.
std::vector<EVT> VTs;
- const TargetRegisterClass *DstRC = 0;
+ const TargetRegisterClass *DstRC = nullptr;
if (MCID.getNumDefs() > 0) {
DstRC = getRegClass(MCID, 0, &RI, MF);
VTs.push_back(*DstRC->vt_begin());
@@ -5190,14 +5258,14 @@ static const uint16_t *lookup(unsigned opcode, unsigned domain) {
for (unsigned i = 0, e = array_lengthof(ReplaceableInstrs); i != e; ++i)
if (ReplaceableInstrs[i][domain-1] == opcode)
return ReplaceableInstrs[i];
- return 0;
+ return nullptr;
}
static const uint16_t *lookupAVX2(unsigned opcode, unsigned domain) {
for (unsigned i = 0, e = array_lengthof(ReplaceableInstrsAVX2); i != e; ++i)
if (ReplaceableInstrsAVX2[i][domain-1] == opcode)
return ReplaceableInstrsAVX2[i];
- return 0;
+ return nullptr;
}
std::pair<uint16_t, uint16_t>
@@ -5327,8 +5395,10 @@ namespace {
const X86TargetMachine *TM =
static_cast<const X86TargetMachine *>(&MF.getTarget());
- assert(!TM->getSubtarget<X86Subtarget>().is64Bit() &&
- "X86-64 PIC uses RIP relative addressing");
+ // Don't do anything if this is 64-bit as 64-bit PIC
+ // uses RIP relative addressing.
+ if (TM->getSubtarget<X86Subtarget>().is64Bit())
+ return false;
// Only emit a global base reg in PIC mode.
if (TM->getRelocationModel() != Reloc::PIC_)
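The early return reflects how the two PIC models address globals; a hedged codegen sketch (illustrative instruction sequences, not taken from this patch):

```cpp
// 32-bit PIC must materialize a GOT base register, which is what CGBR emits:
//   call .L0
// .L0:
//   popl %ebx
//   addl $_GLOBAL_OFFSET_TABLE_+(.-.L0), %ebx
//
// 64-bit PIC reaches globals RIP-relatively, so no base register is needed:
//   movq foo@GOTPCREL(%rip), %rax
extern int someGlobal;              // hypothetical global
int load() { return someGlobal; }   // x86-64 PIC: one RIP-relative access
```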
@@ -5383,7 +5453,7 @@ namespace {
char CGBR::ID = 0;
FunctionPass*
-llvm::createGlobalBaseRegPass() { return new CGBR(); }
+llvm::createX86GlobalBaseRegPass() { return new CGBR(); }
namespace {
struct LDTLSCleanup : public MachineFunctionPass {
diff --git a/lib/Target/X86/X86InstrInfo.h b/lib/Target/X86/X86InstrInfo.h
index 156291e..5f34915 100644
--- a/lib/Target/X86/X86InstrInfo.h
+++ b/lib/Target/X86/X86InstrInfo.h
@@ -325,7 +325,7 @@ public:
/// value.
unsigned getOpcodeAfterMemoryUnfold(unsigned Opc,
bool UnfoldLoad, bool UnfoldStore,
- unsigned *LoadRegIndex = 0) const override;
+ unsigned *LoadRegIndex = nullptr) const override;
/// areLoadsFromSameBasePtr - This is used by the pre-regalloc scheduler
/// to determine if two loads are loading from the same base address. It
@@ -359,6 +359,13 @@ public:
/// instruction that defines the specified register class.
bool isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const override;
+  /// isSafeToClobberEFLAGS - Return true if it's safe to insert an instruction
+  /// that would clobber the EFLAGS condition register. Note the result may be
+ /// conservative. If it cannot definitely determine the safety after visiting
+ /// a few instructions in each direction it assumes it's not safe.
+ bool isSafeToClobberEFLAGS(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I) const;
+
static bool isX86_64ExtendedReg(const MachineOperand &MO) {
if (!MO.isReg()) return false;
return X86II::isX86_64ExtendedReg(MO.getReg());
diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td
index 8edf873..0d97669 100644
--- a/lib/Target/X86/X86InstrInfo.td
+++ b/lib/Target/X86/X86InstrInfo.td
@@ -206,6 +206,8 @@ def X86rep_movs: SDNode<"X86ISD::REP_MOVS", SDTX86RepStr,
def X86rdtsc : SDNode<"X86ISD::RDTSC_DAG", SDTX86Void,
[SDNPHasChain, SDNPOutGlue, SDNPSideEffect]>;
+def X86rdtscp : SDNode<"X86ISD::RDTSCP_DAG", SDTX86Void,
+ [SDNPHasChain, SDNPOutGlue, SDNPSideEffect]>;
def X86Wrapper : SDNode<"X86ISD::Wrapper", SDTX86Wrapper>;
def X86WrapperRIP : SDNode<"X86ISD::WrapperRIP", SDTX86Wrapper>;
@@ -249,7 +251,6 @@ def X86xor_flag : SDNode<"X86ISD::XOR", SDTBinaryArithWithFlags,
def X86and_flag : SDNode<"X86ISD::AND", SDTBinaryArithWithFlags,
[SDNPCommutative]>;
-def X86bzhi : SDNode<"X86ISD::BZHI", SDTIntShiftOp>;
def X86bextr : SDNode<"X86ISD::BEXTR", SDTIntBinOp>;
def X86mul_imm : SDNode<"X86ISD::MUL_IMM", SDTIntBinOp>;
@@ -2001,6 +2002,46 @@ let Predicates = [HasLZCNT], Defs = [EFLAGS] in {
(implicit EFLAGS)]>, XS;
}
+let Predicates = [HasLZCNT] in {
+ def : Pat<(X86cmov (ctlz GR16:$src), (i16 16), (X86_COND_E),
+ (X86cmp GR16:$src, (i16 0))),
+ (LZCNT16rr GR16:$src)>;
+ def : Pat<(X86cmov (ctlz GR32:$src), (i32 32), (X86_COND_E),
+ (X86cmp GR32:$src, (i32 0))),
+ (LZCNT32rr GR32:$src)>;
+ def : Pat<(X86cmov (ctlz GR64:$src), (i64 64), (X86_COND_E),
+ (X86cmp GR64:$src, (i64 0))),
+ (LZCNT64rr GR64:$src)>;
+ def : Pat<(X86cmov (i16 16), (ctlz GR16:$src), (X86_COND_E),
+ (X86cmp GR16:$src, (i16 0))),
+ (LZCNT16rr GR16:$src)>;
+ def : Pat<(X86cmov (i32 32), (ctlz GR32:$src), (X86_COND_E),
+ (X86cmp GR32:$src, (i32 0))),
+ (LZCNT32rr GR32:$src)>;
+ def : Pat<(X86cmov (i64 64), (ctlz GR64:$src), (X86_COND_E),
+ (X86cmp GR64:$src, (i64 0))),
+ (LZCNT64rr GR64:$src)>;
+
+ def : Pat<(X86cmov (ctlz (loadi16 addr:$src)), (i16 16), (X86_COND_E),
+ (X86cmp (loadi16 addr:$src), (i16 0))),
+ (LZCNT16rm addr:$src)>;
+ def : Pat<(X86cmov (ctlz (loadi32 addr:$src)), (i32 32), (X86_COND_E),
+ (X86cmp (loadi32 addr:$src), (i32 0))),
+ (LZCNT32rm addr:$src)>;
+ def : Pat<(X86cmov (ctlz (loadi64 addr:$src)), (i64 64), (X86_COND_E),
+ (X86cmp (loadi64 addr:$src), (i64 0))),
+ (LZCNT64rm addr:$src)>;
+ def : Pat<(X86cmov (i16 16), (ctlz (loadi16 addr:$src)), (X86_COND_E),
+ (X86cmp (loadi16 addr:$src), (i16 0))),
+ (LZCNT16rm addr:$src)>;
+ def : Pat<(X86cmov (i32 32), (ctlz (loadi32 addr:$src)), (X86_COND_E),
+ (X86cmp (loadi32 addr:$src), (i32 0))),
+ (LZCNT32rm addr:$src)>;
+ def : Pat<(X86cmov (i64 64), (ctlz (loadi64 addr:$src)), (X86_COND_E),
+ (X86cmp (loadi64 addr:$src), (i64 0))),
+ (LZCNT64rm addr:$src)>;
+}
+
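These patterns fold away the zero-guard that C code needs around `ctlz`; a sketch of the idiom they match (with LZCNT available the select collapses into the instruction itself, since `lzcnt` of zero is defined as the operand width):

```cpp
// Folds to a single lzcntl when compiled with -mlzcnt.
unsigned safeClz32(unsigned x) {
  return x == 0 ? 32u : (unsigned)__builtin_clz(x);
}
```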
//===----------------------------------------------------------------------===//
// BMI Instructions
//
@@ -2077,6 +2118,47 @@ let Predicates = [HasBMI] in {
(BLSI64rr GR64:$src)>;
}
+let Predicates = [HasBMI] in {
+ def : Pat<(X86cmov (cttz GR16:$src), (i16 16), (X86_COND_E),
+ (X86cmp GR16:$src, (i16 0))),
+ (TZCNT16rr GR16:$src)>;
+ def : Pat<(X86cmov (cttz GR32:$src), (i32 32), (X86_COND_E),
+ (X86cmp GR32:$src, (i32 0))),
+ (TZCNT32rr GR32:$src)>;
+ def : Pat<(X86cmov (cttz GR64:$src), (i64 64), (X86_COND_E),
+ (X86cmp GR64:$src, (i64 0))),
+ (TZCNT64rr GR64:$src)>;
+ def : Pat<(X86cmov (i16 16), (cttz GR16:$src), (X86_COND_E),
+ (X86cmp GR16:$src, (i16 0))),
+ (TZCNT16rr GR16:$src)>;
+ def : Pat<(X86cmov (i32 32), (cttz GR32:$src), (X86_COND_E),
+ (X86cmp GR32:$src, (i32 0))),
+ (TZCNT32rr GR32:$src)>;
+ def : Pat<(X86cmov (i64 64), (cttz GR64:$src), (X86_COND_E),
+ (X86cmp GR64:$src, (i64 0))),
+ (TZCNT64rr GR64:$src)>;
+
+ def : Pat<(X86cmov (cttz (loadi16 addr:$src)), (i16 16), (X86_COND_E),
+ (X86cmp (loadi16 addr:$src), (i16 0))),
+ (TZCNT16rm addr:$src)>;
+ def : Pat<(X86cmov (cttz (loadi32 addr:$src)), (i32 32), (X86_COND_E),
+ (X86cmp (loadi32 addr:$src), (i32 0))),
+ (TZCNT32rm addr:$src)>;
+ def : Pat<(X86cmov (cttz (loadi64 addr:$src)), (i64 64), (X86_COND_E),
+ (X86cmp (loadi64 addr:$src), (i64 0))),
+ (TZCNT64rm addr:$src)>;
+ def : Pat<(X86cmov (i16 16), (cttz (loadi16 addr:$src)), (X86_COND_E),
+ (X86cmp (loadi16 addr:$src), (i16 0))),
+ (TZCNT16rm addr:$src)>;
+ def : Pat<(X86cmov (i32 32), (cttz (loadi32 addr:$src)), (X86_COND_E),
+ (X86cmp (loadi32 addr:$src), (i32 0))),
+ (TZCNT32rm addr:$src)>;
+ def : Pat<(X86cmov (i64 64), (cttz (loadi64 addr:$src)), (X86_COND_E),
+ (X86cmp (loadi64 addr:$src), (i64 0))),
+ (TZCNT64rm addr:$src)>;
+}
+
+
multiclass bmi_bextr_bzhi<bits<8> opc, string mnemonic, RegisterClass RC,
X86MemOperand x86memop, Intrinsic Int,
PatFrag ld_frag> {
@@ -2104,18 +2186,38 @@ let Predicates = [HasBMI2], Defs = [EFLAGS] in {
int_x86_bmi_bzhi_64, loadi64>, VEX_W;
}
-def : Pat<(X86bzhi GR32:$src1, GR8:$src2),
- (BZHI32rr GR32:$src1,
- (INSERT_SUBREG (i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
-def : Pat<(X86bzhi (loadi32 addr:$src1), GR8:$src2),
- (BZHI32rm addr:$src1,
- (INSERT_SUBREG (i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
-def : Pat<(X86bzhi GR64:$src1, GR8:$src2),
- (BZHI64rr GR64:$src1,
- (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
-def : Pat<(X86bzhi (loadi64 addr:$src1), GR8:$src2),
- (BZHI64rm addr:$src1,
- (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+
+def CountTrailingOnes : SDNodeXForm<imm, [{
+ // Count the trailing ones in the immediate.
+ return getI8Imm(CountTrailingOnes_64(N->getZExtValue()));
+}]>;
+
+def BZHIMask : ImmLeaf<i64, [{
+ return isMask_64(Imm) && (CountTrailingOnes_64(Imm) > 32);
+}]>;
+
+let Predicates = [HasBMI2] in {
+ def : Pat<(and GR64:$src, BZHIMask:$mask),
+ (BZHI64rr GR64:$src,
+ (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
+ (MOV8ri (CountTrailingOnes imm:$mask)), sub_8bit))>;
+
+ def : Pat<(and GR32:$src, (add (shl 1, GR8:$lz), -1)),
+ (BZHI32rr GR32:$src,
+ (INSERT_SUBREG (i32 (IMPLICIT_DEF)), GR8:$lz, sub_8bit))>;
+
+ def : Pat<(and (loadi32 addr:$src), (add (shl 1, GR8:$lz), -1)),
+ (BZHI32rm addr:$src,
+ (INSERT_SUBREG (i32 (IMPLICIT_DEF)), GR8:$lz, sub_8bit))>;
+
+ def : Pat<(and GR64:$src, (add (shl 1, GR8:$lz), -1)),
+ (BZHI64rr GR64:$src,
+ (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR8:$lz, sub_8bit))>;
+
+ def : Pat<(and (loadi64 addr:$src), (add (shl 1, GR8:$lz), -1)),
+ (BZHI64rm addr:$src,
+ (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR8:$lz, sub_8bit))>;
+} // HasBMI2
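`BZHIMask` accepts a constant of the form 2^k - 1 with k > 32 (narrower masks are covered by the variable `(1 << n) - 1` patterns alongside it); a hedged sketch of the predicate under a hypothetical name:

```cpp
#include <cstdint>

// Contiguous low-bit mask wider than 32 bits, so the trailing-ones count
// fits bzhi's index operand.
bool isBZHI64Mask(uint64_t imm) {
  bool isLowMask = imm != 0 && ((imm + 1) & imm) == 0; // 2^k - 1 form
  return isLowMask && __builtin_popcountll(imm) > 32;  // k > 32
}
```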
let Predicates = [HasBMI] in {
def : Pat<(X86bextr GR32:$src1, GR32:$src2),
@@ -2617,21 +2719,21 @@ def : InstAlias<"fnstsw" , (FNSTSW16r)>;
// lcall and ljmp aliases. This seems to be an odd mapping in 64-bit mode, but
// this is compatible with what GAS does.
-def : InstAlias<"lcall $seg, $off", (FARCALL32i i32imm:$off, i16imm:$seg)>, Requires<[Not16BitMode]>;
-def : InstAlias<"ljmp $seg, $off", (FARJMP32i i32imm:$off, i16imm:$seg)>, Requires<[Not16BitMode]>;
-def : InstAlias<"lcall *$dst", (FARCALL32m opaque48mem:$dst)>, Requires<[Not16BitMode]>;
-def : InstAlias<"ljmp *$dst", (FARJMP32m opaque48mem:$dst)>, Requires<[Not16BitMode]>;
-def : InstAlias<"lcall $seg, $off", (FARCALL16i i16imm:$off, i16imm:$seg)>, Requires<[In16BitMode]>;
-def : InstAlias<"ljmp $seg, $off", (FARJMP16i i16imm:$off, i16imm:$seg)>, Requires<[In16BitMode]>;
-def : InstAlias<"lcall *$dst", (FARCALL16m opaque32mem:$dst)>, Requires<[In16BitMode]>;
-def : InstAlias<"ljmp *$dst", (FARJMP16m opaque32mem:$dst)>, Requires<[In16BitMode]>;
-
-def : InstAlias<"call *$dst", (CALL64m i16mem:$dst)>, Requires<[In64BitMode]>;
-def : InstAlias<"jmp *$dst", (JMP64m i16mem:$dst)>, Requires<[In64BitMode]>;
-def : InstAlias<"call *$dst", (CALL32m i16mem:$dst)>, Requires<[In32BitMode]>;
-def : InstAlias<"jmp *$dst", (JMP32m i16mem:$dst)>, Requires<[In32BitMode]>;
-def : InstAlias<"call *$dst", (CALL16m i16mem:$dst)>, Requires<[In16BitMode]>;
-def : InstAlias<"jmp *$dst", (JMP16m i16mem:$dst)>, Requires<[In16BitMode]>;
+def : InstAlias<"lcall $seg, $off", (FARCALL32i i32imm:$off, i16imm:$seg), 0>, Requires<[Not16BitMode]>;
+def : InstAlias<"ljmp $seg, $off", (FARJMP32i i32imm:$off, i16imm:$seg), 0>, Requires<[Not16BitMode]>;
+def : InstAlias<"lcall *$dst", (FARCALL32m opaque48mem:$dst), 0>, Requires<[Not16BitMode]>;
+def : InstAlias<"ljmp *$dst", (FARJMP32m opaque48mem:$dst), 0>, Requires<[Not16BitMode]>;
+def : InstAlias<"lcall $seg, $off", (FARCALL16i i16imm:$off, i16imm:$seg), 0>, Requires<[In16BitMode]>;
+def : InstAlias<"ljmp $seg, $off", (FARJMP16i i16imm:$off, i16imm:$seg), 0>, Requires<[In16BitMode]>;
+def : InstAlias<"lcall *$dst", (FARCALL16m opaque32mem:$dst), 0>, Requires<[In16BitMode]>;
+def : InstAlias<"ljmp *$dst", (FARJMP16m opaque32mem:$dst), 0>, Requires<[In16BitMode]>;
+
+def : InstAlias<"call *$dst", (CALL64m i16mem:$dst), 0>, Requires<[In64BitMode]>;
+def : InstAlias<"jmp *$dst", (JMP64m i16mem:$dst), 0>, Requires<[In64BitMode]>;
+def : InstAlias<"call *$dst", (CALL32m i16mem:$dst), 0>, Requires<[In32BitMode]>;
+def : InstAlias<"jmp *$dst", (JMP32m i16mem:$dst), 0>, Requires<[In32BitMode]>;
+def : InstAlias<"call *$dst", (CALL16m i16mem:$dst), 0>, Requires<[In16BitMode]>;
+def : InstAlias<"jmp *$dst", (JMP16m i16mem:$dst), 0>, Requires<[In16BitMode]>;
// "imul <imm>, B" is an alias for "imul <imm>, B, B".
@@ -2664,11 +2766,11 @@ def : InstAlias<"jmpl $seg, $off", (FARJMP32i i32imm:$off, i16imm:$seg)>;
// Force mov without a suffix with a segment and mem to prefer the 'l' form of
// the move. All segment/mem forms are equivalent, this has the shortest
// encoding.
-def : InstAlias<"mov $mem, $seg", (MOV32sm SEGMENT_REG:$seg, i32mem:$mem)>;
-def : InstAlias<"mov $seg, $mem", (MOV32ms i32mem:$mem, SEGMENT_REG:$seg)>;
+def : InstAlias<"mov $mem, $seg", (MOV32sm SEGMENT_REG:$seg, i32mem:$mem), 0>;
+def : InstAlias<"mov $seg, $mem", (MOV32ms i32mem:$mem, SEGMENT_REG:$seg), 0>;
// Match 'movq <largeimm>, <reg>' as an alias for movabsq.
-def : InstAlias<"movq $imm, $reg", (MOV64ri GR64:$reg, i64imm:$imm)>;
+def : InstAlias<"movq $imm, $reg", (MOV64ri GR64:$reg, i64imm:$imm), 0>;
// Match 'movq GR64, MMX' as an alias for movd.
def : InstAlias<"movq $src, $dst",
@@ -2705,7 +2807,7 @@ def : InstAlias<"outl\t$port", (OUT32ir i8imm:$port), 0>;
// 'sldt <mem>' can be encoded with either sldtw or sldtq with the same
// effect (both store to a 16-bit mem). Force to sldtw to avoid ambiguity
// errors, since its encoding is the most compact.
-def : InstAlias<"sldt $mem", (SLDT16m i16mem:$mem)>;
+def : InstAlias<"sldt $mem", (SLDT16m i16mem:$mem), 0>;
// shld/shrd op,op -> shld op, op, CL
def : InstAlias<"shld{w}\t{$r2, $r1|$r1, $r2}", (SHLD16rrCL GR16:$r1, GR16:$r2), 0>;
@@ -2751,19 +2853,29 @@ defm : ShiftRotateByOneAlias<"ror", "ROR">;
FIXME */
// test: We accept "testX <reg>, <mem>" and "testX <mem>, <reg>" as synonyms.
-def : InstAlias<"test{b}\t{$val, $mem|$mem, $val}", (TEST8rm GR8 :$val, i8mem :$mem)>;
-def : InstAlias<"test{w}\t{$val, $mem|$mem, $val}", (TEST16rm GR16:$val, i16mem:$mem)>;
-def : InstAlias<"test{l}\t{$val, $mem|$mem, $val}", (TEST32rm GR32:$val, i32mem:$mem)>;
-def : InstAlias<"test{q}\t{$val, $mem|$mem, $val}", (TEST64rm GR64:$val, i64mem:$mem)>;
+def : InstAlias<"test{b}\t{$val, $mem|$mem, $val}",
+ (TEST8rm GR8 :$val, i8mem :$mem), 0>;
+def : InstAlias<"test{w}\t{$val, $mem|$mem, $val}",
+ (TEST16rm GR16:$val, i16mem:$mem), 0>;
+def : InstAlias<"test{l}\t{$val, $mem|$mem, $val}",
+ (TEST32rm GR32:$val, i32mem:$mem), 0>;
+def : InstAlias<"test{q}\t{$val, $mem|$mem, $val}",
+ (TEST64rm GR64:$val, i64mem:$mem), 0>;
// xchg: We accept "xchgX <reg>, <mem>" and "xchgX <mem>, <reg>" as synonyms.
-def : InstAlias<"xchg{b}\t{$mem, $val|$val, $mem}", (XCHG8rm GR8 :$val, i8mem :$mem)>;
-def : InstAlias<"xchg{w}\t{$mem, $val|$val, $mem}", (XCHG16rm GR16:$val, i16mem:$mem)>;
-def : InstAlias<"xchg{l}\t{$mem, $val|$val, $mem}", (XCHG32rm GR32:$val, i32mem:$mem)>;
-def : InstAlias<"xchg{q}\t{$mem, $val|$val, $mem}", (XCHG64rm GR64:$val, i64mem:$mem)>;
+def : InstAlias<"xchg{b}\t{$mem, $val|$val, $mem}",
+ (XCHG8rm GR8 :$val, i8mem :$mem), 0>;
+def : InstAlias<"xchg{w}\t{$mem, $val|$val, $mem}",
+ (XCHG16rm GR16:$val, i16mem:$mem), 0>;
+def : InstAlias<"xchg{l}\t{$mem, $val|$val, $mem}",
+ (XCHG32rm GR32:$val, i32mem:$mem), 0>;
+def : InstAlias<"xchg{q}\t{$mem, $val|$val, $mem}",
+ (XCHG64rm GR64:$val, i64mem:$mem), 0>;
// xchg: We accept "xchgX <reg>, %eax" and "xchgX %eax, <reg>" as synonyms.
-def : InstAlias<"xchg{w}\t{%ax, $src|$src, ax}", (XCHG16ar GR16:$src)>;
-def : InstAlias<"xchg{l}\t{%eax, $src|$src, eax}", (XCHG32ar GR32:$src)>, Requires<[Not64BitMode]>;
-def : InstAlias<"xchg{l}\t{%eax, $src|$src, eax}", (XCHG32ar64 GR32_NOAX:$src)>, Requires<[In64BitMode]>;
-def : InstAlias<"xchg{q}\t{%rax, $src|$src, rax}", (XCHG64ar GR64:$src)>;
+def : InstAlias<"xchg{w}\t{%ax, $src|$src, ax}", (XCHG16ar GR16:$src), 0>;
+def : InstAlias<"xchg{l}\t{%eax, $src|$src, eax}",
+ (XCHG32ar GR32:$src), 0>, Requires<[Not64BitMode]>;
+def : InstAlias<"xchg{l}\t{%eax, $src|$src, eax}",
+ (XCHG32ar64 GR32_NOAX:$src), 0>, Requires<[In64BitMode]>;
+def : InstAlias<"xchg{q}\t{%rax, $src|$src, rax}", (XCHG64ar GR64:$src), 0>;
diff --git a/lib/Target/X86/X86InstrMMX.td b/lib/Target/X86/X86InstrMMX.td
index 050ee39..ecf80a1 100644
--- a/lib/Target/X86/X86InstrMMX.td
+++ b/lib/Target/X86/X86InstrMMX.td
@@ -254,6 +254,11 @@ let neverHasSideEffects = 1 in
def MMX_MOVQ64rr : MMXI<0x6F, MRMSrcReg, (outs VR64:$dst), (ins VR64:$src),
"movq\t{$src, $dst|$dst, $src}", [],
IIC_MMX_MOVQ_RR>;
+let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in {
+def MMX_MOVQ64rr_REV : MMXI<0x7F, MRMDestReg, (outs VR64:$dst), (ins VR64:$src),
+ "movq\t{$src, $dst|$dst, $src}", [],
+ IIC_MMX_MOVQ_RR>;
+}
} // SchedRW
let SchedRW = [WriteLoad] in {
@@ -262,11 +267,12 @@ def MMX_MOVQ64rm : MMXI<0x6F, MRMSrcMem, (outs VR64:$dst), (ins i64mem:$src),
"movq\t{$src, $dst|$dst, $src}",
[(set VR64:$dst, (load_mmx addr:$src))],
IIC_MMX_MOVQ_RM>;
+} // SchedRW
+let SchedRW = [WriteStore] in
def MMX_MOVQ64mr : MMXI<0x7F, MRMDestMem, (outs), (ins i64mem:$dst, VR64:$src),
"movq\t{$src, $dst|$dst, $src}",
[(store (x86mmx VR64:$src), addr:$dst)],
IIC_MMX_MOVQ_RM>;
-} // SchedRW
let SchedRW = [WriteMove] in {
def MMX_MOVDQ2Qrr : MMXSDIi8<0xD6, MRMSrcReg, (outs VR64:$dst),
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index f2f3967..1eb0485 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -1561,9 +1561,9 @@ defm VCVTSI2SD64 : sse12_vcvt_avx<0x2A, GR64, FR64, i64mem, "cvtsi2sd{q}">,
let Predicates = [UseAVX] in {
def : InstAlias<"vcvtsi2ss\t{$src, $src1, $dst|$dst, $src1, $src}",
- (VCVTSI2SSrm FR64:$dst, FR64:$src1, i32mem:$src)>;
+ (VCVTSI2SSrm FR64:$dst, FR64:$src1, i32mem:$src), 0>;
def : InstAlias<"vcvtsi2sd\t{$src, $src1, $dst|$dst, $src1, $src}",
- (VCVTSI2SDrm FR64:$dst, FR64:$src1, i32mem:$src)>;
+ (VCVTSI2SDrm FR64:$dst, FR64:$src1, i32mem:$src), 0>;
def : Pat<(f32 (sint_to_fp (loadi32 addr:$src))),
(VCVTSI2SSrm (f32 (IMPLICIT_DEF)), addr:$src)>;
@@ -1627,9 +1627,9 @@ def : InstAlias<"cvttsd2si{q}\t{$src, $dst|$dst, $src}",
(CVTTSD2SI64rm GR64:$dst, f64mem:$src), 0>;
def : InstAlias<"cvtsi2ss\t{$src, $dst|$dst, $src}",
- (CVTSI2SSrm FR64:$dst, i32mem:$src)>;
+ (CVTSI2SSrm FR64:$dst, i32mem:$src), 0>;
def : InstAlias<"cvtsi2sd\t{$src, $dst|$dst, $src}",
- (CVTSI2SDrm FR64:$dst, i32mem:$src)>;
+ (CVTSI2SDrm FR64:$dst, i32mem:$src), 0>;
// Conversion Instructions Intrinsics - Match intrinsics which expect MM
// and/or XMM operand(s).
@@ -2005,7 +2005,7 @@ def VCVTPD2DQrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
// XMM only
def : InstAlias<"vcvtpd2dqx\t{$src, $dst|$dst, $src}",
- (VCVTPD2DQrr VR128:$dst, VR128:$src)>;
+ (VCVTPD2DQrr VR128:$dst, VR128:$src), 0>;
def VCVTPD2DQXrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
"vcvtpd2dqx\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
@@ -2024,7 +2024,7 @@ def VCVTPD2DQYrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
(int_x86_avx_cvt_pd2dq_256 (loadv4f64 addr:$src)))]>,
VEX, VEX_L, Sched<[WriteCvtF2ILd]>;
def : InstAlias<"vcvtpd2dq\t{$src, $dst|$dst, $src}",
- (VCVTPD2DQYrr VR128:$dst, VR256:$src)>;
+ (VCVTPD2DQYrr VR128:$dst, VR256:$src), 0>;
}
def CVTPD2DQrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
@@ -2127,7 +2127,7 @@ def VCVTTPD2DQrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
// XMM only
def : InstAlias<"vcvttpd2dqx\t{$src, $dst|$dst, $src}",
- (VCVTTPD2DQrr VR128:$dst, VR128:$src)>;
+ (VCVTTPD2DQrr VR128:$dst, VR128:$src), 0>;
def VCVTTPD2DQXrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
"cvttpd2dqx\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (int_x86_sse2_cvttpd2dq
@@ -2146,7 +2146,7 @@ def VCVTTPD2DQYrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
(int_x86_avx_cvtt_pd2dq_256 (loadv4f64 addr:$src)))],
IIC_SSE_CVT_PD_RM>, VEX, VEX_L, Sched<[WriteCvtF2ILd]>;
def : InstAlias<"vcvttpd2dq\t{$src, $dst|$dst, $src}",
- (VCVTTPD2DQYrr VR128:$dst, VR256:$src)>;
+ (VCVTTPD2DQYrr VR128:$dst, VR256:$src), 0>;
let Predicates = [HasAVX] in {
def : Pat<(v4i32 (fp_to_sint (v4f64 VR256:$src))),
@@ -2252,7 +2252,7 @@ def VCVTPD2PSrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
// XMM only
def : InstAlias<"vcvtpd2psx\t{$src, $dst|$dst, $src}",
- (VCVTPD2PSrr VR128:$dst, VR128:$src)>;
+ (VCVTPD2PSrr VR128:$dst, VR128:$src), 0>;
def VCVTPD2PSXrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
"cvtpd2psx\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
@@ -2271,7 +2271,7 @@ def VCVTPD2PSYrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
(int_x86_avx_cvt_pd2_ps_256 (loadv4f64 addr:$src)))],
IIC_SSE_CVT_PD_RM>, VEX, VEX_L, Sched<[WriteCvtF2FLd]>;
def : InstAlias<"vcvtpd2ps\t{$src, $dst|$dst, $src}",
- (VCVTPD2PSYrr VR128:$dst, VR256:$src)>;
+ (VCVTPD2PSYrr VR128:$dst, VR256:$src), 0>;
def CVTPD2PSrr : PDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvtpd2ps\t{$src, $dst|$dst, $src}",
@@ -2973,6 +2973,19 @@ defm XOR : sse12_fp_packed_logical<0x57, "xor", xor>;
let isCommutable = 0 in
defm ANDN : sse12_fp_packed_logical<0x55, "andn", X86andnp>;
+// AVX1 requires type coercions in order to fold loads directly into logical
+// operations.
+let Predicates = [HasAVX1Only] in {
+ def : Pat<(bc_v8f32 (and VR256:$src1, (loadv4i64 addr:$src2))),
+ (VANDPSYrm VR256:$src1, addr:$src2)>;
+ def : Pat<(bc_v8f32 (or VR256:$src1, (loadv4i64 addr:$src2))),
+ (VORPSYrm VR256:$src1, addr:$src2)>;
+ def : Pat<(bc_v8f32 (xor VR256:$src1, (loadv4i64 addr:$src2))),
+ (VXORPSYrm VR256:$src1, addr:$src2)>;
+ def : Pat<(bc_v8f32 (X86andnp VR256:$src1, (loadv4i64 addr:$src2))),
+ (VANDNPSYrm VR256:$src1, addr:$src2)>;
+}
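A hand-written equivalent of what these patterns select (illustrative; AVX1 has no 256-bit integer `vpand`, so the integer op is routed through the float domain and the load still folds into the instruction):

```cpp
#include <immintrin.h>

__m256i and256_avx1(__m256i a, const void *p) {
  __m256 lhs = _mm256_castsi256_ps(a);
  __m256 rhs = _mm256_loadu_ps((const float *)p); // memory operand folds
  return _mm256_castps_si256(_mm256_and_ps(lhs, rhs));
}
```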
+
//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Arithmetic Instructions
//===----------------------------------------------------------------------===//
@@ -3144,23 +3157,23 @@ let Predicates = [UseSSE2] in {
let Predicates = [UseSSE41] in {
// If the subtarget has SSE4.1 but not AVX, the vector insert
- // instruction is lowered into a X86insrtps rather than a X86Movss.
+ // instruction is lowered into a X86insertps rather than a X86Movss.
// When selecting SSE scalar single-precision fp arithmetic instructions,
- // make sure that we correctly match the X86insrtps.
+ // make sure that we correctly match the X86insertps.
- def : Pat<(v4f32 (X86insrtps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
+ def : Pat<(v4f32 (X86insertps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
(fadd (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
FR32:$src))), (iPTR 0))),
(ADDSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
- def : Pat<(v4f32 (X86insrtps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
+ def : Pat<(v4f32 (X86insertps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
(fsub (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
FR32:$src))), (iPTR 0))),
(SUBSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
- def : Pat<(v4f32 (X86insrtps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
+ def : Pat<(v4f32 (X86insertps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
(fmul (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
FR32:$src))), (iPTR 0))),
(MULSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
- def : Pat<(v4f32 (X86insrtps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
+ def : Pat<(v4f32 (X86insertps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
(fdiv (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
FR32:$src))), (iPTR 0))),
(DIVSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
@@ -3186,19 +3199,19 @@ let Predicates = [HasAVX] in {
(f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
FR64:$src))))),
(VDIVSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>;
- def : Pat<(v4f32 (X86insrtps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
+ def : Pat<(v4f32 (X86insertps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
(fadd (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
FR32:$src))), (iPTR 0))),
(VADDSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
- def : Pat<(v4f32 (X86insrtps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
+ def : Pat<(v4f32 (X86insertps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
(fsub (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
FR32:$src))), (iPTR 0))),
(VSUBSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
- def : Pat<(v4f32 (X86insrtps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
+ def : Pat<(v4f32 (X86insertps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
(fmul (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
FR32:$src))), (iPTR 0))),
(VMULSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
- def : Pat<(v4f32 (X86insrtps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
+ def : Pat<(v4f32 (X86insertps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
(fdiv (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
FR32:$src))), (iPTR 0))),
(VDIVSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
@@ -4068,6 +4081,10 @@ defm PADDQ : PDI_binop_all<0xD4, "paddq", add, v2i64, v4i64,
SSE_INTALUQ_ITINS_P, 1>;
defm PMULLW : PDI_binop_all<0xD5, "pmullw", mul, v8i16, v16i16,
SSE_INTMUL_ITINS_P, 1>;
+defm PMULHUW : PDI_binop_all<0xE4, "pmulhuw", mulhu, v8i16, v16i16,
+ SSE_INTMUL_ITINS_P, 1>;
+defm PMULHW : PDI_binop_all<0xE5, "pmulhw", mulhs, v8i16, v16i16,
+ SSE_INTMUL_ITINS_P, 1>;
defm PSUBB : PDI_binop_all<0xF8, "psubb", sub, v16i8, v32i8,
SSE_INTALU_ITINS_P, 0>;
defm PSUBW : PDI_binop_all<0xF9, "psubw", sub, v8i16, v16i16,
@@ -4102,10 +4119,6 @@ defm PADDUSB : PDI_binop_all_int<0xDC, "paddusb", int_x86_sse2_paddus_b,
int_x86_avx2_paddus_b, SSE_INTALU_ITINS_P, 1>;
defm PADDUSW : PDI_binop_all_int<0xDD, "paddusw", int_x86_sse2_paddus_w,
int_x86_avx2_paddus_w, SSE_INTALU_ITINS_P, 1>;
-defm PMULHUW : PDI_binop_all_int<0xE4, "pmulhuw", int_x86_sse2_pmulhu_w,
- int_x86_avx2_pmulhu_w, SSE_INTMUL_ITINS_P, 1>;
-defm PMULHW : PDI_binop_all_int<0xE5, "pmulhw" , int_x86_sse2_pmulh_w,
- int_x86_avx2_pmulh_w, SSE_INTMUL_ITINS_P, 1>;
defm PMADDWD : PDI_binop_all_int<0xF5, "pmaddwd", int_x86_sse2_pmadd_wd,
int_x86_avx2_pmadd_wd, SSE_PMADD, 1>;
defm PAVGB : PDI_binop_all_int<0xE0, "pavgb", int_x86_sse2_pavg_b,
@@ -6515,7 +6528,7 @@ multiclass SS41I_insertf32<bits<8> opc, string asm, bit Is2Addr = 1,
!strconcat(asm,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
[(set VR128:$dst,
- (X86insrtps VR128:$src1, VR128:$src2, imm:$src3))], itins.rr>,
+ (X86insertps VR128:$src1, VR128:$src2, imm:$src3))], itins.rr>,
Sched<[WriteFShuffle]>;
def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, f32mem:$src2, u32u8imm:$src3),
@@ -6524,7 +6537,7 @@ multiclass SS41I_insertf32<bits<8> opc, string asm, bit Is2Addr = 1,
!strconcat(asm,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
[(set VR128:$dst,
- (X86insrtps VR128:$src1,
+ (X86insertps VR128:$src1,
(v4f32 (scalar_to_vector (loadf32 addr:$src2))),
imm:$src3))], itins.rm>,
Sched<[WriteFShuffleLd, ReadAfterLd]>;
@@ -6537,6 +6550,29 @@ let ExeDomain = SSEPackedSingle in {
defm INSERTPS : SS41I_insertf32<0x21, "insertps", 1, SSE_INSERT_ITINS>;
}
+let Predicates = [UseSSE41] in {
+ // If we're inserting an element from a load or a null pshuf of a load,
+ // fold the load into the insertps instruction.
+ def : Pat<(v4f32 (X86insertps (v4f32 VR128:$src1), (X86PShufd (v4f32
+ (scalar_to_vector (loadf32 addr:$src2))), (i8 0)),
+ imm:$src3)),
+ (INSERTPSrm VR128:$src1, addr:$src2, imm:$src3)>;
+ def : Pat<(v4f32 (X86insertps (v4f32 VR128:$src1), (X86PShufd
+ (loadv4f32 addr:$src2), (i8 0)), imm:$src3)),
+ (INSERTPSrm VR128:$src1, addr:$src2, imm:$src3)>;
+}
+
+let Predicates = [UseAVX] in {
+ // If we're inserting an element from a vbroadcast of a load, fold the
+ // load into the X86insertps instruction.
+ def : Pat<(v4f32 (X86insertps (v4f32 VR128:$src1),
+ (X86VBroadcast (loadf32 addr:$src2)), imm:$src3)),
+ (VINSERTPSrm VR128:$src1, addr:$src2, imm:$src3)>;
+ def : Pat<(v4f32 (X86insertps (v4f32 VR128:$src1),
+ (X86VBroadcast (loadv4f32 addr:$src2)), imm:$src3)),
+ (VINSERTPSrm VR128:$src1, addr:$src2, imm:$src3)>;
+}
+
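A sketch of the fold in intrinsic form (hypothetical function; the immediate 0x10 inserts lane 0 of the source into lane 1 of the destination):

```cpp
#include <immintrin.h>

// Before the fold: movss (%rdi), %xmm1 ; insertps $16, %xmm1, %xmm0
// After the fold:  insertps $16, (%rdi), %xmm0
__m128 insertLane1(__m128 v, const float *p) {
  return _mm_insert_ps(v, _mm_load_ss(p), 0x10);
}
```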
//===----------------------------------------------------------------------===//
// SSE4.1 - Round Instructions
//===----------------------------------------------------------------------===//
@@ -6990,6 +7026,31 @@ multiclass SS48I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
+/// SS48I_binop_rm2 - Simple SSE41 binary operator with different src and dst
+/// types.
+multiclass SS48I_binop_rm2<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ ValueType DstVT, ValueType SrcVT, RegisterClass RC,
+ PatFrag memop_frag, X86MemOperand x86memop,
+ OpndItins itins,
+ bit IsCommutable = 0, bit Is2Addr = 1> {
+ let isCommutable = IsCommutable in
+ def rr : SS48I<opc, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1), RC:$src2)))]>,
+ Sched<[itins.Sched]>;
+ def rm : SS48I<opc, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, x86memop:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1),
+ (bitconvert (memop_frag addr:$src2)))))]>,
+ Sched<[itins.Sched.Folded, ReadAfterLd]>;
+}
+
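The new multiclass exists because `pmuldq` has different source and destination types; a short illustration via the SSE4.1 intrinsic:

```cpp
#include <immintrin.h>

// Signed multiply of the even 32-bit lanes, producing 64-bit results:
// v4i32 x v4i32 -> v2i64, exactly the SrcVT/DstVT split parameterized above.
__m128i mulEvenLanes(__m128i a, __m128i b) {
  return _mm_mul_epi32(a, b); // pmuldq
}
```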
let Predicates = [HasAVX] in {
let isCommutable = 0 in
defm VPACKUSDW : SS41I_binop_rm_int<0x2B, "vpackusdw", int_x86_sse41_packusdw,
@@ -7018,8 +7079,9 @@ let Predicates = [HasAVX] in {
defm VPMAXUW : SS48I_binop_rm<0x3E, "vpmaxuw", X86umax, v8i16, VR128,
loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
VEX_4V;
- defm VPMULDQ : SS41I_binop_rm_int<0x28, "vpmuldq", int_x86_sse41_pmuldq,
- 0, DEFAULT_ITINS_VECIMULSCHED>, VEX_4V;
+ defm VPMULDQ : SS48I_binop_rm2<0x28, "vpmuldq", X86pmuldq, v2i64, v4i32,
+ VR128, loadv2i64, i128mem,
+ SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V;
}
let Predicates = [HasAVX2] in {
@@ -7051,9 +7113,9 @@ let Predicates = [HasAVX2] in {
defm VPMAXUWY : SS48I_binop_rm<0x3E, "vpmaxuw", X86umax, v16i16, VR256,
loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
VEX_4V, VEX_L;
- defm VPMULDQ : SS41I_binop_rm_int_y<0x28, "vpmuldq",
- int_x86_avx2_pmul_dq, WriteVecIMul>,
- VEX_4V, VEX_L;
+ defm VPMULDQY : SS48I_binop_rm2<0x28, "vpmuldq", X86pmuldq, v4i64, v8i32,
+ VR256, loadv4i64, i256mem,
+ SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V, VEX_L;
}
let Constraints = "$src1 = $dst" in {
@@ -7076,8 +7138,9 @@ let Constraints = "$src1 = $dst" in {
memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
defm PMAXUW : SS48I_binop_rm<0x3E, "pmaxuw", X86umax, v8i16, VR128,
memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
- defm PMULDQ : SS41I_binop_rm_int<0x28, "pmuldq", int_x86_sse41_pmuldq,
- 1, SSE_INTMUL_ITINS_P>;
+ defm PMULDQ : SS48I_binop_rm2<0x28, "pmuldq", X86pmuldq, v2i64, v4i32,
+ VR128, memopv2i64, i128mem,
+ SSE_INTMUL_ITINS_P, 1>;
}
let Predicates = [HasAVX] in {
@@ -7394,6 +7457,7 @@ let Predicates = [UseSSE41] in {
}
+let SchedRW = [WriteLoad] in {
let Predicates = [HasAVX] in
def VMOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
"vmovntdqa\t{$src, $dst|$dst, $src}",
@@ -7407,6 +7471,7 @@ def VMOVNTDQAYrm : SS48I<0x2A, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
def MOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
"movntdqa\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (int_x86_sse41_movntdqa addr:$src))]>;
+} // SchedRW
//===----------------------------------------------------------------------===//
// SSE4.2 - Compare Instructions
@@ -7831,18 +7896,20 @@ def PCLMULQDQrm : PCLMULIi8<0x44, MRMSrcMem, (outs VR128:$dst),
multiclass pclmul_alias<string asm, int immop> {
def : InstAlias<!strconcat("pclmul", asm, "dq {$src, $dst|$dst, $src}"),
- (PCLMULQDQrr VR128:$dst, VR128:$src, immop)>;
+ (PCLMULQDQrr VR128:$dst, VR128:$src, immop), 0>;
def : InstAlias<!strconcat("pclmul", asm, "dq {$src, $dst|$dst, $src}"),
- (PCLMULQDQrm VR128:$dst, i128mem:$src, immop)>;
+ (PCLMULQDQrm VR128:$dst, i128mem:$src, immop), 0>;
def : InstAlias<!strconcat("vpclmul", asm,
"dq {$src2, $src1, $dst|$dst, $src1, $src2}"),
- (VPCLMULQDQrr VR128:$dst, VR128:$src1, VR128:$src2, immop)>;
+ (VPCLMULQDQrr VR128:$dst, VR128:$src1, VR128:$src2, immop),
+ 0>;
def : InstAlias<!strconcat("vpclmul", asm,
"dq {$src2, $src1, $dst|$dst, $src1, $src2}"),
- (VPCLMULQDQrm VR128:$dst, VR128:$src1, i128mem:$src2, immop)>;
+ (VPCLMULQDQrm VR128:$dst, VR128:$src1, i128mem:$src2, immop),
+ 0>;
}
defm : pclmul_alias<"hqhq", 0x11>;
defm : pclmul_alias<"hqlq", 0x01>;
@@ -8291,6 +8358,12 @@ let Predicates = [HasF16C] in {
defm VCVTPH2PSY : f16c_ph2ps<VR256, f128mem, int_x86_vcvtph2ps_256>, VEX_L;
defm VCVTPS2PH : f16c_ps2ph<VR128, f64mem, int_x86_vcvtps2ph_128>;
defm VCVTPS2PHY : f16c_ps2ph<VR256, f128mem, int_x86_vcvtps2ph_256>, VEX_L;
+
+ // Pattern match vcvtph2ps of a scalar i64 load.
+ def : Pat<(int_x86_vcvtph2ps_128 (vzmovl_v2i64 addr:$src)),
+ (VCVTPH2PSrm addr:$src)>;
+ def : Pat<(int_x86_vcvtph2ps_128 (vzload_v2i64 addr:$src)),
+ (VCVTPH2PSrm addr:$src)>;
}
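The xmm form of `vcvtph2ps` consumes exactly four half-precision values, i.e. one 8-byte chunk, which is why a plain scalar i64 load can feed it; an intrinsic-level sketch (requires F16C):

```cpp
#include <immintrin.h>

__m128 loadHalf4(const void *p) {
  __m128i h = _mm_loadl_epi64((const __m128i *)p); // 8-byte load, upper zeroed
  return _mm_cvtph_ps(h);                          // vcvtph2ps
}
```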
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/X86/X86InstrSystem.td b/lib/Target/X86/X86InstrSystem.td
index 9d3aa1c..b5595cb 100644
--- a/lib/Target/X86/X86InstrSystem.td
+++ b/lib/Target/X86/X86InstrSystem.td
@@ -19,7 +19,7 @@ let Defs = [RAX, RDX] in
TB;
let Defs = [RAX, RCX, RDX] in
- def RDTSCP : I<0x01, MRM_F9, (outs), (ins), "rdtscp", []>, TB;
+ def RDTSCP : I<0x01, MRM_F9, (outs), (ins), "rdtscp", [(X86rdtscp)]>, TB;
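`rdtscp` writes the TSC to EDX:EAX and IA32_TSC_AUX (typically a CPU/node id) to ECX, which is why RCX joins the def list; a minimal inline-asm sketch:

```cpp
#include <cstdint>

static inline uint64_t rdtscp(uint32_t &aux) {
  uint32_t lo, hi;
  asm volatile("rdtscp" : "=a"(lo), "=d"(hi), "=c"(aux));
  return ((uint64_t)hi << 32) | lo;
}
```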
// CPU flow control instructions
diff --git a/lib/Target/X86/X86JITInfo.cpp b/lib/Target/X86/X86JITInfo.cpp
index e99f2d9..e969ef2 100644
--- a/lib/Target/X86/X86JITInfo.cpp
+++ b/lib/Target/X86/X86JITInfo.cpp
@@ -11,7 +11,6 @@
//
//===----------------------------------------------------------------------===//
-#define DEBUG_TYPE "jit"
#include "X86JITInfo.h"
#include "X86Relocations.h"
#include "X86Subtarget.h"
@@ -24,6 +23,8 @@
#include <cstring>
using namespace llvm;
+#define DEBUG_TYPE "jit"
+
// Determine the platform we're running on
#if defined (__x86_64__) || defined (_M_AMD64) || defined (_M_X64)
# define X86_64_JIT
@@ -427,9 +428,14 @@ X86JITInfo::getLazyResolverFunction(JITCompilerFn F) {
TsanIgnoreWritesEnd();
#if defined (X86_32_JIT) && !defined (_MSC_VER)
+#if defined(__SSE__)
+  // If LLVM itself is built with SSE, the SSE-saving callback is always
+  // required, regardless of the subtarget.
+ return X86CompilationCallback_SSE;
+#else
if (Subtarget->hasSSE1())
return X86CompilationCallback_SSE;
#endif
+#endif
return X86CompilationCallback;
}
@@ -437,7 +443,7 @@ X86JITInfo::getLazyResolverFunction(JITCompilerFn F) {
X86JITInfo::X86JITInfo(X86TargetMachine &tm) : TM(tm) {
Subtarget = &TM.getSubtarget<X86Subtarget>();
useGOT = 0;
- TLSOffset = 0;
+ TLSOffset = nullptr;
}
void *X86JITInfo::emitGlobalValueIndirectSym(const GlobalValue* GV, void *ptr,
diff --git a/lib/Target/X86/X86MCInstLower.cpp b/lib/Target/X86/X86MCInstLower.cpp
index 6d7f3cb..0190080 100644
--- a/lib/Target/X86/X86MCInstLower.cpp
+++ b/lib/Target/X86/X86MCInstLower.cpp
@@ -120,7 +120,7 @@ GetSymbolFromOperand(const MachineOperand &MO) const {
case X86II::MO_DARWIN_NONLAZY_PIC_BASE: {
MachineModuleInfoImpl::StubValueTy &StubSym =
getMachOMMI().getGVStubEntry(Sym);
- if (StubSym.getPointer() == 0) {
+ if (!StubSym.getPointer()) {
assert(MO.isGlobal() && "Extern symbol not handled yet");
StubSym =
MachineModuleInfoImpl::
@@ -132,7 +132,7 @@ GetSymbolFromOperand(const MachineOperand &MO) const {
case X86II::MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE: {
MachineModuleInfoImpl::StubValueTy &StubSym =
getMachOMMI().getHiddenGVStubEntry(Sym);
- if (StubSym.getPointer() == 0) {
+ if (!StubSym.getPointer()) {
assert(MO.isGlobal() && "Extern symbol not handled yet");
StubSym =
MachineModuleInfoImpl::
@@ -168,7 +168,7 @@ MCOperand X86MCInstLower::LowerSymbolOperand(const MachineOperand &MO,
MCSymbol *Sym) const {
// FIXME: We would like an efficient form for this, so we don't have to do a
// lot of extra uniquing.
- const MCExpr *Expr = 0;
+ const MCExpr *Expr = nullptr;
MCSymbolRefExpr::VariantKind RefKind = MCSymbolRefExpr::VK_None;
switch (MO.getTargetFlags()) {
@@ -223,7 +223,7 @@ MCOperand X86MCInstLower::LowerSymbolOperand(const MachineOperand &MO,
break;
}
- if (Expr == 0)
+ if (!Expr)
Expr = MCSymbolRefExpr::Create(Sym, RefKind, Ctx);
if (!MO.isJTI() && !MO.isMBB() && MO.getOffset())
diff --git a/lib/Target/X86/X86PadShortFunction.cpp b/lib/Target/X86/X86PadShortFunction.cpp
index 746d0d6..6639875 100644
--- a/lib/Target/X86/X86PadShortFunction.cpp
+++ b/lib/Target/X86/X86PadShortFunction.cpp
@@ -15,9 +15,9 @@
#include <algorithm>
-#define DEBUG_TYPE "x86-pad-short-functions"
#include "X86.h"
#include "X86InstrInfo.h"
+#include "X86Subtarget.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -30,6 +30,8 @@
using namespace llvm;
+#define DEBUG_TYPE "x86-pad-short-functions"
+
STATISTIC(NumBBsPadded, "Number of basic blocks padded");
namespace {
@@ -49,7 +51,7 @@ namespace {
struct PadShortFunc : public MachineFunctionPass {
static char ID;
PadShortFunc() : MachineFunctionPass(ID)
- , Threshold(4), TM(0), TII(0) {}
+ , Threshold(4), TM(nullptr), TII(nullptr) {}
bool runOnMachineFunction(MachineFunction &MF) override;
@@ -100,6 +102,9 @@ bool PadShortFunc::runOnMachineFunction(MachineFunction &MF) {
}
TM = &MF.getTarget();
+ if (!TM->getSubtarget<X86Subtarget>().padShortFunctions())
+ return false;
+
TII = TM->getInstrInfo();
// Search through basic blocks and mark the ones that have early returns
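X86PadShortFunction.cpp also gets the include-ordering change that recurs through the C++ files in this patch (X86JITInfo.cpp above, X86SelectionDAGInfo.cpp and X86Subtarget.cpp below): the DEBUG_TYPE macro moves from before the #includes to after them, so the definition applies only to this translation unit and can no longer affect headers that use or define DEBUG_TYPE themselves. A sketch of the resulting file layout, assembled from the hunks above:

    // Old layout: the macro was visible to every header included below it.
    //   #define DEBUG_TYPE "x86-pad-short-functions"
    //   #include "X86.h"

    // New layout: all headers first, then the per-file macro.
    #include "X86.h"
    #include "llvm/ADT/Statistic.h"
    using namespace llvm;

    #define DEBUG_TYPE "x86-pad-short-functions"

    STATISTIC(NumBBsPadded, "Number of basic blocks padded");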
diff --git a/lib/Target/X86/X86RegisterInfo.cpp b/lib/Target/X86/X86RegisterInfo.cpp
index 85aa9b5..a83e1e4 100644
--- a/lib/Target/X86/X86RegisterInfo.cpp
+++ b/lib/Target/X86/X86RegisterInfo.cpp
@@ -38,11 +38,11 @@
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
+using namespace llvm;
+
#define GET_REGINFO_TARGET_DESC
#include "X86GenRegisterInfo.inc"
-using namespace llvm;
-
cl::opt<bool>
ForceStackAlign("force-align-stack",
cl::desc("Force align the stack to the minimum alignment"
@@ -129,7 +129,7 @@ X86RegisterInfo::getMatchingSuperRegClass(const TargetRegisterClass *A,
if (!Is64Bit && SubIdx == X86::sub_8bit) {
A = X86GenRegisterInfo::getSubClassWithSubReg(A, X86::sub_8bit_hi);
if (!A)
- return 0;
+ return nullptr;
}
return X86GenRegisterInfo::getMatchingSuperRegClass(A, B, SubIdx);
}
@@ -231,7 +231,7 @@ X86RegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
}
}
-const uint16_t *
+const MCPhysReg *
X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
bool HasAVX = TM.getSubtarget<X86Subtarget>().hasAVX();
bool HasAVX512 = TM.getSubtarget<X86Subtarget>().hasAVX512();
diff --git a/lib/Target/X86/X86RegisterInfo.h b/lib/Target/X86/X86RegisterInfo.h
index 6a71113..2289d91 100644
--- a/lib/Target/X86/X86RegisterInfo.h
+++ b/lib/Target/X86/X86RegisterInfo.h
@@ -100,7 +100,7 @@ public:
/// getCalleeSavedRegs - Return a null-terminated list of all of the
/// callee-save registers on this target.
- const uint16_t *
+ const MCPhysReg *
getCalleeSavedRegs(const MachineFunction* MF) const override;
const uint32_t *getCallPreservedMask(CallingConv::ID) const override;
const uint32_t *getNoPreservedMask() const;
@@ -122,7 +122,7 @@ public:
void eliminateFrameIndex(MachineBasicBlock::iterator MI,
int SPAdj, unsigned FIOperandNum,
- RegScavenger *RS = NULL) const override;
+ RegScavenger *RS = nullptr) const override;
// Debug information queries.
unsigned getFrameRegister(const MachineFunction &MF) const override;
diff --git a/lib/Target/X86/X86SchedHaswell.td b/lib/Target/X86/X86SchedHaswell.td
index f5b51ee..6966d61 100644
--- a/lib/Target/X86/X86SchedHaswell.td
+++ b/lib/Target/X86/X86SchedHaswell.td
@@ -20,6 +20,9 @@ def HaswellModel : SchedMachineModel {
let LoadLatency = 4;
let MispredictPenalty = 16;
+ // Based on the LSD (loop-stream detector) queue size and benchmarking data.
+ let LoopMicroOpBufferSize = 50;
+
// FIXME: SSE4 and AVX are unimplemented. This flag is set to allow
// the scheduler to assign a default model to unrecognized opcodes.
let CompleteModel = 0;
diff --git a/lib/Target/X86/X86SchedSandyBridge.td b/lib/Target/X86/X86SchedSandyBridge.td
index a58859a..83f0534 100644
--- a/lib/Target/X86/X86SchedSandyBridge.td
+++ b/lib/Target/X86/X86SchedSandyBridge.td
@@ -21,6 +21,9 @@ def SandyBridgeModel : SchedMachineModel {
let LoadLatency = 4;
let MispredictPenalty = 16;
+ // Based on the LSD (loop-stream detector) queue size.
+ let LoopMicroOpBufferSize = 28;
+
// FIXME: SSE4 and AVX are unimplemented. This flag is set to allow
// the scheduler to assign a default model to unrecognized opcodes.
let CompleteModel = 0;
diff --git a/lib/Target/X86/X86ScheduleAtom.td b/lib/Target/X86/X86ScheduleAtom.td
index ba72f29..3256ee7 100644
--- a/lib/Target/X86/X86ScheduleAtom.td
+++ b/lib/Target/X86/X86ScheduleAtom.td
@@ -535,5 +535,9 @@ def AtomModel : SchedMachineModel {
let LoadLatency = 3; // Expected cycles, may be overridden by OperandCycles.
let HighLatency = 30;// Expected, may be overridden by OperandCycles.
+ // On the Atom, the throughput for taken branches is 2 cycles. For small
+ // simple loops, expand by a small factor to hide the backedge cost.
+ let LoopMicroOpBufferSize = 10;
+
let Itineraries = AtomItineraries;
}
diff --git a/lib/Target/X86/X86ScheduleSLM.td b/lib/Target/X86/X86ScheduleSLM.td
index 6c2a304..823d101 100644
--- a/lib/Target/X86/X86ScheduleSLM.td
+++ b/lib/Target/X86/X86ScheduleSLM.td
@@ -1,4 +1,4 @@
-//===- X86ScheduleSLM.td - X86 Atom Scheduling Definitions -*- tablegen -*-==//
+//=- X86ScheduleSLM.td - X86 Silvermont Scheduling -----------*- tablegen -*-=//
//
// The LLVM Compiler Infrastructure
//
@@ -7,662 +7,225 @@
//
//===----------------------------------------------------------------------===//
//
-// This file defines the itinerary class data for the Intel Atom
-// (Silvermont) processor.
+// This file defines the machine model for Intel Silvermont to support
+// instruction scheduling and other instruction cost heuristics.
//
//===----------------------------------------------------------------------===//
-def IEC_RSV0 : FuncUnit;
-def IEC_RSV1 : FuncUnit;
-def FPC_RSV0 : FuncUnit;
-def FPC_RSV1 : FuncUnit;
-def MEC_RSV : FuncUnit;
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-def SLMItineraries : ProcessorItineraries<
- [ IEC_RSV0, IEC_RSV1, FPC_RSV0, FPC_RSV1, MEC_RSV ],
- [], [
- // [InstrStage<N, [FPC_RSV0, FPC_RSV1]>]
- // [InstrStage<N, [FPC_RSV0, FPC_RSV1], 0>, InstrStage<N, [MEC_RSV]>]
- // [InstrStage<N, [IEC_RSV0, IEC_RSV1]>]
- // [InstrStage<N, [IEC_RSV0, IEC_RSV1], 0>,InstrStage<N,[MEC_RSV]>]
- //
- // Default is 1 cycle, IEC_RSV0 or IEC_RSV1
- //InstrItinData<IIC_DEFAULT, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_ALU_MEM, [InstrStage<1, [IEC_RSV0, IEC_RSV1], 0>,
- InstrStage<1, [MEC_RSV]>] >,
- InstrItinData<IIC_ALU_NONMEM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_LEA, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_LEA_16, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- // mul
- InstrItinData<IIC_MUL8, [InstrStage<4, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_MUL16_MEM, [InstrStage<4, [IEC_RSV0, IEC_RSV1], 0>,
- InstrStage<4, [MEC_RSV]>] >,
- InstrItinData<IIC_MUL16_REG, [InstrStage<4, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_MUL32_MEM, [InstrStage<3, [IEC_RSV0, IEC_RSV1], 0>,
- InstrStage<3, [MEC_RSV]>] >,
- InstrItinData<IIC_MUL32_REG, [InstrStage<3, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_MUL64, [InstrStage<4, [IEC_RSV0, IEC_RSV1]>] >,
- // imul by al, ax, eax, rax
- InstrItinData<IIC_IMUL8, [InstrStage<6, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_IMUL16_MEM, [InstrStage<6, [IEC_RSV0, IEC_RSV1], 0>,
- InstrStage<6, [MEC_RSV]>] >,
- InstrItinData<IIC_IMUL16_REG, [InstrStage<6, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_IMUL32_MEM, [InstrStage<6, [IEC_RSV0, IEC_RSV1], 0>,
- InstrStage<6, [MEC_RSV]>] >,
- InstrItinData<IIC_IMUL32_REG, [InstrStage<6, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_IMUL64, [InstrStage<6, [IEC_RSV0, IEC_RSV1]>] >,
- // imul reg by reg|mem
- InstrItinData<IIC_IMUL16_RM, [InstrStage<4, [IEC_RSV0, IEC_RSV1], 0>,
- InstrStage<4, [MEC_RSV]>] >,
- InstrItinData<IIC_IMUL16_RR, [InstrStage<4, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_IMUL32_RM, [InstrStage<3, [IEC_RSV0, IEC_RSV1], 0>,
- InstrStage<3, [MEC_RSV]>] >,
- InstrItinData<IIC_IMUL32_RR, [InstrStage<3, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_IMUL64_RM, [InstrStage<4, [IEC_RSV0, IEC_RSV1], 0>,
- InstrStage<4, [MEC_RSV]>] >,
- InstrItinData<IIC_IMUL64_RR, [InstrStage<4, [IEC_RSV0, IEC_RSV1]>] >,
- // imul reg = reg/mem * imm
- InstrItinData<IIC_IMUL16_RRI, [InstrStage<4, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_IMUL32_RRI, [InstrStage<3, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_IMUL64_RRI, [InstrStage<4, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_IMUL16_RMI, [InstrStage<4, [IEC_RSV0, IEC_RSV1], 0>,
- InstrStage<4, [MEC_RSV]>] >,
- InstrItinData<IIC_IMUL32_RMI, [InstrStage<3, [IEC_RSV0, IEC_RSV1], 0>,
- InstrStage<3, [MEC_RSV]>] >,
- InstrItinData<IIC_IMUL64_RMI, [InstrStage<4, [IEC_RSV0, IEC_RSV1], 0>,
- InstrStage<4, [MEC_RSV]>] >,
- // idiv - min latency
- InstrItinData<IIC_IDIV8, [InstrStage<34, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_IDIV16, [InstrStage<35, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_IDIV32, [InstrStage<35, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_IDIV64, [InstrStage<49, [IEC_RSV0, IEC_RSV1]>] >,
- // div - min latency
- InstrItinData<IIC_DIV8_REG, [InstrStage<25, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_DIV8_MEM, [InstrStage<25, [IEC_RSV0, IEC_RSV1], 0>,
- InstrStage<25, [MEC_RSV]>] >,
- InstrItinData<IIC_DIV16, [InstrStage<26, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_DIV32, [InstrStage<26, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_DIV64, [InstrStage<38, [IEC_RSV0, IEC_RSV1]>] >,
- // neg/not/inc/dec
- InstrItinData<IIC_UNARY_REG, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_UNARY_MEM, [InstrStage<1, [IEC_RSV0, IEC_RSV1], 0>,
- InstrStage<1, [MEC_RSV]>] >,
- // add/sub/and/or/xor/adc/sbc/cmp/test
- InstrItinData<IIC_BIN_NONMEM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_BIN_MEM, [InstrStage<1, [IEC_RSV0, IEC_RSV1], 0>,
- InstrStage<1, [MEC_RSV]>] >,
- // adc/sbb
- InstrItinData<IIC_BIN_CARRY_NONMEM, [InstrStage<2, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_BIN_CARRY_MEM, [InstrStage<2, [IEC_RSV0, IEC_RSV1], 0>,
- InstrStage<2, [MEC_RSV]>] >,
- // shift/rotate
- InstrItinData<IIC_SR, [InstrStage<1, [IEC_RSV0], 0>,
- InstrStage<1, [MEC_RSV]>] >,
- // shift double
- InstrItinData<IIC_SHD16_REG_IM, [InstrStage<2, [IEC_RSV0]>] >,
- InstrItinData<IIC_SHD16_REG_CL, [InstrStage<4, [IEC_RSV0]>] >,
- InstrItinData<IIC_SHD16_MEM_IM, [InstrStage<2, [IEC_RSV0], 0>,
- InstrStage<2, [MEC_RSV]>] >,
- InstrItinData<IIC_SHD16_MEM_CL, [InstrStage<4, [IEC_RSV0], 0>,
- InstrStage<4, [MEC_RSV]>] >,
- InstrItinData<IIC_SHD32_REG_IM, [InstrStage<2, [IEC_RSV0]>] >,
- InstrItinData<IIC_SHD32_REG_CL, [InstrStage<4, [IEC_RSV0]>] >,
- InstrItinData<IIC_SHD32_MEM_IM, [InstrStage<2, [IEC_RSV0], 0>,
- InstrStage<2, [MEC_RSV]>] >,
- InstrItinData<IIC_SHD32_MEM_CL, [InstrStage<4, [IEC_RSV0], 0>,
- InstrStage<4, [MEC_RSV]>] >,
- InstrItinData<IIC_SHD64_REG_IM, [InstrStage<2, [IEC_RSV0]>] >,
- InstrItinData<IIC_SHD64_REG_CL, [InstrStage<4, [IEC_RSV0]>] >,
- InstrItinData<IIC_SHD64_MEM_IM, [InstrStage<2, [IEC_RSV0], 0>,
- InstrStage<2, [MEC_RSV]>] >,
- InstrItinData<IIC_SHD64_MEM_CL, [InstrStage<4, [IEC_RSV0], 0>,
- InstrStage<4, [MEC_RSV]>] >,
- // cmov
- InstrItinData<IIC_CMOV16_RM, [InstrStage<2, [IEC_RSV0, IEC_RSV1], 0>,
- InstrStage<2, [MEC_RSV]>] >,
- InstrItinData<IIC_CMOV16_RR, [InstrStage<2, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_CMOV32_RM, [InstrStage<2, [IEC_RSV0, IEC_RSV1], 0>,
- InstrStage<2, [MEC_RSV]>] >,
- InstrItinData<IIC_CMOV32_RR, [InstrStage<2, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_CMOV64_RM, [InstrStage<2, [IEC_RSV0, IEC_RSV1], 0>,
- InstrStage<2, [MEC_RSV]>] >,
- InstrItinData<IIC_CMOV64_RR, [InstrStage<2, [IEC_RSV0, IEC_RSV1]>] >,
- // set
- InstrItinData<IIC_SET_M, [InstrStage<1, [IEC_RSV0, IEC_RSV1], 0>,
- InstrStage<1, [MEC_RSV]>] >,
- InstrItinData<IIC_SET_R, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- // jcc
- InstrItinData<IIC_Jcc, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- // jcxz/jecxz/jrcxz
- InstrItinData<IIC_JCXZ, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- // jmp rel
- InstrItinData<IIC_JMP_REL, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- // jmp indirect
- InstrItinData<IIC_JMP_REG, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_JMP_MEM, [InstrStage<1, [IEC_RSV0, IEC_RSV1], 0>,
- InstrStage<1, [MEC_RSV]>] >,
- // jmp far
- InstrItinData<IIC_JMP_FAR_MEM, [InstrStage<1, [IEC_RSV0, IEC_RSV1], 0>,
- InstrStage<1, [MEC_RSV]>] >,
- InstrItinData<IIC_JMP_FAR_PTR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- // loop/loope/loopne
- InstrItinData<IIC_LOOP, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_LOOPE, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_LOOPNE, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- // call - all but reg/imm
- InstrItinData<IIC_CALL_RI, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_CALL_MEM, [InstrStage<1, [IEC_RSV0, IEC_RSV1], 0>,
- InstrStage<1, [MEC_RSV]>] >,
- InstrItinData<IIC_CALL_FAR_MEM, [InstrStage<1, [IEC_RSV0, IEC_RSV1], 0>,
- InstrStage<1, [MEC_RSV]>] >,
- InstrItinData<IIC_CALL_FAR_PTR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- //ret
- InstrItinData<IIC_RET, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_RET_IMM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- //sign extension movs
- InstrItinData<IIC_MOVSX, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_MOVSX_R16_R8, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_MOVSX_R16_M8, [InstrStage<1, [IEC_RSV0, IEC_RSV1], 0>,
- InstrStage<1, [MEC_RSV]>] >,
- InstrItinData<IIC_MOVSX_R16_R16, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_MOVSX_R32_R32, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- //zero extension movs
- InstrItinData<IIC_MOVZX, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_MOVZX_R16_R8, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_MOVZX_R16_M8, [InstrStage<1, [IEC_RSV0, IEC_RSV1], 0>,
- InstrStage<1, [MEC_RSV]>] >,
-
- InstrItinData<IIC_REP_MOVS, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_REP_STOS, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-
- // SSE binary operations
- // arithmetic fp scalar
- InstrItinData<IIC_SSE_ALU_F32S_RR, [InstrStage<3, [FPC_RSV0, FPC_RSV1]>] >,
- InstrItinData<IIC_SSE_ALU_F32S_RM, [InstrStage<3, [FPC_RSV0, FPC_RSV1], 0>,
- InstrStage<3, [MEC_RSV]>] >,
- InstrItinData<IIC_SSE_ALU_F64S_RR, [InstrStage<3, [FPC_RSV0, FPC_RSV1]>] >,
- InstrItinData<IIC_SSE_ALU_F64S_RM, [InstrStage<3, [FPC_RSV0, FPC_RSV1], 0>,
- InstrStage<3, [MEC_RSV]>] >,
- InstrItinData<IIC_SSE_MUL_F32S_RR, [InstrStage<1, [FPC_RSV0, FPC_RSV1]>] >,
- InstrItinData<IIC_SSE_MUL_F32S_RM, [InstrStage<1, [FPC_RSV0, FPC_RSV1], 0>,
- InstrStage<1, [MEC_RSV]>] >,
- InstrItinData<IIC_SSE_MUL_F64S_RR, [InstrStage<2, [FPC_RSV0, FPC_RSV1]>] >,
- InstrItinData<IIC_SSE_MUL_F64S_RM, [InstrStage<2, [FPC_RSV0, FPC_RSV1], 0>,
- InstrStage<2, [MEC_RSV]>] >,
- InstrItinData<IIC_SSE_DIV_F32S_RR, [InstrStage<13, [FPC_RSV0, FPC_RSV1]>] >,
- InstrItinData<IIC_SSE_DIV_F32S_RM, [InstrStage<13, [FPC_RSV0, FPC_RSV1], 0>,
- InstrStage<13, [MEC_RSV]>] >,
- InstrItinData<IIC_SSE_DIV_F64S_RR, [InstrStage<13, [FPC_RSV0, FPC_RSV1]>] >,
- InstrItinData<IIC_SSE_DIV_F64S_RM, [InstrStage<13, [FPC_RSV0, FPC_RSV1], 0>,
- InstrStage<13, [MEC_RSV]>] >,
-
- InstrItinData<IIC_SSE_COMIS_RR, [InstrStage<1, [FPC_RSV0, FPC_RSV1]>] >,
- InstrItinData<IIC_SSE_COMIS_RM, [InstrStage<1, [FPC_RSV0, FPC_RSV1]>] >,
-
- InstrItinData<IIC_SSE_HADDSUB_RR, [InstrStage<6, [FPC_RSV0, FPC_RSV1]>] >,
- InstrItinData<IIC_SSE_HADDSUB_RM, [InstrStage<6, [FPC_RSV0, FPC_RSV1], 0>,
- InstrStage<6, [MEC_RSV]>] >,
-
- // arithmetic fp parallel
- InstrItinData<IIC_SSE_ALU_F32P_RR, [InstrStage<3, [FPC_RSV0, FPC_RSV1]>] >,
- InstrItinData<IIC_SSE_ALU_F32P_RM, [InstrStage<3, [FPC_RSV0, FPC_RSV1], 0>,
- InstrStage<3, [MEC_RSV]>] >,
- InstrItinData<IIC_SSE_ALU_F64P_RR, [InstrStage<4, [FPC_RSV0, FPC_RSV1]>] >,
- InstrItinData<IIC_SSE_ALU_F64P_RM, [InstrStage<4, [FPC_RSV0, FPC_RSV1], 0>,
- InstrStage<4, [MEC_RSV]>] >,
- InstrItinData<IIC_SSE_MUL_F32P_RR, [InstrStage<2, [FPC_RSV0, FPC_RSV1]>] >,
- InstrItinData<IIC_SSE_MUL_F32P_RM, [InstrStage<2, [FPC_RSV0, FPC_RSV1], 0>,
- InstrStage<2, [MEC_RSV]>] >,
- InstrItinData<IIC_SSE_MUL_F64P_RR, [InstrStage<4, [FPC_RSV0, FPC_RSV1]>] >,
- InstrItinData<IIC_SSE_MUL_F64P_RM, [InstrStage<4, [FPC_RSV0, FPC_RSV1], 0>,
- InstrStage<4, [MEC_RSV]>] >,
- InstrItinData<IIC_SSE_DIV_F32P_RR, [InstrStage<27, [FPC_RSV0, FPC_RSV1]>] >,
- InstrItinData<IIC_SSE_DIV_F32P_RM, [InstrStage<27, [FPC_RSV0, FPC_RSV1], 0>,
- InstrStage<27, [MEC_RSV]>] >,
- InstrItinData<IIC_SSE_DIV_F64P_RR, [InstrStage<27, [FPC_RSV0, FPC_RSV1]>] >,
- InstrItinData<IIC_SSE_DIV_F64P_RM, [InstrStage<27, [FPC_RSV0, FPC_RSV1], 0>,
- InstrStage<27, [MEC_RSV]>] >,
-
- // bitwise parallel
- InstrItinData<IIC_SSE_BIT_P_RR, [InstrStage<1, [FPC_RSV0, FPC_RSV1]>] >,
- InstrItinData<IIC_SSE_BIT_P_RM, [InstrStage<1, [FPC_RSV0, FPC_RSV1], 0>,
- InstrStage<1, [MEC_RSV]>] >,
-
- // arithmetic int parallel
- InstrItinData<IIC_SSE_INTALU_P_RR, [InstrStage<1, [FPC_RSV0, FPC_RSV1]>] >,
- InstrItinData<IIC_SSE_INTALU_P_RM, [InstrStage<1, [FPC_RSV0, FPC_RSV1], 0>,
- InstrStage<1, [MEC_RSV]>] >,
- InstrItinData<IIC_SSE_INTALUQ_P_RR, [InstrStage<4, [FPC_RSV0, FPC_RSV1]>] >,
- InstrItinData<IIC_SSE_INTALUQ_P_RM, [InstrStage<4, [FPC_RSV0, FPC_RSV1], 0>,
- InstrStage<4, [MEC_RSV]>] >,
-
- // multiply int parallel
- InstrItinData<IIC_SSE_INTMUL_P_RR, [InstrStage<5, [FPC_RSV0]>] >,
- InstrItinData<IIC_SSE_INTMUL_P_RM, [InstrStage<5, [FPC_RSV0], 0>,
- InstrStage<5, [MEC_RSV]>] >,
-
- // shift parallel
- InstrItinData<IIC_SSE_INTSH_P_RR, [InstrStage<2, [FPC_RSV0]>] >,
- InstrItinData<IIC_SSE_INTSH_P_RM, [InstrStage<2, [FPC_RSV0], 0>,
- InstrStage<2, [MEC_RSV]>] >,
- InstrItinData<IIC_SSE_INTSH_P_RI, [InstrStage<1, [FPC_RSV0]>] >,
-
- InstrItinData<IIC_SSE_INTSHDQ_P_RI, [InstrStage<1, [FPC_RSV0]>] >,
-
- InstrItinData<IIC_SSE_SHUFP, [InstrStage<1, [FPC_RSV0]>] >,
- InstrItinData<IIC_SSE_PSHUF_RI, [InstrStage<1, [FPC_RSV0]>] >,
- InstrItinData<IIC_SSE_PSHUF_MI, [InstrStage<1, [FPC_RSV0], 0>,
- InstrStage<1, [MEC_RSV]>] >,
-
- InstrItinData<IIC_SSE_UNPCK, [InstrStage<1, [FPC_RSV0]>] >,
-
- InstrItinData<IIC_SSE_SQRTPS_RR, [InstrStage<26, [FPC_RSV0]>] >,
- InstrItinData<IIC_SSE_SQRTPS_RM, [InstrStage<26, [FPC_RSV0], 0>,
- InstrStage<26, [MEC_RSV]>] >,
- InstrItinData<IIC_SSE_SQRTSS_RR, [InstrStage<13, [FPC_RSV0]>] >,
- InstrItinData<IIC_SSE_SQRTSS_RM, [InstrStage<13, [FPC_RSV0], 0>,
- InstrStage<13, [MEC_RSV]>] >,
- InstrItinData<IIC_SSE_SQRTPD_RR, [InstrStage<26, [FPC_RSV0]>] >,
- InstrItinData<IIC_SSE_SQRTPD_RM, [InstrStage<26, [FPC_RSV0], 0>,
- InstrStage<26, [MEC_RSV]>] >,
- InstrItinData<IIC_SSE_SQRTSD_RR, [InstrStage<13, [FPC_RSV0]>] >,
- InstrItinData<IIC_SSE_SQRTSD_RM, [InstrStage<13, [FPC_RSV0], 0>,
- InstrStage<13, [MEC_RSV]>] >,
-
- InstrItinData<IIC_SSE_RCPP_RR, [InstrStage<9, [FPC_RSV0]>] >,
- InstrItinData<IIC_SSE_RCPP_RM, [InstrStage<9, [FPC_RSV0], 0>,
- InstrStage<9, [MEC_RSV]>] >,
- InstrItinData<IIC_SSE_RCPS_RR, [InstrStage<4, [FPC_RSV0]>] >,
- InstrItinData<IIC_SSE_RCPS_RM, [InstrStage<4, [FPC_RSV0], 0>,
- InstrStage<4, [MEC_RSV]>] >,
-
- InstrItinData<IIC_SSE_MOVMSK, [InstrStage<1, [FPC_RSV0, FPC_RSV1]>] >,
- InstrItinData<IIC_SSE_MASKMOV, [InstrStage<5, [FPC_RSV0, FPC_RSV1]>] >,
-
- InstrItinData<IIC_SSE_PEXTRW, [InstrStage<4, [FPC_RSV0, FPC_RSV1]>] >,
- InstrItinData<IIC_SSE_PINSRW, [InstrStage<1, [FPC_RSV0, FPC_RSV1]>] >,
-
- InstrItinData<IIC_SSE_PABS_RR, [InstrStage<1, [FPC_RSV0, FPC_RSV1]>] >,
- InstrItinData<IIC_SSE_PABS_RM, [InstrStage<1, [FPC_RSV0, FPC_RSV1], 0>,
- InstrStage<1, [MEC_RSV]>] >,
-
- InstrItinData<IIC_SSE_MOV_S_RR, [InstrStage<1, [FPC_RSV0, FPC_RSV1]>] >,
- InstrItinData<IIC_SSE_MOV_S_RM, [InstrStage<1, [FPC_RSV0, FPC_RSV1], 0>,
- InstrStage<1, [MEC_RSV]>] >,
- InstrItinData<IIC_SSE_MOV_S_MR, [InstrStage<1, [FPC_RSV0, FPC_RSV1], 0>,
- InstrStage<1, [MEC_RSV]>] >,
-
- InstrItinData<IIC_SSE_MOVA_P_RR, [InstrStage<1, [FPC_RSV0, FPC_RSV1]>] >,
- InstrItinData<IIC_SSE_MOVA_P_RM, [InstrStage<1, [FPC_RSV0, FPC_RSV1], 0>,
- InstrStage<1, [MEC_RSV]>] >,
- InstrItinData<IIC_SSE_MOVA_P_MR, [InstrStage<1, [FPC_RSV0, FPC_RSV1], 0>,
- InstrStage<1, [MEC_RSV]>] >,
-
- InstrItinData<IIC_SSE_MOVU_P_RR, [InstrStage<1, [FPC_RSV0, FPC_RSV1]>] >,
- InstrItinData<IIC_SSE_MOVU_P_RM, [InstrStage<1, [FPC_RSV0, FPC_RSV1], 0>,
- InstrStage<1, [MEC_RSV]>] >,
- InstrItinData<IIC_SSE_MOVU_P_MR, [InstrStage<1, [FPC_RSV0, FPC_RSV1], 0>,
- InstrStage<1, [MEC_RSV]>] >,
-
- InstrItinData<IIC_SSE_MOV_LH, [InstrStage<1, [FPC_RSV0, FPC_RSV1]>] >,
-
- InstrItinData<IIC_SSE_LDDQU, [InstrStage<1, [FPC_RSV0, FPC_RSV1]>] >,
-
- InstrItinData<IIC_SSE_MOVDQ, [InstrStage<1, [FPC_RSV0, FPC_RSV1]>] >,
- InstrItinData<IIC_SSE_MOVD_ToGP, [InstrStage<1, [IEC_RSV0, IEC_RSV1], 0>,
- InstrStage<1, [FPC_RSV0, FPC_RSV1]>] >,
- InstrItinData<IIC_SSE_MOVQ_RR, [InstrStage<1, [FPC_RSV0, FPC_RSV1]>] >,
-
- InstrItinData<IIC_SSE_MOVNT, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-
- InstrItinData<IIC_SSE_PREFETCH, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_SSE_PAUSE, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_SSE_LFENCE, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_SSE_MFENCE, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_SSE_SFENCE, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_SSE_LDMXCSR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_SSE_STMXCSR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-
- InstrItinData<IIC_SSE_PHADDSUBD_RR, [InstrStage<6, [FPC_RSV0, FPC_RSV1]>] >,
- InstrItinData<IIC_SSE_PHADDSUBD_RM, [InstrStage<6, [FPC_RSV0, FPC_RSV1], 0>,
- InstrStage<6, [MEC_RSV]>] >,
- InstrItinData<IIC_SSE_PHADDSUBSW_RR, [InstrStage<9, [FPC_RSV0, FPC_RSV1]>] >,
- InstrItinData<IIC_SSE_PHADDSUBSW_RM, [InstrStage<9, [FPC_RSV0, FPC_RSV1], 0>,
- InstrStage<9, [MEC_RSV]>] >,
- InstrItinData<IIC_SSE_PHADDSUBW_RR, [InstrStage<9, [FPC_RSV0, FPC_RSV1], 0>,
- InstrStage<9, [MEC_RSV]>] >,
- InstrItinData<IIC_SSE_PHADDSUBW_RM, [InstrStage<9, [FPC_RSV0, FPC_RSV1], 0>,
- InstrStage<9, [MEC_RSV]>] >,
- InstrItinData<IIC_SSE_PSHUFB_RR, [InstrStage<5, [FPC_RSV0, FPC_RSV1]>] >,
- InstrItinData<IIC_SSE_PSHUFB_RM, [InstrStage<5, [FPC_RSV0, FPC_RSV1], 0>,
- InstrStage<5, [MEC_RSV]>] >,
- InstrItinData<IIC_SSE_PSIGN_RR, [InstrStage<1, [FPC_RSV0, FPC_RSV1]>] >,
- InstrItinData<IIC_SSE_PSIGN_RM, [InstrStage<1, [FPC_RSV0, FPC_RSV1], 0>,
- InstrStage<1, [MEC_RSV]>] >,
-
- InstrItinData<IIC_SSE_PMADD, [InstrStage<5, [FPC_RSV0, FPC_RSV1]>] >,
- InstrItinData<IIC_SSE_PMULHRSW, [InstrStage<5, [FPC_RSV0, FPC_RSV1]>] >,
- InstrItinData<IIC_SSE_PALIGNRR, [InstrStage<1, [FPC_RSV0, FPC_RSV1]>] >,
- InstrItinData<IIC_SSE_PALIGNRM, [InstrStage<1, [FPC_RSV0, FPC_RSV1], 0>,
- InstrStage<1, [MEC_RSV]>] >,
- InstrItinData<IIC_SSE_MWAIT, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_SSE_MONITOR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-
- // conversions
- // to/from PD ...
- InstrItinData<IIC_SSE_CVT_PD_RR, [InstrStage<5, [FPC_RSV0, FPC_RSV1]>] >,
- InstrItinData<IIC_SSE_CVT_PD_RM, [InstrStage<5, [FPC_RSV0, FPC_RSV1], 0>,
- InstrStage<5, [MEC_RSV]>] >,
- // to/from PS except to/from PD and PS2PI
- InstrItinData<IIC_SSE_CVT_PS_RR, [InstrStage<4, [FPC_RSV0, FPC_RSV1]>] >,
- InstrItinData<IIC_SSE_CVT_PS_RM, [InstrStage<4, [FPC_RSV0, FPC_RSV1], 0>,
- InstrStage<4, [MEC_RSV]>] >,
- InstrItinData<IIC_SSE_CVT_Scalar_RR, [InstrStage<1, [FPC_RSV0, FPC_RSV1]>] >,
- InstrItinData<IIC_SSE_CVT_Scalar_RM, [InstrStage<1, [FPC_RSV0, FPC_RSV1], 0>,
- InstrStage<1, [MEC_RSV]>] >,
- InstrItinData<IIC_SSE_CVT_SS2SI32_RR, [InstrStage<4, [FPC_RSV0, FPC_RSV1]>] >,
- InstrItinData<IIC_SSE_CVT_SS2SI32_RM, [InstrStage<4, [FPC_RSV0, FPC_RSV1], 0>,
- InstrStage<4, [MEC_RSV]>] >,
- InstrItinData<IIC_SSE_CVT_SS2SI64_RR, [InstrStage<4, [FPC_RSV0, FPC_RSV1]>] >,
- InstrItinData<IIC_SSE_CVT_SS2SI64_RM, [InstrStage<4, [FPC_RSV0, FPC_RSV1], 0>,
- InstrStage<4, [MEC_RSV]>] >,
- InstrItinData<IIC_SSE_CVT_SD2SI_RR, [InstrStage<4, [FPC_RSV0, FPC_RSV1]>] >,
- InstrItinData<IIC_SSE_CVT_SD2SI_RM, [InstrStage<4, [FPC_RSV0, FPC_RSV1], 0>,
- InstrStage<4, [MEC_RSV]>] >,
-
- // MMX MOVs
- InstrItinData<IIC_MMX_MOV_MM_RM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_MMX_MOV_REG_MM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_MMX_MOVQ_RM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_MMX_MOVQ_RR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- // other MMX
- InstrItinData<IIC_MMX_ALU_RM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_MMX_ALU_RR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_MMX_ALUQ_RM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_MMX_ALUQ_RR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_MMX_PHADDSUBW_RM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_MMX_PHADDSUBW_RR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_MMX_PHADDSUBD_RM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_MMX_PHADDSUBD_RR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_MMX_PMUL, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_MMX_MISC_FUNC_MEM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_MMX_MISC_FUNC_REG, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_MMX_PSADBW, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_MMX_SHIFT_RI, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_MMX_SHIFT_RM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_MMX_SHIFT_RR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_MMX_UNPCK_H_RM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_MMX_UNPCK_H_RR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_MMX_UNPCK_L, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_MMX_PCK_RM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_MMX_PCK_RR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_MMX_PSHUF, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_MMX_PEXTR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_MMX_PINSRW, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_MMX_MASKMOV, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- // conversions
- // from/to PD
- InstrItinData<IIC_MMX_CVT_PD_RR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_MMX_CVT_PD_RM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- // from/to PI
- InstrItinData<IIC_MMX_CVT_PS_RR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_MMX_CVT_PS_RM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-
- InstrItinData<IIC_CMPX_LOCK, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_CMPX_LOCK_8, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_CMPX_LOCK_8B, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_CMPX_LOCK_16B, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-
- InstrItinData<IIC_XADD_LOCK_MEM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_XADD_LOCK_MEM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-
- InstrItinData<IIC_FILD, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_FLD, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_FLD80, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+def SLMModel : SchedMachineModel {
+ // All x86 instructions are modeled as a single micro-op, and SLM can decode 2
+ // instructions per cycle.
+ let IssueWidth = 2;
+ let MicroOpBufferSize = 32; // Based on the reorder buffer.
+ let LoadLatency = 3;
+ let MispredictPenalty = 10;
+
+ // For small loops, expand by a small factor to hide the backedge cost.
+ let LoopMicroOpBufferSize = 10;
+
+ // FIXME: SSE4 is unimplemented. This flag is set to allow
+ // the scheduler to assign a default model to unrecognized opcodes.
+ let CompleteModel = 0;
+}
- InstrItinData<IIC_FST, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_FST80, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_FIST, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+let SchedModel = SLMModel in {
+
+// Silvermont has 5 reservation stations for micro-ops
+
+def IEC_RSV0 : ProcResource<1>;
+def IEC_RSV1 : ProcResource<1>;
+def FPC_RSV0 : ProcResource<1> { let BufferSize = 1; }
+def FPC_RSV1 : ProcResource<1> { let BufferSize = 1; }
+def MEC_RSV : ProcResource<1>;
+
+// Many micro-ops are capable of issuing on multiple ports.
+def IEC_RSV01 : ProcResGroup<[IEC_RSV0, IEC_RSV1]>;
+def FPC_RSV01 : ProcResGroup<[FPC_RSV0, FPC_RSV1]>;
+
+def SMDivider : ProcResource<1>;
+def SMFPMultiplier : ProcResource<1>;
+def SMFPDivider : ProcResource<1>;
+
+// Loads are 3 cycles, so ReadAfterLd registers needn't be available until 3
+// cycles after the memory operand.
+def : ReadAdvance<ReadAfterLd, 3>;
+
+// Many SchedWrites are defined in pairs with and without a folded load.
+// Instructions with folded loads are usually micro-fused, so they only appear
+// as two micro-ops when queued in the reservation station.
+// This multiclass defines the resource usage for variants with and without
+// folded loads.
+multiclass SMWriteResPair<X86FoldableSchedWrite SchedRW,
+ ProcResourceKind ExePort,
+ int Lat> {
+ // The register variant uses a single cycle on ExePort.
+ def : WriteRes<SchedRW, [ExePort]> { let Latency = Lat; }
+
+ // Memory variant also uses a cycle on MEC_RSV and adds 3 cycles to the
+ // latency.
+ def : WriteRes<SchedRW.Folded, [MEC_RSV, ExePort]> {
+ let Latency = !add(Lat, 3);
+ }
+}
- InstrItinData<IIC_FLDZ, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_FUCOM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_FUCOMI, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_FCOMI, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_FNSTSW, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_FNSTCW, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_FLDCW, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_FNINIT, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_FFREE, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_FNCLEX, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_WAIT, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_FXAM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_FNOP, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_FLDL, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_F2XM1, [InstrStage<88, [FPC_RSV0, FPC_RSV1]>] >,
- InstrItinData<IIC_FYL2X, [InstrStage<296, [FPC_RSV0, FPC_RSV1]>] >,
- InstrItinData<IIC_FPTAN, [InstrStage<281, [FPC_RSV0, FPC_RSV1]>] >,
- InstrItinData<IIC_FPATAN, [InstrStage<296, [FPC_RSV0, FPC_RSV1]>] >,
- InstrItinData<IIC_FXTRACT, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_FPREM1, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_FPSTP, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_FPREM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_FYL2XP1, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_FSINCOS, [InstrStage<281, [FPC_RSV0, FPC_RSV1]>] >,
- InstrItinData<IIC_FRNDINT, [InstrStage<25, [FPC_RSV0, FPC_RSV1]>] >,
- InstrItinData<IIC_FSCALE, [InstrStage<74, [FPC_RSV0, FPC_RSV1]>] >,
- InstrItinData<IIC_FCOMPP, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_FXSAVE, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_FXRSTOR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_FXCH, [InstrStage<1, [FPC_RSV0, FPC_RSV1]>] >,
+// A folded store needs a cycle on MEC_RSV for the store data, but it does not
+// need an extra port cycle to recompute the address.
+def : WriteRes<WriteRMW, [MEC_RSV]>;
+
+def : WriteRes<WriteStore, [IEC_RSV01, MEC_RSV]>;
+def : WriteRes<WriteLoad, [MEC_RSV]> { let Latency = 3; }
+def : WriteRes<WriteMove, [IEC_RSV01]>;
+def : WriteRes<WriteZero, []>;
+
+defm : SMWriteResPair<WriteALU, IEC_RSV01, 1>;
+defm : SMWriteResPair<WriteIMul, IEC_RSV1, 3>;
+defm : SMWriteResPair<WriteShift, IEC_RSV0, 1>;
+defm : SMWriteResPair<WriteJump, IEC_RSV1, 1>;
+
+// This is for simple LEAs with one or two input operands.
+// The complex ones can only execute on port 1, and they require two cycles on
+// the port to read all inputs. We don't model that.
+def : WriteRes<WriteLEA, [IEC_RSV1]>;
+
+// This is quite rough; latency depends on the dividend.
+def : WriteRes<WriteIDiv, [IEC_RSV01, SMDivider]> {
+ let Latency = 25;
+ let ResourceCycles = [1, 25];
+}
+def : WriteRes<WriteIDivLd, [MEC_RSV, IEC_RSV01, SMDivider]> {
+ let Latency = 29;
+ let ResourceCycles = [1, 1, 25];
+}
- // System instructions
- InstrItinData<IIC_CPUID, [InstrStage<60, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_INT, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_INT3, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_INVD, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_INVLPG, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_IRET, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_HLT, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_LXS, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_LTR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_RDTSC, [InstrStage<30, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_RSM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_SIDT, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_SGDT, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_SLDT, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_STR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_SWAPGS, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_SYSCALL, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_SYS_ENTER_EXIT, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+// Scalar and vector floating point.
+defm : SMWriteResPair<WriteFAdd, FPC_RSV1, 3>;
+defm : SMWriteResPair<WriteFRcp, FPC_RSV0, 5>;
+defm : SMWriteResPair<WriteFSqrt, FPC_RSV0, 15>;
+defm : SMWriteResPair<WriteCvtF2I, FPC_RSV01, 4>;
+defm : SMWriteResPair<WriteCvtI2F, FPC_RSV01, 4>;
+defm : SMWriteResPair<WriteCvtF2F, FPC_RSV01, 4>;
+defm : SMWriteResPair<WriteFShuffle, FPC_RSV0, 1>;
+defm : SMWriteResPair<WriteFBlend, FPC_RSV0, 1>;
+
+// This is quite rough; latency depends on precision.
+def : WriteRes<WriteFMul, [FPC_RSV0, SMFPMultiplier]> {
+ let Latency = 5;
+ let ResourceCycles = [1, 2];
+}
+def : WriteRes<WriteFMulLd, [MEC_RSV, FPC_RSV0, SMFPMultiplier]> {
+ let Latency = 8;
+ let ResourceCycles = [1, 1, 2];
+}
- InstrItinData<IIC_IN_RR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_IN_RI, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_OUT_RR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_OUT_IR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_INS, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+def : WriteRes<WriteFDiv, [FPC_RSV0, SMFPDivider]> {
+ let Latency = 34;
+ let ResourceCycles = [1, 34];
+}
+def : WriteRes<WriteFDivLd, [MEC_RSV, FPC_RSV0, SMFPDivider]> {
+ let Latency = 37;
+ let ResourceCycles = [1, 1, 34];
+}
- InstrItinData<IIC_MOV_REG_DR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_MOV_DR_REG, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- // worst case for mov REG_CRx
- InstrItinData<IIC_MOV_REG_CR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_MOV_CR_REG, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+// Vector integer operations.
+defm : SMWriteResPair<WriteVecShift, FPC_RSV0, 1>;
+defm : SMWriteResPair<WriteVecLogic, FPC_RSV01, 1>;
+defm : SMWriteResPair<WriteVecALU, FPC_RSV01, 1>;
+defm : SMWriteResPair<WriteVecIMul, FPC_RSV0, 4>;
+defm : SMWriteResPair<WriteShuffle, FPC_RSV0, 1>;
+defm : SMWriteResPair<WriteBlend, FPC_RSV0, 1>;
+defm : SMWriteResPair<WriteMPSAD, FPC_RSV0, 7>;
+
+// String instructions.
+// Packed Compare Implicit Length Strings, Return Mask
+def : WriteRes<WritePCmpIStrM, [FPC_RSV0]> {
+ let Latency = 13;
+ let ResourceCycles = [13];
+}
+def : WriteRes<WritePCmpIStrMLd, [FPC_RSV0, MEC_RSV]> {
+ let Latency = 13;
+ let ResourceCycles = [13, 1];
+}
- InstrItinData<IIC_MOV_REG_SR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_MOV_MEM_SR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_MOV_SR_REG, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_MOV_SR_MEM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- // LAR
- InstrItinData<IIC_LAR_RM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_LAR_RR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- // LSL
- InstrItinData<IIC_LSL_RM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_LSL_RR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+// Packed Compare Explicit Length Strings, Return Mask
+def : WriteRes<WritePCmpEStrM, [FPC_RSV0]> {
+ let Latency = 17;
+ let ResourceCycles = [17];
+}
+def : WriteRes<WritePCmpEStrMLd, [FPC_RSV0, MEC_RSV]> {
+ let Latency = 17;
+ let ResourceCycles = [17, 1];
+}
- InstrItinData<IIC_LGDT, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_LIDT, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_LLDT_REG, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_LLDT_MEM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- // push control register, segment registers
- InstrItinData<IIC_PUSH_CS, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_PUSH_SR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- // pop control register, segment registers
- InstrItinData<IIC_POP_SR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_POP_SR_SS, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- // VERR, VERW
- InstrItinData<IIC_VERR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_VERW_REG, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_VERW_MEM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- // WRMSR, RDMSR
- InstrItinData<IIC_WRMSR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_RDMSR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_RDPMC, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- // SMSW, LMSW
- InstrItinData<IIC_SMSW, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_LMSW_REG, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_LMSW_MEM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+// Packed Compare Implicit Length Strings, Return Index
+def : WriteRes<WritePCmpIStrI, [FPC_RSV0]> {
+ let Latency = 17;
+ let ResourceCycles = [17];
+}
+def : WriteRes<WritePCmpIStrILd, [FPC_RSV0, MEC_RSV]> {
+ let Latency = 17;
+ let ResourceCycles = [17, 1];
+}
- InstrItinData<IIC_ENTER, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_LEAVE, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+// Packed Compare Explicit Length Strings, Return Index
+def : WriteRes<WritePCmpEStrI, [FPC_RSV0]> {
+ let Latency = 21;
+ let ResourceCycles = [21];
+}
+def : WriteRes<WritePCmpEStrILd, [FPC_RSV0, MEC_RSV]> {
+ let Latency = 21;
+ let ResourceCycles = [21, 1];
+}
- InstrItinData<IIC_POP_MEM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_POP_REG16, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_POP_REG, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_POP_F, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_POP_FD, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_POP_A, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+// AES Instructions.
+def : WriteRes<WriteAESDecEnc, [FPC_RSV0]> {
+ let Latency = 8;
+ let ResourceCycles = [5];
+}
+def : WriteRes<WriteAESDecEncLd, [FPC_RSV0, MEC_RSV]> {
+ let Latency = 8;
+ let ResourceCycles = [5, 1];
+}
- InstrItinData<IIC_PUSH_IMM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_PUSH_MEM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_PUSH_REG, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_PUSH_F, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_PUSH_A, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+def : WriteRes<WriteAESIMC, [FPC_RSV0]> {
+ let Latency = 8;
+ let ResourceCycles = [5];
+}
+def : WriteRes<WriteAESIMCLd, [FPC_RSV0, MEC_RSV]> {
+ let Latency = 8;
+ let ResourceCycles = [5, 1];
+}
- InstrItinData<IIC_BSWAP, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_BIT_SCAN_MEM, [InstrStage<10, [IEC_RSV0, IEC_RSV1], 0>,
- InstrStage<10, [MEC_RSV]>] >,
- InstrItinData<IIC_BIT_SCAN_REG, [InstrStage<10, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_MOVS, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_STOS, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_SCAS, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_CMPS, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_MOV, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_MOV_MEM, [InstrStage<1, [IEC_RSV0, IEC_RSV1], 0>,
- InstrStage<1, [MEC_RSV]>] >,
- InstrItinData<IIC_AHF, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_BT_MI, [InstrStage<1, [IEC_RSV0, IEC_RSV1], 0>,
- InstrStage<1, [MEC_RSV]>] >,
- InstrItinData<IIC_BT_MR, [InstrStage<1, [IEC_RSV0, IEC_RSV1], 0>,
- InstrStage<1, [MEC_RSV]>] >,
- InstrItinData<IIC_BT_RI, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_BT_RR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_BTX_MI, [InstrStage<1, [IEC_RSV0, IEC_RSV1], 0>,
- InstrStage<1, [MEC_RSV]>] >,
- InstrItinData<IIC_BTX_MR, [InstrStage<1, [IEC_RSV0, IEC_RSV1], 0>,
- InstrStage<1, [MEC_RSV]>] >,
- InstrItinData<IIC_BTX_RI, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_BTX_RR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_XCHG_REG, [InstrStage<5, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_XCHG_MEM, [InstrStage<5, [IEC_RSV0, IEC_RSV1], 0>,
- InstrStage<5, [MEC_RSV]>] >,
- InstrItinData<IIC_XADD_REG, [InstrStage<5, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_XADD_MEM, [InstrStage<5, [IEC_RSV0, IEC_RSV1], 0>,
- InstrStage<5, [MEC_RSV]>] >,
- InstrItinData<IIC_CMPXCHG_MEM, [InstrStage<6, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_CMPXCHG_REG, [InstrStage<6, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_CMPXCHG_MEM8, [InstrStage<6, [IEC_RSV0, IEC_RSV1], 0>,
- InstrStage<6, [MEC_RSV]>] >,
- InstrItinData<IIC_CMPXCHG_REG8, [InstrStage<6, [IEC_RSV0, IEC_RSV1], 0>,
- InstrStage<6, [MEC_RSV]>] >,
- InstrItinData<IIC_CMPXCHG_8B, [InstrStage<6, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_CMPXCHG_16B, [InstrStage<6, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_LODS, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_OUTS, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_CLC, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_CLD, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_CLI, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_CMC, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_CLTS, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_STC, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_STI, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_STD, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_XLAT, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_AAA, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_AAD, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_AAM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_AAS, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_DAA, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_DAS, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_BOUND, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_ARPL_REG, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_ARPL_MEM, [InstrStage<1, [IEC_RSV0, IEC_RSV1], 0>,
- InstrStage<1, [MEC_RSV]>] >,
- InstrItinData<IIC_MOVBE, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_AES, [InstrStage<8, [FPC_RSV0]>] >,
- InstrItinData<IIC_BLEND_NOMEM, [InstrStage<4, [FPC_RSV0, FPC_RSV1]>] >,
- InstrItinData<IIC_BLEND_MEM, [InstrStage<4, [FPC_RSV0, FPC_RSV1], 0>,
- InstrStage<4, [MEC_RSV]>] >,
- InstrItinData<IIC_BIT_SCAN_MEM, [InstrStage<10, [IEC_RSV0, IEC_RSV1], 0>,
- InstrStage<10, [MEC_RSV]>] >,
- InstrItinData<IIC_BIT_SCAN_REG, [InstrStage<10, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_CBW, [InstrStage<4, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_CRC32_REG, [InstrStage<3, [IEC_RSV0, IEC_RSV1]>] >,
- InstrItinData<IIC_CRC32_MEM, [InstrStage<3, [IEC_RSV0, IEC_RSV1], 0>,
- InstrStage<3, [MEC_RSV]>] >,
- InstrItinData<IIC_SSE_DPPD_RR, [InstrStage<12, [FPC_RSV0, FPC_RSV1]>] >,
- InstrItinData<IIC_SSE_DPPD_RM, [InstrStage<12, [FPC_RSV0, FPC_RSV1], 0>,
- InstrStage<12, [MEC_RSV]>] >,
- InstrItinData<IIC_SSE_DPPS_RR, [InstrStage<15, [FPC_RSV0, FPC_RSV1]>] >,
- InstrItinData<IIC_SSE_DPPS_RM, [InstrStage<15, [FPC_RSV0, FPC_RSV1], 0>,
- InstrStage<15, [MEC_RSV]>] >,
- InstrItinData<IIC_MMX_EMMS, [InstrStage<10, [FPC_RSV0, FPC_RSV1]>] >,
- InstrItinData<IIC_SSE_EXTRACTPS_RR, [InstrStage<4, [FPC_RSV0, FPC_RSV1]>] >,
- InstrItinData<IIC_SSE_EXTRACTPS_RM, [InstrStage<4, [FPC_RSV0, FPC_RSV1], 0>,
- InstrStage<4, [MEC_RSV]>] >,
- InstrItinData<IIC_SSE_INSERTPS_RR, [InstrStage<1, [FPC_RSV0, FPC_RSV1]>] >,
- InstrItinData<IIC_SSE_INSERTPS_RM, [InstrStage<1, [FPC_RSV0, FPC_RSV1], 0>,
- InstrStage<1, [MEC_RSV]>] >,
- InstrItinData<IIC_SSE_MPSADBW_RR, [InstrStage<1, [FPC_RSV0, FPC_RSV1]>] >,
- InstrItinData<IIC_SSE_MPSADBW_RM, [InstrStage<1, [FPC_RSV0, FPC_RSV1], 0>,
- InstrStage<1, [MEC_RSV]>] >,
- InstrItinData<IIC_SSE_PMULLD_RR, [InstrStage<11, [FPC_RSV0, FPC_RSV1]>] >,
- InstrItinData<IIC_SSE_PMULLD_RM, [InstrStage<11, [FPC_RSV0, FPC_RSV1], 0>,
- InstrStage<11, [MEC_RSV]>] >,
- InstrItinData<IIC_SSE_ROUNDPS_REG, [InstrStage<5, [FPC_RSV0, FPC_RSV1]>] >,
- InstrItinData<IIC_SSE_ROUNDPS_MEM, [InstrStage<5, [FPC_RSV0, FPC_RSV1], 0>,
- InstrStage<5, [MEC_RSV]>] >,
- InstrItinData<IIC_SSE_ROUNDPD_REG, [InstrStage<4, [FPC_RSV0, FPC_RSV1]>] >,
- InstrItinData<IIC_SSE_ROUNDPD_MEM, [InstrStage<4, [FPC_RSV0, FPC_RSV1], 0>,
- InstrStage<4, [MEC_RSV]>] >,
- InstrItinData<IIC_SSE_POPCNT_RR, [InstrStage<4, [IEC_RSV1]>] >,
- InstrItinData<IIC_SSE_POPCNT_RM, [InstrStage<4, [IEC_RSV1], 0>,
- InstrStage<4, [MEC_RSV]>] >,
- InstrItinData<IIC_SSE_PCLMULQDQ_RR, [InstrStage<10, [IEC_RSV1]>] >,
- InstrItinData<IIC_SSE_PCLMULQDQ_RM, [InstrStage<10, [IEC_RSV1], 0>,
- InstrStage<10, [MEC_RSV]>] >,
+def : WriteRes<WriteAESKeyGen, [FPC_RSV0]> {
+ let Latency = 8;
+ let ResourceCycles = [5];
+}
+def : WriteRes<WriteAESKeyGenLd, [FPC_RSV0, MEC_RSV]> {
+ let Latency = 8;
+ let ResourceCycles = [5, 1];
+}
- InstrItinData<IIC_NOP, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >
- ]>;
+// Carry-less multiplication instructions.
+def : WriteRes<WriteCLMul, [FPC_RSV0]> {
+ let Latency = 10;
+ let ResourceCycles = [10];
+}
+def : WriteRes<WriteCLMulLd, [FPC_RSV0, MEC_RSV]> {
+ let Latency = 10;
+ let ResourceCycles = [10, 1];
+}
-// Silvermont machine model.
-def SLMModel : SchedMachineModel {
- let IssueWidth = 2; // Allows 2 instructions per scheduling group.
- let MinLatency = 1; // InstrStage cycles overrides MinLatency.
- // OperandCycles may be used for expected latency.
- let LoadLatency = 3; // Expected cycles, may be overriden by OperandCycles.
- let HighLatency = 30;// Expected, may be overriden by OperandCycles.
- let Itineraries = SLMItineraries;
-}
+def : WriteRes<WriteSystem, [FPC_RSV0]> { let Latency = 100; }
+def : WriteRes<WriteMicrocoded, [FPC_RSV0]> { let Latency = 100; }
+def : WriteRes<WriteFence, [MEC_RSV]>;
+def : WriteRes<WriteNop, []>;
+
+// AVX is not supported on that architecture, but we should define the basic
+// scheduling resources anyway.
+def : WriteRes<WriteIMulH, [FPC_RSV0]>;
+defm : SMWriteResPair<WriteVarBlend, FPC_RSV0, 1>;
+defm : SMWriteResPair<WriteFVarBlend, FPC_RSV0, 1>;
+defm : SMWriteResPair<WriteFShuffle256, FPC_RSV0, 1>;
+defm : SMWriteResPair<WriteShuffle256, FPC_RSV0, 1>;
+defm : SMWriteResPair<WriteVarVecShift, FPC_RSV0, 1>;
+} // SchedModel
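One relation worth making explicit in the model above: SMWriteResPair gives every folded-load variant the register latency plus the model's 3-cycle LoadLatency,

    Lat_folded = Lat_reg + LoadLatency = Lat + 3

while the ReadAdvance<ReadAfterLd, 3> entry means registers read through ReadAfterLd need not be available until 3 cycles after issue, so a dependent load-op sequence is not double-charged for the load on the critical path.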
diff --git a/lib/Target/X86/X86SelectionDAGInfo.cpp b/lib/Target/X86/X86SelectionDAGInfo.cpp
index b9c620f..744890d 100644
--- a/lib/Target/X86/X86SelectionDAGInfo.cpp
+++ b/lib/Target/X86/X86SelectionDAGInfo.cpp
@@ -11,12 +11,13 @@
//
//===----------------------------------------------------------------------===//
-#define DEBUG_TYPE "x86-selectiondag-info"
#include "X86TargetMachine.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/IR/DerivedTypes.h"
using namespace llvm;
+#define DEBUG_TYPE "x86-selectiondag-info"
+
X86SelectionDAGInfo::X86SelectionDAGInfo(const X86TargetMachine &TM) :
TargetSelectionDAGInfo(TM),
Subtarget(&TM.getSubtarget<X86Subtarget>()),
@@ -50,7 +51,7 @@ X86SelectionDAGInfo::EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl,
ConstantSDNode *V = dyn_cast<ConstantSDNode>(Src);
if (const char *bzeroEntry = V &&
- V->isNullValue() ? Subtarget->getBZeroEntry() : 0) {
+ V->isNullValue() ? Subtarget->getBZeroEntry() : nullptr) {
EVT IntPtr = TLI.getPointerTy();
Type *IntPtrTy = getDataLayout()->getIntPtrType(*DAG.getContext());
TargetLowering::ArgListTy Args;
@@ -60,15 +61,14 @@ X86SelectionDAGInfo::EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl,
Args.push_back(Entry);
Entry.Node = Size;
Args.push_back(Entry);
- TargetLowering::
- CallLoweringInfo CLI(Chain, Type::getVoidTy(*DAG.getContext()),
- false, false, false, false,
- 0, CallingConv::C, /*isTailCall=*/false,
- /*doesNotRet=*/false, /*isReturnValueUsed=*/false,
- DAG.getExternalSymbol(bzeroEntry, IntPtr), Args,
- DAG, dl);
- std::pair<SDValue,SDValue> CallResult =
- TLI.LowerCallTo(CLI);
+
+ TargetLowering::CallLoweringInfo CLI(DAG);
+ CLI.setDebugLoc(dl).setChain(Chain)
+ .setCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()),
+ DAG.getExternalSymbol(bzeroEntry, IntPtr), &Args, 0)
+ .setDiscardResult();
+
+ std::pair<SDValue,SDValue> CallResult = TLI.LowerCallTo(CLI);
return CallResult.second;
}
@@ -77,7 +77,7 @@ X86SelectionDAGInfo::EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl,
}
uint64_t SizeVal = ConstantSize->getZExtValue();
- SDValue InFlag(0, 0);
+ SDValue InFlag;
EVT AVT;
SDValue Count;
ConstantSDNode *ValC = dyn_cast<ConstantSDNode>(Src);
@@ -139,7 +139,7 @@ X86SelectionDAGInfo::EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl,
SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
SDValue Ops[] = { Chain, DAG.getValueType(AVT), InFlag };
- Chain = DAG.getNode(X86ISD::REP_STOS, dl, Tys, Ops, array_lengthof(Ops));
+ Chain = DAG.getNode(X86ISD::REP_STOS, dl, Tys, Ops);
if (TwoRepStos) {
InFlag = Chain.getValue(1);
@@ -153,7 +153,7 @@ X86SelectionDAGInfo::EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl,
InFlag = Chain.getValue(1);
Tys = DAG.getVTList(MVT::Other, MVT::Glue);
SDValue Ops[] = { Chain, DAG.getValueType(MVT::i8), InFlag };
- Chain = DAG.getNode(X86ISD::REP_STOS, dl, Tys, Ops, array_lengthof(Ops));
+ Chain = DAG.getNode(X86ISD::REP_STOS, dl, Tys, Ops);
} else if (BytesLeft) {
// Handle the last 1 - 7 bytes.
unsigned Offset = SizeVal - BytesLeft;
@@ -225,7 +225,7 @@ X86SelectionDAGInfo::EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl,
SDValue Count = DAG.getIntPtrConstant(CountVal);
unsigned BytesLeft = SizeVal % UBytes;
- SDValue InFlag(0, 0);
+ SDValue InFlag;
Chain = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RCX :
X86::ECX,
Count, InFlag);
@@ -241,8 +241,7 @@ X86SelectionDAGInfo::EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl,
SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
SDValue Ops[] = { Chain, DAG.getValueType(AVT), InFlag };
- SDValue RepMovs = DAG.getNode(X86ISD::REP_MOVS, dl, Tys, Ops,
- array_lengthof(Ops));
+ SDValue RepMovs = DAG.getNode(X86ISD::REP_MOVS, dl, Tys, Ops);
SmallVector<SDValue, 4> Results;
Results.push_back(RepMovs);
@@ -263,6 +262,5 @@ X86SelectionDAGInfo::EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl,
SrcPtrInfo.getWithOffset(Offset)));
}
- return DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
- &Results[0], Results.size());
+ return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Results);
}
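The memset hunk in this file also demonstrates the patch's other recurring API change: TargetLowering::CallLoweringInfo is now populated through chained setters instead of one constructor taking a dozen positional arguments. Below is the new code from the hunk above, re-spaced with a comment on each step; bzeroEntry, Args, IntPtr, and dl are as set up earlier in EmitTargetCodeForMemset:

    // Each setter returns the CallLoweringInfo itself, so the call
    // description is built up in one fluent statement.
    TargetLowering::CallLoweringInfo CLI(DAG);
    CLI.setDebugLoc(dl)                        // source location for the call
       .setChain(Chain)                        // incoming chain token
       .setCallee(CallingConv::C,              // C calling convention
                  Type::getVoidTy(*DAG.getContext()),        // void return
                  DAG.getExternalSymbol(bzeroEntry, IntPtr), // callee symbol
                  &Args, 0)                    // argument list
       .setDiscardResult();                    // nobody reads the result

    std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
    return CallResult.second;                  // .second is the output chain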
diff --git a/lib/Target/X86/X86Subtarget.cpp b/lib/Target/X86/X86Subtarget.cpp
index 207d0ba..989e0d6 100644
--- a/lib/Target/X86/X86Subtarget.cpp
+++ b/lib/Target/X86/X86Subtarget.cpp
@@ -11,12 +11,12 @@
//
//===----------------------------------------------------------------------===//
-#define DEBUG_TYPE "subtarget"
#include "X86Subtarget.h"
#include "X86InstrInfo.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
+#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/Host.h"
@@ -24,15 +24,24 @@
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
+#if defined(_MSC_VER)
+#include <intrin.h>
+#endif
+
+using namespace llvm;
+
+#define DEBUG_TYPE "subtarget"
+
#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#include "X86GenSubtargetInfo.inc"
-using namespace llvm;
+// Temporary option to control early if-conversion for x86 while adding machine
+// models.
+static cl::opt<bool>
+X86EarlyIfConv("x86-early-ifcvt", cl::Hidden,
+ cl::desc("Enable early if-conversion on X86"));
-#if defined(_MSC_VER)
-#include <intrin.h>
-#endif
/// ClassifyBlockAddressReference - Classify a blockaddress reference for the
/// current subtarget according to how we should reference it in a non-pcrel
@@ -153,7 +162,7 @@ const char *X86Subtarget::getBZeroEntry() const {
!getTargetTriple().isMacOSXVersionLT(10, 6))
return "__bzero";
- return 0;
+ return nullptr;
}
bool X86Subtarget::hasSinCos() const {
@@ -173,251 +182,16 @@ bool X86Subtarget::IsLegalToCallImmediateAddr(const TargetMachine &TM) const {
return isTargetELF() || TM.getRelocationModel() == Reloc::Static;
}
-static bool OSHasAVXSupport() {
-#if defined(i386) || defined(__i386__) || defined(__x86__) || defined(_M_IX86)\
- || defined(__x86_64__) || defined(_M_AMD64) || defined (_M_X64)
-#if defined(__GNUC__)
- // Check xgetbv; this uses a .byte sequence instead of the instruction
- // directly because older assemblers do not include support for xgetbv and
- // there is no easy way to conditionally compile based on the assembler used.
- int rEAX, rEDX;
- __asm__ (".byte 0x0f, 0x01, 0xd0" : "=a" (rEAX), "=d" (rEDX) : "c" (0));
-#elif defined(_MSC_FULL_VER) && defined(_XCR_XFEATURE_ENABLED_MASK)
- unsigned long long rEAX = _xgetbv(_XCR_XFEATURE_ENABLED_MASK);
-#else
- int rEAX = 0; // Ensures we return false
-#endif
- return (rEAX & 6) == 6;
-#else
- return false;
-#endif
-}
-
-void X86Subtarget::AutoDetectSubtargetFeatures() {
- unsigned EAX = 0, EBX = 0, ECX = 0, EDX = 0;
- unsigned MaxLevel;
- union {
- unsigned u[3];
- char c[12];
- } text;
-
- if (X86_MC::GetCpuIDAndInfo(0, &MaxLevel, text.u+0, text.u+2, text.u+1) ||
- MaxLevel < 1)
- return;
-
- X86_MC::GetCpuIDAndInfo(0x1, &EAX, &EBX, &ECX, &EDX);
-
- if ((EDX >> 15) & 1) { HasCMov = true; ToggleFeature(X86::FeatureCMOV); }
- if ((EDX >> 23) & 1) { X86SSELevel = MMX; ToggleFeature(X86::FeatureMMX); }
- if ((EDX >> 25) & 1) { X86SSELevel = SSE1; ToggleFeature(X86::FeatureSSE1); }
- if ((EDX >> 26) & 1) { X86SSELevel = SSE2; ToggleFeature(X86::FeatureSSE2); }
- if (ECX & 0x1) { X86SSELevel = SSE3; ToggleFeature(X86::FeatureSSE3); }
- if ((ECX >> 9) & 1) { X86SSELevel = SSSE3; ToggleFeature(X86::FeatureSSSE3);}
- if ((ECX >> 19) & 1) { X86SSELevel = SSE41; ToggleFeature(X86::FeatureSSE41);}
- if ((ECX >> 20) & 1) { X86SSELevel = SSE42; ToggleFeature(X86::FeatureSSE42);}
- if (((ECX >> 27) & 1) && ((ECX >> 28) & 1) && OSHasAVXSupport()) {
- X86SSELevel = AVX; ToggleFeature(X86::FeatureAVX);
- }
-
- bool IsIntel = memcmp(text.c, "GenuineIntel", 12) == 0;
- bool IsAMD = !IsIntel && memcmp(text.c, "AuthenticAMD", 12) == 0;
-
- if ((ECX >> 1) & 0x1) {
- HasPCLMUL = true;
- ToggleFeature(X86::FeaturePCLMUL);
- }
- if ((ECX >> 12) & 0x1) {
- HasFMA = true;
- ToggleFeature(X86::FeatureFMA);
- }
- if (IsIntel && ((ECX >> 22) & 0x1)) {
- HasMOVBE = true;
- ToggleFeature(X86::FeatureMOVBE);
- }
- if ((ECX >> 23) & 0x1) {
- HasPOPCNT = true;
- ToggleFeature(X86::FeaturePOPCNT);
- }
- if ((ECX >> 25) & 0x1) {
- HasAES = true;
- ToggleFeature(X86::FeatureAES);
- }
- if ((ECX >> 29) & 0x1) {
- HasF16C = true;
- ToggleFeature(X86::FeatureF16C);
- }
- if (IsIntel && ((ECX >> 30) & 0x1)) {
- HasRDRAND = true;
- ToggleFeature(X86::FeatureRDRAND);
- }
-
- if ((ECX >> 13) & 0x1) {
- HasCmpxchg16b = true;
- ToggleFeature(X86::FeatureCMPXCHG16B);
- }
-
- if (IsIntel || IsAMD) {
- // Determine if bit test memory instructions are slow.
- unsigned Family = 0;
- unsigned Model = 0;
- X86_MC::DetectFamilyModel(EAX, Family, Model);
- if (IsAMD || (Family == 6 && Model >= 13)) {
- IsBTMemSlow = true;
- ToggleFeature(X86::FeatureSlowBTMem);
- }
-
- // Determine if SHLD/SHRD instructions have higher latency than the
- // equivalent series of shifts/or instructions.
- // FIXME: Add Intel's processors that have SHLD instructions with very
- // poor latency.
- if (IsAMD) {
- IsSHLDSlow = true;
- ToggleFeature(X86::FeatureSlowSHLD);
- }
-
- // If it's an Intel chip from Nehalem onward and not an Atom chip, unaligned
- // memory access is fast. We hard-code model numbers here because they
- // don't appear to increase strictly across Intel chips.
- if (IsIntel &&
- ((Family == 6 && Model == 0x1E) || // Nehalem: Clarksfield, Lynnfield,
- // Jasper Forest
- (Family == 6 && Model == 0x1A) || // Nehalem: Bloomfield, Nehalem-EP
- (Family == 6 && Model == 0x2E) || // Nehalem: Nehalem-EX
- (Family == 6 && Model == 0x25) || // Westmere: Arrandale, Clarksdale
- (Family == 6 && Model == 0x2C) || // Westmere: Gulftown, Westmere-EP
- (Family == 6 && Model == 0x2F) || // Westmere: Westmere-EX
- (Family == 6 && Model == 0x2A) || // SandyBridge
- (Family == 6 && Model == 0x2D) || // SandyBridge: SandyBridge-E*
- (Family == 6 && Model == 0x3A) || // IvyBridge
- (Family == 6 && Model == 0x3E) || // IvyBridge EP
- (Family == 6 && Model == 0x3C) || // Haswell
- (Family == 6 && Model == 0x3F) || // ...
- (Family == 6 && Model == 0x45) || // ...
- (Family == 6 && Model == 0x46))) { // ...
- IsUAMemFast = true;
- ToggleFeature(X86::FeatureFastUAMem);
- }
-
- // Set processor type. Currently only Atom or Silvermont (SLM) is detected.
- if (Family == 6 &&
- (Model == 28 || Model == 38 || Model == 39 ||
- Model == 53 || Model == 54)) {
- X86ProcFamily = IntelAtom;
-
- UseLeaForSP = true;
- ToggleFeature(X86::FeatureLeaForSP);
- }
- else if (Family == 6 &&
- (Model == 55 || Model == 74 || Model == 77)) {
- X86ProcFamily = IntelSLM;
- }
-
- unsigned MaxExtLevel;
- X86_MC::GetCpuIDAndInfo(0x80000000, &MaxExtLevel, &EBX, &ECX, &EDX);
-
- if (MaxExtLevel >= 0x80000001) {
- X86_MC::GetCpuIDAndInfo(0x80000001, &EAX, &EBX, &ECX, &EDX);
- if ((EDX >> 29) & 0x1) {
- HasX86_64 = true;
- ToggleFeature(X86::Feature64Bit);
- }
- if ((ECX >> 5) & 0x1) {
- HasLZCNT = true;
- ToggleFeature(X86::FeatureLZCNT);
- }
- if (IsIntel && ((ECX >> 8) & 0x1)) {
- HasPRFCHW = true;
- ToggleFeature(X86::FeaturePRFCHW);
- }
- if (IsAMD) {
- if ((ECX >> 6) & 0x1) {
- HasSSE4A = true;
- ToggleFeature(X86::FeatureSSE4A);
- }
- if ((ECX >> 11) & 0x1) {
- HasXOP = true;
- ToggleFeature(X86::FeatureXOP);
- }
- if ((ECX >> 16) & 0x1) {
- HasFMA4 = true;
- ToggleFeature(X86::FeatureFMA4);
- }
- }
- }
- }
-
- if (MaxLevel >= 7) {
- if (!X86_MC::GetCpuIDAndInfoEx(0x7, 0x0, &EAX, &EBX, &ECX, &EDX)) {
- if (IsIntel && (EBX & 0x1)) {
- HasFSGSBase = true;
- ToggleFeature(X86::FeatureFSGSBase);
- }
- if ((EBX >> 3) & 0x1) {
- HasBMI = true;
- ToggleFeature(X86::FeatureBMI);
- }
- if ((EBX >> 4) & 0x1) {
- HasHLE = true;
- ToggleFeature(X86::FeatureHLE);
- }
- if (IsIntel && ((EBX >> 5) & 0x1)) {
- X86SSELevel = AVX2;
- ToggleFeature(X86::FeatureAVX2);
- }
- if (IsIntel && ((EBX >> 8) & 0x1)) {
- HasBMI2 = true;
- ToggleFeature(X86::FeatureBMI2);
- }
- if (IsIntel && ((EBX >> 11) & 0x1)) {
- HasRTM = true;
- ToggleFeature(X86::FeatureRTM);
- }
- if (IsIntel && ((EBX >> 16) & 0x1)) {
- X86SSELevel = AVX512F;
- ToggleFeature(X86::FeatureAVX512);
- }
- if (IsIntel && ((EBX >> 18) & 0x1)) {
- HasRDSEED = true;
- ToggleFeature(X86::FeatureRDSEED);
- }
- if (IsIntel && ((EBX >> 19) & 0x1)) {
- HasADX = true;
- ToggleFeature(X86::FeatureADX);
- }
- if (IsIntel && ((EBX >> 26) & 0x1)) {
- HasPFI = true;
- ToggleFeature(X86::FeaturePFI);
- }
- if (IsIntel && ((EBX >> 27) & 0x1)) {
- HasERI = true;
- ToggleFeature(X86::FeatureERI);
- }
- if (IsIntel && ((EBX >> 28) & 0x1)) {
- HasCDI = true;
- ToggleFeature(X86::FeatureCDI);
- }
- if (IsIntel && ((EBX >> 29) & 0x1)) {
- HasSHA = true;
- ToggleFeature(X86::FeatureSHA);
- }
- }
- if (IsAMD && ((ECX >> 21) & 0x1)) {
- HasTBM = true;
- ToggleFeature(X86::FeatureTBM);
- }
- }
-}
-
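Every branch in the removed detector masks one architecturally documented CPUID bit: leaf 1 EDX bit 26 is SSE2, leaf 1 ECX bit 23 is POPCNT, leaf 7 EBX bit 5 is AVX2, and so on. A condensed sketch of the same decoding outside LLVM, assuming GCC/Clang's <cpuid.h> helper is available:

    #include <cpuid.h> // GCC/Clang helper providing __get_cpuid().

    struct BasicFeatures { bool sse2 = false, popcnt = false, aes = false; };

    // Hypothetical standalone analogue of the detection logic above.
    static BasicFeatures detectBasicFeatures() {
      BasicFeatures F;
      unsigned eax, ebx, ecx, edx;
      if (__get_cpuid(1, &eax, &ebx, &ecx, &edx)) {
        F.sse2   = (edx >> 26) & 1; // same bit the FeatureSSE2 test used
        F.popcnt = (ecx >> 23) & 1; // FeaturePOPCNT
        F.aes    = (ecx >> 25) & 1; // FeatureAES
      }
      return F;
    }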
void X86Subtarget::resetSubtargetFeatures(const MachineFunction *MF) {
AttributeSet FnAttrs = MF->getFunction()->getAttributes();
- Attribute CPUAttr = FnAttrs.getAttribute(AttributeSet::FunctionIndex,
- "target-cpu");
- Attribute FSAttr = FnAttrs.getAttribute(AttributeSet::FunctionIndex,
- "target-features");
+ Attribute CPUAttr =
+ FnAttrs.getAttribute(AttributeSet::FunctionIndex, "target-cpu");
+ Attribute FSAttr =
+ FnAttrs.getAttribute(AttributeSet::FunctionIndex, "target-features");
std::string CPU =
- !CPUAttr.hasAttribute(Attribute::None) ?CPUAttr.getValueAsString() : "";
+ !CPUAttr.hasAttribute(Attribute::None) ? CPUAttr.getValueAsString() : "";
std::string FS =
- !FSAttr.hasAttribute(Attribute::None) ? FSAttr.getValueAsString() : "";
+ !FSAttr.hasAttribute(Attribute::None) ? FSAttr.getValueAsString() : "";
if (!FS.empty()) {
initializeEnvironment();
resetSubtargetFeatures(CPU, FS);
@@ -426,54 +200,23 @@ void X86Subtarget::resetSubtargetFeatures(const MachineFunction *MF) {
void X86Subtarget::resetSubtargetFeatures(StringRef CPU, StringRef FS) {
std::string CPUName = CPU;
- if (!FS.empty() || !CPU.empty()) {
- if (CPUName.empty()) {
-#if defined(i386) || defined(__i386__) || defined(__x86__) || defined(_M_IX86)\
- || defined(__x86_64__) || defined(_M_AMD64) || defined (_M_X64)
- CPUName = sys::getHostCPUName();
-#else
- CPUName = "generic";
-#endif
- }
-
- // Make sure 64-bit features are available in 64-bit mode. (But make sure
- // SSE2 can be turned off explicitly.)
- std::string FullFS = FS;
- if (In64BitMode) {
- if (!FullFS.empty())
- FullFS = "+64bit,+sse2," + FullFS;
- else
- FullFS = "+64bit,+sse2";
- }
-
- // If feature string is not empty, parse features string.
- ParseSubtargetFeatures(CPUName, FullFS);
- } else {
- if (CPUName.empty()) {
-#if defined (__x86_64__) || defined(__i386__)
- CPUName = sys::getHostCPUName();
-#else
- CPUName = "generic";
-#endif
- }
- // Otherwise, use CPUID to auto-detect feature set.
- AutoDetectSubtargetFeatures();
-
- // Make sure 64-bit features are available in 64-bit mode.
- if (In64BitMode) {
- if (!HasX86_64) { HasX86_64 = true; ToggleFeature(X86::Feature64Bit); }
- if (!HasCMov) { HasCMov = true; ToggleFeature(X86::FeatureCMOV); }
-
- if (X86SSELevel < SSE2) {
- X86SSELevel = SSE2;
- ToggleFeature(X86::FeatureSSE1);
- ToggleFeature(X86::FeatureSSE2);
- }
- }
+ if (CPUName.empty())
+ CPUName = "generic";
+
+ // Make sure 64-bit features are available in 64-bit mode. (But make sure
+ // SSE2 can be turned off explicitly.)
+ std::string FullFS = FS;
+ if (In64BitMode) {
+ if (!FullFS.empty())
+ FullFS = "+64bit,+sse2," + FullFS;
+ else
+ FullFS = "+64bit,+sse2";
}
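The order of the concatenation matters: assuming ParseSubtargetFeatures applies entries left to right with later entries winning, prepending "+64bit,+sse2" turns the 64-bit requirements into defaults that an explicit "-sse2" in FS can still override. A hypothetical helper mirroring the logic above:

    #include <string>

    // Defaults first, user features last, so a user-specified "-sse2" still wins.
    static std::string buildFullFS(const std::string &FS, bool In64BitMode) {
      if (!In64BitMode)
        return FS;
      return FS.empty() ? "+64bit,+sse2" : "+64bit,+sse2," + FS;
    }

    // buildFullFS("-sse2,+avx", true) == "+64bit,+sse2,-sse2,+avx"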
- // CPUName may have been set by the CPU detection code. Make sure the
- // new MCSchedModel is used.
+ // Parse the features string.
+ ParseSubtargetFeatures(CPUName, FullFS);
+
+ // Make sure the right MCSchedModel is used.
InitCPUSchedModel(CPUName);
if (X86ProcFamily == IntelAtom || X86ProcFamily == IntelSLM)
@@ -547,33 +290,36 @@ void X86Subtarget::initializeEnvironment() {
PadShortFunctions = false;
CallRegIndirect = false;
LEAUsesAG = false;
+ SlowLEA = false;
stackAlignment = 4;
// FIXME: this is a known good value for Yonah. How about others?
MaxInlineSizeThreshold = 128;
}
X86Subtarget::X86Subtarget(const std::string &TT, const std::string &CPU,
- const std::string &FS,
- unsigned StackAlignOverride)
- : X86GenSubtargetInfo(TT, CPU, FS)
- , X86ProcFamily(Others)
- , PICStyle(PICStyles::None)
- , TargetTriple(TT)
- , StackAlignOverride(StackAlignOverride)
- , In64BitMode(TargetTriple.getArch() == Triple::x86_64)
- , In32BitMode(TargetTriple.getArch() == Triple::x86 &&
- TargetTriple.getEnvironment() != Triple::CODE16)
- , In16BitMode(TargetTriple.getArch() == Triple::x86 &&
- TargetTriple.getEnvironment() == Triple::CODE16) {
+ const std::string &FS, unsigned StackAlignOverride)
+ : X86GenSubtargetInfo(TT, CPU, FS), X86ProcFamily(Others),
+ PICStyle(PICStyles::None), TargetTriple(TT),
+ StackAlignOverride(StackAlignOverride),
+ In64BitMode(TargetTriple.getArch() == Triple::x86_64),
+ In32BitMode(TargetTriple.getArch() == Triple::x86 &&
+ TargetTriple.getEnvironment() != Triple::CODE16),
+ In16BitMode(TargetTriple.getArch() == Triple::x86 &&
+ TargetTriple.getEnvironment() == Triple::CODE16) {
initializeEnvironment();
resetSubtargetFeatures(CPU, FS);
}
-bool X86Subtarget::enablePostRAScheduler(
- CodeGenOpt::Level OptLevel,
- TargetSubtargetInfo::AntiDepBreakMode& Mode,
- RegClassVector& CriticalPathRCs) const {
+bool
+X86Subtarget::enablePostRAScheduler(CodeGenOpt::Level OptLevel,
+ TargetSubtargetInfo::AntiDepBreakMode &Mode,
+ RegClassVector &CriticalPathRCs) const {
Mode = TargetSubtargetInfo::ANTIDEP_CRITICAL;
CriticalPathRCs.clear();
return PostRAScheduler && OptLevel >= CodeGenOpt::Default;
}
+
+bool
+X86Subtarget::enableEarlyIfConversion() const {
+ return hasCMov() && X86EarlyIfConv;
+}
diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h
index 52986b9..703559a 100644
--- a/lib/Target/X86/X86Subtarget.h
+++ b/lib/Target/X86/X86Subtarget.h
@@ -178,6 +178,9 @@ protected:
/// address generation (AG) time.
bool LEAUsesAG;
+ /// SlowLEA - True if the LEA instruction with certain arguments is slow.
+ bool SlowLEA;
+
/// Processor has AVX-512 PreFetch Instructions
bool HasPFI;
@@ -235,10 +238,6 @@ public:
/// subtarget options. Definition of function is auto generated by tblgen.
void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
- /// AutoDetectSubtargetFeatures - Auto-detect CPU features using CPUID
- /// instruction.
- void AutoDetectSubtargetFeatures();
-
/// \brief Reset the features for the X86 target.
void resetSubtargetFeatures(const MachineFunction *MF) override;
private:
@@ -319,11 +318,13 @@ public:
bool padShortFunctions() const { return PadShortFunctions; }
bool callRegIndirect() const { return CallRegIndirect; }
bool LEAusesAG() const { return LEAUsesAG; }
+ bool slowLEA() const { return SlowLEA; }
bool hasCDI() const { return HasCDI; }
bool hasPFI() const { return HasPFI; }
bool hasERI() const { return HasERI; }
bool isAtom() const { return X86ProcFamily == IntelAtom; }
+ bool isSLM() const { return X86ProcFamily == IntelSLM; }
const Triple &getTargetTriple() const { return TargetTriple; }
@@ -429,6 +430,8 @@ public:
bool postRAScheduler() const { return PostRAScheduler; }
+ bool enableEarlyIfConversion() const override;
+
/// getInstrItins - Return the instruction itineraries based on the
/// subtarget selection.
const InstrItineraryData &getInstrItineraryData() const { return InstrItins; }
diff --git a/lib/Target/X86/X86TargetMachine.cpp b/lib/Target/X86/X86TargetMachine.cpp
index 6f09ccf..93760ef 100644
--- a/lib/Target/X86/X86TargetMachine.cpp
+++ b/lib/Target/X86/X86TargetMachine.cpp
@@ -108,6 +108,13 @@ X86TargetMachine::X86TargetMachine(const Target &T, StringRef TT,
if (Options.FloatABIType == FloatABI::Default)
this->Options.FloatABIType = FloatABI::Hard;
+ // The Windows stack unwinder gets confused when execution flow "falls
+ // through" after a call to a 'noreturn' function. To prevent that, we emit
+ // a trap for 'unreachable' IR instructions (which on X86 happens to be the
+ // 'ud2' instruction).
+ if (Subtarget.isTargetWin64())
+ this->Options.TrapUnreachable = true;
+
initAsmInfo();
}
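The failure mode being worked around: when a call to a noreturn function is the last instruction of its caller, the pushed return address points one past the caller's end, which the Win64 unwinder can attribute to the wrong function. Lowering 'unreachable' to a trap keeps the return address inside the caller. A minimal C++ reproduction of the pattern, as a sketch rather than a guaranteed codegen outcome:

    [[noreturn]] void fatal_error(); // e.g. wraps abort()

    int f(int x) {
      if (x < 0)
        fatal_error(); // this call may end up as f's last instruction; the IR
                       // 'unreachable' after it is now lowered to 'ud2' on
                       // Win64 so the return address stays inside f.
      return x;
    }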
@@ -119,12 +126,6 @@ UseVZeroUpper("x86-use-vzeroupper", cl::Hidden,
cl::desc("Minimize AVX to SSE transition penalty"),
cl::init(true));
-// Temporary option to control early if-conversion for x86 while adding machine
-// models.
-static cl::opt<bool>
-X86EarlyIfConv("x86-early-ifcvt", cl::Hidden,
- cl::desc("Enable early if-conversion on X86"));
-
//===----------------------------------------------------------------------===//
// X86 Analysis Pass Setup
//===----------------------------------------------------------------------===//
@@ -177,19 +178,14 @@ bool X86PassConfig::addInstSelector() {
if (getX86Subtarget().isTargetELF() && getOptLevel() != CodeGenOpt::None)
addPass(createCleanupLocalDynamicTLSPass());
- // For 32-bit, prepend instructions to set the "global base reg" for PIC.
- if (!getX86Subtarget().is64Bit())
- addPass(createGlobalBaseRegPass());
+ addPass(createX86GlobalBaseRegPass());
return false;
}
bool X86PassConfig::addILPOpts() {
- if (X86EarlyIfConv && getX86Subtarget().hasCMov()) {
- addPass(&EarlyIfConverterID);
- return true;
- }
- return false;
+ addPass(&EarlyIfConverterID);
+ return true;
}
bool X86PassConfig::addPreRegAlloc() {
@@ -208,18 +204,13 @@ bool X86PassConfig::addPreEmitPass() {
ShouldPrint = true;
}
- if (getX86Subtarget().hasAVX() && UseVZeroUpper) {
+ if (UseVZeroUpper) {
addPass(createX86IssueVZeroUpperPass());
ShouldPrint = true;
}
- if (getOptLevel() != CodeGenOpt::None &&
- getX86Subtarget().padShortFunctions()) {
+ if (getOptLevel() != CodeGenOpt::None) {
addPass(createX86PadShortFunctions());
- ShouldPrint = true;
- }
- if (getOptLevel() != CodeGenOpt::None &&
- getX86Subtarget().LEAusesAG()){
addPass(createX86FixupLEAs());
ShouldPrint = true;
}
diff --git a/lib/Target/X86/X86TargetObjectFile.cpp b/lib/Target/X86/X86TargetObjectFile.cpp
index 0a88e98..8157085 100644
--- a/lib/Target/X86/X86TargetObjectFile.cpp
+++ b/lib/Target/X86/X86TargetObjectFile.cpp
@@ -26,7 +26,7 @@ const MCExpr *X86_64MachoTargetObjectFile::getTTypeGlobalReference(
// On Darwin/X86-64, we can reference dwarf symbols with foo@GOTPCREL+4, which
// is an indirect pc-relative reference.
- if (Encoding & (DW_EH_PE_indirect | DW_EH_PE_pcrel)) {
+ if ((Encoding & DW_EH_PE_indirect) && (Encoding & DW_EH_PE_pcrel)) {
const MCSymbol *Sym = TM.getSymbol(GV, Mang);
const MCExpr *Res =
MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_GOTPCREL, getContext());
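The change above fixes a bitmask bug: Encoding & (DW_EH_PE_indirect | DW_EH_PE_pcrel) is nonzero when *either* flag is set, while the GOTPCREL form is only correct when the reference is both indirect and pc-relative. A small self-contained check, assuming the standard DWARF EH encoding values of 0x10 and 0x80:

    #include <cassert>

    enum : unsigned { DW_EH_PE_pcrel = 0x10, DW_EH_PE_indirect = 0x80 };

    int main() {
      unsigned PCRelOnly = DW_EH_PE_pcrel; // pc-relative but *direct*
      // Old test: wrongly taken for a direct pc-relative encoding.
      assert(PCRelOnly & (DW_EH_PE_indirect | DW_EH_PE_pcrel));
      // New test: correctly requires both flags to be present.
      assert(!((PCRelOnly & DW_EH_PE_indirect) && (PCRelOnly & DW_EH_PE_pcrel)));
      return 0;
    }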
@@ -62,7 +62,7 @@ const MCExpr *X86WindowsTargetObjectFile::getExecutableRelativeSymbol(
// operation.
const SubOperator *Sub = dyn_cast<SubOperator>(CE);
if (!Sub)
- return 0;
+ return nullptr;
// Symbols must first be numbers before we can subtract them; we need to see a
// ptrtoint on both subtraction operands.
@@ -71,13 +71,13 @@ const MCExpr *X86WindowsTargetObjectFile::getExecutableRelativeSymbol(
const PtrToIntOperator *SubRHS =
dyn_cast<PtrToIntOperator>(Sub->getOperand(1));
if (!SubLHS || !SubRHS)
- return 0;
+ return nullptr;
// Our symbols should exist in address space zero, cowardly no-op if
// otherwise.
if (SubLHS->getPointerAddressSpace() != 0 ||
SubRHS->getPointerAddressSpace() != 0)
- return 0;
+ return nullptr;
// Both ptrtoint instructions must wrap global variables:
// - Only global variables are eligible for image relative relocations.
@@ -87,7 +87,7 @@ const MCExpr *X86WindowsTargetObjectFile::getExecutableRelativeSymbol(
const GlobalVariable *GVRHS =
dyn_cast<GlobalVariable>(SubRHS->getPointerOperand());
if (!GVLHS || !GVRHS)
- return 0;
+ return nullptr;
// We expect __ImageBase to be a global variable without a section, externally
// defined.
@@ -96,11 +96,11 @@ const MCExpr *X86WindowsTargetObjectFile::getExecutableRelativeSymbol(
if (GVRHS->isThreadLocal() || GVRHS->getName() != "__ImageBase" ||
!GVRHS->hasExternalLinkage() || GVRHS->hasInitializer() ||
GVRHS->hasSection())
- return 0;
+ return nullptr;
// An image-relative, thread-local symbol makes no sense.
if (GVLHS->isThreadLocal())
- return 0;
+ return nullptr;
return MCSymbolRefExpr::Create(TM.getSymbol(GVLHS, Mang),
MCSymbolRefExpr::VK_COFF_IMGREL32,
diff --git a/lib/Target/X86/X86TargetTransformInfo.cpp b/lib/Target/X86/X86TargetTransformInfo.cpp
index c04964d..91b9d40 100644
--- a/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -14,37 +14,24 @@
///
//===----------------------------------------------------------------------===//
-#define DEBUG_TYPE "x86tti"
#include "X86.h"
#include "X86TargetMachine.h"
-#include "llvm/ADT/DepthFirstIterator.h"
-#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Target/CostTable.h"
#include "llvm/Target/TargetLowering.h"
using namespace llvm;
+#define DEBUG_TYPE "x86tti"
+
// Declare the pass initialization routine locally as target-specific passes
-// don't havve a target-wide initialization entry point, and so we rely on the
+// don't have a target-wide initialization entry point, and so we rely on the
// pass constructor initialization.
namespace llvm {
void initializeX86TTIPass(PassRegistry &);
}
-static cl::opt<bool>
-UsePartialUnrolling("x86-use-partial-unrolling", cl::init(true),
- cl::desc("Use partial unrolling for some X86 targets"), cl::Hidden);
-static cl::opt<unsigned>
-PartialUnrollingThreshold("x86-partial-unrolling-threshold", cl::init(0),
- cl::desc("Threshold for X86 partial unrolling"), cl::Hidden);
-static cl::opt<unsigned>
-PartialUnrollingMaxBranches("x86-partial-max-branches", cl::init(2),
- cl::desc("Threshold for taken branches in X86 partial unrolling"),
- cl::Hidden);
-
namespace {
class X86TTI final : public ImmutablePass, public TargetTransformInfo {
@@ -56,7 +43,7 @@ class X86TTI final : public ImmutablePass, public TargetTransformInfo {
unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) const;
public:
- X86TTI() : ImmutablePass(ID), ST(0), TLI(0) {
+ X86TTI() : ImmutablePass(ID), ST(nullptr), TLI(nullptr) {
llvm_unreachable("This pass cannot be directly constructed");
}
@@ -87,8 +74,6 @@ public:
/// \name Scalar TTI Implementations
/// @{
PopcntSupportKind getPopcntSupport(unsigned TyWidth) const override;
- void getUnrollingPreferences(Loop *L,
- UnrollingPreferences &UP) const override;
/// @}
@@ -153,93 +138,6 @@ X86TTI::PopcntSupportKind X86TTI::getPopcntSupport(unsigned TyWidth) const {
return ST->hasPOPCNT() ? PSK_FastHardware : PSK_Software;
}
-void X86TTI::getUnrollingPreferences(Loop *L, UnrollingPreferences &UP) const {
- if (!UsePartialUnrolling)
- return;
- // According to the Intel 64 and IA-32 Architectures Optimization Reference
- // Manual, Intel Core models and later have a loop stream detector
- // (and associated uop queue) that can benefit from partial unrolling.
- // The relevant requirements are:
- // - The loop must have no more than 4 (8 for Nehalem and later) branches
- // taken, and none of them may be calls.
- // - The loop can have no more than 18 (28 for Nehalem and later) uops.
-
- // According to the Software Optimization Guide for AMD Family 15h Processors,
- // models 30h-4fh (Steamroller and later) have a loop predictor and loop
- // buffer which can benefit from partial unrolling.
- // The relevant requirements are:
- // - The loop must have fewer than 16 branches
- // - The loop must have less than 40 uops in all executed loop branches
-
- unsigned MaxBranches, MaxOps;
- if (PartialUnrollingThreshold.getNumOccurrences() > 0) {
- MaxBranches = PartialUnrollingMaxBranches;
- MaxOps = PartialUnrollingThreshold;
- } else if (ST->isAtom()) {
- // On the Atom, the throughput for taken branches is 2 cycles. For small
- // simple loops, expand by a small factor to hide the backedge cost.
- MaxBranches = 2;
- MaxOps = 10;
- } else if (ST->hasFSGSBase() && ST->hasXOP() /* Steamroller and later */) {
- MaxBranches = 16;
- MaxOps = 40;
- } else if (ST->hasFMA4() /* Any other recent AMD */) {
- return;
- } else if (ST->hasAVX() || ST->hasSSE42() /* Nehalem and later */) {
- MaxBranches = 8;
- MaxOps = 28;
- } else if (ST->hasSSSE3() /* Intel Core */) {
- MaxBranches = 4;
- MaxOps = 18;
- } else {
- return;
- }
-
- // Scan the loop: don't unroll loops with calls, and count the potential
- // number of taken branches (this is somewhat conservative because we're
- // counting all block transitions as potential branches while in reality some
- // of these will become implicit via block placement).
- unsigned MaxDepth = 0;
- for (df_iterator<BasicBlock*> DI = df_begin(L->getHeader()),
- DE = df_end(L->getHeader()); DI != DE;) {
- if (!L->contains(*DI)) {
- DI.skipChildren();
- continue;
- }
-
- MaxDepth = std::max(MaxDepth, DI.getPathLength());
- if (MaxDepth > MaxBranches)
- return;
-
- for (BasicBlock::iterator I = DI->begin(), IE = DI->end(); I != IE; ++I)
- if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
- ImmutableCallSite CS(I);
- if (const Function *F = CS.getCalledFunction()) {
- if (!isLoweredToCall(F))
- continue;
- }
-
- return;
- }
-
- ++DI;
- }
-
- // Enable runtime and partial unrolling up to the specified size.
- UP.Partial = UP.Runtime = true;
- UP.PartialThreshold = UP.PartialOptSizeThreshold = MaxOps;
-
- // Set the maximum count based on the loop depth. The maximum number of
- // branches taken in a loop (including the backedge) is equal to the maximum
- // loop depth (the DFS path length from the loop header to any block in the
- // loop). When the loop is unrolled, this depth (except for the backedge
- // itself) is multiplied by the unrolling factor. This new unrolled depth
- // must be less than the target-specific maximum branch count (which limits
- // the number of taken branches in the uop buffer).
- if (MaxDepth > 1)
- UP.MaxCount = (MaxBranches-1)/(MaxDepth-1);
-}
-
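The removed MaxCount formula follows from the comment's own model: one iteration takes roughly MaxDepth - 1 non-backedge taken branches plus the backedge, so an unroll factor U costs about U * (MaxDepth - 1) + 1 branches, and requiring that to stay within MaxBranches gives U <= (MaxBranches - 1) / (MaxDepth - 1). For example, on a Nehalem-class target (MaxBranches = 8) a loop of DFS depth 3 would have been capped at (8 - 1) / (3 - 1) = 3.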
unsigned X86TTI::getNumberOfRegisters(bool Vector) const {
if (Vector && !ST->hasSSE1())
return 0;
@@ -283,6 +181,21 @@ unsigned X86TTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
int ISD = TLI->InstructionOpcodeToISD(Opcode);
assert(ISD && "Invalid opcode");
+ static const CostTblEntry<MVT::SimpleValueType>
+ AVX2UniformConstCostTable[] = {
+ { ISD::SDIV, MVT::v16i16, 6 }, // vpmulhw sequence
+ { ISD::UDIV, MVT::v16i16, 6 }, // vpmulhuw sequence
+ { ISD::SDIV, MVT::v8i32, 15 }, // vpmuldq sequence
+ { ISD::UDIV, MVT::v8i32, 15 }, // vpmuludq sequence
+ };
+
+ if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
+ ST->hasAVX2()) {
+ int Idx = CostTableLookup(AVX2UniformConstCostTable, ISD, LT.second);
+ if (Idx != -1)
+ return LT.first * AVX2UniformConstCostTable[Idx].Cost;
+ }
+
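With this table, a vector divide by a uniform constant on AVX2 costs the table entry scaled by LT.first, the number of registers the legalized type needs. An illustrative trace, assuming v8i32 is legal on AVX2 (LT.first == 1):

    // udiv <8 x i32> %v, <uniform constant>
    //   ISD::UDIV + MVT::v8i32 hits the table          -> cost 15
    //   total = LT.first * 15 = 15
    // A <16 x i32> divide splits into two v8i32 ops (LT.first == 2) -> 30.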
static const CostTblEntry<MVT::SimpleValueType> AVX2CostTable[] = {
// Shifts on v4i64/v8i32 on AVX2 is legal even though we declare to
// customize them to detect the cases where shift amount is a scalar one.
@@ -350,10 +263,19 @@ unsigned X86TTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
{ ISD::SRA, MVT::v16i8, 4 }, // psrlw, pand, pxor, psubb.
{ ISD::SRA, MVT::v8i16, 1 }, // psraw.
{ ISD::SRA, MVT::v4i32, 1 }, // psrad.
+
+ { ISD::SDIV, MVT::v8i16, 6 }, // pmulhw sequence
+ { ISD::UDIV, MVT::v8i16, 6 }, // pmulhuw sequence
+ { ISD::SDIV, MVT::v4i32, 19 }, // pmuludq sequence
+ { ISD::UDIV, MVT::v4i32, 15 }, // pmuludq sequence
};
if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
ST->hasSSE2()) {
+ // pmuldq sequence.
+ if (ISD == ISD::SDIV && LT.second == MVT::v4i32 && ST->hasSSE41())
+ return LT.first * 15;
+
int Idx = CostTableLookup(SSE2UniformConstCostTable, ISD, LT.second);
if (Idx != -1)
return LT.first * SSE2UniformConstCostTable[Idx].Cost;
@@ -893,6 +815,13 @@ unsigned X86TTI::getIntImmCost(const APInt &Imm, Type *Ty) const {
if (BitSize == 0)
return ~0U;
+ // Never hoist constants larger than 128 bits, because this might lead to
+ // incorrect code generation or assertions in codegen.
+ // FIXME: Create a cost model for types larger than i128 once the codegen
+ // issues have been fixed.
+ if (BitSize > 128)
+ return TCC_Free;
+
if (Imm == 0)
return TCC_Free;
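TCC_Free tells constant hoisting to leave the constant alone, so an over-128-bit immediate is rematerialized at each use instead of being hoisted into a register. An illustrative consequence, under the check above:

    // getIntImmCost(APInt(256, ...), i256 type) -> TCC_Free (> 128 bits: never hoist)
    // getIntImmCost(APInt(128, ...), i128 type) -> falls through to the cost model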
@@ -908,8 +837,10 @@ unsigned X86TTI::getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
assert(Ty->isIntegerTy());
unsigned BitSize = Ty->getPrimitiveSizeInBits();
+ // There is no cost model for constants with a bit size of 0. Return TCC_Free
+ // here, so that constant hoisting will ignore this constant.
if (BitSize == 0)
- return ~0U;
+ return TCC_Free;
unsigned ImmIdx = ~0U;
switch (Opcode) {
@@ -931,15 +862,19 @@ unsigned X86TTI::getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
case Instruction::SDiv:
case Instruction::URem:
case Instruction::SRem:
- case Instruction::Shl:
- case Instruction::LShr:
- case Instruction::AShr:
case Instruction::And:
case Instruction::Or:
case Instruction::Xor:
case Instruction::ICmp:
ImmIdx = 1;
break;
+ // Always return TCC_Free for the shift amount of a shift instruction.
+ case Instruction::Shl:
+ case Instruction::LShr:
+ case Instruction::AShr:
+ if (Idx == 1)
+ return TCC_Free;
+ break;
case Instruction::Trunc:
case Instruction::ZExt:
case Instruction::SExt:
@@ -966,8 +901,10 @@ unsigned X86TTI::getIntImmCost(Intrinsic::ID IID, unsigned Idx,
assert(Ty->isIntegerTy());
unsigned BitSize = Ty->getPrimitiveSizeInBits();
+ // There is no cost model for constants with a bit size of 0. Return TCC_Free
+ // here, so that constant hoisting will ignore this constant.
if (BitSize == 0)
- return ~0U;
+ return TCC_Free;
switch (IID) {
default: return TCC_Free;
diff --git a/lib/Target/X86/X86VZeroUpper.cpp b/lib/Target/X86/X86VZeroUpper.cpp
index d4341b9..0bb5f99 100644
--- a/lib/Target/X86/X86VZeroUpper.cpp
+++ b/lib/Target/X86/X86VZeroUpper.cpp
@@ -14,7 +14,6 @@
//
//===----------------------------------------------------------------------===//
-#define DEBUG_TYPE "x86-vzeroupper"
#include "X86.h"
#include "X86InstrInfo.h"
#include "X86Subtarget.h"
@@ -28,6 +27,8 @@
#include "llvm/Target/TargetInstrInfo.h"
using namespace llvm;
+#define DEBUG_TYPE "x86-vzeroupper"
+
STATISTIC(NumVZU, "Number of vzeroupper instructions inserted");
namespace {
@@ -246,7 +247,8 @@ void VZeroUpperInserter::processBasicBlock(MachineBasicBlock &MBB) {
/// runOnMachineFunction - Loop over all of the basic blocks, inserting
/// vzero upper instructions before function calls.
bool VZeroUpperInserter::runOnMachineFunction(MachineFunction &MF) {
- if (MF.getTarget().getSubtarget<X86Subtarget>().hasAVX512())
+ const X86Subtarget &ST = MF.getTarget().getSubtarget<X86Subtarget>();
+ if (!ST.hasAVX() || ST.hasAVX512())
return false;
TII = MF.getTarget().getInstrInfo();
MachineRegisterInfo &MRI = MF.getRegInfo();
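The rewritten guard makes the pass's precondition explicit in both directions: without AVX there is no dirty-YMM transition penalty to avoid, and on AVX-512 the pass keeps its previous behavior of bailing out. Illustratively, assuming standard llc feature strings:

    // -mattr=+sse4.2        -> !ST.hasAVX()    -> pass skipped
    // -mattr=+avx or +avx2  -> runs; inserts vzeroupper before calls
    // -mattr=+avx512f       -> ST.hasAVX512()  -> pass skipped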